/*-
 * Copyright (c) 2016-2020 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
/**
 * Author: Randall Stewart <rrs@netflix.com>
 * This work is based on the ACM Queue paper
 * "BBR: Congestion-Based Congestion Control"
 * and also numerous discussions with Neal, Yuchung and Van.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include "opt_kern_tls.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#ifdef KERN_TLS
#include <sys/ktls.h>
#endif
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/eventhandler.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/tim_filter.h>
#include <sys/time.h>
#include <sys/protosw.h>
#include <vm/uma.h>
#include <sys/kern_prefetch.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>

#define TCPSTATES               /* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>    /* required for icmp_var.h */
#include <netinet/icmp_var.h>   /* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#define TCPOUTFLAGS
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_hpts.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_lro.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif                          /* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcp_fastopen.h>

#include <netipsec/ipsec_support.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/ethernet.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif                          /* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif

#include "sack_filter.h"
#include "tcp_bbr.h"
#include "rack_bbr_common.h"
uma_zone_t bbr_zone;
uma_zone_t bbr_pcb_zone;

struct sysctl_ctx_list bbr_sysctl_ctx;
struct sysctl_oid *bbr_sysctl_root;

#define TCPT_RANGESET_NOSLOP(tv, value, tvmin, tvmax) do { \
        (tv) = (value); \
        if ((u_long)(tv) < (u_long)(tvmin)) \
                (tv) = (tvmin); \
        if ((u_long)(tv) > (u_long)(tvmax)) \
                (tv) = (tvmax); \
} while(0)
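
/*
 * A minimal usage sketch of the clamp above; the helper is hypothetical
 * and exists only to illustrate the macro. The 250000/1000000 bounds
 * mirror the bbr_persist_min/bbr_persist_max defaults defined below.
 */
static inline uint32_t
bbr_example_clamp_usecs(uint32_t computed_usecs)
{
        uint32_t to;

        /* e.g. computed_usecs = 5000000 clamps to 1000000 (the max) */
        TCPT_RANGESET_NOSLOP(to, computed_usecs, 250000, 1000000);
        return (to);
}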

/*#define BBR_INVARIANTS 1*/

/*
 * initial window
 */
static uint32_t bbr_def_init_win = 10;
static int32_t bbr_persist_min = 250000;        /* 250ms */
static int32_t bbr_persist_max = 1000000;       /* 1 Second */
static int32_t bbr_cwnd_may_shrink = 0;
static int32_t bbr_cwndtarget_rtt_touse = BBR_RTT_PROP;
static int32_t bbr_num_pktepo_for_del_limit = BBR_NUM_RTTS_FOR_DEL_LIMIT;
static int32_t bbr_hardware_pacing_limit = 8000;
static int32_t bbr_quanta = 3;  /* How much extra quanta do we get? */
static int32_t bbr_no_retran = 0;


static int32_t bbr_error_base_paceout = 10000; /* usec to pace */
static int32_t bbr_max_net_error_cnt = 10;
/* Should the following be dynamic too -- loss wise */
static int32_t bbr_rtt_gain_thresh = 0;
/* Measurement controls */
static int32_t bbr_use_google_algo = 1;
static int32_t bbr_ts_limiting = 1;
static int32_t bbr_ts_can_raise = 0;
static int32_t bbr_do_red = 600;
static int32_t bbr_red_scale = 20000;
static int32_t bbr_red_mul = 1;
static int32_t bbr_red_div = 2;
static int32_t bbr_red_growth_restrict = 1;
static int32_t bbr_target_is_bbunit = 0;
static int32_t bbr_drop_limit = 0;
/*
 * How much gain do we need to see to
 * stay in startup?
 */
static int32_t bbr_marks_rxt_sack_passed = 0;
static int32_t bbr_start_exit = 25;
static int32_t bbr_low_start_exit = 25; /* When we are in reduced gain */
static int32_t bbr_startup_loss_thresh = 2000;  /* 20.00% loss */
static int32_t bbr_hptsi_max_mul = 1;   /* These two mul/div ensure a min pacing */
static int32_t bbr_hptsi_max_div = 2;   /* time, 0 means turned off. We need this
                                         * if we ever go back to where the pacer
                                         * has priority over timers.
                                         */
static int32_t bbr_policer_call_from_rack_to = 0;
static int32_t bbr_policer_detection_enabled = 1;
static int32_t bbr_min_measurements_req = 1;    /* We need at least 2
                                                 * measurements before we are
                                                 * "good". Note that a setting
                                                 * of 1 means 2, because we
                                                 * use a > comparison: if
                                                 * min_measure were 0, then
                                                 * num-measures > 0 is met by
                                                 * a single measurement and
                                                 * you are good. Set to 1, you
                                                 * have to have two
                                                 * measurements (this is done
                                                 * to prevent it from being ok
                                                 * to have no measurements). */
static int32_t bbr_no_pacing_until = 4;

static int32_t bbr_min_usec_delta = 20000;      /* 20,000 usecs */
static int32_t bbr_min_peer_delta = 20;         /* 20 units */
static int32_t bbr_delta_percent = 150;         /* 15.0 % */

static int32_t bbr_target_cwnd_mult_limit = 8;
/*
 * bbr_cwnd_min_val is the number of
 * segments we hold to in the RTT probe
 * state, typically 4.
 */
static int32_t bbr_cwnd_min_val = BBR_PROBERTT_NUM_MSS;


static int32_t bbr_cwnd_min_val_hs = BBR_HIGHSPEED_NUM_MSS;

static int32_t bbr_gain_to_target = 1;
static int32_t bbr_gain_gets_extra_too = 1;
/*
 * bbr_high_gain is the 2/ln(2) value we need
 * to double the sending rate in startup. This
 * is used for both cwnd and hptsi gains.
 */
static int32_t bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
static int32_t bbr_startup_lower = BBR_UNIT * 1500 / 1000 + 1;
static int32_t bbr_use_lower_gain_in_startup = 1;

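/*
 * Worked check of the fixed-point startup gain above (a sketch, assuming
 * BBR_UNIT is the 256-based fixed point used by the BBR reference code):
 * to double the delivery rate every RTT while growth compounds within the
 * round, the pacing gain must be at least 2/ln(2) ~= 2.8854. With
 * BBR_UNIT == 256, BBR_UNIT * 2885 / 1000 + 1 == 739 and 739/256 ~= 2.887,
 * just above the bound (the +1 rounds the truncated division up).
 */
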
/* thresholds for reduction on drain in sub-states/drain */
static int32_t bbr_drain_rtt = BBR_SRTT;
static int32_t bbr_drain_floor = 88;
static int32_t google_allow_early_out = 1;
static int32_t google_consider_lost = 1;
static int32_t bbr_drain_drop_mul = 4;
static int32_t bbr_drain_drop_div = 5;
static int32_t bbr_rand_ot = 50;
static int32_t bbr_can_force_probertt = 0;
static int32_t bbr_can_adjust_probertt = 1;
static int32_t bbr_probertt_sets_rtt = 0;
static int32_t bbr_can_use_ts_for_rtt = 1;
static int32_t bbr_is_ratio = 0;
static int32_t bbr_sub_drain_app_limit = 1;
static int32_t bbr_prtt_slam_cwnd = 1;
static int32_t bbr_sub_drain_slam_cwnd = 1;
static int32_t bbr_slam_cwnd_in_main_drain = 1;
static int32_t bbr_filter_len_sec = 6;  /* How long does the rttProp filter
                                         * hold */
static uint32_t bbr_rtt_probe_limit = (USECS_IN_SECOND * 4);
/*
 * bbr_drain_gain is the reverse of the high_gain,
 * designed to drain back out the standing queue
 * that is formed in startup (by its larger
 * hptsi gain), thus draining the packets
 * in flight.
 */
static int32_t bbr_drain_gain = BBR_UNIT * 1000 / 2885;
static int32_t bbr_rttprobe_gain = 192;

/*
 * The cwnd_gain is the default cwnd gain applied when
 * calculating a target cwnd. Note that the cwnd is
 * a secondary factor in the way BBR works (see the
 * paper and think about it, it will take some time).
 * Basically the hptsi_gain spreads the packets out
 * so you never get more than BDP to the peer even
 * if the cwnd is high. In our implementation that
 * means in non-recovery/retransmission scenarios
 * cwnd will never be reached by the flight-size.
 */
static int32_t bbr_cwnd_gain = BBR_UNIT * 2;
static int32_t bbr_tlp_type_to_use = BBR_SRTT;
static int32_t bbr_delack_time = 100000;        /* 100ms in useconds */
static int32_t bbr_sack_not_required = 0;       /* set to one to allow non-sack to use bbr */
static int32_t bbr_initial_bw_bps = 62500;      /* 500kbps in bytes per second */
static int32_t bbr_ignore_data_after_close = 1;
static int16_t bbr_hptsi_gain[] = {
        (BBR_UNIT * 5 / 4),
        (BBR_UNIT * 3 / 4),
        BBR_UNIT,
        BBR_UNIT,
        BBR_UNIT,
        BBR_UNIT,
        BBR_UNIT,
        BBR_UNIT
};
int32_t bbr_use_rack_resend_cheat = 1;
int32_t bbr_sends_full_iwnd = 1;

#define BBR_HPTSI_GAIN_MAX 8
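
/*
 * The gain table above mirrors the ProbeBW pacing-gain cycle from the
 * BBR paper: one RTT probing at 5/4, one draining at 3/4, then six RTTs
 * cruising at 1.0. A sketch of walking a substate index through the
 * cycle (the helper is hypothetical, not the stack's actual
 * substate-advance code):
 */
static inline int16_t
bbr_example_next_gain(uint32_t *substate)
{
        /* Advance to the next substate, wrapping at the table size */
        *substate = (*substate + 1) % BBR_HPTSI_GAIN_MAX;
        return (bbr_hptsi_gain[*substate]);
}
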
/*
 * The BBR module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Yuchung Cheng's RACK TCP (for which it is named) that
 *    will stop us using the number of dup acks and instead
 *    use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *    of Dukkipati et al.
 * - Van Jacobson et al.'s BBR.
 *
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement BBR and RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports SACK
 * initially and then assure that the RACK state matches the connection
 * state before calling the state's do_segment function. Data processing
 * of inbound segments also now happens in the hpts_do_segment in general
 * with only one exception. This is so we can keep the connection on
 * a single CPU.
 *
 * Each state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also overwritten with a new version since it
 * must maintain the new rack scoreboard and has had hptsi
 * integrated as a requirement. Still to do is to eliminate the
 * use of the callout_() system and use the hpts for all
 * timers as well.
 */
static uint32_t bbr_rtt_probe_time = 200000;    /* 200ms in micro seconds */
static uint32_t bbr_rtt_probe_cwndtarg = 4;     /* How many mss's outstanding */
static const int32_t bbr_min_req_free = 2;      /* The min we must have on the
                                                 * free list */
static int32_t bbr_tlp_thresh = 1;
static int32_t bbr_reorder_thresh = 2;
static int32_t bbr_reorder_fade = 60000000;     /* 0 - never fade, def
                                                 * 60,000,000 - 60 seconds */
static int32_t bbr_pkt_delay = 1000;
static int32_t bbr_min_to = 1000;       /* Number of usec's minimum timeout */
static int32_t bbr_incr_timers = 1;

static int32_t bbr_tlp_min = 10000;     /* 10ms in usecs */
static int32_t bbr_delayed_ack_time = 200000;   /* 200ms in usecs */
static int32_t bbr_exit_startup_at_loss = 1;

/*
 * bbr_lt_bw_ratio is 1/8th
 * bbr_lt_bw_diff is < 4 Kbit/sec
 */
static uint64_t bbr_lt_bw_diff = 4000 / 8;      /* In bytes per second */
static uint64_t bbr_lt_bw_ratio = 8;    /* For 1/8th */
static uint32_t bbr_lt_bw_max_rtts = 48;        /* How many rtt's do we use
                                                 * the lt_bw for */
static uint32_t bbr_lt_intvl_min_rtts = 4;      /* Min num of RTT's to measure
                                                 * lt_bw */
static int32_t bbr_lt_intvl_fp = 0;             /* False positive epoch diff */
static int32_t bbr_lt_loss_thresh = 196;        /* Lost vs delivered % */
static int32_t bbr_lt_fd_thresh = 100;          /* false detection % */

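/*
 * A sketch of how the two thresholds above combine, after the long-term
 * bandwidth (policer detection) test in the BBR reference code; the
 * helper is hypothetical. Two interval bandwidth samples are treated as
 * consistent when they differ by less than 1/8th of the long-term
 * estimate or by less than 4 Kbit/sec (500 bytes/sec):
 */
static inline int
bbr_example_lt_bw_consistent(uint64_t bw, uint64_t lt_bw)
{
        uint64_t diff;

        diff = (bw > lt_bw) ? (bw - lt_bw) : (lt_bw - bw);
        /* diff * 8 <= lt_bw is the 1/8th test; 500 bytes/sec the floor */
        return ((diff * bbr_lt_bw_ratio <= lt_bw) ||
            (diff <= bbr_lt_bw_diff));
}
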
static int32_t bbr_verbose_logging = 0;
/*
 * Regular TCP currently has a rto_min of 30ms;
 * the backoff goes 12 times, so that ends up
 * being a total of 122.850 seconds before a
 * connection is killed.
 */
static int32_t bbr_rto_min_ms = 30;     /* 30ms same as main freebsd */
static int32_t bbr_rto_max_sec = 4;     /* 4 seconds */

/****************************************************/
/* DEFAULT TSO SIZING  (cpu performance impacting)  */
/****************************************************/
/* What amount is our formula using to get TSO size */
static int32_t bbr_hptsi_per_second = 1000;

/*
 * For hptsi under bbr_cross_over connections, a delay
 * target of 7ms (in usec) combined with a seg_max of 2
 * gets us close to identical google behavior in
 * TSO size selection (possibly more 1-MSS sends).
 */
static int32_t bbr_hptsi_segments_delay_tar = 7000;

/* Does pacing delay include overheads in its time calculations? */
static int32_t bbr_include_enet_oh = 0;
static int32_t bbr_include_ip_oh = 1;
static int32_t bbr_include_tcp_oh = 1;
static int32_t bbr_google_discount = 10;

/* Do we use (nf mode) pkt-epoch to drive us or rttProp? */
static int32_t bbr_state_is_pkt_epoch = 0;
static int32_t bbr_state_drain_2_tar = 1;
/* What is the max the 0 - bbr_cross_over MBPS TSO target
 * can reach using our delay target. Note that this
 * value becomes the floor for the cross over
 * algorithm.
 */
static int32_t bbr_hptsi_segments_max = 2;
static int32_t bbr_hptsi_segments_floor = 1;
static int32_t bbr_hptsi_utter_max = 0;

/* What is the min the 0 - bbr_cross_over MBPS TSO target can be */
static int32_t bbr_hptsi_bytes_min = 1460;
static int32_t bbr_all_get_min = 0;

/* Cross over point from algo-a to algo-b */
static uint32_t bbr_cross_over = TWENTY_THREE_MBPS;

/* Do we deal with our restart state? */
static int32_t bbr_uses_idle_restart = 0;
static int32_t bbr_idle_restart_threshold = 100000;     /* 100ms in useconds */

/* Do we allow hardware pacing? */
static int32_t bbr_allow_hdwr_pacing = 0;
static int32_t bbr_hdwr_pace_adjust = 2;        /* multiplier when we calc the tso size */
static int32_t bbr_hdwr_pace_floor = 1;
static int32_t bbr_hdwr_pacing_delay_cnt = 10;

/****************************************************/
static int32_t bbr_resends_use_tso = 0;
static int32_t bbr_tlp_max_resend = 2;
static int32_t bbr_sack_block_limit = 128;

#define BBR_MAX_STAT 19
counter_u64_t bbr_state_time[BBR_MAX_STAT];
counter_u64_t bbr_state_lost[BBR_MAX_STAT];
counter_u64_t bbr_state_resend[BBR_MAX_STAT];
counter_u64_t bbr_stat_arry[BBR_STAT_SIZE];
counter_u64_t bbr_opts_arry[BBR_OPTS_SIZE];
counter_u64_t bbr_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t bbr_flows_whdwr_pacing;
counter_u64_t bbr_flows_nohdwr_pacing;

counter_u64_t bbr_nohdwr_pacing_enobuf;
counter_u64_t bbr_hdwr_pacing_enobuf;

static inline uint64_t bbr_get_bw(struct tcp_bbr *bbr);

/*
 * Static definitions we need for forward declarations.
 */
static uint32_t
bbr_get_pacing_length(struct tcp_bbr *bbr, uint16_t gain,
    uint32_t useconds_time, uint64_t bw);
static uint32_t
bbr_get_a_state_target(struct tcp_bbr *bbr, uint32_t gain);
static void
     bbr_set_state(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t win);
static void
bbr_set_probebw_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses);
static void
bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int line,
                    int dolog);
static uint32_t
bbr_get_target_cwnd(struct tcp_bbr *bbr, uint64_t bw, uint32_t gain);
static void
bbr_state_change(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch,
                 int32_t pkt_epoch, uint32_t losses);
static uint32_t
bbr_calc_thresh_rack(struct tcp_bbr *bbr, uint32_t srtt, uint32_t cts, struct bbr_sendmap *rsm);
static uint32_t bbr_initial_cwnd(struct tcp_bbr *bbr, struct tcpcb *tp);
static uint32_t
bbr_calc_thresh_tlp(struct tcpcb *tp, struct tcp_bbr *bbr,
    struct bbr_sendmap *rsm, uint32_t srtt,
    uint32_t cts);
static void
bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts,
    int32_t line);
static void
     bbr_set_state_target(struct tcp_bbr *bbr, int line);
static void
     bbr_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts, int32_t line);

static void
     bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line);

static void
     tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t cts);

static void
     bbr_setup_red_bw(struct tcp_bbr *bbr, uint32_t cts);

static void
     bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied, uint32_t rtt,
                         uint32_t line, uint8_t is_start, uint16_t set);

static struct bbr_sendmap *
            bbr_find_lowest_rsm(struct tcp_bbr *bbr);
static __inline uint32_t
bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type);
static void
     bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which);

static void
bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, uint32_t time_since_sent, uint32_t srtt,
    uint32_t thresh, uint32_t to);
static void
     bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag);

static void
bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot,
    uint32_t del_by, uint32_t cts, uint32_t sloton, uint32_t prev_delay);

static void
bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr,
    uint32_t cts, int32_t line);
static void
     bbr_stop_all_timers(struct tcpcb *tp);
static void
     bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts);
static void
     bbr_check_probe_rtt_limits(struct tcp_bbr *bbr, uint32_t cts);
static void
     bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts);


static void
bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len,
    uint32_t cts, uint32_t usecs, uint64_t bw, uint32_t override, int mod);

static inline uint8_t
bbr_state_val(struct tcp_bbr *bbr)
{
        return (bbr->rc_bbr_substate);
}

static inline uint32_t
get_min_cwnd(struct tcp_bbr *bbr)
{
        int mss;

        mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs);
        if (bbr_get_rtt(bbr, BBR_RTT_PROP) < BBR_HIGH_SPEED)
                return (bbr_cwnd_min_val_hs * mss);
        else
                return (bbr_cwnd_min_val * mss);
}
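
/*
 * Worked example for get_min_cwnd() (numbers illustrative): with
 * t_maxseg = 1460, no extra option bytes and rc_pace_max_segs well above
 * one segment, mss = 1460. When the propagation RTT is below
 * BBR_HIGH_SPEED we floor the cwnd at bbr_cwnd_min_val_hs segments;
 * otherwise at bbr_cwnd_min_val (typically 4, per the comment at its
 * definition), i.e. 4 * 1460 = 5840 bytes.
 */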

static uint32_t
bbr_get_persists_timer_val(struct tcpcb *tp, struct tcp_bbr *bbr)
{
        uint64_t srtt, var;
        uint64_t ret_val;

        bbr->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
        if (tp->t_srtt == 0) {
                srtt = (uint64_t)BBR_INITIAL_RTO;
                var = 0;
        } else {
                srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
                var = ((uint64_t)TICKS_2_USEC(tp->t_rttvar) >> TCP_RTT_SHIFT);
        }
        TCPT_RANGESET_NOSLOP(ret_val, ((srtt + var) * tcp_backoff[tp->t_rxtshift]),
            bbr_persist_min, bbr_persist_max);
        return ((uint32_t)ret_val);
}
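
/*
 * Worked example for the persist timer above (illustrative): t_srtt and
 * t_rttvar are stored pre-scaled, so TICKS_2_USEC(...) >> TCP_RTT_SHIFT
 * recovers microseconds. With srtt = 60ms, var = 15ms and t_rxtshift = 2
 * (tcp_backoff[2] == 4), (60000 + 15000) * 4 = 300000 usecs, which lies
 * inside [bbr_persist_min, bbr_persist_max] and is returned unchanged.
 */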

static uint32_t
bbr_timer_start(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
{
        /*
         * Start the FR timer, we do this based on getting the first one in
         * the rc_tmap. Note that if it's NULL we must stop the timer. In all
         * events we need to stop the running timer (if it's running) before
         * starting the new one.
         */
        uint32_t thresh, exp, to, srtt = 0, time_since_sent, tstmp_touse;
        int32_t idx;
        int32_t is_tlp_timer = 0;
        struct bbr_sendmap *rsm;

        if (bbr->rc_all_timers_stopped) {
                /* All timers have been stopped, none are to run */
                return (0);
        }
        if (bbr->rc_in_persist) {
                /* We can't start any timer in persists */
                return (bbr_get_persists_timer_val(tp, bbr));
        }
        rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
        if ((rsm == NULL) ||
            ((tp->t_flags & TF_SACK_PERMIT) == 0) ||
            (tp->t_state < TCPS_ESTABLISHED)) {
                /* Nothing on the send map */
activate_rxt:
                if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
                        uint64_t tov;

                        time_since_sent = 0;
                        rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
                        if (rsm) {
                                idx = rsm->r_rtr_cnt - 1;
                                if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], bbr->r_ctl.rc_tlp_rxt_last_time))
                                        tstmp_touse = rsm->r_tim_lastsent[idx];
                                else
                                        tstmp_touse = bbr->r_ctl.rc_tlp_rxt_last_time;
                                if (TSTMP_GT(cts, tstmp_touse))
                                        time_since_sent = cts - tstmp_touse;
                        }
                        bbr->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
                        if (tp->t_srtt == 0)
                                tov = BBR_INITIAL_RTO;
                        else
                                tov = ((uint64_t)(TICKS_2_USEC(tp->t_srtt) +
                                    ((uint64_t)TICKS_2_USEC(tp->t_rttvar) * (uint64_t)4)) >> TCP_RTT_SHIFT);
                        if (tp->t_rxtshift)
                                tov *= tcp_backoff[tp->t_rxtshift];
                        if (tov > time_since_sent)
                                tov -= time_since_sent;
                        else
                                tov = bbr->r_ctl.rc_min_to;
                        TCPT_RANGESET_NOSLOP(to, tov,
                            (bbr->r_ctl.rc_min_rto_ms * MS_IN_USEC),
                            (bbr->rc_max_rto_sec * USECS_IN_SECOND));
                        bbr_log_timer_var(bbr, 2, cts, 0, srtt, 0, to);
                        return (to);
                }
                return (0);
        }
        if (rsm->r_flags & BBR_ACKED) {
                rsm = bbr_find_lowest_rsm(bbr);
                if (rsm == NULL) {
                        /* No lowest? */
                        goto activate_rxt;
                }
        }
        /* Convert from ms to usecs */
        if (rsm->r_flags & BBR_SACK_PASSED) {
                if ((tp->t_flags & TF_SENTFIN) &&
                    ((tp->snd_max - tp->snd_una) == 1) &&
                    (rsm->r_flags & BBR_HAS_FIN)) {
                        /*
                         * We don't start a bbr rack timer if all we have is
                         * a FIN outstanding.
                         */
                        goto activate_rxt;
                }
                srtt = bbr_get_rtt(bbr, BBR_RTT_RACK);
                thresh = bbr_calc_thresh_rack(bbr, srtt, cts, rsm);
                idx = rsm->r_rtr_cnt - 1;
                exp = rsm->r_tim_lastsent[idx] + thresh;
                if (SEQ_GEQ(exp, cts)) {
                        to = exp - cts;
                        if (to < bbr->r_ctl.rc_min_to) {
                                to = bbr->r_ctl.rc_min_to;
                        }
                } else {
                        to = bbr->r_ctl.rc_min_to;
                }
        } else {
                /* Ok we need to do a TLP, not RACK */
                if (bbr->rc_tlp_in_progress != 0) {
                        /*
                         * The previous send was a TLP.
                         */
                        goto activate_rxt;
                }
                rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_tmap, bbr_sendmap, r_tnext);
                if (rsm == NULL) {
                        /* We found no rsm to TLP with. */
                        goto activate_rxt;
                }
                if (rsm->r_flags & BBR_HAS_FIN) {
                        /* If it's a FIN we don't do TLP */
                        rsm = NULL;
                        goto activate_rxt;
                }
                time_since_sent = 0;
                idx = rsm->r_rtr_cnt - 1;
                if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], bbr->r_ctl.rc_tlp_rxt_last_time))
                        tstmp_touse = rsm->r_tim_lastsent[idx];
                else
                        tstmp_touse = bbr->r_ctl.rc_tlp_rxt_last_time;
                if (TSTMP_GT(cts, tstmp_touse))
                        time_since_sent = cts - tstmp_touse;
                is_tlp_timer = 1;
                srtt = bbr_get_rtt(bbr, bbr_tlp_type_to_use);
                thresh = bbr_calc_thresh_tlp(tp, bbr, rsm, srtt, cts);
                if (thresh > time_since_sent)
                        to = thresh - time_since_sent;
                else
                        to = bbr->r_ctl.rc_min_to;
                if (to > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) {
                        /*
                         * If the TLP time works out to be larger than the
                         * max RTO, let's not do TLP; just RTO.
                         */
                        goto activate_rxt;
                }
                if ((bbr->rc_tlp_rtx_out == 1) &&
                    (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq)) {
                        /*
                         * Second retransmit of the same TLP;
                         * let's not.
                         */
                        bbr->rc_tlp_rtx_out = 0;
                        goto activate_rxt;
                }
                if (rsm->r_start != bbr->r_ctl.rc_last_tlp_seq) {
                        /*
                         * The tail is no longer the last one I did a probe
                         * on.
                         */
                        bbr->r_ctl.rc_tlp_seg_send_cnt = 0;
                        bbr->r_ctl.rc_last_tlp_seq = rsm->r_start;
                }
        }
        if (is_tlp_timer == 0) {
                BBR_STAT_INC(bbr_to_arm_rack);
                bbr->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
        } else {
                bbr_log_timer_var(bbr, 1, cts, time_since_sent, srtt, thresh, to);
                if (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend) {
                        /*
                         * We have exceeded how many times we can retransmit
                         * the current TLP timer; switch to the RTO timer.
                         */
                        goto activate_rxt;
                } else {
                        BBR_STAT_INC(bbr_to_arm_tlp);
                        bbr->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
                }
        }
        return (to);
}

static inline int32_t
bbr_minseg(struct tcp_bbr *bbr)
{
        return (bbr->r_ctl.rc_pace_min_segs - bbr->rc_last_options);
}

static void
bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t slot, uint32_t tot_len)
{
        struct inpcb *inp;
        struct hpts_diag diag;
        uint32_t delayed_ack = 0;
        uint32_t left = 0;
        uint32_t hpts_timeout;
        uint8_t stopped;
        int32_t delay_calc = 0;
        uint32_t prev_delay = 0;

        inp = tp->t_inpcb;
        if (inp->inp_in_hpts) {
                /* A previous call is already set up */
                return;
        }
        if ((tp->t_state == TCPS_CLOSED) ||
            (tp->t_state == TCPS_LISTEN)) {
                return;
        }
        stopped = bbr->rc_tmr_stopped;
        if (stopped && TSTMP_GT(bbr->r_ctl.rc_timer_exp, cts)) {
                left = bbr->r_ctl.rc_timer_exp - cts;
        }
        bbr->r_ctl.rc_hpts_flags = 0;
        bbr->r_ctl.rc_timer_exp = 0;
        prev_delay = bbr->r_ctl.rc_last_delay_val;
        if (bbr->r_ctl.rc_last_delay_val &&
            (slot == 0)) {
                /*
                 * If a previous pacer delay was in place we
                 * are not coming from the output side (where
                 * we calculate a delay); more likely a timer
                 * fired.
                 */
                slot = bbr->r_ctl.rc_last_delay_val;
                if (TSTMP_GT(cts, bbr->rc_pacer_started)) {
                        /* Compensate for time passed */
                        delay_calc = cts - bbr->rc_pacer_started;
                        if (delay_calc <= slot)
                                slot -= delay_calc;
                }
        }
        /* Do we have early sends to make up for by pushing out the pacing time? */
        if (bbr->r_agg_early_set) {
                bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, slot, 0, bbr->r_agg_early_set, 2);
                slot += bbr->r_ctl.rc_agg_early;
                bbr->r_ctl.rc_agg_early = 0;
                bbr->r_agg_early_set = 0;
        }
        /* Are we running a total debt that needs to be compensated for? */
        if (bbr->r_ctl.rc_hptsi_agg_delay) {
                if (slot > bbr->r_ctl.rc_hptsi_agg_delay) {
                        /* We nuke the delay */
                        slot -= bbr->r_ctl.rc_hptsi_agg_delay;
                        bbr->r_ctl.rc_hptsi_agg_delay = 0;
                } else {
                        /* We nuke some of the delay, put in a minimal 100usecs */
                        bbr->r_ctl.rc_hptsi_agg_delay -= slot;
                        bbr->r_ctl.rc_last_delay_val = slot = 100;
                }
        }
        bbr->r_ctl.rc_last_delay_val = slot;
        hpts_timeout = bbr_timer_start(tp, bbr, cts);
        if (tp->t_flags & TF_DELACK) {
                if (bbr->rc_in_persist == 0) {
                        delayed_ack = bbr_delack_time;
                } else {
                        /*
                         * We are in persists and have
                         * gotten a new data element.
                         */
                        if (hpts_timeout > bbr_delack_time) {
                                /*
                                 * Let's make the persists timer (which acks)
                                 * be the smaller of hpts_timeout and bbr_delack_time.
                                 */
                                hpts_timeout = bbr_delack_time;
                        }
                }
        }
        if (delayed_ack &&
            ((hpts_timeout == 0) ||
             (delayed_ack < hpts_timeout))) {
                /* We need a Delayed ack timer */
                bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
                hpts_timeout = delayed_ack;
        }
        if (slot) {
                /* Mark that we have a pacing timer up */
                BBR_STAT_INC(bbr_paced_segments);
                bbr->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
        }
        /*
         * If no timers are going to run and we will fall off the hptsi
         * wheel, we resort to a keep-alive timer if it's configured.
         */
        if ((hpts_timeout == 0) &&
            (slot == 0)) {
                if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
                    (tp->t_state <= TCPS_CLOSING)) {
                        /*
                         * Ok we have no timer (persists, rack, tlp, rxt or
                         * del-ack), we don't have segments being paced. So
                         * all that is left is the keepalive timer.
                         */
                        if (TCPS_HAVEESTABLISHED(tp->t_state)) {
                                hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp));
                        } else {
                                hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp));
                        }
                        bbr->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
                }
        }
        if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
            (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
                /*
                 * RACK, TLP, persists and RXT timers all are restartable
                 * based on input actions, i.e., we received a packet (ack
                 * or sack) and that changes things (rwnd, or snd_una, etc.).
                 * Thus we can restart them with a new value. For
                 * keep-alive and delayed-ack we keep track of what was left
                 * and restart the timer with a smaller value.
                 */
                if (left < hpts_timeout)
                        hpts_timeout = left;
        }
        if (bbr->r_ctl.rc_incr_tmrs && slot &&
            (bbr->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
                /*
                 * If configured to do so, and the timer is either
                 * the TLP or RXT timer, we need to increase the timeout
                 * by the pacing time. Consider the bottleneck at my
                 * machine as an example: we are sending something
                 * to start a TLP on. The last packet won't be emitted
                 * fully until the pacing time (the bottleneck will hold
                 * the data in place). Once the packet is emitted, that
                 * is when we want to start waiting for the TLP. This
                 * is most evident with hardware pacing (where the nic
                 * is holding the packet(s) before emitting), but it
                 * can also show up in the network, so we do it for all
                 * cases. Technically we would take off one packet from
                 * this extra delay, but this is easier and being more
                 * conservative is probably better.
                 */
                hpts_timeout += slot;
        }
        if (hpts_timeout) {
                /*
                 * Hack alert: for now we can't time out over 2147 seconds (a
                 * bit more than 35 min).
                 */
                if (hpts_timeout > 0x7ffffffe)
                        hpts_timeout = 0x7ffffffe;
                bbr->r_ctl.rc_timer_exp = cts + hpts_timeout;
        } else
                bbr->r_ctl.rc_timer_exp = 0;
        if ((slot) &&
            (bbr->rc_use_google ||
             bbr->output_error_seen ||
             (slot <= hpts_timeout))  ) {
                /*
                 * Tell LRO that it can queue packets while
                 * we pace.
                 */
                bbr->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
                if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
                    (bbr->rc_cwnd_limited == 0)) {
                        /*
                         * If we are not cwnd limited and we
                         * are running a rack timer, we put on
                         * the do-not-disturb, even for sacks.
                         */
                        inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
                } else
                        inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
                bbr->rc_pacer_started = cts;

                (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot),
                                           __LINE__, &diag);
                bbr->rc_timer_first = 0;
                bbr->bbr_timer_src = frm;
                bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1);
                bbr_log_hpts_diag(bbr, cts, &diag);
        } else if (hpts_timeout) {
                (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout),
                                           __LINE__, &diag);
                /*
                 * We add the flag here as well if the slot is set,
                 * since hpts will call in to clear the queue first before
                 * calling the output routine (which does our timers).
                 * We don't want to set the flag if it's just a timer,
                 * else the arrival of data (which causes us to send
                 * more) might get delayed. Imagine being
                 * on a keep-alive timer and a request comes in for
                 * more data.
                 */
                if (slot)
                        bbr->rc_pacer_started = cts;
                if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
                    (bbr->rc_cwnd_limited == 0)) {
                        /*
                         * For a rack timer, don't wake us even
                         * if a sack arrives, as long as we are
                         * not cwnd limited.
                         */
                        bbr->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
                        inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
                } else {
                        /* All other timers wake us up */
                        bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
                        inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
                }
                bbr->bbr_timer_src = frm;
                bbr_log_to_start(bbr, cts, hpts_timeout, slot, 0);
                bbr_log_hpts_diag(bbr, cts, &diag);
                bbr->rc_timer_first = 1;
        }
        bbr->rc_tmr_stopped = 0;
        bbr_log_type_bbrsnd(bbr, tot_len, slot, delay_calc, cts, frm, prev_delay);
}

static void
bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sockbuf *sb)
{
        /*
         * We received an ack, and then did not call send or were bounced
         * out because hpts was running. Now a timer is up as well, is it
         * the right timer?
         */
        struct inpcb *inp;
        struct bbr_sendmap *rsm;
        uint32_t hpts_timeout;
        int tmr_up;

        tmr_up = bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
        if (bbr->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
                return;
        rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
        if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
            (tmr_up == PACE_TMR_RXT)) {
                /* Should be an RXT */
                return;
        }
        inp = bbr->rc_inp;
        if (rsm == NULL) {
                /* Nothing outstanding? */
                if (tp->t_flags & TF_DELACK) {
                        if (tmr_up == PACE_TMR_DELACK)
                                /*
                                 * We are supposed to have delayed ack up
                                 * and we do
                                 */
                                return;
                } else if (sbavail(&inp->inp_socket->so_snd) &&
                    (tmr_up == PACE_TMR_RXT)) {
                        /*
                         * if we hit enobufs then we would expect the
                         * possibility of nothing outstanding and the RXT up
                         * (and the hptsi timer).
                         */
                        return;
                } else if (((V_tcp_always_keepalive ||
                            inp->inp_socket->so_options & SO_KEEPALIVE) &&
                            (tp->t_state <= TCPS_CLOSING)) &&
                            (tmr_up == PACE_TMR_KEEP) &&
                    (tp->snd_max == tp->snd_una)) {
                        /* We should have keep alive up and we do */
                        return;
                }
        }
        if (rsm && (rsm->r_flags & BBR_SACK_PASSED)) {
                if ((tp->t_flags & TF_SENTFIN) &&
                    ((tp->snd_max - tp->snd_una) == 1) &&
                    (rsm->r_flags & BBR_HAS_FIN)) {
                        /* needs to be a RXT */
                        if (tmr_up == PACE_TMR_RXT)
                                return;
                        else
                                goto wrong_timer;
                } else if (tmr_up == PACE_TMR_RACK)
                        return;
                else
                        goto wrong_timer;
        } else if (rsm && (tmr_up == PACE_TMR_RACK)) {
                /* Rack timer has priority if we have data out */
                return;
        } else if (SEQ_GT(tp->snd_max, tp->snd_una) &&
                    ((tmr_up == PACE_TMR_TLP) ||
            (tmr_up == PACE_TMR_RXT))) {
                /*
                 * Either a TLP or RXT is fine if no sack-passed is in place
                 * and data is outstanding.
                 */
                return;
        } else if (tmr_up == PACE_TMR_DELACK) {
                /*
                 * If the delayed ack was going to go off before the
                 * rtx/tlp/rack timer were going to expire, then that would
                 * be the timer in control. Note we don't check the time
                 * here, trusting the code is correct.
                 */
                return;
        }
        if (SEQ_GT(tp->snd_max, tp->snd_una) &&
            ((tmr_up == PACE_TMR_RXT) ||
             (tmr_up == PACE_TMR_TLP) ||
             (tmr_up == PACE_TMR_RACK))) {
                /*
                 * We have outstanding data and
                 * we *do* have a RACK, TLP or RXT
                 * timer running. We won't restart
                 * anything here since that's probably ok; we
                 * will get called with some timer here shortly.
                 */
                return;
        }
        /*
         * Ok the timer originally started is not what we want now. We will
         * force the hpts to be stopped if any, and restart with the slot
         * set to what was in the saved slot.
         */
wrong_timer:
        if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) {
                if (inp->inp_in_hpts)
                        tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
                bbr_timer_cancel(bbr, __LINE__, cts);
                bbr_start_hpts_timer(bbr, tp, cts, 1, bbr->r_ctl.rc_last_delay_val,
                    0);
        } else {
                /*
                 * Output is hptsi so we just need to switch the type of
                 * timer. We don't bother with keep-alive, since when we
                 * jump through the output, it will start the keep-alive if
                 * nothing is sent.
                 *
                 * We only need a delayed-ack added and/or the hpts_timeout.
                 */
                hpts_timeout = bbr_timer_start(tp, bbr, cts);
                if (tp->t_flags & TF_DELACK) {
                        if (hpts_timeout == 0) {
                                hpts_timeout = bbr_delack_time;
                                bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
                        }
                        else if (hpts_timeout > bbr_delack_time) {
                                hpts_timeout = bbr_delack_time;
                                bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
                        }
                }
                if (hpts_timeout) {
                        if (hpts_timeout > 0x7ffffffe)
                                hpts_timeout = 0x7ffffffe;
                        bbr->r_ctl.rc_timer_exp = cts + hpts_timeout;
                }
        }
}

int32_t bbr_clear_lost = 0;

/*
 * Considers the two time values now (cts) and earlier.
 * If cts is smaller than earlier, we could have
 * had a sequence wrap (our counter wraps every
 * 70 min or so) or it could be just clock skew
 * getting us two different time values. Clock skew
 * will show up within 10ms or so. So in such
 * a case (where cts is behind the earlier time by
 * less than 10ms) we return 0. Otherwise we
 * return the true difference between them.
 */
static inline uint32_t
bbr_calc_time(uint32_t cts, uint32_t earlier_time) {
        /*
         * Given two timestamps, the current time stamp cts, and some other
         * time-stamp taken in theory earlier, return the difference. The
         * trick here is that sometimes locking will get the other timestamp
         * after the cts. If this occurs we need to return 0.
         */
        if (TSTMP_GEQ(cts, earlier_time))
                return (cts - earlier_time);
        /*
         * cts is behind earlier_time; if it's by less than 10ms, consider
         * it 0. If it's more than a 10ms difference then we had a time
         * wrap. Else it's just the normal locking foo. I wonder if we
         * should not go to a 64-bit TS and get rid of this issue.
         */
        if (TSTMP_GEQ((cts + 10000), earlier_time))
                return (0);
        /*
         * Ok the time must have wrapped. So we need to answer a large
         * amount of time, which the normal subtraction should do.
         */
        return (cts - earlier_time);
}
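
/*
 * Example behavior (illustrative): with cts = 1000 and earlier_time =
 * 5000, the first test fails but cts + 10000 >= earlier_time, so the
 * 4ms deficit is written off as clock skew and 0 is returned. Were
 * earlier_time 20ms ahead of cts instead, both tests would fail and the
 * unsigned subtraction returns a huge value, on the assumption that the
 * ~70 minute counter wrapped.
 */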

static int
sysctl_bbr_clear_lost(SYSCTL_HANDLER_ARGS)
{
        uint32_t stat;
        int32_t error;

        error = SYSCTL_OUT(req, &bbr_clear_lost, sizeof(uint32_t));
        if (error || req->newptr == NULL)
                return (error);

        error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
        if (error)
                return (error);
        if (stat == 1) {
#ifdef BBR_INVARIANTS
                printf("Clearing BBR lost counters\n");
#endif
                COUNTER_ARRAY_ZERO(bbr_state_lost, BBR_MAX_STAT);
                COUNTER_ARRAY_ZERO(bbr_state_time, BBR_MAX_STAT);
                COUNTER_ARRAY_ZERO(bbr_state_resend, BBR_MAX_STAT);
        } else if (stat == 2) {
#ifdef BBR_INVARIANTS
                printf("Clearing BBR option counters\n");
#endif
                COUNTER_ARRAY_ZERO(bbr_opts_arry, BBR_OPTS_SIZE);
        } else if (stat == 3) {
#ifdef BBR_INVARIANTS
                printf("Clearing BBR stats counters\n");
#endif
                COUNTER_ARRAY_ZERO(bbr_stat_arry, BBR_STAT_SIZE);
        } else if (stat == 4) {
#ifdef BBR_INVARIANTS
                printf("Clearing BBR out-size counters\n");
#endif
                COUNTER_ARRAY_ZERO(bbr_out_size, TCP_MSS_ACCT_SIZE);
        }
        bbr_clear_lost = 0;
        return (0);
}
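
/*
 * A userland sketch of driving the handler above via sysctlbyname(3).
 * The OID string is hypothetical (the node registration is outside this
 * excerpt); writing 1 zeroes the per-state lost/time/resend counters:
 *
 *      uint32_t one = 1;
 *
 *      if (sysctlbyname("net.inet.tcp.bbr.clrlost", NULL, NULL,
 *          &one, sizeof(one)) == -1)
 *              err(1, "sysctlbyname");
 */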
1175
1176 static void
1177 bbr_init_sysctls(void)
1178 {
1179         struct sysctl_oid *bbr_probertt;
1180         struct sysctl_oid *bbr_hptsi;
1181         struct sysctl_oid *bbr_measure;
1182         struct sysctl_oid *bbr_cwnd;
1183         struct sysctl_oid *bbr_timeout;
1184         struct sysctl_oid *bbr_states;
1185         struct sysctl_oid *bbr_startup;
1186         struct sysctl_oid *bbr_policer;
1187
1188         /* Probe rtt controls */
1189         bbr_probertt = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
1190             SYSCTL_CHILDREN(bbr_sysctl_root),
1191             OID_AUTO,
1192             "probertt",
1193             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1194             "");
1195         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1196             SYSCTL_CHILDREN(bbr_probertt),
1197             OID_AUTO, "gain", CTLFLAG_RW,
1198             &bbr_rttprobe_gain, 192,
1199             "What is the filter gain drop in probe_rtt (0=disable)?");
1200         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1201             SYSCTL_CHILDREN(bbr_probertt),
1202             OID_AUTO, "cwnd", CTLFLAG_RW,
1203             &bbr_rtt_probe_cwndtarg, 4,
1204             "How many mss's are outstanding during probe-rtt");
1205         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1206             SYSCTL_CHILDREN(bbr_probertt),
1207             OID_AUTO, "int", CTLFLAG_RW,
1208             &bbr_rtt_probe_limit, 4000000,
1209             "If RTT has not shrank in this many micro-seconds enter probe-rtt");
1210         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1211             SYSCTL_CHILDREN(bbr_probertt),
1212             OID_AUTO, "mintime", CTLFLAG_RW,
1213             &bbr_rtt_probe_time, 200000,
1214             "How many microseconds in probe-rtt");
1215         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1216             SYSCTL_CHILDREN(bbr_probertt),
1217             OID_AUTO, "filter_len_sec", CTLFLAG_RW,
1218             &bbr_filter_len_sec, 6,
1219             "How long in seconds does the rttProp filter run?");
1220         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1221             SYSCTL_CHILDREN(bbr_probertt),
1222             OID_AUTO, "drain_rtt", CTLFLAG_RW,
1223             &bbr_drain_rtt, BBR_SRTT,
1224             "What is the drain rtt to use in probeRTT (rtt_prop=0, rtt_rack=1, rtt_pkt=2, rtt_srtt=3?");
1225         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1226             SYSCTL_CHILDREN(bbr_probertt),
1227             OID_AUTO, "can_force", CTLFLAG_RW,
1228             &bbr_can_force_probertt, 0,
1229             "If we keep setting new low rtt's but delay going in probe-rtt can we force in??");
1230         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1231             SYSCTL_CHILDREN(bbr_probertt),
1232             OID_AUTO, "enter_sets_force", CTLFLAG_RW,
1233             &bbr_probertt_sets_rtt, 0,
1234             "In NF mode, do we imitate google_mode and set the rttProp on entry to probe-rtt?");
1235         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1236             SYSCTL_CHILDREN(bbr_probertt),
1237             OID_AUTO, "can_adjust", CTLFLAG_RW,
1238             &bbr_can_adjust_probertt, 1,
1239             "Can we dynamically adjust the probe-rtt limits and times?");
1240         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1241             SYSCTL_CHILDREN(bbr_probertt),
1242             OID_AUTO, "is_ratio", CTLFLAG_RW,
1243             &bbr_is_ratio, 0,
1244             "is the limit to filter a ratio?");
1245         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1246             SYSCTL_CHILDREN(bbr_probertt),
1247             OID_AUTO, "use_cwnd", CTLFLAG_RW,
1248             &bbr_prtt_slam_cwnd, 0,
1249             "Should we set/recover cwnd?");
1250         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1251             SYSCTL_CHILDREN(bbr_probertt),
1252             OID_AUTO, "can_use_ts", CTLFLAG_RW,
1253             &bbr_can_use_ts_for_rtt, 1,
1254             "Can we use the ms timestamp if available for retransmistted rtt calculations?");
1255
1256         /* Pacing controls */
1257         bbr_hptsi = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
1258             SYSCTL_CHILDREN(bbr_sysctl_root),
1259             OID_AUTO,
1260             "pacing",
1261             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1262             "");
1263         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1264             SYSCTL_CHILDREN(bbr_hptsi),
1265             OID_AUTO, "hw_pacing", CTLFLAG_RW,
1266             &bbr_allow_hdwr_pacing, 1,
1267             "Do we allow hardware pacing?");
1268         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1269             SYSCTL_CHILDREN(bbr_hptsi),
1270             OID_AUTO, "hw_pacing_limit", CTLFLAG_RW,
1271             &bbr_hardware_pacing_limit, 4000,
1272             "Do we have a limited number of connections for pacing on Chelsio (0=no limit)?");
1273         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1274             SYSCTL_CHILDREN(bbr_hptsi),
1275             OID_AUTO, "hw_pacing_adj", CTLFLAG_RW,
1276             &bbr_hdwr_pace_adjust, 2,
1277             "Multiplier to the calculated TSO size?");
1278         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1279             SYSCTL_CHILDREN(bbr_hptsi),
1280             OID_AUTO, "hw_pacing_floor", CTLFLAG_RW,
1281             &bbr_hdwr_pace_floor, 1,
1282             "Do we invoke the hardware pacing floor?");
1283         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1284             SYSCTL_CHILDREN(bbr_hptsi),
1285             OID_AUTO, "hw_pacing_delay_cnt", CTLFLAG_RW,
1286             &bbr_hdwr_pacing_delay_cnt, 10,
1287             "How many packets must be sent after hdwr pacing is enabled");
1288         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1289             SYSCTL_CHILDREN(bbr_hptsi),
1290             OID_AUTO, "bw_cross", CTLFLAG_RW,
1291             &bbr_cross_over, 3000000,
1292             "What is the point where we cross over to the Linux-like TSO size setting");
1293         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1294             SYSCTL_CHILDREN(bbr_hptsi),
1295             OID_AUTO, "seg_deltarg", CTLFLAG_RW,
1296             &bbr_hptsi_segments_delay_tar, 7000,
1297             "What is the worst case delay target for hptsi < 48Mbps connections");
1298         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1299             SYSCTL_CHILDREN(bbr_hptsi),
1300             OID_AUTO, "enet_oh", CTLFLAG_RW,
1301             &bbr_include_enet_oh, 0,
1302             "Do we include the ethernet overhead in calculating pacing delay?");
1303         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1304             SYSCTL_CHILDREN(bbr_hptsi),
1305             OID_AUTO, "ip_oh", CTLFLAG_RW,
1306             &bbr_include_ip_oh, 1,
1307             "Do we include the IP overhead in calculating pacing delay?");
1308         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1309             SYSCTL_CHILDREN(bbr_hptsi),
1310             OID_AUTO, "tcp_oh", CTLFLAG_RW,
1311             &bbr_include_tcp_oh, 0,
1312             "Do we include the TCP overhead in calculating pacing delay?");
1313         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1314             SYSCTL_CHILDREN(bbr_hptsi),
1315             OID_AUTO, "google_discount", CTLFLAG_RW,
1316             &bbr_google_discount, 10,
1317             "What is the default google discount percentage wise for pacing (11 = 1.1%)?");
1318         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1319             SYSCTL_CHILDREN(bbr_hptsi),
1320             OID_AUTO, "all_get_min", CTLFLAG_RW,
1321             &bbr_all_get_min, 0,
1322             "If you are less than an MSS do you just get the min?");
1323         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1324             SYSCTL_CHILDREN(bbr_hptsi),
1325             OID_AUTO, "tso_min", CTLFLAG_RW,
1326             &bbr_hptsi_bytes_min, 1460,
1327             "For 0 -> 24Mbps what is the floor number of segments for TSO");
1328         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1329             SYSCTL_CHILDREN(bbr_hptsi),
1330             OID_AUTO, "seg_tso_max", CTLFLAG_RW,
1331             &bbr_hptsi_segments_max, 6,
1332             "For 0 -> 24Mbps what is the top number of segments for TSO");
1333         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1334             SYSCTL_CHILDREN(bbr_hptsi),
1335             OID_AUTO, "seg_floor", CTLFLAG_RW,
1336             &bbr_hptsi_segments_floor, 1,
1337             "Minimum TSO size in segments we will fall to");
1338         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1339             SYSCTL_CHILDREN(bbr_hptsi),
1340             OID_AUTO, "utter_max", CTLFLAG_RW,
1341             &bbr_hptsi_utter_max, 0,
1342             "The absolute maximum that any pacing (outside of hardware) can be");
1343         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1344             SYSCTL_CHILDREN(bbr_hptsi),
1345             OID_AUTO, "seg_divisor", CTLFLAG_RW,
1346             &bbr_hptsi_per_second, 100,
1347             "What is the divisor in our hptsi TSO calculation (for 24Mbps < X < 512Mbps)?");
1348         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1349             SYSCTL_CHILDREN(bbr_hptsi),
1350             OID_AUTO, "srtt_mul", CTLFLAG_RW,
1351             &bbr_hptsi_max_mul, 1,
1352             "The multiplier for pace len max");
1353         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1354             SYSCTL_CHILDREN(bbr_hptsi),
1355             OID_AUTO, "srtt_div", CTLFLAG_RW,
1356             &bbr_hptsi_max_div, 2,
1357             "The divisor for pace len max");
1358         /* Measurement controls */
1359         bbr_measure = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
1360             SYSCTL_CHILDREN(bbr_sysctl_root),
1361             OID_AUTO,
1362             "measure",
1363             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1364             "Measurement controls");
1365         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1366             SYSCTL_CHILDREN(bbr_measure),
1367             OID_AUTO, "min_i_bw", CTLFLAG_RW,
1368             &bbr_initial_bw_bps, 62500,
1369             "Minimum initial b/w in bytes per second");
1370         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1371             SYSCTL_CHILDREN(bbr_measure),
1372             OID_AUTO, "no_sack_needed", CTLFLAG_RW,
1373             &bbr_sack_not_required, 0,
1374             "Do we allow bbr to run on connections not supporting SACK?");
1375         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1376             SYSCTL_CHILDREN(bbr_measure),
1377             OID_AUTO, "use_google", CTLFLAG_RW,
1378             &bbr_use_google_algo, 0,
1379             "Do we run as close to google V1.0 as possible?");
1380         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1381             SYSCTL_CHILDREN(bbr_measure),
1382             OID_AUTO, "ts_limiting", CTLFLAG_RW,
1383             &bbr_ts_limiting, 1,
1384             "Do we attempt to use the peer's timestamp to limit b/w calculations?");
1385         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1386             SYSCTL_CHILDREN(bbr_measure),
1387             OID_AUTO, "ts_can_raise", CTLFLAG_RW,
1388             &bbr_ts_can_raise, 0,
1389             "Can we raise the b/w via timestamp b/w calculation?");
1390         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1391             SYSCTL_CHILDREN(bbr_measure),
1392             OID_AUTO, "ts_delta", CTLFLAG_RW,
1393             &bbr_min_usec_delta, 20000,
1394             "How long in usec between timestamps of our sends in the ts validation code?");
1395         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1396             SYSCTL_CHILDREN(bbr_measure),
1397             OID_AUTO, "ts_peer_delta", CTLFLAG_RW,
1398             &bbr_min_peer_delta, 20,
1399             "What min numerical value should be between the peer deltas?");
1400         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1401             SYSCTL_CHILDREN(bbr_measure),
1402             OID_AUTO, "ts_delta_percent", CTLFLAG_RW,
1403             &bbr_delta_percent, 150,
1404             "What percentage (150 = 15.0) do we allow variance for?");
1405         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1406             SYSCTL_CHILDREN(bbr_measure),
1407             OID_AUTO, "min_measure_good_bw", CTLFLAG_RW,
1408             &bbr_min_measurements_req, 1,
1409             "What is the minimum measurement count we need before we switch to our b/w estimate");
1410         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1411             SYSCTL_CHILDREN(bbr_measure),
1412             OID_AUTO, "min_measure_before_pace", CTLFLAG_RW,
1413             &bbr_no_pacing_until, 4,
1414             "How many pkt-epochs (0 is off) do we need before pacing is on?");
1415         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1416             SYSCTL_CHILDREN(bbr_measure),
1417             OID_AUTO, "quanta", CTLFLAG_RW,
1418             &bbr_quanta, 2,
1419             "Extra quanta to add when calculating the target (ID section 4.2.3.2).");
1420         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1421             SYSCTL_CHILDREN(bbr_measure),
1422             OID_AUTO, "noretran", CTLFLAG_RW,
1423             &bbr_no_retran, 0,
1424             "Should google mode not use retransmission measurements for the b/w estimation?");
1425         /* State controls */
1426         bbr_states = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
1427             SYSCTL_CHILDREN(bbr_sysctl_root),
1428             OID_AUTO,
1429             "states",
1430             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1431             "State controls");
1432         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1433             SYSCTL_CHILDREN(bbr_states),
1434             OID_AUTO, "idle_restart", CTLFLAG_RW,
1435             &bbr_uses_idle_restart, 0,
1436             "Do we use a new special idle_restart state to ramp back up quickly?");
1437         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1438             SYSCTL_CHILDREN(bbr_states),
1439             OID_AUTO, "idle_restart_threshold", CTLFLAG_RW,
1440             &bbr_idle_restart_threshold, 100000,
1441             "How long must we be idle before we restart?");
1442         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1443             SYSCTL_CHILDREN(bbr_states),
1444             OID_AUTO, "use_pkt_epoch", CTLFLAG_RW,
1445             &bbr_state_is_pkt_epoch, 0,
1446             "Do we use a pkt-epoch for substate (if 0, use rttProp)?");
1447         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1448             SYSCTL_CHILDREN(bbr_states),
1449             OID_AUTO, "startup_rtt_gain", CTLFLAG_RW,
1450             &bbr_rtt_gain_thresh, 0,
1451             "What increase in RTT triggers us to stop ignoring no-loss and possibly exit startup?");
1452         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1453             SYSCTL_CHILDREN(bbr_states),
1454             OID_AUTO, "drain_floor", CTLFLAG_RW,
1455             &bbr_drain_floor, 88,
1456             "What is the lowest we can drain (pg) to?");
1457         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1458             SYSCTL_CHILDREN(bbr_states),
1459             OID_AUTO, "drain_2_target", CTLFLAG_RW,
1460             &bbr_state_drain_2_tar, 1,
1461             "Do we drain to target in drain substate?");
1462         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1463             SYSCTL_CHILDREN(bbr_states),
1464             OID_AUTO, "gain_2_target", CTLFLAG_RW,
1465             &bbr_gain_to_target, 1,
1466             "Does probe bw gain to target?");
1467         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1468             SYSCTL_CHILDREN(bbr_states),
1469             OID_AUTO, "gain_extra_time", CTLFLAG_RW,
1470             &bbr_gain_gets_extra_too, 1,
1471             "Does probe bw gain get the extra time too?");
1472         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1473             SYSCTL_CHILDREN(bbr_states),
1474             OID_AUTO, "ld_div", CTLFLAG_RW,
1475             &bbr_drain_drop_div, 5,
1476             "Long drain drop divider?");
1477         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1478             SYSCTL_CHILDREN(bbr_states),
1479             OID_AUTO, "ld_mul", CTLFLAG_RW,
1480             &bbr_drain_drop_mul, 4,
1481             "Long drain drop multiplier?");
1482         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1483             SYSCTL_CHILDREN(bbr_states),
1484             OID_AUTO, "rand_ot_disc", CTLFLAG_RW,
1485             &bbr_rand_ot, 50,
1486             "Random discount of the ot?");
1487         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1488             SYSCTL_CHILDREN(bbr_states),
1489             OID_AUTO, "dr_filter_life", CTLFLAG_RW,
1490             &bbr_num_pktepo_for_del_limit, BBR_NUM_RTTS_FOR_DEL_LIMIT,
1491             "How many packet-epochs does the b/w delivery rate last?");
1492         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1493             SYSCTL_CHILDREN(bbr_states),
1494             OID_AUTO, "subdrain_applimited", CTLFLAG_RW,
1495             &bbr_sub_drain_app_limit, 0,
1496             "Does our sub-state drain invoke app limited if it's long?");
1497         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1498             SYSCTL_CHILDREN(bbr_states),
1499             OID_AUTO, "use_cwnd_subdrain", CTLFLAG_RW,
1500             &bbr_sub_drain_slam_cwnd, 0,
1501             "Should we set/recover cwnd for sub-state drain?");
1502         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1503             SYSCTL_CHILDREN(bbr_states),
1504             OID_AUTO, "use_cwnd_maindrain", CTLFLAG_RW,
1505             &bbr_slam_cwnd_in_main_drain, 0,
1506             "Should we set/recover cwnd for main-state drain?");
1507         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1508             SYSCTL_CHILDREN(bbr_states),
1509             OID_AUTO, "google_gets_earlyout", CTLFLAG_RW,
1510             &google_allow_early_out, 1,
1511             "Should we allow google probe-bw/drain to exit early at flight target?");
1512         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1513             SYSCTL_CHILDREN(bbr_states),
1514             OID_AUTO, "google_exit_loss", CTLFLAG_RW,
1515             &google_consider_lost, 1,
1516             "Should we have losses exit gain of probe-bw in google mode?");
1517         /* Startup controls */
1518         bbr_startup = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
1519             SYSCTL_CHILDREN(bbr_sysctl_root),
1520             OID_AUTO,
1521             "startup",
1522             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1523             "Startup controls");
1524         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1525             SYSCTL_CHILDREN(bbr_startup),
1526             OID_AUTO, "cheat_iwnd", CTLFLAG_RW,
1527             &bbr_sends_full_iwnd, 1,
1528             "Do we not pace but burst out the initial window as our TSO size?");
1529         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1530             SYSCTL_CHILDREN(bbr_startup),
1531             OID_AUTO, "loss_threshold", CTLFLAG_RW,
1532             &bbr_startup_loss_thresh, 2000,
1533             "In startup what is the loss threshold in a pkt-epoch that will exit us from startup?");
1534         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1535             SYSCTL_CHILDREN(bbr_startup),
1536             OID_AUTO, "use_lowerpg", CTLFLAG_RW,
1537             &bbr_use_lower_gain_in_startup, 1,
1538             "Should we use a lower hptsi gain if we see loss in startup?");
1539         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1540             SYSCTL_CHILDREN(bbr_startup),
1541             OID_AUTO, "gain", CTLFLAG_RW,
1542             &bbr_start_exit, 25,
1543             "What gain percent do we need to see to stay in startup?");
1544         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1545             SYSCTL_CHILDREN(bbr_startup),
1546             OID_AUTO, "low_gain", CTLFLAG_RW,
1547             &bbr_low_start_exit, 15,
1548             "What gain percent do we need to see to stay in the lower gain startup?");
1549         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1550             SYSCTL_CHILDREN(bbr_startup),
1551             OID_AUTO, "loss_exit", CTLFLAG_RW,
1552             &bbr_exit_startup_at_loss, 1,
1553             "Should we exit startup at loss in an epoch if we are not gaining?");
1554         /* CWND controls */
1555         bbr_cwnd = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
1556             SYSCTL_CHILDREN(bbr_sysctl_root),
1557             OID_AUTO,
1558             "cwnd",
1559             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1560             "Cwnd controls");
1561         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1562             SYSCTL_CHILDREN(bbr_cwnd),
1563             OID_AUTO, "tar_rtt", CTLFLAG_RW,
1564             &bbr_cwndtarget_rtt_touse, 0,
1565             "Target cwnd rtt measurement to use (0=rtt_prop, 1=rtt_rack, 2=pkt_rtt, 3=srtt)?");
1566         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1567             SYSCTL_CHILDREN(bbr_cwnd),
1568             OID_AUTO, "may_shrink", CTLFLAG_RW,
1569             &bbr_cwnd_may_shrink, 0,
1570             "Can the cwnd shrink if it would grow to more than the target?");
1571         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1572             SYSCTL_CHILDREN(bbr_cwnd),
1573             OID_AUTO, "max_target_limit", CTLFLAG_RW,
1574             &bbr_target_cwnd_mult_limit, 8,
1575             "Do we limit the cwnd to some multiple of the cwnd target if cwnd can't shrink (0=no)?");
1576         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1577             SYSCTL_CHILDREN(bbr_cwnd),
1578             OID_AUTO, "highspeed_min", CTLFLAG_RW,
1579             &bbr_cwnd_min_val_hs, BBR_HIGHSPEED_NUM_MSS,
1580             "What is the high-speed min cwnd (rttProp under 1ms)");
1581         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1582             SYSCTL_CHILDREN(bbr_cwnd),
1583             OID_AUTO, "lowspeed_min", CTLFLAG_RW,
1584             &bbr_cwnd_min_val, BBR_PROBERTT_NUM_MSS,
1585             "What is the min cwnd (rttProp > 1ms)");
1586         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1587             SYSCTL_CHILDREN(bbr_cwnd),
1588             OID_AUTO, "initwin", CTLFLAG_RW,
1589             &bbr_def_init_win, 10,
1590             "What is the BBR initial window, if 0 use tcp version");
1591         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1592             SYSCTL_CHILDREN(bbr_cwnd),
1593             OID_AUTO, "do_loss_red", CTLFLAG_RW,
1594             &bbr_do_red, 600,
1595             "Do we reduce the b/w at exit from recovery based on ratio of prop/srtt (800=80.0, 0=off)?");
1596         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1597             SYSCTL_CHILDREN(bbr_cwnd),
1598             OID_AUTO, "red_scale", CTLFLAG_RW,
1599             &bbr_red_scale, 20000,
1600             "What RTT do we scale with?");
1601         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1602             SYSCTL_CHILDREN(bbr_cwnd),
1603             OID_AUTO, "red_growslow", CTLFLAG_RW,
1604             &bbr_red_growth_restrict, 1,
1605             "Do we restrict cwnd growth for what's in flight?");
1606         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1607             SYSCTL_CHILDREN(bbr_cwnd),
1608             OID_AUTO, "red_div", CTLFLAG_RW,
1609             &bbr_red_div, 2,
1610             "If we reduce, what's the divisor?");
1611         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1612             SYSCTL_CHILDREN(bbr_cwnd),
1613             OID_AUTO, "red_mul", CTLFLAG_RW,
1614             &bbr_red_mul, 1,
1615             "If we reduce, what's the multiplier?");
1616         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1617             SYSCTL_CHILDREN(bbr_cwnd),
1618             OID_AUTO, "target_is_unit", CTLFLAG_RW,
1619             &bbr_target_is_bbunit, 0,
1620             "Is the state target the pacing_gain or BBR_UNIT?");
1621         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1622             SYSCTL_CHILDREN(bbr_cwnd),
1623             OID_AUTO, "drop_limit", CTLFLAG_RW,
1624             &bbr_drop_limit, 0,
1625             "Number of segments limit for drop (0=use min_cwnd w/flight)?");
1626
1627         /* Timeout controls */
1628         bbr_timeout = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
1629             SYSCTL_CHILDREN(bbr_sysctl_root),
1630             OID_AUTO,
1631             "timeout",
1632             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1633             "Time out controls");
1634         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1635             SYSCTL_CHILDREN(bbr_timeout),
1636             OID_AUTO, "delack", CTLFLAG_RW,
1637             &bbr_delack_time, 100000,
1638             "BBR's delayed ack time");
1639         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1640             SYSCTL_CHILDREN(bbr_timeout),
1641             OID_AUTO, "tlp_uses", CTLFLAG_RW,
1642             &bbr_tlp_type_to_use, 3,
1643             "RTT that TLP uses in its calculations, 0=rttProp, 1=Rack_rtt, 2=pkt_rtt and 3=srtt");
1644         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1645             SYSCTL_CHILDREN(bbr_timeout),
1646             OID_AUTO, "persmin", CTLFLAG_RW,
1647             &bbr_persist_min, 250000,
1648             "What is the minimum time in microseconds between persists");
1649         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1650             SYSCTL_CHILDREN(bbr_timeout),
1651             OID_AUTO, "persmax", CTLFLAG_RW,
1652             &bbr_persist_max, 1000000,
1653             "What is the largest delay in microseconds between persists");
1654         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1655             SYSCTL_CHILDREN(bbr_timeout),
1656             OID_AUTO, "tlp_minto", CTLFLAG_RW,
1657             &bbr_tlp_min, 10000,
1658             "TLP Min timeout in usecs");
1659         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1660             SYSCTL_CHILDREN(bbr_timeout),
1661             OID_AUTO, "tlp_dack_time", CTLFLAG_RW,
1662             &bbr_delayed_ack_time, 200000,
1663             "TLP delayed ack compensation value");
1664         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1665             SYSCTL_CHILDREN(bbr_timeout),
1666             OID_AUTO, "minrto", CTLFLAG_RW,
1667             &bbr_rto_min_ms, 30,
1668             "Minimum RTO in ms");
1669         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1670             SYSCTL_CHILDREN(bbr_timeout),
1671             OID_AUTO, "maxrto", CTLFLAG_RW,
1672             &bbr_rto_max_sec, 4,
1673             "Maximum RTO in seconds -- should be at least as large as min_rto");
1674         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1675             SYSCTL_CHILDREN(bbr_timeout),
1676             OID_AUTO, "tlp_retry", CTLFLAG_RW,
1677             &bbr_tlp_max_resend, 2,
1678             "How many times does TLP retry a single segment or multiple with no ACK");
1679         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1680             SYSCTL_CHILDREN(bbr_timeout),
1681             OID_AUTO, "minto", CTLFLAG_RW,
1682             &bbr_min_to, 1000,
1683             "Minimum rack timeout in useconds");
1684         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1685             SYSCTL_CHILDREN(bbr_timeout),
1686             OID_AUTO, "pktdelay", CTLFLAG_RW,
1687             &bbr_pkt_delay, 1000,
1688             "Extra RACK time (in useconds) besides reordering thresh");
1689         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1690             SYSCTL_CHILDREN(bbr_timeout),
1691             OID_AUTO, "incr_tmrs", CTLFLAG_RW,
1692             &bbr_incr_timers, 1,
1693             "Increase the RXT/TLP timer by the pacing time used?");
1694         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1695             SYSCTL_CHILDREN(bbr_timeout),
1696             OID_AUTO, "rxtmark_sackpassed", CTLFLAG_RW,
1697             &bbr_marks_rxt_sack_passed, 0,
1698             "Mark sack passed on all those not ack'd when a RXT hits?");
1699         /* Policer controls */
1700         bbr_policer = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
1701             SYSCTL_CHILDREN(bbr_sysctl_root),
1702             OID_AUTO,
1703             "policer",
1704             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1705             "Policer controls");
1706         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1707             SYSCTL_CHILDREN(bbr_policer),
1708             OID_AUTO, "detect_enable", CTLFLAG_RW,
1709             &bbr_policer_detection_enabled, 1,
1710             "Is policer detection enabled?");
1711         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1712             SYSCTL_CHILDREN(bbr_policer),
1713             OID_AUTO, "min_pes", CTLFLAG_RW,
1714             &bbr_lt_intvl_min_rtts, 4,
1715             "Minimum number of PE's?");
1716         SYSCTL_ADD_U64(&bbr_sysctl_ctx,
1717             SYSCTL_CHILDREN(bbr_policer),
1718             OID_AUTO, "bwdiff", CTLFLAG_RW,
1719             &bbr_lt_bw_diff, (4000/8),
1720             "Minimal bw diff?");
1721         SYSCTL_ADD_U64(&bbr_sysctl_ctx,
1722             SYSCTL_CHILDREN(bbr_policer),
1723             OID_AUTO, "bwratio", CTLFLAG_RW,
1724             &bbr_lt_bw_ratio, 8,
1725             "Minimal bw ratio?");
1726         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1727             SYSCTL_CHILDREN(bbr_policer),
1728             OID_AUTO, "from_rack_rxt", CTLFLAG_RW,
1729             &bbr_policer_call_from_rack_to, 0,
1730             "Do we call the policer detection code from a rack-timeout?");
1731         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1732             SYSCTL_CHILDREN(bbr_policer),
1733             OID_AUTO, "false_postive", CTLFLAG_RW,
1734             &bbr_lt_intvl_fp, 0,
1735             "What packet epoch do we do false-positive detection at (0=no)?");
1736         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1737             SYSCTL_CHILDREN(bbr_policer),
1738             OID_AUTO, "loss_thresh", CTLFLAG_RW,
1739             &bbr_lt_loss_thresh, 196,
1740             "Loss threshold (196 = 19.6%)?");
1741         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1742             SYSCTL_CHILDREN(bbr_policer),
1743             OID_AUTO, "false_postive_thresh", CTLFLAG_RW,
1744             &bbr_lt_fd_thresh, 100,
1745             "What percentage is the false detection threshold (150=15.0)?");
1746         /* All the rest */
1747         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1748             SYSCTL_CHILDREN(bbr_sysctl_root),
1749             OID_AUTO, "cheat_rxt", CTLFLAG_RW,
1750             &bbr_use_rack_resend_cheat, 0,
1751             "Do we burst 1ms between sends on retransmissions (like rack)?");
1752         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1753             SYSCTL_CHILDREN(bbr_sysctl_root),
1754             OID_AUTO, "error_paceout", CTLFLAG_RW,
1755             &bbr_error_base_paceout, 10000,
1756             "When we hit an error what is the min to pace out in usecs?");
1757         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1758             SYSCTL_CHILDREN(bbr_sysctl_root),
1759             OID_AUTO, "kill_paceout", CTLFLAG_RW,
1760             &bbr_max_net_error_cnt, 10,
1761             "When we hit this many errors in a row, kill the session?");
1762         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1763             SYSCTL_CHILDREN(bbr_sysctl_root),
1764             OID_AUTO, "data_after_close", CTLFLAG_RW,
1765             &bbr_ignore_data_after_close, 1,
1766             "Do we hold off sending a RST until all pending data is ack'd");
1767         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1768             SYSCTL_CHILDREN(bbr_sysctl_root),
1769             OID_AUTO, "resend_use_tso", CTLFLAG_RW,
1770             &bbr_resends_use_tso, 0,
1771             "Can resends use TSO?");
1772         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1773             SYSCTL_CHILDREN(bbr_sysctl_root),
1774             OID_AUTO, "sblklimit", CTLFLAG_RW,
1775             &bbr_sack_block_limit, 128,
1776             "When do we start ignoring small sack blocks");
1777         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1778             SYSCTL_CHILDREN(bbr_sysctl_root),
1779             OID_AUTO, "bb_verbose", CTLFLAG_RW,
1780             &bbr_verbose_logging, 0,
1781             "Should BBR black box logging be verbose");
1782         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1783             SYSCTL_CHILDREN(bbr_sysctl_root),
1784             OID_AUTO, "reorder_thresh", CTLFLAG_RW,
1785             &bbr_reorder_thresh, 2,
1786             "What factor for rack will be added when seeing reordering (shift right)");
1787         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1788             SYSCTL_CHILDREN(bbr_sysctl_root),
1789             OID_AUTO, "reorder_fade", CTLFLAG_RW,
1790             &bbr_reorder_fade, 0,
1791             "Does reorder detection fade, if so how many ms (0 means never)");
1792         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1793             SYSCTL_CHILDREN(bbr_sysctl_root),
1794             OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
1795             &bbr_tlp_thresh, 1,
1796             "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
1797         /* Stats and counters */
1798         /* The pacing counters for hdwr/software can't be in the array */
1799         bbr_nohdwr_pacing_enobuf = counter_u64_alloc(M_WAITOK);
1800         bbr_hdwr_pacing_enobuf = counter_u64_alloc(M_WAITOK);
1801         SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
1802             SYSCTL_CHILDREN(bbr_sysctl_root),
1803             OID_AUTO, "enob_hdwr_pacing", CTLFLAG_RD,
1804             &bbr_hdwr_pacing_enobuf,
1805             "Total number of enobufs for hardware paced flows");
1806         SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
1807             SYSCTL_CHILDREN(bbr_sysctl_root),
1808             OID_AUTO, "enob_no_hdwr_pacing", CTLFLAG_RD,
1809             &bbr_nohdwr_pacing_enobuf,
1810             "Total number of enobufs for non-hardware paced flows");
1811
1812
1813         bbr_flows_whdwr_pacing = counter_u64_alloc(M_WAITOK);
1814         SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
1815             SYSCTL_CHILDREN(bbr_sysctl_root),
1816             OID_AUTO, "hdwr_pacing", CTLFLAG_RD,
1817             &bbr_flows_whdwr_pacing,
1818             "Total number of hardware paced flows");
1819         bbr_flows_nohdwr_pacing = counter_u64_alloc(M_WAITOK);
1820         SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
1821             SYSCTL_CHILDREN(bbr_sysctl_root),
1822             OID_AUTO, "software_pacing", CTLFLAG_RD,
1823             &bbr_flows_nohdwr_pacing,
1824             "Total number of software paced flows");
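        /*
         * The remaining stats live in counter(9) arrays: general BBR
         * stats, option-processing counts, per-state loss/resend/time
         * accounting, and output-size accounting.
         */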
1825         COUNTER_ARRAY_ALLOC(bbr_stat_arry, BBR_STAT_SIZE, M_WAITOK);
1826         SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
1827             OID_AUTO, "stats", CTLFLAG_RD,
1828             bbr_stat_arry, BBR_STAT_SIZE, "BBR Stats");
1829         COUNTER_ARRAY_ALLOC(bbr_opts_arry, BBR_OPTS_SIZE, M_WAITOK);
1830         SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
1831             OID_AUTO, "opts", CTLFLAG_RD,
1832             bbr_opts_arry, BBR_OPTS_SIZE, "BBR Option Stats");
1833         COUNTER_ARRAY_ALLOC(bbr_state_lost, BBR_MAX_STAT, M_WAITOK);
1834         SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
1835             OID_AUTO, "lost", CTLFLAG_RD,
1836             bbr_state_lost, BBR_MAX_STAT, "Stats of when losses occur");
1837         COUNTER_ARRAY_ALLOC(bbr_state_resend, BBR_MAX_STAT, M_WAITOK);
1838         SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
1839             OID_AUTO, "stateresend", CTLFLAG_RD,
1840             bbr_state_resend, BBR_MAX_STAT, "Stats of what states resend");
1841         COUNTER_ARRAY_ALLOC(bbr_state_time, BBR_MAX_STAT, M_WAITOK);
1842         SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
1843             OID_AUTO, "statetime", CTLFLAG_RD,
1844             bbr_state_time, BBR_MAX_STAT, "Stats of time spent in the states");
1845         COUNTER_ARRAY_ALLOC(bbr_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
1846         SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
1847             OID_AUTO, "outsize", CTLFLAG_RD,
1848             bbr_out_size, TCP_MSS_ACCT_SIZE, "Size of output calls");
1849         SYSCTL_ADD_PROC(&bbr_sysctl_ctx,
1850             SYSCTL_CHILDREN(bbr_sysctl_root),
1851             OID_AUTO, "clrlost", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1852             &bbr_clear_lost, 0, sysctl_bbr_clear_lost, "IU", "Clear lost counters");
1853 }
1854
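/*
 * The OIDs above all hang off the BBR root node created earlier in this
 * function.  Assuming the root registers as net.inet.tcp.bbr (as the
 * node and OID names above suggest), a knob such as "mintime" under the
 * probertt node can be tuned with sysctl(8), e.g.:
 *
 *   sysctl net.inet.tcp.bbr.probertt.mintime=200000
 */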
1855 static void
1856 bbr_counter_destroy(void)
1857 {
1858         COUNTER_ARRAY_FREE(bbr_stat_arry, BBR_STAT_SIZE);
1859         COUNTER_ARRAY_FREE(bbr_opts_arry, BBR_OPTS_SIZE);
1860         COUNTER_ARRAY_FREE(bbr_out_size, TCP_MSS_ACCT_SIZE);
1861         COUNTER_ARRAY_FREE(bbr_state_lost, BBR_MAX_STAT);
1862         COUNTER_ARRAY_FREE(bbr_state_time, BBR_MAX_STAT);
1863         COUNTER_ARRAY_FREE(bbr_state_resend, BBR_MAX_STAT);
1864         counter_u64_free(bbr_nohdwr_pacing_enobuf);
1865         counter_u64_free(bbr_hdwr_pacing_enobuf);
1866         counter_u64_free(bbr_flows_whdwr_pacing);
1867         counter_u64_free(bbr_flows_nohdwr_pacing);
1868
1869 }
1870
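/*
 * Snapshot the common BBR state into the stack-specific log union.
 * Note: l points at the u_bbr member of union tcp_log_stackspecific;
 * since union members all live at offset 0, clearing sizeof(the union)
 * through l is intentional.  Callers overlay the per-event flex fields
 * afterwards.
 */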
1871 static __inline void
1872 bbr_fill_in_logging_data(struct tcp_bbr *bbr, struct tcp_log_bbr *l, uint32_t cts)
1873 {
1874         memset(l, 0, sizeof(union tcp_log_stackspecific));
1875         l->cur_del_rate = bbr->r_ctl.rc_bbr_cur_del_rate;
1876         l->delRate = get_filter_value(&bbr->r_ctl.rc_delrate);
1877         l->rttProp = get_filter_value_small(&bbr->r_ctl.rc_rttprop);
1878         l->bw_inuse = bbr_get_bw(bbr);
1879         l->inflight = ctf_flight_size(bbr->rc_tp,
1880                           (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
1881         l->applimited = bbr->r_ctl.r_app_limited_until;
1882         l->delivered = bbr->r_ctl.rc_delivered;
1883         l->timeStamp = cts;
1884         l->lost = bbr->r_ctl.rc_lost;
1885         l->bbr_state = bbr->rc_bbr_state;
1886         l->bbr_substate = bbr_state_val(bbr);
1887         l->epoch = bbr->r_ctl.rc_rtt_epoch;
1888         l->lt_epoch = bbr->r_ctl.rc_lt_epoch;
1889         l->pacing_gain = bbr->r_ctl.rc_bbr_hptsi_gain;
1890         l->cwnd_gain = bbr->r_ctl.rc_bbr_cwnd_gain;
1891         l->inhpts = bbr->rc_inp->inp_in_hpts;
1892         l->ininput = bbr->rc_inp->inp_in_input;
1893         l->use_lt_bw = bbr->rc_lt_use_bw;
1894         l->pkts_out = bbr->r_ctl.rc_flight_at_input;
1895         l->pkt_epoch = bbr->r_ctl.rc_pkt_epoch;
1896 }
1897
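/*
 * Each bbr_log_* helper below follows the same pattern: skip unless
 * black box logging is enabled on the connection, fill the common
 * fields via bbr_fill_in_logging_data(), overlay event-specific values
 * in the flex fields, and emit the record with TCP_LOG_EVENTP().
 */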
1898 static void
1899 bbr_log_type_bw_reduce(struct tcp_bbr *bbr, int reason)
1900 {
1901         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1902                 union tcp_log_stackspecific log;
1903
1904                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
1905                 log.u_bbr.flex1 = 0;
1906                 log.u_bbr.flex2 = 0;
1907                 log.u_bbr.flex5 = 0;
1908                 log.u_bbr.flex3 = 0;
1909                 log.u_bbr.flex4 = bbr->r_ctl.rc_pkt_epoch_loss_rate;
1910                 log.u_bbr.flex7 = reason;
1911                 log.u_bbr.flex6 = bbr->r_ctl.rc_bbr_enters_probertt;
1912                 log.u_bbr.flex8 = 0;
1913                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
1914                     &bbr->rc_inp->inp_socket->so_rcv,
1915                     &bbr->rc_inp->inp_socket->so_snd,
1916                     BBR_LOG_BW_RED_EV, 0,
1917                     0, &log, false, &bbr->rc_tv);
1918         }
1919 }
1920
1921 static void
1922 bbr_log_type_rwnd_collapse(struct tcp_bbr *bbr, int seq, int mode, uint32_t count)
1923 {
1924         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1925                 union tcp_log_stackspecific log;
1926
1927                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
1928                 log.u_bbr.flex1 = seq;
1929                 log.u_bbr.flex2 = count;
1930                 log.u_bbr.flex8 = mode;
1931                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
1932                     &bbr->rc_inp->inp_socket->so_rcv,
1933                     &bbr->rc_inp->inp_socket->so_snd,
1934                     BBR_LOG_LOWGAIN, 0,
1935                     0, &log, false, &bbr->rc_tv);
1936         }
1937 }
1938
1939
1940
1941 static void
1942 bbr_log_type_just_return(struct tcp_bbr *bbr, uint32_t cts, uint32_t tlen, uint8_t hpts_calling,
1943     uint8_t reason, uint32_t p_maxseg, int len)
1944 {
1945         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1946                 union tcp_log_stackspecific log;
1947
1948                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
1949                 log.u_bbr.flex1 = p_maxseg;
1950                 log.u_bbr.flex2 = bbr->r_ctl.rc_hpts_flags;
1951                 log.u_bbr.flex3 = bbr->r_ctl.rc_timer_exp;
1952                 log.u_bbr.flex4 = reason;
1953                 log.u_bbr.flex5 = bbr->rc_in_persist;
1954                 log.u_bbr.flex6 = bbr->r_ctl.rc_last_delay_val;
1955                 log.u_bbr.flex7 = p_maxseg;
1956                 log.u_bbr.flex8 = bbr->rc_in_persist;
1957                 log.u_bbr.pkts_out = 0;
1958                 log.u_bbr.applimited = len;
1959                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
1960                     &bbr->rc_inp->inp_socket->so_rcv,
1961                     &bbr->rc_inp->inp_socket->so_snd,
1962                     BBR_LOG_JUSTRET, 0,
1963                     tlen, &log, false, &bbr->rc_tv);
1964         }
1965 }
1966
1967
1968 static void
1969 bbr_log_type_enter_rec(struct tcp_bbr *bbr, uint32_t seq)
1970 {
1971         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1972                 union tcp_log_stackspecific log;
1973
1974                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
1975                 log.u_bbr.flex1 = seq;
1976                 log.u_bbr.flex2 = bbr->r_ctl.rc_cwnd_on_ent;
1977                 log.u_bbr.flex3 = bbr->r_ctl.rc_recovery_start;
1978                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
1979                     &bbr->rc_inp->inp_socket->so_rcv,
1980                     &bbr->rc_inp->inp_socket->so_snd,
1981                     BBR_LOG_ENTREC, 0,
1982                     0, &log, false, &bbr->rc_tv);
1983         }
1984 }
1985
1986 static void
1987 bbr_log_msgsize_fail(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t len, uint32_t maxseg, uint32_t mtu, int32_t csum_flags, int32_t tso, uint32_t cts)
1988 {
1989         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
1990                 union tcp_log_stackspecific log;
1991
1992                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
1993                 log.u_bbr.flex1 = tso;
1994                 log.u_bbr.flex2 = maxseg;
1995                 log.u_bbr.flex3 = mtu;
1996                 log.u_bbr.flex4 = csum_flags;
1997                 TCP_LOG_EVENTP(tp, NULL,
1998                     &bbr->rc_inp->inp_socket->so_rcv,
1999                     &bbr->rc_inp->inp_socket->so_snd,
2000                     BBR_LOG_MSGSIZE, 0,
2001                     0, &log, false, &bbr->rc_tv);
2002         }
2003 }
2004
2005 static void
2006 bbr_log_flowend(struct tcp_bbr *bbr)
2007 {
2008         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2009                 union tcp_log_stackspecific log;
2010                 struct sockbuf *r, *s;
2011                 struct timeval tv;
2012
2013                 if (bbr->rc_inp->inp_socket) {
2014                         r = &bbr->rc_inp->inp_socket->so_rcv;
2015                         s = &bbr->rc_inp->inp_socket->so_snd;
2016                 } else {
2017                         r = s = NULL;
2018                 }
2019                 bbr_fill_in_logging_data(bbr, &log.u_bbr, tcp_get_usecs(&tv));
2020                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2021                     r, s,
2022                     TCP_LOG_FLOWEND, 0,
2023                     0, &log, false, &tv);
2024         }
2025 }
2026
2027 static void
2028 bbr_log_pkt_epoch(struct tcp_bbr *bbr, uint32_t cts, uint32_t line,
2029     uint32_t lost, uint32_t del)
2030 {
2031         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2032                 union tcp_log_stackspecific log;
2033
2034                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2035                 log.u_bbr.flex1 = lost;
2036                 log.u_bbr.flex2 = del;
2037                 log.u_bbr.flex3 = bbr->r_ctl.rc_bbr_lastbtlbw;
2038                 log.u_bbr.flex4 = bbr->r_ctl.rc_pkt_epoch_rtt;
2039                 log.u_bbr.flex5 = bbr->r_ctl.rc_bbr_last_startup_epoch;
2040                 log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup;
2041                 log.u_bbr.flex7 = line;
2042                 log.u_bbr.flex8 = 0;
2043                 log.u_bbr.inflight = bbr->r_ctl.r_measurement_count;
2044                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2045                     &bbr->rc_inp->inp_socket->so_rcv,
2046                     &bbr->rc_inp->inp_socket->so_snd,
2047                     BBR_LOG_PKT_EPOCH, 0,
2048                     0, &log, false, &bbr->rc_tv);
2049         }
2050 }
2051
2052 static void
2053 bbr_log_time_epoch(struct tcp_bbr *bbr, uint32_t cts, uint32_t line, uint32_t epoch_time)
2054 {
2055         if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2056                 union tcp_log_stackspecific log;
2057
2058                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2059                 log.u_bbr.flex1 = bbr->r_ctl.rc_lost;
2060                 log.u_bbr.flex2 = bbr->rc_inp->inp_socket->so_snd.sb_lowat;
2061                 log.u_bbr.flex3 = bbr->rc_inp->inp_socket->so_snd.sb_hiwat;
2062                 log.u_bbr.flex7 = line;
2063                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2064                     &bbr->rc_inp->inp_socket->so_rcv,
2065                     &bbr->rc_inp->inp_socket->so_snd,
2066                     BBR_LOG_TIME_EPOCH, 0,
2067                     0, &log, false, &bbr->rc_tv);
2068         }
2069 }
2070
2071 static void
2072 bbr_log_set_of_state_target(struct tcp_bbr *bbr, uint32_t new_tar, int line, int meth)
2073 {
2074         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2075                 union tcp_log_stackspecific log;
2076
2077                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
2078                 log.u_bbr.flex1 = bbr->r_ctl.rc_target_at_state;
2079                 log.u_bbr.flex2 = new_tar;
2080                 log.u_bbr.flex3 = line;
2081                 log.u_bbr.flex4 = bbr->r_ctl.rc_pace_max_segs;
2082                 log.u_bbr.flex5 = bbr_quanta;
2083                 log.u_bbr.flex6 = bbr->r_ctl.rc_pace_min_segs;
2084                 log.u_bbr.flex7 = bbr->rc_last_options;
2085                 log.u_bbr.flex8 = meth;
2086                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2087                     &bbr->rc_inp->inp_socket->so_rcv,
2088                     &bbr->rc_inp->inp_socket->so_snd,
2089                     BBR_LOG_STATE_TARGET, 0,
2090                     0, &log, false, &bbr->rc_tv);
2091         }
2092
2093 }
2094
2095 static void
2096 bbr_log_type_statechange(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
2097 {
2098         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2099                 union tcp_log_stackspecific log;
2100
2101                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2102                 log.u_bbr.flex1 = line;
2103                 log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks;
2104                 log.u_bbr.flex3 = bbr->r_ctl.rc_probertt_int;
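                /*
                 * Log the rtt measure matching how substates advance:
                 * the packet rtt when we step per pkt-epoch, otherwise
                 * the rttProp filter value.
                 */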
2105                 if (bbr_state_is_pkt_epoch)
2106                         log.u_bbr.flex4 = bbr_get_rtt(bbr, BBR_RTT_PKTRTT);
2107                 else
2108                         log.u_bbr.flex4 = bbr_get_rtt(bbr, BBR_RTT_PROP);
2109                 log.u_bbr.flex5 = bbr->r_ctl.rc_bbr_last_startup_epoch;
2110                 log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup;
2111                 log.u_bbr.flex7 = (bbr->r_ctl.rc_target_at_state/1000);
2112                 log.u_bbr.lt_epoch = bbr->r_ctl.rc_level_state_extra;
2113                 log.u_bbr.pkts_out = bbr->r_ctl.rc_target_at_state;
2114                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2115                     &bbr->rc_inp->inp_socket->so_rcv,
2116                     &bbr->rc_inp->inp_socket->so_snd,
2117                     BBR_LOG_STATE, 0,
2118                     0, &log, false, &bbr->rc_tv);
2119         }
2120 }
2121
2122 static void
2123 bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied,
2124                     uint32_t rtt, uint32_t line, uint8_t reas, uint16_t cond)
2125 {
2126         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2127                 union tcp_log_stackspecific log;
2128
2129                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2130                 log.u_bbr.flex1 = line;
2131                 log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks;
2132                 log.u_bbr.flex3 = bbr->r_ctl.last_in_probertt;
2133                 log.u_bbr.flex4 = applied;
2134                 log.u_bbr.flex5 = rtt;
2135                 log.u_bbr.flex6 = bbr->r_ctl.rc_target_at_state;
2136                 log.u_bbr.flex7 = cond;
2137                 log.u_bbr.flex8 = reas;
2138                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2139                     &bbr->rc_inp->inp_socket->so_rcv,
2140                     &bbr->rc_inp->inp_socket->so_snd,
2141                     BBR_LOG_RTT_SHRINKS, 0,
2142                     0, &log, false, &bbr->rc_tv);
2143         }
2144 }
2145
2146 static void
2147 bbr_log_type_exit_rec(struct tcp_bbr *bbr)
2148 {
2149         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2150                 union tcp_log_stackspecific log;
2151
2152                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
2153                 log.u_bbr.flex1 = bbr->r_ctl.rc_recovery_start;
2154                 log.u_bbr.flex2 = bbr->r_ctl.rc_cwnd_on_ent;
2155                 log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
2156                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2157                     &bbr->rc_inp->inp_socket->so_rcv,
2158                     &bbr->rc_inp->inp_socket->so_snd,
2159                     BBR_LOG_EXITREC, 0,
2160                     0, &log, false, &bbr->rc_tv);
2161         }
2162 }
2163
2164 static void
2165 bbr_log_type_cwndupd(struct tcp_bbr *bbr, uint32_t bytes_this_ack, uint32_t chg,
2166     uint32_t prev_acked, int32_t meth, uint32_t target, uint32_t th_ack, int32_t line)
2167 {
2168         if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2169                 union tcp_log_stackspecific log;
2170
2171                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
2172                 log.u_bbr.flex1 = line;
2173                 log.u_bbr.flex2 = prev_acked;
2174                 log.u_bbr.flex3 = bytes_this_ack;
2175                 log.u_bbr.flex4 = chg;
2176                 log.u_bbr.flex5 = th_ack;
2177                 log.u_bbr.flex6 = target;
2178                 log.u_bbr.flex8 = meth;
2179                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2180                     &bbr->rc_inp->inp_socket->so_rcv,
2181                     &bbr->rc_inp->inp_socket->so_snd,
2182                     BBR_LOG_CWND, 0,
2183                     0, &log, false, &bbr->rc_tv);
2184         }
2185 }
2186
2187 static void
2188 bbr_log_rtt_sample(struct tcp_bbr *bbr, uint32_t rtt, uint32_t tsin)
2189 {
2190         /*
2191          * Log the rtt sample we are applying to the srtt algorithm in
2192          * useconds.
2193          */
2194         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2195                 union tcp_log_stackspecific log;
2196
2197                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
2198                 log.u_bbr.flex1 = rtt;
2199                 log.u_bbr.flex2 = bbr->r_ctl.rc_bbr_state_time;
2200                 log.u_bbr.flex3 = bbr->r_ctl.rc_ack_hdwr_delay;
2201                 log.u_bbr.flex4 = bbr->rc_tp->ts_offset;
2202                 log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
2203                 log.u_bbr.pkts_out = tcp_tv_to_mssectick(&bbr->rc_tv);
2204                 log.u_bbr.flex6 = tsin;
2205                 log.u_bbr.flex7 = 0;
2206                 log.u_bbr.flex8 = bbr->rc_ack_was_delayed;
2207                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2208                     &bbr->rc_inp->inp_socket->so_rcv,
2209                     &bbr->rc_inp->inp_socket->so_snd,
2210                     TCP_LOG_RTT, 0,
2211                     0, &log, false, &bbr->rc_tv);
2212         }
2213 }
2214
2215 static void
2216 bbr_log_type_pesist(struct tcp_bbr *bbr, uint32_t cts, uint32_t time_in, int32_t line, uint8_t enter_exit)
2217 {
2218         if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2219                 union tcp_log_stackspecific log;
2220
2221                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2222                 log.u_bbr.flex1 = time_in;
2223                 log.u_bbr.flex2 = line;
2224                 log.u_bbr.flex8 = enter_exit;
2225                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2226                     &bbr->rc_inp->inp_socket->so_rcv,
2227                     &bbr->rc_inp->inp_socket->so_snd,
2228                     BBR_LOG_PERSIST, 0,
2229                     0, &log, false, &bbr->rc_tv);
2230         }
2231 }
2232 static void
2233 bbr_log_ack_clear(struct tcp_bbr *bbr, uint32_t cts)
2234 {
2235         if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2236                 union tcp_log_stackspecific log;
2237
2238                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2239                 log.u_bbr.flex1 = bbr->rc_tp->ts_recent_age;
2240                 log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks;
2241                 log.u_bbr.flex3 = bbr->r_ctl.rc_probertt_int;
2242                 log.u_bbr.flex4 = bbr->r_ctl.rc_went_idle_time;
2243                 log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
2244                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2245                     &bbr->rc_inp->inp_socket->so_rcv,
2246                     &bbr->rc_inp->inp_socket->so_snd,
2247                     BBR_LOG_ACKCLEAR, 0,
2248                     0, &log, false, &bbr->rc_tv);
2249         }
2250 }
2251
2252 static void
2253 bbr_log_ack_event(struct tcp_bbr *bbr, struct tcphdr *th, struct tcpopt *to, uint32_t tlen,
2254                   uint16_t nsegs, uint32_t cts, int32_t nxt_pkt, struct mbuf *m)
2255 {
2256         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2257                 union tcp_log_stackspecific log;
2258                 struct timeval tv;
2259
2260                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2261                 log.u_bbr.flex1 = nsegs;
2262                 log.u_bbr.flex2 = bbr->r_ctl.rc_lost_bytes;
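                /*
                 * When an mbuf is supplied, record its hardware
                 * (M_TSTMP) and LRO (M_TSTMP_LRO) arrival timestamps,
                 * converted from nanoseconds to microsecond ticks.
                 */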
2263                 if (m) {
2264                         struct timespec ts;
2265
2266                         log.u_bbr.flex3 = m->m_flags;
2267                         if (m->m_flags & M_TSTMP) {
2268                                 mbuf_tstmp2timespec(m, &ts);
2269                                 tv.tv_sec = ts.tv_sec;
2270                                 tv.tv_usec = ts.tv_nsec / 1000;
2271                                 log.u_bbr.lt_epoch = tcp_tv_to_usectick(&tv);
2272                         } else {
2273                                 log.u_bbr.lt_epoch = 0;
2274                         }
2275                         if (m->m_flags & M_TSTMP_LRO) {
2276                                 tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
2277                                 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000;
2278                                 log.u_bbr.flex5 = tcp_tv_to_usectick(&tv);
2279                         } else {
2280                                 /* No arrival timestamp */
2281                                 log.u_bbr.flex5 = 0;
2282                         }
2283
2284                         log.u_bbr.pkts_out = tcp_get_usecs(&tv);
2285                 } else {
2286                         log.u_bbr.flex3 = 0;
2287                         log.u_bbr.flex5 = 0;
2288                         log.u_bbr.flex6 = 0;
2289                         log.u_bbr.pkts_out = 0;
2290                 }
2291                 log.u_bbr.flex4 = bbr->r_ctl.rc_target_at_state;
2292                 log.u_bbr.flex7 = bbr->r_wanted_output;
2293                 log.u_bbr.flex8 = bbr->rc_in_persist;
2294                 TCP_LOG_EVENTP(bbr->rc_tp, th,
2295                     &bbr->rc_inp->inp_socket->so_rcv,
2296                     &bbr->rc_inp->inp_socket->so_snd,
2297                     TCP_LOG_IN, 0,
2298                     tlen, &log, true, &bbr->rc_tv);
2299         }
2300 }
2301
2302 static void
2303 bbr_log_doseg_done(struct tcp_bbr *bbr, uint32_t cts, int32_t nxt_pkt, int32_t did_out)
2304 {
2305         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2306                 union tcp_log_stackspecific log;
2307
2308                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2309                 log.u_bbr.flex1 = did_out;
2310                 log.u_bbr.flex2 = nxt_pkt;
2311                 log.u_bbr.flex3 = bbr->r_ctl.rc_last_delay_val;
2312                 log.u_bbr.flex4 = bbr->r_ctl.rc_hpts_flags;
2313                 log.u_bbr.flex5 = bbr->r_ctl.rc_timer_exp;
2314                 log.u_bbr.flex6 = bbr->r_ctl.rc_lost_bytes;
2315                 log.u_bbr.flex7 = bbr->r_wanted_output;
2316                 log.u_bbr.flex8 = bbr->rc_in_persist;
2317                 log.u_bbr.pkts_out = bbr->r_ctl.highest_hdwr_delay;
2318                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2319                     &bbr->rc_inp->inp_socket->so_rcv,
2320                     &bbr->rc_inp->inp_socket->so_snd,
2321                     BBR_LOG_DOSEG_DONE, 0,
2322                     0, &log, true, &bbr->rc_tv);
2323         }
2324 }
2325
2326 static void
2327 bbr_log_enobuf_jmp(struct tcp_bbr *bbr, uint32_t len, uint32_t cts,
2328     int32_t line, uint32_t o_len, uint32_t segcnt, uint32_t segsiz)
2329 {
2330         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2331                 union tcp_log_stackspecific log;
2332
2333                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2334                 log.u_bbr.flex1 = line;
2335                 log.u_bbr.flex2 = o_len;
2336                 log.u_bbr.flex3 = segcnt;
2337                 log.u_bbr.flex4 = segsiz;
2338                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2339                     &bbr->rc_inp->inp_socket->so_rcv,
2340                     &bbr->rc_inp->inp_socket->so_snd,
2341                     BBR_LOG_ENOBUF_JMP, ENOBUFS,
2342                     len, &log, true, &bbr->rc_tv);
2343         }
2344 }
2345
2346 static void
2347 bbr_log_to_processing(struct tcp_bbr *bbr, uint32_t cts, int32_t ret, int32_t timers, uint8_t hpts_calling)
2348 {
2349         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2350                 union tcp_log_stackspecific log;
2351
2352                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2353                 log.u_bbr.flex1 = timers;
2354                 log.u_bbr.flex2 = ret;
2355                 log.u_bbr.flex3 = bbr->r_ctl.rc_timer_exp;
2356                 log.u_bbr.flex4 = bbr->r_ctl.rc_hpts_flags;
2357                 log.u_bbr.flex5 = cts;
2358                 log.u_bbr.flex6 = bbr->r_ctl.rc_target_at_state;
2359                 log.u_bbr.flex8 = hpts_calling;
2360                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2361                     &bbr->rc_inp->inp_socket->so_rcv,
2362                     &bbr->rc_inp->inp_socket->so_snd,
2363                     BBR_LOG_TO_PROCESS, 0,
2364                     0, &log, false, &bbr->rc_tv);
2365         }
2366 }
2367
2368 static void
2369 bbr_log_to_event(struct tcp_bbr *bbr, uint32_t cts, int32_t to_num)
2370 {
2371         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2372                 union tcp_log_stackspecific log;
2373                 uint64_t ar;
2374
2375                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2376                 log.u_bbr.flex1 = bbr->bbr_timer_src;
2377                 log.u_bbr.flex2 = 0;
2378                 log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
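                /*
                 * Split the 64-bit rc_resend value into two 32-bit log
                 * fields: flex4 takes the high word, flex5 the low.
                 */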
2379                 ar = (uint64_t)(bbr->r_ctl.rc_resend);
2380                 ar >>= 32;
2381                 ar &= 0x00000000ffffffff;
2382                 log.u_bbr.flex4 = (uint32_t)ar;
2383                 ar = (uint64_t)bbr->r_ctl.rc_resend;
2384                 ar &= 0x00000000ffffffff;
2385                 log.u_bbr.flex5 = (uint32_t)ar;
2386                 log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
2387                 log.u_bbr.flex8 = to_num;
2388                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2389                     &bbr->rc_inp->inp_socket->so_rcv,
2390                     &bbr->rc_inp->inp_socket->so_snd,
2391                     BBR_LOG_RTO, 0,
2392                     0, &log, false, &bbr->rc_tv);
2393         }
2394 }
2395
2396 static void
2397 bbr_log_startup_event(struct tcp_bbr *bbr, uint32_t cts, uint32_t flex1, uint32_t flex2, uint32_t flex3, uint8_t reason)
2398 {
2399         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2400                 union tcp_log_stackspecific log;
2401
2402                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2403                 log.u_bbr.flex1 = flex1;
2404                 log.u_bbr.flex2 = flex2;
2405                 log.u_bbr.flex3 = flex3;
2406                 log.u_bbr.flex4 = 0;
2407                 log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
2408                 log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup;
2409                 log.u_bbr.flex8 = reason;
2410                 log.u_bbr.cur_del_rate = bbr->r_ctl.rc_bbr_lastbtlbw;
2411                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2412                     &bbr->rc_inp->inp_socket->so_rcv,
2413                     &bbr->rc_inp->inp_socket->so_snd,
2414                     BBR_LOG_REDUCE, 0,
2415                     0, &log, false, &bbr->rc_tv);
2416         }
2417 }
2418
2419 static void
2420 bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag)
2421 {
2422         if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2423                 union tcp_log_stackspecific log;
2424
2425                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2426                 log.u_bbr.flex1 = diag->p_nxt_slot;
2427                 log.u_bbr.flex2 = diag->p_cur_slot;
2428                 log.u_bbr.flex3 = diag->slot_req;
2429                 log.u_bbr.flex4 = diag->inp_hptsslot;
2430                 log.u_bbr.flex5 = diag->slot_remaining;
2431                 log.u_bbr.flex6 = diag->need_new_to;
2432                 log.u_bbr.flex7 = diag->p_hpts_active;
2433                 log.u_bbr.flex8 = diag->p_on_min_sleep;
2434                 /* Hijack other fields as needed  */
2435                 log.u_bbr.epoch = diag->have_slept;
2436                 log.u_bbr.lt_epoch = diag->yet_to_sleep;
2437                 log.u_bbr.pkts_out = diag->co_ret;
2438                 log.u_bbr.applimited = diag->hpts_sleep_time;
2439                 log.u_bbr.delivered = diag->p_prev_slot;
2440                 log.u_bbr.inflight = diag->p_runningtick;
2441                 log.u_bbr.bw_inuse = diag->wheel_tick;
2442                 log.u_bbr.rttProp = diag->wheel_cts;
2443                 log.u_bbr.delRate = diag->maxticks;
2444                 log.u_bbr.cur_del_rate = diag->p_curtick;
2445                 log.u_bbr.cur_del_rate <<= 32;
2446                 log.u_bbr.cur_del_rate |= diag->p_lasttick;
2447                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2448                     &bbr->rc_inp->inp_socket->so_rcv,
2449                     &bbr->rc_inp->inp_socket->so_snd,
2450                     BBR_LOG_HPTSDIAG, 0,
2451                     0, &log, false, &bbr->rc_tv);
2452         }
2453 }
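/*
 * A minimal sketch (illustrative only, hypothetical helper names) of the
 * 32-bit packing used above, where p_curtick and p_lasttick share the
 * single 64-bit cur_del_rate log field:
 *
 *	static inline uint64_t
 *	pack_ticks(uint32_t cur, uint32_t last)
 *	{
 *		return (((uint64_t)cur << 32) | (uint64_t)last);
 *	}
 *
 * A log post-processor recovers the two values with (v >> 32) for
 * p_curtick and (v & 0xffffffff) for p_lasttick.
 */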
2454
2455 static void
2456 bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, uint32_t time_since_sent, uint32_t srtt,
2457     uint32_t thresh, uint32_t to)
2458 {
2459         if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2460                 union tcp_log_stackspecific log;
2461
2462                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2463                 log.u_bbr.flex1 = bbr->rc_tp->t_rttvar;
2464                 log.u_bbr.flex2 = time_since_sent;
2465                 log.u_bbr.flex3 = srtt;
2466                 log.u_bbr.flex4 = thresh;
2467                 log.u_bbr.flex5 = to;
2468                 log.u_bbr.flex6 = bbr->rc_tp->t_srtt;
2469                 log.u_bbr.flex8 = mode;
2470                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2471                     &bbr->rc_inp->inp_socket->so_rcv,
2472                     &bbr->rc_inp->inp_socket->so_snd,
2473                     BBR_LOG_TIMERPREP, 0,
2474                     0, &log, false, &bbr->rc_tv);
2475         }
2476 }
2477
2478 static void
2479 bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len,
2480     uint32_t cts, uint32_t usecs, uint64_t bw, uint32_t override, int mod)
2481 {
2482         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2483                 union tcp_log_stackspecific log;
2484
2485                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2486                 log.u_bbr.flex1 = usecs;
2487                 log.u_bbr.flex2 = len;
2488                 log.u_bbr.flex3 = (uint32_t)((bw >> 32) & 0x00000000ffffffff);
2489                 log.u_bbr.flex4 = (uint32_t)(bw & 0x00000000ffffffff);
2490                 if (override)
2491                         log.u_bbr.flex5 = (1 << 2);
2492                 else
2493                         log.u_bbr.flex5 = 0;
2494                 log.u_bbr.flex6 = override;
2495                 log.u_bbr.flex7 = gain;
2496                 log.u_bbr.flex8 = mod;
2497                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2498                     &bbr->rc_inp->inp_socket->so_rcv,
2499                     &bbr->rc_inp->inp_socket->so_snd,
2500                     BBR_LOG_HPTSI_CALC, 0,
2501                     len, &log, false, &bbr->rc_tv);
2502         }
2503 }
2504
2505 static void
2506 bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
2507 {
2508         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2509                 union tcp_log_stackspecific log;
2510
2511                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2512
2513                 log.u_bbr.flex1 = bbr->bbr_timer_src;
2514                 log.u_bbr.flex2 = to;
2515                 log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
2516                 log.u_bbr.flex4 = slot;
2517                 log.u_bbr.flex5 = bbr->rc_inp->inp_hptsslot;
2518                 log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
2519                 log.u_bbr.pkts_out = bbr->rc_inp->inp_flags2;
2520                 log.u_bbr.flex8 = which;
2521                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2522                     &bbr->rc_inp->inp_socket->so_rcv,
2523                     &bbr->rc_inp->inp_socket->so_snd,
2524                     BBR_LOG_TIMERSTAR, 0,
2525                     0, &log, false, &bbr->rc_tv);
2526         }
2527 }
2528
2529 static void
2530 bbr_log_thresh_choice(struct tcp_bbr *bbr, uint32_t cts, uint32_t thresh, uint32_t lro, uint32_t srtt, struct bbr_sendmap *rsm, uint8_t frm)
2531 {
2532         if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2533                 union tcp_log_stackspecific log;
2534
2535                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2536                 log.u_bbr.flex1 = thresh;
2537                 log.u_bbr.flex2 = lro;
2538                 log.u_bbr.flex3 = bbr->r_ctl.rc_reorder_ts;
2539                 log.u_bbr.flex4 = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
2540                 log.u_bbr.flex5 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
2541                 log.u_bbr.flex6 = srtt;
2542                 log.u_bbr.flex7 = bbr->r_ctl.rc_reorder_shift;
2543                 log.u_bbr.flex8 = frm;
2544                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2545                     &bbr->rc_inp->inp_socket->so_rcv,
2546                     &bbr->rc_inp->inp_socket->so_snd,
2547                     BBR_LOG_THRESH_CALC, 0,
2548                     0, &log, false, &bbr->rc_tv);
2549         }
2550 }
2551
2552 static void
2553 bbr_log_to_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts, uint8_t hpts_removed)
2554 {
2555         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2556                 union tcp_log_stackspecific log;
2557
2558                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2559                 log.u_bbr.flex1 = line;
2560                 log.u_bbr.flex2 = bbr->bbr_timer_src;
2561                 log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
2562                 log.u_bbr.flex4 = bbr->rc_in_persist;
2563                 log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
2564                 log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
2565                 log.u_bbr.flex8 = hpts_removed;
2566                 log.u_bbr.pkts_out = bbr->rc_pacer_started;
2567                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2568                     &bbr->rc_inp->inp_socket->so_rcv,
2569                     &bbr->rc_inp->inp_socket->so_snd,
2570                     BBR_LOG_TIMERCANC, 0,
2571                     0, &log, false, &bbr->rc_tv);
2572         }
2573 }
2574
2575
2576 static void
2577 bbr_log_tstmp_validation(struct tcp_bbr *bbr, uint64_t peer_delta, uint64_t delta)
2578 {
2579         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2580                 union tcp_log_stackspecific log;
2581
2582                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
2583                 log.u_bbr.flex1 = bbr->r_ctl.bbr_peer_tsratio;
2584                 log.u_bbr.flex2 = (peer_delta >> 32);
2585                 log.u_bbr.flex3 = (peer_delta & 0x00000000ffffffff);
2586                 log.u_bbr.flex4 = (delta >> 32);
2587                 log.u_bbr.flex5 = (delta & 0x00000000ffffffff);
2588                 log.u_bbr.flex7 = bbr->rc_ts_clock_set;
2589                 log.u_bbr.flex8 = bbr->rc_ts_cant_be_used;
2590                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2591                     &bbr->rc_inp->inp_socket->so_rcv,
2592                     &bbr->rc_inp->inp_socket->so_snd,
2593                     BBR_LOG_TSTMP_VAL, 0,
2594                     0, &log, false, &bbr->rc_tv);
2595
2596         }
2597 }
2598
2599 static void
2600 bbr_log_type_tsosize(struct tcp_bbr *bbr, uint32_t cts, uint32_t tsosz, uint32_t tls, uint32_t old_val, uint32_t maxseg, int hdwr)
2601 {
2602         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2603                 union tcp_log_stackspecific log;
2604
2605                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2606                 log.u_bbr.flex1 = tsosz;
2607                 log.u_bbr.flex2 = tls;
2608                 log.u_bbr.flex3 = tcp_min_hptsi_time;
2609                 log.u_bbr.flex4 = bbr->r_ctl.bbr_hptsi_bytes_min;
2610                 log.u_bbr.flex5 = old_val;
2611                 log.u_bbr.flex6 = maxseg;
2612                 log.u_bbr.flex7 = bbr->rc_no_pacing;
2613                 log.u_bbr.flex7 <<= 1;
2614                 log.u_bbr.flex7 |= bbr->rc_past_init_win;
2615                 if (hdwr)
2616                         log.u_bbr.flex8 = 0x80 | bbr->rc_use_google;
2617                 else
2618                         log.u_bbr.flex8 = bbr->rc_use_google;
2619                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2620                     &bbr->rc_inp->inp_socket->so_rcv,
2621                     &bbr->rc_inp->inp_socket->so_snd,
2622                     BBR_LOG_BBRTSO, 0,
2623                     0, &log, false, &bbr->rc_tv);
2624         }
2625 }
2626
2627 static void
2628 bbr_log_type_rsmclear(struct tcp_bbr *bbr, uint32_t cts, struct bbr_sendmap *rsm,
2629                       uint32_t flags, uint32_t line)
2630 {
2631         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2632                 union tcp_log_stackspecific log;
2633
2634                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2635                 log.u_bbr.flex1 = line;
2636                 log.u_bbr.flex2 = rsm->r_start;
2637                 log.u_bbr.flex3 = rsm->r_end;
2638                 log.u_bbr.flex4 = rsm->r_delivered;
2639                 log.u_bbr.flex5 = rsm->r_rtr_cnt;
2640                 log.u_bbr.flex6 = rsm->r_dupack;
2641                 log.u_bbr.flex7 = rsm->r_tim_lastsent[0];
2642                 log.u_bbr.flex8 = rsm->r_flags;
2643                 /* Hijack the applimited field to carry the rsm flags */
2644                 log.u_bbr.applimited = flags;
2645                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2646                     &bbr->rc_inp->inp_socket->so_rcv,
2647                     &bbr->rc_inp->inp_socket->so_snd,
2648                     BBR_RSM_CLEARED, 0,
2649                     0, &log, false, &bbr->rc_tv);
2650         }
2651 }
2652
2653 static void
2654 bbr_log_type_bbrupd(struct tcp_bbr *bbr, uint8_t flex8, uint32_t cts,
2655     uint32_t flex3, uint32_t flex2, uint32_t flex5,
2656     uint32_t flex6, uint32_t pkts_out, int flex7,
2657     uint32_t flex4, uint32_t flex1)
2658 {
2659
2660         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2661                 union tcp_log_stackspecific log;
2662
2663                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2664                 log.u_bbr.flex1 = flex1;
2665                 log.u_bbr.flex2 = flex2;
2666                 log.u_bbr.flex3 = flex3;
2667                 log.u_bbr.flex4 = flex4;
2668                 log.u_bbr.flex5 = flex5;
2669                 log.u_bbr.flex6 = flex6;
2670                 log.u_bbr.flex7 = flex7;
2671                 /* Hijack the pkts_out field */
2672                 log.u_bbr.pkts_out = pkts_out;
2673                 log.u_bbr.flex8 = flex8;
2674                 if (bbr->rc_ack_was_delayed)
2675                         log.u_bbr.epoch = bbr->r_ctl.rc_ack_hdwr_delay;
2676                 else
2677                         log.u_bbr.epoch = 0;
2678                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2679                     &bbr->rc_inp->inp_socket->so_rcv,
2680                     &bbr->rc_inp->inp_socket->so_snd,
2681                     BBR_LOG_BBRUPD, 0,
2682                     flex2, &log, false, &bbr->rc_tv);
2683         }
2684 }
2685
2686
2687 static void
2688 bbr_log_type_ltbw(struct tcp_bbr *bbr, uint32_t cts, int32_t reason,
2689         uint32_t newbw, uint32_t obw, uint32_t diff,
2690         uint32_t tim)
2691 {
2692         if (/*bbr_verbose_logging && */(bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2693                 union tcp_log_stackspecific log;
2694
2695                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2696                 log.u_bbr.flex1 = reason;
2697                 log.u_bbr.flex2 = newbw;
2698                 log.u_bbr.flex3 = obw;
2699                 log.u_bbr.flex4 = diff;
2700                 log.u_bbr.flex5 = bbr->r_ctl.rc_lt_lost;
2701                 log.u_bbr.flex6 = bbr->r_ctl.rc_lt_del;
2702                 log.u_bbr.flex7 = bbr->rc_lt_is_sampling;
2703                 log.u_bbr.pkts_out = tim;
2704                 log.u_bbr.bw_inuse = bbr->r_ctl.rc_lt_bw;
2705                 if (bbr->rc_lt_use_bw == 0)
2706                         log.u_bbr.epoch = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch;
2707                 else
2708                         log.u_bbr.epoch = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch_use;
2709                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2710                     &bbr->rc_inp->inp_socket->so_rcv,
2711                     &bbr->rc_inp->inp_socket->so_snd,
2712                     BBR_LOG_BWSAMP, 0,
2713                     0, &log, false, &bbr->rc_tv);
2714         }
2715 }
2716
2717 static inline void
2718 bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line)
2719 {
2720         if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2721                 union tcp_log_stackspecific log;
2722
2723                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
2724                 log.u_bbr.flex1 = line;
2725                 log.u_bbr.flex2 = tick;
2726                 log.u_bbr.flex3 = tp->t_maxunacktime;
2727                 log.u_bbr.flex4 = tp->t_acktime;
2728                 log.u_bbr.flex8 = event;
2729                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2730                     &bbr->rc_inp->inp_socket->so_rcv,
2731                     &bbr->rc_inp->inp_socket->so_snd,
2732                     BBR_LOG_PROGRESS, 0,
2733                     0, &log, false, &bbr->rc_tv);
2734         }
2735 }
2736
2737 static void
2738 bbr_type_log_hdwr_pacing(struct tcp_bbr *bbr, const struct ifnet *ifp,
2739                          uint64_t rate, uint64_t hw_rate, int line, uint32_t cts,
2740                          int error)
2741 {
2742         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2743                 union tcp_log_stackspecific log;
2744
2745                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2746                 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff);
2747                 log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff);
2748                 log.u_bbr.flex3 = (((uint64_t)ifp  >> 32) & 0x00000000ffffffff);
2749                 log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff);
2750                 log.u_bbr.bw_inuse = rate;
2751                 log.u_bbr.flex5 = line;
2752                 log.u_bbr.flex6 = error;
2753                 log.u_bbr.flex8 = bbr->skip_gain;
2754                 log.u_bbr.flex8 <<= 1;
2755                 log.u_bbr.flex8 |= bbr->gain_is_limited;
2756                 log.u_bbr.flex8 <<= 1;
2757                 log.u_bbr.flex8 |= bbr->bbr_hdrw_pacing;
2758                 log.u_bbr.pkts_out = bbr->rc_tp->t_maxseg;
2759                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2760                     &bbr->rc_inp->inp_socket->so_rcv,
2761                     &bbr->rc_inp->inp_socket->so_snd,
2762                     BBR_LOG_HDWR_PACE, 0,
2763                     0, &log, false, &bbr->rc_tv);
2764         }
2765 }
2766
2767 static void
2768 bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay)
2769 {
2770         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2771                 union tcp_log_stackspecific log;
2772
2773                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2774                 log.u_bbr.flex1 = slot;
2775                 log.u_bbr.flex2 = del_by;
2776                 log.u_bbr.flex3 = prev_delay;
2777                 log.u_bbr.flex4 = line;
2778                 log.u_bbr.flex5 = bbr->r_ctl.rc_last_delay_val;
2779                 log.u_bbr.flex6 = bbr->r_ctl.rc_hptsi_agg_delay;
2780                 log.u_bbr.flex7 = (0x0000ffff & bbr->r_ctl.rc_hpts_flags);
2781                 log.u_bbr.flex8 = bbr->rc_in_persist;
2782                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2783                     &bbr->rc_inp->inp_socket->so_rcv,
2784                     &bbr->rc_inp->inp_socket->so_snd,
2785                     BBR_LOG_BBRSND, 0,
2786                     len, &log, false, &bbr->rc_tv);
2787         }
2788 }
2789
2790 static void
2791 bbr_log_type_bbrrttprop(struct tcp_bbr *bbr, uint32_t t, uint32_t end, uint32_t tsconv, uint32_t cts, int32_t match, uint32_t seq, uint8_t flags)
2792 {
2793         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2794                 union tcp_log_stackspecific log;
2795
2796                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2797                 log.u_bbr.flex1 = bbr->r_ctl.rc_delivered;
2798                 log.u_bbr.flex2 = 0;
2799                 log.u_bbr.flex3 = bbr->r_ctl.rc_lowest_rtt;
2800                 log.u_bbr.flex4 = end;
2801                 log.u_bbr.flex5 = seq;
2802                 log.u_bbr.flex6 = t;
2803                 log.u_bbr.flex7 = match;
2804                 log.u_bbr.flex8 = flags;
2805                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2806                     &bbr->rc_inp->inp_socket->so_rcv,
2807                     &bbr->rc_inp->inp_socket->so_snd,
2808                     BBR_LOG_BBRRTT, 0,
2809                     0, &log, false, &bbr->rc_tv);
2810         }
2811 }
2812
2813 static void
2814 bbr_log_exit_gain(struct tcp_bbr *bbr, uint32_t cts, int32_t entry_method)
2815 {
2816         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2817                 union tcp_log_stackspecific log;
2818
2819                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2820                 log.u_bbr.flex1 = bbr->r_ctl.rc_target_at_state;
2821                 log.u_bbr.flex2 = (bbr->rc_tp->t_maxseg - bbr->rc_last_options);
2822                 log.u_bbr.flex3 = bbr->r_ctl.gain_epoch;
2823                 log.u_bbr.flex4 = bbr->r_ctl.rc_pace_max_segs;
2824                 log.u_bbr.flex5 = bbr->r_ctl.rc_pace_min_segs;
2825                 log.u_bbr.flex6 = bbr->r_ctl.rc_bbr_state_atflight;
2826                 log.u_bbr.flex7 = 0;
2827                 log.u_bbr.flex8 = entry_method;
2828                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2829                     &bbr->rc_inp->inp_socket->so_rcv,
2830                     &bbr->rc_inp->inp_socket->so_snd,
2831                     BBR_LOG_EXIT_GAIN, 0,
2832                     0, &log, false, &bbr->rc_tv);
2833         }
2834 }
2835
2836 static void
2837 bbr_log_settings_change(struct tcp_bbr *bbr, int settings_desired)
2838 {
2839         if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2840                 union tcp_log_stackspecific log;
2841
2842                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
2843                 /* R-HU */
2844                 log.u_bbr.flex1 = 0;
2845                 log.u_bbr.flex2 = 0;
2846                 log.u_bbr.flex3 = 0;
2847                 log.u_bbr.flex4 = 0;
2848                 log.u_bbr.flex7 = 0;
2849                 log.u_bbr.flex8 = settings_desired;
2850
2851                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2852                     &bbr->rc_inp->inp_socket->so_rcv,
2853                     &bbr->rc_inp->inp_socket->so_snd,
2854                     BBR_LOG_SETTINGS_CHG, 0,
2855                     0, &log, false, &bbr->rc_tv);
2856         }
2857 }
2858
2859 /*
2860  * Returns the b/w from our delivery rate filter.
2861  */
2862 static inline uint64_t
2863 bbr_get_full_bw(struct tcp_bbr *bbr)
2864 {
2865         uint64_t bw;
2866
2867         bw = get_filter_value(&bbr->r_ctl.rc_delrate);
2868
2869         return (bw);
2870 }
2871
2872 static inline void
2873 bbr_set_pktepoch(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
2874 {
2875         uint64_t calclr;
2876         uint32_t lost, del;
2877
2878         if (bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_pktepoch)
2879                 lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lost_at_pktepoch;
2880         else
2881                 lost = 0;
2882         del = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_pkt_epoch_del;
2883         if (lost == 0)  {
2884                 calclr = 0;
2885         } else if (del) {
2886                 calclr = lost;
2887                 calclr *= (uint64_t)1000;
2888                 calclr /= (uint64_t)del;
2889         } else {
2890                 /* Nothing delivered? 100.0% loss */
2891                 calclr = 1000;
2892         }
2893         bbr->r_ctl.rc_pkt_epoch_loss_rate = (uint32_t)calclr;
2894         if (IN_RECOVERY(bbr->rc_tp->t_flags))
2895                 bbr->r_ctl.recovery_lr += (uint32_t)calclr;
2896         bbr->r_ctl.rc_pkt_epoch++;
2897         if (bbr->rc_no_pacing &&
2898             (bbr->r_ctl.rc_pkt_epoch >= bbr->no_pacing_until)) {
2899                 bbr->rc_no_pacing = 0;
2900                 tcp_bbr_tso_size_check(bbr, cts);
2901         }
2902         bbr->r_ctl.rc_pkt_epoch_rtt = bbr_calc_time(cts, bbr->r_ctl.rc_pkt_epoch_time);
2903         bbr->r_ctl.rc_pkt_epoch_time = cts;
2904         /* What was our loss rate */
2905         bbr_log_pkt_epoch(bbr, cts, line, lost, del);
2906         bbr->r_ctl.rc_pkt_epoch_del = bbr->r_ctl.rc_delivered;
2907         bbr->r_ctl.rc_lost_at_pktepoch = bbr->r_ctl.rc_lost;
2908 }
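/*
 * Worked example (illustrative numbers) of the loss-rate calculation in
 * bbr_set_pktepoch() above: with 3 segments newly lost and 150 newly
 * delivered in this packet epoch,
 *
 *	calclr = (3 * 1000) / 150 = 20
 *
 * i.e. 20 per-mille (2.0%) loss for the epoch; with losses but nothing
 * delivered the rate pegs at 1000 (100%).
 */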
2909
2910 static inline void
2911 bbr_set_epoch(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
2912 {
2913         uint32_t epoch_time;
2914
2915         /* Tick the RTT clock */
2916         bbr->r_ctl.rc_rtt_epoch++;
2917         epoch_time = cts - bbr->r_ctl.rc_rcv_epoch_start;
2918         bbr_log_time_epoch(bbr, cts, line, epoch_time);
2919         bbr->r_ctl.rc_rcv_epoch_start = cts;
2920 }
2921
2922
2923 static inline void
2924 bbr_isit_a_pkt_epoch(struct tcp_bbr *bbr, uint32_t cts, struct bbr_sendmap *rsm, int32_t line, int32_t cum_acked)
2925 {
2926         if (SEQ_GEQ(rsm->r_delivered, bbr->r_ctl.rc_pkt_epoch_del)) {
2927                 bbr->rc_is_pkt_epoch_now = 1;
2928         }
2929 }
2930
2931 /*
2932  * Returns the bw from either the b/w filter
2933  * or from the lt_bw (if the connection is being
2934  * policed).
2935  */
2936 static inline uint64_t
2937 __bbr_get_bw(struct tcp_bbr *bbr)
2938 {
2939         uint64_t bw, min_bw;
2940         uint64_t rtt;
2941         int gm_measure_cnt = 1;
2942
2943         /*
2944          * For startup we compute, like Google, a
2945          * minimum b/w. This is generated from the
2946          * IW and the rttProp. We fall back to srtt
2947          * if for some reason (e.g. the initial handshake) we don't
2948          * have a rttProp yet, and in the worst case we fall back
2949          * to the configured min_bw (rc_initial_hptsi_bw).
2950          */
2951         if (bbr->rc_bbr_state == BBR_STATE_STARTUP) {
2952                 /* Attempt first to use rttProp */
2953                 rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop);
2954                 if (rtt && (rtt < 0xffffffff)) {
2955 measure:
2956                         min_bw = (uint64_t)(bbr_initial_cwnd(bbr, bbr->rc_tp)) *
2957                                 ((uint64_t)1000000);
2958                         min_bw /= rtt;
2959                         if (min_bw < bbr->r_ctl.rc_initial_hptsi_bw) {
2960                                 min_bw = bbr->r_ctl.rc_initial_hptsi_bw;
2961                         }
2962
2963                 } else if (bbr->rc_tp->t_srtt != 0) {
2964                         /* No rttProp, use srtt? */
2965                         rtt = bbr_get_rtt(bbr, BBR_SRTT);
2966                         goto measure;
2967                 } else {
2968                         min_bw = bbr->r_ctl.rc_initial_hptsi_bw;
2969                 }
2970         } else
2971                 min_bw = 0;
2972
2973         if ((bbr->rc_past_init_win == 0) &&
2974             (bbr->r_ctl.rc_delivered > bbr_initial_cwnd(bbr, bbr->rc_tp)))
2975                 bbr->rc_past_init_win = 1;
2976         if ((bbr->rc_use_google)  && (bbr->r_ctl.r_measurement_count >= 1))
2977                 gm_measure_cnt = 0;
2978         if (gm_measure_cnt &&
2979             ((bbr->r_ctl.r_measurement_count < bbr_min_measurements_req) ||
2980              (bbr->rc_past_init_win == 0))) {
2981                 /* Like Google, we use our guessed rate until we have a measurement */
2982
2983 use_initial_window:
2984                 rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop);
2985                 if (rtt && (rtt < 0xffffffff)) {
2986                         /*
2987                          * We have an RTT measurement. Use that in
2988                          * combination with our initial window to calculate
2989                          * a b/w.
2990                          */
2991                         bw = (uint64_t)(bbr_initial_cwnd(bbr, bbr->rc_tp)) *
2992                                 ((uint64_t)1000000);
2993                         bw /= rtt;
2994                         if (bw < bbr->r_ctl.rc_initial_hptsi_bw) {
2995                                 bw = bbr->r_ctl.rc_initial_hptsi_bw;
2996                         }
2997                 } else {
2998                         /* No usable RTT measurement; punt to the configured default */
2999                         bw = bbr->r_ctl.rc_initial_hptsi_bw;
3000                 }
3001                 if (bw < 1)
3002                         /* Probably should panic */
3003                         bw = 1;
3004                 if (bw > min_bw)
3005                         return (bw);
3006                 else
3007                         return (min_bw);
3008         }
3009         if (bbr->rc_lt_use_bw)
3010                 bw = bbr->r_ctl.rc_lt_bw;
3011         else if (bbr->r_recovery_bw && (bbr->rc_use_google == 0))
3012                 bw = bbr->r_ctl.red_bw;
3013         else
3014                 bw = get_filter_value(&bbr->r_ctl.rc_delrate);
3015         if (bbr->rc_tp->t_peakrate_thr && (bbr->rc_use_google == 0)) {
3016                 /*
3017                  * Enforce user set rate limit, keep in mind that
3018                  * t_peakrate_thr is in B/s already
3019                  */
3020                 bw = uqmin((uint64_t)bbr->rc_tp->t_peakrate_thr, bw);
3021         }
3022         if (bw == 0) {
3023                 /* We should not be at 0, go to the initial window then  */
3024                 goto use_initial_window;
3025         }
3026         if (bw < 1)
3027                 /* Probably should panic */
3028                 bw = 1;
3029         if (bw < min_bw)
3030                 bw = min_bw;
3031         return (bw);
3032 }
3033
3034 static inline uint64_t
3035 bbr_get_bw(struct tcp_bbr *bbr)
3036 {
3037         uint64_t bw;
3038
3039         bw = __bbr_get_bw(bbr);
3040         return (bw);
3041 }
3042
3043 static inline void
3044 bbr_reset_lt_bw_interval(struct tcp_bbr *bbr, uint32_t cts)
3045 {
3046         bbr->r_ctl.rc_lt_epoch = bbr->r_ctl.rc_pkt_epoch;
3047         bbr->r_ctl.rc_lt_time = bbr->r_ctl.rc_del_time;
3048         bbr->r_ctl.rc_lt_del = bbr->r_ctl.rc_delivered;
3049         bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
3050 }
3051
3052 static inline void
3053 bbr_reset_lt_bw_sampling(struct tcp_bbr *bbr, uint32_t cts)
3054 {
3055         bbr->rc_lt_is_sampling = 0;
3056         bbr->rc_lt_use_bw = 0;
3057         bbr->r_ctl.rc_lt_bw = 0;
3058         bbr_reset_lt_bw_interval(bbr, cts);
3059 }
3060
3061 static inline void
3062 bbr_lt_bw_samp_done(struct tcp_bbr *bbr, uint64_t bw, uint32_t cts, uint32_t timin)
3063 {
3064         uint64_t diff;
3065
3066         /* Do we have a previous sample? */
3067         if (bbr->r_ctl.rc_lt_bw) {
3068                 /* Get the diff in bytes per second */
3069                 if (bbr->r_ctl.rc_lt_bw > bw)
3070                         diff = bbr->r_ctl.rc_lt_bw - bw;
3071                 else
3072                         diff = bw - bbr->r_ctl.rc_lt_bw;
3073                 if ((diff <= bbr_lt_bw_diff) ||
3074                     (diff <= (bbr->r_ctl.rc_lt_bw / bbr_lt_bw_ratio))) {
3075                         /* Consider us policed */
3076                         uint32_t saved_bw;
3077
3078                         saved_bw = (uint32_t)bbr->r_ctl.rc_lt_bw;
3079                         bbr->r_ctl.rc_lt_bw = (bw + bbr->r_ctl.rc_lt_bw) / 2;   /* average of two */
3080                         bbr->rc_lt_use_bw = 1;
3081                         bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
3082                         /*
3083                          * Use the pkt-based epoch to measure how long
3084                          * the policer has been active.
3085                          */
3086                         bbr->r_ctl.rc_lt_epoch_use = bbr->r_ctl.rc_pkt_epoch;
3087                         /*
3088                          * reason 4: we need to start considering
3089                          * ourselves policed.
3090                          */
3091                         bbr_log_type_ltbw(bbr, cts, 4, (uint32_t)bw, saved_bw, (uint32_t)diff, timin);
3092                         return;
3093                 }
3094         }
3095         bbr->r_ctl.rc_lt_bw = bw;
3096         bbr_reset_lt_bw_interval(bbr, cts);
3097         bbr_log_type_ltbw(bbr, cts, 5, 0, (uint32_t)bw, 0, timin);
3098 }
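/*
 * Worked example (illustrative numbers; the threshold values here are
 * assumptions made for the sake of the example) of the policer test in
 * bbr_lt_bw_samp_done(): with a prior sample rc_lt_bw = 1,000,000 B/s and
 * a new sample bw = 990,000 B/s, diff = 10,000 B/s. If bbr_lt_bw_diff were
 * 4,000 the first test fails, but with bbr_lt_bw_ratio = 8 we have
 * rc_lt_bw / 8 = 125,000 >= diff, so the two samples are deemed
 * consistent: we consider ourselves policed and rc_lt_bw becomes the
 * average (1,000,000 + 990,000) / 2 = 995,000 B/s.
 */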
3099
3100 /*
3101  * RRS: Copied from user space!
3102  * Calculate a uniformly distributed random number less than upper_bound
3103  * avoiding "modulo bias".
3104  *
3105  * Uniformity is achieved by generating new random numbers until the one
3106  * returned is outside the range [0, 2**32 % upper_bound).  This
3107  * guarantees the selected random number will be inside
3108  * [2**32 % upper_bound, 2**32) which maps back to [0, upper_bound)
3109  * after reduction modulo upper_bound.
3110  */
3111 static uint32_t
3112 arc4random_uniform(uint32_t upper_bound)
3113 {
3114         uint32_t r, min;
3115
3116         if (upper_bound < 2)
3117                 return 0;
3118
3119         /* 2**32 % x == (2**32 - x) % x */
3120         min = -upper_bound % upper_bound;
3121
3122         /*
3123          * This could theoretically loop forever but each retry has
3124          * p > 0.5 (worst case, usually far better) of selecting a
3125          * number inside the range we need, so it should rarely need
3126          * to re-roll.
3127          */
3128         for (;;) {
3129                 r = arc4random();
3130                 if (r >= min)
3131                         break;
3132         }
3133
3134         return r % upper_bound;
3135 }
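/*
 * Worked example of the rejection bound above: for upper_bound = 3,
 * 2**32 == 1 (mod 3), so min = 1 and only r == 0 is re-rolled. That
 * leaves 2**32 - 1 = 3 * 1431655765 acceptable values, exactly
 * 1431655765 in each residue class, so 0, 1 and 2 are each returned
 * with equal probability.
 */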
3136
3137 static void
3138 bbr_randomize_extra_state_time(struct tcp_bbr *bbr)
3139 {
3140         uint32_t ran, deduct;
3141
3142         ran = arc4random_uniform(bbr_rand_ot);
3143         if (ran) {
3144                 deduct = bbr->r_ctl.rc_level_state_extra / ran;
3145                 bbr->r_ctl.rc_level_state_extra -= deduct;
3146         }
3147 }
3148 /*
3149  * Randomly pick the starting substate
3150  * to use in probe_bw.
3151  */
3152 static uint8_t
3153 bbr_pick_probebw_substate(struct tcp_bbr *bbr, uint32_t cts)
3154 {
3155         uint32_t ran;
3156         uint8_t ret_val;
3157
3158         /* Initialize the offset to 0 */
3159         bbr->r_ctl.rc_exta_time_gd = 0;
3160         bbr->rc_hit_state_1 = 0;
3161         bbr->r_ctl.rc_level_state_extra = 0;
3162         ran = arc4random_uniform((BBR_SUBSTATE_COUNT-1));
3163         /*
3164          * The math works funny here :) the return value is used to set the
3165          * substate and then the state change is called which increments by
3166          * one. So if we return 1 (DRAIN) we will increment to 2 (LEVEL1) when
3167          * we fully enter the state. Note that the (8 - 1 - ran) assures that
3168          * we return 1 - 7, so we don't return 0 and end up starting in
3169          * state 1 (DRAIN).
3170          */
3171         ret_val = BBR_SUBSTATE_COUNT - 1 - ran;
3172         /* Set an epoch */
3173         if ((cts - bbr->r_ctl.rc_rcv_epoch_start) >= bbr_get_rtt(bbr, BBR_RTT_PROP))
3174                 bbr_set_epoch(bbr, cts, __LINE__);
3175
3176         bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
3177         return (ret_val);
3178 }
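/*
 * Illustrative mapping for the math above: per the (8 - 1 - ran) comment,
 * BBR_SUBSTATE_COUNT is 8 and ran is uniform over 0..6, so ret_val lands
 * in 1..7. E.g. ran = 6 gives ret_val = 1 (DRAIN), which the subsequent
 * substate change increments to 2 (LEVEL1); 0 is never returned, which
 * would instead have landed the cycle in 1 (DRAIN).
 */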
3179
3180 static void
3181 bbr_lt_bw_sampling(struct tcp_bbr *bbr, uint32_t cts, int32_t loss_detected)
3182 {
3183         uint32_t diff, d_time;
3184         uint64_t del_time, bw, lost, delivered;
3185
3186         if (bbr->r_use_policer == 0)
3187                 return;
3188         if (bbr->rc_lt_use_bw) {
3189                 /* We are using lt bw do we stop yet? */
3190                 diff = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch_use;
3191                 if (diff > bbr_lt_bw_max_rtts) {
3192                         /* Reset it all */
3193 reset_all:
3194                         bbr_reset_lt_bw_sampling(bbr, cts);
3195                         if (bbr->rc_filled_pipe) {
3196                                 bbr_set_epoch(bbr, cts, __LINE__);
3197                                 bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts);
3198                                 bbr_substate_change(bbr, cts, __LINE__, 0);
3199                                 bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
3200                                 bbr_log_type_statechange(bbr, cts, __LINE__);
3201                         } else {
3202                                 /*
3203                                  * This should not happen really
3204                                  * unless we remove the startup/drain
3205                                  * restrictions above.
3206                                  */
3207                                 bbr->rc_bbr_state = BBR_STATE_STARTUP;
3208                                 bbr_set_epoch(bbr, cts, __LINE__);
3209                                 bbr->r_ctl.rc_bbr_state_time = cts;
3210                                 bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
3211                                 bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg;
3212                                 bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg;
3213                                 bbr_set_state_target(bbr, __LINE__);
3214                                 bbr_log_type_statechange(bbr, cts, __LINE__);
3215                         }
3216                         /* reason 0 is to stop using lt-bw */
3217                         bbr_log_type_ltbw(bbr, cts, 0, 0, 0, 0, 0);
3218                         return;
3219                 }
3220                 if (bbr_lt_intvl_fp == 0) {
3221                         /* Not doing false-positive detection */
3222                         return;
3223                 }
3224                 /* False positive detection */
3225                 if (diff == bbr_lt_intvl_fp) {
3226                         /* At bbr_lt_intvl_fp we record the lost */
3227                         bbr->r_ctl.rc_lt_del = bbr->r_ctl.rc_delivered;
3228                         bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
3229                 } else if (diff > (bbr_lt_intvl_min_rtts + bbr_lt_intvl_fp)) {
3230                         /* Now is our loss rate still high? */
3231                         lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lt_lost;
3232                         delivered = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_lt_del;
3233                         if ((delivered == 0) ||
3234                             (((lost * 1000)/delivered) < bbr_lt_fd_thresh)) {
3235                                 /* No, still below our threshold */
3236                                 bbr_log_type_ltbw(bbr, cts, 7, lost, delivered, 0, 0);
3237                         } else {
3238                                 /* Yikes, it's still high; it must be a false positive */
3239                                 bbr_log_type_ltbw(bbr, cts, 8, lost, delivered, 0, 0);
3240                                 goto reset_all;
3241                         }
3242                 }
3243                 return;
3244         }
3245         /*
3246          * Wait for the first loss before sampling, to let the policer
3247          * exhaust its tokens and estimate the steady-state rate allowed by
3248          * the policer. Starting samples earlier includes bursts that
3249          * over-estimate the bw.
3250          */
3251         if (bbr->rc_lt_is_sampling == 0) {
3252                 /* reason 1 is to begin doing the sampling  */
3253                 if (loss_detected == 0)
3254                         return;
3255                 bbr_reset_lt_bw_interval(bbr, cts);
3256                 bbr->rc_lt_is_sampling = 1;
3257                 bbr_log_type_ltbw(bbr, cts, 1, 0, 0, 0, 0);
3258                 return;
3259         }
3260         /* Now, how long have we been delivering in this long-term sample? */
3261         if (TSTMP_GEQ(bbr->r_ctl.rc_del_time, bbr->r_ctl.rc_lt_time))
3262                 d_time = bbr->r_ctl.rc_del_time - bbr->r_ctl.rc_lt_time;
3263         else
3264                 d_time = 0;
3265
3266         /* To avoid underestimates, reset sampling if we run out of data. */
3267         if (bbr->r_ctl.r_app_limited_until) {
3268                 /* Can not measure in app-limited state */
3269                 bbr_reset_lt_bw_sampling(bbr, cts);
3270                 /* reason 2 is to reset sampling due to app limits  */
3271                 bbr_log_type_ltbw(bbr, cts, 2, 0, 0, 0, d_time);
3272                 return;
3273         }
3274         diff = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch;
3275         if (diff < bbr_lt_intvl_min_rtts) {
3276                 /*
3277                  * Need more samples (we don't
3278                  * start on a round boundary like Linux, so
3279                  * we need one more).
3280                  */
3281                 /* 6 is not_enough time or no-loss */
3282                 bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time);
3283                 return;
3284         }
3285         if (diff > (4 * bbr_lt_intvl_min_rtts)) {
3286                  * For now, if we wait too long, reset all sampling. We need
3287                  * to do some research here; it's possible that we should
3288                  * base this on how much loss has occurred, something like:
3289                  * if it's under 10% (or some threshold), reset all; otherwise
3290                  * don't. That's for phase II.
3291                  * don't.  Thats for phase II I guess.
3292                  */
3293                 bbr_reset_lt_bw_sampling(bbr, cts);
3294                 /* reason 3 is to reset sampling due too long of sampling */
3295                 bbr_log_type_ltbw(bbr, cts, 3, 0, 0, 0, d_time);
3296                 return;
3297         }
3298         /*
3299          * End sampling interval when a packet is lost, so we estimate the
3300          * policer tokens were exhausted. Stopping the sampling before the
3301          * tokens are exhausted under-estimates the policed rate.
3302          */
3303         if (loss_detected == 0) {
3304                 /* 6 is not_enough time or no-loss */
3305                 bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time);
3306                 return;
3307         }
3308         /* Calculate packets lost and delivered in sampling interval. */
3309         lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lt_lost;
3310         delivered = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_lt_del;
3311         if ((delivered == 0) ||
3312             (((lost * 1000)/delivered) < bbr_lt_loss_thresh)) {
3313                 bbr_log_type_ltbw(bbr, cts, 6, lost, delivered, 0, d_time);
3314                 return;
3315         }
3316         if (d_time < 1000) {
3317                 /* Not enough time; wait. */
3318                 /* 6 is not_enough time or no-loss */
3319                 bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time);
3320                 return;
3321         }
3322         if (d_time >= (0xffffffff / USECS_IN_MSEC)) {
3323                 /* Too long */
3324                 bbr_reset_lt_bw_sampling(bbr, cts);
3325                 /* reason 3 is to reset sampling due too long of sampling */
3326                 bbr_log_type_ltbw(bbr, cts, 3, 0, 0, 0, d_time);
3327                 return;
3328         }
3329         del_time = d_time;
3330         bw = delivered;
3331         bw *= (uint64_t)USECS_IN_SECOND;
3332         bw /= del_time;
3333         bbr_lt_bw_samp_done(bbr, bw, cts, d_time);
3334 }
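/*
 * Worked example (illustrative numbers) of the final rate computation in
 * bbr_lt_bw_sampling(): delivered = 150,000 bytes over d_time = 120,000
 * usecs gives
 *
 *	bw = (150000 * 1000000) / 120000 = 1,250,000 bytes/sec (10 Mbps)
 *
 * which is then handed to bbr_lt_bw_samp_done() as the candidate
 * long-term (policed) rate.
 */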
3335
3336 /*
3337  * Allocate a sendmap from our zone.
3338  */
3339 static struct bbr_sendmap *
3340 bbr_alloc(struct tcp_bbr *bbr)
3341 {
3342         struct bbr_sendmap *rsm;
3343
3344         BBR_STAT_INC(bbr_to_alloc);
3345         rsm = uma_zalloc(bbr_zone, (M_NOWAIT | M_ZERO));
3346         if (rsm) {
3347                 bbr->r_ctl.rc_num_maps_alloced++;
3348                 return (rsm);
3349         }
3350         if (bbr->r_ctl.rc_free_cnt) {
3351                 BBR_STAT_INC(bbr_to_alloc_emerg);
3352                 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free);
3353                 TAILQ_REMOVE(&bbr->r_ctl.rc_free, rsm, r_next);
3354                 bbr->r_ctl.rc_free_cnt--;
3355                 return (rsm);
3356         }
3357         BBR_STAT_INC(bbr_to_alloc_failed);
3358         return (NULL);
3359 }
3360
3361 static struct bbr_sendmap *
3362 bbr_alloc_full_limit(struct tcp_bbr *bbr)
3363 {
3364         if ((V_tcp_map_entries_limit > 0) &&
3365             (bbr->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
3366                 BBR_STAT_INC(bbr_alloc_limited);
3367                 if (!bbr->alloc_limit_reported) {
3368                         bbr->alloc_limit_reported = 1;
3369                         BBR_STAT_INC(bbr_alloc_limited_conns);
3370                 }
3371                 return (NULL);
3372         }
3373         return (bbr_alloc(bbr));
3374 }
3375
3376
3377 /* wrapper to allocate a sendmap entry, subject to a specific limit */
3378 static struct bbr_sendmap *
3379 bbr_alloc_limit(struct tcp_bbr *bbr, uint8_t limit_type)
3380 {
3381         struct bbr_sendmap *rsm;
3382
3383         if (limit_type) {
3384                 /* currently there is only one limit type */
3385                 if (V_tcp_map_split_limit > 0 &&
3386                     bbr->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) {
3387                         BBR_STAT_INC(bbr_split_limited);
3388                         if (!bbr->alloc_limit_reported) {
3389                                 bbr->alloc_limit_reported = 1;
3390                                 BBR_STAT_INC(bbr_alloc_limited_conns);
3391                         }
3392                         return (NULL);
3393                 }
3394         }
3395
3396         /* allocate and mark in the limit type, if set */
3397         rsm = bbr_alloc(bbr);
3398         if (rsm != NULL && limit_type) {
3399                 rsm->r_limit_type = limit_type;
3400                 bbr->r_ctl.rc_num_split_allocs++;
3401         }
3402         return (rsm);
3403 }
3404
3405 static void
3406 bbr_free(struct tcp_bbr *bbr, struct bbr_sendmap *rsm)
3407 {
3408         if (rsm->r_limit_type) {
3409                 /* currently there is only one limit type */
3410                 bbr->r_ctl.rc_num_split_allocs--;
3411         }
3412         if (rsm->r_is_smallmap)
3413                 bbr->r_ctl.rc_num_small_maps_alloced--;
3414         if (bbr->r_ctl.rc_tlp_send == rsm)
3415                 bbr->r_ctl.rc_tlp_send = NULL;
3416         if (bbr->r_ctl.rc_resend == rsm) {
3417                 bbr->r_ctl.rc_resend = NULL;
3418         }
3419         if (bbr->r_ctl.rc_next == rsm)
3420                 bbr->r_ctl.rc_next = NULL;
3421         if (bbr->r_ctl.rc_sacklast == rsm)
3422                 bbr->r_ctl.rc_sacklast = NULL;
3423         if (bbr->r_ctl.rc_free_cnt < bbr_min_req_free) {
3424                 memset(rsm, 0, sizeof(struct bbr_sendmap));
3425                 TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next);
3426                 rsm->r_limit_type = 0;
3427                 bbr->r_ctl.rc_free_cnt++;
3428                 return;
3429         }
3430         bbr->r_ctl.rc_num_maps_alloced--;
3431         uma_zfree(bbr_zone, rsm);
3432 }
3433
3434 /*
3435  * Returns the BDP.
3436  */
3437 static uint64_t
3438 bbr_get_bw_delay_prod(uint64_t rtt, uint64_t bw) {
3439         /*
3440          * Calculate the bytes in flight needed given the bw (in bytes per
3441          * second) and the specified rtt in useconds. We need to put out the
3442          * returned value per RTT to match that rate. Gain will normally
3443          * raise it up from there.
3444          *
3445          * This should not overflow as long as the bandwidth is below 1
3446          * TByte per second (bw < 10**12 = 2**40) and the rtt is smaller
3447          * than 1000 seconds (rtt < 10**3 * 10**6 = 10**9 = 2**30).
3448          */
3449         uint64_t usec_per_sec;
3450
3451         usec_per_sec = USECS_IN_SECOND;
3452         return ((rtt * bw) / usec_per_sec);
3453 }
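/*
 * Worked example (illustrative numbers): with bw = 12,500,000 bytes/sec
 * (100 Mbps) and rtt = 40,000 usecs (40 ms),
 *
 *	BDP = (40000 * 12500000) / 1000000 = 500,000 bytes
 *
 * i.e. about 500 KB must be in flight each RTT to sustain that rate.
 */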
3454
3455 /*
3456  * Return the initial cwnd.
3457  */
3458 static uint32_t
3459 bbr_initial_cwnd(struct tcp_bbr *bbr, struct tcpcb *tp)
3460 {
3461         uint32_t i_cwnd;
3462
3463         if (bbr->rc_init_win) {
3464                 i_cwnd = bbr->rc_init_win * tp->t_maxseg;
3465         } else if (V_tcp_initcwnd_segments)
3466                 i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg),
3467                     max(2 * tp->t_maxseg, 14600));
3468         else if (V_tcp_do_rfc3390)
3469                 i_cwnd = min(4 * tp->t_maxseg,
3470                     max(2 * tp->t_maxseg, 4380));
3471         else {
3472                 /* Per RFC5681 Section 3.1 */
3473                 if (tp->t_maxseg > 2190)
3474                         i_cwnd = 2 * tp->t_maxseg;
3475                 else if (tp->t_maxseg > 1095)
3476                         i_cwnd = 3 * tp->t_maxseg;
3477                 else
3478                         i_cwnd = 4 * tp->t_maxseg;
3479         }
3480         return (i_cwnd);
3481 }
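/*
 * Worked example (illustrative values): with t_maxseg = 1460 and
 * V_tcp_initcwnd_segments = 10,
 *
 *	i_cwnd = min(10 * 1460, max(2 * 1460, 14600)) = 14600 bytes
 *
 * while the RFC 3390 path would yield min(4 * 1460, max(2 * 1460, 4380))
 * = 4380 bytes.
 */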
3482
3483 /*
3484  * Given a specified gain, return the target
3485  * cwnd based on that gain.
3486  */
3487 static uint32_t
3488 bbr_get_raw_target_cwnd(struct tcp_bbr *bbr, uint32_t gain, uint64_t bw)
3489 {
3490         uint64_t bdp, rtt;
3491         uint32_t cwnd;
3492
3493         if ((get_filter_value_small(&bbr->r_ctl.rc_rttprop) == 0xffffffff) ||
3494             (bbr_get_full_bw(bbr) == 0)) {
3495                 /* No measurements yet */
3496                 return (bbr_initial_cwnd(bbr, bbr->rc_tp));
3497         }
3498         /*
3499          * Get bytes per RTT needed (rttProp is normally in
3500          * bbr_cwndtarget_rtt_touse)
3501          */
3502         rtt = bbr_get_rtt(bbr, bbr_cwndtarget_rtt_touse);
3503         /* Get the bdp from the two values */
3504         bdp = bbr_get_bw_delay_prod(rtt, bw);
3505         /* Now apply the gain */
3506         cwnd = (uint32_t)(((bdp * ((uint64_t)gain)) + (uint64_t)(BBR_UNIT - 1)) / ((uint64_t)BBR_UNIT));
3507
3508         return (cwnd);
3509 }
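/*
 * Worked example of the gain step above, assuming BBR_UNIT is 256 (so a
 * gain of 2.0x is encoded as 512): with bdp = 500,000 bytes,
 *
 *	cwnd = (500000 * 512 + 255) / 256 = 1,000,000 bytes
 *
 * The "+ (BBR_UNIT - 1)" term makes the integer division round up
 * instead of down.
 */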
3510
3511 static uint32_t
3512 bbr_get_target_cwnd(struct tcp_bbr *bbr, uint64_t bw, uint32_t gain)
3513 {
3514         uint32_t cwnd, mss;
3515
3516         mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs);
3517         /* Get the base cwnd with gain rounded to a mss */
3518         cwnd = roundup(bbr_get_raw_target_cwnd(bbr, gain, bw), mss);
3519         /*
3520          * Add in N (2 default since we do not have a
3521          * fq layer to trap packets in) quanta's per the I-D
3522          * section 4.2.3.2 quanta adjust.
3523          */
3524         cwnd += (bbr_quanta * bbr->r_ctl.rc_pace_max_segs);
3525         if (bbr->rc_use_google) {
3526                 if ((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) &&
3527                    (bbr_state_val(bbr) == BBR_SUB_GAIN)) {
3528                         /*
3529                          * The Linux implementation adds
3530                          * an extra 2 x mss in the gain cycle, which
3531                          * is documented nowhere except in the code;
3532                          * we add the same for Neal's undocumented feature.
3533                          */
3534                         cwnd += 2 * mss;
3535                 }
3536                 if ((cwnd / mss) & 0x1) {
3537                         /* Round up for an odd number of mss */
3538                         cwnd += mss;
3539                 }
3540         }
3541         /* Are we below the min cwnd? */
3542         if (cwnd < get_min_cwnd(bbr))
3543                 return (get_min_cwnd(bbr));
3544         return (cwnd);
3545 }
3546
3547 static uint16_t
3548 bbr_gain_adjust(struct tcp_bbr *bbr, uint16_t gain)
3549 {
3550         if (gain < 1)
3551                 gain = 1;
3552         return (gain);
3553 }
3554
3555 static uint32_t
3556 bbr_get_header_oh(struct tcp_bbr *bbr)
3557 {
3558         int seg_oh;
3559
3560         seg_oh = 0;
3561         if (bbr->r_ctl.rc_inc_tcp_oh) {
3562                 /* Do we include TCP overhead? */
3563                 seg_oh = (bbr->rc_last_options + sizeof(struct tcphdr));
3564         }
3565         if (bbr->r_ctl.rc_inc_ip_oh) {
3566                 /* Do we include IP overhead? */
3567 #ifdef INET6
3568                 if (bbr->r_is_v6)
3569                         seg_oh += sizeof(struct ip6_hdr);
3570                 else
3571 #endif
3572 #ifdef INET
3573                         seg_oh += sizeof(struct ip);
3574 #endif
3575         }
3576         if (bbr->r_ctl.rc_inc_enet_oh) {
3577                 /* Do we include the ethernet overhead?  */
3578                 seg_oh += sizeof(struct ether_header);
3579         }
3580         return (seg_oh);
3581 }
3582
3583
3584 static uint32_t
3585 bbr_get_pacing_length(struct tcp_bbr *bbr, uint16_t gain, uint32_t useconds_time, uint64_t bw)
3586 {
3587         uint64_t divor, res, tim;
3588
3589         if (useconds_time == 0)
3590                 return (0);
3591         gain = bbr_gain_adjust(bbr, gain);
3592         divor = (uint64_t)USECS_IN_SECOND * (uint64_t)BBR_UNIT;
3593         tim = useconds_time;
3594         res = (tim * bw * gain) / divor;
3595         if (res == 0)
3596                 res = 1;
3597         return ((uint32_t)res);
3598 }
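/*
 * Worked example (illustrative, assuming BBR_UNIT is 256): the number of
 * bytes that fit in 10,000 usecs at bw = 1,250,000 bytes/sec with a 1.0x
 * gain (256):
 *
 *	res = (10000 * 1250000 * 256) / (1000000 * 256) = 12,500 bytes
 */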
3599
3600 /*
3601  * Given a gain and a length return the delay in useconds that
3602  * should be used to evenly space out packets
3603  * on the connection (based on the gain factor).
3604  */
3605 static uint32_t
3606 bbr_get_pacing_delay(struct tcp_bbr *bbr, uint16_t gain, int32_t len, uint32_t cts, int nolog)
3607 {
3608         uint64_t bw, lentim, res;
3609         uint32_t usecs, srtt, over = 0;
3610         uint32_t seg_oh, num_segs, maxseg;
3611
3612         if (len == 0)
3613                 return (0);
3614
3615         maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
3616         num_segs = (len + maxseg - 1) / maxseg;
3617         if (bbr->rc_use_google == 0) {
3618                 seg_oh = bbr_get_header_oh(bbr);
3619                 len += (num_segs * seg_oh);
3620         }
3621         gain = bbr_gain_adjust(bbr, gain);
3622         bw = bbr_get_bw(bbr);
3623         if (bbr->rc_use_google) {
3624                 uint64_t cbw;
3625
3626                 /*
3627                  * Reduce the b/w by the google discount
3628                  * factor 10 = 1%.
3629                  */
3630                 cbw = bw *  (uint64_t)(1000 - bbr->r_ctl.bbr_google_discount);
3631                 cbw /= (uint64_t)1000;
3632                 /* We don't apply a discount if it results in 0 */
3633                 if (cbw > 0)
3634                         bw = cbw;
3635         }
3636         lentim = ((uint64_t)len *
3637                   (uint64_t)USECS_IN_SECOND *
3638                   (uint64_t)BBR_UNIT);
3639         res = lentim / ((uint64_t)gain * bw);
3640         if (res == 0)
3641                 res = 1;
3642         usecs = (uint32_t)res;
3643         srtt = bbr_get_rtt(bbr, BBR_SRTT);
3644         if (bbr_hptsi_max_mul && bbr_hptsi_max_div &&
3645             (bbr->rc_use_google == 0) &&
3646             (usecs > ((srtt * bbr_hptsi_max_mul) / bbr_hptsi_max_div))) {
3647                 /*
3648                  * We cannot let the delay be more than 1/2 the srtt time.
3649                  * Otherwise we cannot pace out or send properly.
3650                  */
3651                 over = usecs = (srtt * bbr_hptsi_max_mul) / bbr_hptsi_max_div;
3652                 BBR_STAT_INC(bbr_hpts_min_time);
3653         }
3654         if (!nolog)
3655                 bbr_log_pacing_delay_calc(bbr, gain, len, cts, usecs, bw, over, 1);
3656         return (usecs);
3657 }
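/*
 * Worked example (illustrative, assuming BBR_UNIT is 256, and ignoring
 * the per-segment header overhead added above): pacing len = 14,600 bytes
 * at bw = 1,250,000 bytes/sec with a 1.0x gain (256):
 *
 *	usecs = (14600 * 1000000 * 256) / (256 * 1250000) = 11,680
 *
 * so roughly 11.7 ms is scheduled between such bursts; the clamp above
 * then caps the delay at (srtt * bbr_hptsi_max_mul) / bbr_hptsi_max_div,
 * i.e. half of srtt with the 1/2 ratio the comment describes.
 */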
3658
3659 static void
3660 bbr_ack_received(struct tcpcb *tp, struct tcp_bbr *bbr, struct tcphdr *th, uint32_t bytes_this_ack,
3661                  uint32_t sack_changed, uint32_t prev_acked, int32_t line, uint32_t losses)
3662 {
3663         uint64_t bw;
3664         uint32_t cwnd, target_cwnd, saved_bytes, maxseg;
3665         int32_t meth;
3666
3667         INP_WLOCK_ASSERT(tp->t_inpcb);
3668 #ifdef STATS
3669         if ((tp->t_flags & TF_GPUTINPROG) &&
3670             SEQ_GEQ(th->th_ack, tp->gput_ack)) {
3671                 /*
3672                  * Stretch acks and compressed acks will cause this to
3673                  * oscillate, but we are doing it the same way as the main
3674                  * stack so it will be comparable (though possibly not
3675                  * ideal).
3676                  */
3677                 int32_t cgput;
3678                 int64_t gput, time_stamp;
3679
3680                 gput = (int64_t) (th->th_ack - tp->gput_seq) * 8;
3681                 time_stamp = max(1, ((bbr->r_ctl.rc_rcvtime - tp->gput_ts) / 1000));
3682                 cgput = gput / time_stamp;
3683                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
3684                                          cgput);
3685                 if (tp->t_stats_gput_prev > 0)
3686                         stats_voi_update_abs_s32(tp->t_stats,
3687                                                  VOI_TCP_GPUT_ND,
3688                                                  ((gput - tp->t_stats_gput_prev) * 100) /
3689                                                  tp->t_stats_gput_prev);
3690                 tp->t_flags &= ~TF_GPUTINPROG;
3691                 tp->t_stats_gput_prev = cgput;
3692         }
3693 #endif
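        /*
         * Worked example (illustrative numbers) of the goodput sample
         * above: 1,460,000 bytes newly acked over 250 ms gives
         *
         *	gput  = 1460000 * 8 = 11,680,000 bits
         *	cgput = 11680000 / 250 = 46,720 bits/ms (~46.7 Mbps)
         */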
3694         if ((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) &&
3695             ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google)) {
3696                 /* We don't change anything in probe-rtt */
3697                 return;
3698         }
3699         maxseg = tp->t_maxseg - bbr->rc_last_options;
3700         saved_bytes = bytes_this_ack;
3701         bytes_this_ack += sack_changed;
3702         if (bytes_this_ack > prev_acked) {
3703                 bytes_this_ack -= prev_acked;
3704                 /*
3705                  * Any bytes ack'd count as at least a full mss,
3706                  * to be like Linux, which counts packets.
3707                  */
3708                 if ((bytes_this_ack < maxseg) && bbr->rc_use_google)
3709                         bytes_this_ack = maxseg;
3710         } else {
3711                 /* Unlikely */
3712                 bytes_this_ack = 0;
3713         }
3714         cwnd = tp->snd_cwnd;
3715         bw = get_filter_value(&bbr->r_ctl.rc_delrate);
3716         if (bw)
3717                 target_cwnd = bbr_get_target_cwnd(bbr,
3718                                                   bw,
3719                                                   (uint32_t)bbr->r_ctl.rc_bbr_cwnd_gain);
3720         else
3721                 target_cwnd = bbr_initial_cwnd(bbr, bbr->rc_tp);
3722         if (IN_RECOVERY(tp->t_flags) &&
3723             (bbr->bbr_prev_in_rec == 0)) {
3724                 /*
3725                  * We are entering recovery and
3726                  * thus packet conservation.
3727                  */
3728                 bbr->pkt_conservation = 1;
3729                 bbr->r_ctl.rc_recovery_start = bbr->r_ctl.rc_rcvtime;
3730                 cwnd = ctf_flight_size(tp,
3731                                        (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) +
3732                         bytes_this_ack;
3733         }
3734         if (IN_RECOVERY(tp->t_flags)) {
3735                 uint32_t flight;
3736
3737                 bbr->bbr_prev_in_rec = 1;
3738                 if (cwnd > losses) {
3739                         cwnd -= losses;
3740                         if (cwnd < maxseg)
3741                                 cwnd = maxseg;
3742                 } else
3743                         cwnd = maxseg;
3744                 flight = ctf_flight_size(tp,
3745                                          (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
3746                 bbr_log_type_cwndupd(bbr, flight, 0,
3747                                      losses, 10, 0, 0, line);
3748                 if (bbr->pkt_conservation) {
3749                         uint32_t time_in;
3750
3751                         if (TSTMP_GEQ(bbr->r_ctl.rc_rcvtime, bbr->r_ctl.rc_recovery_start))
3752                                 time_in = bbr->r_ctl.rc_rcvtime - bbr->r_ctl.rc_recovery_start;
3753                         else
3754                                 time_in = 0;
3755
3756                         if (time_in >= bbr_get_rtt(bbr, BBR_RTT_PROP)) {
3757                                 /* Clear packet conservation after an rttProp */
3758                                 bbr->pkt_conservation = 0;
3759                         } else {
3760                                 if ((flight + bytes_this_ack) > cwnd)
3761                                         cwnd = flight + bytes_this_ack;
3762                                 if (cwnd < get_min_cwnd(bbr))
3763                                         cwnd = get_min_cwnd(bbr);
3764                                 tp->snd_cwnd = cwnd;
3765                                 bbr_log_type_cwndupd(bbr, saved_bytes, sack_changed,
3766                                                      prev_acked, 1, target_cwnd, th->th_ack, line);
3767                                 return;
3768                         }
3769                 }
3770         } else
3771                 bbr->bbr_prev_in_rec = 0;
3772         if ((bbr->rc_use_google == 0) && bbr->r_ctl.restrict_growth) {
3773                 bbr->r_ctl.restrict_growth--;
3774                 if (bytes_this_ack > maxseg)
3775                         bytes_this_ack = maxseg;
3776         }
3777         if (bbr->rc_filled_pipe) {
3778                 /*
3779                  * Here we have exited startup and filled the pipe. We will
3780                  * thus allow the cwnd to shrink to the target. This is the
3781                  * common case.
3782                  */
3783                 uint32_t s_cwnd;
3784
3785                 meth = 2;
3786                 s_cwnd = min((cwnd + bytes_this_ack), target_cwnd);
3787                 if (s_cwnd > cwnd)
3788                         cwnd = s_cwnd;
3789                 else if (bbr_cwnd_may_shrink || bbr->rc_use_google || bbr->rc_no_pacing)
3790                         cwnd = s_cwnd;
3791         } else {
3792                 /*
3793                  * Here we are still in startup, we increase cwnd by what
3794                  * has been acked.
3795                  */
3796                 if ((cwnd < target_cwnd) ||
3797                     (bbr->rc_past_init_win == 0)) {
3798                         meth = 3;
3799                         cwnd += bytes_this_ack;
3800                 } else {
3801                         /*
3802                          * Method 4 means we are at target so no gain in
3803                          * startup and past the initial window.
3804                          */
3805                         meth = 4;
3806                 }
3807         }
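        /*
         * Summary of the meth codes logged here (inferred from the
         * cases above): 1 = packet conservation during recovery,
         * 2 = post-startup movement toward target_cwnd, 3 = startup
         * growth by the bytes acked, 4 = startup but already at the
         * target and past the initial window.
         */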
3808         tp->snd_cwnd = max(cwnd, get_min_cwnd(bbr));
3809         bbr_log_type_cwndupd(bbr, saved_bytes, sack_changed, prev_acked, meth, target_cwnd, th->th_ack, line);
3810 }
3811
3812 static void
3813 tcp_bbr_partialack(struct tcpcb *tp)
3814 {
3815         struct tcp_bbr *bbr;
3816
3817         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
3818         INP_WLOCK_ASSERT(tp->t_inpcb);
3819         if (ctf_flight_size(tp,
3820                 (bbr->r_ctl.rc_sacked  + bbr->r_ctl.rc_lost_bytes)) <=
3821             tp->snd_cwnd) {
3822                 bbr->r_wanted_output = 1;
3823         }
3824 }
3825
3826 static void
3827 bbr_post_recovery(struct tcpcb *tp)
3828 {
3829         struct tcp_bbr *bbr;
3830         uint32_t  flight;
3831
3832         INP_WLOCK_ASSERT(tp->t_inpcb);
3833         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
3834         /*
3835          * Here we just exit recovery.
3836          */
3837         EXIT_RECOVERY(tp->t_flags);
3838         /* Lock in our b/w reduction for the specified number of pkt-epochs */
3839         bbr->r_recovery_bw = 0;
3840         tp->snd_recover = tp->snd_una;
3841         tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime);
3842         bbr->pkt_conservation = 0;
3843         if (bbr->rc_use_google == 0) {
3844                 /*
3845                  * For non-google mode, go ahead
3846                  * and make sure we clear the
3847                  * recovery state, so that if we
3848                  * bounce back into recovery we
3849                  * will do packet conservation (PC).
3850                  */
3851                 bbr->bbr_prev_in_rec = 0;
3852         }
3853         bbr_log_type_exit_rec(bbr);
3854         if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) {
3855                 tp->snd_cwnd = max(tp->snd_cwnd, bbr->r_ctl.rc_cwnd_on_ent);
3856                 bbr_log_type_cwndupd(bbr, 0, 0, 0, 15, 0, 0, __LINE__);
3857         } else {
3858                 /* For probe-rtt case lets fix up its saved_cwnd */
3859                 if (bbr->r_ctl.rc_saved_cwnd < bbr->r_ctl.rc_cwnd_on_ent) {
3860                         bbr->r_ctl.rc_saved_cwnd = bbr->r_ctl.rc_cwnd_on_ent;
3861                         bbr_log_type_cwndupd(bbr, 0, 0, 0, 16, 0, 0, __LINE__);
3862                 }
3863         }
3864         flight = ctf_flight_size(tp,
3865                      (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
3866         if ((bbr->rc_use_google == 0) &&
3867             bbr_do_red) {
3868                 uint64_t val, lr2use;
3869                 uint32_t maxseg, newcwnd, acks_inflight, ratio, cwnd;
3870                 uint32_t *cwnd_p;
3871
3872                 if (bbr_get_rtt(bbr, BBR_SRTT)) {
3873                         val = ((uint64_t)bbr_get_rtt(bbr, BBR_RTT_PROP) * (uint64_t)1000);
3874                         val /= bbr_get_rtt(bbr, BBR_SRTT);
3875                         ratio = (uint32_t)val;
3876                 } else
3877                         ratio = 1000;
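                /*
                 * Illustrative value (not from the code): ratio is
                 * rttProp/srtt in parts per thousand.  With a 10 ms
                 * rttProp and a 25 ms srtt, ratio = 10,000 * 1000 /
                 * 25,000 = 400, i.e. srtt has inflated to 2.5x the
                 * propagation delay.
                 */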
3878
3879                 bbr_log_type_cwndupd(bbr, bbr_red_mul, bbr_red_div,
3880                                      bbr->r_ctl.recovery_lr, 21,
3881                                      ratio,
3882                                      bbr->r_ctl.rc_red_cwnd_pe,
3883                                      __LINE__);
3884                 if ((ratio < bbr_do_red) || (bbr_do_red == 0))
3885                         goto done;
3886                 if (((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) &&
3887                      bbr_prtt_slam_cwnd) ||
3888                     (bbr_sub_drain_slam_cwnd &&
3889                      (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) &&
3890                      bbr->rc_hit_state_1 &&
3891                      (bbr_state_val(bbr) == BBR_SUB_DRAIN)) ||
3892                     ((bbr->rc_bbr_state == BBR_STATE_DRAIN) &&
3893                      bbr_slam_cwnd_in_main_drain)) {
3894                         /*
3895                          * Here we must poke at the saved cwnd
3896                          * as well as the cwnd.
3897                          */
3898                         cwnd = bbr->r_ctl.rc_saved_cwnd;
3899                         cwnd_p = &bbr->r_ctl.rc_saved_cwnd;
3900                 } else {
3901                         cwnd = tp->snd_cwnd;
3902                         cwnd_p = &tp->snd_cwnd;
3903                 }
3904                 maxseg = tp->t_maxseg - bbr->rc_last_options;
3905                 /* Add the overall lr with the recovery lr */
3906                 if (bbr->r_ctl.rc_lost == 0)
3907                         lr2use = 0;
3908                 else if (bbr->r_ctl.rc_delivered == 0)
3909                         lr2use = 1000;
3910                 else {
3911                         lr2use = bbr->r_ctl.rc_lost * 1000;
3912                         lr2use /= bbr->r_ctl.rc_delivered;
3913                 }
3914                 lr2use += bbr->r_ctl.recovery_lr;
3915                 acks_inflight = (flight / (maxseg * 2));
3916                 if (bbr_red_scale) {
3917                         lr2use *= bbr_get_rtt(bbr, BBR_SRTT);
3918                         lr2use /= bbr_red_scale;
3919                         if ((bbr_red_growth_restrict) &&
3920                             ((bbr_get_rtt(bbr, BBR_SRTT)/bbr_red_scale) > 1))
3921                             bbr->r_ctl.restrict_growth += acks_inflight;
3922                 }
3923                 if (lr2use) {
3924                         val = (uint64_t)cwnd * lr2use;
3925                         val /= 1000;
3926                         if (cwnd > val)
3927                                 newcwnd = roundup((cwnd - val), maxseg);
3928                         else
3929                                 newcwnd = maxseg;
3930                 } else {
3931                         val = (uint64_t)cwnd * (uint64_t)bbr_red_mul;
3932                         val /= (uint64_t)bbr_red_div;
3933                         newcwnd = roundup((uint32_t)val, maxseg);
3934                 }
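                /*
                 * Worked example of the arithmetic above (values are
                 * illustrative): with cwnd = 144,800 bytes (100
                 * segments of 1448) and lr2use = 200 (a combined 20%
                 * loss rate), val = 144,800 * 200 / 1000 = 28,960 and
                 * newcwnd = roundup(144,800 - 28,960, 1448) =
                 * 115,840 bytes, i.e. 80 segments.
                 */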
3935                 /* With standard delayed acks, how many acks can I expect? */
3936                 if (bbr_drop_limit == 0) {
3937                         /*
3938                          * Anticipate how much we will
3939                          * raise the cwnd based on the acks.
3940                          */
3941                         if ((newcwnd + (acks_inflight * maxseg)) < get_min_cwnd(bbr)) {
3942                                 /* We do enforce the min (with the acks) */
3943                                 newcwnd = (get_min_cwnd(bbr) - acks_inflight);
3944                         }
3945                 } else {
3946                         /*
3947                          * A strict drop limit of N is in place.
3948                          */
3949                         if (newcwnd < (bbr_drop_limit * maxseg)) {
3950                                 newcwnd = bbr_drop_limit * maxseg;
3951                         }
3952                 }
3953                 /* For the next N acks do we restrict the growth */
3954                 *cwnd_p = newcwnd;
3955                 if (tp->snd_cwnd > newcwnd)
3956                         tp->snd_cwnd = newcwnd;
3957                 bbr_log_type_cwndupd(bbr, bbr_red_mul, bbr_red_div, val, 22,
3958                                      (uint32_t)lr2use,
3959                                      bbr_get_rtt(bbr, BBR_SRTT), __LINE__);
3960                 bbr->r_ctl.rc_red_cwnd_pe = bbr->r_ctl.rc_pkt_epoch;
3961         }
3962 done:
3963         bbr->r_ctl.recovery_lr = 0;
3964         if (flight <= tp->snd_cwnd) {
3965                 bbr->r_wanted_output = 1;
3966         }
3967         tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime);
3968 }
3969
3970 static void
3971 bbr_setup_red_bw(struct tcp_bbr *bbr, uint32_t cts)
3972 {
3973         bbr->r_ctl.red_bw = get_filter_value(&bbr->r_ctl.rc_delrate);
3974         /* Limit the drop in b/w to 1/2 our current filter. */
3975         if (bbr->r_ctl.red_bw > bbr->r_ctl.rc_bbr_cur_del_rate)
3976                 bbr->r_ctl.red_bw = bbr->r_ctl.rc_bbr_cur_del_rate;
3977         if (bbr->r_ctl.red_bw < (get_filter_value(&bbr->r_ctl.rc_delrate) / 2))
3978                 bbr->r_ctl.red_bw = get_filter_value(&bbr->r_ctl.rc_delrate) / 2;
3979         tcp_bbr_tso_size_check(bbr, cts);
3980 }
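/*
 * Worked example of the clamping above (values are illustrative): if
 * the delivery-rate filter holds 100 Mb/s and rc_bbr_cur_del_rate is
 * 40 Mb/s, red_bw is first capped down to 40 Mb/s and then raised to
 * the filter/2 floor of 50 Mb/s.
 */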
3981
3982 static void
3983 bbr_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type, struct bbr_sendmap *rsm)
3984 {
3985         struct tcp_bbr *bbr;
3986
3987         INP_WLOCK_ASSERT(tp->t_inpcb);
3988         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
3989         switch (type) {
3990         case CC_NDUPACK:
3991                 if (!IN_RECOVERY(tp->t_flags)) {
3992                         tp->snd_recover = tp->snd_max;
3993                         /* Start a new epoch */
3994                         bbr_set_pktepoch(bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
3995                         if (bbr->rc_lt_is_sampling || bbr->rc_lt_use_bw) {
3996                                 /*
3997                                  * Move forward the lt epoch
3998                                  * so it won't count the truncated
3999                                  * epoch.
4000                                  */
4001                                 bbr->r_ctl.rc_lt_epoch++;
4002                         }
4003                         if (bbr->rc_bbr_state == BBR_STATE_STARTUP) {
4004                                 /*
4005                                  * Just like the policer detection code
4006                                  * if we are in startup we must push
4007                                  * forward the last startup epoch
4008                                  * to hide the truncated PE.
4009                                  */
4010                                 bbr->r_ctl.rc_bbr_last_startup_epoch++;
4011                         }
4012                         bbr->r_ctl.rc_cwnd_on_ent = tp->snd_cwnd;
4013                         ENTER_RECOVERY(tp->t_flags);
4014                         bbr->rc_tlp_rtx_out = 0;
4015                         bbr->r_ctl.recovery_lr = bbr->r_ctl.rc_pkt_epoch_loss_rate;
4016                         tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime);
4017                         if (bbr->rc_inp->inp_in_hpts &&
4018                             ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) == 0)) {
4019                                 /*
4020                                  * When we enter recovery, we need to restart
4021                                  * any timers. This may mean we gain an agg
4022                                  * early, which will be made up for at the last
4023                                  * rxt out.
4024                                  */
4025                                 bbr->rc_timer_first = 1;
4026                                 bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
4027                         }
4028                         /*
4029                          * Calculate a new cwnd based on the current
4030                          * delivery rate with no gain. We get the bdp
4031                          * without gaining it up like we normally would and
4032                          * we use the last cur_del_rate.
4033                          */
4034                         if ((bbr->rc_use_google == 0) &&
4035                             (bbr->r_ctl.bbr_rttprobe_gain_val ||
4036                              (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT))) {
4037                                 tp->snd_cwnd = ctf_flight_size(tp,
4038                                                    (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) +
4039                                         (tp->t_maxseg - bbr->rc_last_options);
4040                                 if (tp->snd_cwnd < get_min_cwnd(bbr)) {
4041                                         /* We always gate to min cwnd */
4042                                         tp->snd_cwnd = get_min_cwnd(bbr);
4043                                 }
4044                                 bbr_log_type_cwndupd(bbr, 0, 0, 0, 14, 0, 0, __LINE__);
4045                         }
4046                         bbr_log_type_enter_rec(bbr, rsm->r_start);
4047                 }
4048                 break;
4049         case CC_RTO_ERR:
4050                 KMOD_TCPSTAT_INC(tcps_sndrexmitbad);
4051                 /* RTO was unnecessary, so reset everything. */
4052                 bbr_reset_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime);
4053                 if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) {
4054                         tp->snd_cwnd = tp->snd_cwnd_prev;
4055                         tp->snd_ssthresh = tp->snd_ssthresh_prev;
4056                         tp->snd_recover = tp->snd_recover_prev;
4057                         tp->snd_cwnd = max(tp->snd_cwnd, bbr->r_ctl.rc_cwnd_on_ent);
4058                         bbr_log_type_cwndupd(bbr, 0, 0, 0, 13, 0, 0, __LINE__);
4059                 }
4060                 tp->t_badrxtwin = 0;
4061                 break;
4062         }
4063 }
4064
4065 /*
4066  * Indicate whether this ack should be delayed.  We can delay the ack if
4067  * following conditions are met:
4068  *      - There is no delayed ack timer in progress.
4069  *      - Our last ack wasn't a 0-sized window. We never want to delay
4070  *        the ack that opens up a 0-sized window.
4071  *      - LRO wasn't used for this segment. We make sure by checking that the
4072  *        segment size is not larger than the MSS.
4073  *      - Delayed acks are enabled or this is a half-synchronized T/TCP
4074  *        connection.
4075  *      - The data being acked is less than a full segment (a stretch ack
4076  *        of more than a segment should be acked immediately).
4077  *      - nsegs is 1 (if it's more than that we received more than 1 ack).
4078  */
4079 #define DELAY_ACK(tp, bbr, nsegs)                               \
4080         (((tp->t_flags & TF_RXWIN0SENT) == 0) &&                \
4081          ((tp->t_flags & TF_DELACK) == 0) &&                    \
4082          ((bbr->bbr_segs_rcvd + nsegs) < tp->t_delayed_ack) &&  \
4083          (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
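/*
 * Illustrative evaluation of the macro (example values, not from the
 * code): with TF_RXWIN0SENT and TF_DELACK both clear, bbr_segs_rcvd
 * of 0, nsegs of 1 and t_delayed_ack of 2, DELAY_ACK() is true and
 * the ack may be held; a stretch ack arriving with nsegs of 3 would
 * make it false.
 */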
4084
4085 /*
4086  * Return the lowest RSM in the map of
4087  * packets still in flight that is not acked.
4088  * This should normally be found on the first one,
4089  * since we remove packets from the send
4090  * map after they are marked ACKED.
4091  */
4092 static struct bbr_sendmap *
4093 bbr_find_lowest_rsm(struct tcp_bbr *bbr)
4094 {
4095         struct bbr_sendmap *rsm;
4096
4097         /*
4098          * Walk the time-order transmitted list looking for an rsm that is
4099          * not acked. This will be the one that was sent the longest time
4100          * ago that is still outstanding.
4101          */
4102         TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_tmap, r_tnext) {
4103                 if (rsm->r_flags & BBR_ACKED) {
4104                         continue;
4105                 }
4106                 goto finish;
4107         }
4108 finish:
4109         return (rsm);
4110 }
4111
4112 static struct bbr_sendmap *
4113 bbr_find_high_nonack(struct tcp_bbr *bbr, struct bbr_sendmap *rsm)
4114 {
4115         struct bbr_sendmap *prsm;
4116
4117         /*
4118          * Walk the sequence-order list backward until we arrive at
4119          * the highest seq not acked. In theory when this is called it
4120          * should be the last segment (which it was not).
4121          */
4122         prsm = rsm;
4123         TAILQ_FOREACH_REVERSE_FROM(prsm, &bbr->r_ctl.rc_map, bbr_head, r_next) {
4124                 if (prsm->r_flags & (BBR_ACKED | BBR_HAS_FIN)) {
4125                         continue;
4126                 }
4127                 return (prsm);
4128         }
4129         return (NULL);
4130 }
4131
4132 /*
4133  * Returns to the caller the number of microseconds that
4134  * the packet can be outstanding before we think we
4135  * should have had an ack returned.
4136  */
4137 static uint32_t
4138 bbr_calc_thresh_rack(struct tcp_bbr *bbr, uint32_t srtt, uint32_t cts, struct bbr_sendmap *rsm)
4139 {
4140         /*
4141          * lro is the flag we use to determine if we have seen reordering.
4142          * If it gets set we have seen reordering. The reorder logic
4143          * works in one of two ways:
4144          *
4145          * If reorder-fade is configured, then we track the last time we saw
4146          * re-ordering occur. If we reach the point where enough time has
4147          * passed we no longer consider reordering to be occurring.
4148          *
4149          * Or if reorder-fade is 0, then once we see reordering we consider
4150          * the connection to always be subject to reordering and just set lro
4151          * to 1.
4152          *
4153          * In the end if lro is non-zero we add the extra time for
4154          * reordering in.
4155          */
4156         int32_t lro;
4157         uint32_t thresh, t_rxtcur;
4158
4159         if (srtt == 0)
4160                 srtt = 1;
4161         if (bbr->r_ctl.rc_reorder_ts) {
4162                 if (bbr->r_ctl.rc_reorder_fade) {
4163                         if (SEQ_GEQ(cts, bbr->r_ctl.rc_reorder_ts)) {
4164                                 lro = cts - bbr->r_ctl.rc_reorder_ts;
4165                                 if (lro == 0) {
4166                                         /*
4167                                          * No time has passed since the last
4168                                          * reorder, mark it as reordering.
4169                                          */
4170                                         lro = 1;
4171                                 }
4172                         } else {
4173                                 /* Negative time? */
4174                                 lro = 0;
4175                         }
4176                         if (lro > bbr->r_ctl.rc_reorder_fade) {
4177                                 /* Turn off reordering seen too */
4178                                 bbr->r_ctl.rc_reorder_ts = 0;
4179                                 lro = 0;
4180                         }
4181                 } else {
4182                         /* Reordering does not fade */
4183                         lro = 1;
4184                 }
4185         } else {
4186                 lro = 0;
4187         }
4188         thresh = srtt + bbr->r_ctl.rc_pkt_delay;
4189         if (lro) {
4190                 /* It must be set, if not you get 1/4 rtt */
4191                 if (bbr->r_ctl.rc_reorder_shift)
4192                         thresh += (srtt >> bbr->r_ctl.rc_reorder_shift);
4193                 else
4194                         thresh += (srtt >> 2);
4195         } else {
4196                 thresh += 1000;
4197         }
4198         /* We don't let the rack timeout be above a RTO */
4199         if ((bbr->rc_tp)->t_srtt == 0)
4200                 t_rxtcur = BBR_INITIAL_RTO;
4201         else
4202                 t_rxtcur = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
4203         if (thresh > t_rxtcur) {
4204                 thresh = t_rxtcur;
4205         }
4206         /* And we don't want it above the RTO max either */
4207         if (thresh > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) {
4208                 thresh = (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND);
4209         }
4210         bbr_log_thresh_choice(bbr, cts, thresh, lro, srtt, rsm, BBR_TO_FRM_RACK);
4211         return (thresh);
4212 }
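/*
 * Worked example for bbr_calc_thresh_rack() (values are
 * illustrative): with srtt = 10,000 usec, rc_pkt_delay = 500 and
 * reordering seen with rc_reorder_shift = 2, thresh = 10,000 + 500 +
 * (10,000 >> 2) = 13,000 usec, subject to the RTO caps applied above.
 */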
4213
4214 /*
4215  * Return to the caller the amount of time in microseconds
4216  * that should be used for the TLP timer from the last
4217  * send time of this packet.
4218  */
4219 static uint32_t
4220 bbr_calc_thresh_tlp(struct tcpcb *tp, struct tcp_bbr *bbr,
4221     struct bbr_sendmap *rsm, uint32_t srtt,
4222     uint32_t cts)
4223 {
4224         uint32_t thresh, len, maxseg, t_rxtcur;
4225         struct bbr_sendmap *prsm;
4226
4227         if (srtt == 0)
4228                 srtt = 1;
4229         if (bbr->rc_tlp_threshold)
4230                 thresh = srtt + (srtt / bbr->rc_tlp_threshold);
4231         else
4232                 thresh = (srtt * 2);
4233         maxseg = tp->t_maxseg - bbr->rc_last_options;
4234         /* Get the previous sent packet, if any  */
4235         len = rsm->r_end - rsm->r_start;
4236
4237         /* 2.1 behavior */
4238         prsm = TAILQ_PREV(rsm, bbr_head, r_tnext);
4239         if (prsm && (len <= maxseg)) {
4240                 /*
4241                  * Two packets outstanding, thresh should be (2*srtt) +
4242                  * possible inter-packet delay (if any).
4243                  */
4244                 uint32_t inter_gap = 0;
4245                 int idx, nidx;
4246
4247                 idx = rsm->r_rtr_cnt - 1;
4248                 nidx = prsm->r_rtr_cnt - 1;
4249                 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], prsm->r_tim_lastsent[nidx])) {
4250                         /* Yes it was sent later (or at the same time) */
4251                         inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
4252                 }
4253                 thresh += inter_gap;
4254         } else if (len <= maxseg) {
4255                 /*
4256                  * Possibly compensate for delayed-ack.
4257                  */
4258                 uint32_t alt_thresh;
4259
4260                 alt_thresh = srtt + (srtt / 2) + bbr_delayed_ack_time;
4261                 if (alt_thresh > thresh)
4262                         thresh = alt_thresh;
4263         }
4264         /* Not above the current  RTO */
4265         if (tp->t_srtt == 0)
4266                 t_rxtcur = BBR_INITIAL_RTO;
4267         else
4268                 t_rxtcur = TICKS_2_USEC(tp->t_rxtcur);
4269
4270         bbr_log_thresh_choice(bbr, cts, thresh, t_rxtcur, srtt, rsm, BBR_TO_FRM_TLP);
4271         /* Not above an RTO */
4272         if (thresh > t_rxtcur) {
4273                 thresh = t_rxtcur;
4274         }
4275         /* Not above a RTO max */
4276         if (thresh > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) {
4277                 thresh = (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND);
4278         }
4279         /* And now apply the user TLP min */
4280         if (thresh < bbr_tlp_min) {
4281                 thresh = bbr_tlp_min;
4282         }
4283         return (thresh);
4284 }
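/*
 * Worked example for bbr_calc_thresh_tlp() (values are illustrative):
 * with srtt = 10,000 usec, rc_tlp_threshold unset and a lone sub-mss
 * segment outstanding, thresh starts at 2 * srtt = 20,000 usec; the
 * delayed-ack alternative is 10,000 + 5,000 + bbr_delayed_ack_time
 * and wins only if it exceeds that.
 */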
4285
4286 /*
4287  * Return one of three RTTs to use (in microseconds).
4288  */
4289 static __inline uint32_t
4290 bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type)
4291 {
4292         uint32_t f_rtt;
4293         uint32_t srtt;
4294
4295         f_rtt = get_filter_value_small(&bbr->r_ctl.rc_rttprop);
4296         if (get_filter_value_small(&bbr->r_ctl.rc_rttprop) == 0xffffffff) {
4297                 /* We have no rtt at all */
4298                 if (bbr->rc_tp->t_srtt == 0)
4299                         f_rtt = BBR_INITIAL_RTO;
4300                 else
4301                         f_rtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT);
4302                 /*
4303                  * Since we don't know how good the rtt is apply a
4304                  * delayed-ack min
4305                  */
4306                 if (f_rtt < bbr_delayed_ack_time) {
4307                         f_rtt = bbr_delayed_ack_time;
4308                 }
4309         }
4310         /* Take the filter version or last measured pkt-rtt */
4311         if (rtt_type == BBR_RTT_PROP) {
4312                 srtt = f_rtt;
4313         } else if (rtt_type == BBR_RTT_PKTRTT) {
4314                 if (bbr->r_ctl.rc_pkt_epoch_rtt) {
4315                         srtt = bbr->r_ctl.rc_pkt_epoch_rtt;
4316                 } else {
4317                         /* No pkt rtt yet */
4318                         srtt = f_rtt;
4319                 }
4320         } else if (rtt_type == BBR_RTT_RACK) {
4321                 srtt = bbr->r_ctl.rc_last_rtt;
4322                 /* We need to add in any internal delay for our timer */
4323                 if (bbr->rc_ack_was_delayed)
4324                         srtt += bbr->r_ctl.rc_ack_hdwr_delay;
4325         } else if (rtt_type == BBR_SRTT) {
4326                 srtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT);
4327         } else {
4328                 /* TSNH */
4329                 srtt = f_rtt;
4330 #ifdef BBR_INVARIANTS
4331                 panic("Unknown rtt request type %d", rtt_type);
4332 #endif
4333         }
4334         return (srtt);
4335 }
4336
4337 static int
4338 bbr_is_lost(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t cts)
4339 {
4340         uint32_t thresh;
4341
4343         thresh = bbr_calc_thresh_rack(bbr, bbr_get_rtt(bbr, BBR_RTT_RACK),
4344                                       cts, rsm);
4345         if ((cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) >= thresh) {
4346                 /* It is lost (past time) */
4347                 return (1);
4348         }
4349         return (0);
4350 }
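/*
 * Illustrative case (example values): with a 13,000 usec threshold as
 * in the example above, a segment whose newest transmission is at
 * least 13,000 usec old is declared lost, while one sent 12,000 usec
 * ago is not.
 */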
4351
4352 /*
4353  * Return a sendmap if we need to retransmit something.
4354  */
4355 static struct bbr_sendmap *
4356 bbr_check_recovery_mode(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
4357 {
4358         /*
4359          * Check to see that we don't need to fall into recovery. We will
4360          * need to do so if our oldest transmit is past the time we should
4361          * have had an ack.
4362          */
4363
4364         struct bbr_sendmap *rsm;
4365         int32_t idx;
4366
4367         if (TAILQ_EMPTY(&bbr->r_ctl.rc_map)) {
4368                 /* Nothing outstanding that we know of */
4369                 return (NULL);
4370         }
4371         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
4372         if (rsm == NULL) {
4373                 /* Nothing in the transmit map */
4374                 return (NULL);
4375         }
4376         if (tp->t_flags & TF_SENTFIN) {
4377                 /* Fin restricted, don't find anything once a fin is sent */
4378                 return (NULL);
4379         }
4380         if (rsm->r_flags & BBR_ACKED) {
4381                 /*
4382                  * Ok the first one is acked (this really should not happen
4383                  * since we remove them from the tmap once they are acked).
4384                  */
4385                 rsm = bbr_find_lowest_rsm(bbr);
4386                 if (rsm == NULL)
4387                         return (NULL);
4388         }
4389         idx = rsm->r_rtr_cnt - 1;
4390         if (SEQ_LEQ(cts, rsm->r_tim_lastsent[idx])) {
4391                 /* Sent at or after the current time? Can't be ready. */
4392                 return (NULL);
4393         }
4394         /* Get our RTT time */
4395         if (bbr_is_lost(bbr, rsm, cts) &&
4396             ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
4397              (rsm->r_flags & BBR_SACK_PASSED))) {
4398                 if ((rsm->r_flags & BBR_MARKED_LOST) == 0) {
4399                         rsm->r_flags |= BBR_MARKED_LOST;
4400                         bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start;
4401                         bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start;
4402                 }
4403                 bbr_cong_signal(tp, NULL, CC_NDUPACK, rsm);
4404 #ifdef BBR_INVARIANTS
4405                 if ((rsm->r_end - rsm->r_start) == 0)
4406                         panic("tp:%p bbr:%p rsm:%p length is 0?", tp, bbr, rsm);
4407 #endif
4408                 return (rsm);
4409         }
4410         return (NULL);
4411 }
4412
4413 /*
4414  * RACK Timer, here we simply do logging and housekeeping.
4415  * The normal bbr_output_wtime() function will call the
4416  * appropriate thing to check if we need to do a RACK retransmit.
4417  * We return 1, saying don't proceed with bbr_output_wtime, only
4418  * when all timers have been stopped (destroyed PCB?).
4419  */
4420 static int
4421 bbr_timeout_rack(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
4422 {
4423         /*
4424          * This timer simply provides an internal trigger to send out data.
4425          * The check_recovery_mode call will see if there are needed
4426          * retransmissions, if so we will enter fast-recovery. The output
4427          * call may or may not do the same thing depending on sysctl
4428          * settings.
4429          */
4430         uint32_t lost;
4431
4432         if (bbr->rc_all_timers_stopped) {
4433                 return (1);
4434         }
4435         if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) {
4436                 /* It's not time yet */
4437                 return (0);
4438         }
4439         BBR_STAT_INC(bbr_to_tot);
4440         lost = bbr->r_ctl.rc_lost;
4441         if (bbr->r_state && (bbr->r_state != tp->t_state))
4442                 bbr_set_state(tp, bbr, 0);
4443         bbr_log_to_event(bbr, cts, BBR_TO_FRM_RACK);
4444         if (bbr->r_ctl.rc_resend == NULL) {
4445                 /* Lets do the check here */
4446                 bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts);
4447         }
4448         if (bbr_policer_call_from_rack_to)
4449                 bbr_lt_bw_sampling(bbr, cts, (bbr->r_ctl.rc_lost > lost));
4450         bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
4451         return (0);
4452 }
4453
4454 static __inline void
4455 bbr_clone_rsm(struct tcp_bbr *bbr, struct bbr_sendmap *nrsm, struct bbr_sendmap *rsm, uint32_t start)
4456 {
4457         int idx;
4458
4459         nrsm->r_start = start;
4460         nrsm->r_end = rsm->r_end;
4461         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
4462         nrsm->r_flags = rsm->r_flags;
4463         /* We don't transfer forward the SYN flag */
4464         nrsm->r_flags &= ~BBR_HAS_SYN;
4465         /* We move forward the FIN flag, not that this should happen */
4466         rsm->r_flags &= ~BBR_HAS_FIN;
4467         nrsm->r_dupack = rsm->r_dupack;
4468         nrsm->r_rtr_bytes = 0;
4469         nrsm->r_is_gain = rsm->r_is_gain;
4470         nrsm->r_is_drain = rsm->r_is_drain;
4471         nrsm->r_delivered = rsm->r_delivered;
4472         nrsm->r_ts_valid = rsm->r_ts_valid;
4473         nrsm->r_del_ack_ts = rsm->r_del_ack_ts;
4474         nrsm->r_del_time = rsm->r_del_time;
4475         nrsm->r_app_limited = rsm->r_app_limited;
4476         nrsm->r_first_sent_time = rsm->r_first_sent_time;
4477         nrsm->r_flight_at_send = rsm->r_flight_at_send;
4478         /* When we split a piece, the lower section loses any just_ret flag. */
4479         nrsm->r_bbr_state = rsm->r_bbr_state;
4480         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
4481                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
4482         }
4483         rsm->r_end = nrsm->r_start;
4484         idx = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs);
4485         idx /= 8;
4486         /* Check if we got too small */
4487         if ((rsm->r_is_smallmap == 0) &&
4488             ((rsm->r_end - rsm->r_start) <= idx)) {
4489                 bbr->r_ctl.rc_num_small_maps_alloced++;
4490                 rsm->r_is_smallmap = 1;
4491         }
4492         /* Check the new one as well */
4493         if ((nrsm->r_end - nrsm->r_start) <= idx) {
4494                 bbr->r_ctl.rc_num_small_maps_alloced++;
4495                 nrsm->r_is_smallmap = 1;
4496         }
4497 }
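/*
 * Illustrative split (sequence numbers are example values): cloning
 * rsm [1000, 4000) at start = 2448 leaves rsm covering [1000, 2448)
 * and nrsm covering [2448, 4000); nrsm inherits the retransmit
 * history and any FIN, while a SYN stays with the lower piece.
 */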
4498
4499 static int
4500 bbr_sack_mergable(struct bbr_sendmap *at,
4501                   uint32_t start, uint32_t end)
4502 {
4503         /*
4504          * Given a sack block defined by
4505          * start and end, and a current position
4506          * at, return 1 if either side of at
4507          * would show that the block is mergeable
4508          * to that side. To be mergeable, a block
4509          * must have overlap with the start/end
4510          * and be in the SACK'd state.
4511          */
4512         struct bbr_sendmap *l_rsm;
4513         struct bbr_sendmap *r_rsm;
4514
4515         /* first get the either side blocks */
4516         l_rsm = TAILQ_PREV(at, bbr_head, r_next);
4517         r_rsm = TAILQ_NEXT(at, r_next);
4518         if (l_rsm && (l_rsm->r_flags & BBR_ACKED)) {
4519                 /* Potentially mergeable */
4520                 if ((l_rsm->r_end == start) ||
4521                     (SEQ_LT(start, l_rsm->r_end) &&
4522                      SEQ_GT(end, l_rsm->r_end))) {
4523                             /*
4524                              * map blk   |------|
4525                              * sack blk         |------|
4526                              * <or>
4527                              * map blk   |------|
4528                              * sack blk      |------|
4529                              */
4530                             return (1);
4531                     }
4532         }
4533         if (r_rsm && (r_rsm->r_flags & BBR_ACKED)) {
4534                 /* Potentially mergeable */
4535                 if ((r_rsm->r_start == end) ||
4536                     (SEQ_LT(start, r_rsm->r_start) &&
4537                      SEQ_GT(end, r_rsm->r_start))) {
4538                         /*
4539                          * map blk          |---------|
4540                          * sack blk    |----|
4541                          * <or>
4542                          * map blk          |---------|
4543                          * sack blk    |-------|
4544                          */
4545                         return (1);
4546                 }
4547         }
4548         return (0);
4549 }
4550
4551 static struct bbr_sendmap *
4552 bbr_merge_rsm(struct tcp_bbr *bbr,
4553               struct bbr_sendmap *l_rsm,
4554               struct bbr_sendmap *r_rsm)
4555 {
4556         /*
4557          * We are merging two ack'd RSM's,
4558          * the l_rsm is on the left (lower seq
4559          * values) and the r_rsm is on the right
4560          * (higher seq value). The simplest way
4561          * to merge these is to move the right
4562          * one into the left. I don't think there
4563          * is any reason we need to try to find
4564          * the oldest (or last oldest retransmitted).
4565          */
4566         l_rsm->r_end = r_rsm->r_end;
4567         if (l_rsm->r_dupack < r_rsm->r_dupack)
4568                 l_rsm->r_dupack = r_rsm->r_dupack;
4569         if (r_rsm->r_rtr_bytes)
4570                 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
4571         if (r_rsm->r_in_tmap) {
4572                 /* This really should not happen */
4573                 TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, r_rsm, r_tnext);
4574         }
4575         if (r_rsm->r_app_limited)
4576                 l_rsm->r_app_limited = r_rsm->r_app_limited;
4577         /* Now the flags */
4578         if (r_rsm->r_flags & BBR_HAS_FIN)
4579                 l_rsm->r_flags |= BBR_HAS_FIN;
4580         if (r_rsm->r_flags & BBR_TLP)
4581                 l_rsm->r_flags |= BBR_TLP;
4582         if (r_rsm->r_flags & BBR_RWND_COLLAPSED)
4583                 l_rsm->r_flags |= BBR_RWND_COLLAPSED;
4584         if (r_rsm->r_flags & BBR_MARKED_LOST) {
4585                 /* This really should not happen */
4586                 bbr->r_ctl.rc_lost_bytes -= r_rsm->r_end - r_rsm->r_start;
4587         }
4588         TAILQ_REMOVE(&bbr->r_ctl.rc_map, r_rsm, r_next);
4589         if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
4590                 /* Transfer the split limit to the map we free */
4591                 r_rsm->r_limit_type = l_rsm->r_limit_type;
4592                 l_rsm->r_limit_type = 0;
4593         }
4594         bbr_free(bbr, r_rsm);
4595         return(l_rsm);
4596 }
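/*
 * Illustrative merge (sequence numbers are example values): merging
 * l_rsm [1000, 2448) with r_rsm [2448, 4000) grows l_rsm to
 * [1000, 4000), folds in r_rsm's dupack count, retransmitted bytes
 * and flags, and frees r_rsm.
 */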
4597
4598 /*
4599  * TLP Timer, here we simply setup what segment we want to
4600  * have the TLP expire on, the normal bbr_output_wtime() will then
4601  * send it out.
4602  *
4603  * We return 1, saying don't proceed with bbr_output_wtime, only
4604  * when all timers have been stopped (destroyed PCB?).
4605  */
4606 static int
4607 bbr_timeout_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
4608 {
4609         /*
4610          * Tail Loss Probe.
4611          */
4612         struct bbr_sendmap *rsm = NULL;
4613         struct socket *so;
4614         uint32_t amm;
4615         uint32_t out, avail;
4616         uint32_t maxseg;
4617         int collapsed_win = 0;
4618
4619         if (bbr->rc_all_timers_stopped) {
4620                 return (1);
4621         }
4622         if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) {
4623                 /* It's not time yet */
4624                 return (0);
4625         }
4626         if (ctf_progress_timeout_check(tp, true)) {
4627                 bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
4628                 tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
4629                 return (1);
4630         }
4631         /* Did we somehow get into persists? */
4632         if (bbr->rc_in_persist) {
4633                 return (0);
4634         }
4635         if (bbr->r_state && (bbr->r_state != tp->t_state))
4636                 bbr_set_state(tp, bbr, 0);
4637         BBR_STAT_INC(bbr_tlp_tot);
4638         maxseg = tp->t_maxseg - bbr->rc_last_options;
4639 #ifdef KERN_TLS
4640         if (bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
4641                 /*
4642                  * For hardware TLS we do *not* want to send
4643                  * new data.
4644                  */
4645                 goto need_retran;
4646         }
4647 #endif
4648         /*
4649          * A TLP timer has expired. We have been idle for 2 rtts. So we now
4650          * need to figure out how to force a full MSS segment out.
4651          */
4652         so = tp->t_inpcb->inp_socket;
4653         avail = sbavail(&so->so_snd);
4654         out = ctf_outstanding(tp);
4655         if (out > tp->snd_wnd) {
4656                 /* special case, we need a retransmission */
4657                 collapsed_win = 1;
4658                 goto need_retran;
4659         }
4660         if (avail > out) {
4661                 /* New data is available */
4662                 amm = avail - out;
4663                 if (amm > maxseg) {
4664                         amm = maxseg;
4665                 } else if ((amm < maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) {
4666                         /* not enough to fill a MTU and no-delay is off */
4667                         goto need_retran;
4668                 }
4669                 /* Set the send-new override */
4670                 if ((out + amm) <= tp->snd_wnd) {
4671                         bbr->rc_tlp_new_data = 1;
4672                 } else {
4673                         goto need_retran;
4674                 }
4675                 bbr->r_ctl.rc_tlp_seg_send_cnt = 0;
4676                 bbr->r_ctl.rc_last_tlp_seq = tp->snd_max;
4677                 bbr->r_ctl.rc_tlp_send = NULL;
4678                 /* cap any slots */
4679                 BBR_STAT_INC(bbr_tlp_newdata);
4680                 goto send;
4681         }
4682 need_retran:
4683         /*
4684          * Ok we need to arrange the last un-acked segment to be re-sent, or
4685          * optionally the first un-acked segment.
4686          */
4687         if (collapsed_win == 0) {
4688                 rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next);
4689                 if (rsm && (rsm->r_flags & (BBR_ACKED | BBR_HAS_FIN))) {
4690                         rsm = bbr_find_high_nonack(bbr, rsm);
4691                 }
4692                 if (rsm == NULL) {
4693                         goto restore;
4694                 }
4695         } else {
4696                 /*
4697                  * We must find the last segment
4698                  * that was acceptable to the client.
4699                  */
4700                 TAILQ_FOREACH_REVERSE(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) {
4701                         if ((rsm->r_flags & BBR_RWND_COLLAPSED) == 0) {
4702                                 /* Found one */
4703                                 break;
4704                         }
4705                 }
4706                 if (rsm == NULL) {
4707                         /* None? if so send the first */
4708                         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
4709                         if (rsm == NULL)
4710                                 goto restore;
4711                 }
4712         }
4713         if ((rsm->r_end - rsm->r_start) > maxseg) {
4714                 /*
4715                  * We need to split this last segment in two.
4716                  */
4717                 struct bbr_sendmap *nrsm;
4718
4719                 nrsm = bbr_alloc_full_limit(bbr);
4720                 if (nrsm == NULL) {
4721                         /*
4722                          * We can't get memory to split it; we can either
4723                          * just not split it, or retransmit the whole piece.
4724                          * Let's do the large send (BTLP :-) ).
4725                          */
4726                         goto go_for_it;
4727                 }
4728                 bbr_clone_rsm(bbr, nrsm, rsm, (rsm->r_end - maxseg));
4729                 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
4730                 if (rsm->r_in_tmap) {
4731                         TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
4732                         nrsm->r_in_tmap = 1;
4733                 }
4734                 rsm->r_flags &= (~BBR_HAS_FIN);
4735                 rsm = nrsm;
4736         }
4737 go_for_it:
4738         bbr->r_ctl.rc_tlp_send = rsm;
4739         bbr->rc_tlp_rtx_out = 1;
4740         if (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq) {
4741                 bbr->r_ctl.rc_tlp_seg_send_cnt++;
4742                 tp->t_rxtshift++;
4743         } else {
4744                 bbr->r_ctl.rc_last_tlp_seq = rsm->r_start;
4745                 bbr->r_ctl.rc_tlp_seg_send_cnt = 1;
4746         }
4747 send:
4748         if (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend) {
4749                 /*
4750                  * Can't [re]transmit a segment we have retransmitted the
4751                  * max times. We need the retransmit timer to take over.
4752                  */
4753 restore:
4754                 bbr->rc_tlp_new_data = 0;
4755                 bbr->r_ctl.rc_tlp_send = NULL;
4756                 if (rsm)
4757                         rsm->r_flags &= ~BBR_TLP;
4758                 BBR_STAT_INC(bbr_tlp_retran_fail);
4759                 return (0);
4760         } else if (rsm) {
4761                 rsm->r_flags |= BBR_TLP;
4762         }
4763         if (rsm && (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq) &&
4764             (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend)) {
4765                 /*
4766                  * We have retransmitted too many times for TLP. Switch to
4767                  * the regular RTO timer.
4768                  */
4769                 goto restore;
4770         }
4771         bbr_log_to_event(bbr, cts, BBR_TO_FRM_TLP);
4772         bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
4773         return (0);
4774 }
4775
4776 /*
4777  * Delayed ack Timer, here we simply need to set the
4778  * TF_ACKNOW flag and remove the TF_DELACK flag. From there
4779  * the output routine will send the ack out.
4780  *
4781  * We only return 1, saying don't proceed, if all timers
4782  * are stopped (destroyed PCB?).
4783  */
4784 static int
4785 bbr_timeout_delack(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
4786 {
4787         if (bbr->rc_all_timers_stopped) {
4788                 return (1);
4789         }
4790         bbr_log_to_event(bbr, cts, BBR_TO_FRM_DELACK);
4791         tp->t_flags &= ~TF_DELACK;
4792         tp->t_flags |= TF_ACKNOW;
4793         KMOD_TCPSTAT_INC(tcps_delack);
4794         bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
4795         return (0);
4796 }
4797
4798 /*
4799  * Here we send a KEEP-ALIVE like probe to the
4800  * peer, we do not send data.
4801  *
4802  * We only return 1, saying don't proceed, if all timers
4803  * are stopped (destroyed PCB?).
4804  */
4805 static int
4806 bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
4807 {
4808         struct tcptemp *t_template;
4809         int32_t retval = 1;
4810
4811         if (bbr->rc_all_timers_stopped) {
4812                 return (1);
4813         }
4814         if (bbr->rc_in_persist == 0)
4815                 return (0);
4816         KASSERT(tp->t_inpcb != NULL,
4817             ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
4818         /*
4819          * Persistence timer into zero window. Force a byte to be output, if
4820          * possible.
4821          */
4822         bbr_log_to_event(bbr, cts, BBR_TO_FRM_PERSIST);
4823         bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
4824         KMOD_TCPSTAT_INC(tcps_persisttimeo);
4825         /*
4826          * Have we exceeded the user specified progress time?
4827          */
4828         if (ctf_progress_timeout_check(tp, true)) {
4829                 bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
4830                 tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
4831                 goto out;
4832         }
4833         /*
4834          * Hack: if the peer is dead/unreachable, we do not time out if the
4835          * window is closed.  After a full backoff, drop the connection if
4836          * the idle time (no responses to probes) reaches the maximum
4837          * backoff that we would use if retransmitting.
4838          */
4839         if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
4840             (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
4841             ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
4842                 KMOD_TCPSTAT_INC(tcps_persistdrop);
4843                 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
4844                 tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
4845                 goto out;
4846         }
4847         if ((sbavail(&bbr->rc_inp->inp_socket->so_snd) == 0) &&
4848             tp->snd_una == tp->snd_max) {
4849                 bbr_exit_persist(tp, bbr, cts, __LINE__);
4850                 retval = 0;
4851                 goto out;
4852         }
4853         /*
4854          * If the user has closed the socket then drop a persisting
4855          * connection after a much reduced timeout.
4856          */
4857         if (tp->t_state > TCPS_CLOSE_WAIT &&
4858             (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
4859                 KMOD_TCPSTAT_INC(tcps_persistdrop);
4860                 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
4861                 tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
4862                 goto out;
4863         }
4864         t_template = tcpip_maketemplate(bbr->rc_inp);
4865         if (t_template) {
4866                 tcp_respond(tp, t_template->tt_ipgen,
4867                             &t_template->tt_t, (struct mbuf *)NULL,
4868                             tp->rcv_nxt, tp->snd_una - 1, 0);
4869                 /* This sends an ack */
4870                 if (tp->t_flags & TF_DELACK)
4871                         tp->t_flags &= ~TF_DELACK;
4872                 free(t_template, M_TEMP);
4873         }
4874         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
4875                 tp->t_rxtshift++;
4876         bbr_start_hpts_timer(bbr, tp, cts, 3, 0, 0);
4877 out:
4878         return (retval);
4879 }
4880
4881 /*
4882  * If a keepalive goes off, we had no other timers
4883  * happening. We always return 1 here since this
4884  * routine either drops the connection or sends
4885  * out a segment via tcp_respond().
4886  */
4887 static int
4888 bbr_timeout_keepalive(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
4889 {
4890         struct tcptemp *t_template;
4891         struct inpcb *inp;
4892
4893         if (bbr->rc_all_timers_stopped) {
4894                 return (1);
4895         }
4896         bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
4897         inp = tp->t_inpcb;
4898         bbr_log_to_event(bbr, cts, BBR_TO_FRM_KEEP);
4899         /*
4900          * Keep-alive timer went off; send something or drop connection if
4901          * idle for too long.
4902          */
4903         KMOD_TCPSTAT_INC(tcps_keeptimeo);
4904         if (tp->t_state < TCPS_ESTABLISHED)
4905                 goto dropit;
4906         if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
4907             tp->t_state <= TCPS_CLOSING) {
4908                 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
4909                         goto dropit;
4910                 /*
4911                  * Send a packet designed to force a response if the peer is
4912                  * up and reachable: either an ACK if the connection is
4913                  * still alive, or an RST if the peer has closed the
4914                  * connection due to timeout or reboot. Using sequence
4915                  * number tp->snd_una-1 causes the transmitted zero-length
4916                  * segment to lie outside the receive window; by the
4917                  * protocol spec, this requires the correspondent TCP to
4918                  * respond.
4919                  */
4920                 KMOD_TCPSTAT_INC(tcps_keepprobe);
4921                 t_template = tcpip_maketemplate(inp);
4922                 if (t_template) {
4923                         tcp_respond(tp, t_template->tt_ipgen,
4924                             &t_template->tt_t, (struct mbuf *)NULL,
4925                             tp->rcv_nxt, tp->snd_una - 1, 0);
4926                         free(t_template, M_TEMP);
4927                 }
4928         }
4929         bbr_start_hpts_timer(bbr, tp, cts, 4, 0, 0);
4930         return (1);
4931 dropit:
4932         KMOD_TCPSTAT_INC(tcps_keepdrops);
4933         tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
4934         tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
4935         return (1);
4936 }
4937
4938 /*
4939  * Retransmit helper function, clear up all the ack
4940  * flags and take care of important bookkeeping.
4941  */
4942 static void
4943 bbr_remxt_tmr(struct tcpcb *tp)
4944 {
4945         /*
4946          * The retransmit timer went off, all sack'd blocks must be
4947          * un-acked.
4948          */
4949         struct bbr_sendmap *rsm, *trsm = NULL;
4950         struct tcp_bbr *bbr;
4951         uint32_t cts, lost;
4952
4953         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
4954         cts = tcp_get_usecs(&bbr->rc_tv);
4955         lost = bbr->r_ctl.rc_lost;
4956         if (bbr->r_state && (bbr->r_state != tp->t_state))
4957                 bbr_set_state(tp, bbr, 0);
4958
4959         TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
4960                 if (rsm->r_flags & BBR_ACKED) {
4961                         uint32_t old_flags;
4962
4963                         rsm->r_dupack = 0;
4964                         if (rsm->r_in_tmap == 0) {
4965                                 /* We must re-add it back to the tlist */
4966                                 if (trsm == NULL) {
4967                                         TAILQ_INSERT_HEAD(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
4968                                 } else {
4969                                         TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, trsm, rsm, r_tnext);
4970                                 }
4971                                 rsm->r_in_tmap = 1;
4972                         }
4973                         old_flags = rsm->r_flags;
4974                         rsm->r_flags |= BBR_RXT_CLEARED;
4975                         rsm->r_flags &= ~(BBR_ACKED | BBR_SACK_PASSED | BBR_WAS_SACKPASS);
4976                         bbr_log_type_rsmclear(bbr, cts, rsm, old_flags, __LINE__);
4977                 } else {
4978                         if ((tp->t_state < TCPS_ESTABLISHED) &&
4979                             (rsm->r_start == tp->snd_una)) {
4980                                 /*
4981                                  * Special case for TCP Fast Open, where
4982                                  * we sent more data beyond the snd_max.
4983                                  * We don't mark that as lost; stop here.
4984                                  */
4985                                 break;
4986                         }
4987                         if ((rsm->r_flags & BBR_MARKED_LOST) == 0) {
4988                                 bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start;
4989                                 bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start;
4990                         }
4991                         if (bbr_marks_rxt_sack_passed) {
4992                                 /*
4993                                  * With this option, we will rack out
4994                                  * the rest of the packets in 1ms increments.
4995                                  */
4996                                 rsm->r_flags |= BBR_SACK_PASSED | BBR_MARKED_LOST;
4997                                 rsm->r_flags &= ~BBR_WAS_SACKPASS;
4998                         } else {
4999                                 /*
5000                                  * With this option we only mark them lost
5001                                  * and remove all sack'd markings. We will run
5002                                  * another RXT or a TLP. This will cause
5003                                  * us to eventually send more based on what
5004                                  * ACKs come in.
5005                                  */
5006                                 rsm->r_flags |= BBR_MARKED_LOST;
5007                                 rsm->r_flags &= ~BBR_WAS_SACKPASS;
5008                                 rsm->r_flags &= ~BBR_SACK_PASSED;
5009                         }
5010                 }
5011                 trsm = rsm;
5012         }
5013         bbr->r_ctl.rc_resend = TAILQ_FIRST(&bbr->r_ctl.rc_map);
5014         /* Clear the count (we just un-acked them) */
5015         bbr_log_to_event(bbr, cts, BBR_TO_FRM_TMR);
5016         bbr->rc_tlp_new_data = 0;
5017         bbr->r_ctl.rc_tlp_seg_send_cnt = 0;
5018         /* zap the accumulated pacing delay ("behindness") on an RXT */
5019         bbr->r_ctl.rc_hptsi_agg_delay = 0;
5020         bbr->r_agg_early_set = 0;
5021         bbr->r_ctl.rc_agg_early = 0;
5022         bbr->rc_tlp_rtx_out = 0;
5023         bbr->r_ctl.rc_sacked = 0;
5024         bbr->r_ctl.rc_sacklast = NULL;
5025         bbr->r_timer_override = 1;
5026         bbr_lt_bw_sampling(bbr, cts, (bbr->r_ctl.rc_lost > lost));
5027 }
5028
5029 /*
5030  * Re-transmit timeout! If we drop the PCB we will return 1, otherwise
5031  * we will set up to retransmit the lowest seq number outstanding.
5032  */
5033 static int
5034 bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
5035 {
5036         int32_t rexmt;
5037         int32_t retval = 0;
5038         bool isipv6;
5039
5040         bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
5041         if (bbr->rc_all_timers_stopped) {
5042                 return (1);
5043         }
5044         if (TCPS_HAVEESTABLISHED(tp->t_state) &&
5045             (tp->snd_una == tp->snd_max)) {
5046                 /* Nothing outstanding .. nothing to do */
5047                 return (0);
5048         }
5049         /*
5050          * Retransmission timer went off.  Message has not been acked within
5051          * retransmit interval.  Back off to a longer retransmit interval
5052          * and retransmit one segment.
5053          */
5054         if (ctf_progress_timeout_check(tp, true)) {
5055                 retval = 1;
5056                 bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
5057                 tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
5058                 goto out;
5059         }
5060         bbr_remxt_tmr(tp);
5061         if ((bbr->r_ctl.rc_resend == NULL) ||
5062             ((bbr->r_ctl.rc_resend->r_flags & BBR_RWND_COLLAPSED) == 0)) {
5063                 /*
5064                  * If the rwnd collapsed on
5065                  * the segment we are retransmitting,
5066                  * it does not count against the
5067                  * rxt count.
5068                  */
5069                 tp->t_rxtshift++;
5070         }
5071         if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
5072                 tp->t_rxtshift = TCP_MAXRXTSHIFT;
5073                 KMOD_TCPSTAT_INC(tcps_timeoutdrop);
5074                 retval = 1;
5075                 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
5076                 tcp_set_inp_to_drop(bbr->rc_inp,
5077                     (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
5078                 goto out;
5079         }
5080         if (tp->t_state == TCPS_SYN_SENT) {
5081                 /*
5082                  * If the SYN was retransmitted, indicate CWND to be limited
5083                  * to 1 segment in cc_conn_init().
5084                  */
5085                 tp->snd_cwnd = 1;
5086         } else if (tp->t_rxtshift == 1) {
5087                 /*
5088                  * first retransmit; record ssthresh and cwnd so they can be
5089                  * recovered if this turns out to be a "bad" retransmit. A
5090                  * retransmit is considered "bad" if an ACK for this segment
5091                  * is received within RTT/2 interval; the assumption here is
5092                  * that the ACK was already in flight.  See "On Estimating
5093                  * End-to-End Network Path Properties" by Allman and Paxson
5094                  * for more details.
5095                  */
5096
5097                 if (!IN_RECOVERY(tp->t_flags)) {
5098                         tp->snd_cwnd_prev = tp->snd_cwnd;
5099                         tp->snd_ssthresh_prev = tp->snd_ssthresh;
5100                         tp->snd_recover_prev = tp->snd_recover;
5101                         tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
5102                         tp->t_flags |= TF_PREVVALID;
5103                 } else {
5104                         tp->t_flags &= ~TF_PREVVALID;
5105                 }
5106                 tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options;
5107         } else {
5108                 tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options;
5109                 tp->t_flags &= ~TF_PREVVALID;
5110         }
5111         KMOD_TCPSTAT_INC(tcps_rexmttimeo);
5112         if ((tp->t_state == TCPS_SYN_SENT) ||
5113             (tp->t_state == TCPS_SYN_RECEIVED))
5114                 rexmt = USEC_2_TICKS(BBR_INITIAL_RTO) * tcp_backoff[tp->t_rxtshift];
5115         else
5116                 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
5117         TCPT_RANGESET(tp->t_rxtcur, rexmt,
5118             MSEC_2_TICKS(bbr->r_ctl.rc_min_rto_ms),
5119             MSEC_2_TICKS(((uint32_t)bbr->rc_max_rto_sec) * 1000));
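        /*
         * TCPT_RANGESET() clamps the backed-off value into the window
         * [min, max], i.e. t_rxtcur = MAX(min, MIN(rexmt, max)), with
         * the per-connection RTO bounds converted from msec to ticks
         * above.
         */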
5120         /*
5121          * We enter the path for PLPMTUD if the connection is established or
5122          * in FIN_WAIT_1 state; the reason for the latter is that if the
5123          * amount of data we send is very small, we could send it in a couple
5124          * of packets and proceed straight to FIN. In that case we won't
5125          * catch the ESTABLISHED state.
5126          */
5127 #ifdef INET6
5128         isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false;
5129 #else
5130         isipv6 = false;
5131 #endif
5132         if (((V_tcp_pmtud_blackhole_detect == 1) ||
5133             (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
5134             (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
5135             ((tp->t_state == TCPS_ESTABLISHED) ||
5136             (tp->t_state == TCPS_FIN_WAIT_1))) {
5137
5138                 /*
5139                  * The idea here is that each stage of the mtu probe (usually
5140                  * 1448 -> 1188 -> 524) should be given 2 chances to recover
5141                  * before clamping down further. 'tp->t_rxtshift % 2 == 0'
5142                  * should take care of that.
5143                  */
5144                 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
5145                     (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
5146                     (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
5147                     tp->t_rxtshift % 2 == 0)) {
5148                         /*
5149                          * Enter the Path MTU Black-hole Detection mechanism:
5150                          * - Disable Path MTU Discovery (IP "DF" bit).
5151                          * - Reduce the MTU to a lower value than what we
5152                          *   negotiated with the peer.
5153                          */
5154                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
5155                                 /*
5156                                  * Record that we may have found a black
5157                                  * hole.
5158                                  */
5159                                 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
5160                                 /* Keep track of previous MSS. */
5161                                 tp->t_pmtud_saved_maxseg = tp->t_maxseg;
5162                         }
5163                         /*
5164                          * Reduce the MSS to the blackhole value or to the
5165                          * default in an attempt to retransmit.
5166                          */
5167 #ifdef INET6
5168                         isipv6 = bbr->r_is_v6;
5169                         if (isipv6 &&
5170                             tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
5171                                 /* Use the sysctl tuneable blackhole MSS. */
5172                                 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
5173                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
5174                         } else if (isipv6) {
5175                                 /* Use the default MSS. */
5176                                 tp->t_maxseg = V_tcp_v6mssdflt;
5177                                 /*
5178                                  * Disable Path MTU Discovery when we switch
5179                                  * to minmss.
5180                                  */
5181                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
5182                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
5183                         }
5184 #endif
5185 #if defined(INET6) && defined(INET)
5186                         else
5187 #endif
5188 #ifdef INET
5189                         if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
5190                                 /* Use the sysctl tuneable blackhole MSS. */
5191                                 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
5192                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
5193                         } else {
5194                                 /* Use the default MSS. */
5195                                 tp->t_maxseg = V_tcp_mssdflt;
5196                                 /*
5197                                  * Disable Path MTU Discovery when we switch
5198                                  * to minmss.
5199                                  */
5200                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
5201                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
5202                         }
5203 #endif
5204                 } else {
5205                         /*
5206                          * If further retransmissions are still unsuccessful
5207                          * with a lowered MTU, maybe this isn't a blackhole
5208                          * and we restore the previous MSS and blackhole
5209                          * detection flags. The limit '6' is determined by
5210                          * giving each probe stage (1448, 1188, 524) 2
5211                          * chances to recover.
5212                          */
5213                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
5214                             (tp->t_rxtshift >= 6)) {
5215                                 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
5216                                 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
5217                                 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
5218                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed);
5219                         }
5220                 }
5221         }
5222         /*
5223          * Disable RFC1323 and SACK if we haven't got any response to our
5224  * third SYN, to work around some broken terminal servers (most of
5225          * which have hopefully been retired) that have bad VJ header
5226          * compression code which trashes TCP segments containing
5227          * unknown-to-them TCP options.
5228          */
5229         if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
5230             (tp->t_rxtshift == 3))
5231                 tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT);
5232         /*
5233          * If we backed off this far, our srtt estimate is probably bogus.
5234          * Clobber it so we'll take the next rtt measurement as our srtt;
5235          * move the current srtt into rttvar to keep the current retransmit
5236          * times until then.
5237          */
5238         if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
5239 #ifdef INET6
5240                 if (bbr->r_is_v6)
5241                         in6_losing(tp->t_inpcb);
5242                 else
5243 #endif
5244                         in_losing(tp->t_inpcb);
5245                 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
5246                 tp->t_srtt = 0;
5247         }
5248         sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una);
5249         tp->snd_recover = tp->snd_max;
5250         tp->t_flags |= TF_ACKNOW;
5251         tp->t_rtttime = 0;
5252 out:
5253         return (retval);
5254 }
5255
5256 static int
5257 bbr_process_timers(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, uint8_t hpts_calling)
5258 {
5259         int32_t ret = 0;
5260         int32_t timers = (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
5261
5262         if (timers == 0) {
5263                 return (0);
5264         }
5265         if (tp->t_state == TCPS_LISTEN) {
5266                 /* no timers on listen sockets */
5267                 if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
5268                         return (0);
5269                 return (1);
5270         }
5271         if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) {
5272                 uint32_t left;
5273
5274                 if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
5275                         ret = -1;
5276                         bbr_log_to_processing(bbr, cts, ret, 0, hpts_calling);
5277                         return (0);
5278                 }
5279                 if (hpts_calling == 0) {
5280                         ret = -2;
5281                         bbr_log_to_processing(bbr, cts, ret, 0, hpts_calling);
5282                         return (0);
5283                 }
5284                 /*
5285                  * Ok, our timer went off early and we are not paced; false
5286                  * alarm, go back to sleep.
5287                  */
5288                 left = bbr->r_ctl.rc_timer_exp - cts;
5289                 ret = -3;
5290                 bbr_log_to_processing(bbr, cts, ret, left, hpts_calling);
5291                 tcp_hpts_insert(tp->t_inpcb, HPTS_USEC_TO_SLOTS(left));
5292                 return (1);
5293         }
5294         bbr->rc_tmr_stopped = 0;
5295         bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
5296         if (timers & PACE_TMR_DELACK) {
5297                 ret = bbr_timeout_delack(tp, bbr, cts);
5298         } else if (timers & PACE_TMR_PERSIT) {
5299                 ret = bbr_timeout_persist(tp, bbr, cts);
5300         } else if (timers & PACE_TMR_RACK) {
5301                 bbr->r_ctl.rc_tlp_rxt_last_time = cts;
5302                 ret = bbr_timeout_rack(tp, bbr, cts);
5303         } else if (timers & PACE_TMR_TLP) {
5304                 bbr->r_ctl.rc_tlp_rxt_last_time = cts;
5305                 ret = bbr_timeout_tlp(tp, bbr, cts);
5306         } else if (timers & PACE_TMR_RXT) {
5307                 bbr->r_ctl.rc_tlp_rxt_last_time = cts;
5308                 ret = bbr_timeout_rxt(tp, bbr, cts);
5309         } else if (timers & PACE_TMR_KEEP) {
5310                 ret = bbr_timeout_keepalive(tp, bbr, cts);
5311         }
5312         bbr_log_to_processing(bbr, cts, ret, timers, hpts_calling);
5313         return (ret);
5314 }
5315
5316 static void
5317 bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts)
5318 {
5319         if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
5320                 uint8_t hpts_removed = 0;
5321
5322                 if (bbr->rc_inp->inp_in_hpts &&
5323                     (bbr->rc_timer_first == 1)) {
5324                         /*
5325                          * If we are canceling timers when the
5326                          * timer is ahead of the output being paced, we
5327                          * must also remove ourselves from the hpts.
5328                          */
5329                         hpts_removed = 1;
5330                         tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT);
5331                         if (bbr->r_ctl.rc_last_delay_val) {
5332                                 /* Update the last hptsi delay too */
5333                                 uint32_t time_since_send;
5334
5335                                 if (TSTMP_GT(cts, bbr->rc_pacer_started))
5336                                         time_since_send = cts - bbr->rc_pacer_started;
5337                                 else
5338                                         time_since_send = 0;
5339                                 if (bbr->r_ctl.rc_last_delay_val > time_since_send) {
5340                                         /* Cut down our slot time */
5341                                         bbr->r_ctl.rc_last_delay_val -= time_since_send;
5342                                 } else {
5343                                         bbr->r_ctl.rc_last_delay_val = 0;
5344                                 }
5345                                 bbr->rc_pacer_started = cts;
5346                         }
5347                 }
5348                 bbr->rc_timer_first = 0;
5349                 bbr_log_to_cancel(bbr, line, cts, hpts_removed);
5350                 bbr->rc_tmr_stopped = bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
5351                 bbr->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
5352         }
5353 }
5354
5355 static void
5356 bbr_timer_stop(struct tcpcb *tp, uint32_t timer_type)
5357 {
5358         struct tcp_bbr *bbr;
5359
5360         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
5361         bbr->rc_all_timers_stopped = 1;
5362         return;
5363 }
5364
5365 /*
5366  * Stop all timers; always returns 0.
5367  */
5368 static int
5369 bbr_stopall(struct tcpcb *tp)
5370 {
5371         return (0);
5372 }
5373
5374 static void
5375 bbr_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
5376 {
5377         return;
5378 }
5379
5380 /*
5381  * return true if a bbr timer (rack or tlp) is active.
5382  */
5383 static int
5384 bbr_timer_active(struct tcpcb *tp, uint32_t timer_type)
5385 {
5386         return (0);
5387 }
5388
5389 static uint32_t
5390 bbr_get_earliest_send_outstanding(struct tcp_bbr *bbr, struct bbr_sendmap *u_rsm, uint32_t cts)
5391 {
5392         struct bbr_sendmap *rsm;
5393
5394         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
5395         if ((rsm == NULL) || (u_rsm == rsm))
5396                 return (cts);
5397         return(rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]);
5398 }
5399
5400 static void
5401 bbr_update_rsm(struct tcpcb *tp, struct tcp_bbr *bbr,
5402      struct bbr_sendmap *rsm, uint32_t cts, uint32_t pacing_time)
5403 {
5404         int32_t idx;
5405
5406         rsm->r_rtr_cnt++;
5407         rsm->r_dupack = 0;
5408         if (rsm->r_rtr_cnt > BBR_NUM_OF_RETRANS) {
5409                 rsm->r_rtr_cnt = BBR_NUM_OF_RETRANS;
5410                 rsm->r_flags |= BBR_OVERMAX;
5411         }
5412         if (rsm->r_flags & BBR_RWND_COLLAPSED) {
5413                 /* Take off the collapsed flag at rxt */
5414                 rsm->r_flags &= ~BBR_RWND_COLLAPSED;
5415         }
5416         if (rsm->r_flags & BBR_MARKED_LOST) {
5417                 /* We have retransmitted, it's no longer lost */
5418                 rsm->r_flags &= ~BBR_MARKED_LOST;
5419                 bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
5420         }
5421         if (rsm->r_flags & BBR_RXT_CLEARED) {
5422                 /*
5423                  * We hit an RXT timer on it and
5424                  * we cleared the "acked" flag.
5425                  * We now have it going back into
5426                  * flight, we can remove the cleared
5427                  * flag and possibly do accounting on
5428                  * this piece.
5429                  */
5430                 rsm->r_flags &= ~BBR_RXT_CLEARED;
5431         }
5432         if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & BBR_TLP) == 0)) {
5433                 bbr->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
5434                 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
5435         }
5436         idx = rsm->r_rtr_cnt - 1;
5437         rsm->r_tim_lastsent[idx] = cts;
5438         rsm->r_pacing_delay = pacing_time;
5439         rsm->r_delivered = bbr->r_ctl.rc_delivered;
5440         rsm->r_ts_valid = bbr->rc_ts_valid;
5441         if (bbr->rc_ts_valid)
5442                 rsm->r_del_ack_ts = bbr->r_ctl.last_inbound_ts;
5443         if (bbr->r_ctl.r_app_limited_until)
5444                 rsm->r_app_limited = 1;
5445         else
5446                 rsm->r_app_limited = 0;
5447         if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW)
5448                 rsm->r_bbr_state = bbr_state_val(bbr);
5449         else
5450                 rsm->r_bbr_state = 8;
5451         if (rsm->r_flags & BBR_ACKED) {
5452                 /* Probably MTU discovery messing with us */
5453                 uint32_t old_flags;
5454
5455                 old_flags = rsm->r_flags;
5456                 rsm->r_flags &= ~BBR_ACKED;
5457                 bbr_log_type_rsmclear(bbr, cts, rsm, old_flags, __LINE__);
5458                 bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
5459                 if (bbr->r_ctl.rc_sacked == 0)
5460                         bbr->r_ctl.rc_sacklast = NULL;
5461         }
5462         if (rsm->r_in_tmap) {
5463                 TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
5464         }
5465         TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
5466         rsm->r_in_tmap = 1;
5467         if (rsm->r_flags & BBR_SACK_PASSED) {
5468                 /* We have retransmitted due to the SACK pass */
5469                 rsm->r_flags &= ~BBR_SACK_PASSED;
5470                 rsm->r_flags |= BBR_WAS_SACKPASS;
5471         }
5472         rsm->r_first_sent_time = bbr_get_earliest_send_outstanding(bbr, rsm, cts);
5473         rsm->r_flight_at_send = ctf_flight_size(bbr->rc_tp,
5474                                                 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
5475         bbr->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
5476         if (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT) {
5477                 rsm->r_is_gain = 1;
5478                 rsm->r_is_drain = 0;
5479         } else if (bbr->r_ctl.rc_bbr_hptsi_gain < BBR_UNIT) {
5480                 rsm->r_is_drain = 1;
5481                 rsm->r_is_gain = 0;
5482         } else {
5483                 rsm->r_is_drain = 0;
5484                 rsm->r_is_gain = 0;
5485         }
5486         rsm->r_del_time = bbr->r_ctl.rc_del_time; /* TEMP GOOGLE CODE */
5487 }
5488
5489 /*
5490  * Returns 0, or the sequence where we stopped
5491  * updating. We also update the lenp to be the amount
5492  * of data left.
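 *
 * For example (illustrative numbers only): if the rsm covers
 * sequences [100, 200) and we (re-)transmitted 150 bytes starting at
 * 100, we return 200 and set *lenp to 50, the amount that slopped
 * over into the next rsm.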
5493  */
5494
5495 static uint32_t
5496 bbr_update_entry(struct tcpcb *tp, struct tcp_bbr *bbr,
5497     struct bbr_sendmap *rsm, uint32_t cts, int32_t *lenp, uint32_t pacing_time)
5498 {
5499         /*
5500          * We (re-)transmitted starting at rsm->r_start for some length
5501          * (possibly less than r_end).
5502          */
5503         struct bbr_sendmap *nrsm;
5504         uint32_t c_end;
5505         int32_t len;
5506
5507         len = *lenp;
5508         c_end = rsm->r_start + len;
5509         if (SEQ_GEQ(c_end, rsm->r_end)) {
5510                 /*
5511                  * We retransmitted the whole piece or more than the whole
5512                  * piece, slopping into the next rsm.
5513                  */
5514                 bbr_update_rsm(tp, bbr, rsm, cts, pacing_time);
5515                 if (c_end == rsm->r_end) {
5516                         *lenp = 0;
5517                         return (0);
5518                 } else {
5519                         int32_t act_len;
5520
5521                         /* Hangs over the end; return what's left */
5522                         act_len = rsm->r_end - rsm->r_start;
5523                         *lenp = (len - act_len);
5524                         return (rsm->r_end);
5525                 }
5526                 /* We don't get out of this block. */
5527         }
5528         /*
5529          * Here we retransmitted less than the whole thing which means we
5530          * have to split this into what was transmitted and what was not.
5531          */
5532         nrsm = bbr_alloc_full_limit(bbr);
5533         if (nrsm == NULL) {
5534                 *lenp = 0;
5535                 return (0);
5536         }
5537         /*
5538          * So here we are going to take the original rsm and make it what we
5539          * retransmitted. nrsm will be the tail portion we did not
5540          * retransmit. For example, say the chunk was 1, 11 (10 bytes) and
5541          * we retransmitted 5 bytes, i.e. 1, 5. The original piece shrinks to
5542          * 1, 6 and the new piece will be 6, 11.
5543          */
5544         bbr_clone_rsm(bbr, nrsm, rsm, c_end);
5545         TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
5546         nrsm->r_dupack = 0;
5547         if (rsm->r_in_tmap) {
5548                 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
5549                 nrsm->r_in_tmap = 1;
5550         }
5551         rsm->r_flags &= (~BBR_HAS_FIN);
5552         bbr_update_rsm(tp, bbr, rsm, cts, pacing_time);
5553         *lenp = 0;
5554         return (0);
5555 }
5556
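/*
 * Compute the rate we would ask the hardware to pace at: the current
 * b/w estimate scaled by the sub-gain. For example (illustrative
 * numbers only, assuming bbr_hptsi_gain[BBR_SUB_GAIN] encodes 75% of
 * BBR_UNIT): a measured 100 Mbps delivery rate would request a
 * 75 Mbps hardware rate.
 */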
5557 static uint64_t
5558 bbr_get_hardware_rate(struct tcp_bbr *bbr)
5559 {
5560         uint64_t bw;
5561
5562         bw = bbr_get_bw(bbr);
5563         bw *= (uint64_t)bbr_hptsi_gain[BBR_SUB_GAIN];
5564         bw /= (uint64_t)BBR_UNIT;
5565         return(bw);
5566 }
5567
5568 static void
5569 bbr_setup_less_of_rate(struct tcp_bbr *bbr, uint32_t cts,
5570                        uint64_t act_rate, uint64_t rate_wanted)
5571 {
5572         /*
5573          * We could not get a full gain's worth
5574          * of rate.
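         *
         * For example (illustrative numbers only): if the delivery-rate
         * filter holds 10 Mbps but the hardware only granted 8 Mbps, we
         * reduce the filter by the 2 Mbps difference and skip the gain
         * entirely.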
5575          */
5576         if (get_filter_value(&bbr->r_ctl.rc_delrate) >= act_rate) {
5577                 /* we can't even get the real rate */
5578                 uint64_t red;
5579
5580                 bbr->skip_gain = 1;
5581                 bbr->gain_is_limited = 0;
5582                 red = get_filter_value(&bbr->r_ctl.rc_delrate) - act_rate;
5583                 if (red)
5584                         filter_reduce_by(&bbr->r_ctl.rc_delrate, red, cts);
5585         } else {
5586                 /* We can use a lower gain */
5587                 bbr->skip_gain = 0;
5588                 bbr->gain_is_limited = 1;
5589         }
5590 }
5591
5592 static void
5593 bbr_update_hardware_pacing_rate(struct tcp_bbr *bbr, uint32_t cts)
5594 {
5595         const struct tcp_hwrate_limit_table *nrte;
5596         int error, rate = -1;
5597
5598         if (bbr->r_ctl.crte == NULL)
5599                 return;
5600         if ((bbr->rc_inp->inp_route.ro_nh == NULL) ||
5601             (bbr->rc_inp->inp_route.ro_nh->nh_ifp == NULL)) {
5602                 /* Lost our routes? */
5603                 /* Clear the way for a re-attempt */
5604                 bbr->bbr_attempt_hdwr_pace = 0;
5605 lost_rate:
5606                 bbr->gain_is_limited = 0;
5607                 bbr->skip_gain = 0;
5608                 bbr->bbr_hdrw_pacing = 0;
5609                 counter_u64_add(bbr_flows_whdwr_pacing, -1);
5610                 counter_u64_add(bbr_flows_nohdwr_pacing, 1);
5611                 tcp_bbr_tso_size_check(bbr, cts);
5612                 return;
5613         }
5614         rate = bbr_get_hardware_rate(bbr);
5615         nrte = tcp_chg_pacing_rate(bbr->r_ctl.crte,
5616                                    bbr->rc_tp,
5617                                    bbr->rc_inp->inp_route.ro_nh->nh_ifp,
5618                                    rate,
5619                                    (RS_PACING_GEQ|RS_PACING_SUB_OK),
5620                                    &error);
5621         if (nrte == NULL) {
5622                 goto lost_rate;
5623         }
5624         if (nrte != bbr->r_ctl.crte) {
5625                 bbr->r_ctl.crte = nrte;
5626                 if (error == 0)  {
5627                         BBR_STAT_INC(bbr_hdwr_rl_mod_ok);
5628                         if (bbr->r_ctl.crte->rate < rate) {
5629                                 /* We have a problem */
5630                                 bbr_setup_less_of_rate(bbr, cts,
5631                                                        bbr->r_ctl.crte->rate, rate);
5632                         } else {
5633                                 /* We are good */
5634                                 bbr->gain_is_limited = 0;
5635                                 bbr->skip_gain = 0;
5636                         }
5637                 } else {
5638                         /* A failure should release the tag */
5639                         BBR_STAT_INC(bbr_hdwr_rl_mod_fail);
5640                         bbr->gain_is_limited = 0;
5641                         bbr->skip_gain = 0;
5642                         bbr->bbr_hdrw_pacing = 0;
5643                 }
5644                 bbr_type_log_hdwr_pacing(bbr,
5645                                          bbr->r_ctl.crte->ptbl->rs_ifp,
5646                                          rate,
5647                                          ((bbr->r_ctl.crte == NULL) ? 0 : bbr->r_ctl.crte->rate),
5648                                          __LINE__,
5649                                          cts,
5650                                          error);
5651         }
5652 }
5653
5654 static void
5655 bbr_adjust_for_hw_pacing(struct tcp_bbr *bbr, uint32_t cts)
5656 {
5657         /*
5658          * If we have hardware pacing support
5659          * we need to factor that in for our
5660          * TSO size.
5661          */
5662         const struct tcp_hwrate_limit_table *rlp;
5663         uint32_t cur_delay, seg_sz, maxseg, new_tso, delta, hdwr_delay;
5664
5665         if ((bbr->bbr_hdrw_pacing == 0) ||
5666             (IN_RECOVERY(bbr->rc_tp->t_flags)) ||
5667             (bbr->r_ctl.crte == NULL))
5668                 return;
5669         if (bbr->hw_pacing_set == 0) {
5670                 /* Not yet past the hdwr pacing count delay */
5671                 return;
5672         }
5673         if (bbr_hdwr_pace_adjust == 0) {
5674                 /* No adjustment */
5675                 return;
5676         }
5677         rlp = bbr->r_ctl.crte;
5678         if (bbr->rc_tp->t_maxseg > bbr->rc_last_options)
5679                 maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
5680         else
5681                 maxseg = BBR_MIN_SEG - bbr->rc_last_options;
5682         /*
5683          * So let's first get the
5684          * time we would take between
5685          * TSO-sized sends currently, without
5686          * hardware help.
5687          */
5688         cur_delay = bbr_get_pacing_delay(bbr, BBR_UNIT,
5689                         bbr->r_ctl.rc_pace_max_segs, cts, 1);
5690         hdwr_delay = bbr->r_ctl.rc_pace_max_segs / maxseg;
5691         hdwr_delay *= rlp->time_between;
5692         if (cur_delay > hdwr_delay)
5693                 delta = cur_delay - hdwr_delay;
5694         else
5695                 delta = 0;
5696         bbr_log_type_tsosize(bbr, cts, delta, cur_delay, hdwr_delay,
5697                              (bbr->r_ctl.rc_pace_max_segs / maxseg),
5698                              1);
5699         if (delta &&
5700             (delta < (max(rlp->time_between,
5701                           bbr->r_ctl.bbr_hptsi_segments_delay_tar)))) {
5702                 /*
5703                  * Now let's divide by the pacing
5704                  * time between each segment the
5705                  * hardware sends, rounding up, and
5706                  * derive a byte count from that. We multiply
5707                  * that by bbr_hdwr_pace_adjust to get
5708                  * more bang for our buck.
5709                  *
5710                  * The goal is to have the software pacer
5711                  * waiting no more than an additional
5712                  * pacing delay if we can (without the
5713                  * compensation i.e. x bbr_hdwr_pace_adjust).
5714                  */
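                /*
                 * Illustrative numbers only: with cur_delay = 120 usecs
                 * and a hardware time_between of 20 usecs, the first
                 * term below is (120 + 20) / 20 = 7 segments, before
                 * the bbr_hdwr_pace_adjust multiplier is applied.
                 */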
5715                 seg_sz = max(((cur_delay + rlp->time_between)/rlp->time_between),
5716                              (bbr->r_ctl.rc_pace_max_segs/maxseg));
5717                 seg_sz *= bbr_hdwr_pace_adjust;
5718                 if (bbr_hdwr_pace_floor &&
5719                     (seg_sz < bbr->r_ctl.crte->ptbl->rs_min_seg)) {
5720                         /* Currently hardware paces
5721                          * out rs_min_seg segments at a time.
5722                          * We need to make sure we always send at least
5723                          * a full burst of bbr_hdwr_pace_floor down.
5724                          */
5725                         seg_sz = bbr->r_ctl.crte->ptbl->rs_min_seg;
5726                 }
5727                 seg_sz *= maxseg;
5728         } else if (delta == 0) {
5729                 /*
5730                  * The highest pacing rate is
5731                  * above our b/w gained. This means
5732                  * we probably are going quite fast at
5733                  * the hardware's highest rate. Let's just multiply
5734                  * the calculated TSO size by the
5735                  * multiplier factor (it's probably
5736                  * 4 segments in the default config for
5737                  * mlx).
5738                  */
5739                 seg_sz = bbr->r_ctl.rc_pace_max_segs * bbr_hdwr_pace_adjust;
5740                 if (bbr_hdwr_pace_floor &&
5741                     (seg_sz < bbr->r_ctl.crte->ptbl->rs_min_seg)) {
5742                         /* Currently hardware paces
5743                          * out rs_min_seg segments at a time.
5744                          * We need to make sure we always send at least
5745                          * a full burst of bbr_hdwr_pace_floor down.
5746                          */
5747                         seg_sz = bbr->r_ctl.crte->ptbl->rs_min_seg;
5748                 }
5749         } else {
5750                 /*
5751                  * The pacing time difference is so
5752                  * big that the hardware will
5753                  * pace out more rapidly than we
5754                  * really want, and then we
5755                  * will have a long delay. Lets just keep
5756                  * the same TSO size so its as if
5757                  * we were not using hdwr pacing (we
5758                  * just gain a bit of spacing from the
5759                  * hardware if seg_sz > 1).
5760                  */
5761                 seg_sz = bbr->r_ctl.rc_pace_max_segs;
5762         }
5763         if (seg_sz > bbr->r_ctl.rc_pace_max_segs)
5764                 new_tso = seg_sz;
5765         else
5766                 new_tso = bbr->r_ctl.rc_pace_max_segs;
5767         if (new_tso >= (PACE_MAX_IP_BYTES-maxseg))
5768                 new_tso = PACE_MAX_IP_BYTES - maxseg;
5769
5770         if (new_tso != bbr->r_ctl.rc_pace_max_segs) {
5771                 bbr_log_type_tsosize(bbr, cts, new_tso, 0, bbr->r_ctl.rc_pace_max_segs, maxseg, 0);
5772                 bbr->r_ctl.rc_pace_max_segs = new_tso;
5773         }
5774 }
5775
5776 static void
5777 tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t cts)
5778 {
5779         uint64_t bw;
5780         uint32_t old_tso = 0, new_tso;
5781         uint32_t maxseg, bytes;
5782         uint32_t tls_seg=0;
5783         /*
5784          * Google/linux uses the following algorithm to determine
5785          * the TSO size based on the b/w of the link (from Neal Cardwell email 9/27/18):
5786          *
5787          *  bytes = bw_in_bytes_per_second / 1000
5788          *  bytes = min(bytes, 64k)
5789          *  tso_segs = bytes / MSS
5790          *  if (bw < 1.2Mbs)
5791          *      min_tso_segs = 1
5792          *  else
5793          *      min_tso_segs = 2
5794          * tso_segs = max(tso_segs, min_tso_segs)
5795          *
5796          * * Note: apply a device-specific limit (we apply this in the
5797          *   tcp_m_copym).
5798          * Note that before the initial measurement is made google bursts out
5799          * a full iwnd just like new-reno/cubic.
5800          *
5801          * We do not use this algorithm. Instead we
5802          * use a two phased approach:
5803          *
5804          *  if ( bw <= per-tcb-cross-over)
5805          *     goal_tso =  calculate how much with this bw we
5806          *                 can send in goal-time seconds.
5807          *     if (goal_tso > mss)
5808          *         seg = goal_tso / mss
5809          *         tso = seg * mss
5810          *     else
5811          *         tso = mss
5812          *     if (tso > per-tcb-max)
5813          *         tso = per-tcb-max
5814          *  else if ( bw > 512Mbps)
5815          *     tso = max-tso (64k/mss)
5816          *  else
5817          *     goal_tso = bw / per-tcb-divisor
5818          *     seg = (goal_tso + mss-1)/mss
5819          *     tso = seg * mss
5820          *
5821          * if (tso < per-tcb-floor)
5822          *    tso = per-tcb-floor
5823          * if (tso > per-tcb-utter_max)
5824          *    tso = per-tcb-utter_max
5825          *
5826          * Note the default per-tcb-divisor is 1000 (same as google).
5827          * The goal cross-over is 30Mbps, however. To recreate google's
5828          * algorithm you need to set:
5829          *
5830          * cross-over = 23,168,000 bps
5831          * goal-time = 18000
5832          * per-tcb-max = 2
5833          * per-tcb-divisor = 1000
5834          * per-tcb-floor = 1
5835          *
5836          * This will get you "google bbr" behavior with respect to tso size.
5837          *
5838          * Note we do not set any TSO size until we are past the initial
5839          * window. Before that we generally use either a single MSS
5840          * or the full IW size (so we burst an IW at a time).
5841          * Also note that Hardware-TLS is special and does alternate
5842          * things to minimize PCI Bus Bandwidth use.
5843          */
5844
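        /*
         * Worked example of the google algorithm above (illustrative
         * numbers only): bw = 12,000,000 bytes/sec and an MSS of 1448
         * give bytes = min(12000, 64k) = 12000 and tso_segs =
         * 12000 / 1448 = 8; since bw is above 1.2Mbs, min_tso_segs = 2,
         * so the TSO goal is max(8, 2) * 1448 = 11584 bytes.
         */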
5845         if (bbr->rc_tp->t_maxseg > bbr->rc_last_options) {
5846                 maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
5847         } else {
5848                 maxseg = BBR_MIN_SEG - bbr->rc_last_options;
5849         }
5850 #ifdef KERN_TLS
5851         if (bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
5852                 tls_seg =  ctf_get_opt_tls_size(bbr->rc_inp->inp_socket, bbr->rc_tp->snd_wnd);
5853                 bbr->r_ctl.rc_pace_min_segs = (tls_seg + bbr->rc_last_options);
5854         }
5855 #endif
5856         old_tso = bbr->r_ctl.rc_pace_max_segs;
5857         if (bbr->rc_past_init_win == 0) {
5858                 /*
5859                  * Not enough data has been acknowledged to make a
5860                  * judgement unless we are hardware TLS. Set up
5861                  * the initial TSO based on if we are sending a
5862                  * the initial TSO based on whether we are sending a
5863                  */
5864                 if (bbr->rc_use_google)
5865                         bbr->r_ctl.rc_pace_max_segs = ((bbr->rc_tp->t_maxseg - bbr->rc_last_options) * 2);
5866                 else if (bbr->bbr_init_win_cheat)
5867                         bbr->r_ctl.rc_pace_max_segs = bbr_initial_cwnd(bbr, bbr->rc_tp);
5868                 else
5869                         bbr->r_ctl.rc_pace_max_segs = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
5870                 if (bbr->r_ctl.rc_pace_min_segs != bbr->rc_tp->t_maxseg)
5871                         bbr->r_ctl.rc_pace_min_segs = bbr->rc_tp->t_maxseg;
5872 #ifdef KERN_TLS
5873                 if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) && tls_seg) {
5874                         /*
5875                          * For hardware TLS we set our min to the tls_seg size.
5876                          */
5877                         bbr->r_ctl.rc_pace_max_segs = tls_seg;
5878                         bbr->r_ctl.rc_pace_min_segs = tls_seg + bbr->rc_last_options;
5879                 }
5880 #endif
5881                 if (bbr->r_ctl.rc_pace_max_segs == 0) {
5882                         bbr->r_ctl.rc_pace_max_segs = maxseg;
5883                 }
5884                 bbr_log_type_tsosize(bbr, cts, bbr->r_ctl.rc_pace_max_segs, tls_seg, old_tso, maxseg, 0);
5885 #ifdef KERN_TLS
5886                 if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) == 0)
5887 #endif
5888                         bbr_adjust_for_hw_pacing(bbr, cts);
5889                 return;
5890         }
5891         /**
5892          * Now let's set the TSO goal based on our delivery rate in
5893          * bytes per second. Note we only do this if
5894          * we have acked at least the initial cwnd worth of data.
5895          */
5896         bw = bbr_get_bw(bbr);
5897         if (IN_RECOVERY(bbr->rc_tp->t_flags) &&
5898              (bbr->rc_use_google == 0)) {
5899                 /* We clamp to one MSS in recovery */
5900                 new_tso = maxseg;
5901         } else if (bbr->rc_use_google) {
5902                 int min_tso_segs;
5903
5904                 /* Google considers the gain too */
5905                 if (bbr->r_ctl.rc_bbr_hptsi_gain != BBR_UNIT) {
5906                         bw *= bbr->r_ctl.rc_bbr_hptsi_gain;
5907                         bw /= BBR_UNIT;
5908                 }
5909                 bytes = bw / 1024;
5910                 if (bytes > (64 * 1024))
5911                         bytes = 64 * 1024;
5912                 new_tso = bytes / maxseg;
5913                 if (bw < ONE_POINT_TWO_MEG)
5914                         min_tso_segs = 1;
5915                 else
5916                         min_tso_segs = 2;
5917                 if (new_tso < min_tso_segs)
5918                         new_tso = min_tso_segs;
5919                 new_tso *= maxseg;
5920         } else if (bbr->rc_no_pacing) {
5921                 new_tso = (PACE_MAX_IP_BYTES / maxseg) * maxseg;
5922         } else if (bw <= bbr->r_ctl.bbr_cross_over) {
5923                 /*
5924                  * Calculate the worst-case b/w TSO if we are inserting no
5925                  * more than a delay_target number of TSO's.
5926                  */
5927                 uint32_t tso_len, min_tso;
5928
5929                 tso_len = bbr_get_pacing_length(bbr, BBR_UNIT, bbr->r_ctl.bbr_hptsi_segments_delay_tar, bw);
5930                 if (tso_len > maxseg) {
5931                         new_tso = tso_len / maxseg;
5932                         if (new_tso > bbr->r_ctl.bbr_hptsi_segments_max)
5933                                 new_tso = bbr->r_ctl.bbr_hptsi_segments_max;
5934                         new_tso *= maxseg;
5935                 } else {
5936                         /*
5937                          * Less than a full-sized frame, yikes.. long rtt or
5938                          * low bw?
5939                          */
5940                         min_tso = bbr_minseg(bbr);
5941                         if ((tso_len > min_tso) && (bbr_all_get_min == 0))
5942                                 new_tso = rounddown(tso_len, min_tso);
5943                         else
5944                                 new_tso = min_tso;
5945                 }
5946         } else if (bw > FIVETWELVE_MBPS) {
5947                 /*
5948                  * This peer is so fast b/w-wise that we can build TSO
5949                  * segments as large as the NIC will allow.
5950                  */
5951                 new_tso = rounddown(PACE_MAX_IP_BYTES, maxseg);
5952         } else {
5953                 /*
5954                  * This formula is based on attempting to send a segment or
5955                  * more every bbr_hptsi_per_second. The default is 1000
5956                  * which means you are targeting what you can send every 1ms
5957                  * based on the peer's bw.
5958                  *
5959                  * If the number drops to say 500, then you are looking more
5960                  * at 2ms and you will raise how much we send in a single
5961                  * TSO, thus saving CPU (fewer bbr_output_wtime() calls). The
5962                  * trade off of course is you will send more at once and
5963                  * thus tend to clump up the sends into larger "bursts"
5964                  * building a queue.
5965                  */
5966                 bw /= bbr->r_ctl.bbr_hptsi_per_second;
5967                 new_tso = roundup(bw, (uint64_t)maxseg);
5968                 /*
5969                  * Gate the floor to match what our lower than 48Mbps
5970                  * algorithm does. The ceiling (bbr_hptsi_segments_max) thus
5971                  * becomes the floor for this calculation.
5972                  */
5973                 if (new_tso < (bbr->r_ctl.bbr_hptsi_segments_max * maxseg))
5974                         new_tso = (bbr->r_ctl.bbr_hptsi_segments_max * maxseg);
5975         }
5976         if (bbr->r_ctl.bbr_hptsi_segments_floor && (new_tso < (maxseg * bbr->r_ctl.bbr_hptsi_segments_floor)))
5977                 new_tso = maxseg * bbr->r_ctl.bbr_hptsi_segments_floor;
5978         if (new_tso > PACE_MAX_IP_BYTES)
5979                 new_tso = rounddown(PACE_MAX_IP_BYTES, maxseg);
5980         /* Enforce an utter maximum if we are not HW-TLS */
5981 #ifdef KERN_TLS
5982         if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) == 0)
5983 #endif
5984                 if (bbr->r_ctl.bbr_utter_max && (new_tso > (bbr->r_ctl.bbr_utter_max * maxseg))) {
5985                         new_tso = bbr->r_ctl.bbr_utter_max * maxseg;
5986                 }
5987 #ifdef KERN_TLS
5988         if (tls_seg) {
5989                 /*
5990                  * Let's move the output size
5991                  * up to 1 or more TLS record sizes.
5992                  */
5993                 uint32_t temp;
5994
5995                 temp = roundup(new_tso, tls_seg);
5996                 new_tso = temp;
5997                 /* Back down if needed to under a full frame */
5998                 while (new_tso > PACE_MAX_IP_BYTES)
5999                         new_tso -= tls_seg;
6000         }
6001 #endif
6002         if (old_tso != new_tso) {
6003                 /* Only log changes */
6004                 bbr_log_type_tsosize(bbr, cts, new_tso, tls_seg, old_tso, maxseg, 0);
6005                 bbr->r_ctl.rc_pace_max_segs = new_tso;
6006         }
6007 #ifdef KERN_TLS
6008         if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) &&
6009              tls_seg) {
6010                 bbr->r_ctl.rc_pace_min_segs = tls_seg + bbr->rc_last_options;
6011         } else
6012 #endif
6013                 /* We have hardware pacing and not hardware TLS! */
6014                 bbr_adjust_for_hw_pacing(bbr, cts);
6015 }
6016
6017 static void
6018 bbr_log_output(struct tcp_bbr *bbr, struct tcpcb *tp, struct tcpopt *to, int32_t len,
6019     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t cts,
6020     struct mbuf *mb, int32_t * abandon, struct bbr_sendmap *hintrsm, uint32_t delay_calc,
6021     struct sockbuf *sb)
6022 {
6023
6024         struct bbr_sendmap *rsm, *nrsm;
6025         register uint32_t snd_max, snd_una;
6026         uint32_t pacing_time;
6027         /*
6028          * Add to the RACK log of packets in flight or retransmitted. If
6029          * there is a TS option we will use the TS echoed, if not we will
6030          * grab a TS.
6031          *
6032          * Retransmissions will increment the count and move the ts to its
6033          * proper place. Note that if options do not include TS's then we
6034          * won't be able to effectively use the ACK for an RTT on a retran.
6035          *
6036          * Notes about r_start and r_end. Let's consider a send starting at
6037          * sequence 1 for 10 bytes. In such an example the r_start would be
6038          * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
6039          * This means that r_end is actually the first sequence for the next
6040          * slot (11).
6041          *
6042          */
6043         INP_WLOCK_ASSERT(tp->t_inpcb);
6044         if (err) {
6045                 /*
6046                  * We don't log errors -- we could but snd_max does not
6047                  * advance in this case either.
6048                  */
6049                 return;
6050         }
6051         if (th_flags & TH_RST) {
6052                 /*
6053                  * We don't log resets, and we return immediately from
6054                  * sending.
6055                  */
6056                 *abandon = 1;
6057                 return;
6058         }
6059         snd_una = tp->snd_una;
6060         if (th_flags & (TH_SYN | TH_FIN) && (hintrsm == NULL)) {
6061                 /*
6062                  * The call to bbr_log_output is made before bumping
6063                  * snd_max. This means we can record one extra byte on a SYN
6064                  * or FIN if seq_out is adding more on and a FIN is present
6065                  * (and we are not resending).
6066                  */
6067                 if (th_flags & TH_SYN)
6068                         len++;
6069                 if (th_flags & TH_FIN)
6070                         len++;
6071         }
6072         if (SEQ_LEQ((seq_out + len), snd_una)) {
6073                 /* Are we sending an old segment to induce an ack (keep-alive)? */
6074                 return;
6075         }
6076         if (SEQ_LT(seq_out, snd_una)) {
6077                 /* huh? should we panic? */
6078                 uint32_t end;
6079
6080                 end = seq_out + len;
6081                 seq_out = snd_una;
6082                 len = end - seq_out;
6083         }
6084         snd_max = tp->snd_max;
6085         if (len == 0) {
6086                 /* We don't log zero window probes */
6087                 return;
6088         }
6089         pacing_time = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, len, cts, 1);
6090         /* First question is it a retransmission? */
6091         if (seq_out == snd_max) {
6092 again:
6093                 rsm = bbr_alloc(bbr);
6094                 if (rsm == NULL) {
6095                         return;
6096                 }
6097                 rsm->r_flags = 0;
6098                 if (th_flags & TH_SYN)
6099                         rsm->r_flags |= BBR_HAS_SYN;
6100                 if (th_flags & TH_FIN)
6101                         rsm->r_flags |= BBR_HAS_FIN;
6102                 rsm->r_tim_lastsent[0] = cts;
6103                 rsm->r_rtr_cnt = 1;
6104                 rsm->r_rtr_bytes = 0;
6105                 rsm->r_start = seq_out;
6106                 rsm->r_end = rsm->r_start + len;
6107                 rsm->r_dupack = 0;
6108                 rsm->r_delivered = bbr->r_ctl.rc_delivered;
6109                 rsm->r_pacing_delay = pacing_time;
6110                 rsm->r_ts_valid = bbr->rc_ts_valid;
6111                 if (bbr->rc_ts_valid)
6112                         rsm->r_del_ack_ts = bbr->r_ctl.last_inbound_ts;
6113                 rsm->r_del_time = bbr->r_ctl.rc_del_time;
6114                 if (bbr->r_ctl.r_app_limited_until)
6115                         rsm->r_app_limited = 1;
6116                 else
6117                         rsm->r_app_limited = 0;
6118                 rsm->r_first_sent_time = bbr_get_earliest_send_outstanding(bbr, rsm, cts);
6119                 rsm->r_flight_at_send = ctf_flight_size(bbr->rc_tp,
6120                                                 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
6121                 /*
6122                  * Here we must also add in this rsm since snd_max
6123                  * is updated after we return from a new send.
6124                  */
6125                 rsm->r_flight_at_send += len;
6126                 TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_map, rsm, r_next);
6127                 TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
6128                 rsm->r_in_tmap = 1;
6129                 if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW)
6130                         rsm->r_bbr_state = bbr_state_val(bbr);
6131                 else
6132                         rsm->r_bbr_state = 8;
6133                 if (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT) {
6134                         rsm->r_is_gain = 1;
6135                         rsm->r_is_drain = 0;
6136                 } else if (bbr->r_ctl.rc_bbr_hptsi_gain < BBR_UNIT) {
6137                         rsm->r_is_drain = 1;
6138                         rsm->r_is_gain = 0;
6139                 } else {
6140                         rsm->r_is_drain = 0;
6141                         rsm->r_is_gain = 0;
6142                 }
6143                 return;
6144         }
6145         /*
6146          * If we reach here it's a retransmission and we need to find it.
6147          */
6148 more:
6149         if (hintrsm && (hintrsm->r_start == seq_out)) {
6150                 rsm = hintrsm;
6151                 hintrsm = NULL;
6152         } else if (bbr->r_ctl.rc_next) {
6153                 /* We have a hint from a previous run */
6154                 rsm = bbr->r_ctl.rc_next;
6155         } else {
6156                 /* No hints sorry */
6157                 rsm = NULL;
6158         }
6159         if ((rsm) && (rsm->r_start == seq_out)) {
6160                 /*
6161                  * We used rc_next or hintrsm to retransmit, hopefully the
6162                  * likely case.
6163                  */
6164                 seq_out = bbr_update_entry(tp, bbr, rsm, cts, &len, pacing_time);
6165                 if (len == 0) {
6166                         return;
6167                 } else {
6168                         goto more;
6169                 }
6170         }
6171         /* Ok, it was not the last pointer; go through the map the hard way. */
6172         TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
6173                 if (rsm->r_start == seq_out) {
6174                         seq_out = bbr_update_entry(tp, bbr, rsm, cts, &len, pacing_time);
6175                         bbr->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
6176                         if (len == 0) {
6177                                 return;
6178                         } else {
6179                                 continue;
6180                         }
6181                 }
6182                 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
6183                         /* Transmitted within this piece */
6184                         /*
6185                          * Ok we must split off the front and then let the
6186                          * update do the rest
6187                          */
6188                         nrsm = bbr_alloc_full_limit(bbr);
6189                         if (nrsm == NULL) {
6190                                 bbr_update_rsm(tp, bbr, rsm, cts, pacing_time);
6191                                 return;
6192                         }
6193                         /*
6194                          * copy rsm to nrsm and then trim the front of rsm
6195                          * to not include this part.
6196                          */
6197                         bbr_clone_rsm(bbr, nrsm, rsm, seq_out);
6198                         TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
6199                         if (rsm->r_in_tmap) {
6200                                 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
6201                                 nrsm->r_in_tmap = 1;
6202                         }
6203                         rsm->r_flags &= (~BBR_HAS_FIN);
6204                         seq_out = bbr_update_entry(tp, bbr, nrsm, cts, &len, pacing_time);
6205                         if (len == 0) {
6206                                 return;
6207                         }
6208                 }
6209         }
6210         /*
6211          * Hmm, not found in the map; did they retransmit both old data and
6212          * on into the new?
6213          */
6214         if (seq_out == tp->snd_max) {
6215                 goto again;
6216         } else if (SEQ_LT(seq_out, tp->snd_max)) {
6217 #ifdef BBR_INVARIANTS
6218                 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
6219                     seq_out, len, tp->snd_una, tp->snd_max);
6220                 printf("Starting Dump of all rack entries\n");
6221                 TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
6222                         printf("rsm:%p start:%u end:%u\n",
6223                             rsm, rsm->r_start, rsm->r_end);
6224                 }
6225                 printf("Dump complete\n");
6226                 panic("seq_out not found rack:%p tp:%p",
6227                     bbr, tp);
6228 #endif
6229         } else {
6230 #ifdef BBR_INVARIANTS
6231                 /*
6232                  * Hmm beyond sndmax? (only if we are using the new rtt-pack
6233                  * flag)
6234                  */
6235                 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
6236                     seq_out, len, tp->snd_max, tp);
6237 #endif
6238         }
6239 }
6240
6241 static void
6242 bbr_collapse_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, int32_t rtt)
6243 {
6244         /*
6245          * Collapse the timeout backoff; the cum-ack moved.
6246          */
6247         tp->t_rxtshift = 0;
6248         tp->t_softerror = 0;
6249 }
6250
6251
6252 static void
6253 tcp_bbr_xmit_timer(struct tcp_bbr *bbr, uint32_t rtt_usecs, uint32_t rsm_send_time, uint32_t r_start, uint32_t tsin)
6254 {
6255         bbr->rtt_valid = 1;
6256         bbr->r_ctl.cur_rtt = rtt_usecs;
6257         bbr->r_ctl.ts_in = tsin;
6258         if (rsm_send_time)
6259                 bbr->r_ctl.cur_rtt_send_time = rsm_send_time;
6260 }
6261
6262 static void
6263 bbr_make_timestamp_determination(struct tcp_bbr *bbr)
6264 {
6265         /**
6266          * We have in our bbr control:
6267          * 1) The timestamp we started observing cum-acks (bbr->r_ctl.bbr_ts_check_tstmp).
6268          * 2) Our timestamp indicating when we sent that packet (bbr->r_ctl.bbr_ts_check_our_cts).
6269          * 3) The current timestamp that just came in (bbr->r_ctl.last_inbound_ts)
6270          * 4) The time that the packet that generated that ack was sent (bbr->r_ctl.cur_rtt_send_time)
6271          *
6272          * Now we can calculate the time between the sends by doing:
6273          *
6274          * delta = bbr->r_ctl.cur_rtt_send_time - bbr->r_ctl.bbr_ts_check_our_cts
6275          *
6276          * And the peer's time between receiving them by doing:
6277          *
6278          * peer_delta = bbr->r_ctl.last_inbound_ts - bbr->r_ctl.bbr_ts_check_tstmp
6279          *
6280          * We want to figure out if the timestamp values are in msec, 10msec or usec.
6281          * We also may find that we can't use the timestamps if, say, we see
6282          * that the peer_delta indicates that though we may have taken 10ms to
6283          * pace out the data, it only saw 1ms between the two packets. This would
6284          * indicate that somewhere on the path is a batching entity that is giving
6285          * out time-slices of the actual b/w. This would mean we could not
6286          * reliably use the peer's timestamps.
6287          *
6288          * We expect delta > peer_delta initially, until we figure out the
6289          * timestamp ratio, which we will store in bbr->r_ctl.bbr_peer_tsratio.
6290          * If we place 1000 there then it's ms vs our usec. If we place 10000 there
6291          * then it's 10ms vs our usec. If the peer is running a usec clock we would
6292          * put a 1 there. If the peer's clock is faster than ours, we will disable the
6293          * use of timestamps (though we could revisit this later if we find it to be
6294          * not just an isolated one or two flows).
6295          *
6296          * To detect the batching middle boxes we will come up with our compensation and
6297          * if, with it in place, we find the peer is drastically off (by some margin) in
6298          * the smaller direction, then we will assume the worst case and disable use of timestamps.
6299          *
6300          */
6301         uint64_t delta, peer_delta, delta_up;
6302
6303         delta = bbr->r_ctl.cur_rtt_send_time - bbr->r_ctl.bbr_ts_check_our_cts;
6304         if (delta < bbr_min_usec_delta) {
6305                 /*
6306                  * Have not seen a min amount of time
6307                  * between our send times, so we cannot
6308                  * make a determination of the timestamp
6309                  * granularity yet.
6310                  */
6311                 return;
6312         }
6313         peer_delta = bbr->r_ctl.last_inbound_ts - bbr->r_ctl.bbr_ts_check_tstmp;
6314         if (peer_delta < bbr_min_peer_delta) {
6315                 /*
6316                  * We may have enough in the form of
6317                  * our delta but the peer's number
6318                  * has not changed that much. It could
6319                  * be that its clock ratio is such that
6320                  * we need more data (10ms tick) or
6321                  * there may be other compression scenarios
6322                  * going on. In any event we need the
6323                  * spread to be larger.
6324                  */
6325                 return;
6326         }
6327         /* Ok let's first see which way our delta is going */
6328         if (peer_delta > delta) {
6329                 /* Very unlikely, the peer without
6330                  * compensation shows that it saw
6331                  * the two sends arrive further apart
6332                  * than we saw them in micro-seconds.
6333                  */
6334                 if (peer_delta < (delta + ((delta * (uint64_t)1000)/ (uint64_t)bbr_delta_percent))) {
6335                         /* Well it looks like the peer is using a micro-second clock. */
6336                         bbr->rc_ts_clock_set = 1;
6337                         bbr->r_ctl.bbr_peer_tsratio = 1;
6338                 } else {
6339                         bbr->rc_ts_cant_be_used = 1;
6340                         bbr->rc_ts_clock_set = 1;
6341                 }
6342                 return;
6343         }
6344         /* Ok we know that the peer_delta is smaller than our send distance */
6345         bbr->rc_ts_clock_set = 1;
6346         /* First question: is it within the percentage that they are using usec time? */
6347         delta_up = (peer_delta * 1000) / (uint64_t)bbr_delta_percent;
6348         if ((peer_delta + delta_up) >= delta) {
6349                 /* Its a usec clock */
6350                 bbr->r_ctl.bbr_peer_tsratio = 1;
6351                 bbr_log_tstmp_validation(bbr, peer_delta, delta);
6352                 return;
6353         }
6354         /* Ok if not usec, what about 10usec (though unlikely)? */
6355         delta_up = (peer_delta * 1000 * 10) / (uint64_t)bbr_delta_percent;
6356         if (((peer_delta * 10) + delta_up) >= delta) {
6357                 bbr->r_ctl.bbr_peer_tsratio = 10;
6358                 bbr_log_tstmp_validation(bbr, peer_delta, delta);
6359                 return;
6360         }
6361         /* And what about 100usec (though again unlikely)? */
6362         delta_up = (peer_delta * 1000 * 100) / (uint64_t)bbr_delta_percent;
6363         if (((peer_delta * 100) + delta_up) >= delta) {
6364                 bbr->r_ctl.bbr_peer_tsratio = 100;
6365                 bbr_log_tstmp_validation(bbr, peer_delta, delta);
6366                 return;
6367         }
6368         /* And how about 1 msec (the most likely one)? */
6369         delta_up = (peer_delta * 1000 * 1000) / (uint64_t)bbr_delta_percent;
6370         if (((peer_delta * 1000) + delta_up) >= delta) {
6371                 bbr->r_ctl.bbr_peer_tsratio = 1000;
6372                 bbr_log_tstmp_validation(bbr, peer_delta, delta);
6373                 return;
6374         }
6375         /* Ok if not msec could it be 10 msec? */
6376         delta_up = (peer_delta * 1000 * 10000) / (uint64_t)bbr_delta_percent;
6377         if (((peer_delta * 10000) + delta_up) >= delta) {
6378                         bbr->r_ctl.bbr_peer_tsratio = 10000;
                        bbr_log_tstmp_validation(bbr, peer_delta, delta);
6379                         return;
6380         }
6381         /* If we fall down here the clock ticks so slowly we can't use it */
6382         bbr->rc_ts_cant_be_used = 1;
6383         bbr->r_ctl.bbr_peer_tsratio = 0;
6384         bbr_log_tstmp_validation(bbr, peer_delta, delta);
6385 }
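/*
 * Illustration with hypothetical numbers: suppose our two sends were
 * delta = 20000 usec apart and the peer's echoed timestamp advanced
 * peer_delta = 20 ticks, with bbr_delta_percent tuned so that each
 * delta_up tolerance works out to 5% of the scaled peer_delta. The
 * usec, 10usec and 100usec probes fail (21, 210 and 2100 all fall
 * short of 20000), but the msec probe gives 20 * 1000 + 1000 = 21000
 * >= 20000, so bbr_peer_tsratio is set to 1000: the peer's timestamp
 * clock is taken to tick in milliseconds against our microseconds.
 */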
6386
6387 /*
6388  * Collect new round-trip time estimate
6389  * and update averages and current timeout.
6390  */
6391 static void
6392 tcp_bbr_xmit_timer_commit(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts)
6393 {
6394         int32_t delta;
6395         uint32_t rtt, tsin;
6396         int32_t rtt_ticks;
6397
6398
6399         if (bbr->rtt_valid == 0)
6400                 /* No valid sample */
6401                 return;
6402
6403         rtt = bbr->r_ctl.cur_rtt;
6404         tsin = bbr->r_ctl.ts_in;
6405         if (bbr->rc_prtt_set_ts) {
6406                 /*
6407                  * We are to force feed the rttProp filter due
6408                  * to an entry into PROBE_RTT. This assures
6409                  * that the times are sync'd between when we
6410                  * go into PROBE_RTT and the filter expiration.
6411                  *
6412                  * Google does not use a true filter, so they do
6413                  * this implicitly since they only keep one value
6414                  * and when they enter probe-rtt they update the
6415                  * value to the newest rtt.
6416                  */
6417                 uint32_t rtt_prop;
6418
6419                 bbr->rc_prtt_set_ts = 0;
6420                 rtt_prop = get_filter_value_small(&bbr->r_ctl.rc_rttprop);
6421                 if (rtt > rtt_prop)
6422                         filter_increase_by_small(&bbr->r_ctl.rc_rttprop, (rtt - rtt_prop), cts);
6423                 else
6424                         apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
6425         }
6426         if (bbr->rc_ack_was_delayed)
6427                 rtt += bbr->r_ctl.rc_ack_hdwr_delay;
6428
6429         if (rtt < bbr->r_ctl.rc_lowest_rtt)
6430                 bbr->r_ctl.rc_lowest_rtt = rtt;
6431         bbr_log_rtt_sample(bbr, rtt, tsin);
6432         if (bbr->r_init_rtt) {
6433                 /*
6434                  * The initial rtt is not trusted; nuke it and let's get
6435                  * our first valid measurement in.
6436                  */
6437                 bbr->r_init_rtt = 0;
6438                 tp->t_srtt = 0;
6439         }
6440         if ((bbr->rc_ts_clock_set == 0) && bbr->rc_ts_valid) {
6441                 /*
6442                  * So we have not yet figured out
6443                  * what the peer's TSTMP value is
6444                  * in (most likely ms). We need a
6445                  * series of cum-ack's to determine
6446                  * this reliably.
6447                  */
6448                 if (bbr->rc_ack_is_cumack) {
6449                         if (bbr->rc_ts_data_set) {
6450                                 /* Lets attempt to determine the timestamp granularity. */
6451                                 bbr_make_timestamp_determination(bbr);
6452                         } else {
6453                                 bbr->rc_ts_data_set = 1;
6454                                 bbr->r_ctl.bbr_ts_check_tstmp = bbr->r_ctl.last_inbound_ts;
6455                                 bbr->r_ctl.bbr_ts_check_our_cts = bbr->r_ctl.cur_rtt_send_time;
6456                         }
6457                 } else {
6458                         /*
6459                  * We have to have consecutive acks;
6460                  * reset any "filled" state to none.
6461                          */
6462                         bbr->rc_ts_data_set = 0;
6463                 }
6464         }
6465         /* Round it up */
6466         rtt_ticks = USEC_2_TICKS((rtt + (USECS_IN_MSEC - 1)));
6467         if (rtt_ticks == 0)
6468                 rtt_ticks = 1;
6469         if (tp->t_srtt != 0) {
6470                 /*
6471                  * srtt is stored as fixed point with 5 bits after the
6472                  * binary point (i.e., scaled by 32).  The following magic is
6473                  * equivalent to the smoothing algorithm in rfc793 with an
6474                  * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point).
6475                  * Adjust rtt to origin 0.
6476                  */
6477
6478                 delta = ((rtt_ticks - 1) << TCP_DELTA_SHIFT)
6479                     - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
6480
6481                 tp->t_srtt += delta;
6482                 if (tp->t_srtt <= 0)
6483                         tp->t_srtt = 1;
6484
6485                 /*
6486                  * We accumulate a smoothed rtt variance (actually, a
6487                  * smoothed mean difference), then set the retransmit timer
6488                  * to smoothed rtt + 4 times the smoothed variance. rttvar
6489                  * is stored as fixed point with 4 bits after the binary
6490                  * point (scaled by 16).  The following is equivalent to
6491                  * rfc793 smoothing with an alpha of .75 (rttvar =
6492                  * rttvar*3/4 + |delta| / 4).  This replaces rfc793's
6493                  * wired-in beta.
6494                  */
6495                 if (delta < 0)
6496                         delta = -delta;
6497                 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
6498                 tp->t_rttvar += delta;
6499                 if (tp->t_rttvar <= 0)
6500                         tp->t_rttvar = 1;
6501                 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
6502                         tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
6503         } else {
6504                 /*
6505                  * No rtt measurement yet - use the unsmoothed rtt. Set the
6506                  * variance to half the rtt (so our first retransmit happens
6507                  * at 3*rtt).
6508                  */
6509                 tp->t_srtt = rtt_ticks << TCP_RTT_SHIFT;
6510                 tp->t_rttvar = rtt_ticks << (TCP_RTTVAR_SHIFT - 1);
6511                 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
6512         }
6513         KMOD_TCPSTAT_INC(tcps_rttupdated);
6514         tp->t_rttupdated++;
6515 #ifdef STATS
6516         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt_ticks));
6517 #endif
6518         /*
6519          * the retransmit should happen at rtt + 4 * rttvar. Because of the
6520          * way we do the smoothing, srtt and rttvar will each average +1/2
6521          * tick of bias.  When we compute the retransmit timer, we want 1/2
6522          * tick of rounding and 1 extra tick because of +-1/2 tick
6523          * uncertainty in the firing of the timer.  The bias will give us
6524          * exactly the 1.5 tick we need.  But, because the bias is
6525          * statistical, we have to test that we don't drop below the minimum
6526          * feasible timer (which is 2 ticks).
6527          */
6528         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
6529             max(MSEC_2_TICKS(bbr->r_ctl.rc_min_rto_ms), rtt_ticks + 2),
6530             MSEC_2_TICKS(((uint32_t)bbr->rc_max_rto_sec) * 1000));
6531
6532         /*
6533          * We received an ack for a packet that wasn't retransmitted; it is
6534          * probably safe to discard any error indications we've received
6535          * recently.  This isn't quite right, but close enough for now (a
6536          * route might have failed after we sent a segment, and the return
6537          * path might not be symmetrical).
6538          */
6539         tp->t_softerror = 0;
6540         rtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT);
6541         if (bbr->r_ctl.bbr_smallest_srtt_this_state > rtt)
6542                 bbr->r_ctl.bbr_smallest_srtt_this_state = rtt;
6543 }
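/*
 * Illustration of the fixed-point smoothing above, with made-up values:
 * with TCP_RTT_SHIFT == 5, an existing t_srtt of 100 ticks is stored as
 * 100 << 5 == 3200. A new sample of rtt_ticks == 120 yields
 * delta = (119 << TCP_DELTA_SHIFT) - (3200 >> 3) == 476 - 400 == 76,
 * so t_srtt becomes 3276, i.e. 3276 / 32 == 102.375 ticks, exactly
 * 100 * 7/8 + 119/8, the rfc793 EWMA with an alpha of .875.
 */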
6544
6545 static void
6546 bbr_earlier_retran(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm,
6547                    uint32_t t, uint32_t cts, int ack_type)
6548 {
6549         /*
6550          * For this RSM, we acknowledged the data from a previous
6551          * transmission, not the last one we made. This means we did a false
6552          * retransmit.
6553          */
6554         if (rsm->r_flags & BBR_HAS_FIN) {
6555                 /*
6556                  * The FIN is often sent multiple times when we
6557                  * have everything outstanding ack'd. We ignore this case
6558                  * since it's over now.
6559                  */
6560                 return;
6561         }
6562         if (rsm->r_flags & BBR_TLP) {
6563                 /*
6564                  * We expect TLP's to have this occur often
6565                  */
6566                 bbr->rc_tlp_rtx_out = 0;
6567                 return;
6568         }
6569         if (ack_type != BBR_CUM_ACKED) {
6570                 /*
6571                  * If it was not a cum-ack we
6572                  * don't really know for sure since
6573                  * the timestamp could be from some
6574                  * other transmission.
6575                  */
6576                 return;
6577         }
6578
6579         if (rsm->r_flags & BBR_WAS_SACKPASS) {
6580                 /*
6581                  * We retransmitted based on a sack and the earlier
6582                  * retransmission ack'd it - re-ordering is occurring.
6583                  */
6584                 BBR_STAT_INC(bbr_reorder_seen);
6585                 bbr->r_ctl.rc_reorder_ts = cts;
6586         }
6587         /* Back down the loss count */
6588         if (rsm->r_flags & BBR_MARKED_LOST) {
6589                 bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
6590                 bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
6591                 rsm->r_flags &= ~BBR_MARKED_LOST;
6592                 if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
6593                         /* LT sampling also needs adjustment */
6594                         bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
6595         }
6596         /***** RRS HERE ************************/
6597         /* Do we need to do this???            */
6598         /* bbr_reset_lt_bw_sampling(bbr, cts); */
6599         /***** RRS HERE ************************/
6600         BBR_STAT_INC(bbr_badfr);
6601         BBR_STAT_ADD(bbr_badfr_bytes, (rsm->r_end - rsm->r_start));
6602 }
6603
6604
6605 static void
6606 bbr_set_reduced_rtt(struct tcp_bbr *bbr, uint32_t cts, uint32_t line)
6607 {
6608         bbr->r_ctl.rc_rtt_shrinks = cts;
6609         if (bbr_can_force_probertt &&
6610             (TSTMP_GT(cts, bbr->r_ctl.last_in_probertt)) &&
6611             ((cts - bbr->r_ctl.last_in_probertt) > bbr->r_ctl.rc_probertt_int)) {
6612                 /*
6613                  * We should enter probe-rtt; it's been too long
6614                  * since we have been there.
6615                  */
6616                 bbr_enter_probe_rtt(bbr, cts, __LINE__);
6617         } else
6618                 bbr_check_probe_rtt_limits(bbr, cts);
6619 }
6620
6621 static void
6622 tcp_bbr_commit_bw(struct tcp_bbr *bbr, uint32_t cts)
6623 {
6624         uint64_t orig_bw;
6625
6626         if (bbr->r_ctl.rc_bbr_cur_del_rate == 0) {
6627                 /* We never apply a zero measurement */
6628                 bbr_log_type_bbrupd(bbr, 20, cts, 0, 0,
6629                                     0, 0, 0, 0, 0, 0);
6630                 return;
6631         }
6632         if (bbr->r_ctl.r_measurement_count < 0xffffffff)
6633                 bbr->r_ctl.r_measurement_count++;
6634         orig_bw = get_filter_value(&bbr->r_ctl.rc_delrate);
6635         apply_filter_max(&bbr->r_ctl.rc_delrate, bbr->r_ctl.rc_bbr_cur_del_rate, bbr->r_ctl.rc_pkt_epoch);
6636         bbr_log_type_bbrupd(bbr, 21, cts, (uint32_t)orig_bw,
6637                             (uint32_t)get_filter_value(&bbr->r_ctl.rc_delrate),
6638                             0, 0, 0, 0, 0, 0);
6639         if (orig_bw &&
6640             (orig_bw != get_filter_value(&bbr->r_ctl.rc_delrate))) {
6641                 if (bbr->bbr_hdrw_pacing) {
6642                         /*
6643                          * Possibly apply a new rate to the
6644                          * hardware.
6645                          */
6646                         bbr_update_hardware_pacing_rate(bbr, cts);
6647                 }
6648                 bbr_set_state_target(bbr, __LINE__);
6649                 tcp_bbr_tso_size_check(bbr, cts);
6650                 if (bbr->r_recovery_bw)  {
6651                         bbr_setup_red_bw(bbr, cts);
6652                         bbr_log_type_bw_reduce(bbr, BBR_RED_BW_USELRBW);
6653                 }
6654         } else if ((orig_bw == 0) && get_filter_value(&bbr->r_ctl.rc_delrate))
6655                 tcp_bbr_tso_size_check(bbr, cts);
6656 }
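/*
 * Conceptually, rc_delrate is a windowed max-filter keyed by packet
 * epoch: if, say, its current maximum is 1,200,000 B/s, a lower sample
 * of 900,000 B/s leaves the estimate untouched until the old maximum
 * ages out of the window, while a higher sample of 1,500,000 B/s
 * replaces it immediately. (Figures here are made up for illustration.)
 */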
6657
6658 static void
6659 bbr_nf_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts)
6660 {
6661         if (bbr->rc_in_persist == 0) {
6662                 /* We log only when not in persist */
6663                 /* Translate to a Bytes Per Second */
6664                 uint64_t tim, bw, ts_diff, ts_bw;
6665                 uint32_t upper, lower, delivered;
6666
6667                 if (TSTMP_GT(bbr->r_ctl.rc_del_time, rsm->r_del_time))
6668                         tim = (uint64_t)(bbr->r_ctl.rc_del_time - rsm->r_del_time);
6669                 else
6670                         tim = 1;
6671                 /*
6672                  * Now that we have processed the tim (skipping the sample
6673                  * or possibly updating the time), go ahead and
6674                  * calculate the cdr.
6675                  */
6676                 delivered = (bbr->r_ctl.rc_delivered - rsm->r_delivered);
6677                 bw = (uint64_t)delivered;
6678                 bw *= (uint64_t)USECS_IN_SECOND;
6679                 bw /= tim;
6680                 if (bw == 0) {
6681                         /* We must have a calculable amount */
6682                         return;
6683                 }
6684                 upper = (bw >> 32) & 0x00000000ffffffff;
6685                 lower = bw & 0x00000000ffffffff;
6686                 /*
6687                  * If we are using this b/w shove it in now so we
6688                  * can see in the trace viewer if it gets overridden.
6689                  */
6690                 if (rsm->r_ts_valid &&
6691                     bbr->rc_ts_valid &&
6692                     bbr->rc_ts_clock_set &&
6693                     (bbr->rc_ts_cant_be_used == 0) &&
6694                     bbr->rc_use_ts_limit) {
6695                         ts_diff = max((bbr->r_ctl.last_inbound_ts - rsm->r_del_ack_ts), 1);
6696                         ts_diff *= bbr->r_ctl.bbr_peer_tsratio;
6697                         if ((delivered == 0) ||
6698                             (rtt < 1000)) {
6699                                 /* Can't use the ts */
6700                                 bbr_log_type_bbrupd(bbr, 61, cts,
6701                                                     ts_diff,
6702                                                     bbr->r_ctl.last_inbound_ts,
6703                                                     rsm->r_del_ack_ts, 0,
6704                                                     0, 0, 0, delivered);
6705                         } else {
6706                                 ts_bw = (uint64_t)delivered;
6707                                 ts_bw *= (uint64_t)USECS_IN_SECOND;
6708                                 ts_bw /= ts_diff;
6709                                 bbr_log_type_bbrupd(bbr, 62, cts,
6710                                                     (ts_bw >> 32),
6711                                                     (ts_bw & 0xffffffff), 0, 0,
6712                                                     0, 0, ts_diff, delivered);
6713                                 if ((bbr->ts_can_raise) &&
6714                                     (ts_bw > bw)) {
6715                                         bbr_log_type_bbrupd(bbr, 8, cts,
6716                                                             delivered,
6717                                                             ts_diff,
6718                                                             (bw >> 32),
6719                                                             (bw & 0x00000000ffffffff),
6720                                                             0, 0, 0, 0);
6721                                         bw = ts_bw;
6722                                 } else if (ts_bw && (ts_bw < bw)) {
6723                                         bbr_log_type_bbrupd(bbr, 7, cts,
6724                                                             delivered,
6725                                                             ts_diff,
6726                                                             (bw >> 32),
6727                                                             (bw & 0x00000000ffffffff),
6728                                                             0, 0, 0, 0);
6729                                         bw = ts_bw;
6730                                 }
6731                         }
6732                 }
6733                 if (rsm->r_first_sent_time &&
6734                     TSTMP_GT(rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)],rsm->r_first_sent_time)) {
6735                         uint64_t sbw, sti;
6736                         /*
6737                          * We use what was in flight at the time of our
6738                  * send and the size of this send to figure
6739                  * out what we have been sending at (amount).
6740                  * For the time, we take from the time of
6741                  * the send of the first send outstanding
6742                  * until this send plus this send's pacing
6743                          * time. This gives us a good calculation
6744                          * as to the rate we have been sending at.
6745                          */
6746
6747                         sbw = (uint64_t)(rsm->r_flight_at_send);
6748                         sbw *= (uint64_t)USECS_IN_SECOND;
6749                         sti = rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)] - rsm->r_first_sent_time;
6750                         sti += rsm->r_pacing_delay;
6751                         sbw /= sti;
6752                         if (sbw < bw) {
6753                                 bbr_log_type_bbrupd(bbr, 6, cts,
6754                                                     delivered,
6755                                                     (uint32_t)sti,
6756                                                     (bw >> 32),
6757                                                     (uint32_t)bw,
6758                                                     rsm->r_first_sent_time, 0, (sbw >> 32),
6759                                                     (uint32_t)sbw);
6760                                 bw = sbw;
6761                         }
6762                 }
6763                 /* Use the google algorithm for b/w measurements */
6764                 bbr->r_ctl.rc_bbr_cur_del_rate = bw;
6765                 if ((rsm->r_app_limited == 0) ||
6766                     (bw > get_filter_value(&bbr->r_ctl.rc_delrate))) {
6767                         tcp_bbr_commit_bw(bbr, cts);
6768                         bbr_log_type_bbrupd(bbr, 10, cts, (uint32_t)tim, delivered,
6769                                             0, 0, 0, 0,  bbr->r_ctl.rc_del_time,  rsm->r_del_time);
6770                 }
6771         }
6772 }
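/*
 * The delivery-rate arithmetic above, with hypothetical numbers: if
 * delivered == 14600 bytes (ten 1460-byte segments) were cum-acked over
 * tim == 10000 usec, then bw = 14600 * USECS_IN_SECOND / 10000
 * == 1,460,000 bytes/sec, roughly 11.7 Mbit/s.
 */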
6773
6774 static void
6775 bbr_google_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts)
6776 {
6777         if (bbr->rc_in_persist == 0) {
6778                 /* We log only when not in persist */
6779                 /* Translate to a Bytes Per Second */
6780                 uint64_t tim, bw;
6781                 uint32_t upper, lower, delivered;
6782                 int no_apply = 0;
6783
6784                 if (TSTMP_GT(bbr->r_ctl.rc_del_time, rsm->r_del_time))
6785                         tim = (uint64_t)(bbr->r_ctl.rc_del_time - rsm->r_del_time);
6786                 else
6787                         tim = 1;
6788                 /*
6789                  * Now that we have processed the tim (skipping the sample
6790                  * or possibly updating the time), go ahead and
6791                  * calculate the cdr.
6792                  */
6793                 delivered = (bbr->r_ctl.rc_delivered - rsm->r_delivered);
6794                 bw = (uint64_t)delivered;
6795                 bw *= (uint64_t)USECS_IN_SECOND;
6796                 bw /= tim;
6797                 if (tim < bbr->r_ctl.rc_lowest_rtt) {
6798                         bbr_log_type_bbrupd(bbr, 99, cts, (uint32_t)tim, delivered,
6799                                             tim, bbr->r_ctl.rc_lowest_rtt, 0, 0, 0, 0);
6800
6801                         no_apply = 1;
6802                 }
6803                 upper = (bw >> 32) & 0x00000000ffffffff;
6804                 lower = bw & 0x00000000ffffffff;
6805                 /*
6806                  * If we are using this b/w shove it in now so we
6807                  * can see in the trace viewer if it gets overridden.
6808                  */
6809                 bbr->r_ctl.rc_bbr_cur_del_rate = bw;
6810                 /* Gate by the sending rate */
6811                 if (rsm->r_first_sent_time &&
6812                     TSTMP_GT(rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)],rsm->r_first_sent_time)) {
6813                         uint64_t sbw, sti;
6814                         /*
6815                          * We use what was in flight at the time of our
6816                  * send and the size of this send to figure
6817                  * out what we have been sending at (amount).
6818                  * For the time, we take from the time of
6819                  * the send of the first send outstanding
6820                  * until this send plus this send's pacing
6821                          * time. This gives us a good calculation
6822                          * as to the rate we have been sending at.
6823                          */
6824
6825                         sbw = (uint64_t)(rsm->r_flight_at_send);
6826                         sbw *= (uint64_t)USECS_IN_SECOND;
6827                         sti = rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)] - rsm->r_first_sent_time;
6828                         sti += rsm->r_pacing_delay;
6829                         sbw /= sti;
6830                         if (sbw < bw) {
6831                                 bbr_log_type_bbrupd(bbr, 6, cts,
6832                                                     delivered,
6833                                                     (uint32_t)sti,
6834                                                     (bw >> 32),
6835                                                     (uint32_t)bw,
6836                                                     rsm->r_first_sent_time, 0, (sbw >> 32),
6837                                                     (uint32_t)sbw);
6838                                 bw = sbw;
6839                         }
6840                         if ((sti > tim) &&
6841                             (sti < bbr->r_ctl.rc_lowest_rtt)) {
6842                                 bbr_log_type_bbrupd(bbr, 99, cts, (uint32_t)tim, delivered,
6843                                                     (uint32_t)sti, bbr->r_ctl.rc_lowest_rtt, 0, 0, 0, 0);
6844                                 no_apply = 1;
6845                         } else
6846                                 no_apply = 0;
6847                 }
6848                 bbr->r_ctl.rc_bbr_cur_del_rate = bw;
6849                 if ((no_apply == 0) &&
6850                     ((rsm->r_app_limited == 0) ||
6851                      (bw > get_filter_value(&bbr->r_ctl.rc_delrate)))) {
6852                         tcp_bbr_commit_bw(bbr, cts);
6853                         bbr_log_type_bbrupd(bbr, 10, cts, (uint32_t)tim, delivered,
6854                                             0, 0, 0, 0, bbr->r_ctl.rc_del_time,  rsm->r_del_time);
6855                 }
6856         }
6857 }
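/*
 * The send-rate gate (sbw) above, with made-up numbers: if
 * r_flight_at_send == 29200 bytes and the first outstanding send went
 * out 15000 usec before this one, with an r_pacing_delay of 5000 usec,
 * then sti == 20000 usec and sbw = 29200 * USECS_IN_SECOND / 20000
 * == 1,460,000 bytes/sec. A delivery-rate sample above that is clamped
 * down to sbw, since we cannot have delivered faster than we sent.
 */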
6858
6859
6860 static void
6861 bbr_update_bbr_info(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts, uint32_t tsin,
6862     uint32_t uts, int32_t match, uint32_t rsm_send_time, int32_t ack_type, struct tcpopt *to)
6863 {
6864         uint64_t old_rttprop;
6865
6866         /* Update our delivery time and amount */
6867         bbr->r_ctl.rc_delivered += (rsm->r_end - rsm->r_start);
6868         bbr->r_ctl.rc_del_time = cts;
6869         if (rtt == 0) {
6870                 /*
6871                  * 0 means it's a retransmit; for now we don't use these for
6872                  * the rest of BBR.
6873                  */
6874                 return;
6875         }
6876         if ((bbr->rc_use_google == 0) &&
6877             (match != BBR_RTT_BY_EXACTMATCH) &&
6878             (match != BBR_RTT_BY_TIMESTAMP)){
6879                 /*
6880                  * We get a lot of rtt updates, let's not pay attention to
6881                  * any that are not an exact match. That way we don't have
6882                  * to worry about timestamps and the whole nonsense of
6883                  * being unsure if it's a retransmission etc (if we ever had the
6884                  * timestamp fixed to always have the last thing sent this
6885                  * would not be an issue).
6886                  */
6887                 return;
6888         }
6889         if ((bbr_no_retran && bbr->rc_use_google) &&
6890             (match != BBR_RTT_BY_EXACTMATCH) &&
6891             (match != BBR_RTT_BY_TIMESTAMP)){
6892                 /*
6893                  * We only do measurements in google mode
6894                  * with bbr_no_retran on for sure things.
6895                  */
6896                 return;
6897         }
6898         /* Only update srtt if we know by exact match */
6899         tcp_bbr_xmit_timer(bbr, rtt, rsm_send_time, rsm->r_start, tsin);
6900         if (ack_type == BBR_CUM_ACKED)
6901                 bbr->rc_ack_is_cumack = 1;
6902         else
6903                 bbr->rc_ack_is_cumack = 0;
6904         old_rttprop = bbr_get_rtt(bbr, BBR_RTT_PROP);
6905         /*
6906          * Note the following code differs from the original
6907          * BBR spec. It calls for <= not <. However after a
6908          * long discussion in email with Neal, he acknowledged
6909          * that it should be < so that we will have flows
6910          * going into probe-rtt (we were seeing cases where that
6911          * did not happen and caused ugly things to occur). We
6912          * have added this agreed upon fix to our code base.
6913          */
6914         if (rtt < old_rttprop) {
6915                 /* Update when we last saw a rtt drop */
6916                 bbr_log_rtt_shrinks(bbr, cts, 0, rtt, __LINE__, BBR_RTTS_NEWRTT, 0);
6917                 bbr_set_reduced_rtt(bbr, cts, __LINE__);
6918         }
6919         bbr_log_type_bbrrttprop(bbr, rtt, (rsm ? rsm->r_end : 0), uts, cts,
6920             match, rsm->r_start, rsm->r_flags);
6921         apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
6922         if (old_rttprop != bbr_get_rtt(bbr, BBR_RTT_PROP)) {
6923                 /*
6924                  * The RTT-prop moved, reset the target (may be a
6925                  * nop for some states).
6926                  */
6927                 bbr_set_state_target(bbr, __LINE__);
6928                 if (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT)
6929                         bbr_log_rtt_shrinks(bbr, cts, 0, 0,
6930                                             __LINE__, BBR_RTTS_NEW_TARGET, 0);
6931                 else if (old_rttprop < bbr_get_rtt(bbr, BBR_RTT_PROP))
6932                         /* It went up */
6933                         bbr_check_probe_rtt_limits(bbr, cts);
6934         }
6935         if ((bbr->rc_use_google == 0) &&
6936             (match == BBR_RTT_BY_TIMESTAMP)) {
6937                 /*
6938                  * We don't do b/w update with
6939                  * these since they are not really
6940                  * reliable.
6941                  */
6942                 return;
6943         }
6944         if (bbr->r_ctl.r_app_limited_until &&
6945             (bbr->r_ctl.rc_delivered >= bbr->r_ctl.r_app_limited_until)) {
6946                 /* We are no longer app-limited */
6947                 bbr->r_ctl.r_app_limited_until = 0;
6948         }
6949         if (bbr->rc_use_google) {
6950                 bbr_google_measurement(bbr, rsm, rtt, cts);
6951         } else {
6952                 bbr_nf_measurement(bbr, rsm, rtt, cts);
6953         }
6954 }
6955
6956 /*
6957  * Convert a timestamp that the main stack
6958  * uses (milliseconds) into one that bbr uses
6959  * (microseconds). Return that converted timestamp.
6960  */
6961 static uint32_t
6962 bbr_ts_convert(uint32_t cts) {
6963         uint32_t sec, msec;
6964
6965         sec = cts / MS_IN_USEC;
6966         msec = cts - (MS_IN_USEC * sec);
6967         return ((sec * USECS_IN_SECOND) + (msec * MS_IN_USEC));
6968 }
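/*
 * For example (values invented): cts == 65537 ms converts to
 * sec == 65, msec == 537, giving 65 * USECS_IN_SECOND + 537 * MS_IN_USEC
 * == 65,537,000 usec. Note that MS_IN_USEC here acts as the number of
 * usecs in a millisecond (1000), as the return expression requires.
 */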
6969
6970 /*
6971  * Return 0 if we did not update the RTT time, return
6972  * 1 if we did.
6973  */
6974 static int
6975 bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr,
6976     struct bbr_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, uint32_t th_ack)
6977 {
6978         int32_t i;
6979         uint32_t t, uts = 0;
6980
6981         if ((rsm->r_flags & BBR_ACKED) ||
6982             (rsm->r_flags & BBR_WAS_RENEGED) ||
6983             (rsm->r_flags & BBR_RXT_CLEARED)) {
6984                 /* Already done */
6985                 return (0);
6986         }
6987         if (rsm->r_rtr_cnt == 1) {
6988                 /*
6989                  * Only one transmit. Hopefully the normal case.
6990                  */
6991                 if (TSTMP_GT(cts, rsm->r_tim_lastsent[0]))
6992                         t = cts - rsm->r_tim_lastsent[0];
6993                 else
6994                         t = 1;
6995                 if ((int)t <= 0)
6996                         t = 1;
6997                 bbr->r_ctl.rc_last_rtt = t;
6998                 bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, 0,
6999                                     BBR_RTT_BY_EXACTMATCH, rsm->r_tim_lastsent[0], ack_type, to);
7000                 return (1);
7001         }
7002         /* Convert to usecs */
7003         if ((bbr_can_use_ts_for_rtt == 1) &&
7004             (bbr->rc_use_google == 1) &&
7005             (ack_type == BBR_CUM_ACKED) &&
7006             (to->to_flags & TOF_TS) &&
7007             (to->to_tsecr != 0)) {
7008
7009                 t = tcp_tv_to_mssectick(&bbr->rc_tv) - to->to_tsecr;
7010                 if (t < 1)
7011                         t = 1;
7012                 t *= MS_IN_USEC;
7013                 bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, 0,
7014                                     BBR_RTT_BY_TIMESTAMP,
7015                                     rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)],
7016                                     ack_type, to);
7017                 return (1);
7018         }
7019         uts = bbr_ts_convert(to->to_tsecr);
7020         if ((to->to_flags & TOF_TS) &&
7021             (to->to_tsecr != 0) &&
7022             (ack_type == BBR_CUM_ACKED) &&
7023             ((rsm->r_flags & BBR_OVERMAX) == 0)) {
7024                 /*
7025                  * Now which timestamp does it match? In this block the ACK
7026                  * may be coming from a previous transmission.
7027                  */
7028                 uint32_t fudge;
7029
7030                 fudge = BBR_TIMER_FUDGE;
7031                 for (i = 0; i < rsm->r_rtr_cnt; i++) {
7032                         if ((SEQ_GEQ(uts, (rsm->r_tim_lastsent[i] - fudge))) &&
7033                             (SEQ_LEQ(uts, (rsm->r_tim_lastsent[i] + fudge)))) {
7034                                 if (TSTMP_GT(cts, rsm->r_tim_lastsent[i]))
7035                                         t = cts - rsm->r_tim_lastsent[i];
7036                                 else
7037                                         t = 1;
7038                                 if ((int)t <= 0)
7039                                         t = 1;
7040                                 bbr->r_ctl.rc_last_rtt = t;
7041                                 bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_TSMATCHING,
7042                                                     rsm->r_tim_lastsent[i], ack_type, to);
7043                                 if ((i + 1) < rsm->r_rtr_cnt) {
7044                                         /* Likely */
7045                                         bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type);
7046                                 } else if (rsm->r_flags & BBR_TLP) {
7047                                         bbr->rc_tlp_rtx_out = 0;
7048                                 }
7049                                 return (1);
7050                         }
7051                 }
7052                 /* Fall through if we can't find a matching timestamp */
7053         }
7054         /*
7055          * Ok it's a SACK block that we retransmitted, or a Windows
7056          * machine without timestamps. We can tell nothing from the
7057          * time-stamp since it's not there, or from the time the peer last
7058          * received a segment that moved forward its cum-ack point.
7059          *
7060          * Let's look at the last retransmit and see what we can tell
7061          * (with BBR, for space, we only keep 2; note we have to keep
7062          * at least 2 so the map cannot be condensed more).
7063          */
7064         i = rsm->r_rtr_cnt - 1;
7065         if (TSTMP_GT(cts, rsm->r_tim_lastsent[i]))
7066                 t = cts - rsm->r_tim_lastsent[i];
7067         else
7068                 goto not_sure;
7069         if (t < bbr->r_ctl.rc_lowest_rtt) {
7070                 /*
7071                  * We retransmitted and the ack came back in less
7072                  * than the smallest rtt we have observed in the
7073                  * windowed rtt. We most likely did an improper
7074                  * retransmit as outlined in 4.2 Step 3 point 2 in
7075                  * the rack-draft.
7076                  *
7077                  * Use the prior transmission to update all the
7078                  * information as long as there is only one prior
7079                  * transmission.
7080                  */
7081                 if ((rsm->r_flags & BBR_OVERMAX) == 0) {
7082 #ifdef BBR_INVARIANTS
7083                         if (rsm->r_rtr_cnt == 1)
7084                                 panic("rsm:%p bbr:%p rsm has overmax and only 1 retranmit flags:%x?", rsm, bbr, rsm->r_flags);
7085 #endif
7086                         i = rsm->r_rtr_cnt - 2;
7087                         if (TSTMP_GT(cts, rsm->r_tim_lastsent[i]))
7088                                 t = cts - rsm->r_tim_lastsent[i];
7089                         else
7090                                 t = 1;
7091                         bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_EARLIER_RET,
7092                                             rsm->r_tim_lastsent[i], ack_type, to);
7093                         bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type);
7094                 } else {
7095                         /*
7096                          * Too many prior transmissions, just
7097                          * update BBR delivered
7098                          */
7099 not_sure:
7100                         bbr_update_bbr_info(bbr, rsm, 0, cts, to->to_tsecr, uts,
7101                                             BBR_RTT_BY_SOME_RETRAN, 0, ack_type, to);
7102                 }
7103         } else {
7104                 /*
7105                  * We retransmitted it and the retransmit did the
7106                  * job.
7107                  */
7108                 if (rsm->r_flags & BBR_TLP)
7109                         bbr->rc_tlp_rtx_out = 0;
7110                 if ((rsm->r_flags & BBR_OVERMAX) == 0)
7111                         bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts,
7112                                             BBR_RTT_BY_THIS_RETRAN, 0, ack_type, to);
7113                 else
7114                         bbr_update_bbr_info(bbr, rsm, 0, cts, to->to_tsecr, uts,
7115                                             BBR_RTT_BY_SOME_RETRAN, 0, ack_type, to);
7116                 return (1);
7117         }
7118         return (0);
7119 }
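/*
 * A sketch of the timestamp-matching path above (hypothetical values):
 * an rsm sent twice with r_tim_lastsent == { 1000000, 1250000 } usec and
 * an echoed tsecr that converts to uts == 1000020 matches transmission 0
 * within the BBR_TIMER_FUDGE window (assuming the fudge is at least
 * 20 usec). The RTT is then measured from the first transmission, and
 * since a later retransmit exists, bbr_earlier_retran() is invoked to
 * account for the false retransmit.
 */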
7120
7121 /*
7122  * Mark the SACK_PASSED flag on all entries prior to rsm send wise.
7123  */
7124 static void
7125 bbr_log_sack_passed(struct tcpcb *tp,
7126     struct tcp_bbr *bbr, struct bbr_sendmap *rsm)
7127 {
7128         struct bbr_sendmap *nrsm;
7129
7130         nrsm = rsm;
7131         TAILQ_FOREACH_REVERSE_FROM(nrsm, &bbr->r_ctl.rc_tmap,
7132             bbr_head, r_tnext) {
7133                 if (nrsm == rsm) {
7134                         /* Skip the original segment; it is acked */
7135                         continue;
7136                 }
7137                 if (nrsm->r_flags & BBR_ACKED) {
7138                         /* Skip ack'd segments */
7139                         continue;
7140                 }
7141                 if (nrsm->r_flags & BBR_SACK_PASSED) {
7142                         /*
7143                          * We found one that is already marked
7144                          * passed, we have been here before and
7145                          * so all others below this are marked.
7146                          */
7147                         break;
7148                 }
7149                 BBR_STAT_INC(bbr_sack_passed);
7150                 nrsm->r_flags |= BBR_SACK_PASSED;
7151                 if (((nrsm->r_flags & BBR_MARKED_LOST) == 0) &&
7152                     bbr_is_lost(bbr, nrsm, bbr->r_ctl.rc_rcvtime)) {
7153                         bbr->r_ctl.rc_lost += nrsm->r_end - nrsm->r_start;
7154                         bbr->r_ctl.rc_lost_bytes += nrsm->r_end - nrsm->r_start;
7155                         nrsm->r_flags |= BBR_MARKED_LOST;
7156                 }
7157                 nrsm->r_flags &= ~BBR_WAS_SACKPASS;
7158         }
7159 }
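/*
 * In other words: rc_tmap is kept in send order, so walking backwards
 * from a newly sacked rsm marks every earlier-sent, still-unacked entry
 * BBR_SACK_PASSED; hitting one already marked means everything before
 * it was handled on a prior pass, so the walk can stop early.
 */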
7160
7161 /*
7162  * Returns the number of bytes that were
7163  * newly ack'd by sack blocks.
7164  */
7165 static uint32_t
7166 bbr_proc_sack_blk(struct tcpcb *tp, struct tcp_bbr *bbr, struct sackblk *sack,
7167     struct tcpopt *to, struct bbr_sendmap **prsm, uint32_t cts)
7168 {
7169         int32_t times = 0;
7170         uint32_t start, end, maxseg, changed = 0;
7171         struct bbr_sendmap *rsm, *nrsm;
7172         int32_t used_ref = 1;
7173         uint8_t went_back = 0, went_fwd = 0;
7174
7175         maxseg = tp->t_maxseg - bbr->rc_last_options;
7176         start = sack->start;
7177         end = sack->end;
7178         rsm = *prsm;
7179         if (rsm == NULL)
7180                 used_ref = 0;
7181
7182         /* Do we locate the block behind where we last were? */
7183         if (rsm && SEQ_LT(start, rsm->r_start)) {
7184                 went_back = 1;
7185                 TAILQ_FOREACH_REVERSE_FROM(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) {
7186                         if (SEQ_GEQ(start, rsm->r_start) &&
7187                             SEQ_LT(start, rsm->r_end)) {
7188                                 goto do_rest_ofb;
7189                         }
7190                 }
7191         }
7192 start_at_beginning:
7193         went_fwd = 1;
7194         /*
7195          * Ok let's locate the block where this guy is, fwd from rsm (if it's
7196          * set)
7197          */
7198         TAILQ_FOREACH_FROM(rsm, &bbr->r_ctl.rc_map, r_next) {
7199                 if (SEQ_GEQ(start, rsm->r_start) &&
7200                     SEQ_LT(start, rsm->r_end)) {
7201                         break;
7202                 }
7203         }
7204 do_rest_ofb:
7205         if (rsm == NULL) {
7206                 /*
7207                  * This happens when we get duplicate sack blocks with the
7208                  * same end. For example SACK 4: 100 SACK 3: 100. The sort
7209                  * will not change their location, so we would just start at
7210                  * the end of the first one and get lost.
7211                  */
7212                 if (tp->t_flags & TF_SENTFIN) {
7213                         /*
7214                          * Check to see if we have not logged the FIN that
7215                          * went out.
7216                          */
7217                         nrsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next);
7218                         if (nrsm && (nrsm->r_end + 1) == tp->snd_max) {
7219                                 /*
7220                                  * Ok we did not get the FIN logged.
7221                                  */
7222                                 nrsm->r_end++;
7223                                 rsm = nrsm;
7224                                 goto do_rest_ofb;
7225                         }
7226                 }
7227                 if (times == 1) {
7228 #ifdef BBR_INVARIANTS
7229                         panic("tp:%p bbr:%p sack:%p to:%p prsm:%p",
7230                             tp, bbr, sack, to, prsm);
7231 #else
7232                         goto out;
7233 #endif
7234                 }
7235                 times++;
7236                 BBR_STAT_INC(bbr_sack_proc_restart);
7237                 rsm = NULL;
7238                 goto start_at_beginning;
7239         }
7240         /* Ok we have an ACK for some piece of rsm */
7241         if (rsm->r_start != start) {
7242                 /*
7243                  * Need to split this in two pieces the before and after.
7244                  */
7245                 if (bbr_sack_mergable(rsm, start, end))
7246                         nrsm = bbr_alloc_full_limit(bbr);
7247                 else
7248                         nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT);
7249                 if (nrsm == NULL) {
7250                         /* We could not allocate; ignore the sack */
7251                         struct sackblk blk;
7252
7253                         blk.start = start;
7254                         blk.end = end;
7255                         sack_filter_reject(&bbr->r_ctl.bbr_sf, &blk);
7256                         goto out;
7257                 }
7258                 bbr_clone_rsm(bbr, nrsm, rsm, start);
7259                 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
7260                 if (rsm->r_in_tmap) {
7261                         TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
7262                         nrsm->r_in_tmap = 1;
7263                 }
7264                 rsm->r_flags &= (~BBR_HAS_FIN);
7265                 rsm = nrsm;
7266         }
7267         if (SEQ_GEQ(end, rsm->r_end)) {
7268                 /*
7269                  * The end of this block is either beyond this guy or right
7270                  * at this guy.
7271                  */
7272                 if ((rsm->r_flags & BBR_ACKED) == 0) {
7273                         bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_SACKED, 0);
7274                         changed += (rsm->r_end - rsm->r_start);
7275                         bbr->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
7276                         bbr_log_sack_passed(tp, bbr, rsm);
7277                         if (rsm->r_flags & BBR_MARKED_LOST) {
7278                                 bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
7279                         }
7280                         /* Is Reordering occurring? */
7281                         if (rsm->r_flags & BBR_SACK_PASSED) {
7282                                 BBR_STAT_INC(bbr_reorder_seen);
7283                                 bbr->r_ctl.rc_reorder_ts = cts;
7284                                 if (rsm->r_flags & BBR_MARKED_LOST) {
7285                                         bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
7286                                         if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
7287                                                 /* LT sampling also needs adjustment */
7288                                                 bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
7289                                 }
7290                         }
7291                         rsm->r_flags |= BBR_ACKED;
7292                         rsm->r_flags &= ~(BBR_TLP|BBR_WAS_RENEGED|BBR_RXT_CLEARED|BBR_MARKED_LOST);
7293                         if (rsm->r_in_tmap) {
7294                                 TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
7295                                 rsm->r_in_tmap = 0;
7296                         }
7297                 }
7298                 bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_SACKED);
7299                 if (end == rsm->r_end) {
7300                         /* This block only - done */
7301                         goto out;
7302                 }
7303                 /* There is more not covered by this rsm, move on */
7304                 start = rsm->r_end;
7305                 nrsm = TAILQ_NEXT(rsm, r_next);
7306                 rsm = nrsm;
7307                 times = 0;
7308                 goto do_rest_ofb;
7309         }
7310         if (rsm->r_flags & BBR_ACKED) {
7311                 /* Been here done that */
7312                 goto out;
7313         }
7314         /* Ok we need to split off this one at the tail */
7315         if (bbr_sack_mergable(rsm, start, end))
7316                 nrsm = bbr_alloc_full_limit(bbr);
7317         else
7318                 nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT);
7319         if (nrsm == NULL) {
7320                 /* failed XXXrrs what can we do but lose the sack info? */
7321                 struct sackblk blk;
7322
7323                 blk.start = start;
7324                 blk.end = end;
7325                 sack_filter_reject(&bbr->r_ctl.bbr_sf, &blk);
7326                 goto out;
7327         }
7328         /* Clone it */
7329         bbr_clone_rsm(bbr, nrsm, rsm, end);
7330         /* The sack block does not cover this guy fully */
7331         rsm->r_flags &= (~BBR_HAS_FIN);
7332         TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
7333         if (rsm->r_in_tmap) {
7334                 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
7335                 nrsm->r_in_tmap = 1;
7336         }
7337         nrsm->r_dupack = 0;
7338         bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_SACKED, 0);
7339         bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_SACKED);
7340         changed += (rsm->r_end - rsm->r_start);
7341         bbr->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
7342         bbr_log_sack_passed(tp, bbr, rsm);
7343         /* Is Reordering occurring? */
7344         if (rsm->r_flags & BBR_MARKED_LOST) {
7345                 bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
7346         }
7347         if (rsm->r_flags & BBR_SACK_PASSED) {
7348                 BBR_STAT_INC(bbr_reorder_seen);
7349                 bbr->r_ctl.rc_reorder_ts = cts;
7350                 if (rsm->r_flags & BBR_MARKED_LOST) {
7351                         bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
7352                         if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
7353                                 /* LT sampling also needs adjustment */
7354                                 bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
7355                 }
7356         }
7357         rsm->r_flags &= ~(BBR_TLP|BBR_WAS_RENEGED|BBR_RXT_CLEARED|BBR_MARKED_LOST);
7358         rsm->r_flags |= BBR_ACKED;
7359         if (rsm->r_in_tmap) {
7360                 TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
7361                 rsm->r_in_tmap = 0;
7362         }
7363 out:
7364         if (rsm && (rsm->r_flags & BBR_ACKED)) {
7365                 /*
7366                  * Now can we merge this newly acked
7367                  * block with either the previous or
7368                  * next block?
7369                  */
7370                 nrsm = TAILQ_NEXT(rsm, r_next);
7371                 if (nrsm &&
7372                     (nrsm->r_flags & BBR_ACKED)) {
7373                         /* yep this and next can be merged */
7374                         rsm = bbr_merge_rsm(bbr, rsm, nrsm);
7375                 }
7376                 /* Now what about the previous? */
7377                 nrsm = TAILQ_PREV(rsm, bbr_head, r_next);
7378                 if (nrsm &&
7379                     (nrsm->r_flags & BBR_ACKED)) {
7380                         /* yep the previous and this can be merged */
7381                         rsm = bbr_merge_rsm(bbr, nrsm, rsm);
7382                 }
7383         }
7384         if (used_ref == 0) {
7385                 BBR_STAT_INC(bbr_sack_proc_all);
7386         } else {
7387                 BBR_STAT_INC(bbr_sack_proc_short);
7388         }
7389         if (went_fwd && went_back) {
7390                 BBR_STAT_INC(bbr_sack_search_both);
7391         } else if (went_fwd) {
7392                 BBR_STAT_INC(bbr_sack_search_fwd);
7393         } else if (went_back) {
7394                 BBR_STAT_INC(bbr_sack_search_back);
7395         }
7396         /* Save off where the next seq is */
7397         if (rsm)
7398                 bbr->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next);
7399         else
7400                 bbr->r_ctl.rc_sacklast = NULL;
7401         *prsm = rsm;
7402         return (changed);
7403 }
7404
7405
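/*
 * The peer has reneged: data that it previously SACKed is no longer
 * covered. Walk forward from rsm clearing the BBR_ACKED state, putting
 * each block back, in order, on the transmit map so it is once again
 * eligible for retransmission, and finally clear the sack filter so
 * new SACKs covering this space are recognized.
 */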
7406 static void inline
7407 bbr_peer_reneges(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, tcp_seq th_ack)
7408 {
7409         struct bbr_sendmap *tmap;
7410
7411         BBR_STAT_INC(bbr_reneges_seen);
7412         tmap = NULL;
7413         while (rsm && (rsm->r_flags & BBR_ACKED)) {
7414                 /* It's no longer sacked, mark it so */
7415                 uint32_t oflags;
7416                 bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
7417 #ifdef BBR_INVARIANTS
7418                 if (rsm->r_in_tmap) {
7419                         panic("bbr:%p rsm:%p flags:0x%x in tmap?",
7420                             bbr, rsm, rsm->r_flags);
7421                 }
7422 #endif
7423                 oflags = rsm->r_flags;
7424                 if (rsm->r_flags & BBR_MARKED_LOST) {
7425                         bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
7426                         bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
7427                         if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
7428                                 /* LT sampling also needs adjustment */
7429                                 bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
7430                 }
7431                 rsm->r_flags &= ~(BBR_ACKED | BBR_SACK_PASSED | BBR_WAS_SACKPASS | BBR_MARKED_LOST);
7432                 rsm->r_flags |= BBR_WAS_RENEGED;
7433                 rsm->r_flags |= BBR_RXT_CLEARED;
7434                 bbr_log_type_rsmclear(bbr, bbr->r_ctl.rc_rcvtime, rsm, oflags, __LINE__);
7435                 /* Rebuild it into our tmap */
7436                 if (tmap == NULL) {
7437                         TAILQ_INSERT_HEAD(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
7438                         tmap = rsm;
7439                 } else {
7440                         TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, tmap, rsm, r_tnext);
7441                         tmap = rsm;
7442                 }
7443                 tmap->r_in_tmap = 1;
7444                 /*
7445                  * XXXrrs Delivered? Should we do anything here?
7446                  *
7447                  * Of course we don't on an rxt timeout so maybe it's ok that
7448                  * we don't?
7449                  *
7450                  * For now let's not.
7451                  */
7452                 rsm = TAILQ_NEXT(rsm, r_next);
7453         }
7454         /*
7455          * Now let's possibly clear the sack filter so we start recognizing
7456          * sacks that cover this area.
7457          */
7458         sack_filter_clear(&bbr->r_ctl.bbr_sf, th_ack);
7459 }
7460
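/*
 * Our SYN has been acked. A SYN occupies one unit of sequence space, so
 * either free the map entry that covered only the SYN, or (for a TFO
 * send where the entry also carries data) just strip the SYN flag and
 * advance r_start past it.
 */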
7461 static void
7462 bbr_log_syn(struct tcpcb *tp, struct tcpopt *to)
7463 {
7464         struct tcp_bbr *bbr;
7465         struct bbr_sendmap *rsm;
7466         uint32_t cts;
7467
7468         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
7469         cts = bbr->r_ctl.rc_rcvtime;
7470         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
7471         if (rsm && (rsm->r_flags & BBR_HAS_SYN)) {
7472                 if ((rsm->r_end - rsm->r_start) <= 1) {
7473                         /* Log out the SYN completely */
7474                         bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
7475                         rsm->r_rtr_bytes = 0;
7476                         TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next);
7477                         if (rsm->r_in_tmap) {
7478                                 TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
7479                                 rsm->r_in_tmap = 0;
7480                         }
7481                         if (bbr->r_ctl.rc_next == rsm) {
7482                                 /* scoot along the marker */
7483                                 bbr->r_ctl.rc_next = TAILQ_FIRST(&bbr->r_ctl.rc_map);
7484                         }
7485                         if (to != NULL)
7486                                 bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_CUM_ACKED, 0);
7487                         bbr_free(bbr, rsm);
7488                 } else {
7489                         /* There is more (Fast Open)? Strip out the SYN. */
7490                         rsm->r_flags &= ~BBR_HAS_SYN;
7491                         rsm->r_start++;
7492                 }
7493         }
7494 }
7495
7496 /*
7497  * Returns the number of bytes that were
7498  * acknowledged by SACK blocks.
7499  */
7500
7501 static uint32_t
7502 bbr_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th,
7503     uint32_t *prev_acked)
7504 {
7505         uint32_t changed, last_seq, entered_recovery = 0;
7506         struct tcp_bbr *bbr;
7507         struct bbr_sendmap *rsm;
7508         struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
7509         register uint32_t th_ack;
7510         int32_t i, j, k, new_sb, num_sack_blks = 0;
7511         uint32_t cts, acked, ack_point, sack_changed = 0;
7512         uint32_t p_maxseg, maxseg, p_acked = 0;
7513
7514         INP_WLOCK_ASSERT(tp->t_inpcb);
7515         if (th->th_flags & TH_RST) {
7516                 /* We don't log resets */
7517                 return (0);
7518         }
7519         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
7520         cts = bbr->r_ctl.rc_rcvtime;
7521
7522         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
7523         changed = 0;
7524         maxseg = tp->t_maxseg - bbr->rc_last_options;
7525         p_maxseg = min(bbr->r_ctl.rc_pace_max_segs, maxseg);
7526         th_ack = th->th_ack;
7527         if (SEQ_GT(th_ack, tp->snd_una)) {
7528                 acked = th_ack - tp->snd_una;
7529                 bbr_log_progress_event(bbr, tp, ticks, PROGRESS_UPDATE, __LINE__);
7530                 bbr->rc_tp->t_acktime = ticks;
7531         } else
7532                 acked = 0;
7533         if (SEQ_LEQ(th_ack, tp->snd_una)) {
7534                 /* Only sent here for sack processing */
7535                 goto proc_sack;
7536         }
7537         if (rsm && SEQ_GT(th_ack, rsm->r_start)) {
7538                 changed = th_ack - rsm->r_start;
7539         } else if ((rsm == NULL) && ((th_ack - 1) == tp->iss)) {
7540                 /*
7541                  * For the SYN incoming case we will not have called
7542                  * tcp_output for the sending of the SYN, so there will be
7543                  * no map. All other cases should probably be a panic.
7544                  */
7545                 if ((to->to_flags & TOF_TS) && (to->to_tsecr != 0)) {
7546                         /*
7547                          * We have a timestamp that can be used to generate
7548                          * an initial RTT.
7549                          */
7550                         uint32_t ts, now, rtt;
7551
7552                         ts = bbr_ts_convert(to->to_tsecr);
7553                         now = bbr_ts_convert(tcp_tv_to_mssectick(&bbr->rc_tv));
7554                         rtt = now - ts;
7555                         if (rtt < 1)
7556                                 rtt = 1;
7557                         bbr_log_type_bbrrttprop(bbr, rtt,
7558                                                 tp->iss, 0, cts,
7559                                                 BBR_RTT_BY_TIMESTAMP, tp->iss, 0);
7560                         apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
7561                         changed = 1;
7562                         bbr->r_wanted_output = 1;
7563                         goto out;
7564                 }
7565                 goto proc_sack;
7566         } else if (rsm == NULL) {
7567                 goto out;
7568         }
7569         if (changed) {
7570                 /*
7571                  * The ACK point is advancing to th_ack, we must drop off
7572                  * the packets in the rack log and calculate any eligible
7573                  * RTT's.
7574                  */
7575                 bbr->r_wanted_output = 1;
7576 more:
7577                 if (rsm == NULL) {
7578
7579                         if (tp->t_flags & TF_SENTFIN) {
7580                                 /* if we sent a FIN we will not have a map */
7581                                 goto proc_sack;
7582                         }
7583 #ifdef BBR_INVARIANTS
7584                         panic("No rack map tp:%p for th:%p state:%d bbr:%p snd_una:%u snd_max:%u chg:%d\n",
7585                             tp,
7586                             th, tp->t_state, bbr,
7587                             tp->snd_una, tp->snd_max, changed);
7588 #endif
7589                         goto proc_sack;
7590                 }
7591         }
7592         if (SEQ_LT(th_ack, rsm->r_start)) {
7593                 /* Huh, the map is missing this */
7594 #ifdef BBR_INVARIANTS
7595                 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d bbr:%p\n",
7596                     rsm->r_start,
7597                     th_ack, tp->t_state,
7598                     bbr->r_state, bbr);
7599                 panic("th-ack is bad bbr:%p tp:%p", bbr, tp);
7600 #endif
7601                 goto proc_sack;
7602         } else if (th_ack == rsm->r_start) {
7603                 /* None here to ack */
7604                 goto proc_sack;
7605         }
7606         /*
7607          * Clear the dup ack counter; it will
7608          * either be freed, or if there is some
7609          * remaining we need to start it at zero.
7610          */
7611         rsm->r_dupack = 0;
7612         /* Now do we consume the whole thing? */
7613         if (SEQ_GEQ(th_ack, rsm->r_end)) {
7614                 /* It's all consumed. */
7615                 uint32_t left;
7616
7617                 if (rsm->r_flags & BBR_ACKED) {
7618                         /*
7619                          * It was acked on the scoreboard -- remove it from
7620                          * total
7621                          */
7622                         p_acked += (rsm->r_end - rsm->r_start);
7623                         bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
7624                         if (bbr->r_ctl.rc_sacked == 0)
7625                                 bbr->r_ctl.rc_sacklast = NULL;
7626                 } else {
7627                         bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_CUM_ACKED, th_ack);
7628                         if (rsm->r_flags & BBR_MARKED_LOST) {
7629                                 bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
7630                         }
7631                         if (rsm->r_flags & BBR_SACK_PASSED) {
7632                                 /*
7633                                  * There are segments ACKED on the
7634                                  * scoreboard further up. We are seeing
7635                                  * reordering.
7636                                  */
7637                                 BBR_STAT_INC(bbr_reorder_seen);
7638                                 bbr->r_ctl.rc_reorder_ts = cts;
7639                                 if (rsm->r_flags & BBR_MARKED_LOST) {
7640                                         bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
7641                                         if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
7642                                                 /* LT sampling also needs adjustment */
7643                                                 bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
7644                                 }
7645                         }
7646                         rsm->r_flags &= ~BBR_MARKED_LOST;
7647                 }
7648                 bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
7649                 rsm->r_rtr_bytes = 0;
7650                 TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next);
7651                 if (rsm->r_in_tmap) {
7652                         TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
7653                         rsm->r_in_tmap = 0;
7654                 }
7655                 if (bbr->r_ctl.rc_next == rsm) {
7656                         /* scoot along the marker */
7657                         bbr->r_ctl.rc_next = TAILQ_FIRST(&bbr->r_ctl.rc_map);
7658                 }
7659                 bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_CUM_ACKED);
7660                 /* Adjust the packet counts */
7661                 left = th_ack - rsm->r_end;
7662                 /* Free back to zone */
7663                 bbr_free(bbr, rsm);
7664                 if (left) {
7665                         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
7666                         goto more;
7667                 }
7668                 goto proc_sack;
7669         }
7670         if (rsm->r_flags & BBR_ACKED) {
7671                 /*
7672                  * It was acked on the scoreboard -- remove it from total
7673                  * for the part being cum-acked.
7674                  */
7675                 p_acked += (rsm->r_end - rsm->r_start);
7676                 bbr->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
7677                 if (bbr->r_ctl.rc_sacked == 0)
7678                         bbr->r_ctl.rc_sacklast = NULL;
7679         } else {
7680                 /*
7681                  * It was acked up to th_ack point for the first time
7682                  */
7683                 struct bbr_sendmap lrsm;
7684
7685                 memcpy(&lrsm, rsm, sizeof(struct bbr_sendmap));
7686                 lrsm.r_end = th_ack;
7687                 bbr_update_rtt(tp, bbr, &lrsm, to, cts, BBR_CUM_ACKED, th_ack);
7688         }
7689         if ((rsm->r_flags & BBR_MARKED_LOST) &&
7690             ((rsm->r_flags & BBR_ACKED) == 0)) {
7691                 /*
7692                  * It was marked lost and partly ack'd now
7693                  * for the first time. We lower the rc_lost_bytes
7694                  * and still leave it MARKED.
7695                  */
7696                 bbr->r_ctl.rc_lost_bytes -= th_ack - rsm->r_start;
7697         }
7698         bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_CUM_ACKED);
7699         bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
7700         rsm->r_rtr_bytes = 0;
7701         /* adjust packet count */
7702         rsm->r_start = th_ack;
7703 proc_sack:
7704         /* Check for reneging */
7705         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
7706         if (rsm && (rsm->r_flags & BBR_ACKED) && (th_ack == rsm->r_start)) {
7707                 /*
7708                  * The peer has moved snd_una up to the edge of this send,
7709                  * i.e. one that it had previously acked. The only way that
7710                  * can be true is if the peer threw away data (space issues)
7711                  * that it had previously sacked (else it would have given
7712                  * us snd_una up to (rsm->r_end). We need to undo the acked
7713                  * markings here.
7714                  *
7715                  * Note we have to look to make sure th_ack is our
7716                  * rsm->r_start in case we get an old ack where th_ack is
7717                  * behind snd_una.
7718                  */
7719                 bbr_peer_reneges(bbr, rsm, th->th_ack);
7720         }
7721         if ((to->to_flags & TOF_SACK) == 0) {
7722                 /* We are done, nothing left to log */
7723                 goto out;
7724         }
7725         rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next);
7726         if (rsm) {
7727                 last_seq = rsm->r_end;
7728         } else {
7729                 last_seq = tp->snd_max;
7730         }
7731         /* Sack block processing */
7732         if (SEQ_GT(th_ack, tp->snd_una))
7733                 ack_point = th_ack;
7734         else
7735                 ack_point = tp->snd_una;
7736         for (i = 0; i < to->to_nsacks; i++) {
7737                 bcopy((to->to_sacks + i * TCPOLEN_SACK),
7738                     &sack, sizeof(sack));
7739                 sack.start = ntohl(sack.start);
7740                 sack.end = ntohl(sack.end);
7741                 if (SEQ_GT(sack.end, sack.start) &&
7742                     SEQ_GT(sack.start, ack_point) &&
7743                     SEQ_LT(sack.start, tp->snd_max) &&
7744                     SEQ_GT(sack.end, ack_point) &&
7745                     SEQ_LEQ(sack.end, tp->snd_max)) {
7746                         if ((bbr->r_ctl.rc_num_small_maps_alloced > bbr_sack_block_limit) &&
7747                             (SEQ_LT(sack.end, last_seq)) &&
7748                             ((sack.end - sack.start) < (p_maxseg / 8))) {
7749                                 /*
7750                                  * Not the last piece and it's smaller than
7751                                  * 1/8th of a p_maxseg. We ignore this.
7752                                  */
7753                                 BBR_STAT_INC(bbr_runt_sacks);
7754                                 continue;
7755                         }
7756                         sack_blocks[num_sack_blks] = sack;
7757                         num_sack_blks++;
7758 #ifdef NETFLIX_STATS
7759                 } else if (SEQ_LEQ(sack.start, th_ack) &&
7760                     SEQ_LEQ(sack.end, th_ack)) {
7761                         /*
7762                          * It's a D-SACK block.
7763                          */
7764                         tcp_record_dsack(sack.start, sack.end);
7765 #endif
7766                 }
7767         }
7768         if (num_sack_blks == 0)
7769                 goto out;
7770         /*
7771          * Sort the SACK blocks so we can update the rack scoreboard with
7772          * just one pass.
7773          */
7774         new_sb = sack_filter_blks(&bbr->r_ctl.bbr_sf, sack_blocks,
7775                                   num_sack_blks, th->th_ack);
7776         ctf_log_sack_filter(bbr->rc_tp, new_sb, sack_blocks);
7777         BBR_STAT_ADD(bbr_sack_blocks, num_sack_blks);
7778         BBR_STAT_ADD(bbr_sack_blocks_skip, (num_sack_blks - new_sb));
7779         num_sack_blks = new_sb;
7780         if (num_sack_blks < 2) {
7781                 goto do_sack_work;
7782         }
7783         /* Sort the sacks */
7784         for (i = 0; i < num_sack_blks; i++) {
7785                 for (j = i + 1; j < num_sack_blks; j++) {
7786                         if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
7787                                 sack = sack_blocks[i];
7788                                 sack_blocks[i] = sack_blocks[j];
7789                                 sack_blocks[j] = sack;
7790                         }
7791                 }
7792         }
7793         /*
7794          * Now are any of the sack block ends the same (yes some
7795  * implementations send these)?
7796          */
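        /*
         * Hypothetical example: with sorted blocks {10,20}, {5,20}, {30,40}
         * the first two share end == 20, so block 0 takes the smaller start
         * (5), block 1 is removed by shifting {30,40} down, and
         * num_sack_blks drops from 3 to 2.
         */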
7797 again:
7798         if (num_sack_blks > 1) {
7799                 for (i = 0; i < num_sack_blks; i++) {
7800                         for (j = i + 1; j < num_sack_blks; j++) {
7801                                 if (sack_blocks[i].end == sack_blocks[j].end) {
7802                                         /*
7803                                          * Ok, these two have the same end; we
7804                                          * keep the one that covers more (the
7805                                          * smaller start), collapse out the other,
7806                                          * and start again.
7807                                          */
7808                                         if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
7809                                                 /*
7810                                                  * The second block covers
7811                                                  * more area, use that
7812                                                  */
7813                                                 sack_blocks[i].start = sack_blocks[j].start;
7814                                         }
7815                                         /*
7816                                          * Now collapse out the dup-sack and
7817                                          * lower the count
7818                                          */
7819                                         for (k = (j + 1); k < num_sack_blks; k++) {
7820                                                 sack_blocks[j].start = sack_blocks[k].start;
7821                                                 sack_blocks[j].end = sack_blocks[k].end;
7822                                                 j++;
7823                                         }
7824                                         num_sack_blks--;
7825                                         goto again;
7826                                 }
7827                         }
7828                 }
7829         }
7830 do_sack_work:
7831         rsm = bbr->r_ctl.rc_sacklast;
7832         for (i = 0; i < num_sack_blks; i++) {
7833                 acked = bbr_proc_sack_blk(tp, bbr, &sack_blocks[i], to, &rsm, cts);
7834                 if (acked) {
7835                         bbr->r_wanted_output = 1;
7836                         changed += acked;
7837                         sack_changed += acked;
7838                 }
7839         }
7840 out:
7841         *prev_acked = p_acked;
7842         if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) {
7843                 /*
7844                  * Ok, we have a high probability that we need to go into
7845                  * recovery since we have data sack'd
7846                  */
7847                 struct bbr_sendmap *rsm;
7848
7849                 rsm = bbr_check_recovery_mode(tp, bbr, cts);
7850                 if (rsm) {
7851                         /* Enter recovery */
7852                         entered_recovery = 1;
7853                         bbr->r_wanted_output = 1;
7854                         /*
7855                          * When we enter recovery we need to assure we send
7856                          * one packet.
7857                          */
7858                         if (bbr->r_ctl.rc_resend == NULL) {
7859                                 bbr->r_ctl.rc_resend = rsm;
7860                         }
7861                 }
7862         }
7863         if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) {
7864                 /*
7865                  * See if we need to rack-retransmit anything; if so, set it
7866                  * up as the thing to resend, assuming something else is not
7867                  * already in that position.
7868                  */
7869                 if (bbr->r_ctl.rc_resend == NULL) {
7870                         bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts);
7871                 }
7872         }
7873         /*
7874  * We return the amount that changed via sack; this is used by the
7875          * ack-received code to augment what was changed between th_ack <->
7876          * snd_una.
7877          */
7878         return (sack_changed);
7879 }
7880
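/*
 * A duplicate ack arrived with no window change and no SACK progress;
 * count it against the first entry on the transmit map and, once the
 * count reaches DUP_ACK_THRESHOLD, request an output pass so a
 * retransmission can be sent.
 */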
7881 static void
7882 bbr_strike_dupack(struct tcp_bbr *bbr)
7883 {
7884         struct bbr_sendmap *rsm;
7885
7886         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
7887         if (rsm && (rsm->r_dupack < 0xff)) {
7888                 rsm->r_dupack++;
7889                 if (rsm->r_dupack >= DUP_ACK_THRESHOLD)
7890                         bbr->r_wanted_output = 1;
7891         }
7892 }
7893
7894 /*
7895  * A return value of 1 means we do not need to call bbr_process_data();
7896  * a return value of 0 means bbr_process_data() can be called.
7897  * For ret_val: if it's 0 the TCB is locked and valid, if it's non-zero
7898  * it's unlocked and probably unsafe to touch the TCB.
7899  */
7900 static int
7901 bbr_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
7902     struct tcpcb *tp, struct tcpopt *to,
7903     uint32_t tiwin, int32_t tlen,
7904     int32_t * ofia, int32_t thflags, int32_t * ret_val)
7905 {
7906         int32_t ourfinisacked = 0;
7907         int32_t acked_amount;
7908         uint16_t nsegs;
7909         int32_t acked;
7910         uint32_t lost, sack_changed = 0;
7911         struct mbuf *mfree;
7912         struct tcp_bbr *bbr;
7913         uint32_t prev_acked = 0;
7914
7915         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
7916         lost = bbr->r_ctl.rc_lost;
7917         nsegs = max(1, m->m_pkthdr.lro_nsegs);
7918         if (SEQ_GT(th->th_ack, tp->snd_max)) {
7919                 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
7920                 bbr->r_wanted_output = 1;
7921                 return (1);
7922         }
7923         if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
7924                 /* Process the ack */
7925                 if (bbr->rc_in_persist)
7926                         tp->t_rxtshift = 0;
7927                 if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd))
7928                         bbr_strike_dupack(bbr);
7929                 sack_changed = bbr_log_ack(tp, to, th, &prev_acked);
7930         }
7931         bbr_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime, (bbr->r_ctl.rc_lost > lost));
7932         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
7933                 /*
7934                  * Old ack, behind the last one rcv'd or a duplicate ack
7935                  * with SACK info.
7936                  */
7937                 if (th->th_ack == tp->snd_una) {
7938                         bbr_ack_received(tp, bbr, th, 0, sack_changed, prev_acked, __LINE__, 0);
7939                         if (bbr->r_state == TCPS_SYN_SENT) {
7940                                 /*
7941                                  * Special case where we sent a SYN. When
7942                                  * the SYN-ACK is processed in SYN_SENT
7943                                  * state it bumps snd_una. This causes
7944                                  * us to hit here even though we did ack 1
7945                                  * byte.
7946                                  *
7947                                  * Go through the nothing left case so we
7948                                  * send data.
7949                                  */
7950                                 goto nothing_left;
7951                         }
7952                 }
7953                 return (0);
7954         }
7955         /*
7956          * If we reach this point, ACK is not a duplicate, i.e., it ACKs
7957          * something we sent.
7958          */
7959         if (tp->t_flags & TF_NEEDSYN) {
7960                 /*
7961                  * T/TCP: Connection was half-synchronized, and our SYN has
7962                  * been ACK'd (so connection is now fully synchronized).  Go
7963                  * to non-starred state, increment snd_una for ACK of SYN,
7964                  * and check if we can do window scaling.
7965                  */
7966                 tp->t_flags &= ~TF_NEEDSYN;
7967                 tp->snd_una++;
7968                 /* Do window scaling? */
7969                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
7970                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
7971                         tp->rcv_scale = tp->request_r_scale;
7972                         /* Send window already scaled. */
7973                 }
7974         }
7975         INP_WLOCK_ASSERT(tp->t_inpcb);
7976
7977         acked = BYTES_THIS_ACK(tp, th);
7978         KMOD_TCPSTAT_ADD(tcps_rcvackpack, (int)nsegs);
7979         KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
7980
7981         /*
7982          * If we just performed our first retransmit, and the ACK arrives
7983          * within our recovery window, then it was a mistake to do the
7984          * retransmit in the first place.  Recover our original cwnd and
7985          * ssthresh, and proceed to transmit where we left off.
7986          */
7987         if (tp->t_flags & TF_PREVVALID) {
7988                 tp->t_flags &= ~TF_PREVVALID;
7989                 if (tp->t_rxtshift == 1 &&
7990                     (int)(ticks - tp->t_badrxtwin) < 0)
7991                         bbr_cong_signal(tp, th, CC_RTO_ERR, NULL);
7992         }
7993         SOCKBUF_LOCK(&so->so_snd);
7994         acked_amount = min(acked, (int)sbavail(&so->so_snd));
7995         tp->snd_wnd -= acked_amount;
7996         mfree = sbcut_locked(&so->so_snd, acked_amount);
7997         /* NB: sowwakeup_locked() does an implicit unlock. */
7998         sowwakeup_locked(so);
7999         m_freem(mfree);
8000         if (SEQ_GT(th->th_ack, tp->snd_una)) {
8001                 bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp));
8002         }
8003         tp->snd_una = th->th_ack;
8004         bbr_ack_received(tp, bbr, th, acked, sack_changed, prev_acked, __LINE__, (bbr->r_ctl.rc_lost - lost));
8005         if (IN_RECOVERY(tp->t_flags)) {
8006                 if (SEQ_LT(th->th_ack, tp->snd_recover) &&
8007                     (SEQ_LT(th->th_ack, tp->snd_max))) {
8008                         tcp_bbr_partialack(tp);
8009                 } else {
8010                         bbr_post_recovery(tp);
8011                 }
8012         }
8013         if (SEQ_GT(tp->snd_una, tp->snd_recover)) {
8014                 tp->snd_recover = tp->snd_una;
8015         }
8016         if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
8017                 tp->snd_nxt = tp->snd_max;
8018         }
8019         if (tp->snd_una == tp->snd_max) {
8020                 /* Nothing left outstanding */
8021 nothing_left:
8022                 bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__);
8023                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
8024                         bbr->rc_tp->t_acktime = 0;
8025                 if ((sbused(&so->so_snd) == 0) &&
8026                     (tp->t_flags & TF_SENTFIN)) {
8027                         ourfinisacked = 1;
8028                 }
8029                 bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
8030                 if (bbr->rc_in_persist == 0) {
8031                         bbr->r_ctl.rc_went_idle_time = bbr->r_ctl.rc_rcvtime;
8032                 }
8033                 sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una);
8034                 bbr_log_ack_clear(bbr, bbr->r_ctl.rc_rcvtime);
8035                 /*
8036                  * We invalidate the last ack here since we
8037                  * don't want to transfer forward the time
8038                  * for our sum's calculations.
8039                  */
8040                 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
8041                     (sbavail(&so->so_snd) == 0) &&
8042                     (tp->t_flags2 & TF2_DROP_AF_DATA)) {
8043                         /*
8044                          * The socket was gone and the peer sent data, time
8045                          * to reset the connection.
8046                          */
8047                         *ret_val = 1;
8048                         tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
8049                         /* tcp_close will kill the inp pre-log the Reset */
8050                         tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
8051                         tp = tcp_close(tp);
8052                         ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
8053                         BBR_STAT_INC(bbr_dropped_af_data);
8054                         return (1);
8055                 }
8056                 /* Set need output so persist might get set */
8057                 bbr->r_wanted_output = 1;
8058         }
8059         if (ofia)
8060                 *ofia = ourfinisacked;
8061         return (0);
8062 }
8063
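/*
 * Enter persist state: cancel any pending timer, note when we went
 * idle, and charge the time spent so far to the current BBR state (or,
 * in PROBE_BW, to the current substate) since time freezes while we
 * are persisting.
 */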
8064 static void
8065 bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line)
8066 {
8067         if (bbr->rc_in_persist == 0) {
8068                 bbr_timer_cancel(bbr, __LINE__, cts);
8069                 bbr->r_ctl.rc_last_delay_val = 0;
8070                 tp->t_rxtshift = 0;
8071                 bbr->rc_in_persist = 1;
8072                 bbr->r_ctl.rc_went_idle_time = cts;
8073                 /* We should be capped when rw went to 0 but just in case */
8074                 bbr_log_type_pesist(bbr, cts, 0, line, 1);
8075                 /* Time freezes for the state, so do the accounting now */
8076                 if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
8077                         uint32_t time_in;
8078
8079                         time_in = cts - bbr->r_ctl.rc_bbr_state_time;
8080                         if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) {
8081                                 int32_t idx;
8082
8083                                 idx = bbr_state_val(bbr);
8084                                 counter_u64_add(bbr_state_time[(idx + 5)], time_in);
8085                         } else {
8086                                 counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
8087                         }
8088                 }
8089                 bbr->r_ctl.rc_bbr_state_time = cts;
8090         }
8091 }
8092
8093 static void
8094 bbr_restart_after_idle(struct tcp_bbr *bbr, uint32_t cts, uint32_t idle_time)
8095 {
8096         /*
8097          * Note that if idle time does not exceed our
8098          * threshold, we do nothing, continuing the state
8099          * transitions we were last walking through.
8100          */
8101         if (idle_time >= bbr_idle_restart_threshold) {
8102                 if (bbr->rc_use_idle_restart) {
8103                         bbr->rc_bbr_state = BBR_STATE_IDLE_EXIT;
8104                         /*
8105                          * Set our target using BBR_UNIT, so
8106                          * we increase at a dramatic rate but
8107                          * we stop when we get the pipe
8108                          * full again for our current b/w estimate.
8109                          */
8110                         bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
8111                         bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT;
8112                         bbr_set_state_target(bbr, __LINE__);
8113                         /* Now setup our gains to ramp up */
8114                         bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg;
8115                         bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg;
8116                         bbr_log_type_statechange(bbr, cts, __LINE__);
8117                 } else {
8118                         bbr_substate_change(bbr, cts, __LINE__, 1);
8119                 }
8120         }
8121 }
8122
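/*
 * Leave persist state: pull ourselves off the hpts wheel, reset the
 * pacing delay accounting, treat a sufficiently long idle period as an
 * RTT probe, force a new time-based epoch, and restart the state clock.
 */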
8123 static void
8124 bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line)
8125 {
8126         uint32_t idle_time;
8127
8128         if (bbr->rc_in_persist == 0)
8129                 return;
8130         idle_time = bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time);
8131         bbr->rc_in_persist = 0;
8132         bbr->rc_hit_state_1 = 0;
8133         bbr->r_ctl.rc_del_time = cts;
8134         /*
8135          * We invalidate the last ack here since we
8136          * don't want to transfer forward the time
8137          * for our sum's calculations.
8138          */
8139         if (bbr->rc_inp->inp_in_hpts) {
8140                 tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT);
8141                 bbr->rc_timer_first = 0;
8142                 bbr->r_ctl.rc_hpts_flags = 0;
8143                 bbr->r_ctl.rc_last_delay_val = 0;
8144                 bbr->r_ctl.rc_hptsi_agg_delay = 0;
8145                 bbr->r_agg_early_set = 0;
8146                 bbr->r_ctl.rc_agg_early = 0;
8147         }
8148         bbr_log_type_pesist(bbr, cts, idle_time, line, 0);
8149         if (idle_time >= bbr_rtt_probe_time) {
8150                 /*
8151          * This qualifies as an RTT_PROBE session since we drop the
8152                  * data outstanding to nothing and waited more than
8153                  * bbr_rtt_probe_time.
8154                  */
8155                 bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_PERSIST, 0);
8156                 bbr->r_ctl.last_in_probertt = bbr->r_ctl.rc_rtt_shrinks = cts;
8157         }
8158         tp->t_rxtshift = 0;
8159         /*
8160          * If in probe-bw and we have persisted more than an RTT, let's do
8161          * special handling.
8162          */
8163         /* Force a time based epoch */
8164         bbr_set_epoch(bbr, cts, __LINE__);
8165         /*
8166          * Set up the lost baseline so we don't count anything against the
8167          * peer we have been stuck with during persists.
8168          */
8169         bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
8170         /* Time un-freezes for the state */
8171         bbr->r_ctl.rc_bbr_state_time = cts;
8172         if ((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) ||
8173             (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT)) {
8174                 /*
8175                  * If we are going back to probe-bw
8176                  * or probe_rtt, we may need to possibly
8177                  * do a fast restart.
8178                  */
8179                 bbr_restart_after_idle(bbr, cts, idle_time);
8180         }
8181 }
8182
8183 static void
8184 bbr_collapsed_window(struct tcp_bbr *bbr)
8185 {
8186         /*
8187          * Now we must walk the
8188          * send map and divide out the
8189          * ones left stranded. These
8190          * can't cause us to abort
8191          * the connection and are really
8192          * "unsent". However, a buggy
8193          * client might actually keep some
8194          * of the data, i.e. collapse the win,
8195          * refuse to ack, and then open
8196          * the win and ack that data. We
8197          * would then get into an ack war,
8198          * so the simpler method of just
8199          * pretending we did not send
8200          * those segments won't work.
8201          */
8202         struct bbr_sendmap *rsm, *nrsm;
8203         tcp_seq max_seq;
8204         uint32_t maxseg;
8205         int can_split = 0;
8206         int fnd = 0;
8207
8208         maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
8209         max_seq = bbr->rc_tp->snd_una + bbr->rc_tp->snd_wnd;
8210         bbr_log_type_rwnd_collapse(bbr, max_seq, 1, 0);
8211         TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
8212                 /* Find the first seq past or at maxseq */
8213                 if (rsm->r_flags & BBR_RWND_COLLAPSED)
8214                         rsm->r_flags &= ~BBR_RWND_COLLAPSED;
8215                 if (SEQ_GEQ(max_seq, rsm->r_start) &&
8216                     SEQ_GEQ(rsm->r_end, max_seq)) {
8217                         fnd = 1;
8218                         break;
8219                 }
8220         }
8221         bbr->rc_has_collapsed = 0;
8222         if (!fnd) {
8223                 /* Nothing to do, strange */
8224                 return;
8225         }
8226         /*
8227          * Now can we split?
8228          *
8229          * We don't want to split if splitting
8230          * would generate too many small segments
8231          * lest we let an attacker fragment our
8232          * send_map and leave us out of memory.
8233          */
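        /*
         * Illustration (hypothetical numbers): with maxseg = 1448 an
         * unconditional split requires both pieces to be at least 181
         * bytes (1448 / 8); smaller pieces are only allowed while the
         * small-map allocation count is under bbr_sack_block_limit.
         */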
8234         if ((max_seq != rsm->r_start) &&
8235             (max_seq != rsm->r_end)){
8236                 /* can we split? */
8237                 int res1, res2;
8238
8239                 res1 = max_seq - rsm->r_start;
8240                 res2 = rsm->r_end - max_seq;
8241                 if ((res1 >= (maxseg/8)) &&
8242                     (res2 >= (maxseg/8))) {
8243                         /* No small pieces here */
8244                         can_split = 1;
8245                 } else if (bbr->r_ctl.rc_num_small_maps_alloced < bbr_sack_block_limit) {
8246                         /* We are under the limit */
8247                         can_split = 1;
8248                 }
8249         }
8250         /* Ok do we need to split this rsm? */
8251         if (max_seq == rsm->r_start) {
8252                 /* It's this one, no split required */
8253                 nrsm = rsm;
8254         } else if (max_seq == rsm->r_end) {
8255                 /* It's the next one no split required. */
8256                 nrsm = TAILQ_NEXT(rsm, r_next);
8257                 if (nrsm == NULL) {
8258                         /* Huh? */
8259                         return;
8260                 }
8261         } else if (can_split && SEQ_LT(max_seq, rsm->r_end)) {
8262                 /* yep we need to split it */
8263                 nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT);
8264                 if (nrsm == NULL) {
8265                         /* failed XXXrrs what can we do but mark the whole thing? */
8266                         nrsm = rsm;
8267                         goto no_split;
8268                 }
8269                 /* Clone it */
8270                 bbr_log_type_rwnd_collapse(bbr, max_seq, 3, 0);
8271                 bbr_clone_rsm(bbr, nrsm, rsm, max_seq);
8272                 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
8273                 if (rsm->r_in_tmap) {
8274                         TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
8275                         nrsm->r_in_tmap = 1;
8276                 }
8277         } else {
8278                 /*
8279                  * Split not allowed, just start here and
8280                  * use this one.
8281                  */
8282                 nrsm = rsm;
8283         }
8284 no_split:
8285         BBR_STAT_INC(bbr_collapsed_win);
8286         /* reuse fnd as a count */
8287         fnd = 0;
8288         TAILQ_FOREACH_FROM(nrsm, &bbr->r_ctl.rc_map, r_next) {
8289                 nrsm->r_flags |= BBR_RWND_COLLAPSED;
8290                 fnd++;
8291                 bbr->rc_has_collapsed = 1;
8292         }
8293         bbr_log_type_rwnd_collapse(bbr, max_seq, 4, fnd);
8294 }
8295
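/*
 * The peer's window is no longer collapsed; walk backward from the tail
 * of the send map clearing BBR_RWND_COLLAPSED until the first entry
 * that was never marked, then note that no collapse remains.
 */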
8296 static void
8297 bbr_un_collapse_window(struct tcp_bbr *bbr)
8298 {
8299         struct bbr_sendmap *rsm;
8300         int cleared = 0;
8301
8302         TAILQ_FOREACH_REVERSE(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) {
8303                 if (rsm->r_flags & BBR_RWND_COLLAPSED) {
8304                         /* Clear the flag */
8305                         rsm->r_flags &= ~BBR_RWND_COLLAPSED;
8306                         cleared++;
8307                 } else
8308                         break;
8309         }
8310         bbr_log_type_rwnd_collapse(bbr,
8311                                    (bbr->rc_tp->snd_una + bbr->rc_tp->snd_wnd), 0, cleared);
8312         bbr->rc_has_collapsed = 0;
8313 }
8314
8315 /*
8316  * Return value of 1, the TCB is unlocked and most
8317  * likely gone, return value of 0, the TCB is still
8318  * locked.
8319  */
8320 static int
8321 bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
8322     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
8323     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
8324 {
8325         /*
8326          * Update window information. Don't look at window if no ACK: TAC's
8327          * send garbage on first SYN.
8328          */
8329         uint16_t nsegs;
8330         int32_t tfo_syn;
8331         struct tcp_bbr *bbr;
8332
8333         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
8334         INP_WLOCK_ASSERT(tp->t_inpcb);
8335         nsegs = max(1, m->m_pkthdr.lro_nsegs);
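        /*
         * Classic window-update acceptance test: take the advertised
         * window only from a segment that is not old, i.e. it carries a
         * newer sequence (snd_wl1), a newer ack (snd_wl2), or the same
         * ack with a larger window.
         */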
8336         if ((thflags & TH_ACK) &&
8337             (SEQ_LT(tp->snd_wl1, th->th_seq) ||
8338             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
8339             (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
8340                 /* keep track of pure window updates */
8341                 if (tlen == 0 &&
8342                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
8343                         KMOD_TCPSTAT_INC(tcps_rcvwinupd);
8344                 tp->snd_wnd = tiwin;
8345                 tp->snd_wl1 = th->th_seq;
8346                 tp->snd_wl2 = th->th_ack;
8347                 if (tp->snd_wnd > tp->max_sndwnd)
8348                         tp->max_sndwnd = tp->snd_wnd;
8349                 bbr->r_wanted_output = 1;
8350         } else if (thflags & TH_ACK) {
8351                 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
8352                         tp->snd_wnd = tiwin;
8353                         tp->snd_wl1 = th->th_seq;
8354                         tp->snd_wl2 = th->th_ack;
8355                 }
8356         }
8357         if (tp->snd_wnd < ctf_outstanding(tp))
8358                 /* The peer collapsed its window on us */
8359                 bbr_collapsed_window(bbr);
8360         else if (bbr->rc_has_collapsed)
8361                 bbr_un_collapse_window(bbr);
8362         /* Was persist timer active and now we have window space? */
8363         if ((bbr->rc_in_persist != 0) &&
8364             (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2),
8365                                 bbr_minseg(bbr)))) {
8366                 /*
8367                  * Make the rate persist at end of persist mode if idle long
8368                  * enough
8369                  */
8370                 bbr_exit_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
8371
8372                 /* Make sure we output to start the timer */
8373                 bbr->r_wanted_output = 1;
8374         }
8375         /* Do we need to enter persist? */
8376         if ((bbr->rc_in_persist == 0) &&
8377             (tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) &&
8378             TCPS_HAVEESTABLISHED(tp->t_state) &&
8379             (tp->snd_max == tp->snd_una) &&
8380             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
8381             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
8382                 /* No send window.. we must enter persist */
8383                 bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
8384         }
8385         if (tp->t_flags2 & TF2_DROP_AF_DATA) {
8386                 m_freem(m);
8387                 return (0);
8388         }
8389         /*
8390          * We don't support urgent data, but
8391          * drag along the urgent pointer just to make sure
8392          * that if there is a stack switch no one
8393          * is surprised.
8394          */
8395         tp->rcv_up = tp->rcv_nxt;
8396         INP_WLOCK_ASSERT(tp->t_inpcb);
8397
8398         /*
8399          * Process the segment text, merging it into the TCP sequencing
8400          * queue, and arranging for acknowledgment of receipt if necessary.
8401          * This process logically involves adjusting tp->rcv_wnd as data is
8402          * presented to the user (this happens in tcp_usrreq.c, case
8403          * PRU_RCVD).  If a FIN has already been received on this connection
8404          * then we just ignore the text.
8405          */
8406         tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
8407                    IS_FASTOPEN(tp->t_flags));
8408         if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
8409             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
8410                 tcp_seq save_start = th->th_seq;
8411                 tcp_seq save_rnxt  = tp->rcv_nxt;
8412                 int     save_tlen  = tlen;
8413
8414                 m_adj(m, drop_hdrlen);  /* delayed header drop */
8415                 /*
8416                  * Insert segment which includes th into TCP reassembly
8417                  * queue with control block tp.  Set thflags to whether
8418                  * reassembly now includes a segment with FIN.  This handles
8419                  * the common case inline (segment is the next to be
8420                  * received on an established connection, and the queue is
8421                  * empty), avoiding linkage into and removal from the queue
8422                  * and repetition of various conversions. Set DELACK for
8423                  * segments received in order, but ack immediately when
8424                  * segments are out of order (so fast retransmit can work).
8425                  */
8426                 if (th->th_seq == tp->rcv_nxt &&
8427                     SEGQ_EMPTY(tp) &&
8428                     (TCPS_HAVEESTABLISHED(tp->t_state) ||
8429                     tfo_syn)) {
8430 #ifdef NETFLIX_SB_LIMITS
8431                         u_int mcnt, appended;
8432
8433                         if (so->so_rcv.sb_shlim) {
8434                                 mcnt = m_memcnt(m);
8435                                 appended = 0;
8436                                 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
8437                                     CFO_NOSLEEP, NULL) == false) {
8438                                         counter_u64_add(tcp_sb_shlim_fails, 1);
8439                                         m_freem(m);
8440                                         return (0);
8441                                 }
8442                         }
8443
8444 #endif
8445                         if (DELAY_ACK(tp, bbr, nsegs) || tfo_syn) {
8446                                 bbr->bbr_segs_rcvd += max(1, nsegs);
8447                                 tp->t_flags |= TF_DELACK;
8448                                 bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
8449                         } else {
8450                                 bbr->r_wanted_output = 1;
8451                                 tp->t_flags |= TF_ACKNOW;
8452                         }
8453                         tp->rcv_nxt += tlen;
8454                         thflags = th->th_flags & TH_FIN;
8455                         KMOD_TCPSTAT_ADD(tcps_rcvpack, (int)nsegs);
8456                         KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
8457                         SOCKBUF_LOCK(&so->so_rcv);
8458                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
8459                                 m_freem(m);
8460                         else
8461 #ifdef NETFLIX_SB_LIMITS
8462                                 appended =
8463 #endif
8464                                         sbappendstream_locked(&so->so_rcv, m, 0);
8465                         /* NB: sorwakeup_locked() does an implicit unlock. */
8466                         sorwakeup_locked(so);
8467 #ifdef NETFLIX_SB_LIMITS
8468                         if (so->so_rcv.sb_shlim && appended != mcnt)
8469                                 counter_fo_release(so->so_rcv.sb_shlim,
8470                                     mcnt - appended);
8471 #endif
8472                 } else {
8473                         /*
8474                          * XXX: Due to the header drop above "th" is
8475                          * theoretically invalid by now.  Fortunately
8476                          * m_adj() doesn't actually frees any mbufs when
8477                          * m_adj() doesn't actually free any mbufs when
8478                          */
8479                         tcp_seq temp = save_start;
8480                         thflags = tcp_reass(tp, th, &temp, &tlen, m);
8481                         tp->t_flags |= TF_ACKNOW;
8482                 }
8483                 if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) {
8484                         if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
8485                                 /*
8486                                  * DSACK actually handled in the fastpath
8487                                  * above.
8488                                  */
8489                                 tcp_update_sack_list(tp, save_start,
8490                                     save_start + save_tlen);
8491                         } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
8492                                 if ((tp->rcv_numsacks >= 1) &&
8493                                     (tp->sackblks[0].end == save_start)) {
8494                                         /*
8495                                          * Partial overlap, recorded at todrop
8496                                          * above.
8497                                          */
8498                                         tcp_update_sack_list(tp,
8499                                             tp->sackblks[0].start,
8500                                             tp->sackblks[0].end);
8501                                 } else {
8502                                         tcp_update_dsack_list(tp, save_start,
8503                                             save_start + save_tlen);
8504                                 }
8505                         } else if (tlen >= save_tlen) {
8506                                 /* Update of sackblks. */
8507                                 tcp_update_dsack_list(tp, save_start,
8508                                     save_start + save_tlen);
8509                         } else if (tlen > 0) {
8510                                 tcp_update_dsack_list(tp, save_start,
8511                                     save_start + tlen);
8512                         }
8513                 }
8514         } else {
8515                 m_freem(m);
8516                 thflags &= ~TH_FIN;
8517         }
8518
8519         /*
8520          * If FIN is received ACK the FIN and let the user know that the
8521          * connection is closing.
8522          */
8523         if (thflags & TH_FIN) {
8524                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
8525                         socantrcvmore(so);
8526                         /*
8527                          * If connection is half-synchronized (ie NEEDSYN
8528                          * flag on) then delay ACK, so it may be piggybacked
8529                          * when SYN is sent. Otherwise, since we received a
8530                          * FIN then no more input can be expected, send ACK
8531                          * now.
8532                          */
8533                         if (tp->t_flags & TF_NEEDSYN) {
8534                                 tp->t_flags |= TF_DELACK;
8535                                 bbr_timer_cancel(bbr,
8536                                     __LINE__, bbr->r_ctl.rc_rcvtime);
8537                         } else {
8538                                 tp->t_flags |= TF_ACKNOW;
8539                         }
8540                         tp->rcv_nxt++;
8541                 }
8542                 switch (tp->t_state) {
8543
8544                         /*
8545                          * In SYN_RECEIVED and ESTABLISHED STATES enter the
8546                          * CLOSE_WAIT state.
8547                          */
8548                 case TCPS_SYN_RECEIVED:
8549                         tp->t_starttime = ticks;
8550                         /* FALLTHROUGH */
8551                 case TCPS_ESTABLISHED:
8552                         tcp_state_change(tp, TCPS_CLOSE_WAIT);
8553                         break;
8554
8555                         /*
8556                          * If still in FIN_WAIT_1 STATE FIN has not been
8557                          * acked so enter the CLOSING state.
8558                          */
8559                 case TCPS_FIN_WAIT_1:
8560                         tcp_state_change(tp, TCPS_CLOSING);
8561                         break;
8562
8563                         /*
8564                          * In FIN_WAIT_2 state enter the TIME_WAIT state,
8565                          * starting the time-wait timer, turning off the
8566                          * other standard timers.
8567                          */
8568                 case TCPS_FIN_WAIT_2:
8569                         bbr->rc_timer_first = 1;
8570                         bbr_timer_cancel(bbr,
8571                             __LINE__, bbr->r_ctl.rc_rcvtime);
8572                         INP_WLOCK_ASSERT(tp->t_inpcb);
8573                         tcp_twstart(tp);
8574                         return (1);
8575                 }
8576         }
8577         /*
8578          * Return any desired output.
8579          */
8580         if ((tp->t_flags & TF_ACKNOW) ||
8581             (sbavail(&so->so_snd) > ctf_outstanding(tp))) {
8582                 bbr->r_wanted_output = 1;
8583         }
8584         INP_WLOCK_ASSERT(tp->t_inpcb);
8585         return (0);
8586 }
8587
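/*
 * Editor's sketch (illustrative only, not part of this file's build): the
 * FIN-driven state transitions performed in the switch above, restated as
 * a tiny stand-alone model. All names below are hypothetical; only the
 * transition targets come from the code above.
 */
enum sketch_state {
	SK_SYN_RECEIVED, SK_ESTABLISHED, SK_FIN_WAIT_1,
	SK_FIN_WAIT_2, SK_CLOSE_WAIT, SK_CLOSING, SK_TIME_WAIT
};

static enum sketch_state
sketch_fin_transition(enum sketch_state cur)
{
	switch (cur) {
	case SK_SYN_RECEIVED:	/* FALLTHROUGH: same target as ESTABLISHED */
	case SK_ESTABLISHED:
		return (SK_CLOSE_WAIT);	/* passive close begins */
	case SK_FIN_WAIT_1:
		return (SK_CLOSING);	/* our FIN is not yet acked */
	case SK_FIN_WAIT_2:
		return (SK_TIME_WAIT);	/* our FIN was acked; 2MSL wait */
	default:
		return (cur);		/* no FIN transition elsewhere */
	}
}
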
8588 /*
8589  * Nothing here is really faster; we have simply broken out
8590  * the fast-data path, just like the fast-ack path. Return 1
8591  * if we processed the packet; return 0 if the caller needs
8592  * to take the "slow-path".
8593  */
8594 static int
8595 bbr_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
8596     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
8597     uint32_t tiwin, int32_t nxt_pkt)
8598 {
8599         uint16_t nsegs;
8600         int32_t newsize = 0;    /* automatic sockbuf scaling */
8601         struct tcp_bbr *bbr;
8602 #ifdef NETFLIX_SB_LIMITS
8603         u_int mcnt, appended;
8604 #endif
8605 #ifdef TCPDEBUG
8606         /*
8607          * The size of tcp_saveipgen must be the size of the max ip header,
8608          * now IPv6.
8609          */
8610         u_char tcp_saveipgen[IP6_HDR_LEN];
8611         struct tcphdr tcp_savetcp;
8612         short ostate = 0;
8613
8614 #endif
8615         /* On the hpts and we would have called output */
8616         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
8617
8618         /*
8619          * If last ACK falls within this segment's sequence numbers, record
8620          * the timestamp. NOTE that the test is modified according to the
8621          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
8622          */
8623         if (bbr->r_ctl.rc_resend != NULL) {
8624                 return (0);
8625         }
8626         if (tiwin && tiwin != tp->snd_wnd) {
8627                 return (0);
8628         }
8629         if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
8630                 return (0);
8631         }
8632         if (__predict_false((to->to_flags & TOF_TS) &&
8633             (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
8634                 return (0);
8635         }
8636         if (__predict_false((th->th_ack != tp->snd_una))) {
8637                 return (0);
8638         }
8639         if (__predict_false(tlen > sbspace(&so->so_rcv))) {
8640                 return (0);
8641         }
8642         if ((to->to_flags & TOF_TS) != 0 &&
8643             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
8644                 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
8645                 tp->ts_recent = to->to_tsval;
8646         }
8647         /*
8648          * This is a pure, in-sequence data packet with nothing on the
8649          * reassembly queue and we have enough buffer space to take it.
8650          */
8651         nsegs = max(1, m->m_pkthdr.lro_nsegs);
8652
8653 #ifdef NETFLIX_SB_LIMITS
8654         if (so->so_rcv.sb_shlim) {
8655                 mcnt = m_memcnt(m);
8656                 appended = 0;
8657                 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
8658                     CFO_NOSLEEP, NULL) == false) {
8659                         counter_u64_add(tcp_sb_shlim_fails, 1);
8660                         m_freem(m);
8661                         return (1);
8662                 }
8663         }
8664 #endif
8665         /* Clean receiver SACK report if present */
8666         if (tp->rcv_numsacks)
8667                 tcp_clean_sackreport(tp);
8668         KMOD_TCPSTAT_INC(tcps_preddat);
8669         tp->rcv_nxt += tlen;
8670         /*
8671          * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
8672          */
8673         tp->snd_wl1 = th->th_seq;
8674         /*
8675          * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
8676          */
8677         tp->rcv_up = tp->rcv_nxt;
8678         KMOD_TCPSTAT_ADD(tcps_rcvpack, (int)nsegs);
8679         KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
8680 #ifdef TCPDEBUG
8681         if (so->so_options & SO_DEBUG)
8682                 tcp_trace(TA_INPUT, ostate, tp,
8683                     (void *)tcp_saveipgen, &tcp_savetcp, 0);
8684 #endif
8685         newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
8686
8687         /* Add data to socket buffer. */
8688         SOCKBUF_LOCK(&so->so_rcv);
8689         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
8690                 m_freem(m);
8691         } else {
8692                 /*
8693                  * Set new socket buffer size. Give up when limit is
8694                  * reached.
8695                  */
8696                 if (newsize)
8697                         if (!sbreserve_locked(&so->so_rcv,
8698                             newsize, so, NULL))
8699                                 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
8700                 m_adj(m, drop_hdrlen);  /* delayed header drop */
8701
8702 #ifdef NETFLIX_SB_LIMITS
8703                 appended =
8704 #endif
8705                         sbappendstream_locked(&so->so_rcv, m, 0);
8706                 ctf_calc_rwin(so, tp);
8707         }
8708         /* NB: sorwakeup_locked() does an implicit unlock. */
8709         sorwakeup_locked(so);
8710 #ifdef NETFLIX_SB_LIMITS
8711         if (so->so_rcv.sb_shlim && mcnt != appended)
8712                 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
8713 #endif
8714         if (DELAY_ACK(tp, bbr, nsegs)) {
8715                 bbr->bbr_segs_rcvd += max(1, nsegs);
8716                 tp->t_flags |= TF_DELACK;
8717                 bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
8718         } else {
8719                 bbr->r_wanted_output = 1;
8720                 tp->t_flags |= TF_ACKNOW;
8721         }
8722         return (1);
8723 }
8724
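/*
 * Editor's sketch (illustrative only): the qualification tests that
 * bbr_do_fastnewdata() applies before taking the fast path, distilled into
 * one stand-alone predicate. The struct fields are hypothetical stand-ins
 * for the tcpcb/bbr state checked above.
 */
#include <stdbool.h>
#include <stdint.h>

struct sketch_fastdata {
	bool	 retransmitting;	/* rc_resend != NULL */
	uint32_t tiwin, snd_wnd;	/* advertised vs. recorded window */
	bool	 need_syn_or_fin;	/* TF_NEEDSYN | TF_NEEDFIN pending */
	bool	 ts_present, ts_old;	/* TOF_TS set and tsval < ts_recent */
	uint32_t th_ack, snd_una;
	int32_t	 tlen, rcv_space;	/* payload length vs. sbspace() */
};

static bool
sketch_fastdata_ok(const struct sketch_fastdata *s)
{
	return (!s->retransmitting &&
	    (s->tiwin == 0 || s->tiwin == s->snd_wnd) &&
	    !s->need_syn_or_fin &&
	    !(s->ts_present && s->ts_old) &&
	    s->th_ack == s->snd_una &&
	    s->tlen <= s->rcv_space);
}
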
8725 /*
8726  * This subfunction is used to try to highly optimize the
8727  * fast path. We again allow window updates that are
8728  * in sequence to remain in the fast-path. We also add
8729  * in the __predict's to attempt to help the compiler.
8730  * Note that if we return a 0, then we can *not* process
8731  * it and the caller should push the packet into the
8732  * slow-path. If we return 1, then all is well and
8733  * the packet is fully processed.
8734  */
8735 static int
8736 bbr_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
8737     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
8738     uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos)
8739 {
8740         int32_t acked;
8741         uint16_t nsegs;
8742         uint32_t sack_changed;
8743 #ifdef TCPDEBUG
8744         /*
8745          * The size of tcp_saveipgen must be the size of the max ip header,
8746          * now IPv6.
8747          */
8748         u_char tcp_saveipgen[IP6_HDR_LEN];
8749         struct tcphdr tcp_savetcp;
8750         short ostate = 0;
8751
8752 #endif
8753         uint32_t prev_acked = 0;
8754         struct tcp_bbr *bbr;
8755
8756         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
8757                 /* Old ack, behind (or a duplicate of) the last one rcv'd */
8758                 return (0);
8759         }
8760         if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
8761                 /* Above what we have sent? */
8762                 return (0);
8763         }
8764         if (__predict_false(tiwin == 0)) {
8765                 /* zero window */
8766                 return (0);
8767         }
8768         if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
8769                 /* We need a SYN or a FIN, unlikely... */
8770                 return (0);
8771         }
8772         if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
8773                 /* Timestamp is behind... an old ack with seq wrap? */
8774                 return (0);
8775         }
8776         if (__predict_false(IN_RECOVERY(tp->t_flags))) {
8777                 /* Still recovering */
8778                 return (0);
8779         }
8780         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
8781         if (__predict_false(bbr->r_ctl.rc_resend != NULL)) {
8782                 /* We are retransmitting */
8783                 return (0);
8784         }
8785         if (__predict_false(bbr->rc_in_persist != 0)) {
8786                 /* In persist mode */
8787                 return (0);
8788         }
8789         if (bbr->r_ctl.rc_sacked) {
8790                 /* We have sack holes on our scoreboard */
8791                 return (0);
8792         }
8793         /* Ok if we reach here, we can process a fast-ack */
8794         nsegs = max(1, m->m_pkthdr.lro_nsegs);
8795         sack_changed = bbr_log_ack(tp, to, th, &prev_acked);
8796         /*
8797          * We never detect loss in fast ack [we can't
8798          * have a sack and can't be in recovery so
8799          * we always pass 0 (nothing detected)].
8800          */
8801         bbr_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime, 0);
8802         /* Did the window get updated? */
8803         if (tiwin != tp->snd_wnd) {
8804                 tp->snd_wnd = tiwin;
8805                 tp->snd_wl1 = th->th_seq;
8806                 if (tp->snd_wnd > tp->max_sndwnd)
8807                         tp->max_sndwnd = tp->snd_wnd;
8808         }
8809         /* Do we need to exit persists? */
8810         if ((bbr->rc_in_persist != 0) &&
8811             (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2),
8812                                bbr_minseg(bbr)))) {
8813                 bbr_exit_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
8814                 bbr->r_wanted_output = 1;
8815         }
8816         /* Do we need to enter persists? */
8817         if ((bbr->rc_in_persist == 0) &&
8818             (tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) &&
8819             TCPS_HAVEESTABLISHED(tp->t_state) &&
8820             (tp->snd_max == tp->snd_una) &&
8821             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
8822             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
8823                 /* No send window... we must enter persist */
8824                 bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
8825         }
8826         /*
8827          * If last ACK falls within this segment's sequence numbers, record
8828          * the timestamp. NOTE that the test is modified according to the
8829          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
8830          */
8831         if ((to->to_flags & TOF_TS) != 0 &&
8832             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
8833                 tp->ts_recent_age = bbr->r_ctl.rc_rcvtime;
8834                 tp->ts_recent = to->to_tsval;
8835         }
8836         /*
8837          * This is a pure ack for outstanding data.
8838          */
8839         KMOD_TCPSTAT_INC(tcps_predack);
8840
8841         /*
8842          * "bad retransmit" recovery.
8843          */
8844         if (tp->t_flags & TF_PREVVALID) {
8845                 tp->t_flags &= ~TF_PREVVALID;
8846                 if (tp->t_rxtshift == 1 &&
8847                     (int)(ticks - tp->t_badrxtwin) < 0)
8848                         bbr_cong_signal(tp, th, CC_RTO_ERR, NULL);
8849         }
8850         /*
8851          * Recalculate the transmit timer / rtt.
8852          *
8853          * Some boxes send broken timestamp replies during the SYN+ACK
8854          * phase; ignore timestamps of 0, or we could calculate a huge RTT
8855          * and blow up the retransmit timer.
8856          */
8857         acked = BYTES_THIS_ACK(tp, th);
8858
8859 #ifdef TCP_HHOOK
8860         /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
8861         hhook_run_tcp_est_in(tp, th, to);
8862 #endif
8863
8864         KMOD_TCPSTAT_ADD(tcps_rcvackpack, (int)nsegs);
8865         KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
8866         sbdrop(&so->so_snd, acked);
8867
8868         if (SEQ_GT(th->th_ack, tp->snd_una))
8869                 bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp));
8870         tp->snd_una = th->th_ack;
8871         if (tp->snd_wnd < ctf_outstanding(tp))
8872                 /* The peer collapsed its window on us */
8873                 bbr_collapsed_window(bbr);
8874         else if (bbr->rc_has_collapsed)
8875                 bbr_un_collapse_window(bbr);
8876
8877         if (SEQ_GT(tp->snd_una, tp->snd_recover)) {
8878                 tp->snd_recover = tp->snd_una;
8879         }
8880         bbr_ack_received(tp, bbr, th, acked, sack_changed, prev_acked, __LINE__, 0);
8881         /*
8882          * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
8883          */
8884         tp->snd_wl2 = th->th_ack;
8885         m_freem(m);
8886         /*
8887          * If all outstanding data are acked, stop retransmit timer,
8888          * otherwise restart timer using current (possibly backed-off)
8889          * value. If process is waiting for space, wakeup/selwakeup/signal.
8890          * If data are ready to send, let tcp_output decide between more
8891          * output or persist.
8892          */
8893 #ifdef TCPDEBUG
8894         if (so->so_options & SO_DEBUG)
8895                 tcp_trace(TA_INPUT, ostate, tp,
8896                     (void *)tcp_saveipgen,
8897                     &tcp_savetcp, 0);
8898 #endif
8899         /* Wake up the socket if we have room to write more */
8900         sowwakeup(so);
8901         if (tp->snd_una == tp->snd_max) {
8902                 /* Nothing left outstanding */
8903                 bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__);
8904                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
8905                         bbr->rc_tp->t_acktime = 0;
8906                 bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
8907                 if (bbr->rc_in_persist == 0) {
8908                         bbr->r_ctl.rc_went_idle_time = bbr->r_ctl.rc_rcvtime;
8909                 }
8910                 sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una);
8911                 bbr_log_ack_clear(bbr, bbr->r_ctl.rc_rcvtime);
8912                 /*
8913                  * We invalidate the last ack here since we
8914                  * don't want to carry its time forward into
8915                  * our sum calculations.
8916                  */
8917                 bbr->r_wanted_output = 1;
8918         }
8919         if (sbavail(&so->so_snd)) {
8920                 bbr->r_wanted_output = 1;
8921         }
8922         return (1);
8923 }
8924
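/*
 * Editor's sketch (illustrative only): the persist-entry/exit threshold
 * used in bbr_fastack() above, min(rc_high_rwnd / 2, bbr_minseg(bbr)), as
 * stand-alone helpers. Names are hypothetical; the "already in persist"
 * check that gates the real code is omitted here for brevity.
 */
#include <stdbool.h>
#include <stdint.h>

static uint32_t
sketch_persist_thresh(uint32_t high_rwnd, uint32_t minseg)
{
	uint32_t half = high_rwnd / 2;

	return (half < minseg ? half : minseg);
}

/* Enter persist: window below threshold, all data acked, data queued. */
static bool
sketch_enter_persist(uint32_t snd_wnd, uint32_t thresh, bool established,
    bool all_acked, uint32_t sb_avail)
{
	return (snd_wnd < thresh && established && all_acked &&
	    sb_avail != 0 && sb_avail > snd_wnd);
}
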
8925 /*
8926  * A return value of 1 means the TCB is unlocked and most
8927  * likely gone; a return value of 0 means the TCB is still
8928  * locked.
8929  */
8930 static int
8931 bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
8932     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
8933     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
8934 {
8935         int32_t todrop;
8936         int32_t ourfinisacked = 0;
8937         struct tcp_bbr *bbr;
8938         int32_t ret_val = 0;
8939
8940         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
8941         ctf_calc_rwin(so, tp);
8942         /*
8943          * If the state is SYN_SENT: if the segment contains an ACK, but
8944          * not for our SYN, drop the input. If the segment contains a RST,
8945          * drop the connection. If the segment does not contain a SYN,
8946          * drop it. Otherwise this is an acceptable SYN segment, so
8947          * initialize tp->rcv_nxt and tp->irs; if the segment contains an
8948          * ACK, advance tp->snd_una. BBR does not support ECN, so we will
8949          * not say we are capable. If the SYN has been acked, change to
8950          * ESTABLISHED, else to the SYN_RCVD state; arrange for the segment
8951          * to be acked (eventually); continue processing, beginning with URG.
8952          */
8953         if ((thflags & TH_ACK) &&
8954             (SEQ_LEQ(th->th_ack, tp->iss) ||
8955             SEQ_GT(th->th_ack, tp->snd_max))) {
8956                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
8957                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
8958                 return (1);
8959         }
8960         if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
8961                 TCP_PROBE5(connect__refused, NULL, tp,
8962                     mtod(m, const char *), tp, th);
8963                 tp = tcp_drop(tp, ECONNREFUSED);
8964                 ctf_do_drop(m, tp);
8965                 return (1);
8966         }
8967         if (thflags & TH_RST) {
8968                 ctf_do_drop(m, tp);
8969                 return (1);
8970         }
8971         if (!(thflags & TH_SYN)) {
8972                 ctf_do_drop(m, tp);
8973                 return (1);
8974         }
8975         tp->irs = th->th_seq;
8976         tcp_rcvseqinit(tp);
8977         if (thflags & TH_ACK) {
8978                 int tfo_partial = 0;
8979
8980                 KMOD_TCPSTAT_INC(tcps_connects);
8981                 soisconnected(so);
8982 #ifdef MAC
8983                 mac_socketpeer_set_from_mbuf(m, so);
8984 #endif
8985                 /* Do window scaling on this connection? */
8986                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
8987                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
8988                         tp->rcv_scale = tp->request_r_scale;
8989                 }
8990                 tp->rcv_adv += min(tp->rcv_wnd,
8991                     TCP_MAXWIN << tp->rcv_scale);
8992                 /*
8993                  * If not all the data that was sent in the TFO SYN
8994                  * has been acked, resend the remainder right away.
8995                  */
8996                 if (IS_FASTOPEN(tp->t_flags) &&
8997                     (tp->snd_una != tp->snd_max)) {
8998                         tp->snd_nxt = th->th_ack;
8999                         tfo_partial = 1;
9000                 }
9001                 /*
9002                  * If there's data, delay ACK; if there's also a FIN ACKNOW
9003                  * will be turned on later.
9004                  */
9005                 if (DELAY_ACK(tp, bbr, 1) && tlen != 0 && !tfo_partial) {
9006                         bbr->bbr_segs_rcvd += 1;
9007                         tp->t_flags |= TF_DELACK;
9008                         bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
9009                 } else {
9010                         bbr->r_wanted_output = 1;
9011                         tp->t_flags |= TF_ACKNOW;
9012                 }
9013                 if (SEQ_GT(th->th_ack, tp->iss)) {
9014                         /*
9015                          * The SYN is acked;
9016                          * handle it specially.
9017                          */
9018                         bbr_log_syn(tp, to);
9019                 }
9020                 if (SEQ_GT(th->th_ack, tp->snd_una)) {
9021                         /*
9022                          * We advance snd_una for the
9023                          * fast open case. If th_ack is
9024                          * acknowledging data beyond
9025                          * snd_una we can't just call
9026                          * ack-processing since the
9027                          * data stream in our send-map
9028                          * will start at snd_una + 1 (one
9029                          * beyond the SYN). If it's just
9030                          * equal, we don't need to do that,
9031                          * and there is no send_map.
9032                          */
9033                         tp->snd_una++;
9034                 }
9035                 /*
9036                  * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
9037                  * SYN_SENT --> ESTABLISHED and SYN_SENT* --> FIN_WAIT_1.
9038                  */
9039                 tp->t_starttime = ticks;
9040                 if (tp->t_flags & TF_NEEDFIN) {
9041                         tcp_state_change(tp, TCPS_FIN_WAIT_1);
9042                         tp->t_flags &= ~TF_NEEDFIN;
9043                         thflags &= ~TH_SYN;
9044                 } else {
9045                         tcp_state_change(tp, TCPS_ESTABLISHED);
9046                         TCP_PROBE5(connect__established, NULL, tp,
9047                             mtod(m, const char *), tp, th);
9048                         cc_conn_init(tp);
9049                 }
9050         } else {
9051                 /*
9052                  * Received initial SYN in SYN-SENT[*] state => simultaneous
9053                  * open.  If segment contains CC option and there is a
9054                  * cached CC, apply TAO test. If it succeeds, connection is
9055                  * half-synchronized. Otherwise, do 3-way handshake:
9056                  * SYN-SENT -> SYN-RECEIVED and SYN-SENT* -> SYN-RECEIVED*.
9057                  * If there was no CC option, clear cached CC value.
9058                  */
9059                 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
9060                 tcp_state_change(tp, TCPS_SYN_RECEIVED);
9061         }
9062         INP_WLOCK_ASSERT(tp->t_inpcb);
9063         /*
9064          * Advance th->th_seq to correspond to first data byte. If data,
9065          * trim to stay within window, dropping FIN if necessary.
9066          */
9067         th->th_seq++;
9068         if (tlen > tp->rcv_wnd) {
9069                 todrop = tlen - tp->rcv_wnd;
9070                 m_adj(m, -todrop);
9071                 tlen = tp->rcv_wnd;
9072                 thflags &= ~TH_FIN;
9073                 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin);
9074                 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
9075         }
9076         tp->snd_wl1 = th->th_seq - 1;
9077         tp->rcv_up = th->th_seq;
9078         /*
9079          * Client side of transaction: already sent SYN and data. If the
9080          * remote host used T/TCP to validate the SYN, our data will be
9081          * ACK'd; if so, enter normal data segment processing in the middle
9082          * of step 5, ack processing. Otherwise, goto step 6.
9083          */
9084         if (thflags & TH_ACK) {
9085                 if ((to->to_flags & TOF_TS) != 0) {
9086                         uint32_t t, rtt;
9087
9088                         t = tcp_tv_to_mssectick(&bbr->rc_tv);
9089                         if (TSTMP_GEQ(t, to->to_tsecr)) {
9090                                 rtt = t - to->to_tsecr;
9091                                 if (rtt == 0) {
9092                                         rtt = 1;
9093                                 }
9094                                 rtt *= MS_IN_USEC;
9095                                 tcp_bbr_xmit_timer(bbr, rtt, 0, 0, 0);
9096                                 apply_filter_min_small(&bbr->r_ctl.rc_rttprop,
9097                                                        rtt, bbr->r_ctl.rc_rcvtime);
9098                         }
9099                 }
9100                 if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
9101                         return (ret_val);
9102                 /* We may have changed to FIN_WAIT_1 above */
9103                 if (tp->t_state == TCPS_FIN_WAIT_1) {
9104                         /*
9105                          * In FIN_WAIT_1 STATE in addition to the processing
9106                          * for the ESTABLISHED state if our FIN is now
9107                          * acknowledged then enter FIN_WAIT_2.
9108                          */
9109                         if (ourfinisacked) {
9110                                 /*
9111                                  * If we can't receive any more data, then
9112                                  * closing user can proceed. Starting the
9113                                  * timer is contrary to the specification,
9114                                  * but if we don't get a FIN we'll hang
9115                                  * forever.
9116                                  *
9117                                  * XXXjl: we should release the tp also, and
9118                                  * use a compressed state.
9119                                  */
9120                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
9121                                         soisdisconnected(so);
9122                                         tcp_timer_activate(tp, TT_2MSL,
9123                                             (tcp_fast_finwait2_recycle ?
9124                                             tcp_finwait2_timeout :
9125                                             TP_MAXIDLE(tp)));
9126                                 }
9127                                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
9128                         }
9129                 }
9130         }
9131         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9132             tiwin, thflags, nxt_pkt));
9133 }
9134
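/*
 * Editor's sketch (illustrative only): the timestamp-echo RTT sample taken
 * above. Both timestamp clocks tick in milliseconds (tcp_tv_to_mssectick()),
 * the sample is widened to microseconds via MS_IN_USEC, and a zero result
 * is rounded up to 1 so the RTT filters never ingest 0. The constant and
 * function names here are stand-ins.
 */
#include <stdint.h>

#define	SKETCH_MS_IN_USEC	1000

static uint32_t
sketch_rtt_usec(uint32_t now_ms, uint32_t tsecr_ms)
{
	uint32_t rtt;

	if ((int32_t)(now_ms - tsecr_ms) < 0)
		return (0);	/* TSTMP_GEQ failed: no usable sample */
	rtt = now_ms - tsecr_ms;
	if (rtt == 0)
		rtt = 1;	/* sub-millisecond exchange */
	return (rtt * SKETCH_MS_IN_USEC);
}
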
9135 /*
9136  * A return value of 1 means the TCB is unlocked and most
9137  * likely gone; a return value of 0 means the TCB is still
9138  * locked.
9139  */
9140 static int
9141 bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
9142                 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9143                 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
9144 {
9145         int32_t ourfinisacked = 0;
9146         int32_t ret_val;
9147         struct tcp_bbr *bbr;
9148
9149         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
9150         ctf_calc_rwin(so, tp);
9151         if ((thflags & TH_ACK) &&
9152             (SEQ_LEQ(th->th_ack, tp->snd_una) ||
9153              SEQ_GT(th->th_ack, tp->snd_max))) {
9154                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
9155                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9156                 return (1);
9157         }
9158         if (IS_FASTOPEN(tp->t_flags)) {
9159                 /*
9160                  * When a TFO connection is in SYN_RECEIVED, the only valid
9161                  * packets are the initial SYN, a retransmit/copy of the
9162                  * initial SYN (possibly with a subset of the original
9163                  * data), a valid ACK, a FIN, or a RST.
9164                  */
9165                 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
9166                         tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
9167                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9168                         return (1);
9169                 } else if (thflags & TH_SYN) {
9170                         /* non-initial SYN is ignored */
9171                         if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
9172                             (bbr->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
9173                             (bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
9174                                 ctf_do_drop(m, NULL);
9175                                 return (0);
9176                         }
9177                 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
9178                         ctf_do_drop(m, NULL);
9179                         return (0);
9180                 }
9181         }
9182         if ((thflags & TH_RST) ||
9183             (tp->t_fin_is_rst && (thflags & TH_FIN)))
9184                 return (ctf_process_rst(m, th, so, tp));
9185         /*
9186          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
9187          * it's less than ts_recent, drop it.
9188          */
9189         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
9190             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
9191                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
9192                         return (ret_val);
9193         }
9194         /*
9195          * In the SYN-RECEIVED state, validate that the packet belongs to
9196          * this connection before trimming the data to fit the receive
9197          * window.  Check the sequence number versus IRS since we know the
9198          * sequence numbers haven't wrapped.  This is a partial fix for the
9199          * "LAND" DoS attack.
9200          */
9201         if (SEQ_LT(th->th_seq, tp->irs)) {
9202                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
9203                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9204                 return (1);
9205         }
9206         INP_WLOCK_ASSERT(tp->t_inpcb);
9207         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
9208                 return (ret_val);
9209         }
9210         /*
9211          * If last ACK falls within this segment's sequence numbers, record
9212          * its timestamp. NOTE: 1) That the test incorporates suggestions
9213          * from the latest proposal of the tcplw@cray.com list (Braden
9214          * 1993/04/26). 2) That updating only on newer timestamps interferes
9215          * with our earlier PAWS tests, so this check should be solely
9216          * predicated on the sequence space of this segment. 3) That we
9217          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
9218          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
9219          * SEG.Len.  This modified check allows us to overcome RFC1323's
9220          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
9221          * p.869. In such cases, we can still calculate the RTT correctly
9222          * when RCV.NXT == Last.ACK.Sent.
9223          */
9224         if ((to->to_flags & TOF_TS) != 0 &&
9225             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
9226             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
9227                     ((thflags & (TH_SYN | TH_FIN)) != 0))) {
9228                 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
9229                 tp->ts_recent = to->to_tsval;
9230         }
9231         tp->snd_wnd = tiwin;
9232         /*
9233          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
9234          * is on (half-synchronized state), then queue data for later
9235          * processing; else drop segment and return.
9236          */
9237         if ((thflags & TH_ACK) == 0) {
9238                 if (IS_FASTOPEN(tp->t_flags)) {
9239                         cc_conn_init(tp);
9240                 }
9241                 return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9242                                          tiwin, thflags, nxt_pkt));
9243         }
9244         KMOD_TCPSTAT_INC(tcps_connects);
9245         soisconnected(so);
9246         /* Do window scaling? */
9247         if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
9248             (TF_RCVD_SCALE | TF_REQ_SCALE)) {
9249                 tp->rcv_scale = tp->request_r_scale;
9250         }
9251         /*
9252          * Ok, for the first time in, let's see if we can use the timestamp
9253          * to figure out what the initial RTT was.
9254          */
9255         if ((to->to_flags & TOF_TS) != 0) {
9256                 uint32_t t, rtt;
9257
9258                 t = tcp_tv_to_mssectick(&bbr->rc_tv);
9259                 if (TSTMP_GEQ(t, to->to_tsecr)) {
9260                         rtt = t - to->to_tsecr;
9261                         if (rtt == 0) {
9262                                 rtt = 1;
9263                         }
9264                         rtt *= MS_IN_USEC;
9265                         tcp_bbr_xmit_timer(bbr, rtt, 0, 0, 0);
9266                         apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, bbr->r_ctl.rc_rcvtime);
9267                 }
9268         }
9269         /* Drop off any SYN in the send map (probably not there)  */
9270         if (thflags & TH_ACK)
9271                 bbr_log_syn(tp, to);
9272         if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
9274                 tcp_fastopen_decrement_counter(tp->t_tfo_pending);
9275                 tp->t_tfo_pending = NULL;
9276         }
9277         /*
9278          * Make transitions: SYN-RECEIVED  -> ESTABLISHED SYN-RECEIVED* ->
9279          * FIN-WAIT-1
9280          */
9281         tp->t_starttime = ticks;
9282         if (tp->t_flags & TF_NEEDFIN) {
9283                 tcp_state_change(tp, TCPS_FIN_WAIT_1);
9284                 tp->t_flags &= ~TF_NEEDFIN;
9285         } else {
9286                 tcp_state_change(tp, TCPS_ESTABLISHED);
9287                 TCP_PROBE5(accept__established, NULL, tp,
9288                            mtod(m, const char *), tp, th);
9289                 /*
9290                  * TFO connections call cc_conn_init() during SYN
9291                  * processing.  Calling it again here for such connections
9292                  * is not harmless as it would undo the snd_cwnd reduction
9293                  * that occurs when a TFO SYN|ACK is retransmitted.
9294                  */
9295                 if (!IS_FASTOPEN(tp->t_flags))
9296                         cc_conn_init(tp);
9297         }
9298         /*
9299          * Account for the ACK of our SYN prior to
9300          * regular ACK processing below, except for
9301          * simultaneous SYN, which is handled later.
9302          */
9303         if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN))
9304                 tp->snd_una++;
9305         /*
9306          * If segment contains data or ACK, will call tcp_reass() later; if
9307          * not, do so now to pass queued data to user.
9308          */
9309         if (tlen == 0 && (thflags & TH_FIN) == 0)
9310                 (void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
9311                         (struct mbuf *)0);
9312         tp->snd_wl1 = th->th_seq - 1;
9313         if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
9314                 return (ret_val);
9315         }
9316         if (tp->t_state == TCPS_FIN_WAIT_1) {
9317                 /* We could have gone to FIN_WAIT_1 (or EST) above */
9318                 /*
9319                  * In FIN_WAIT_1 STATE in addition to the processing for the
9320                  * ESTABLISHED state if our FIN is now acknowledged then
9321                  * enter FIN_WAIT_2.
9322                  */
9323                 if (ourfinisacked) {
9324                         /*
9325                          * If we can't receive any more data, then closing
9326                          * user can proceed. Starting the timer is contrary
9327                          * to the specification, but if we don't get a FIN
9328                          * we'll hang forever.
9329                          *
9330                          * XXXjl: we should release the tp also, and use a
9331                          * compressed state.
9332                          */
9333                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
9334                                 soisdisconnected(so);
9335                                 tcp_timer_activate(tp, TT_2MSL,
9336                                                    (tcp_fast_finwait2_recycle ?
9337                                                     tcp_finwait2_timeout :
9338                                                     TP_MAXIDLE(tp)));
9339                         }
9340                         tcp_state_change(tp, TCPS_FIN_WAIT_2);
9341                 }
9342         }
9343         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9344                                  tiwin, thflags, nxt_pkt));
9345 }
9346
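/*
 * Editor's sketch (illustrative only): the modified RFC 1323 recording
 * test described in the long NOTE above. Last.ACK.Sent must fall inside
 * [SEG.SEQ, SEG.SEQ + SEG.Len], with a SYN or FIN counting for one unit
 * of sequence space. The macro mirrors the usual wraparound-safe signed
 * compare that SEQ_LEQ() performs.
 */
#include <stdbool.h>
#include <stdint.h>

#define	SKETCH_SEQ_LEQ(a, b)	((int32_t)((a) - (b)) <= 0)

static bool
sketch_record_ts(uint32_t th_seq, int32_t tlen, bool syn_or_fin,
    uint32_t last_ack_sent)
{
	uint32_t seg_end = th_seq + tlen + (syn_or_fin ? 1 : 0);

	return (SKETCH_SEQ_LEQ(th_seq, last_ack_sent) &&
	    SKETCH_SEQ_LEQ(last_ack_sent, seg_end));
}
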
9347 /*
9348  * A return value of 1 means the TCB is unlocked and most
9349  * likely gone; a return value of 0 means the TCB is still
9350  * locked.
9351  */
9352 static int
9353 bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
9354     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9355     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
9356 {
9357         struct tcp_bbr *bbr;
9358         int32_t ret_val;
9359
9360         /*
9361          * Header prediction: check for the two common cases of a
9362          * uni-directional data xfer.  If the packet has no control flags,
9363          * is in-sequence, the window didn't change and we're not
9364          * retransmitting, it's a candidate.  If the length is zero and the
9365          * ack moved forward, we're the sender side of the xfer.  Just free
9366          * the data acked & wake any higher level process that was blocked
9367          * waiting for space.  If the length is non-zero and the ack didn't
9368          * move, we're the receiver side.  If we're getting packets in-order
9369          * (the reassembly queue is empty), add the data to the socket
9370          * buffer and note that we need a delayed ack. Make sure that the
9371          * hidden state-flags are also off. Since we check for
9372          * TCPS_ESTABLISHED first, it can only be TF_NEEDSYN.
9373          */
9374         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
9375         if (bbr->r_ctl.rc_delivered < (4 * tp->t_maxseg)) {
9376                 /*
9377                  * If we have delivered under 4 segments, increase the initial
9378                  * window if raised by the peer. We use this to determine
9379                  * dynamic and static rwnd's at the end of a connection.
9380                  */
9381                 bbr->r_ctl.rc_init_rwnd = max(tiwin, tp->snd_wnd);
9382         }
9383         if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
9384             __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) &&
9385             __predict_true(SEGQ_EMPTY(tp)) &&
9386             __predict_true(th->th_seq == tp->rcv_nxt)) {
9387                 if (tlen == 0) {
9388                         if (bbr_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
9389                             tiwin, nxt_pkt, iptos)) {
9390                                 return (0);
9391                         }
9392                 } else {
9393                         if (bbr_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
9394                             tiwin, nxt_pkt)) {
9395                                 return (0);
9396                         }
9397                 }
9398         }
9399         ctf_calc_rwin(so, tp);
9400
9401         if ((thflags & TH_RST) ||
9402             (tp->t_fin_is_rst && (thflags & TH_FIN)))
9403                 return (ctf_process_rst(m, th, so, tp));
9404         /*
9405          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
9406          * synchronized state.
9407          */
9408         if (thflags & TH_SYN) {
9409                 ctf_challenge_ack(m, th, tp, &ret_val);
9410                 return (ret_val);
9411         }
9412         /*
9413          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
9414          * it's less than ts_recent, drop it.
9415          */
9416         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
9417             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
9418                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
9419                         return (ret_val);
9420         }
9421         INP_WLOCK_ASSERT(tp->t_inpcb);
9422         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
9423                 return (ret_val);
9424         }
9425         /*
9426          * If last ACK falls within this segment's sequence numbers, record
9427          * its timestamp. NOTE: 1) That the test incorporates suggestions
9428          * from the latest proposal of the tcplw@cray.com list (Braden
9429          * 1993/04/26). 2) That updating only on newer timestamps interferes
9430          * with our earlier PAWS tests, so this check should be solely
9431          * predicated on the sequence space of this segment. 3) That we
9432          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
9433          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
9434          * SEG.Len.  This modified check allows us to overcome RFC1323's
9435          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
9436          * p.869. In such cases, we can still calculate the RTT correctly
9437          * when RCV.NXT == Last.ACK.Sent.
9438          */
9439         if ((to->to_flags & TOF_TS) != 0 &&
9440             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
9441             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
9442             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
9443                 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
9444                 tp->ts_recent = to->to_tsval;
9445         }
9446         /*
9447          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
9448          * is on (half-synchronized state), then queue data for later
9449          * processing; else drop segment and return.
9450          */
9451         if ((thflags & TH_ACK) == 0) {
9452                 if (tp->t_flags & TF_NEEDSYN) {
9453                         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9454                             tiwin, thflags, nxt_pkt));
9455                 } else if (tp->t_flags & TF_ACKNOW) {
9456                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
9457                         bbr->r_wanted_output = 1;
9458                         return (ret_val);
9459                 } else {
9460                         ctf_do_drop(m, NULL);
9461                         return (0);
9462                 }
9463         }
9464         /*
9465          * Ack processing.
9466          */
9467         if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
9468                 return (ret_val);
9469         }
9470         if (sbavail(&so->so_snd)) {
9471                 if (ctf_progress_timeout_check(tp, true)) {
9472                         bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
9473                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9474                         return (1);
9475                 }
9476         }
9477         /* State changes only happen in bbr_process_data() */
9478         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9479             tiwin, thflags, nxt_pkt));
9480 }
9481
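/*
 * Editor's sketch (illustrative only): the header-prediction gate at the
 * top of bbr_do_established(), reduced to a classifier. A segment with no
 * SACK blocks, only TH_ACK among the control flags, an empty reassembly
 * queue and an exactly in-sequence start is fast-pathed: zero payload
 * means we are the sender side (pure ack), otherwise the receiver side.
 */
#include <stdbool.h>
#include <stdint.h>

enum sketch_path { SKETCH_SLOW, SKETCH_FAST_ACK, SKETCH_FAST_DATA };

static enum sketch_path
sketch_classify(bool has_sack, bool only_ack_set, bool segq_empty,
    uint32_t th_seq, uint32_t rcv_nxt, int32_t tlen)
{
	if (has_sack || !only_ack_set || !segq_empty || th_seq != rcv_nxt)
		return (SKETCH_SLOW);	/* take the full slow path */
	return (tlen == 0 ? SKETCH_FAST_ACK : SKETCH_FAST_DATA);
}
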
9482 /*
9483  * A return value of 1 means the TCB is unlocked and most
9484  * likely gone; a return value of 0 means the TCB is still
9485  * locked.
9486  */
9487 static int
9488 bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
9489     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9490     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
9491 {
9492         struct tcp_bbr *bbr;
9493         int32_t ret_val;
9494
9495         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
9496         ctf_calc_rwin(so, tp);
9497         if ((thflags & TH_RST) ||
9498             (tp->t_fin_is_rst && (thflags & TH_FIN)))
9499                 return (ctf_process_rst(m, th, so, tp));
9500         /*
9501          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
9502          * synchronized state.
9503          */
9504         if (thflags & TH_SYN) {
9505                 ctf_challenge_ack(m, th, tp, &ret_val);
9506                 return (ret_val);
9507         }
9508         /*
9509          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
9510          * it's less than ts_recent, drop it.
9511          */
9512         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
9513             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
9514                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
9515                         return (ret_val);
9516         }
9517         INP_WLOCK_ASSERT(tp->t_inpcb);
9518         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
9519                 return (ret_val);
9520         }
9521         /*
9522          * If last ACK falls within this segment's sequence numbers, record
9523          * its timestamp. NOTE: 1) That the test incorporates suggestions
9524          * from the latest proposal of the tcplw@cray.com list (Braden
9525          * 1993/04/26). 2) That updating only on newer timestamps interferes
9526          * with our earlier PAWS tests, so this check should be solely
9527          * predicated on the sequence space of this segment. 3) That we
9528          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
9529          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
9530          * SEG.Len.  This modified check allows us to overcome RFC1323's
9531          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
9532          * p.869. In such cases, we can still calculate the RTT correctly
9533          * when RCV.NXT == Last.ACK.Sent.
9534          */
9535         if ((to->to_flags & TOF_TS) != 0 &&
9536             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
9537             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
9538             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
9539                 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
9540                 tp->ts_recent = to->to_tsval;
9541         }
9542         /*
9543          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
9544          * is on (half-synchronized state), then queue data for later
9545          * processing; else drop segment and return.
9546          */
9547         if ((thflags & TH_ACK) == 0) {
9548                 if (tp->t_flags & TF_NEEDSYN) {
9549                         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9550                             tiwin, thflags, nxt_pkt));
9551                 } else if (tp->t_flags & TF_ACKNOW) {
9552                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
9553                         bbr->r_wanted_output = 1;
9554                         return (ret_val);
9555                 } else {
9556                         ctf_do_drop(m, NULL);
9557                         return (0);
9558                 }
9559         }
9560         /*
9561          * Ack processing.
9562          */
9563         if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
9564                 return (ret_val);
9565         }
9566         if (sbavail(&so->so_snd)) {
9567                 if (ctf_progress_timeout_check(tp, true)) {
9568                         bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
9569                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9570                         return (1);
9571                 }
9572         }
9573         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9574             tiwin, thflags, nxt_pkt));
9575 }
9576
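/*
 * Editor's sketch (illustrative only): the three-way disposition used
 * above (and in the other bbr_do_* handlers) for a segment that arrives
 * with the ACK bit off. Names are hypothetical.
 */
enum sketch_noack { SKETCH_QUEUE, SKETCH_DROP_AFTER_ACK, SKETCH_DROP };

static enum sketch_noack
sketch_noack_action(int need_syn, int ack_now)
{
	if (need_syn)
		return (SKETCH_QUEUE);		/* half-synchronized: keep it */
	if (ack_now)
		return (SKETCH_DROP_AFTER_ACK);	/* an ACK is already owed */
	return (SKETCH_DROP);			/* silent drop */
}
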
9577 static int
9578 bbr_check_data_after_close(struct mbuf *m, struct tcp_bbr *bbr,
9579     struct tcpcb *tp, int32_t * tlen, struct tcphdr *th, struct socket *so)
9580 {
9581
9582         if (bbr->rc_allow_data_af_clo == 0) {
9583 close_now:
9584                 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
9585                 /* tcp_close will kill the inp pre-log the Reset */
9586                 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
9587                 tp = tcp_close(tp);
9588                 KMOD_TCPSTAT_INC(tcps_rcvafterclose);
9589                 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
9590                 return (1);
9591         }
9592         if (sbavail(&so->so_snd) == 0)
9593                 goto close_now;
9594         /* Ok we allow data that is ignored and a followup reset */
9595         tp->rcv_nxt = th->th_seq + *tlen;
9596         tp->t_flags2 |= TF2_DROP_AF_DATA;
9597         bbr->r_wanted_output = 1;
9598         *tlen = 0;
9599         return (0);
9600 }
9601
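/*
 * Editor's sketch (illustrative only): the policy encoded in
 * bbr_check_data_after_close() above. With rc_allow_data_af_clo clear, or
 * with nothing left in the send buffer, reset immediately; otherwise
 * swallow the data (TF2_DROP_AF_DATA) and reset once our own data has
 * been acked.
 */
#include <stdbool.h>

static bool				/* true = reset right now */
sketch_reset_on_data_after_close(bool allow_data, unsigned long snd_avail)
{
	if (!allow_data)
		return (true);
	if (snd_avail == 0)
		return (true);		/* the close_now path above */
	return (false);			/* ignore the data, reset later */
}
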
9602 /*
9603  * A return value of 1 means the TCB is unlocked and most
9604  * likely gone; a return value of 0 means the TCB is still
9605  * locked.
9606  */
9607 static int
9608 bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
9609     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9610     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
9611 {
9612         int32_t ourfinisacked = 0;
9613         int32_t ret_val;
9614         struct tcp_bbr *bbr;
9615
9616         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
9617         ctf_calc_rwin(so, tp);
9618         if ((thflags & TH_RST) ||
9619             (tp->t_fin_is_rst && (thflags & TH_FIN)))
9620                 return (ctf_process_rst(m, th, so, tp));
9621         /*
9622          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
9623          * synchronized state.
9624          */
9625         if (thflags & TH_SYN) {
9626                 ctf_challenge_ack(m, th, tp, &ret_val);
9627                 return (ret_val);
9628         }
9629         /*
9630          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
9631          * it's less than ts_recent, drop it.
9632          */
9633         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
9634             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
9635                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
9636                         return (ret_val);
9637         }
9638         INP_WLOCK_ASSERT(tp->t_inpcb);
9639         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
9640                 return (ret_val);
9641         }
9642         /*
9643          * If new data are received on a connection after the user processes
9644          * are gone, then RST the other end.
9645          */
9646         if ((so->so_state & SS_NOFDREF) && tlen) {
9647                 /*
9648                  * We call a new function now, so we might continue and set
9649                  * up to reset once all data has been acked.
9650                  */
9651                 if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so))
9652                         return (1);
9653         }
9654         /*
9655          * If last ACK falls within this segment's sequence numbers, record
9656          * its timestamp. NOTE: 1) That the test incorporates suggestions
9657          * from the latest proposal of the tcplw@cray.com list (Braden
9658          * 1993/04/26). 2) That updating only on newer timestamps interferes
9659          * with our earlier PAWS tests, so this check should be solely
9660          * predicated on the sequence space of this segment. 3) That we
9661          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
9662          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
9663          * SEG.Len.  This modified check allows us to overcome RFC1323's
9664          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
9665          * p.869. In such cases, we can still calculate the RTT correctly
9666          * when RCV.NXT == Last.ACK.Sent.
9667          */
9668         if ((to->to_flags & TOF_TS) != 0 &&
9669             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
9670             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
9671             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
9672                 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
9673                 tp->ts_recent = to->to_tsval;
9674         }
9675         /*
9676          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
9677          * is on (half-synchronized state), then queue data for later
9678          * processing; else drop segment and return.
9679          */
9680         if ((thflags & TH_ACK) == 0) {
9681                 if (tp->t_flags & TF_NEEDSYN) {
9682                         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9683                             tiwin, thflags, nxt_pkt));
9684                 } else if (tp->t_flags & TF_ACKNOW) {
9685                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
9686                         bbr->r_wanted_output = 1;
9687                         return (ret_val);
9688                 } else {
9689                         ctf_do_drop(m, NULL);
9690                         return (0);
9691                 }
9692         }
9693         /*
9694          * Ack processing.
9695          */
9696         if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
9697                 return (ret_val);
9698         }
9699         if (ourfinisacked) {
9700                 /*
9701                  * If we can't receive any more data, then closing user can
9702                  * proceed. Starting the timer is contrary to the
9703                  * specification, but if we don't get a FIN we'll hang
9704                  * forever.
9705                  *
9706                  * XXXjl: we should release the tp also, and use a
9707                  * compressed state.
9708                  */
9709                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
9710                         soisdisconnected(so);
9711                         tcp_timer_activate(tp, TT_2MSL,
9712                             (tcp_fast_finwait2_recycle ?
9713                             tcp_finwait2_timeout :
9714                             TP_MAXIDLE(tp)));
9715                 }
9716                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
9717         }
9718         if (sbavail(&so->so_snd)) {
9719                 if (ctf_progress_timeout_check(tp, true)) {
9720                         bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
9721                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9722                         return (1);
9723                 }
9724         }
9725         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9726             tiwin, thflags, nxt_pkt));
9727 }
9728
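/*
 * Editor's sketch (illustrative only): the FIN_WAIT_2 guard timer armed
 * above once our FIN is acked and the receive side is shut. The spec says
 * not to time out here, but without the timer a peer that never sends its
 * FIN would park the connection forever. The parameters are stand-ins for
 * tcp_fast_finwait2_recycle, tcp_finwait2_timeout and TP_MAXIDLE().
 */
#include <stdint.h>

static uint32_t
sketch_finwait2_timer(int fast_recycle, uint32_t finwait2_timeout,
    uint32_t maxidle)
{
	/* Recycle quickly if enabled, else wait out the idle maximum. */
	return (fast_recycle ? finwait2_timeout : maxidle);
}
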
9729 /*
9730  * A return value of 1 means the TCB is unlocked and most
9731  * likely gone; a return value of 0 means the TCB is still
9732  * locked.
9733  */
9734 static int
9735 bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
9736     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9737     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
9738 {
9739         int32_t ourfinisacked = 0;
9740         int32_t ret_val;
9741         struct tcp_bbr *bbr;
9742
9743         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
9744         ctf_calc_rwin(so, tp);
9745         if ((thflags & TH_RST) ||
9746             (tp->t_fin_is_rst && (thflags & TH_FIN)))
9747                 return (ctf_process_rst(m, th, so, tp));
9748         /*
9749          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
9750          * synchronized state.
9751          */
9752         if (thflags & TH_SYN) {
9753                 ctf_challenge_ack(m, th, tp, &ret_val);
9754                 return (ret_val);
9755         }
9756         /*
9757          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
9758          * it's less than ts_recent, drop it.
9759          */
9760         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
9761             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
9762                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
9763                         return (ret_val);
9764         }
9765         INP_WLOCK_ASSERT(tp->t_inpcb);
9766         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
9767                 return (ret_val);
9768         }
9769         /*
9770          * If new data are received on a connection after the user processes
9771          * are gone, then RST the other end.
9772          */
9773         if ((so->so_state & SS_NOFDREF) && tlen) {
9774                 /*
9775                  * We call a new function now, so we might continue and set
9776                  * up to reset once all data has been acked.
9777                  */
9778                 if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so))
9779                         return (1);
9780         }
9781         /*
9782          * If last ACK falls within this segment's sequence numbers, record
9783          * its timestamp. NOTE: 1) That the test incorporates suggestions
9784          * from the latest proposal of the tcplw@cray.com list (Braden
9785          * 1993/04/26). 2) That updating only on newer timestamps interferes
9786          * with our earlier PAWS tests, so this check should be solely
9787          * predicated on the sequence space of this segment. 3) That we
9788          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
9789          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
9790          * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
9791          * SEG.Len. This modified check allows us to overcome RFC1323's
9792          * p.869. In such cases, we can still calculate the RTT correctly
9793          * when RCV.NXT == Last.ACK.Sent.
9794          */
9795         if ((to->to_flags & TOF_TS) != 0 &&
9796             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
9797             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
9798             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
9799                 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
9800                 tp->ts_recent = to->to_tsval;
9801         }
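        /*
         * For example (illustrative): a pure ACK with tlen == 0, no
         * SYN/FIN, and th_seq == last_ack_sent reduces the test above to
         * SEQ_LEQ(last_ack_sent, th_seq), which is true, so ts_recent is
         * updated; RFC1323's strict '<' form would have skipped the
         * update and lost the RTT sample when RCV.NXT == Last.ACK.Sent.
         */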
9802         /*
9803          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
9804          * is on (half-synchronized state), then queue data for later
9805          * processing; else drop segment and return.
9806          */
9807         if ((thflags & TH_ACK) == 0) {
9808                 if (tp->t_flags & TF_NEEDSYN) {
9809                         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9810                             tiwin, thflags, nxt_pkt));
9811                 } else if (tp->t_flags & TF_ACKNOW) {
9812                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
9813                         bbr->r_wanted_output = 1;
9814                         return (ret_val);
9815                 } else {
9816                         ctf_do_drop(m, NULL);
9817                         return (0);
9818                 }
9819         }
9820         /*
9821          * Ack processing.
9822          */
9823         if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
9824                 return (ret_val);
9825         }
9826         if (ourfinisacked) {
9827                 tcp_twstart(tp);
9828                 m_freem(m);
9829                 return (1);
9830         }
9831         if (sbavail(&so->so_snd)) {
9832                 if (ctf_progress_timeout_check(tp, true)) {
9833                         bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
9834                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9835                         return (1);
9836                 }
9837         }
9838         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9839             tiwin, thflags, nxt_pkt));
9840 }
9841
9842 /*
9843  * A return value of 1 means the TCB is unlocked and most
9844  * likely gone; a return value of 0 means the TCB is still
9845  * locked.
9846  */
9847 static int
9848 bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
9849     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9850     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
9851 {
9852         int32_t ourfinisacked = 0;
9853         int32_t ret_val;
9854         struct tcp_bbr *bbr;
9855
9856         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
9857         ctf_calc_rwin(so, tp);
9858         if ((thflags & TH_RST) ||
9859             (tp->t_fin_is_rst && (thflags & TH_FIN)))
9860                 return (ctf_process_rst(m, th, so, tp));
9861         /*
9862          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
9863          * synchronized state.
9864          */
9865         if (thflags & TH_SYN) {
9866                 ctf_challenge_ack(m, th, tp, &ret_val);
9867                 return (ret_val);
9868         }
9869         /*
9870          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
9871          * it's less than ts_recent, drop it.
9872          */
9873         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
9874             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
9875                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
9876                         return (ret_val);
9877         }
9878         INP_WLOCK_ASSERT(tp->t_inpcb);
9879         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
9880                 return (ret_val);
9881         }
9882         /*
9883          * If new data are received on a connection after the user processes
9884          * are gone, then RST the other end.
9885          */
9886         if ((so->so_state & SS_NOFDREF) && tlen) {
9887                 /*
9888                  * We call a new function now so we might continue and set
9889                  * up to reset once all data has been ack'd.
9890                  */
9891                 if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so))
9892                         return (1);
9893         }
9894         /*
9895          * If last ACK falls within this segment's sequence numbers, record
9896          * its timestamp. NOTE: 1) That the test incorporates suggestions
9897          * from the latest proposal of the tcplw@cray.com list (Braden
9898          * 1993/04/26). 2) That updating only on newer timestamps interferes
9899          * with our earlier PAWS tests, so this check should be solely
9900          * predicated on the sequence space of this segment. 3) That we
9901          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
9902          * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
9903          * SEG.Len. This modified check allows us to overcome RFC1323's
9904          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
9905          * p.869. In such cases, we can still calculate the RTT correctly
9906          * when RCV.NXT == Last.ACK.Sent.
9907          */
9908         if ((to->to_flags & TOF_TS) != 0 &&
9909             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
9910             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
9911             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
9912                 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
9913                 tp->ts_recent = to->to_tsval;
9914         }
9915         /*
9916          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
9917          * is on (half-synchronized state), then queue data for later
9918          * processing; else drop segment and return.
9919          */
9920         if ((thflags & TH_ACK) == 0) {
9921                 if (tp->t_flags & TF_NEEDSYN) {
9922                         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9923                             tiwin, thflags, nxt_pkt));
9924                 } else if (tp->t_flags & TF_ACKNOW) {
9925                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
9926                         bbr->r_wanted_output = 1;
9927                         return (ret_val);
9928                 } else {
9929                         ctf_do_drop(m, NULL);
9930                         return (0);
9931                 }
9932         }
9933         /*
9934          * Ack processing.
9935          */
9936         if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
9937                 return (ret_val);
9938         }
9939         if (ourfinisacked) {
9940                 tp = tcp_close(tp);
9941                 ctf_do_drop(m, tp);
9942                 return (1);
9943         }
9944         if (sbavail(&so->so_snd)) {
9945                 if (ctf_progress_timeout_check(tp, true)) {
9946                         bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
9947                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9948                         return (1);
9949                 }
9950         }
9951         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9952             tiwin, thflags, nxt_pkt));
9953 }
9954
9956 /*
9957  * A return value of 1 means the TCB is unlocked and most
9958  * likely gone; a return value of 0 means the TCB is still
9959  * locked.
9960  */
9961 static int
9962 bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
9963     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9964     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
9965 {
9966         int32_t ourfinisacked = 0;
9967         int32_t ret_val;
9968         struct tcp_bbr *bbr;
9969
9970         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
9971         ctf_calc_rwin(so, tp);
9973         if ((thflags & TH_RST) ||
9974             (tp->t_fin_is_rst && (thflags & TH_FIN)))
9975                 return (ctf_process_rst(m, th, so, tp));
9976
9977         /*
9978          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
9979          * synchronized state.
9980          */
9981         if (thflags & TH_SYN) {
9982                 ctf_challenge_ack(m, th, tp, &ret_val);
9983                 return (ret_val);
9984         }
9985         INP_WLOCK_ASSERT(tp->t_inpcb);
9986         /*
9987          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
9988          * it's less than ts_recent, drop it.
9989          */
9990         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
9991             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
9992                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
9993                         return (ret_val);
9994         }
9995         INP_WLOCK_ASSERT(tp->t_inpcb);
9996         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
9997                 return (ret_val);
9998         }
9999         /*
10000          * If new data are received on a connection after the user processes
10001          * are gone, then we may RST the other end depending on the outcome
10002          * of bbr_check_data_after_close.
10003          */
10004         if ((so->so_state & SS_NOFDREF) &&
10005             tlen) {
10006                 /*
10007                  * We call a new function now so we might continue and set
10008                  * up to reset once all data has been ack'd.
10009                  */
10010                 if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so))
10011                         return (1);
10012         }
10013         INP_WLOCK_ASSERT(tp->t_inpcb);
10014         /*
10015          * If last ACK falls within this segment's sequence numbers, record
10016          * its timestamp. NOTE: 1) That the test incorporates suggestions
10017          * from the latest proposal of the tcplw@cray.com list (Braden
10018          * 1993/04/26). 2) That updating only on newer timestamps interferes
10019          * with our earlier PAWS tests, so this check should be solely
10020          * predicated on the sequence space of this segment. 3) That we
10021          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
10022          * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
10023          * SEG.Len. This modified check allows us to overcome RFC1323's
10024          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
10025          * p.869. In such cases, we can still calculate the RTT correctly
10026          * when RCV.NXT == Last.ACK.Sent.
10027          */
10028         INP_WLOCK_ASSERT(tp->t_inpcb);
10029         if ((to->to_flags & TOF_TS) != 0 &&
10030             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
10031             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
10032             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
10033                 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
10034                 tp->ts_recent = to->to_tsval;
10035         }
10036         /*
10037          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
10038          * is on (half-synchronized state), then queue data for later
10039          * processing; else drop segment and return.
10040          */
10041         if ((thflags & TH_ACK) == 0) {
10042                 if (tp->t_flags & TF_NEEDSYN) {
10043                         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
10044                             tiwin, thflags, nxt_pkt));
10045                 } else if (tp->t_flags & TF_ACKNOW) {
10046                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
10047                         bbr->r_wanted_output = 1;
10048                         return (ret_val);
10049                 } else {
10050                         ctf_do_drop(m, NULL);
10051                         return (0);
10052                 }
10053         }
10054         /*
10055          * Ack processing.
10056          */
10057         INP_WLOCK_ASSERT(tp->t_inpcb);
10058         if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
10059                 return (ret_val);
10060         }
10061         if (sbavail(&so->so_snd)) {
10062                 if (ctf_progress_timeout_check(tp, true)) {
10063                         bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
10064                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
10065                         return (1);
10066                 }
10067         }
10068         INP_WLOCK_ASSERT(tp->t_inpcb);
10069         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
10070             tiwin, thflags, nxt_pkt));
10071 }
10072
10073 static void
10074 bbr_stop_all_timers(struct tcpcb *tp)
10075 {
10076         struct tcp_bbr *bbr;
10077
10078         /*
10079          * Ensure no timers are running.
10080          */
10081         if (tcp_timer_active(tp, TT_PERSIST)) {
10082                 /* We are in persist mode; set the flag appropriately */
10083                 bbr = (struct tcp_bbr *)tp->t_fb_ptr;
10084                 bbr->rc_in_persist = 1;
10085         }
10086         tcp_timer_suspend(tp, TT_PERSIST);
10087         tcp_timer_suspend(tp, TT_REXMT);
10088         tcp_timer_suspend(tp, TT_KEEP);
10089         tcp_timer_suspend(tp, TT_DELACK);
10090 }
10091
10092 static void
10093 bbr_google_mode_on(struct tcp_bbr *bbr)
10094 {
10095         bbr->rc_use_google = 1;
10096         bbr->rc_no_pacing = 0;
10097         bbr->r_ctl.bbr_google_discount = bbr_google_discount;
10098         bbr->r_use_policer = bbr_policer_detection_enabled;
10099         bbr->r_ctl.rc_probertt_int = (USECS_IN_SECOND * 10);
10100         bbr->bbr_use_rack_cheat = 0;
10101         bbr->r_ctl.rc_incr_tmrs = 0;
10102         bbr->r_ctl.rc_inc_tcp_oh = 0;
10103         bbr->r_ctl.rc_inc_ip_oh = 0;
10104         bbr->r_ctl.rc_inc_enet_oh = 0;
10105         reset_time(&bbr->r_ctl.rc_delrate,
10106                    BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT);
10107         reset_time_small(&bbr->r_ctl.rc_rttprop,
10108                          (11 * USECS_IN_SECOND));
10109         tcp_bbr_tso_size_check(bbr, tcp_get_usecs(&bbr->rc_tv));
10110 }
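/*
 * Summary note (derived from the settings above): google mode forces
 * pacing on, applies the google discount, enables policer detection if
 * configured, pins the probe-rtt interval at 10 seconds, turns off the
 * rack-style resend cheat, timer increases and all header-overhead
 * accounting, and resets the delivery-rate and rttProp filter windows
 * to the google-mode lengths (the rttProp window becomes 11 seconds).
 */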
10111
10112 static void
10113 bbr_google_mode_off(struct tcp_bbr *bbr)
10114 {
10115         bbr->rc_use_google = 0;
10116         bbr->r_ctl.bbr_google_discount = 0;
10117         bbr->no_pacing_until = bbr_no_pacing_until;
10118         bbr->r_use_policer = 0;
10119         if (bbr->no_pacing_until)
10120                 bbr->rc_no_pacing = 1;
10121         else
10122                 bbr->rc_no_pacing = 0;
10123         if (bbr_use_rack_resend_cheat)
10124                 bbr->bbr_use_rack_cheat = 1;
10125         else
10126                 bbr->bbr_use_rack_cheat = 0;
10127         if (bbr_incr_timers)
10128                 bbr->r_ctl.rc_incr_tmrs = 1;
10129         else
10130                 bbr->r_ctl.rc_incr_tmrs = 0;
10131         if (bbr_include_tcp_oh)
10132                 bbr->r_ctl.rc_inc_tcp_oh = 1;
10133         else
10134                 bbr->r_ctl.rc_inc_tcp_oh = 0;
10135         if (bbr_include_ip_oh)
10136                 bbr->r_ctl.rc_inc_ip_oh = 1;
10137         else
10138                 bbr->r_ctl.rc_inc_ip_oh = 0;
10139         if (bbr_include_enet_oh)
10140                 bbr->r_ctl.rc_inc_enet_oh = 1;
10141         else
10142                 bbr->r_ctl.rc_inc_enet_oh = 0;
10143         bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit;
10144         reset_time(&bbr->r_ctl.rc_delrate,
10145                    bbr_num_pktepo_for_del_limit);
10146         reset_time_small(&bbr->r_ctl.rc_rttprop,
10147                          (bbr_filter_len_sec * USECS_IN_SECOND));
10148         tcp_bbr_tso_size_check(bbr, tcp_get_usecs(&bbr->rc_tv));
10149 }
10150 /*
10151  * Return 0 on success, non-zero on failure
10152  * which indicates the error (usually no memory).
10153  */
10154 static int
10155 bbr_init(struct tcpcb *tp)
10156 {
10157         struct tcp_bbr *bbr = NULL;
10158         struct inpcb *inp;
10159         uint32_t cts;
10160
10161         tp->t_fb_ptr = uma_zalloc(bbr_pcb_zone, (M_NOWAIT | M_ZERO));
10162         if (tp->t_fb_ptr == NULL) {
10163                 /*
10164                  * We need to allocate memory but can't. The INP and INP_INFO
10165                  * locks are held and are recursive (this happens during
10166                  * setup), so a scheme to drop the locks fails :(
10168                  */
10169                 return (ENOMEM);
10170         }
10171         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
10172         bbr->rtt_valid = 0;
10173         inp = tp->t_inpcb;
10174         inp->inp_flags2 |= INP_CANNOT_DO_ECN;
10175         inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
10176         TAILQ_INIT(&bbr->r_ctl.rc_map);
10177         TAILQ_INIT(&bbr->r_ctl.rc_free);
10178         TAILQ_INIT(&bbr->r_ctl.rc_tmap);
10179         bbr->rc_tp = tp;
10180         if (tp->t_inpcb) {
10181                 bbr->rc_inp = tp->t_inpcb;
10182         }
10183         cts = tcp_get_usecs(&bbr->rc_tv);
10184         tp->t_acktime = 0;
10185         bbr->rc_allow_data_af_clo = bbr_ignore_data_after_close;
10186         bbr->r_ctl.rc_reorder_fade = bbr_reorder_fade;
10187         bbr->rc_tlp_threshold = bbr_tlp_thresh;
10188         bbr->r_ctl.rc_reorder_shift = bbr_reorder_thresh;
10189         bbr->r_ctl.rc_pkt_delay = bbr_pkt_delay;
10190         bbr->r_ctl.rc_min_to = bbr_min_to;
10191         bbr->rc_bbr_state = BBR_STATE_STARTUP;
10192         bbr->r_ctl.bbr_lost_at_state = 0;
10193         bbr->r_ctl.rc_lost_at_startup = 0;
10194         bbr->rc_all_timers_stopped = 0;
10195         bbr->r_ctl.rc_bbr_lastbtlbw = 0;
10196         bbr->r_ctl.rc_pkt_epoch_del = 0;
10197         bbr->r_ctl.rc_pkt_epoch = 0;
10198         bbr->r_ctl.rc_lowest_rtt = 0xffffffff;
10199         bbr->r_ctl.rc_bbr_hptsi_gain = bbr_high_gain;
10200         bbr->r_ctl.rc_bbr_cwnd_gain = bbr_high_gain;
10201         bbr->r_ctl.rc_went_idle_time = cts;
10202         bbr->rc_pacer_started = cts;
10203         bbr->r_ctl.rc_pkt_epoch_time = cts;
10204         bbr->r_ctl.rc_rcvtime = cts;
10205         bbr->r_ctl.rc_bbr_state_time = cts;
10206         bbr->r_ctl.rc_del_time = cts;
10207         bbr->r_ctl.rc_tlp_rxt_last_time = cts;
10208         bbr->r_ctl.last_in_probertt = cts;
10209         bbr->skip_gain = 0;
10210         bbr->gain_is_limited = 0;
10211         bbr->no_pacing_until = bbr_no_pacing_until;
10212         if (bbr->no_pacing_until)
10213                 bbr->rc_no_pacing = 1;
10214         if (bbr_use_google_algo) {
10215                 bbr->rc_no_pacing = 0;
10216                 bbr->rc_use_google = 1;
10217                 bbr->r_ctl.bbr_google_discount = bbr_google_discount;
10218                 bbr->r_use_policer = bbr_policer_detection_enabled;
10219         } else {
10220                 bbr->rc_use_google = 0;
10221                 bbr->r_ctl.bbr_google_discount = 0;
10222                 bbr->r_use_policer = 0;
10223         }
10224         if (bbr_ts_limiting)
10225                 bbr->rc_use_ts_limit = 1;
10226         else
10227                 bbr->rc_use_ts_limit = 0;
10228         if (bbr_ts_can_raise)
10229                 bbr->ts_can_raise = 1;
10230         else
10231                 bbr->ts_can_raise = 0;
10232         if (V_tcp_delack_enabled == 1)
10233                 tp->t_delayed_ack = 2;
10234         else if (V_tcp_delack_enabled == 0)
10235                 tp->t_delayed_ack = 0;
10236         else if (V_tcp_delack_enabled < 100)
10237                 tp->t_delayed_ack = V_tcp_delack_enabled;
10238         else
10239                 tp->t_delayed_ack = 2;
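        /*
         * To summarize the mapping above: V_tcp_delack_enabled == 0
         * disables delayed ACKs, 1 selects the default factor of 2,
         * 2..99 is taken verbatim, and anything at or above 100 falls
         * back to the default of 2.
         */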
10240         if (bbr->rc_use_google == 0)
10241                 bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit;
10242         else
10243                 bbr->r_ctl.rc_probertt_int = (USECS_IN_SECOND * 10);
10244         bbr->r_ctl.rc_min_rto_ms = bbr_rto_min_ms;
10245         bbr->rc_max_rto_sec = bbr_rto_max_sec;
10246         bbr->rc_init_win = bbr_def_init_win;
10247         if (tp->t_flags & TF_REQ_TSTMP)
10248                 bbr->rc_last_options = TCP_TS_OVERHEAD;
10249         bbr->r_ctl.rc_pace_max_segs = tp->t_maxseg - bbr->rc_last_options;
10250         bbr->r_ctl.rc_high_rwnd = tp->snd_wnd;
10251         bbr->r_init_rtt = 1;
10252
10253         counter_u64_add(bbr_flows_nohdwr_pacing, 1);
10254         if (bbr_allow_hdwr_pacing)
10255                 bbr->bbr_hdw_pace_ena = 1;
10256         else
10257                 bbr->bbr_hdw_pace_ena = 0;
10258         if (bbr_sends_full_iwnd)
10259                 bbr->bbr_init_win_cheat = 1;
10260         else
10261                 bbr->bbr_init_win_cheat = 0;
10262         bbr->r_ctl.bbr_utter_max = bbr_hptsi_utter_max;
10263         bbr->r_ctl.rc_drain_pg = bbr_drain_gain;
10264         bbr->r_ctl.rc_startup_pg = bbr_high_gain;
10265         bbr->rc_loss_exit = bbr_exit_startup_at_loss;
10266         bbr->r_ctl.bbr_rttprobe_gain_val = bbr_rttprobe_gain;
10267         bbr->r_ctl.bbr_hptsi_per_second = bbr_hptsi_per_second;
10268         bbr->r_ctl.bbr_hptsi_segments_delay_tar = bbr_hptsi_segments_delay_tar;
10269         bbr->r_ctl.bbr_hptsi_segments_max = bbr_hptsi_segments_max;
10270         bbr->r_ctl.bbr_hptsi_segments_floor = bbr_hptsi_segments_floor;
10271         bbr->r_ctl.bbr_hptsi_bytes_min = bbr_hptsi_bytes_min;
10272         bbr->r_ctl.bbr_cross_over = bbr_cross_over;
10273         bbr->r_ctl.rc_rtt_shrinks = cts;
10274         if (bbr->rc_use_google) {
10275                 setup_time_filter(&bbr->r_ctl.rc_delrate,
10276                                   FILTER_TYPE_MAX,
10277                                   BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT);
10278                 setup_time_filter_small(&bbr->r_ctl.rc_rttprop,
10279                                         FILTER_TYPE_MIN, (11 * USECS_IN_SECOND));
10280         } else {
10281                 setup_time_filter(&bbr->r_ctl.rc_delrate,
10282                                   FILTER_TYPE_MAX,
10283                                   bbr_num_pktepo_for_del_limit);
10284                 setup_time_filter_small(&bbr->r_ctl.rc_rttprop,
10285                                         FILTER_TYPE_MIN, (bbr_filter_len_sec * USECS_IN_SECOND));
10286         }
10287         bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_INIT, 0);
10288         if (bbr_uses_idle_restart)
10289                 bbr->rc_use_idle_restart = 1;
10290         else
10291                 bbr->rc_use_idle_restart = 0;
10292         bbr->r_ctl.rc_bbr_cur_del_rate = 0;
10293         bbr->r_ctl.rc_initial_hptsi_bw = bbr_initial_bw_bps;
10294         if (bbr_resends_use_tso)
10295                 bbr->rc_resends_use_tso = 1;
10296 #ifdef NETFLIX_PEAKRATE
10297         tp->t_peakrate_thr = tp->t_maxpeakrate;
10298 #endif
10299         if (tp->snd_una != tp->snd_max) {
10300                 /* Create a send map for the current outstanding data */
10301                 struct bbr_sendmap *rsm;
10302
10303                 rsm = bbr_alloc(bbr);
10304                 if (rsm == NULL) {
10305                         uma_zfree(bbr_pcb_zone, tp->t_fb_ptr);
10306                         tp->t_fb_ptr = NULL;
10307                         return (ENOMEM);
10308                 }
10309                 rsm->r_flags = BBR_OVERMAX;
10310                 rsm->r_tim_lastsent[0] = cts;
10311                 rsm->r_rtr_cnt = 1;
10312                 rsm->r_rtr_bytes = 0;
10313                 rsm->r_start = tp->snd_una;
10314                 rsm->r_end = tp->snd_max;
10315                 rsm->r_dupack = 0;
10316                 rsm->r_delivered = bbr->r_ctl.rc_delivered;
10317                 rsm->r_ts_valid = 0;
10318                 rsm->r_del_ack_ts = tp->ts_recent;
10319                 rsm->r_del_time = cts;
10320                 if (bbr->r_ctl.r_app_limited_until)
10321                         rsm->r_app_limited = 1;
10322                 else
10323                         rsm->r_app_limited = 0;
10324                 TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_map, rsm, r_next);
10325                 TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
10326                 rsm->r_in_tmap = 1;
10327                 if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW)
10328                         rsm->r_bbr_state = bbr_state_val(bbr);
10329                 else
10330                         rsm->r_bbr_state = 8;
10331         }
10332         if (bbr_use_rack_resend_cheat && (bbr->rc_use_google == 0))
10333                 bbr->bbr_use_rack_cheat = 1;
10334         if (bbr_incr_timers && (bbr->rc_use_google == 0))
10335                 bbr->r_ctl.rc_incr_tmrs = 1;
10336         if (bbr_include_tcp_oh && (bbr->rc_use_google == 0))
10337                 bbr->r_ctl.rc_inc_tcp_oh = 1;
10338         if (bbr_include_ip_oh && (bbr->rc_use_google == 0))
10339                 bbr->r_ctl.rc_inc_ip_oh = 1;
10340         if (bbr_include_enet_oh && (bbr->rc_use_google == 0))
10341                 bbr->r_ctl.rc_inc_enet_oh = 1;
10342
10343         bbr_log_type_statechange(bbr, cts, __LINE__);
10344         if (TCPS_HAVEESTABLISHED(tp->t_state) &&
10345             (tp->t_srtt)) {
10346                 uint32_t rtt;
10347
10348                 rtt = (TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
10349                 apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
10350         }
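        /*
         * Explanatory note: t_srtt is kept as a fixed-point value shifted
         * left by TCP_RTT_SHIFT, so converting ticks to usecs and shifting
         * right recovers a plain srtt in usecs with which to seed the
         * rttProp min-filter above.
         */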
10351         /* announce the settings and state */
10352         bbr_log_settings_change(bbr, BBR_RECOVERY_LOWRTT);
10353         tcp_bbr_tso_size_check(bbr, cts);
10354         /*
10355          * Now call the generic function to start a timer. This will place
10356          * the TCB on the hptsi wheel if a timer is needed with appropriate
10357          * flags.
10358          */
10359         bbr_stop_all_timers(tp);
10360         bbr_start_hpts_timer(bbr, tp, cts, 5, 0, 0);
10361         return (0);
10362 }
10363
10364 /*
10365  * Return 0 if we can accept the connection. Return
10366  * non-zero if we can't handle the connection. An EAGAIN
10367  * means you need to wait until the connection is up.
10368  * An EINVAL means we can never handle the connection
10369  * (no SACK support).
10370  */
10371 static int
10372 bbr_handoff_ok(struct tcpcb *tp)
10373 {
10374         if ((tp->t_state == TCPS_CLOSED) ||
10375             (tp->t_state == TCPS_LISTEN)) {
10376                 /* Sure no problem though it may not stick */
10377                 return (0);
10378         }
10379         if ((tp->t_state == TCPS_SYN_SENT) ||
10380             (tp->t_state == TCPS_SYN_RECEIVED)) {
10381                 /*
10382                  * We really don't know; you have to get to ESTAB or
10383                  * beyond to tell.
10384                  */
10385                 return (EAGAIN);
10386         }
10387         if ((tp->t_flags & TF_SACK_PERMIT) || bbr_sack_not_required) {
10388                 return (0);
10389         }
10390         /*
10391          * If we reach here we don't do SACK on this connection, so we
10392          * can never run this stack (it requires SACK).
10393          */
10394         return (EINVAL);
10395 }
10396
10397 static void
10398 bbr_fini(struct tcpcb *tp, int32_t tcb_is_purged)
10399 {
10400         if (tp->t_fb_ptr) {
10401                 uint32_t calc;
10402                 struct tcp_bbr *bbr;
10403                 struct bbr_sendmap *rsm;
10404
10405                 bbr = (struct tcp_bbr *)tp->t_fb_ptr;
10406                 if (bbr->r_ctl.crte)
10407                         tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp);
10408                 bbr_log_flowend(bbr);
10409                 bbr->rc_tp = NULL;
10410                 if (tp->t_inpcb) {
10411                         /* Backout any flags2 we applied */
10412                         tp->t_inpcb->inp_flags2 &= ~INP_CANNOT_DO_ECN;
10413                         tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
10414                         tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
10415                 }
10416                 if (bbr->bbr_hdrw_pacing)
10417                         counter_u64_add(bbr_flows_whdwr_pacing, -1);
10418                 else
10419                         counter_u64_add(bbr_flows_nohdwr_pacing, -1);
10420                 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
10421                 while (rsm) {
10422                         TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next);
10423                         uma_zfree(bbr_zone, rsm);
10424                         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
10425                 }
10426                 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free);
10427                 while (rsm) {
10428                         TAILQ_REMOVE(&bbr->r_ctl.rc_free, rsm, r_next);
10429                         uma_zfree(bbr_zone, rsm);
10430                         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free);
10431                 }
10432                 calc = bbr->r_ctl.rc_high_rwnd - bbr->r_ctl.rc_init_rwnd;
10433                 if (calc > (bbr->r_ctl.rc_init_rwnd / 10))
10434                         BBR_STAT_INC(bbr_dynamic_rwnd);
10435                 else
10436                         BBR_STAT_INC(bbr_static_rwnd);
10437                 bbr->r_ctl.rc_free_cnt = 0;
10438                 uma_zfree(bbr_pcb_zone, tp->t_fb_ptr);
10439                 tp->t_fb_ptr = NULL;
10440         }
10441         /* Make sure snd_nxt is correctly set */
10442         tp->snd_nxt = tp->snd_max;
10443 }
10444
10445 static void
10446 bbr_set_state(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t win)
10447 {
10448         switch (tp->t_state) {
10449         case TCPS_SYN_SENT:
10450                 bbr->r_state = TCPS_SYN_SENT;
10451                 bbr->r_substate = bbr_do_syn_sent;
10452                 break;
10453         case TCPS_SYN_RECEIVED:
10454                 bbr->r_state = TCPS_SYN_RECEIVED;
10455                 bbr->r_substate = bbr_do_syn_recv;
10456                 break;
10457         case TCPS_ESTABLISHED:
10458                 bbr->r_ctl.rc_init_rwnd = max(win, bbr->rc_tp->snd_wnd);
10459                 bbr->r_state = TCPS_ESTABLISHED;
10460                 bbr->r_substate = bbr_do_established;
10461                 break;
10462         case TCPS_CLOSE_WAIT:
10463                 bbr->r_state = TCPS_CLOSE_WAIT;
10464                 bbr->r_substate = bbr_do_close_wait;
10465                 break;
10466         case TCPS_FIN_WAIT_1:
10467                 bbr->r_state = TCPS_FIN_WAIT_1;
10468                 bbr->r_substate = bbr_do_fin_wait_1;
10469                 break;
10470         case TCPS_CLOSING:
10471                 bbr->r_state = TCPS_CLOSING;
10472                 bbr->r_substate = bbr_do_closing;
10473                 break;
10474         case TCPS_LAST_ACK:
10475                 bbr->r_state = TCPS_LAST_ACK;
10476                 bbr->r_substate = bbr_do_lastack;
10477                 break;
10478         case TCPS_FIN_WAIT_2:
10479                 bbr->r_state = TCPS_FIN_WAIT_2;
10480                 bbr->r_substate = bbr_do_fin_wait_2;
10481                 break;
10482         case TCPS_LISTEN:
10483         case TCPS_CLOSED:
10484         case TCPS_TIME_WAIT:
10485         default:
10486                 break;
10487         }
10488 }
10489
10490 static void
10491 bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int32_t line, int dolog)
10492 {
10493         /*
10494          * What state are we going into now? Are any adjustments
10495          * needed?
10496          */
10497         int32_t old_state, old_gain;
10498
10500         old_state = bbr_state_val(bbr);
10501         old_gain = bbr->r_ctl.rc_bbr_hptsi_gain;
10502         if (bbr_state_val(bbr) == BBR_SUB_LEVEL1) {
10503                 /* Save the lowest srtt we saw at the end of the sub-state */
10504                 bbr->rc_hit_state_1 = 0;
10505                 if (bbr->r_ctl.bbr_smallest_srtt_this_state != 0xffffffff)
10506                         bbr->r_ctl.bbr_smallest_srtt_state2 = bbr->r_ctl.bbr_smallest_srtt_this_state;
10507         }
10508         bbr->rc_bbr_substate++;
10509         if (bbr->rc_bbr_substate >= BBR_SUBSTATE_COUNT) {
10510                 /* Cycle back to first state-> gain */
10511                 bbr->rc_bbr_substate = 0;
10512         }
10513         if (bbr_state_val(bbr) == BBR_SUB_GAIN) {
10514                 /*
10515                  * We enter the gain(5/4) cycle (possibly less if
10516                  * shallow buffer detection is enabled)
10517                  */
10518                 if (bbr->skip_gain) {
10519                         /*
10520                          * Hardware pacing has set our rate to
10521                          * the max and limited our b/w; just
10522                          * hold level, i.e. no gain.
10523                          */
10524                         bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_LEVEL1];
10525                 } else if (bbr->gain_is_limited &&
10526                            bbr->bbr_hdrw_pacing &&
10527                            bbr->r_ctl.crte) {
10528                         /*
10529                          * We can't gain above the hardware pacing
10530                          * rate, which is less than our rate + the gain;
10531                          * calculate the gain needed to reach the
10532                          * hardware pacing rate.
10533                          */
10534                         uint64_t bw, rate, gain_calc;
10535
10536                         bw = bbr_get_bw(bbr);
10537                         rate = bbr->r_ctl.crte->rate;
10538                         if ((rate > bw) &&
10539                             (((bw *  (uint64_t)bbr_hptsi_gain[BBR_SUB_GAIN]) / (uint64_t)BBR_UNIT) > rate)) {
10540                                 gain_calc = (rate * BBR_UNIT) / bw;
10541                                 if (gain_calc < BBR_UNIT)
10542                                         gain_calc = BBR_UNIT;
10543                                 bbr->r_ctl.rc_bbr_hptsi_gain = (uint16_t)gain_calc;
10544                         } else {
10545                                 bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_GAIN];
10546                         }
10547                 } else
10548                         bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_GAIN];
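                /*
                 * Worked example for the gain_calc above (illustrative
                 * numbers only): with bw = 10 Mbps and a hardware rate of
                 * 11 Mbps, the 5/4 gain (320/256) would ask for 12.5 Mbps,
                 * above the hardware rate, so gain_calc = (11e6 * 256) /
                 * 10e6 = 281, i.e. a ~1.1x gain that lands on the capped
                 * rate (modulo integer truncation).
                 */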
10549                 if ((bbr->rc_use_google == 0) && (bbr_gain_to_target == 0)) {
10550                         bbr->r_ctl.rc_bbr_state_atflight = cts;
10551                 } else
10552                         bbr->r_ctl.rc_bbr_state_atflight = 0;
10553         } else if (bbr_state_val(bbr) == BBR_SUB_DRAIN) {
10554                 bbr->rc_hit_state_1 = 1;
10555                 bbr->r_ctl.rc_exta_time_gd = 0;
10556                 bbr->r_ctl.flightsize_at_drain = ctf_flight_size(bbr->rc_tp,
10557                                                      (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
10558                 if (bbr_state_drain_2_tar) {
10559                         bbr->r_ctl.rc_bbr_state_atflight = 0;
10560                 } else
10561                         bbr->r_ctl.rc_bbr_state_atflight = cts;
10562                 bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_DRAIN];
10563         } else {
10564                 /* All other cycles hit here 2-7 */
10565                 if ((old_state == BBR_SUB_DRAIN) && bbr->rc_hit_state_1) {
10566                         if (bbr_sub_drain_slam_cwnd &&
10567                             (bbr->rc_use_google == 0) &&
10568                             (bbr->rc_tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) {
10569                                 bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd;
10570                                 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
10571                         }
10572                         if ((cts - bbr->r_ctl.rc_bbr_state_time) > bbr_get_rtt(bbr, BBR_RTT_PROP))
10573                                 bbr->r_ctl.rc_exta_time_gd += ((cts - bbr->r_ctl.rc_bbr_state_time) -
10574                                                                bbr_get_rtt(bbr, BBR_RTT_PROP));
10575                         else
10576                                 bbr->r_ctl.rc_exta_time_gd = 0;
10577                         if (bbr->r_ctl.rc_exta_time_gd) {
10578                                 bbr->r_ctl.rc_level_state_extra = bbr->r_ctl.rc_exta_time_gd;
10579                                 /* Now chop up the time for each state (div by 7) */
10580                                 bbr->r_ctl.rc_level_state_extra /= 7;
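                                /*
                                 * e.g. (illustrative): 7 ms of extra
                                 * drain time yields 1 ms (7000 / 7
                                 * usecs) of added dwell time for each
                                 * state that consumes the extra.
                                 */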
10581                                 if (bbr_rand_ot && bbr->r_ctl.rc_level_state_extra) {
10582                                         /* Add a randomization */
10583                                         bbr_randomize_extra_state_time(bbr);
10584                                 }
10585                         }
10586                 }
10587                 bbr->r_ctl.rc_bbr_state_atflight = max(1, cts);
10588                 bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[bbr_state_val(bbr)];
10589         }
10590         if (bbr->rc_use_google) {
10591                 bbr->r_ctl.rc_bbr_state_atflight = max(1, cts);
10592         }
10593         bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
10594         bbr->r_ctl.rc_bbr_cwnd_gain = bbr_cwnd_gain;
10595         if (dolog)
10596                 bbr_log_type_statechange(bbr, cts, line);
10597
10598         if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
10599                 uint32_t time_in;
10600
10601                 time_in = cts - bbr->r_ctl.rc_bbr_state_time;
10602                 if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) {
10603                         counter_u64_add(bbr_state_time[(old_state + 5)], time_in);
10604                 } else {
10605                         counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
10606                 }
10607         }
10608         bbr->r_ctl.bbr_smallest_srtt_this_state = 0xffffffff;
10609         bbr_set_state_target(bbr, __LINE__);
10610         if (bbr_sub_drain_slam_cwnd &&
10611             (bbr->rc_use_google == 0) &&
10612             (bbr_state_val(bbr) == BBR_SUB_DRAIN)) {
10613                 /* Slam down the cwnd */
10614                 bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd;
10615                 bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
10616                 if (bbr_sub_drain_app_limit) {
10617                         /* Go app limited if we are on a long drain */
10618                         bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.rc_delivered +
10619                                                           ctf_flight_size(bbr->rc_tp,
10620                                                               (bbr->r_ctl.rc_sacked +
10621                                                                bbr->r_ctl.rc_lost_bytes)));
10622                 }
10623                 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
10624         }
10625         if (bbr->rc_lt_use_bw) {
10626                 /* In policed mode we clamp pacing_gain to BBR_UNIT */
10627                 bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
10628         }
10629         /* Google changes TSO size every cycle */
10630         if (bbr->rc_use_google)
10631                 tcp_bbr_tso_size_check(bbr, cts);
10632         bbr->r_ctl.gain_epoch = cts;
10633         bbr->r_ctl.rc_bbr_state_time = cts;
10634         bbr->r_ctl.substate_pe = bbr->r_ctl.rc_pkt_epoch;
10635 }
10636
10637 static void
10638 bbr_set_probebw_google_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses)
10639 {
10640         if ((bbr_state_val(bbr) == BBR_SUB_DRAIN) &&
10641             (google_allow_early_out == 1) &&
10642             (bbr->r_ctl.rc_flight_at_input <= bbr->r_ctl.rc_target_at_state)) {
10643                 /* We have reached our target flight size, possibly early */
10644                 goto change_state;
10645         }
10646         if (TSTMP_LT(cts, bbr->r_ctl.rc_bbr_state_time)) {
10647                 return;
10648         }
10649         if ((cts - bbr->r_ctl.rc_bbr_state_time) < bbr_get_rtt(bbr, BBR_RTT_PROP)) {
10650                 /*
10651                  * Time must move forward by at least a rttProp
10652                  * before we can change states.
10653                  */
10654                 return;
10655         }
10656         if (bbr_state_val(bbr) == BBR_SUB_GAIN) {
10657                 /*
10658                  * The needed time has passed but for
10659                  * the gain cycle extra rules apply:
10660                  * 1) If we have seen loss, we exit
10661                  * 2) If we have not reached the target
10662                  *    we stay in GAIN (gain-to-target).
10663                  */
10664                 if (google_consider_lost && losses)
10665                         goto change_state;
10666                 if (bbr->r_ctl.rc_target_at_state > bbr->r_ctl.rc_flight_at_input) {
10667                         return;
10668                 }
10669         }
10670 change_state:
10671         /* For gain we must reach our target, all others last 1 rttProp */
10672         bbr_substate_change(bbr, cts, __LINE__, 1);
10673 }
10674
10675 static void
10676 bbr_set_probebw_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses)
10677 {
10678         uint32_t flight, bbr_cur_cycle_time;
10679
10680         if (bbr->rc_use_google) {
10681                 bbr_set_probebw_google_gains(bbr, cts, losses);
10682                 return;
10683         }
10684         if (cts == 0) {
10685                 /*
10686                  * Never allow cts to be 0; we do
10687                  * this so we can judge whether
10688                  * we have set a timestamp.
10689                  */
10690                 cts = 1;
10691         }
10692         if (bbr_state_is_pkt_epoch)
10693                 bbr_cur_cycle_time = bbr_get_rtt(bbr, BBR_RTT_PKTRTT);
10694         else
10695                 bbr_cur_cycle_time = bbr_get_rtt(bbr, BBR_RTT_PROP);
10696
10697         if (bbr->r_ctl.rc_bbr_state_atflight == 0) {
10698                 if (bbr_state_val(bbr) == BBR_SUB_DRAIN) {
10699                         flight = ctf_flight_size(bbr->rc_tp,
10700                                      (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
10701                         if (bbr_sub_drain_slam_cwnd && bbr->rc_hit_state_1) {
10702                                 /* Keep it slammed down */
10703                                 if (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state) {
10704                                         bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
10705                                         bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
10706                                 }
10707                                 if (bbr_sub_drain_app_limit) {
10708                                         /* Go app limited if we are on a long drain */
10709                                         bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.rc_delivered + flight);
10710                                 }
10711                         }
10712                         if (TSTMP_GT(cts, bbr->r_ctl.gain_epoch) &&
10713                             (((cts - bbr->r_ctl.gain_epoch) > bbr_get_rtt(bbr, BBR_RTT_PROP)) ||
10714                              (flight >= bbr->r_ctl.flightsize_at_drain))) {
10715                                 /*
10716                                  * Still here after the same amount of time
10717                                  * as the gain took. We need to drain harder
10718                                  * for the next srtt. Reduce the gain by a
10719                                  * set amount; the drop is floored at the
10720                                  * bbr_drain_floor value (default 88).
10721                                  */
10722                                 bbr->r_ctl.flightsize_at_drain = flight;
10723                                 if (bbr_drain_drop_mul &&
10724                                     bbr_drain_drop_div &&
10725                                     (bbr_drain_drop_mul < bbr_drain_drop_div)) {
10726                                         /* Use the configured drop value (default 4/5 = 20%) */
10727                                         bbr->r_ctl.rc_bbr_hptsi_gain *= bbr_drain_drop_mul;
10728                                         bbr->r_ctl.rc_bbr_hptsi_gain /= bbr_drain_drop_div;
10729                                 } else {
10730                                         /* Otherwise take the default 20% drop */
10731                                         bbr->r_ctl.rc_bbr_hptsi_gain *= 4;
10732                                         bbr->r_ctl.rc_bbr_hptsi_gain /= 5;
10733                                 }
10734                                 if (bbr->r_ctl.rc_bbr_hptsi_gain <= bbr_drain_floor) {
10735                                         /* Hold the gain at the configured floor */
10736                                         bbr->r_ctl.rc_bbr_hptsi_gain = max(bbr_drain_floor, 1);
10737                                 }
10738                                 bbr_log_exit_gain(bbr, cts, 4);
10739                                 /*
10740                                  * Extend out so we wait another
10741                                  * epoch before dropping again.
10742                                  */
10743                                 bbr->r_ctl.gain_epoch = cts;
10744                         }
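                        /*
                         * Illustrative sequence for the 4/5 cuts above:
                         * from the DRAIN gain of 192 (256 * 3/4) we get
                         * 192 -> 153 -> 122 -> 97, and the next cut (77)
                         * is pulled back up to the drain floor (88 by
                         * default).
                         */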
10745                         if (flight <= bbr->r_ctl.rc_target_at_state) {
10746                                 if (bbr_sub_drain_slam_cwnd &&
10747                                     (bbr->rc_use_google == 0) &&
10748                                     (bbr->rc_tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) {
10749                                         bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd;
10750                                         bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
10751                                 }
10752                                 bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1);
10753                                 bbr_log_exit_gain(bbr, cts, 3);
10754                         }
10755                 } else {
10756                         /* It's a gain */
10757                         if (bbr->r_ctl.rc_lost > bbr->r_ctl.bbr_lost_at_state) {
10758                                 bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1);
10759                                 goto change_state;
10760                         }
10761                         if ((ctf_outstanding(bbr->rc_tp) >= bbr->r_ctl.rc_target_at_state) ||
10762                             ((ctf_outstanding(bbr->rc_tp) +  bbr->rc_tp->t_maxseg - 1) >=
10763                              bbr->rc_tp->snd_wnd)) {
10764                                 bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1);
10765                                 bbr_log_exit_gain(bbr, cts, 2);
10766                         }
10767                 }
10768                 /*
10769                  * We fall through and return; one of two things has
10770                  * occurred:
10771                  * 1) We are still not at target
10772                  *    <or>
10773                  * 2) We reached the target and set rc_bbr_state_atflight
10774                  *    which means we no longer hit this block
10775                  *    next time we are called.
10776                  */
10777                 return;
10778         }
10779 change_state:
10780         if (TSTMP_LT(cts, bbr->r_ctl.rc_bbr_state_time))
10781                 return;
10782         if ((cts - bbr->r_ctl.rc_bbr_state_time) < bbr_cur_cycle_time) {
10783                 /* Less than a full time-period has passed */
10784                 return;
10785         }
10786         if (bbr->r_ctl.rc_level_state_extra &&
10787             (bbr_state_val(bbr) > BBR_SUB_DRAIN) &&
10788             ((cts - bbr->r_ctl.rc_bbr_state_time) <
10789              (bbr_cur_cycle_time + bbr->r_ctl.rc_level_state_extra))) {
10790                 /* Less than a full time-period + extra has passed */
10791                 return;
10792         }
10793         if (bbr_gain_gets_extra_too &&
10794             bbr->r_ctl.rc_level_state_extra &&
10795             (bbr_state_val(bbr) == BBR_SUB_GAIN) &&
10796             ((cts - bbr->r_ctl.rc_bbr_state_time) <
10797              (bbr_cur_cycle_time + bbr->r_ctl.rc_level_state_extra))) {
10798                 /* Less than a full time-period + extra has passed */
10799                 return;
10800         }
10801         bbr_substate_change(bbr, cts, __LINE__, 1);
10802 }
10803
10804 static uint32_t
10805 bbr_get_a_state_target(struct tcp_bbr *bbr, uint32_t gain)
10806 {
10807         uint32_t mss, tar;
10808
10809         if (bbr->rc_use_google) {
10810                 /* Google just uses the cwnd target */
10811                 tar = bbr_get_target_cwnd(bbr, bbr_get_bw(bbr), gain);
10812         } else {
10813                 mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options),
10814                           bbr->r_ctl.rc_pace_max_segs);
10815                 /* Get the base cwnd with gain rounded to a mss */
10816                 tar = roundup(bbr_get_raw_target_cwnd(bbr, bbr_get_bw(bbr),
10817                                                       gain), mss);
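                /*
                 * For instance (illustrative): a raw target of 10000
                 * bytes with an mss of 1448 rounds up to 10136 bytes
                 * (7 * 1448), the next whole-mss boundary.
                 */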
10818                 /* Make sure it is within our min */
10819                 if (tar < get_min_cwnd(bbr))
10820                         return (get_min_cwnd(bbr));
10821         }
10822         return (tar);
10823 }
10824
10825 static void
10826 bbr_set_state_target(struct tcp_bbr *bbr, int line)
10827 {
10828         uint32_t tar, meth;
10829
10830         if ((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) &&
10831             ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google)) {
10832                 /* Special case using old probe-rtt method */
10833                 tar = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options);
10834                 meth = 1;
10835         } else {
10836                 /* Non-probe-rtt case and reduced probe-rtt  */
10837                 if ((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) &&
10838                     (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT)) {
10839                         /* For gain cycle we use the hptsi gain */
10840                         tar = bbr_get_a_state_target(bbr, bbr->r_ctl.rc_bbr_hptsi_gain);
10841                         meth = 2;
10842                 } else if ((bbr_target_is_bbunit) || bbr->rc_use_google) {
10843                         /*
10844                          * If configured, or for google all other states
10845                          * get BBR_UNIT.
10846                          */
10847                         tar = bbr_get_a_state_target(bbr, BBR_UNIT);
10848                         meth = 3;
10849                 } else {
10850                         /*
10851                          * Or we set a target based on the pacing gain
10852                          * for non-google mode and default (non-configured).
10853                          * Note we don't set a target goal below drain (192).
10854                          */
10855                         if (bbr->r_ctl.rc_bbr_hptsi_gain < bbr_hptsi_gain[BBR_SUB_DRAIN])  {
10856                                 tar = bbr_get_a_state_target(bbr, bbr_hptsi_gain[BBR_SUB_DRAIN]);
10857                                 meth = 4;
10858                         } else {
10859                                 tar = bbr_get_a_state_target(bbr, bbr->r_ctl.rc_bbr_hptsi_gain);
10860                                 meth = 5;
10861                         }
10862                 }
10863         }
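        /*
         * Summary of the 'meth' value logged below (derived from the
         * branches above): 1 legacy probe-rtt target, 2 hptsi-gain
         * target, 3 BBR_UNIT target, 4 drain-floored target, 5 plain
         * pacing-gain target.
         */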
10864         bbr_log_set_of_state_target(bbr, tar, line, meth);
10865         bbr->r_ctl.rc_target_at_state = tar;
10866 }
10867
10868 static void
10869 bbr_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
10870 {
10871         /* Change to probe_rtt */
10872         uint32_t time_in;
10873
10874         bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
10875         bbr->r_ctl.flightsize_at_drain = ctf_flight_size(bbr->rc_tp,
10876                                              (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
10877         bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.flightsize_at_drain
10878                                           + bbr->r_ctl.rc_delivered);
10879         /* Set up so we force-feed the filter */
10880         if (bbr->rc_use_google || bbr_probertt_sets_rtt)
10881                 bbr->rc_prtt_set_ts = 1;
10882         if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
10883                 time_in = cts - bbr->r_ctl.rc_bbr_state_time;
10884                 counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
10885         }
10886         bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_ENTERPROBE, 0);
10887         bbr->r_ctl.rc_rtt_shrinks = cts;
10888         bbr->r_ctl.last_in_probertt = cts;
10889         bbr->r_ctl.rc_probertt_srttchktim = cts;
10890         bbr->r_ctl.rc_bbr_state_time = cts;
10891         bbr->rc_bbr_state = BBR_STATE_PROBE_RTT;
10892         /* We need to force the filter to update */
10893
10894         if ((bbr_sub_drain_slam_cwnd) &&
10895             bbr->rc_hit_state_1 &&
10896             (bbr->rc_use_google == 0) &&
10897             (bbr_state_val(bbr) == BBR_SUB_DRAIN)) {
10898                 if (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_saved_cwnd)
10899                         bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd;
10900         } else
10901                 bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd;
10902         /* Update the lost */
10903         bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
10904         if ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google){
10905                 /* Set to the non-configurable default of 4 (PROBE_RTT_MIN)  */
10906                 bbr->rc_tp->snd_cwnd = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options);
10907                 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
10908                 bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
10909                 bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT;
10910                 bbr_log_set_of_state_target(bbr, bbr->rc_tp->snd_cwnd, __LINE__, 6);
10911                 bbr->r_ctl.rc_target_at_state = bbr->rc_tp->snd_cwnd;
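                /*
                 * e.g. (illustrative): with a 1460-byte maxseg and 12
                 * bytes of timestamp options this pins cwnd at
                 * 4 * 1448 = 5792 bytes while we sit in probe-rtt.
                 */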
10912         } else {
10913                 /*
10914                  * We bring it down slowly by using a hptsi gain that is
10915                  * probably 75%. This will slowly float down our outstanding
10916                  * without tampering with the cwnd.
10917                  */
10918                 bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.bbr_rttprobe_gain_val;
10919                 bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT;
10920                 bbr_set_state_target(bbr, __LINE__);
10921                 if (bbr_prtt_slam_cwnd &&
10922                     (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) {
10923                         bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
10924                         bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
10925                 }
10926         }
10927         if (ctf_flight_size(bbr->rc_tp,
10928                 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <=
10929             bbr->r_ctl.rc_target_at_state) {
10930                 /* We are at target */
10931                 bbr->r_ctl.rc_bbr_enters_probertt = cts;
10932         } else {
10933                 /* We need to come down to reach target before our time begins */
10934                 bbr->r_ctl.rc_bbr_enters_probertt = 0;
10935         }
10936         bbr->r_ctl.rc_pe_of_prtt = bbr->r_ctl.rc_pkt_epoch;
10937         BBR_STAT_INC(bbr_enter_probertt);
10938         bbr_log_exit_gain(bbr, cts, 0);
10939         bbr_log_type_statechange(bbr, cts, line);
10940 }
10941
10942 static void
10943 bbr_check_probe_rtt_limits(struct tcp_bbr *bbr, uint32_t cts)
10944 {
10945         /*
10946          * Sanity check on probe-rtt intervals.
10947          * In crazy situations where we are competing
10948          * against new-reno flows with huge buffers,
10949          * our rtt-prop interval could come to dominate
10950          * things. If we can't get through a full set
10951          * of cycles, we need to adjust it.
10952          */
10953         if (bbr_can_adjust_probertt &&
10954             (bbr->rc_use_google == 0)) {
10955                 uint16_t val = 0;
10956                 uint32_t cur_rttp, fval, newval, baseval;
10957
10958                 /* Are we too small and going into probe-rtt too often? */
10959                 baseval = (bbr_get_rtt(bbr, BBR_RTT_PROP) * (BBR_SUBSTATE_COUNT + 1));
10960                 cur_rttp = roundup(baseval, USECS_IN_SECOND);
10961                 fval = bbr_filter_len_sec * USECS_IN_SECOND;
10962                 if (bbr_is_ratio == 0) {
10963                         if (fval > bbr_rtt_probe_limit)
10964                                 newval = cur_rttp + (fval - bbr_rtt_probe_limit);
10965                         else
10966                                 newval = cur_rttp;
10967                 } else {
10968                         int mul;
10969
10970                         mul = fval / bbr_rtt_probe_limit;
10971                         newval = cur_rttp * mul;
10972                 }
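                      /*
                       * Worked example (illustrative numbers only): with an
                       * rtt-prop of 200ms and 9 substates, baseval is 1.8s
                       * and cur_rttp rounds up to 2s. If the filter length
                       * (fval) is 5s and bbr_rtt_probe_limit is 4s, the
                       * non-ratio branch gives newval = 2s + (5s - 4s) = 3s,
                       * i.e. the filter window grows along with the
                       * enlarged probe interval.
                       */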
10973                 if (cur_rttp > bbr->r_ctl.rc_probertt_int) {
10974                         bbr->r_ctl.rc_probertt_int = cur_rttp;
10975                         reset_time_small(&bbr->r_ctl.rc_rttprop, newval);
10976                         val = 1;
10977                 } else {
10978                         /*
10979                          * No adjustments were made;
10980                          * do we need to shrink it?
10981                          */
10982                         if (bbr->r_ctl.rc_probertt_int > bbr_rtt_probe_limit) {
10983                                 if (cur_rttp <= bbr_rtt_probe_limit) {
10984                                         /*
10985                                          * Things have calmed down, let's
10986                                          * shrink all the way back to the default.
10987                                          */
10988                                         bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit;
10989                                         reset_time_small(&bbr->r_ctl.rc_rttprop,
10990                                                          (bbr_filter_len_sec * USECS_IN_SECOND));
10991                                         cur_rttp = bbr_rtt_probe_limit;
10992                                         newval = (bbr_filter_len_sec * USECS_IN_SECOND);
10993                                         val = 2;
10994                                 } else {
10995                                         /*
10996                                          * Well does some adjustment make sense?
10997                                          * Well, does some adjustment make sense?
10998                                         if (cur_rttp < bbr->r_ctl.rc_probertt_int) {
10999                                                 /* We can reduce interval time some */
11000                                                 bbr->r_ctl.rc_probertt_int = cur_rttp;
11001                                                 reset_time_small(&bbr->r_ctl.rc_rttprop, newval);
11002                                                 val = 3;
11003                                         }
11004                                 }
11005                         }
11006                 }
11007                 if (val)
11008                         bbr_log_rtt_shrinks(bbr, cts, cur_rttp, newval, __LINE__, BBR_RTTS_RESETS_VALUES, val);
11009         }
11010 }
11011
11012 static void
11013 bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
11014 {
11015         /* Exit probe-rtt */
11016
11017         if (tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd) {
11018                 tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd;
11019                 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
11020         }
11021         bbr_log_exit_gain(bbr, cts, 1);
11022         bbr->rc_hit_state_1 = 0;
11023         bbr->r_ctl.rc_rtt_shrinks = cts;
11024         bbr->r_ctl.last_in_probertt = cts;
11025         bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_RTTPROBE, 0);
11026         bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
11027         bbr->r_ctl.r_app_limited_until = (ctf_flight_size(tp,
11028                                               (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) +
11029                                           bbr->r_ctl.rc_delivered);
11030         if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
11031                 uint32_t time_in;
11032
11033                 time_in = cts - bbr->r_ctl.rc_bbr_state_time;
11034                 counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
11035         }
11036         if (bbr->rc_filled_pipe) {
11037                 /* Switch to probe_bw */
11038                 bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
11039                 bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts);
11040                 bbr->r_ctl.rc_bbr_cwnd_gain = bbr_cwnd_gain;
11041                 bbr_substate_change(bbr, cts, __LINE__, 0);
11042                 bbr_log_type_statechange(bbr, cts, __LINE__);
11043         } else {
11044                 /* Back to startup */
11045                 bbr->rc_bbr_state = BBR_STATE_STARTUP;
11046                 bbr->r_ctl.rc_bbr_state_time = cts;
11047                 /*
11048                  * We don't want to give a completely free 3
11049                  * measurements until we exit, so we add the
11050                  * number of pkt-epochs we spent in probe-rtt
11051                  * to the startup_epoch. That way
11052                  * we will still retain the old state.
11053                  */
11054                 bbr->r_ctl.rc_bbr_last_startup_epoch += (bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_pe_of_prtt);
11055                 bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
11056                 /* Make sure to use the lower pg when shifting back in */
11057                 if (bbr->r_ctl.rc_lost &&
11058                     bbr_use_lower_gain_in_startup &&
11059                     (bbr->rc_use_google == 0))
11060                         bbr->r_ctl.rc_bbr_hptsi_gain = bbr_startup_lower;
11061                 else
11062                         bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg;
11063                 bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg;
11064                 /* Probably not needed but set it anyway */
11065                 bbr_set_state_target(bbr, __LINE__);
11066                 bbr_log_type_statechange(bbr, cts, __LINE__);
11067                 bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
11068                     bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 0);
11069         }
11070         bbr_check_probe_rtt_limits(bbr, cts);
11071 }
11072
11073 static int32_t inline
11074 bbr_should_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts)
11075 {
11076         if ((bbr->rc_past_init_win == 1) &&
11077             (bbr->rc_in_persist == 0) &&
11078             (bbr_calc_time(cts, bbr->r_ctl.rc_rtt_shrinks) >= bbr->r_ctl.rc_probertt_int)) {
11079                 return (1);
11080         }
11081         if (bbr_can_force_probertt &&
11082             (bbr->rc_in_persist == 0) &&
11083             (TSTMP_GT(cts, bbr->r_ctl.last_in_probertt)) &&
11084             ((cts - bbr->r_ctl.last_in_probertt) > bbr->r_ctl.rc_probertt_int)) {
11085                 return (1);
11086         }
11087         return (0);
11088 }
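
      /*
       * In short (a summary, not new behavior): probe-rtt is due once
       * rc_rtt_shrinks (the last time the rtt-prop filter shrank or
       * probe-rtt ran) is at least rc_probertt_int old, provided we are
       * past the initial window and not in persist; the optional
       * bbr_can_force_probertt path keys off last_in_probertt instead.
       */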
11089
11091 static int32_t
11092 bbr_google_startup(struct tcp_bbr *bbr, uint32_t cts, int32_t  pkt_epoch)
11093 {
11094         uint64_t btlbw, gain;
11095         if (pkt_epoch == 0) {
11096                 /*
11097                  * Need to be on a pkt-epoch to continue.
11098                  */
11099                 return (0);
11100         }
11101         btlbw = bbr_get_full_bw(bbr);
11102         gain = ((bbr->r_ctl.rc_bbr_lastbtlbw *
11103                  (uint64_t)bbr_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw;
11104         if (btlbw >= gain) {
11105                 bbr->r_ctl.rc_bbr_last_startup_epoch = bbr->r_ctl.rc_pkt_epoch;
11106                 bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
11107                                       bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 3);
11108                 bbr->r_ctl.rc_bbr_lastbtlbw = btlbw;
11109         }
11110         if ((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS)
11111                 return (1);
11112         bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
11113                               bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 8);
11114         return (0);
11115 }
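
      /*
       * Startup-exit arithmetic, for illustration (example numbers, not
       * values read from this file): with bbr_start_exit = 25 the test
       * above requires
       *
       *   btlbw >= lastbtlbw + (lastbtlbw * 25) / 100
       *
       * i.e. 25% bandwidth growth per pkt-epoch; once BBR_STARTUP_EPOCHS
       * epochs pass without such growth, startup is declared complete.
       */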
11116
11117 static int32_t inline
11118 bbr_state_startup(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, int32_t pkt_epoch)
11119 {
11120         /* Have we gained 25% in the last 3 packet-based epochs? */
11121         uint64_t btlbw, gain;
11122         int do_exit;
11123         int delta, rtt_gain;
11124
11125         if ((bbr->rc_tp->snd_una == bbr->rc_tp->snd_max) &&
11126             (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) {
11127                 /*
11128                  * This qualifies as a RTT_PROBE session since we dropped the
11129                  * data outstanding to nothing and waited more than
11130                  * bbr_rtt_probe_time.
11131                  */
11132                 bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0);
11133                 bbr_set_reduced_rtt(bbr, cts, __LINE__);
11134         }
11135         if (bbr_should_enter_probe_rtt(bbr, cts)) {
11136                 bbr_enter_probe_rtt(bbr, cts, __LINE__);
11137                 return (0);
11138         }
11139         if (bbr->rc_use_google)
11140                 return (bbr_google_startup(bbr, cts,  pkt_epoch));
11141
11142         if ((bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_startup) &&
11143             (bbr_use_lower_gain_in_startup)) {
11144                 /* Drop to a lower gain (1.5x) since we saw loss */
11145                 bbr->r_ctl.rc_bbr_hptsi_gain = bbr_startup_lower;
11146         }
11147         if (pkt_epoch == 0) {
11148                 /*
11149                  * Need to be on a pkt-epoch to continue.
11150                  */
11151                 return (0);
11152         }
11153         if (bbr_rtt_gain_thresh) {
11154                 /*
11155                  * Do we allow a flow to stay
11156                  * in startup with no loss and no
11157                  * gain in rtt over a set threshold?
11158                  */
11159                 if (bbr->r_ctl.rc_pkt_epoch_rtt &&
11160                     bbr->r_ctl.startup_last_srtt &&
11161                     (bbr->r_ctl.rc_pkt_epoch_rtt > bbr->r_ctl.startup_last_srtt)) {
11162                         delta = bbr->r_ctl.rc_pkt_epoch_rtt - bbr->r_ctl.startup_last_srtt;
11163                         rtt_gain = (delta * 100) / bbr->r_ctl.startup_last_srtt;
11164                 } else
11165                         rtt_gain = 0;
11166                 if ((bbr->r_ctl.startup_last_srtt == 0)  ||
11167                     (bbr->r_ctl.rc_pkt_epoch_rtt < bbr->r_ctl.startup_last_srtt))
11168                         /* First time or new lower value */
11169                         bbr->r_ctl.startup_last_srtt = bbr->r_ctl.rc_pkt_epoch_rtt;
11170
11171                 if ((bbr->r_ctl.rc_lost == 0) &&
11172                     (rtt_gain < bbr_rtt_gain_thresh)) {
11173                         /*
11174                          * No loss, and we are under
11175                          * our gain threshold for
11176                          * increasing RTT.
11177                          */
11178                         if (bbr->r_ctl.rc_bbr_last_startup_epoch < bbr->r_ctl.rc_pkt_epoch)
11179                                 bbr->r_ctl.rc_bbr_last_startup_epoch++;
11180                         bbr_log_startup_event(bbr, cts, rtt_gain,
11181                                               delta, bbr->r_ctl.startup_last_srtt, 10);
11182                         return (0);
11183                 }
11184         }
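              /*
               * Example of the rtt_gain check above (hypothetical values):
               * with startup_last_srtt = 10000us and rc_pkt_epoch_rtt =
               * 11500us, delta = 1500 and rtt_gain = (1500 * 100) / 10000 =
               * 15 percent. With no loss and 15 below bbr_rtt_gain_thresh,
               * we would stay in startup and drag the last startup epoch
               * forward so this epoch does not count toward exit.
               */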
11185         if ((bbr->r_ctl.r_measurement_count == bbr->r_ctl.last_startup_measure) &&
11186             (bbr->r_ctl.rc_lost_at_startup == bbr->r_ctl.rc_lost) &&
11187             (!IN_RECOVERY(bbr->rc_tp->t_flags))) {
11188                 /*
11189                  * We only assess if we have a new measurement when
11190                  * we have no loss and are not in recovery.
11191                  * Drag our last_startup epoch up by one so we will hold
11192                  * the amount of non-gain we have already accumulated.
11193                  */
11194                 if (bbr->r_ctl.rc_bbr_last_startup_epoch < bbr->r_ctl.rc_pkt_epoch)
11195                         bbr->r_ctl.rc_bbr_last_startup_epoch++;
11196                 bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
11197                                       bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 9);
11198                 return (0);
11199         }
11200         /* Case where we reduced the lost (bad retransmit) */
11201         if (bbr->r_ctl.rc_lost_at_startup > bbr->r_ctl.rc_lost)
11202                 bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
11203         bbr->r_ctl.last_startup_measure = bbr->r_ctl.r_measurement_count;
11204         btlbw = bbr_get_full_bw(bbr);
11205         if (bbr->r_ctl.rc_bbr_hptsi_gain == bbr_startup_lower)
11206                 gain = ((bbr->r_ctl.rc_bbr_lastbtlbw *
11207                          (uint64_t)bbr_low_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw;
11208         else
11209                 gain = ((bbr->r_ctl.rc_bbr_lastbtlbw *
11210                          (uint64_t)bbr_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw;
11211         do_exit = 0;
11212         if (btlbw > bbr->r_ctl.rc_bbr_lastbtlbw)
11213                 bbr->r_ctl.rc_bbr_lastbtlbw = btlbw;
11214         if (btlbw >= gain) {
11215                 bbr->r_ctl.rc_bbr_last_startup_epoch = bbr->r_ctl.rc_pkt_epoch;
11216                 /* Update the lost so we won't exit in the next set of tests */
11217                 bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
11218                 bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
11219                                       bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 3);
11220         }
11221         if ((bbr->rc_loss_exit &&
11222              (bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_startup) &&
11223              (bbr->r_ctl.rc_pkt_epoch_loss_rate > bbr_startup_loss_thresh)) &&
11224             ((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS)) {
11225                 /*
11226                  * If we had no gain, we had loss and that loss was above
11227                  * our threshold, the rwnd is not constrained, and we have
11228                  * had at least 3 packet epochs, then we exit. Note that this
11229                  * is switched off by sysctl. Google does not do this, by
11230                  * the way.
11231                  */
11232                 if ((ctf_flight_size(bbr->rc_tp,
11233                          (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) +
11234                      (2 * max(bbr->r_ctl.rc_pace_max_segs, bbr->rc_tp->t_maxseg))) <= bbr->rc_tp->snd_wnd) {
11235                         do_exit = 1;
11236                         bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
11237                                               bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 4);
11238                 } else {
11239                         /* Just record an updated loss value */
11240                         bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
11241                         bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
11242                                               bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 5);
11243                 }
11244         } else
11245                 bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
11246         if (((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS) ||
11247             do_exit) {
11248                 /* Return 1 to exit the startup state. */
11249                 return (1);
11250         }
11251         /* Stay in startup */
11252         bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
11253                               bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 8);
11254         return (0);
11255 }
11256
11257 static void
11258 bbr_state_change(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, int32_t pkt_epoch, uint32_t losses)
11259 {
11260         /*
11261          * A tick occurred in the rtt epoch; do we need to do anything?
11262          */
11263 #ifdef BBR_INVARIANTS
11264         if ((bbr->rc_bbr_state != BBR_STATE_STARTUP) &&
11265             (bbr->rc_bbr_state != BBR_STATE_DRAIN) &&
11266             (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) &&
11267             (bbr->rc_bbr_state != BBR_STATE_IDLE_EXIT) &&
11268             (bbr->rc_bbr_state != BBR_STATE_PROBE_BW)) {
11269                 /* Debug code? */
11270                 panic("Unknown BBR state %d?\n", bbr->rc_bbr_state);
11271         }
11272 #endif
11273         if (bbr->rc_bbr_state == BBR_STATE_STARTUP) {
11274                 /* Do we exit the startup state? */
11275                 if (bbr_state_startup(bbr, cts, epoch, pkt_epoch)) {
11276                         uint32_t time_in;
11277
11278                         bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
11279                                               bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 6);
11280                         bbr->rc_filled_pipe = 1;
11281                         bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
11282                         if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
11284                                 time_in = cts - bbr->r_ctl.rc_bbr_state_time;
11285                                 counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
11286                         } else
11287                                 time_in = 0;
11288                         if (bbr->rc_no_pacing)
11289                                 bbr->rc_no_pacing = 0;
11290                         bbr->r_ctl.rc_bbr_state_time = cts;
11291                         bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_drain_pg;
11292                         bbr->rc_bbr_state = BBR_STATE_DRAIN;
11293                         bbr_set_state_target(bbr, __LINE__);
11294                         if ((bbr->rc_use_google == 0) &&
11295                             bbr_slam_cwnd_in_main_drain) {
11296                                 /* Here we don't have to worry about probe-rtt */
11297                                 bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd;
11298                                 bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
11299                                 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
11300                         }
11301                         bbr->r_ctl.rc_bbr_cwnd_gain = bbr_high_gain;
11302                         bbr_log_type_statechange(bbr, cts, __LINE__);
11303                         if (ctf_flight_size(bbr->rc_tp,
11304                                 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <=
11305                             bbr->r_ctl.rc_target_at_state) {
11306                                 /*
11307                                  * Switch to probe_bw if we are already
11308                                  * there
11309                                  */
11310                                 bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts);
11311                                 bbr_substate_change(bbr, cts, __LINE__, 0);
11312                                 bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
11313                                 bbr_log_type_statechange(bbr, cts, __LINE__);
11314                         }
11315                 }
11316         } else if (bbr->rc_bbr_state == BBR_STATE_IDLE_EXIT) {
11317                 uint32_t inflight;
11318                 struct tcpcb *tp;
11319
11320                 tp = bbr->rc_tp;
11321                 inflight = ctf_flight_size(tp,
11322                               (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
11323                 if (inflight >= bbr->r_ctl.rc_target_at_state) {
11324                         /* We have reached a flight of the cwnd target */
11325                         bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
11326                         bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
11327                         bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT;
11328                         bbr_set_state_target(bbr, __LINE__);
11329                         /*
11330                          * Rig it so we don't do anything crazy and
11331                          * start fresh with a new randomization.
11332                          */
11333                         bbr->r_ctl.bbr_smallest_srtt_this_state = 0xffffffff;
11334                         bbr->rc_bbr_substate = BBR_SUB_LEVEL6;
11335                         bbr_substate_change(bbr, cts, __LINE__, 1);
11336                 }
11337         } else if (bbr->rc_bbr_state == BBR_STATE_DRAIN) {
11338                 /* Has in-flight reached the bdp (or less)? */
11339                 uint32_t inflight;
11340                 struct tcpcb *tp;
11341
11342                 tp = bbr->rc_tp;
11343                 inflight = ctf_flight_size(tp,
11344                               (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
11345                 if ((bbr->rc_use_google == 0) &&
11346                     bbr_slam_cwnd_in_main_drain &&
11347                     (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) {
11348                         /*
11349                          * Here we don't have to worry about probe-rtt;
11350                          * re-slam it, and keep it slammed down.
11351                          */
11352                         bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
11353                         bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
11354                 }
11355                 if (inflight <= bbr->r_ctl.rc_target_at_state) {
11356                         /* We have drained */
11357                         bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
11358                         bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
11359                         if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
11360                                 uint32_t time_in;
11361
11362                                 time_in = cts - bbr->r_ctl.rc_bbr_state_time;
11363                                 counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
11364                         }
11365                         if ((bbr->rc_use_google == 0) &&
11366                             bbr_slam_cwnd_in_main_drain &&
11367                             (tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) {
11368                                 /* Restore the cwnd */
11369                                 tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd;
11370                                 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
11371                         }
11372                         /* Setup probe-rtt as having been done now RRS-HERE */
11373                         bbr->r_ctl.rc_rtt_shrinks = cts;
11374                         bbr->r_ctl.last_in_probertt = cts;
11375                         bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_LEAVE_DRAIN, 0);
11376                         /* Randomly pick a sub-state */
11377                         bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts);
11378                         bbr_substate_change(bbr, cts, __LINE__, 0);
11379                         bbr_log_type_statechange(bbr, cts, __LINE__);
11380                 }
11381         } else if (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) {
11382                 uint32_t flight;
11383
11384                 flight = ctf_flight_size(bbr->rc_tp,
11385                              (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
11386                 bbr->r_ctl.r_app_limited_until = (flight + bbr->r_ctl.rc_delivered);
11387                 if (((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google) &&
11388                     (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) {
11389                         /*
11390                          * We must keep cwnd at the desired MSS.
11391                          */
11392                         bbr->rc_tp->snd_cwnd = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options);
11393                         bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
11394                 } else if ((bbr_prtt_slam_cwnd) &&
11395                            (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) {
11396                         /* Re-slam it */
11397                         bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
11398                         bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
11399                 }
11400                 if (bbr->r_ctl.rc_bbr_enters_probertt == 0) {
11401                         /* Has outstanding reached our target? */
11402                         if (flight <= bbr->r_ctl.rc_target_at_state) {
11403                                 bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_REACHTAR, 0);
11404                                 bbr->r_ctl.rc_bbr_enters_probertt = cts;
11405                                 /* If time is exactly 0, be 1usec off */
11406                                 if (bbr->r_ctl.rc_bbr_enters_probertt == 0)
11407                                         bbr->r_ctl.rc_bbr_enters_probertt = 1;
11408                                 if (bbr->rc_use_google == 0) {
11409                                         /*
11410                                          * Restore any lowering that has occurred to
11411                                          * reach here.
11412                                          */
11413                                         if (bbr->r_ctl.bbr_rttprobe_gain_val)
11414                                                 bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.bbr_rttprobe_gain_val;
11415                                         else
11416                                                 bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
11417                                 }
11418                         }
11419                         if ((bbr->r_ctl.rc_bbr_enters_probertt == 0) &&
11420                             (bbr->rc_use_google == 0) &&
11421                             bbr->r_ctl.bbr_rttprobe_gain_val &&
11422                             (((cts - bbr->r_ctl.rc_probertt_srttchktim) > bbr_get_rtt(bbr, bbr_drain_rtt)) ||
11423                              (flight >= bbr->r_ctl.flightsize_at_drain))) {
11424                                 /*
11425                                  * We have fiddled with our current hptsi
11426                                  * gain for an srtt and still have not made it
11427                                  * to target, or we have increased our flight.
11428                                  * Let's reduce the gain by 10% of the probe
11429                                  * gain (at least 1), flooring the reduction
11430                                  * at DRAIN (based on mul/div).
11431                                  */
11432                                 int red;
11433
11434                                 bbr->r_ctl.flightsize_at_drain = flight;
11435                                 bbr->r_ctl.rc_probertt_srttchktim = cts;
11436                                 red = max((bbr->r_ctl.bbr_rttprobe_gain_val / 10), 1);
11437                                 if ((bbr->r_ctl.rc_bbr_hptsi_gain - red) > max(bbr_drain_floor, 1)) {
11438                                         /* Reduce our gain again */
11439                                         bbr->r_ctl.rc_bbr_hptsi_gain -= red;
11440                                         bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_SHRINK_PG, 0);
11441                                 } else if (bbr->r_ctl.rc_bbr_hptsi_gain > max(bbr_drain_floor, 1)) {
11442                                         /* one more chance before we give up */
11443                                         bbr->r_ctl.rc_bbr_hptsi_gain = max(bbr_drain_floor, 1);
11444                                         bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_SHRINK_PG_FINAL, 0);
11445                                 } else {
11446                                         /* At the very bottom */
11447                                         bbr->r_ctl.rc_bbr_hptsi_gain = max((bbr_drain_floor-1), 1);
11448                                 }
11449                         }
11450                 }
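                      /*
                       * The reduction ladder above, sketched with assumed
                       * values: were bbr_rttprobe_gain_val 192 (75% of
                       * BBR_UNIT), red = max(192 / 10, 1) = 19, so the
                       * pacing gain steps 192 -> 173 -> 154 -> ... once per
                       * srtt until another step would cross
                       * max(bbr_drain_floor, 1), where it is pinned.
                       */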
11451                 if (bbr->r_ctl.rc_bbr_enters_probertt &&
11452                     (TSTMP_GT(cts, bbr->r_ctl.rc_bbr_enters_probertt)) &&
11453                     ((cts - bbr->r_ctl.rc_bbr_enters_probertt) >= bbr_rtt_probe_time)) {
11454                         /* Time to exit probe RTT normally */
11455                         bbr_exit_probe_rtt(bbr->rc_tp, bbr, cts);
11456                 }
11457         } else if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) {
11458                 if ((bbr->rc_tp->snd_una == bbr->rc_tp->snd_max) &&
11459                     (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) {
11460                         /*
11461                          * This qualifies as a RTT_PROBE session since we
11462                          * dropped the data outstanding to nothing and waited
11463                          * more than bbr_rtt_probe_time.
11464                          */
11465                         bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0);
11466                         bbr_set_reduced_rtt(bbr, cts, __LINE__);
11467                 }
11468                 if (bbr_should_enter_probe_rtt(bbr, cts)) {
11469                         bbr_enter_probe_rtt(bbr, cts, __LINE__);
11470                 } else {
11471                         bbr_set_probebw_gains(bbr, cts, losses);
11472                 }
11473         }
11474 }
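
      /*
       * Overall flow implemented above (a summary, not new behavior):
       *
       *   STARTUP --(bw plateau)--> DRAIN --(flight <= target)--> PROBE_BW
       *   PROBE_BW <--> PROBE_RTT (when the rtt-prop sample goes stale)
       *   IDLE_EXIT --(flight reaches target)--> PROBE_BW
       *
       * Probe-rtt can also be entered directly from STARTUP.
       */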
11475
11476 static void
11477 bbr_check_bbr_for_state(struct tcp_bbr *bbr, uint32_t cts, int32_t line, uint32_t losses)
11478 {
11479         int32_t epoch = 0;
11480
11481         if ((cts - bbr->r_ctl.rc_rcv_epoch_start) >= bbr_get_rtt(bbr, BBR_RTT_PROP)) {
11482                 bbr_set_epoch(bbr, cts, line);
11483                 /* At each epoch do lt bw sampling */
11484                 epoch = 1;
11485         }
11486         bbr_state_change(bbr, cts, epoch, bbr->rc_is_pkt_epoch_now, losses);
11487 }
11488
11489 static int
11490 bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
11491     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
11492     int32_t nxt_pkt, struct timeval *tv)
11493 {
11494         int32_t thflags, retval;
11495         uint32_t cts, lcts;
11496         uint32_t tiwin;
11497         struct tcpopt to;
11498         struct tcp_bbr *bbr;
11499         struct bbr_sendmap *rsm;
11500         struct timeval ltv;
11501         int32_t did_out = 0;
11502         int32_t in_recovery;
11503         uint16_t nsegs;
11504         int32_t prev_state;
11505         uint32_t lost;
11506
11507         nsegs = max(1, m->m_pkthdr.lro_nsegs);
11508         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
11509         /* add in our stats */
11510         kern_prefetch(bbr, &prev_state);
11511         prev_state = 0;
11512         thflags = th->th_flags;
11513         /*
11514          * If this is either a state-changing packet or current state isn't
11515          * established, we require a write lock on tcbinfo.  Otherwise, we
11516          * allow the tcbinfo to be in either a locked or unlocked state, as the
11517          * caller may have unnecessarily acquired a write lock due to a
11518          * race.
11519          */
11520         INP_WLOCK_ASSERT(tp->t_inpcb);
11521         KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
11522             __func__));
11523         KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
11524             __func__));
11525
11526         tp->t_rcvtime = ticks;
11527         /*
11528          * Unscale the window into a 32-bit value. For the SYN_SENT state
11529          * the scale is zero.
11530          */
11531         tiwin = th->th_win << tp->snd_scale;
11532 #ifdef STATS
11533         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
11534 #endif
11535         /*
11536          * Parse options on any incoming segment.
11537          */
11538         tcp_dooptions(&to, (u_char *)(th + 1),
11539             (th->th_off << 2) - sizeof(struct tcphdr),
11540             (thflags & TH_SYN) ? TO_SYN : 0);
11541
11542         if (m->m_flags & M_TSTMP) {
11543                 /* Prefer the hardware timestamp if present */
11544                 struct timespec ts;
11545
11546                 mbuf_tstmp2timespec(m, &ts);
11547                 bbr->rc_tv.tv_sec = ts.tv_sec;
11548                 bbr->rc_tv.tv_usec = ts.tv_nsec / 1000;
11549                 bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv);
11550         } else if (m->m_flags & M_TSTMP_LRO) {
11551                 /* Next the arrival timestamp */
11552                 struct timespec ts;
11553
11554                 mbuf_tstmp2timespec(m, &ts);
11555                 bbr->rc_tv.tv_sec = ts.tv_sec;
11556                 bbr->rc_tv.tv_usec = ts.tv_nsec / 1000;
11557                 bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv);
11558         } else {
11559                 /*
11560                  * Ok just get the current time.
11561                  */
11562                 bbr->r_ctl.rc_rcvtime = lcts = cts = tcp_get_usecs(&bbr->rc_tv);
11563         }
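              /*
               * Note on the above: the receive time preference is the NIC
               * hardware timestamp (M_TSTMP), then the LRO arrival
               * timestamp (M_TSTMP_LRO), then the current clock; cts is
               * always expressed in microseconds.
               */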
11564         /*
11565          * If echoed timestamp is later than the current time, fall back to
11566          * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
11567          * were used when this connection was established.
11568          */
11569         if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
11570                 to.to_tsecr -= tp->ts_offset;
11571                 if (TSTMP_GT(to.to_tsecr, tcp_tv_to_mssectick(&bbr->rc_tv)))
11572                         to.to_tsecr = 0;
11573         }
11574         /*
11575          * If it's the first time in, we need to take care of options and
11576          * verify we can do SACK for rack!
11577          */
11578         if (bbr->r_state == 0) {
11579                 /*
11580                  * Process options only when we get SYN/ACK back. The SYN
11581                  * case for incoming connections is handled in tcp_syncache.
11582                  * According to RFC1323 the window field in a SYN (i.e., a
11583                  * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
11584                  * this is traditional behavior, may need to be cleaned up.
11585                  */
11586                 if (bbr->rc_inp == NULL) {
11587                         bbr->rc_inp = tp->t_inpcb;
11588                 }
11589                 /*
11590                  * We need to init rc_inp here since its not init'd when
11591                  * bbr_init is called
11592                  */
11593                 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
11594                         if ((to.to_flags & TOF_SCALE) &&
11595                             (tp->t_flags & TF_REQ_SCALE)) {
11596                                 tp->t_flags |= TF_RCVD_SCALE;
11597                                 tp->snd_scale = to.to_wscale;
11598                         } else
11599                                 tp->t_flags &= ~TF_REQ_SCALE;
11600                         /*
11601                          * Initial send window.  It will be updated with the
11602                          * next incoming segment to the scaled value.
11603                          */
11604                         tp->snd_wnd = th->th_win;
11605                         if ((to.to_flags & TOF_TS) &&
11606                             (tp->t_flags & TF_REQ_TSTMP)) {
11607                                 tp->t_flags |= TF_RCVD_TSTMP;
11608                                 tp->ts_recent = to.to_tsval;
11609                                 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
11610                         } else
11611                                 tp->t_flags &= ~TF_REQ_TSTMP;
11612                         if (to.to_flags & TOF_MSS)
11613                                 tcp_mss(tp, to.to_mss);
11614                         if ((tp->t_flags & TF_SACK_PERMIT) &&
11615                             (to.to_flags & TOF_SACKPERM) == 0)
11616                                 tp->t_flags &= ~TF_SACK_PERMIT;
11617                         if (IS_FASTOPEN(tp->t_flags)) {
11618                                 if (to.to_flags & TOF_FASTOPEN) {
11619                                         uint16_t mss;
11620
11621                                         if (to.to_flags & TOF_MSS)
11622                                                 mss = to.to_mss;
11623                                         else
11624                                                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
11625                                                         mss = TCP6_MSS;
11626                                                 else
11627                                                         mss = TCP_MSS;
11628                                         tcp_fastopen_update_cache(tp, mss,
11629                                             to.to_tfo_len, to.to_tfo_cookie);
11630                                 } else
11631                                         tcp_fastopen_disable_path(tp);
11632                         }
11633                 }
11634                 /*
11635                  * At this point we are at the initial call. Here we decide
11636                  * if we are doing RACK or not. We do this by seeing if
11637                  * TF_SACK_PERMIT is set, if not rack is *not* possible and
11638                  * we switch to the default code.
11639                  */
11640                 if ((tp->t_flags & TF_SACK_PERMIT) == 0) {
11641                         /* Bail */
11642                         tcp_switch_back_to_default(tp);
11643                         (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
11644                             tlen, iptos);
11645                         return (1);
11646                 }
11647                 /* Set the flag */
11648                 bbr->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
11649                 tcp_set_hpts(tp->t_inpcb);
11650                 sack_filter_clear(&bbr->r_ctl.bbr_sf, th->th_ack);
11651         }
11652         if (thflags & TH_ACK) {
11653                 /* Track ack types */
11654                 if (to.to_flags & TOF_SACK)
11655                         BBR_STAT_INC(bbr_acks_with_sacks);
11656                 else
11657                         BBR_STAT_INC(bbr_plain_acks);
11658         }
11659         /*
11660          * This is the one exception case where we set the rack state
11661          * always. All other times (timers etc) we must have a rack-state
11662          * set (so we assure we have done the checks above for SACK).
11663          */
11664         if (thflags & TH_FIN)
11665                 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN);
11666         if (bbr->r_state != tp->t_state)
11667                 bbr_set_state(tp, bbr, tiwin);
11668
11669         if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map)) != NULL)
11670                 kern_prefetch(rsm, &prev_state);
11671         prev_state = bbr->r_state;
11672         bbr->rc_ack_was_delayed = 0;
11673         lost = bbr->r_ctl.rc_lost;
11674         bbr->rc_is_pkt_epoch_now = 0;
11675         if (m->m_flags & (M_TSTMP|M_TSTMP_LRO)) {
11676                 /* Get the real time into lcts and figure the real delay */
11677                 lcts = tcp_get_usecs(&ltv);
11678                 if (TSTMP_GT(lcts, cts)) {
11679                         bbr->r_ctl.rc_ack_hdwr_delay = lcts - cts;
11680                         bbr->rc_ack_was_delayed = 1;
11681                         if (TSTMP_GT(bbr->r_ctl.rc_ack_hdwr_delay,
11682                                      bbr->r_ctl.highest_hdwr_delay))
11683                                 bbr->r_ctl.highest_hdwr_delay = bbr->r_ctl.rc_ack_hdwr_delay;
11684                 } else {
11685                         bbr->r_ctl.rc_ack_hdwr_delay = 0;
11686                         bbr->rc_ack_was_delayed = 0;
11687                 }
11688         } else {
11689                 bbr->r_ctl.rc_ack_hdwr_delay = 0;
11690                 bbr->rc_ack_was_delayed = 0;
11691         }
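              /*
               * rc_ack_hdwr_delay, set above, is how long the ack sat
               * between being timestamped (hardware or LRO) and being
               * processed here, so that queueing in the input path can be
               * accounted for when the ack is used in RTT measurements.
               */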
11692         bbr_log_ack_event(bbr, th, &to, tlen, nsegs, cts, nxt_pkt, m);
11693         if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
11694                 retval = 0;
11695                 m_freem(m);
11696                 goto done_with_input;
11697         }
11698         /*
11699          * If a segment with the ACK-bit set arrives in the SYN-SENT state
11700          * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
11701          */
11702         if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
11703             (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
11704                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
11705                 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11706                 return (1);
11707         }
11708         in_recovery = IN_RECOVERY(tp->t_flags);
11709         if (tiwin > bbr->r_ctl.rc_high_rwnd)
11710                 bbr->r_ctl.rc_high_rwnd = tiwin;
11711 #ifdef BBR_INVARIANTS
11712         if ((tp->t_inpcb->inp_flags & INP_DROPPED) ||
11713             (tp->t_inpcb->inp_flags2 & INP_FREED)) {
11714                 panic("tp:%p bbr:%p given a dropped inp:%p",
11715                     tp, bbr, tp->t_inpcb);
11716         }
11717 #endif
11718         bbr->r_ctl.rc_flight_at_input = ctf_flight_size(tp,
11719                                             (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
11720         bbr->rtt_valid = 0;
11721         if (to.to_flags & TOF_TS) {
11722                 bbr->rc_ts_valid = 1;
11723                 bbr->r_ctl.last_inbound_ts = to.to_tsval;
11724         } else {
11725                 bbr->rc_ts_valid = 0;
11726                 bbr->r_ctl.last_inbound_ts = 0;
11727         }
11728         retval = (*bbr->r_substate) (m, th, so,
11729             tp, &to, drop_hdrlen,
11730             tlen, tiwin, thflags, nxt_pkt, iptos);
11731 #ifdef BBR_INVARIANTS
11732         if ((retval == 0) &&
11733             (tp->t_inpcb == NULL)) {
11734                 panic("retval:%d tp:%p t_inpcb:NULL state:%d",
11735                     retval, tp, prev_state);
11736         }
11737 #endif
11738         if (nxt_pkt == 0)
11739                 BBR_STAT_INC(bbr_rlock_left_ret0);
11740         else
11741                 BBR_STAT_INC(bbr_rlock_left_ret1);
11742         if (retval == 0) {
11743                 /*
11744                  * If retval is 1 the tcb is unlocked and most likely the tp
11745                  * is gone.
11746                  */
11747                 INP_WLOCK_ASSERT(tp->t_inpcb);
11748                 tcp_bbr_xmit_timer_commit(bbr, tp, cts);
11749                 if (bbr->rc_is_pkt_epoch_now)
11750                         bbr_set_pktepoch(bbr, cts, __LINE__);
11751                 bbr_check_bbr_for_state(bbr, cts, __LINE__, (bbr->r_ctl.rc_lost - lost));
11752                 if (nxt_pkt == 0) {
11753                         if (bbr->r_wanted_output != 0) {
11754                                 bbr->rc_output_starts_timer = 0;
11755                                 did_out = 1;
11756                                 (void)tp->t_fb->tfb_tcp_output(tp);
11757                         } else
11758                                 bbr_start_hpts_timer(bbr, tp, cts, 6, 0, 0);
11759                 }
11760                 if ((nxt_pkt == 0) &&
11761                     ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
11762                     (SEQ_GT(tp->snd_max, tp->snd_una) ||
11763                      (tp->t_flags & TF_DELACK) ||
11764                      ((V_tcp_always_keepalive || bbr->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
11765                       (tp->t_state <= TCPS_CLOSING)))) {
11766                         /*
11767                          * We could not send (we are probably in the hpts
11768                          * but stopped the timer)?
11769                          */
11770                         if ((tp->snd_max == tp->snd_una) &&
11771                             ((tp->t_flags & TF_DELACK) == 0) &&
11772                             (bbr->rc_inp->inp_in_hpts) &&
11773                             (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
11774                                 /*
11775                                  * keep alive not needed if we are still
11776                                  * awaiting hptsi output
11777                                  */
11778                                 ;
11779                         } else {
11780                                 if (bbr->rc_inp->inp_in_hpts) {
11781                                         tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT);
11782                                         if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
11783                                             (TSTMP_GT(lcts, bbr->rc_pacer_started))) {
11784                                                 uint32_t del;
11785
11786                                                 del = lcts - bbr->rc_pacer_started;
11787                                                 if (bbr->r_ctl.rc_last_delay_val > del) {
11788                                                         BBR_STAT_INC(bbr_force_timer_start);
11789                                                         bbr->r_ctl.rc_last_delay_val -= del;
11790                                                         bbr->rc_pacer_started = lcts;
11791                                                 } else {
11792                                                         /* We are late */
11793                                                         bbr->r_ctl.rc_last_delay_val = 0;
11794                                                         BBR_STAT_INC(bbr_force_output);
11795                                                         (void)tp->t_fb->tfb_tcp_output(tp);
11796                                                 }
11797                                         }
11798                                 }
11799                                 bbr_start_hpts_timer(bbr, tp, cts, 8, bbr->r_ctl.rc_last_delay_val,
11800                                     0);
11801                         }
11802                 } else if ((bbr->rc_output_starts_timer == 0) && (nxt_pkt == 0)) {
11803                         /* Do we have the correct timer running? */
11804                         bbr_timer_audit(tp, bbr, lcts, &so->so_snd);
11805                 }
11806                 /* Do we have a new state */
11807                 if (bbr->r_state != tp->t_state)
11808                         bbr_set_state(tp, bbr, tiwin);
11809 done_with_input:
11810                 bbr_log_doseg_done(bbr, cts, nxt_pkt, did_out);
11811                 if (did_out)
11812                         bbr->r_wanted_output = 0;
11813 #ifdef BBR_INVARIANTS
11814                 if (tp->t_inpcb == NULL) {
11815                         panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
11816                             did_out,
11817                             retval, tp, prev_state);
11818                 }
11819 #endif
11820         }
11821         return (retval);
11822 }
11823
11824 static void
11825 bbr_log_type_hrdwtso(struct tcpcb *tp, struct tcp_bbr *bbr, int len, int mod, int what_we_can_send)
11826 {
11827         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
11828                 union tcp_log_stackspecific log;
11829                 struct timeval tv;
11830                 uint32_t cts;
11831
11832                 cts = tcp_get_usecs(&tv);
11833                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
11834                 log.u_bbr.flex1 = bbr->r_ctl.rc_pace_min_segs;
11835                 log.u_bbr.flex2 = what_we_can_send;
11836                 log.u_bbr.flex3 = bbr->r_ctl.rc_pace_max_segs;
11837                 log.u_bbr.flex4 = len;
11838                 log.u_bbr.flex5 = 0;
11839                 log.u_bbr.flex7 = mod;
11840                 log.u_bbr.flex8 = 1;
11841                 TCP_LOG_EVENTP(tp, NULL,
11842                     &tp->t_inpcb->inp_socket->so_rcv,
11843                     &tp->t_inpcb->inp_socket->so_snd,
11844                     TCP_HDWR_TLS, 0,
11845                     0, &log, false, &tv);
11846         }
11847 }
11848
11849 static void
11850 bbr_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
11851     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
11852 {
11853         struct timeval tv;
11854         int retval;
11855
11856         /* First let's see if we have old packets */
11857         if (tp->t_in_pkt) {
11858                 if (ctf_do_queued_segments(so, tp, 1)) {
11859                         m_freem(m);
11860                         return;
11861                 }
11862         }
11863         if (m->m_flags & M_TSTMP_LRO) {
11864                 tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
11865                 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000;
11866         } else {
11867                 /* Should not happen; should we kassert instead? */
11868                 tcp_get_usecs(&tv);
11869         }
11870         retval = bbr_do_segment_nounlock(m, th, so, tp,
11871                                          drop_hdrlen, tlen, iptos, 0, &tv);
11872         if (retval == 0)
11873                 INP_WUNLOCK(tp->t_inpcb);
11874 }
11875
11876 /*
11877  * Return how much data can be sent without violating the
11878  * cwnd or rwnd.
11879  */
11881 static inline uint32_t
11882 bbr_what_can_we_send(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t sendwin,
11883     uint32_t avail, int32_t sb_offset, uint32_t cts)
11884 {
11885         uint32_t len;
11886
11887         if (ctf_outstanding(tp) >= tp->snd_wnd) {
11888                 /* We never want to go over our peer's rcv-window */
11889                 len = 0;
11890         } else {
11891                 uint32_t flight;
11892
11893                 flight = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
11894                 if (flight >= sendwin) {
11895                         /*
11896                          * We have in flight what we are allowed by cwnd (if
11897                          * it was rwnd blocking it would have hit the above
11898                          * check of outstanding >= tp->snd_wnd).
11899                          */
11900                         return (0);
11901                 }
11902                 len = sendwin - flight;
11903                 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) {
11904                         /* We would send too much (beyond the rwnd) */
11905                         len = tp->snd_wnd - ctf_outstanding(tp);
11906                 }
11907                 if ((len + sb_offset) > avail) {
11908                         /*
11909                          * We don't have that much in the SB, how much is
11910                          * there?
11911                          */
11912                         len = avail - sb_offset;
11913                 }
11914         }
11915         return (len);
11916 }
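
      /*
       * A worked example of bbr_what_can_we_send() with made-up numbers:
       * sendwin (cwnd) = 100000, flight = 60000, outstanding = 70000,
       * snd_wnd = 90000, avail = 200000, sb_offset = 70000:
       *
       *   len = 100000 - 60000 = 40000          (cwnd headroom)
       *   40000 + 70000 > 90000, so len = 20000 (rwnd clip)
       *   20000 + 70000 <= 200000               (no socket-buffer clip)
       */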
11917
11918 static inline void
11919 bbr_do_error_accounting(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, int32_t len, int32_t error)
11920 {
11921 #ifdef NETFLIX_STATS
11922         KMOD_TCPSTAT_INC(tcps_sndpack_error);
11923         KMOD_TCPSTAT_ADD(tcps_sndbyte_error, len);
11924 #endif
11925 }
11926
11927 static inline void
11928 bbr_do_send_accounting(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, int32_t len, int32_t error)
11929 {
11930         if (error) {
11931                 bbr_do_error_accounting(tp, bbr, rsm, len, error);
11932                 return;
11933         }
11934         if (rsm) {
11935                 if (rsm->r_flags & BBR_TLP) {
11936                         /*
11937                          * TLP should not count in retran count, but in its
11938                          * own bin
11939                          */
11940 #ifdef NETFLIX_STATS
11941                         tp->t_sndtlppack++;
11942                         tp->t_sndtlpbyte += len;
11943                         KMOD_TCPSTAT_INC(tcps_tlpresends);
11944                         KMOD_TCPSTAT_ADD(tcps_tlpresend_bytes, len);
11945 #endif
11946                 } else {
11947                         /* Retransmit */
11948                         tp->t_sndrexmitpack++;
11949                         KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
11950                         KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
11951 #ifdef STATS
11952                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
11953                             len);
11954 #endif
11955                 }
11956                 /*
11957                  * Logs in 0 - 8; 8 is all non-probe_bw states, 0-7 is the
11958                  * sub-state.
11959                  */
11960                 counter_u64_add(bbr_state_lost[rsm->r_bbr_state], len);
11961                 if (bbr->rc_bbr_state != BBR_STATE_PROBE_BW) {
11962                         /* Non probe_bw log in 1, 2, or 4. */
11963                         counter_u64_add(bbr_state_resend[bbr->rc_bbr_state], len);
11964                 } else {
11965                         /*
11966                          * Log our probe state 3, and log also 5-13 to show
11967                          * us the recovery sub-state for the send. This
11968                          * means that 3 == (5+6+7+8+9+10+11+12+13)
11969                          */
11970                         counter_u64_add(bbr_state_resend[BBR_STATE_PROBE_BW], len);
11971                         counter_u64_add(bbr_state_resend[(bbr_state_val(bbr) + 5)], len);
11972                 }
11973                 /* Place in both 16's the totals of retransmitted */
11974                 counter_u64_add(bbr_state_lost[16], len);
11975                 counter_u64_add(bbr_state_resend[16], len);
11976                 /* Place in 17's the total sent */
11977                 counter_u64_add(bbr_state_resend[17], len);
11978                 counter_u64_add(bbr_state_lost[17], len);
11980         } else {
11981                 /* New sends */
11982                 KMOD_TCPSTAT_INC(tcps_sndpack);
11983                 KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
11984                 /* Place in 17's the total sent */
11985                 counter_u64_add(bbr_state_resend[17], len);
11986                 counter_u64_add(bbr_state_lost[17], len);
11987 #ifdef STATS
11988                 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
11989                     len);
11990 #endif
11991         }
11992 }
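
      /*
       * Bin layout illustration (derived from the counters above): a
       * retransmit in PROBE_BW with bbr_state_val() == 2 bumps
       * bbr_state_resend[3] (the probe_bw total) and bbr_state_resend[7]
       * (2 + 5, its sub-state bin); bins 16 hold the retransmitted
       * aggregates and bins 17 the total sent.
       */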
11993
11994 static void
11995 bbr_cwnd_limiting(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t in_level)
11996 {
11997         if (bbr->rc_filled_pipe && bbr_target_cwnd_mult_limit && (bbr->rc_use_google == 0)) {
11998                 /*
11999                  * Limit the cwnd to not be above N x the target plus what
12000                  * is outstanding. The target is based on the current b/w
12001                  * estimate.
12002                  */
12003                 uint32_t target;
12004
12005                 target = bbr_get_target_cwnd(bbr, bbr_get_bw(bbr), BBR_UNIT);
12006                 target += ctf_outstanding(tp);
12007                 target *= bbr_target_cwnd_mult_limit;
12008                 if (tp->snd_cwnd > target)
12009                         tp->snd_cwnd = target;
12010                 bbr_log_type_cwndupd(bbr, 0, 0, 0, 10, 0, 0, __LINE__);
12011         }
12012 }
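/*
 * Worked example of the clamp above, with hypothetical numbers: if the
 * b/w-derived target is 100000 bytes, 20000 bytes are outstanding and
 * bbr_target_cwnd_mult_limit is 2, then snd_cwnd is held at or below
 * (100000 + 20000) * 2 = 240000 bytes.
 */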
12013
12014 static int
12015 bbr_window_update_needed(struct tcpcb *tp, struct socket *so, uint32_t recwin, int32_t maxseg)
12016 {
12017         /*
12018          * "adv" is the amount we could increase the window, taking into
12019          * account that we are limited by TCP_MAXWIN << tp->rcv_scale.
12020          */
12021         uint32_t adv;
12022         int32_t oldwin;
12023
12024         adv = min(recwin, TCP_MAXWIN << tp->rcv_scale);
12025         if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
12026                 oldwin = (tp->rcv_adv - tp->rcv_nxt);
12027                 adv -= oldwin;
12028         } else
12029                 oldwin = 0;
12030
12031         /*
12032          * If the new window size ends up being the same as the old size
12033          * when it is scaled, then don't force a window update.
12034          */
12035         if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
12036                 return (0);
12037
12038         if (adv >= (2 * maxseg) &&
12039             (adv >= (so->so_rcv.sb_hiwat / 4) ||
12040             recwin <= (so->so_rcv.sb_hiwat / 8) ||
12041             so->so_rcv.sb_hiwat <= 8 * maxseg)) {
12042                 return (1);
12043         }
12044         if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat)
12045                 return (1);
12046         return (0);
12047 }
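/*
 * Worked example of the thresholds above, with hypothetical numbers:
 * for maxseg = 1448 and so_rcv.sb_hiwat = 65536, an update is forced
 * once we can advance the advertised window by at least 2 * 1448 =
 * 2896 bytes and either adv >= 16384 (hiwat / 4) or recwin <= 8192
 * (hiwat / 8); independently, being able to open half the buffer
 * (2 * adv >= 65536) also forces one.
 */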
12048
12049 /*
12050  * Return 0 on success and an errno on failure to send.
12051  * Note that a 0 return may not mean we sent anything
12052  * if the TCB was on the hpts. A non-zero return
12053  * does indicate the error we got from ip[6]_output.
12054  */
12055 static int
12056 bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
12057 {
12058         struct socket *so;
12059         int32_t len;
12060         uint32_t cts;
12061         uint32_t recwin, sendwin;
12062         int32_t sb_offset;
12063         int32_t flags, abandon, error = 0;
12064         struct tcp_log_buffer *lgb = NULL;
12065         struct mbuf *m;
12066         struct mbuf *mb;
12067         uint32_t if_hw_tsomaxsegcount = 0;
12068         uint32_t if_hw_tsomaxsegsize = 0;
12069         uint32_t if_hw_tsomax = 0;
12070         struct ip *ip = NULL;
12071 #ifdef TCPDEBUG
12072         struct ipovly *ipov = NULL;
12073 #endif
12074         struct tcp_bbr *bbr;
12075         struct tcphdr *th;
12076 #ifdef NETFLIX_TCPOUDP
12077         struct udphdr *udp = NULL;
12078 #endif
12079         u_char opt[TCP_MAXOLEN];
12080         unsigned ipoptlen, optlen, hdrlen;
12081 #ifdef NETFLIX_TCPOUDP
12082         unsigned ulen;
12083 #endif
12084         uint32_t bbr_seq;
12085         uint32_t delay_calc=0;
12086         uint8_t doing_tlp = 0;
12087         uint8_t local_options;
12088 #ifdef BBR_INVARIANTS
12089         uint8_t doing_retran_from = 0;
12090         uint8_t picked_up_retran = 0;
12091 #endif
12092         uint8_t wanted_cookie = 0;
12093         uint8_t more_to_rxt=0;
12094         int32_t prefetch_so_done = 0;
12095         int32_t prefetch_rsm = 0;
12096         uint32_t what_we_can = 0;
12097         uint32_t tot_len = 0;
12098         uint32_t rtr_cnt = 0;
12099         uint32_t maxseg, pace_max_segs, p_maxseg;
12100         int32_t csum_flags;
12101         int32_t hw_tls;
12102 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
12103         unsigned ipsec_optlen = 0;
12104
12105 #endif
12106         volatile int32_t sack_rxmit;
12107         struct bbr_sendmap *rsm = NULL;
12108         int32_t tso, mtu;
12109         int force_tso = 0;
12110         struct tcpopt to;
12111         int32_t slot = 0;
12112         struct inpcb *inp;
12113         struct sockbuf *sb;
12114         uint32_t hpts_calling;
12115 #ifdef INET6
12116         struct ip6_hdr *ip6 = NULL;
12117         int32_t isipv6;
12118 #endif
12119         uint8_t app_limited = BBR_JR_SENT_DATA;
12120         uint8_t filled_all = 0;
12121         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
12122         /* We take a cache hit here */
12123         memcpy(&bbr->rc_tv, tv, sizeof(struct timeval));
12124         cts = tcp_tv_to_usectick(&bbr->rc_tv);
12125         inp = bbr->rc_inp;
12126         so = inp->inp_socket;
12127         sb = &so->so_snd;
12128 #ifdef KERN_TLS
12129         if (sb->sb_flags & SB_TLS_IFNET)
12130                 hw_tls = 1;
12131         else
12132 #endif
12133                 hw_tls = 0;
12134         kern_prefetch(sb, &maxseg);
12135         maxseg = tp->t_maxseg - bbr->rc_last_options;
12136         if (bbr_minseg(bbr) < maxseg) {
12137                 tcp_bbr_tso_size_check(bbr, cts);
12138         }
12139         /* Remove any flags that indicate we are pacing on the inp  */
12140         pace_max_segs = bbr->r_ctl.rc_pace_max_segs;
12141         p_maxseg = min(maxseg, pace_max_segs);
12142         INP_WLOCK_ASSERT(inp);
12143 #ifdef TCP_OFFLOAD
12144         if (tp->t_flags & TF_TOE)
12145                 return (tcp_offload_output(tp));
12146 #endif
12147
12148 #ifdef INET6
12149         if (bbr->r_state) {
12150                 /* Use the cache line loaded if possible */
12151                 isipv6 = bbr->r_is_v6;
12152         } else {
12153                 isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
12154         }
12155 #endif
12156         if (((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
12157             inp->inp_in_hpts) {
12158                 /*
12159                  * We are on the hpts for some timer but not hptsi output.
12160                  * Possibly remove from the hpts so we can send/recv etc.
12161                  */
12162                 if ((tp->t_flags & TF_ACKNOW) == 0) {
12163                         /*
12164                          * No immediate demand right now to send an ack, but
12165                          * the user may have read, making room for new data
12166                          * (a window update). If so we may want to cancel
12167                          * whatever timer is running (KEEP/DEL-ACK?) and
12168                          * continue to send out a window update. Or we may
12169                          * have gotten more data into the socket buffer to
12170                          * send.
12171                          */
12172                         recwin = min(max(sbspace(&so->so_rcv), 0),
12173                             TCP_MAXWIN << tp->rcv_scale);
12174                         if ((bbr_window_update_needed(tp, so, recwin, maxseg) == 0) &&
12175                             ((tcp_outflags[tp->t_state] & TH_RST) == 0) &&
12176                             ((sbavail(sb) + ((tcp_outflags[tp->t_state] & TH_FIN) ? 1 : 0)) <=
12177                             (tp->snd_max - tp->snd_una))) {
12178                                 /*
12179                                  * Nothing new to send and no window update
12180                                  * is needed to send. Let's just return and
12181                                  * let the timer run off.
12182                                  */
12183                                 return (0);
12184                         }
12185                 }
12186                 tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
12187                 bbr_timer_cancel(bbr, __LINE__, cts);
12188         }
12189         if (bbr->r_ctl.rc_last_delay_val) {
12190                 /* Calculate a rough delay for early escape to sending  */
12191                 if (SEQ_GT(cts, bbr->rc_pacer_started))
12192                         delay_calc = cts - bbr->rc_pacer_started;
12193                 if (delay_calc >= bbr->r_ctl.rc_last_delay_val)
12194                         delay_calc -= bbr->r_ctl.rc_last_delay_val;
12195                 else
12196                         delay_calc = 0;
12197         }
12198         /* Mark that we have called bbr_output(). */
12199         if ((bbr->r_timer_override) ||
12200             (tp->t_state < TCPS_ESTABLISHED)) {
12201                 /* Timeouts or early states are exempt */
12202                 if (inp->inp_in_hpts)
12203                         tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
12204         } else if (inp->inp_in_hpts) {
12205                 if ((bbr->r_ctl.rc_last_delay_val) &&
12206                     (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
12207                     delay_calc) {
12208                         /*
12209                          * We were being paced for output and the delay has
12210                          * already exceeded the time we were supposed to be
12211                          * called; let's go ahead and pull out of the hpts
12212                          * and call output.
12213                          */
12214                         counter_u64_add(bbr_out_size[TCP_MSS_ACCT_LATE], 1);
12215                         bbr->r_ctl.rc_last_delay_val = 0;
12216                         tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
12217                 } else if (tp->t_state == TCPS_CLOSED) {
12218                         bbr->r_ctl.rc_last_delay_val = 0;
12219                         tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
12220                 } else {
12221                         /*
12222                          * On the hpts, you shall not pass! Even if ACKNOW
12223                          * is on, we will wait until the hpts fires, unless of
12224                          * course we are overdue.
12225                          */
12226                         counter_u64_add(bbr_out_size[TCP_MSS_ACCT_INPACE], 1);
12227                         return (0);
12228                 }
12229         }
12230         bbr->rc_cwnd_limited = 0;
12231         if (bbr->r_ctl.rc_last_delay_val) {
12232                 /* recalculate the real delay and deal with over/under  */
12233                 if (SEQ_GT(cts, bbr->rc_pacer_started))
12234                         delay_calc = cts - bbr->rc_pacer_started;
12235                 else
12236                         delay_calc = 0;
12237                 if (delay_calc >= bbr->r_ctl.rc_last_delay_val)
12238                         /* Setup the delay which will be added in */
12239                         delay_calc -= bbr->r_ctl.rc_last_delay_val;
12240                 else {
12241                         /*
12242                          * We are early; set up to adjust
12243                          * our slot time.
12244                          */
12245                         uint64_t merged_val;
12246
12247                         bbr->r_ctl.rc_agg_early += (bbr->r_ctl.rc_last_delay_val - delay_calc);
12248                         bbr->r_agg_early_set = 1;
12249                         if (bbr->r_ctl.rc_hptsi_agg_delay) {
12250                                 if (bbr->r_ctl.rc_hptsi_agg_delay >= bbr->r_ctl.rc_agg_early) {
12251                                         /* Nope our previous late cancels out the early */
12252                                         bbr->r_ctl.rc_hptsi_agg_delay -= bbr->r_ctl.rc_agg_early;
12253                                         bbr->r_agg_early_set = 0;
12254                                         bbr->r_ctl.rc_agg_early = 0;
12255                                 } else {
12256                                         bbr->r_ctl.rc_agg_early -= bbr->r_ctl.rc_hptsi_agg_delay;
12257                                         bbr->r_ctl.rc_hptsi_agg_delay = 0;
12258                                 }
12259                         }
12260                         merged_val = bbr->rc_pacer_started;
12261                         merged_val <<= 32;
12262                         merged_val |= bbr->r_ctl.rc_last_delay_val;
12263                         bbr_log_pacing_delay_calc(bbr, inp->inp_hpts_calls,
12264                                                  bbr->r_ctl.rc_agg_early, cts, delay_calc, merged_val,
12265                                                  bbr->r_agg_early_set, 3);
12266                         bbr->r_ctl.rc_last_delay_val = 0;
12267                         BBR_STAT_INC(bbr_early);
12268                         delay_calc = 0;
12269                 }
12270         } else {
12271                 /* We were not delayed due to hptsi */
12272                 if (bbr->r_agg_early_set)
12273                         bbr->r_ctl.rc_agg_early = 0;
12274                 bbr->r_agg_early_set = 0;
12275                 delay_calc = 0;
12276         }
12277         if (delay_calc) {
12278                 /*
12279                  * We had a hptsi delay which means we are falling behind on
12280                  * sending at the expected rate. Calculate an extra amount
12281                  * of data we can send, if any, to put us back on track.
12282                  */
12283                 if ((bbr->r_ctl.rc_hptsi_agg_delay + delay_calc) < bbr->r_ctl.rc_hptsi_agg_delay)
12284                         bbr->r_ctl.rc_hptsi_agg_delay = 0xffffffff;
12285                 else
12286                         bbr->r_ctl.rc_hptsi_agg_delay += delay_calc;
12287         }
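        /*
         * Example of the lateness bookkeeping above, with hypothetical
         * numbers: if the pacer slot (rc_last_delay_val) was 1000 usec
         * but we ran 1200 usec after rc_pacer_started, delay_calc is
         * left at 200 usec of lateness and is folded into
         * rc_hptsi_agg_delay (saturating at 0xffffffff); the early case
         * instead banked the shortfall in rc_agg_early.
         */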
12288         sendwin = min(tp->snd_wnd, tp->snd_cwnd);
12289         if ((tp->snd_una == tp->snd_max) &&
12290             (bbr->rc_bbr_state != BBR_STATE_IDLE_EXIT) &&
12291             (sbavail(sb))) {
12292                 /*
12293                  * OK, we have been idle with nothing outstanding;
12294                  * we possibly need to start fresh with either a new
12295                  * suite of states or a fast ramp-up.
12296                  */
12297                 bbr_restart_after_idle(bbr,
12298                                        cts, bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time));
12299         }
12300         /*
12301          * Now was there a hptsi delay where we are behind? We only count
12302          * being behind if: a) We are not in recovery. b) There was a delay.
12303          * and c) We had room to send something.
12304          *
12305          */
12306         hpts_calling = inp->inp_hpts_calls;
12307         inp->inp_hpts_calls = 0;
12308         if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
12309                 if (bbr_process_timers(tp, bbr, cts, hpts_calling)) {
12310                         counter_u64_add(bbr_out_size[TCP_MSS_ACCT_ATIMER], 1);
12311                         return (0);
12312                 }
12313         }
12314         bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
12315         if (hpts_calling &&
12316             (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
12317                 bbr->r_ctl.rc_last_delay_val = 0;
12318         }
12319         bbr->r_timer_override = 0;
12320         bbr->r_wanted_output = 0;
12321         /*
12322          * For TFO connections in SYN_RECEIVED, only allow the initial
12323          * SYN|ACK and those sent by the retransmit timer.
12324          */
12325         if (IS_FASTOPEN(tp->t_flags) &&
12326             ((tp->t_state == TCPS_SYN_RECEIVED) ||
12327              (tp->t_state == TCPS_SYN_SENT)) &&
12328             SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
12329             (tp->t_rxtshift == 0)) {    /* not a retransmit */
12330                 len = 0;
12331                 goto just_return_nolock;
12332         }
12333         /*
12334          * Before sending anything check for a state update. For hpts
12335          * calling without input this is important. If it's input calling
12336          * then this was already done.
12337          */
12338         if (bbr->rc_use_google == 0)
12339                 bbr_check_bbr_for_state(bbr, cts, __LINE__, 0);
12340 again:
12341         /*
12342          * If we've recently taken a timeout, snd_max will be greater than
12343          * snd_nxt. BBR in general does not pay much attention to snd_nxt;
12344          * for historic reasons the persist timer still uses it. This means
12345          * we have to look at it. All retransmissions that are not persists
12346          * use the rsm that needs to be sent, so snd_nxt is ignored. At the
12347          * end of this routine we always pull snd_nxt up to snd_max.
12348          */
12349         doing_tlp = 0;
12350 #ifdef BBR_INVARIANTS
12351         doing_retran_from = picked_up_retran = 0;
12352 #endif
12353         error = 0;
12354         tso = 0;
12355         slot = 0;
12356         mtu = 0;
12357         sendwin = min(tp->snd_wnd, tp->snd_cwnd);
12358         sb_offset = tp->snd_max - tp->snd_una;
12359         flags = tcp_outflags[tp->t_state];
12360         sack_rxmit = 0;
12361         len = 0;
12362         rsm = NULL;
12363         if (flags & TH_RST) {
12364                 SOCKBUF_LOCK(sb);
12365                 goto send;
12366         }
12367 recheck_resend:
12368         while (bbr->r_ctl.rc_free_cnt < bbr_min_req_free) {
12369                 /* We need to always have one in reserve */
12370                 rsm = bbr_alloc(bbr);
12371                 if (rsm == NULL) {
12372                         error = ENOMEM;
12373                         /* Lie to get on the hpts */
12374                         tot_len = tp->t_maxseg;
12375                         if (hpts_calling)
12376                                 /* Retry in a ms */
12377                                 slot = 1001;
12378                         goto just_return_nolock;
12379                 }
12380                 TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next);
12381                 bbr->r_ctl.rc_free_cnt++;
12382                 rsm = NULL;
12383         }
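        /*
         * A note on the loop above: keeping bbr_min_req_free sendmap
         * entries pre-allocated means the send path below never has to
         * fail mid-send for want of an rsm; on allocation failure we
         * lie about tot_len so the hpts will call us back (in roughly
         * a millisecond when hpts_calling).
         */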
12384         /* What do we send, a resend? */
12385         if (bbr->r_ctl.rc_resend == NULL) {
12386                 /* Check for rack timeout */
12387                 bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts);
12388                 if (bbr->r_ctl.rc_resend) {
12389 #ifdef BBR_INVARIANTS
12390                         picked_up_retran = 1;
12391 #endif
12392                         bbr_cong_signal(tp, NULL, CC_NDUPACK, bbr->r_ctl.rc_resend);
12393                 }
12394         }
12395         if (bbr->r_ctl.rc_resend) {
12396                 rsm = bbr->r_ctl.rc_resend;
12397 #ifdef BBR_INVARIANTS
12398                 doing_retran_from = 1;
12399 #endif
12400                 /* Remove any TLP flags; it's a RACK or T-O */
12401                 rsm->r_flags &= ~BBR_TLP;
12402                 bbr->r_ctl.rc_resend = NULL;
12403                 if (SEQ_LT(rsm->r_start, tp->snd_una)) {
12404 #ifdef BBR_INVARIANTS
12405                         panic("Huh, tp:%p bbr:%p rsm:%p start:%u < snd_una:%u\n",
12406                             tp, bbr, rsm, rsm->r_start, tp->snd_una);
12407                         goto recheck_resend;
12408 #else
12409                         /* TSNH */
12410                         rsm = NULL;
12411                         goto recheck_resend;
12412 #endif
12413                 }
12414                 rtr_cnt++;
12415                 if (rsm->r_flags & BBR_HAS_SYN) {
12416                         /* Only retransmit a SYN by itself */
12417                         len = 0;
12418                         if ((flags & TH_SYN) == 0) {
12419                                 /* Huh something is wrong */
12420                                 rsm->r_start++;
12421                                 if (rsm->r_start == rsm->r_end) {
12422                                         /* Clean it up, somehow we missed the ack? */
12423                                         bbr_log_syn(tp, NULL);
12424                                 } else {
12425                                         /* TFO with data? */
12426                                         rsm->r_flags &= ~BBR_HAS_SYN;
12427                                         len = rsm->r_end - rsm->r_start;
12428                                 }
12429                         } else {
12430                                 /* Retransmitting SYN */
12431                                 rsm = NULL;
12432                                 SOCKBUF_LOCK(sb);
12433                                 goto send;
12434                         }
12435                 } else
12436                         len = rsm->r_end - rsm->r_start;
12437                 if ((bbr->rc_resends_use_tso == 0) &&
12438 #ifdef KERN_TLS
12439                     ((sb->sb_flags & SB_TLS_IFNET) == 0) &&
12440 #endif
12441                     (len > maxseg)) {
12442                         len = maxseg;
12443                         more_to_rxt = 1;
12444                 }
12445                 sb_offset = rsm->r_start - tp->snd_una;
12446                 if (len > 0) {
12447                         sack_rxmit = 1;
12448                         KMOD_TCPSTAT_INC(tcps_sack_rexmits);
12449                         KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes,
12450                             min(len, maxseg));
12451                 } else {
12452                         /* I don't think this can happen */
12453                         rsm = NULL;
12454                         goto recheck_resend;
12455                 }
12456                 BBR_STAT_INC(bbr_resends_set);
12457         } else if (bbr->r_ctl.rc_tlp_send) {
12458                 /*
12459                  * Tail loss probe
12460                  */
12461                 doing_tlp = 1;
12462                 rsm = bbr->r_ctl.rc_tlp_send;
12463                 bbr->r_ctl.rc_tlp_send = NULL;
12464                 sack_rxmit = 1;
12465                 len = rsm->r_end - rsm->r_start;
12466                 rtr_cnt++;
12467                 if ((bbr->rc_resends_use_tso == 0) && (len > maxseg))
12468                         len = maxseg;
12469
12470                 if (SEQ_GT(tp->snd_una, rsm->r_start)) {
12471 #ifdef BBR_INVARIANTS
12472                         panic("tp:%p bbc:%p snd_una:%u rsm:%p r_start:%u",
12473                             tp, bbr, tp->snd_una, rsm, rsm->r_start);
12474 #else
12475                         /* TSNH */
12476                         rsm = NULL;
12477                         goto recheck_resend;
12478 #endif
12479                 }
12480                 sb_offset = rsm->r_start - tp->snd_una;
12481                 BBR_STAT_INC(bbr_tlp_set);
12482         }
12483         /*
12484          * Enforce a connection sendmap count limit if set
12485          * as long as we are not retransmitting.
12486          */
12487         if ((rsm == NULL) &&
12488             (V_tcp_map_entries_limit > 0) &&
12489             (bbr->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
12490                 BBR_STAT_INC(bbr_alloc_limited);
12491                 if (!bbr->alloc_limit_reported) {
12492                         bbr->alloc_limit_reported = 1;
12493                         BBR_STAT_INC(bbr_alloc_limited_conns);
12494                 }
12495                 goto just_return_nolock;
12496         }
12497 #ifdef BBR_INVARIANTS
12498         if (rsm && SEQ_LT(rsm->r_start, tp->snd_una)) {
12499                 panic("tp:%p bbr:%p rsm:%p sb_offset:%u len:%u",
12500                     tp, bbr, rsm, sb_offset, len);
12501         }
12502 #endif
12503         /*
12504          * Get standard flags, and add SYN or FIN if requested by 'hidden'
12505          * state flags.
12506          */
12507         if (tp->t_flags & TF_NEEDFIN && (rsm == NULL))
12508                 flags |= TH_FIN;
12509         if (tp->t_flags & TF_NEEDSYN)
12510                 flags |= TH_SYN;
12511
12512         if (rsm && (rsm->r_flags & BBR_HAS_FIN)) {
12513                 /* we are retransmitting the fin */
12514                 len--;
12515                 if (len) {
12516                         /*
12517                          * When retransmitting data do *not* include the
12518                          * FIN. This could happen from a TLP probe if we
12519                          * allowed data with a FIN.
12520                          */
12521                         flags &= ~TH_FIN;
12522                 }
12523         } else if (rsm) {
12524                 if (flags & TH_FIN)
12525                         flags &= ~TH_FIN;
12526         }
12527         if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
12528                 void *end_rsm;
12529
12530                 end_rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_tmap, bbr_sendmap, r_tnext);
12531                 if (end_rsm)
12532                         kern_prefetch(end_rsm, &prefetch_rsm);
12533                 prefetch_rsm = 1;
12534         }
12535         SOCKBUF_LOCK(sb);
12536         /*
12537          * If snd_nxt == snd_max and we have transmitted a FIN, the
12538          * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
12539          * negative length.  This can also occur when TCP opens up its
12540          * congestion window while receiving additional duplicate acks after
12541          * fast-retransmit because TCP will reset snd_nxt to snd_max after
12542          * the fast-retransmit.
12543          *
12544          * In the normal retransmit-FIN-only case, however, snd_nxt will be
12545          * set to snd_una, the sb_offset will be 0, and the length may wind
12546          * up 0.
12547          *
12548          * If sack_rxmit is true we are retransmitting from the scoreboard
12549          * in which case len is already set.
12550          */
12551         if (sack_rxmit == 0) {
12552                 uint32_t avail;
12553
12554                 avail = sbavail(sb);
12555                 if (SEQ_GT(tp->snd_max, tp->snd_una))
12556                         sb_offset = tp->snd_max - tp->snd_una;
12557                 else
12558                         sb_offset = 0;
12559                 if (bbr->rc_tlp_new_data) {
12560                         /* TLP is forcing out new data */
12561                         uint32_t tlplen;
12562
12563                         doing_tlp = 1;
12564                         tlplen = maxseg;
12565
12566                         if (tlplen > (uint32_t)(avail - sb_offset)) {
12567                                 tlplen = (uint32_t)(avail - sb_offset);
12568                         }
12569                         if (tlplen > tp->snd_wnd) {
12570                                 len = tp->snd_wnd;
12571                         } else {
12572                                 len = tlplen;
12573                         }
12574                         bbr->rc_tlp_new_data = 0;
12575                 } else {
12576                         what_we_can = len = bbr_what_can_we_send(tp, bbr, sendwin, avail, sb_offset, cts);
12577                         if ((len < p_maxseg) &&
12578                             (bbr->rc_in_persist == 0) &&
12579                             (ctf_outstanding(tp) >= (2 * p_maxseg)) &&
12580                             ((avail - sb_offset) >= p_maxseg)) {
12581                                 /*
12582                                  * We are not completing what's in the socket
12583                                  * buffer (i.e. there is at least a segment
12584                                  * waiting to send) and we have 2 or more
12585                                  * segments outstanding. There is no sense
12586                                  * in sending a little piece. Let's defer
12587                                  * and wait until we can send a whole
12588                                  * segment.
12589                                  */
12590                                 len = 0;
12591                         }
12592                         if (bbr->rc_in_persist) {
12593                                 /*
12594                                  * We are in persists; figure out if
12595                                  * a retransmit is available (maybe the previous
12596                                  * persist probe we sent) or if we have to send new
12597                                  * data.
12598                                  */
12599                                 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
12600                                 if (rsm) {
12601                                         len = rsm->r_end - rsm->r_start;
12602                                         if (rsm->r_flags & BBR_HAS_FIN)
12603                                                 len--;
12604                                         if ((bbr->rc_resends_use_tso == 0) && (len > maxseg))
12605                                                 len = maxseg;
12606                                         if (len > 1)
12607                                                 BBR_STAT_INC(bbr_persist_reneg);
12608                                         /*
12609                                          * XXXrrs we could force the len to
12610                                          * 1 byte here to cause the chunk to
12611                                          * split apart, but that would then
12612                                          * mean we always retransmit it as
12613                                          * one byte even after the window
12614                                          * opens.
12615                                          */
12616                                         sack_rxmit = 1;
12617                                         sb_offset = rsm->r_start - tp->snd_una;
12618                                 } else {
12619                                         /*
12620                                          * First time through in persists, or the peer
12621                                          * acked our one byte. We do, though, have
12622                                          * to have something in the sb.
12623                                          */
12624                                         len = 1;
12625                                         sb_offset = 0;
12626                                         if (avail == 0)
12627                                             len = 0;
12628                                 }
12629                         }
12630                 }
12631         }
12632         if (prefetch_so_done == 0) {
12633                 kern_prefetch(so, &prefetch_so_done);
12634                 prefetch_so_done = 1;
12635         }
12636         /*
12637          * Lop off SYN bit if it has already been sent.  However, if this is
12638          * SYN-SENT state and if segment contains data and if we don't know
12639          * that foreign host supports TAO, suppress sending segment.
12640          */
12641         if ((flags & TH_SYN) && (rsm == NULL) &&
12642             SEQ_GT(tp->snd_max, tp->snd_una)) {
12643                 if (tp->t_state != TCPS_SYN_RECEIVED)
12644                         flags &= ~TH_SYN;
12645                 /*
12646                  * When sending additional segments following a TFO SYN|ACK,
12647                  * do not include the SYN bit.
12648                  */
12649                 if (IS_FASTOPEN(tp->t_flags) &&
12650                     (tp->t_state == TCPS_SYN_RECEIVED))
12651                         flags &= ~TH_SYN;
12652                 sb_offset--, len++;
12653                 if (sbavail(sb) == 0)
12654                         len = 0;
12655         } else if ((flags & TH_SYN) && rsm) {
12656                 /*
12657                  * Subtract one from the len for the SYN being
12658                  * retransmitted.
12659                  */
12660                 len--;
12661         }
12662         /*
12663          * Be careful not to send data and/or FIN on SYN segments. This
12664          * measure is needed to prevent interoperability problems with not
12665          * fully conformant TCP implementations.
12666          */
12667         if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
12668                 len = 0;
12669                 flags &= ~TH_FIN;
12670         }
12671         /*
12672          * On TFO sockets, ensure no data is sent in the following cases:
12673          *
12674          *  - When retransmitting SYN|ACK on a passively-created socket
12675          *  - When retransmitting SYN on an actively created socket
12676          *  - When sending a zero-length cookie (cookie request) on an
12677          *    actively created socket
12678          *  - When the socket is in the CLOSED state (RST is being sent)
12679          */
12680         if (IS_FASTOPEN(tp->t_flags) &&
12681             (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
12682              ((tp->t_state == TCPS_SYN_SENT) &&
12683               (tp->t_tfo_client_cookie_len == 0)) ||
12684              (flags & TH_RST))) {
12685                 len = 0;
12686                 sack_rxmit = 0;
12687                 rsm = NULL;
12688         }
12689         /* Without fast-open there should never be data sent on a SYN */
12690         if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags)))
12691                 len = 0;
12692         if (len <= 0) {
12693                 /*
12694                  * If FIN has been sent but not acked, but we haven't been
12695                  * called to retransmit, len will be < 0.  Otherwise, window
12696                  * shrank after we sent into it.  If window shrank to 0,
12697                  * cancel pending retransmit, pull snd_nxt back to (closed)
12698                  * window, and set the persist timer if it isn't already
12699                  * going.  If the window didn't close completely, just wait
12700                  * for an ACK.
12701                  *
12702                  * We also do a general check here to ensure that we will
12703                  * set the persist timer when we have data to send, but a
12704                  * 0-byte window. This makes sure the persist timer is set
12705                  * even if the packet hits one of the "goto send" lines
12706                  * below.
12707                  */
12708                 len = 0;
12709                 if ((tp->snd_wnd == 0) &&
12710                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
12711                     (tp->snd_una == tp->snd_max) &&
12712                     (sb_offset < (int)sbavail(sb))) {
12713                         /*
12714                          * Not enough room in the rwnd to send
12715                          * a paced segment out.
12716                          */
12717                         bbr_enter_persist(tp, bbr, cts, __LINE__);
12718                 }
12719         } else if ((rsm == NULL) &&
12720                    (doing_tlp == 0) &&
12721                    (len < bbr->r_ctl.rc_pace_max_segs)) {
12722                 /*
12723                  * We are not sending a full segment for
12724                  * some reason. Should we not send anything (think
12725                  * sws or persists)?
12726                  */
12727                 if ((tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) &&
12728                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
12729                     (len < (int)(sbavail(sb) - sb_offset))) {
12730                         /*
12731                          * Here the rwnd is less than
12732                          * the pacing size, this is not a retransmit,
12733                          * we are established, and
12734                          * the send is not the last in the socket buffer;
12735                          * let's not send, and possibly enter persists.
12736                          */
12737                         len = 0;
12738                         if (tp->snd_max == tp->snd_una)
12739                                 bbr_enter_persist(tp, bbr, cts, __LINE__);
12740                 } else if ((tp->snd_cwnd >= bbr->r_ctl.rc_pace_max_segs) &&
12741                            (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
12742                                                  bbr->r_ctl.rc_lost_bytes)) > (2 * maxseg)) &&
12743                            (len < (int)(sbavail(sb) - sb_offset)) &&
12744                            (len < bbr_minseg(bbr))) {
12745                         /*
12746                          * Here we are not retransmitting, and
12747                          * the cwnd is not so small that we could
12748                          * not send at least a min size (rxt timer
12749                          * not having gone off). We have 2 segments or
12750                          * more already in flight, it's not the tail end
12751                          * of the socket buffer, and the cwnd is blocking
12752                          * us from sending out a minimum pacing segment size.
12753                          * Let's not send anything.
12754                          */
12755                         bbr->rc_cwnd_limited = 1;
12756                         len = 0;
12757                 } else if (((tp->snd_wnd - ctf_outstanding(tp)) <
12758                             min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) &&
12759                            (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
12760                                                  bbr->r_ctl.rc_lost_bytes)) > (2 * maxseg)) &&
12761                            (len < (int)(sbavail(sb) - sb_offset)) &&
12762                            (TCPS_HAVEESTABLISHED(tp->t_state))) {
12763                         /*
12764                          * Here we have a send window but we have
12765                          * filled it up and we can't send another pacing segment.
12766                          * We also have in flight more than 2 segments
12767                          * and we are not completing the sb, i.e. we allow
12768                          * the last bytes of the sb to go out even if
12769                          * it's not a full pacing segment.
12770                          */
12771                         len = 0;
12772                 }
12773         }
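        /*
         * A worked example of the deferral thresholds above, with
         * hypothetical numbers: with rc_high_rwnd = 65535 and
         * bbr_minseg() returning 1448, the tiny-window test compares
         * snd_wnd against min(32767, 1448) = 1448, so a peer offering
         * less than one minimum segment of window pushes us toward
         * persists instead of dribbling out sub-segment sends.
         */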
12774         /* len will be >= 0 after this point. */
12775         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
12776         tcp_sndbuf_autoscale(tp, so, sendwin);
12777         /*
12778          * Persist-state check: only send a short amount if it is the tail of the sb.
12779          */
12780         if (bbr->rc_in_persist &&
12781             len &&
12782             (rsm == NULL) &&
12783             (len < min((bbr->r_ctl.rc_high_rwnd/2), bbr->r_ctl.rc_pace_max_segs))) {
12784                 /*
12785                  * We are in persist, not doing a retransmit, and don't have enough
12786                  * data yet to send a full TSO burst. Is this the end of the sb?
12787                  * If so we need to send; else nuke len to 0 and don't send.
12788                  */
12789                 int sbleft;
12790                 if (sbavail(sb) > sb_offset)
12791                         sbleft = sbavail(sb) - sb_offset;
12792                 else
12793                         sbleft = 0;
12794                 if (sbleft >= min((bbr->r_ctl.rc_high_rwnd/2), bbr->r_ctl.rc_pace_max_segs)) {
12795                         /* not at end of sb lets not send */
12796                         len = 0;
12797                 }
12798         }
12799         /*
12800          * Decide if we can use TCP Segmentation Offloading (if supported by
12801          * hardware).
12802          *
12803          * TSO may only be used if we are in a pure bulk sending state.  The
12804          * presence of TCP-MD5, SACK retransmits, SACK advertisements and IP
12805          * options prevent using TSO.  With TSO the TCP header is the same
12806          * (except for the sequence number) for all generated packets.  This
12807          * makes it impossible to transmit any options which vary per
12808          * generated segment or packet.
12809          *
12810          * IPv4 handling has a clear separation of ip options and ip header
12811          * flags while IPv6 combines both in in6p_outputopts. ip6_optlen()
12812          * does the right thing below to provide length of just ip options
12813          * and thus checking for ipoptlen is enough to decide if ip options
12814          * are present.
12815          */
12816 #ifdef INET6
12817         if (isipv6)
12818                 ipoptlen = ip6_optlen(inp);
12819         else
12820 #endif
12821         if (inp->inp_options)
12822                 ipoptlen = inp->inp_options->m_len -
12823                     offsetof(struct ipoption, ipopt_list);
12824         else
12825                 ipoptlen = 0;
12826 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
12827         /*
12828          * Pre-calculate here as we save another lookup into the darknesses
12829          * of IPsec that way and can actually decide if TSO is ok.
12830          */
12831 #ifdef INET6
12832         if (isipv6 && IPSEC_ENABLED(ipv6))
12833                 ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp);
12834 #ifdef INET
12835         else
12836 #endif
12837 #endif                          /* INET6 */
12838 #ifdef INET
12839         if (IPSEC_ENABLED(ipv4))
12840                 ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp);
12841 #endif                          /* INET */
12842 #endif                          /* IPSEC */
12843 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
12844         ipoptlen += ipsec_optlen;
12845 #endif
12846         if ((tp->t_flags & TF_TSO) && V_tcp_do_tso &&
12847             (len > maxseg) &&
12848             (tp->t_port == 0) &&
12849             ((tp->t_flags & TF_SIGNATURE) == 0) &&
12850             tp->rcv_numsacks == 0 &&
12851             ipoptlen == 0)
12852                 tso = 1;
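        /*
         * Note on the rcv_numsacks == 0 test above: TSO replicates one
         * TCP header across all generated segments, so any option that
         * would have to vary (or that we must advertise on every
         * segment, like outstanding SACK blocks) rules TSO out, per the
         * comment block above.
         */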
12853
12854         recwin = min(max(sbspace(&so->so_rcv), 0),
12855             TCP_MAXWIN << tp->rcv_scale);
12856         /*
12857          * Sender silly window avoidance.   We transmit under the following
12858          * conditions when len is non-zero:
12859          *
12860          * - We have a full segment (or more with TSO).
12861          * - This is the last buffer in a write()/send() and we are either idle or running NODELAY.
12862          * - We've timed out (e.g. persist timer).
12863          * - We have more than 1/2 the maximum send window's worth of data (the receiver may be limiting the window size).
12864          * - We need to retransmit.
12865          */
12866         if (rsm)
12867                 goto send;
12868         if (len) {
12869                 if (sack_rxmit)
12870                         goto send;
12871                 if (len >= p_maxseg)
12872                         goto send;
12873                 /*
12874                  * NOTE! on localhost connections an 'ack' from the remote
12875                  * end may occur synchronously with the output and cause us
12876                  * to flush a buffer queued with moretocome.  XXX
12877                  *
12878                  */
12879                 if (((tp->t_flags & TF_MORETOCOME) == 0) &&     /* normal case */
12880                     ((tp->t_flags & TF_NODELAY) ||
12881                     ((uint32_t)len + (uint32_t)sb_offset) >= sbavail(&so->so_snd)) &&
12882                     (tp->t_flags & TF_NOPUSH) == 0) {
12883                         goto send;
12884                 }
12885                 if ((tp->snd_una == tp->snd_max) && len) {      /* Nothing outstanding */
12886                         goto send;
12887                 }
12888                 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
12889                         goto send;
12890                 }
12891         }
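        /*
         * Example of the decision above, with hypothetical numbers:
         * with p_maxseg = 1448 and only 500 bytes queued, we fall
         * through to send only if those 500 bytes are the tail of the
         * write (or TF_NODELAY is set, with TF_MORETOCOME/TF_NOPUSH
         * clear), if nothing is outstanding, or if 500 >= max_sndwnd/2;
         * otherwise we hold the data until a full segment or an ACK
         * arrives.
         */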
12892         /*
12893          * Sending of standalone window updates.
12894          *
12895          * Window updates are important when we close our window due to a
12896          * full socket buffer and are opening it again after the application
12897          * reads data from it.  Once the window has opened again and the
12898          * remote end starts to send again the ACK clock takes over and
12899          * provides the most current window information.
12900          *
12901          * We must avoid the silly window syndrome, whereby every read from
12902          * the receive buffer, no matter how small, causes a window update
12903          * to be sent.  We also should avoid sending a flurry of window
12904          * updates when the socket buffer had queued a lot of data and the
12905          * application is doing small reads.
12906          *
12907          * Prevent a flurry of pointless window updates by only sending an
12908          * update when we can increase the advertised window by more than
12909          * 1/4th of the socket buffer capacity.  When the buffer is getting
12910          * full or is very small be more aggressive and send an update
12911          * whenever we can increase by two mss sized segments. In all other
12912          * situations the ACK's to new incoming data will carry further
12913          * window increases.
12914          *
12915          * Don't send an independent window update if a delayed ACK is
12916          * pending (it will get piggy-backed on it) or the remote side
12917          * already has done a half-close and won't send more data.  Skip
12918          * this if the connection is in T/TCP half-open state.
12919          */
12920         if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
12921             !(tp->t_flags & TF_DELACK) &&
12922             !TCPS_HAVERCVDFIN(tp->t_state)) {
12923                 /* Check to see if we should do a window update */
12924                 if (bbr_window_update_needed(tp, so, recwin, maxseg))
12925                         goto send;
12926         }
12927         /*
12928          * Send if we owe the peer an ACK, RST, SYN.  ACKNOW
12929          * is also a catch-all for the retransmit timer timeout case.
12930          */
12931         if (tp->t_flags & TF_ACKNOW) {
12932                 goto send;
12933         }
12934         if (flags & TH_RST) {
12935                 /* Always send a RST if one is due */
12936                 goto send;
12937         }
12938         if ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0) {
12939                 goto send;
12940         }
12941         /*
12942          * If our state indicates that FIN should be sent and we have not
12943          * yet done so, then we need to send.
12944          */
12945         if (flags & TH_FIN &&
12946             ((tp->t_flags & TF_SENTFIN) == 0)) {
12947                 goto send;
12948         }
12949         /*
12950          * No reason to send a segment, just return.
12951          */
12952 just_return:
12953         SOCKBUF_UNLOCK(sb);
12954 just_return_nolock:
12955         if (tot_len)
12956                 slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0);
12957         if (bbr->rc_no_pacing)
12958                 slot = 0;
12959         if (tot_len == 0) {
12960                 if ((ctf_outstanding(tp) + min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) >=
12961                     tp->snd_wnd) {
12962                         BBR_STAT_INC(bbr_rwnd_limited);
12963                         app_limited = BBR_JR_RWND_LIMITED;
12964                         bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp));
12965                         if ((bbr->rc_in_persist == 0) &&
12966                             TCPS_HAVEESTABLISHED(tp->t_state) &&
12967                             (tp->snd_max == tp->snd_una) &&
12968                             sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
12969                                 /* No send window.. we must enter persist */
12970                                 bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
12971                         }
12972                 } else if (ctf_outstanding(tp) >= sbavail(sb)) {
12973                         BBR_STAT_INC(bbr_app_limited);
12974                         app_limited = BBR_JR_APP_LIMITED;
12975                         bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp));
12976                 } else if ((ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
12977                                                  bbr->r_ctl.rc_lost_bytes)) + p_maxseg) >= tp->snd_cwnd) {
12978                         BBR_STAT_INC(bbr_cwnd_limited);
12979                         app_limited = BBR_JR_CWND_LIMITED;
12980                         bbr_cwnd_limiting(tp, bbr, ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
12981                                                                         bbr->r_ctl.rc_lost_bytes)));
12982                         bbr->rc_cwnd_limited = 1;
12983                 } else {
12984                         BBR_STAT_INC(bbr_app_limited);
12985                         app_limited = BBR_JR_APP_LIMITED;
12986                         bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp));
12987                 }
12988                 bbr->r_ctl.rc_hptsi_agg_delay = 0;
12989                 bbr->r_agg_early_set = 0;
12990                 bbr->r_ctl.rc_agg_early = 0;
12991                 bbr->r_ctl.rc_last_delay_val = 0;
12992         } else if (bbr->rc_use_google == 0)
12993                 bbr_check_bbr_for_state(bbr, cts, __LINE__, 0);
12994         /* Are we app limited? */
12995         if ((app_limited == BBR_JR_APP_LIMITED) ||
12996             (app_limited == BBR_JR_RWND_LIMITED)) {
12997                 /**
12998                  * We are application limited.
12999                  */
13000                 bbr->r_ctl.r_app_limited_until = (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
13001                                                                        bbr->r_ctl.rc_lost_bytes)) + bbr->r_ctl.rc_delivered);
13002         }
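        /*
         * Sketch of the accounting above: r_app_limited_until is the
         * delivered-byte count at which the app-limited period ends;
         * everything now in flight must be delivered before samples
         * stop being marked app-limited. E.g., with hypothetical
         * numbers, 10000 bytes in flight and rc_delivered = 500000
         * gives r_app_limited_until = 510000.
         */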
13003         if (tot_len == 0)
13004                 counter_u64_add(bbr_out_size[TCP_MSS_ACCT_JUSTRET], 1);
13005         /* Don't update the time if we did not send */
13006         bbr->r_ctl.rc_last_delay_val = 0;
13007         bbr->rc_output_starts_timer = 1;
13008         bbr_start_hpts_timer(bbr, tp, cts, 9, slot, tot_len);
13009         bbr_log_type_just_return(bbr, cts, tot_len, hpts_calling, app_limited, p_maxseg, len);
13010         if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
13011                 /* Make sure snd_nxt is drug up */
13012                 tp->snd_nxt = tp->snd_max;
13013         }
13014         return (error);
13015
13016 send:
13017         if (doing_tlp == 0) {
13018                 /*
13019                  * Data not a TLP, and it's not the rxt firing. If it is the
13020                  * rxt firing, we want to leave the tlp_in_progress flag on
13021                  * so we don't send another TLP. It has to be a rack timer
13022                  * or normal send (response to acked data) to clear the tlp
13023                  * in progress flag.
13024                  */
13025                 bbr->rc_tlp_in_progress = 0;
13026                 bbr->rc_tlp_rtx_out = 0;
13027         } else {
13028                 /*
13029                  * It's a TLP.
13030                  */
13031                 bbr->rc_tlp_in_progress = 1;
13032         }
13033         bbr_timer_cancel(bbr, __LINE__, cts);
13034         if (rsm == NULL) {
13035                 if (sbused(sb) > 0) {
13036                         /*
13037                          * This is sub-optimal. We only send a standalone
13038                          * FIN on its own segment.
13039                          */
13040                         if (flags & TH_FIN) {
13041                                 flags &= ~TH_FIN;
13042                                 if ((len == 0) && ((tp->t_flags & TF_ACKNOW) == 0)) {
13043                                         /* Lets not send this */
13044                                         slot = 0;
13045                                         goto just_return;
13046                                 }
13047                         }
13048                 }
13049         } else {
13050                 /*
13051                  * We do *not* send a FIN on a retransmit if it has data.
13052                  * The if clause here where len > 0 should never come true.
13053                  */
13054                 if ((len > 0) &&
13055                     (((rsm->r_flags & BBR_HAS_FIN) == 0) &&
13056                     (flags & TH_FIN))) {
13057                         flags &= ~TH_FIN;
13058                         len--;
13059                 }
13060         }
13061         SOCKBUF_LOCK_ASSERT(sb);
13062         if (len > 0) {
13063                 if ((tp->snd_una == tp->snd_max) &&
13064                     (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) {
13065                         /*
13066                          * This qualifies as a RTT_PROBE session since we
13067                          * dropped the data outstanding to nothing and waited
13068                          * more than bbr_rtt_probe_time.
13069                          */
13070                         bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0);
13071                         bbr_set_reduced_rtt(bbr, cts, __LINE__);
13072                 }
13073                 if (len >= maxseg)
13074                         tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
13075                 else
13076                         tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
13077         }
13078         /*
13079          * Before ESTABLISHED, force sending of initial options unless TCP
13080          * set not to do any options. NOTE: we assume that the IP/TCP header
13081          * plus TCP options always fit in a single mbuf, leaving room for a
13082          * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
13083          * + optlen <= MCLBYTES
13084          */
13085         optlen = 0;
13086 #ifdef INET6
13087         if (isipv6)
13088                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
13089         else
13090 #endif
13091                 hdrlen = sizeof(struct tcpiphdr);
13092
13093         /*
13094          * Compute options for segment. We only have to care about SYN and
13095          * established connection segments.  Options for SYN-ACK segments
13096          * are handled in TCP syncache.
13097          */
13098         to.to_flags = 0;
13099         local_options = 0;
13100         if ((tp->t_flags & TF_NOOPT) == 0) {
13101                 /* Maximum segment size. */
13102                 if (flags & TH_SYN) {
13103                         to.to_mss = tcp_mssopt(&inp->inp_inc);
13104 #ifdef NETFLIX_TCPOUDP
13105                         if (tp->t_port)
13106                                 to.to_mss -= V_tcp_udp_tunneling_overhead;
13107 #endif
13108                         to.to_flags |= TOF_MSS;
13109                         /*
13110                          * On SYN or SYN|ACK transmits on TFO connections,
13111                          * only include the TFO option if it is not a
13112                          * retransmit, as the presence of the TFO option may
13113                          * have caused the original SYN or SYN|ACK to have
13114                          * been dropped by a middlebox.
13115                          */
13116                         if (IS_FASTOPEN(tp->t_flags) &&
13117                             (tp->t_rxtshift == 0)) {
13118                                 if (tp->t_state == TCPS_SYN_RECEIVED) {
13119                                         to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
13120                                         to.to_tfo_cookie =
13121                                             (u_int8_t *)&tp->t_tfo_cookie.server;
13122                                         to.to_flags |= TOF_FASTOPEN;
13123                                         wanted_cookie = 1;
13124                                 } else if (tp->t_state == TCPS_SYN_SENT) {
13125                                         to.to_tfo_len =
13126                                             tp->t_tfo_client_cookie_len;
13127                                         to.to_tfo_cookie =
13128                                             tp->t_tfo_cookie.client;
13129                                         to.to_flags |= TOF_FASTOPEN;
13130                                         wanted_cookie = 1;
13131                                 }
13132                         }
13133                 }
13134                 /* Window scaling. */
13135                 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
13136                         to.to_wscale = tp->request_r_scale;
13137                         to.to_flags |= TOF_SCALE;
13138                 }
13139                 /* Timestamps. */
13140                 if ((tp->t_flags & TF_RCVD_TSTMP) ||
13141                     ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
13142                         to.to_tsval = tcp_tv_to_mssectick(&bbr->rc_tv) + tp->ts_offset;
13143                         to.to_tsecr = tp->ts_recent;
13144                         to.to_flags |= TOF_TS;
13145                         local_options += TCPOLEN_TIMESTAMP + 2;
13146                 }
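                /*
                 * TCPOLEN_TIMESTAMP above is 10 bytes; the extra 2 covers
                 * the NOP padding that keeps the option list 4-byte
                 * aligned, so a timestamped segment carries 12 option
                 * bytes in all.
                 */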
13147                 /* Set receive buffer autosizing timestamp. */
13148                 if (tp->rfbuf_ts == 0 &&
13149                     (so->so_rcv.sb_flags & SB_AUTOSIZE))
13150                         tp->rfbuf_ts = tcp_tv_to_mssectick(&bbr->rc_tv);
13151                 /* Selective ACK's. */
13152                 if (flags & TH_SYN)
13153                         to.to_flags |= TOF_SACKPERM;
13154                 else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
13155                     tp->rcv_numsacks > 0) {
13156                         to.to_flags |= TOF_SACK;
13157                         to.to_nsacks = tp->rcv_numsacks;
13158                         to.to_sacks = (u_char *)tp->sackblks;
13159                 }
13160 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
13161                 /* TCP-MD5 (RFC2385). */
13162                 if (tp->t_flags & TF_SIGNATURE)
13163                         to.to_flags |= TOF_SIGNATURE;
13164 #endif                          /* TCP_SIGNATURE */
13165
13166                 /* Processing the options. */
13167                 hdrlen += (optlen = tcp_addoptions(&to, opt));
13168                 /*
13169                  * If we wanted a TFO option to be added, but it was unable
13170                  * to fit, ensure no data is sent.
13171                  */
13172                 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
13173                     !(to.to_flags & TOF_FASTOPEN))
13174                         len = 0;
13175         }
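        /*
         * tcp_addoptions() wrote the selected options into opt[] and
         * returned their total length, padded to a multiple of 4 and
         * bounded by the 40 option bytes a TCP header can carry; hdrlen
         * now covers the fixed headers plus those options.
         */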
13176 #ifdef NETFLIX_TCPOUDP
13177         if (tp->t_port) {
13178                 if (V_tcp_udp_tunneling_port == 0) {
13179                         /* The port was removed?? */
13180                         SOCKBUF_UNLOCK(&so->so_snd);
13181                         return (EHOSTUNREACH);
13182                 }
13183                 hdrlen += sizeof(struct udphdr);
13184         }
13185 #endif
13186 #ifdef INET6
13187         if (isipv6)
13188                 ipoptlen = ip6_optlen(tp->t_inpcb);
13189         else
13190 #endif
13191         if (tp->t_inpcb->inp_options)
13192                 ipoptlen = tp->t_inpcb->inp_options->m_len -
13193                     offsetof(struct ipoption, ipopt_list);
13194         else
13195                 ipoptlen = 0;
13196         ipoptlen = 0;
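        /*
         * Note that this unconditional reset discards the IP option length
         * computed just above, so IP options are effectively ignored in
         * the length math that follows; only the IPsec overhead, when
         * present, is added back in below.
         */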
13197 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
13198         ipoptlen += ipsec_optlen;
13199 #endif
13200         if (bbr->rc_last_options != local_options) {
13201                 /*
13202                  * Cache the options length; it generally does not change
13203                  * on a connection. We use this to calculate TSO.
13204                  */
13205                 bbr->rc_last_options = local_options;
13206         }
13207         maxseg = tp->t_maxseg - (ipoptlen + optlen);
13208         p_maxseg = min(maxseg, pace_max_segs);
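        /*
         * For example (illustrative values): with a 1460-byte t_maxseg and
         * 12 bytes of timestamp options, maxseg is 1448 payload bytes, and
         * p_maxseg is that value clipped to the pacer's per-send maximum.
         */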
13209         /*
13210          * Adjust data length if insertion of options will bump the packet
13211          * length beyond the t_maxseg length. Clear the FIN bit because we
13212          * cut off the tail of the segment.
13213          */
13214 #ifdef KERN_TLS
13215         /* Force TSO on so that TLS offload can get the MSS. */
13216         if (sb->sb_flags & SB_TLS_IFNET) {
13217                 force_tso = 1;
13218         }
13219 #endif
13220
13221         if (len > maxseg) {
13222                 if (len != 0 && (flags & TH_FIN)) {
13223                         flags &= ~TH_FIN;
13224                 }
13225                 if (tso) {
13226                         uint32_t moff;
13227                         int32_t max_len;
13228
13229                         /* extract TSO information */
13230                         if_hw_tsomax = tp->t_tsomax;
13231                         if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
13232                         if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
13233                         KASSERT(ipoptlen == 0,
13234                             ("%s: TSO can't do IP options", __func__));
13235
13236                         /*
13237                          * Check if we should limit by maximum payload
13238                          * length:
13239                          */
13240                         if (if_hw_tsomax != 0) {
13241                                 /* compute maximum TSO length */
13242                                 max_len = (if_hw_tsomax - hdrlen -
13243                                     max_linkhdr);
13244                                 if (max_len <= 0) {
13245                                         len = 0;
13246                                 } else if (len > max_len) {
13247                                         len = max_len;
13248                                 }
13249                         }
13250                         /*
13251                          * Prevent the last segment from being fractional
13252                          * unless the send sockbuf can be emptied:
13253                          */
13254                         if (((sb_offset + len) < sbavail(sb)) &&
13255                             (hw_tls == 0)) {
13256                                 moff = len % (uint32_t)maxseg;
13257                                 if (moff != 0) {
13258                                         len -= moff;
13259                                 }
13260                         }
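                        /*
                         * E.g., for the trim above (illustrative values):
                         * with maxseg = 1448 and len = 10000, moff =
                         * 10000 % 1448 = 1312, so len drops to 8688,
                         * exactly six full segments; the 1312-byte tail
                         * waits for a later send.
                         */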
13261                         /*
13262                          * In case there are too many small fragments don't
13263                          * use TSO:
13264                          */
13265                         if (len <= maxseg) {
13266                                 len = maxseg;
13267                                 tso = 0;
13268                         }
13269                 } else {
13270                         /* Not doing TSO */
13271                         if (optlen + ipoptlen >= tp->t_maxseg) {
13272                                 /*
13273                                  * Since we don't have enough space to put
13274                                  * the IP header chain and the TCP header in
13275                                  * one packet as required by RFC 7112, don't
13276                                  * send it. Also ensure that at least one
13277                                  * byte of the payload can be put into the
13278                                  * TCP segment.
13279                                  */
13280                                 SOCKBUF_UNLOCK(&so->so_snd);
13281                                 error = EMSGSIZE;
13282                                 sack_rxmit = 0;
13283                                 goto out;
13284                         }
13285                         len = maxseg;
13286                 }
13287         } else {
13288                 /* Not doing TSO */
13289                 if_hw_tsomaxsegcount = 0;
13290                 tso = 0;
13291         }
13292         KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
13293             ("%s: len > IP_MAXPACKET", __func__));
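        /*
         * IP_MAXPACKET is 65535, the largest value the 16-bit IP length
         * field can describe; anything bigger could never be sent as one
         * packet.
         */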
13294 #ifdef DIAGNOSTIC
13295 #ifdef INET6
13296         if (max_linkhdr + hdrlen > MCLBYTES)
13297 #else
13298         if (max_linkhdr + hdrlen > MHLEN)
13299 #endif
13300                 panic("tcphdr too big");
13301 #endif
13302         /*
13303          * This KASSERT is here to catch edge cases at a well defined place.
13304          * Before, those had triggered (random) panic conditions further
13305          * down.
13306          */
13307 #ifdef BBR_INVARIANTS
13308         if (sack_rxmit) {
13309                 if (SEQ_LT(rsm->r_start, tp->snd_una)) {
13310                         panic("RSM:%p TP:%p bbr:%p start:%u is < snd_una:%u",
13311                             rsm, tp, bbr, rsm->r_start, tp->snd_una);
13312                 }
13313         }
13314 #endif
13315         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
13316         if ((len == 0) &&
13317             (flags & TH_FIN) &&
13318             (sbused(sb))) {
13319                 /*
13320                  * We have outstanding data; don't send a FIN by itself!
13321                  */
13322                 slot = 0;
13323                 goto just_return;
13324         }
13325         /*
13326          * Grab a header mbuf, attaching a copy of data to be transmitted,
13327          * and initialize the header from the template for sends on this
13328          * connection.
13329          */
13330         if (len) {
13331                 uint32_t moff;
13332                 uint32_t orig_len;
13333
13334                 /*
13335                  * We place a limit on sending with hptsi.
13336                  */
13337                 if ((rsm == NULL) && len > pace_max_segs)
13338                         len = pace_max_segs;
13339                 if (len <= maxseg)
13340                         tso = 0;
13341 #ifdef INET6
13342                 if (MHLEN < hdrlen + max_linkhdr)
13343                         m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
13344                 else
13345 #endif
13346                         m = m_gethdr(M_NOWAIT, MT_DATA);
13347
13348                 if (m == NULL) {
13349                         BBR_STAT_INC(bbr_failed_mbuf_aloc);
13350                         bbr_log_enobuf_jmp(bbr, len, cts, __LINE__, len, 0, 0);
13351                         SOCKBUF_UNLOCK(sb);
13352                         error = ENOBUFS;
13353                         sack_rxmit = 0;
13354                         goto out;
13355                 }
13356                 m->m_data += max_linkhdr;
13357                 m->m_len = hdrlen;
13358                 /*
13359                  * Start the m_copy functions from the closest mbuf to the
13360                  * sb_offset in the socket buffer chain.
13361                  */
13362                 if ((sb_offset > sbavail(sb)) || ((len + sb_offset) > sbavail(sb))) {
13363 #ifdef BBR_INVARIANTS
13364                         if ((len + sb_offset) > (sbavail(sb) + ((flags & (TH_FIN | TH_SYN)) ? 1 : 0)))
13365                                 panic("tp:%p bbr:%p len:%u sb_offset:%u sbavail:%u rsm:%p %u:%u:%u",
13366                                     tp, bbr, len, sb_offset, sbavail(sb), rsm,
13367                                     doing_retran_from,
13368                                     picked_up_retran,
13369                                     doing_tlp);
13370
13371 #endif
13372                         /*
13373                          * In this messed up situation we have three choices:
13374                          * a) pretend the send worked and just start timers
13375                          * and whatnot (not good, since that may lead us
13376                          * back here a lot); <or> b) send the lowest segment
13377                          * in the map; <or> c) drop the connection. Let's do
13378                          * <b>, which, if it continues to happen, will lead to
13379                          * <c> via timeouts.
13380                          */
13381                         BBR_STAT_INC(bbr_offset_recovery);
13382                         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
13383                         sb_offset = 0;
13384                         if (rsm == NULL) {
13385                                 sack_rxmit = 0;
13386                                 len = sbavail(sb);
13387                         } else {
13388                                 sack_rxmit = 1;
13389                                 if (rsm->r_start != tp->snd_una) {
13390                                         /*
13391                                          * Things are really messed up, <c>
13392                                          * is the only thing to do.
13393                                          */
13394                                         BBR_STAT_INC(bbr_offset_drop);
13395                                         tcp_set_inp_to_drop(inp, EFAULT);
13396                                         return (0);
13397                                 }
13398                                 len = rsm->r_end - rsm->r_start;
13399                         }
13400                         if (len > sbavail(sb))
13401                                 len = sbavail(sb);
13402                         if (len > maxseg)
13403                                 len = maxseg;
13404                 }
13405                 mb = sbsndptr_noadv(sb, sb_offset, &moff);
13406                 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
13407                         m_copydata(mb, moff, (int)len,
13408                             mtod(m, caddr_t)+hdrlen);
13409                         if (rsm == NULL)
13410                                 sbsndptr_adv(sb, mb, len);
13411                         m->m_len += len;
13412                 } else {
13413                         struct sockbuf *msb;
13414
13415                         if (rsm)
13416                                 msb = NULL;
13417                         else
13418                                 msb = sb;
13419 #ifdef BBR_INVARIANTS
13420                         if ((len + moff) > (sbavail(sb) + ((flags & (TH_FIN | TH_SYN)) ? 1 : 0))) {
13421                                 if (rsm) {
13422                                         panic("tp:%p bbr:%p len:%u moff:%u sbavail:%u rsm:%p snd_una:%u rsm_start:%u flg:%x %u:%u:%u sr:%d ",
13423                                             tp, bbr, len, moff,
13424                                             sbavail(sb), rsm,
13425                                             tp->snd_una, rsm->r_flags, rsm->r_start,
13426                                             doing_retran_from,
13427                                             picked_up_retran,
13428                                             doing_tlp, sack_rxmit);
13429                                 } else {
13430                                         panic("tp:%p bbr:%p len:%u moff:%u sbavail:%u sb_offset:%u snd_una:%u",
13431                                             tp, bbr, len, moff, sbavail(sb), sb_offset, tp->snd_una);
13432                                 }
13433                         }
13434 #endif
13435                         orig_len = len;
13436                         m->m_next = tcp_m_copym(
13437                                 mb, moff, &len,
13438                                 if_hw_tsomaxsegcount,
13439                                 if_hw_tsomaxsegsize, msb,
13440                                 ((rsm == NULL) ? hw_tls : 0)
13441 #ifdef NETFLIX_COPY_ARGS
13442                                 , &filled_all
13443 #endif
13444                                 );
13445                         if (len <= maxseg && !force_tso) {
13446                                 /*
13447                          * We must have run out of mbufs for the copy;
13448                          * shorten it so TSO is no longer needed. Let's
13449                          * not set sendalot, since we are low on
13450                          * mbufs.
13451                                  */
13452                                 tso = 0;
13453                         }
13454                         if (m->m_next == NULL) {
13455                                 SOCKBUF_UNLOCK(sb);
13456                                 (void)m_free(m);
13457                                 error = ENOBUFS;
13458                                 sack_rxmit = 0;
13459                                 goto out;
13460                         }
13461                 }
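                /*
                 * To recap the two copy paths above: a payload small
                 * enough to share the header mbuf was copied inline with
                 * m_copydata(); anything larger was chained on via
                 * tcp_m_copym(), which can reference the socket-buffer
                 * data rather than duplicating it.
                 */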
13462 #ifdef BBR_INVARIANTS
13463                 if (tso && len < maxseg) {
13464                         panic("tp:%p tso on, but len:%d < maxseg:%d",
13465                             tp, len, maxseg);
13466                 }
13467                 if (tso && if_hw_tsomaxsegcount) {
13468                         int32_t seg_cnt = 0;
13469                         struct mbuf *foo;
13470
13471                         foo = m;
13472                         while (foo) {
13473                                 seg_cnt++;
13474                                 foo = foo->m_next;
13475                         }
13476                         if (seg_cnt > if_hw_tsomaxsegcount) {
13477                                 panic("seg_cnt:%d > max:%d", seg_cnt, if_hw_tsomaxsegcount);
13478                         }
13479                 }
13480 #endif
13481                 /*
13482                  * If we're sending everything we've got, set PUSH. (This
13483                  * will keep happy those implementations which only give
13484                  * data to the user when a buffer fills or a PUSH comes in.)
13485                  */
13486                 if (sb_offset + len == sbused(sb) &&
13487                     sbused(sb) &&
13488                     !(flags & TH_SYN)) {
13489                         flags |= TH_PUSH;
13490                 }
13491                 SOCKBUF_UNLOCK(sb);
13492         } else {
13493                 SOCKBUF_UNLOCK(sb);
13494                 if (tp->t_flags & TF_ACKNOW)
13495                         KMOD_TCPSTAT_INC(tcps_sndacks);
13496                 else if (flags & (TH_SYN | TH_FIN | TH_RST))
13497                         KMOD_TCPSTAT_INC(tcps_sndctrl);
13498                 else
13499                         KMOD_TCPSTAT_INC(tcps_sndwinup);
13500
13501                 m = m_gethdr(M_NOWAIT, MT_DATA);
13502                 if (m == NULL) {
13503                         BBR_STAT_INC(bbr_failed_mbuf_aloc);
13504                         bbr_log_enobuf_jmp(bbr, len, cts, __LINE__, len, 0, 0);
13505                         error = ENOBUFS;
13506                         /* Fudge the send time since we could not send */
13507                         sack_rxmit = 0;
13508                         goto out;
13509                 }
13510 #ifdef INET6
13511                 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
13512                     MHLEN >= hdrlen) {
13513                         M_ALIGN(m, hdrlen);
13514                 } else
13515 #endif
13516                         m->m_data += max_linkhdr;
13517                 m->m_len = hdrlen;
13518         }
13519         SOCKBUF_UNLOCK_ASSERT(sb);
13520         m->m_pkthdr.rcvif = (struct ifnet *)0;
13521 #ifdef MAC
13522         mac_inpcb_create_mbuf(inp, m);
13523 #endif
13524 #ifdef INET6
13525         if (isipv6) {
13526                 ip6 = mtod(m, struct ip6_hdr *);
13527 #ifdef NETFLIX_TCPOUDP
13528                 if (tp->t_port) {
13529                         udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
13530                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
13531                         udp->uh_dport = tp->t_port;
13532                         ulen = hdrlen + len - sizeof(struct ip6_hdr);
13533                         udp->uh_ulen = htons(ulen);
13534                         th = (struct tcphdr *)(udp + 1);
13535                 } else {
13536 #endif
13537                         th = (struct tcphdr *)(ip6 + 1);
13538
13539 #ifdef NETFLIX_TCPOUDP
13540                 }
13541 #endif
13542                 tcpip_fillheaders(inp,
13543 #ifdef NETFLIX_TCPOUDP
13544                                   tp->t_port,
13545 #endif
13546                                   ip6, th);
13547         } else
13548 #endif                          /* INET6 */
13549         {
13550                 ip = mtod(m, struct ip *);
13551 #ifdef TCPDEBUG
13552                 ipov = (struct ipovly *)ip;
13553 #endif
13554 #ifdef NETFLIX_TCPOUDP
13555                 if (tp->t_port) {
13556                         udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
13557                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
13558                         udp->uh_dport = tp->t_port;
13559                         ulen = hdrlen + len - sizeof(struct ip);
13560                         udp->uh_ulen = htons(ulen);
13561                         th = (struct tcphdr *)(udp + 1);
13562                 } else
13563 #endif
13564                         th = (struct tcphdr *)(ip + 1);
13565                 tcpip_fillheaders(inp,
13566 #ifdef NETFLIX_TCPOUDP
13567                                   tp->t_port,
13568 #endif
13569                                   ip, th);
13570         }
13571         /*
13572          * If we are doing retransmissions, then snd_nxt will not reflect
13573          * the first unsent octet.  For ACK only packets, we do not want the
13574          * sequence number of the retransmitted packet, we want the sequence
13575          * number of the next unsent octet.  So, if there is no data (and no
13576          * SYN or FIN), use snd_max instead of snd_nxt when filling in
13577          * ti_seq.  But if we are in persist state, snd_max might reflect
13578          * one byte beyond the right edge of the window, so use snd_nxt in
13579          * that case, since we know we aren't doing a retransmission.
13580          * (retransmit and persist are mutually exclusive...)
13581          */
13582         if (sack_rxmit == 0) {
13583                 if (len && ((flags & (TH_FIN | TH_SYN | TH_RST)) == 0)) {
13584                         /* New data (including new persists) */
13585                         th->th_seq = htonl(tp->snd_max);
13586                         bbr_seq = tp->snd_max;
13587                 } else if (flags & TH_SYN) {
13588                         /* Syn's always send from iss */
13589                         th->th_seq = htonl(tp->iss);
13590                         bbr_seq = tp->iss;
13591                 } else if (flags & TH_FIN) {
13592                         if (flags & TH_FIN && tp->t_flags & TF_SENTFIN) {
13593                                 /*
13594                                  * If we already sent the FIN, the sequence
13595                                  * is snd_max - 1.
13596                                  */
13597                                 th->th_seq = (htonl(tp->snd_max - 1));
13598                                 bbr_seq = (tp->snd_max - 1);
13599                         } else {
13600                                 /* First time FIN use snd_max */
13601                                 th->th_seq = htonl(tp->snd_max);
13602                                 bbr_seq = tp->snd_max;
13603                         }
13604                 } else if (flags & TH_RST) {
13605                         /*
13606                          * For a reset, send the last cumulative ack in sequence
13607                          * (this, like any other choice, may still generate a
13608                          * challenge ack if an ack-update packet is in
13609                          * flight).
13610                          */
13611                         th->th_seq = htonl(tp->snd_una);
13612                         bbr_seq = tp->snd_una;
13613                 } else {
13614                         /*
13615                          * len == 0 and not in persist: we use snd_max when
13616                          * sending an ack, unless we have sent the FIN, in
13617                          * which case it is snd_max - 1.
13618                          */
13619                         /*
13620                          * XXXRRS Question: if we are in persist and we have
13621                          * nothing outstanding to send and we have not sent
13622                          * a FIN, we will send an ACK. In such a case it
13623                          * might be better to send (tp->snd_una - 1) which
13624                          * would force the peer to ack.
13625                          */
13626                         if (tp->t_flags & TF_SENTFIN) {
13627                                 th->th_seq = htonl(tp->snd_max - 1);
13628                                 bbr_seq = (tp->snd_max - 1);
13629                         } else {
13630                                 th->th_seq = htonl(tp->snd_max);
13631                                 bbr_seq = tp->snd_max;
13632                         }
13633                 }
13634         } else {
13635                 /* All retransmits use the rsm to guide the send */
13636                 th->th_seq = htonl(rsm->r_start);
13637                 bbr_seq = rsm->r_start;
13638         }
13639         th->th_ack = htonl(tp->rcv_nxt);
13640         if (optlen) {
13641                 bcopy(opt, th + 1, optlen);
13642                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
13643         }
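        /*
         * th_off is expressed in 32-bit words; e.g., assuming 12 option
         * bytes, (20 + 12) >> 2 = 8.
         */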
13644         th->th_flags = flags;
13645         /*
13646          * Calculate receive window.  Don't shrink window, but avoid silly
13647          * window syndrome.
13648          */
13649         if ((flags & TH_RST) || ((recwin < (so->so_rcv.sb_hiwat / 4) &&
13650                                   recwin < maxseg)))
13651                 recwin = 0;
13652         if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
13653             recwin < (tp->rcv_adv - tp->rcv_nxt))
13654                 recwin = (tp->rcv_adv - tp->rcv_nxt);
13655         if (recwin > TCP_MAXWIN << tp->rcv_scale)
13656                 recwin = TCP_MAXWIN << tp->rcv_scale;
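        /*
         * TCP_MAXWIN is 65535, so with the negotiated shift rcv_scale the
         * advertised window can represent at most 65535 << rcv_scale
         * bytes (about 1 GB at the maximum scale of 14).
         */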
13657
13658         /*
13659          * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
13660          * <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK> case is
13661          * handled in syncache.
13662          */
13663         if (flags & TH_SYN)
13664                 th->th_win = htons((u_short)
13665                     (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
13666         else {
13667                 /* Avoid shrinking window with window scaling. */
13668                 recwin = roundup2(recwin, 1 << tp->rcv_scale);
13669                 th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
13670         }
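        /*
         * E.g., with recwin = 100000 and rcv_scale = 7 (illustrative
         * values): roundup2() lifts recwin to the next multiple of 128,
         * 100096, and th_win becomes 100096 >> 7 = 782, so no window
         * space is lost to the scale-factor truncation.
         */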
13671         /*
13672          * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
13673          * window.  This may cause the remote transmitter to stall.  This
13674          * flag tells soreceive() to disable delayed acknowledgements when
13675          * draining the buffer.  This can occur if the receiver is
13676          * attempting to read more data than can be buffered prior to
13677          * transmitting on the connection.
13678          */
13679         if (th->th_win == 0) {
13680                 tp->t_sndzerowin++;
13681                 tp->t_flags |= TF_RXWIN0SENT;
13682         } else
13683                 tp->t_flags &= ~TF_RXWIN0SENT;
13684         /*
13685          * We don't support urgent data, but drag along
13686          * the pointer in case of a stack switch.
13687          */
13688         tp->snd_up = tp->snd_una;
13689
13690 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
13691         if (to.to_flags & TOF_SIGNATURE) {
13692                 /*
13693                  * Calculate MD5 signature and put it into the place
13694                  * determined before. NOTE: since TCP options buffer doesn't
13695                  * point into mbuf's data, calculate offset and use it.
13696                  */
13697                 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
13698                     (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
13699                         /*
13700                          * Do not send segment if the calculation of MD5
13701                          * digest has failed.
13702                          */
13703                         goto out;
13704                 }
13705         }
13706 #endif
13707
13708         /*
13709          * Put TCP length in extended header, and then checksum extended
13710          * header and data.
13711          */
13712         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
13713 #ifdef INET6
13714         if (isipv6) {
13715                 /*
13716                  * ip6_plen need not be filled in now; it will be filled
13717                  * in by ip6_output.
13718                  */
13719 #ifdef NETFLIX_TCPOUDP
13720                 if (tp->t_port) {
13721                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
13722                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
13723                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
13724                         th->th_sum = htons(0);
13725                         UDPSTAT_INC(udps_opackets);
13726                 } else {
13727 #endif
13728                         csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
13729                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
13730                         th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
13731                             optlen + len, IPPROTO_TCP, 0);
13732 #ifdef NETFLIX_TCPOUDP
13733                 }
13734 #endif
13735         }
13736 #endif
13737 #if defined(INET6) && defined(INET)
13738         else
13739 #endif
13740 #ifdef INET
13741         {
13742 #ifdef NETFLIX_TCPOUDP
13743                 if (tp->t_port) {
13744                         m->m_pkthdr.csum_flags = CSUM_UDP;
13745                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
13746                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
13747                             ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
13748                         th->th_sum = htons(0);
13749                         UDPSTAT_INC(udps_opackets);
13750                 } else {
13751 #endif
13752                         csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP;
13753                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
13754                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
13755                             ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
13756                             IPPROTO_TCP + len + optlen));
13757 #ifdef NETFLIX_TCPOUDP
13758                 }
13759 #endif
13760                 /* IP version must be set here for ipv4/ipv6 checking later */
13761                 KASSERT(ip->ip_v == IPVERSION,
13762                     ("%s: IP version incorrect: %d", __func__, ip->ip_v));
13763         }
13764 #endif
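        /*
         * In both address families above, the non-tunneled path stores
         * only the pseudo-header sum (addresses, protocol, and TCP
         * length) in th_sum; the CSUM_TCP/CSUM_TCP_IPV6 flag asks the
         * stack or NIC to fold the remaining header and payload bytes
         * into it on the way out.
         */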
13765
13766         /*
13767          * Enable TSO and specify the size of the segments. The TCP pseudo
13768          * header checksum is always provided. XXX: Fixme: This is currently
13769          * not the case for IPv6.
13770          */
13771         if (tso || force_tso) {
13772                 KASSERT(force_tso || len > maxseg,
13773                     ("%s: len:%d <= tso_segsz:%d", __func__, len, maxseg));
13774                 m->m_pkthdr.csum_flags |= CSUM_TSO;
13775                 csum_flags |= CSUM_TSO;
13776                 m->m_pkthdr.tso_segsz = maxseg;
13777         }
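        /*
         * tso_segsz is the per-segment payload size the NIC (or software
         * TSO fallback) will use when it splits this chain back into
         * individual packets, each carrying a copy of the headers built
         * above.
         */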
13778         KASSERT(len + hdrlen == m_length(m, NULL),
13779             ("%s: mbuf chain different than expected: %d + %u != %u",
13780             __func__, len, hdrlen, m_length(m, NULL)));
13781
13782 #ifdef TCP_HHOOK
13783         /* Run HHOOK_TC_ESTABLISHED_OUT helper hooks. */
13784         hhook_run_tcp_est_out(tp, th, &to, len, tso);
13785 #endif
13786 #ifdef TCPDEBUG
13787         /*
13788          * Trace.
13789          */
13790         if (so->so_options & SO_DEBUG) {
13791                 u_short save = 0;
13792
13793 #ifdef INET6
13794                 if (!isipv6)
13795 #endif
13796                 {
13797                         save = ipov->ih_len;
13798                         ipov->ih_len = htons(m->m_pkthdr.len    /* - hdrlen +
13799                               * (th->th_off << 2) */ );
13800                 }
13801                 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
13802 #ifdef INET6
13803                 if (!isipv6)
13804 #endif
13805                         ipov->ih_len = save;
13806         }
13807 #endif                          /* TCPDEBUG */
13808
13809         /* Log to the black box */
13810         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
13811                 union tcp_log_stackspecific log;
13812
13813                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
13814                 /* Record info on type of transmission */
13815                 log.u_bbr.flex1 = bbr->r_ctl.rc_hptsi_agg_delay;
13816                 log.u_bbr.flex2 = (bbr->r_recovery_bw << 3);
13817                 log.u_bbr.flex3 = maxseg;
13818                 log.u_bbr.flex4 = delay_calc;
13819                 /* Encode filled_all into the upper flex5 bit */
13820                 log.u_bbr.flex5 = bbr->rc_past_init_win;
13821                 log.u_bbr.flex5 <<= 1;
13822                 log.u_bbr.flex5 |= bbr->rc_no_pacing;
13823                 log.u_bbr.flex5 <<= 29;
13824                 if (filled_all)
13825                         log.u_bbr.flex5 |= 0x80000000;
13826                 log.u_bbr.flex5 |= tp->t_maxseg;
13827                 log.u_bbr.flex6 = bbr->r_ctl.rc_pace_max_segs;
13828                 log.u_bbr.flex7 = (bbr->rc_bbr_state << 8) | bbr_state_val(bbr);
13829                 /* Let's poke in the low and the high here for debugging */
13830                 log.u_bbr.pkts_out = bbr->rc_tp->t_maxseg;
13831                 if (rsm || sack_rxmit) {
13832                         if (doing_tlp)
13833                                 log.u_bbr.flex8 = 2;
13834                         else
13835                                 log.u_bbr.flex8 = 1;
13836                 } else {
13837                         log.u_bbr.flex8 = 0;
13838                 }
13839                 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
13840                     len, &log, false, NULL, NULL, 0, tv);
13841         } else {
13842                 lgb = NULL;
13843         }
13844         /*
13845          * Fill in IP length and desired time to live and send to IP level.
13846          * There should be a better way to handle ttl and tos; we could keep
13847          * them in the template, but need a way to checksum without them.
13848          */
13849         /*
13850          * m->m_pkthdr.len should have been set before checksum calculation,
13851          * because in6_cksum() needs it.
13852          */
13853 #ifdef INET6
13854         if (isipv6) {
13855                 /*
13856                  * we separately set hoplimit for every segment, since the
13857                  * user might want to change the value via setsockopt. Also,
13858                  * desired default hop limit might be changed via Neighbor
13859                  * Discovery.
13860                  */
13861                 ip6->ip6_hlim = in6_selecthlim(inp, NULL);
13862
13863                 /*
13864                  * Set the packet size here for the benefit of DTrace
13865                  * probes. ip6_output() will set it properly; it's supposed
13866                  * to include the option header lengths as well.
13867                  */
13868                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
13869
13870                 if (V_path_mtu_discovery && maxseg > V_tcp_minmss)
13871                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
13872                 else
13873                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
13874
13875                 if (tp->t_state == TCPS_SYN_SENT)
13876                         TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
13877
13878                 TCP_PROBE5(send, NULL, tp, ip6, tp, th);
13879                 /* TODO: IPv6 IP6TOS_ECT bit on */
13880                 error = ip6_output(m, inp->in6p_outputopts,
13881                     &inp->inp_route6,
13882                     ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0),
13883                     NULL, NULL, inp);
13884
13885                 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL)
13886                         mtu = inp->inp_route6.ro_nh->nh_mtu;
13887         }
13888 #endif                          /* INET6 */
13889 #if defined(INET) && defined(INET6)
13890         else
13891 #endif
13892 #ifdef INET
13893         {
13894                 ip->ip_len = htons(m->m_pkthdr.len);
13895 #ifdef INET6
13896                 if (isipv6)
13897                         ip->ip_ttl = in6_selecthlim(inp, NULL);
13898 #endif                          /* INET6 */
13899                 /*
13900                  * If we do path MTU discovery, then we set DF on every
13901                  * packet. This might not be the best thing to do according
13902                  * to RFC3390 Section 2. However, the tcp hostcache mitigates
13903                  * the problem so it affects only the first tcp connection
13904                  * with a host.
13905                  *
13906                  * NB: Don't set DF on small MTU/MSS to have a safe
13907                  * fallback.
13908                  */
13909                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
13910                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
13911                         if (tp->t_port == 0 || len < V_tcp_minmss) {
13912                                 ip->ip_off |= htons(IP_DF);
13913                         }
13914                 } else {
13915                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
13916                 }
13917
13918                 if (tp->t_state == TCPS_SYN_SENT)
13919                         TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
13920
13921                 TCP_PROBE5(send, NULL, tp, ip, tp, th);
13922
13923                 error = ip_output(m, inp->inp_options, &inp->inp_route,
13924                     ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0,
13925                     inp);
13926                 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL)
13927                         mtu = inp->inp_route.ro_nh->nh_mtu;
13928         }
13929 #endif                          /* INET */
13930 out:
13931
13932         if (lgb) {
13933                 lgb->tlb_errno = error;
13934                 lgb = NULL;
13935         }
13936         /*
13937          * In transmit state, time the transmission and arrange for the
13938          * retransmit.  In persist state, just set snd_max.
13939          */
13940         if (error == 0) {
13941                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
13942                     (tp->t_flags & TF_SACK_PERMIT) &&
13943                     tp->rcv_numsacks > 0)
13944                         tcp_clean_dsack_blocks(tp);
13945                 /* We sent an ack; clear the bbr_segs_rcvd count */
13946                 bbr->output_error_seen = 0;
13947                 bbr->oerror_cnt = 0;
13948                 bbr->bbr_segs_rcvd = 0;
13949                 if (len == 0)
13950                         counter_u64_add(bbr_out_size[TCP_MSS_ACCT_SNDACK], 1);
13951                 else if (hw_tls) {
13952                         if (filled_all ||
13953                             (len >= bbr->r_ctl.rc_pace_max_segs))
13954                                 BBR_STAT_INC(bbr_meets_tso_thresh);
13955                         else {
13956                                 if (doing_tlp) {
13957                                         BBR_STAT_INC(bbr_miss_tlp);
13958                                         bbr_log_type_hrdwtso(tp, bbr, len, 1, what_we_can);
13961                                 } else if (rsm) {
13962                                         BBR_STAT_INC(bbr_miss_retran);
13963                                         bbr_log_type_hrdwtso(tp, bbr, len, 2, what_we_can);
13964                                 } else if ((ctf_outstanding(tp) + bbr->r_ctl.rc_pace_max_segs) > sbavail(sb)) {
13965                                         BBR_STAT_INC(bbr_miss_tso_app);
13966                                         bbr_log_type_hrdwtso(tp, bbr, len, 3, what_we_can);
13967                                 } else if ((ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
13968                                                                  bbr->r_ctl.rc_lost_bytes)) + bbr->r_ctl.rc_pace_max_segs) > tp->snd_cwnd) {
13969                                         BBR_STAT_INC(bbr_miss_tso_cwnd);
13970                                         bbr_log_type_hrdwtso(tp, bbr, len, 4, what_we_can);
13971                                 } else if ((ctf_outstanding(tp) + bbr->r_ctl.rc_pace_max_segs) > tp->snd_wnd) {
13972                                         BBR_STAT_INC(bbr_miss_tso_rwnd);
13973                                         bbr_log_type_hrdwtso(tp, bbr, len, 5, what_we_can);
13974                                 } else {
13975                                         BBR_STAT_INC(bbr_miss_unknown);
13976                                         bbr_log_type_hrdwtso(tp, bbr, len, 6, what_we_can);
13977                                 }
13978                         }
13979                 }
13980                 /* Do accounting for new sends */
13981                 if ((len > 0) && (rsm == NULL)) {
13982                         int idx;
13983                         if (tp->snd_una == tp->snd_max) {
13984                                 /*
13985                                  * Special case to match Google: when
13986                                  * nothing is in flight, the delivered
13987                                  * time does get updated to the current
13988                                  * time (see tcp_rate_bsd.c).
13989                                  */
13990                                 bbr->r_ctl.rc_del_time = cts;
13991                         }
13992                         if (len >= maxseg) {
13993                                 idx = (len / maxseg) + 3;
13994                                 if (idx >= TCP_MSS_ACCT_ATIMER)
13995                                         counter_u64_add(bbr_out_size[(TCP_MSS_ACCT_ATIMER - 1)], 1);
13996                                 else
13997                                         counter_u64_add(bbr_out_size[idx], 1);
13998                         } else {
13999                                 /* smaller than an MSS */
14000                                 idx = len / (bbr_hptsi_bytes_min - bbr->rc_last_options);
14001                                 if (idx >= TCP_MSS_SMALL_MAX_SIZE_DIV)
14002                                         idx = (TCP_MSS_SMALL_MAX_SIZE_DIV - 1);
14003                                 counter_u64_add(bbr_out_size[(idx + TCP_MSS_SMALL_SIZE_OFF)], 1);
14004                         }
14005                 }
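                /*
                 * The bbr_out_size[] histogram above buckets sends by MSS
                 * count; e.g. (illustrative values), len = 4344 with
                 * maxseg = 1448 gives idx = 4344 / 1448 + 3 = 6, while
                 * sub-MSS sends land in the small-size buckets at
                 * TCP_MSS_SMALL_SIZE_OFF and beyond.
                 */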
14006         }
14007         abandon = 0;
14008         /*
14009          * We must do the send accounting before we log the output,
14010          * otherwise the state of the rsm could change and we account to the
14011          * wrong bucket.
14012          */
14013         if (len > 0) {
14014                 bbr_do_send_accounting(tp, bbr, rsm, len, error);
14015                 if (error == 0) {
14016                         if (tp->snd_una == tp->snd_max)
14017                                 bbr->r_ctl.rc_tlp_rxt_last_time = cts;
14018                 }
14019         }
14020         bbr_log_output(bbr, tp, &to, len, bbr_seq, (uint8_t) flags, error,
14021             cts, mb, &abandon, rsm, 0, sb);
14022         if (abandon) {
14023                 /*
14024                  * If bbr_log_output destroys the TCB or sees a TH_RST being
14025                  * sent we should hit this condition.
14026                  */
14027                 return (0);
14028         }
14029         if (bbr->rc_in_persist == 0) {
14030                 /*
14031                  * Advance snd_nxt over sequence space of this segment.
14032                  */
14033                 if (error)
14034                         /* We don't log or do anything with errors */
14035                         goto skip_upd;
14036
14037                 if (tp->snd_una == tp->snd_max &&
14038                     (len || (flags & (TH_SYN | TH_FIN)))) {
14039                         /*
14040                          * Update the time: we just added data and none was
14041                          * outstanding.
14042                          */
14043                         bbr_log_progress_event(bbr, tp, ticks, PROGRESS_START, __LINE__);
14044                         bbr->rc_tp->t_acktime  = ticks;
14045                 }
14046                 if (flags & (TH_SYN | TH_FIN) && (rsm == NULL)) {
14047                         if (flags & TH_SYN) {
14048                                 /*
14049                                  * Smack the snd_max to iss + 1;
14050                                  * if it's a TFO send, we will add len below.
14051                                  */
14052                                 tp->snd_max = tp->iss + 1;
14053                         }
14054                         if ((flags & TH_FIN) && ((tp->t_flags & TF_SENTFIN) == 0)) {
14055                                 tp->snd_max++;
14056                                 tp->t_flags |= TF_SENTFIN;
14057                         }
14058                 }
14059                 if (sack_rxmit == 0)
14060                         tp->snd_max += len;
14061 skip_upd:
14062                 if ((error == 0) && len)
14063                         tot_len += len;
14064         } else {
14065                 /* Persists case */
14066                 int32_t xlen = len;
14067
14068                 if (error)
14069                         goto nomore;
14070
14071                 if (flags & TH_SYN)
14072                         ++xlen;
14073                 if ((flags & TH_FIN) && ((tp->t_flags & TF_SENTFIN) == 0)) {
14074                         ++xlen;
14075                         tp->t_flags |= TF_SENTFIN;
14076                 }
14077                 if (xlen && (tp->snd_una == tp->snd_max)) {
14078                         /*
14079                          * Update the time: we just added data and none was
14080                          * outstanding.
14081                          */
14082                         bbr_log_progress_event(bbr, tp, ticks, PROGRESS_START, __LINE__);
14083                         bbr->rc_tp->t_acktime = ticks;
14084                 }
14085                 if (sack_rxmit == 0)
14086                         tp->snd_max += xlen;
14087                 tot_len += (len + optlen + ipoptlen);
14088         }
14089 nomore:
14090         if (error) {
14091                 /*
14092                  * Failures do not advance the seq counter above. For the
14093                  * case of ENOBUFS we will fall out and become ack-clocked,
14094                  * capping the cwnd at the current flight.
14095                  * Everything else will just have to retransmit with the timer
14096                  * (no pacer).
14097                  */
14098                 SOCKBUF_UNLOCK_ASSERT(sb);
14099                 BBR_STAT_INC(bbr_saw_oerr);
14100                 /* Clear all delay/early tracks */
14101                 bbr->r_ctl.rc_hptsi_agg_delay = 0;
14102                 bbr->r_ctl.rc_agg_early = 0;
14103                 bbr->r_agg_early_set = 0;
14104                 bbr->output_error_seen = 1;
14105                 if (bbr->oerror_cnt < 0xf)
14106                         bbr->oerror_cnt++;
14107                 if (bbr_max_net_error_cnt && (bbr->oerror_cnt >= bbr_max_net_error_cnt)) {
14108                         /* drop the session */
14109                         tcp_set_inp_to_drop(inp, ENETDOWN);
14110                 }
14111                 switch (error) {
14112                 case ENOBUFS:
14113                         /*
14114                          * Make this connection have to get acks to send
14115                          * more, but let's make sure we don't
14116                          * slam it below a T-O (1 MSS).
14117                          */
14118                         if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) {
14119                                 tp->snd_cwnd = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
14120                                                                     bbr->r_ctl.rc_lost_bytes)) - maxseg;
14121                                 if (tp->snd_cwnd < maxseg)
14122                                         tp->snd_cwnd = maxseg;
14123                         }
14124                         slot = (bbr_error_base_paceout + 1) << bbr->oerror_cnt;
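                        /*
                         * The shift by oerror_cnt above makes the pacing
                         * slot back off exponentially: each additional
                         * consecutive output error doubles the delay
                         * before the next attempt.
                         */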
14125                         BBR_STAT_INC(bbr_saw_enobuf);
14126                         if (bbr->bbr_hdrw_pacing)
14127                                 counter_u64_add(bbr_hdwr_pacing_enobuf, 1);
14128                         else
14129                                 counter_u64_add(bbr_nohdwr_pacing_enobuf, 1);
14130                         /*
14131                          * Even in the ENOBUFS case we want to do our
14132                          * state update, because we may have been
14133                          * called from the input function, in which case
14134                          * things have changed.
14135                          */
14136                         error = 0;
14137                         goto enobufs;
14138                 case EMSGSIZE:
14139                         /*
14140                          * For some reason the interface we used initially
14141                          * to send segments changed to another or lowered
14142                          * its MTU. If TSO was active we either got an
14143                          * interface without TSO capabilities or TSO was
14144                          * turned off. If we obtained mtu from ip_output()
14145                          * then update it and try again.
14146                          */
14147                         /* Turn on tracing (or try to) */
14148                         {
14149                                 int old_maxseg;
14150
14151                                 old_maxseg = tp->t_maxseg;
14152                                 BBR_STAT_INC(bbr_saw_emsgsiz);
14153                                 bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, csum_flags, tso, cts);
14154                                 if (mtu != 0)
14155                                         tcp_mss_update(tp, -1, mtu, NULL, NULL);
14156                                 if (old_maxseg <= tp->t_maxseg) {
14157                                         /* Huh it did not shrink? */
14158                                         tp->t_maxseg = old_maxseg - 40;
14159                                         bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, 0, tso, cts);
14160                                 }
14161                                 /*
14162                                  * Nuke all other things that can interfere
14163                                  * with the slot computation.
14164                                  */
14165                                 if ((tot_len + len) && (len >= tp->t_maxseg)) {
14166                                         slot = bbr_get_pacing_delay(bbr,
14167                                             bbr->r_ctl.rc_bbr_hptsi_gain,
14168                                             (tot_len + len), cts, 0);
14169                                         if (slot < bbr_error_base_paceout)
14170                                                 slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt;
14171                                 } else
14172                                         slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt;
14173                                 bbr->rc_output_starts_timer = 1;
14174                                 bbr_start_hpts_timer(bbr, tp, cts, 10, slot,
14175                                     tot_len);
14176                                 return (error);
14177                         }
14178                 case EPERM:
14179                         tp->t_softerror = error;
14180                         /* Fall through */
14181                 case EHOSTDOWN:
14182                 case EHOSTUNREACH:
14183                 case ENETDOWN:
14184                 case ENETUNREACH:
14185                         if (TCPS_HAVERCVDSYN(tp->t_state)) {
14186                                 tp->t_softerror = error;
14187                         }
14188                         /* FALLTHROUGH */
14189                 default:
14190                         slot = (bbr_error_base_paceout + 3) << bbr->oerror_cnt;
14191                         bbr->rc_output_starts_timer = 1;
14192                         bbr_start_hpts_timer(bbr, tp, cts, 11, slot, 0);
14193                         return (error);
14194                 }
14195 #ifdef STATS
14196         } else if (((tp->t_flags & TF_GPUTINPROG) == 0) &&
14197                     len &&
14198                     (rsm == NULL) &&
14199             (bbr->rc_in_persist == 0)) {
14200                 tp->gput_seq = bbr_seq;
14201                 tp->gput_ack = bbr_seq +
14202                     min(sbavail(&so->so_snd) - sb_offset, sendwin);
14203                 tp->gput_ts = cts;
14204                 tp->t_flags |= TF_GPUTINPROG;
14205 #endif
14206         }
14207         KMOD_TCPSTAT_INC(tcps_sndtotal);
14208         if ((bbr->bbr_hdw_pace_ena) &&
14209             (bbr->bbr_attempt_hdwr_pace == 0) &&
14210             (bbr->rc_past_init_win) &&
14211             (bbr->rc_bbr_state != BBR_STATE_STARTUP) &&
14212             (get_filter_value(&bbr->r_ctl.rc_delrate)) &&
14213             (inp->inp_route.ro_nh &&
14214              inp->inp_route.ro_nh->nh_ifp)) {
14215                 /*
14216                  * We are past the initial window and
14217                  * have at least one measurement, so we
14218                  * could use hardware pacing if it's available.
14219                  * We have an interface and we have not attempted
14220                  * to set up hardware pacing; let's try to now.
14221                  */
14222                 uint64_t rate_wanted;
14223                 int err = 0;
14224
14225                 rate_wanted = bbr_get_hardware_rate(bbr);
14226                 bbr->bbr_attempt_hdwr_pace = 1;
14227                 bbr->r_ctl.crte = tcp_set_pacing_rate(bbr->rc_tp,
14228                                                       inp->inp_route.ro_nh->nh_ifp,
14229                                                       rate_wanted,
14230                                                       (RS_PACING_GEQ|RS_PACING_SUB_OK),
14231                                                       &err);
14232                 if (bbr->r_ctl.crte) {
14233                         bbr_type_log_hdwr_pacing(bbr,
14234                                                  bbr->r_ctl.crte->ptbl->rs_ifp,
14235                                                  rate_wanted,
14236                                                  bbr->r_ctl.crte->rate,
14237                                                  __LINE__, cts, err);
14238                         BBR_STAT_INC(bbr_hdwr_rl_add_ok);
14239                         counter_u64_add(bbr_flows_nohdwr_pacing, -1);
14240                         counter_u64_add(bbr_flows_whdwr_pacing, 1);
14241                         bbr->bbr_hdrw_pacing = 1;
14242                         /* Now what is our gain status? */
14243                         if (bbr->r_ctl.crte->rate < rate_wanted) {
14244                                 /* We have a problem */
14245                                 bbr_setup_less_of_rate(bbr, cts,
14246                                                        bbr->r_ctl.crte->rate, rate_wanted);
14247                         } else {
14248                                 /* We are good */
14249                                 bbr->gain_is_limited = 0;
14250                                 bbr->skip_gain = 0;
14251                         }
14252                         tcp_bbr_tso_size_check(bbr, cts);
14253                 } else {
14254                         bbr_type_log_hdwr_pacing(bbr,
14255                                                  inp->inp_route.ro_nh->nh_ifp,
14256                                                  rate_wanted,
14257                                                  0,
14258                                                  __LINE__, cts, err);
14259                         BBR_STAT_INC(bbr_hdwr_rl_add_fail);
14260                 }
14261         }
14262         if (bbr->bbr_hdrw_pacing) {
14263                 /*
14264                  * Worry about cases where the route changed,
14265                  * or where something that happened during the
14266                  * last ip_output call cost us our hardware
14267                  * pacing.
14268                  */
14269                 if (inp->inp_snd_tag == NULL) {
14270                         /* A change during ip output disabled hw pacing? */
14271                         bbr->bbr_hdrw_pacing = 0;
14272                 } else if ((inp->inp_route.ro_nh == NULL) ||
14273                     (inp->inp_route.ro_nh->nh_ifp != inp->inp_snd_tag->ifp)) {
14274                         /*
14275                          * We had an interface or route change;
14276                          * detach from the current hdwr pacing
14277                          * and set up to re-attempt on the next
14278                          * go-round.
14279                          */
14280                         bbr->bbr_hdrw_pacing = 0;
14281                         bbr->bbr_attempt_hdwr_pace = 0;
14282                         tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp);
14283                         tcp_bbr_tso_size_check(bbr, cts);
14284                 }
14285         }
14286         /*
14287          * Data sent (as far as we can tell). If this advertises a larger
14288          * window than any other segment, then remember the size of the
14289          * advertised window. Any pending ACK has now been sent.
14290          */
14291         if (SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
14292                 tp->rcv_adv = tp->rcv_nxt + recwin;
14293
14294         tp->last_ack_sent = tp->rcv_nxt;
14295         if ((error == 0) &&
14296             (bbr->r_ctl.rc_pace_max_segs > tp->t_maxseg) &&
14297             (doing_tlp == 0) &&
14298             (tso == 0) &&
14299             (hw_tls == 0) &&
14300             (len > 0) &&
14301             ((flags & TH_RST) == 0) &&
14302             ((flags & TH_SYN) == 0) &&
14303             (IN_RECOVERY(tp->t_flags) == 0) &&
14304             (bbr->rc_in_persist == 0) &&
14305             (tot_len < bbr->r_ctl.rc_pace_max_segs)) {
14306                 /*
14307                  * For non-TSO we need to goto again until we have sent
14308                  * out enough data to match what we are supposed to be
14309                  * pacing out (hptsi) every hptsi interval.
14310                  */
14311                 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
14312                         /* Make sure snd_nxt is dragged up */
14313                         tp->snd_nxt = tp->snd_max;
14314                 }
14315                 if (rsm != NULL) {
14316                         rsm = NULL;
14317                         goto skip_again;
14318                 }
14319                 rsm = NULL;
14320                 sack_rxmit = 0;
14321                 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
14322                 goto again;
14323         }
14324 skip_again:
14325         if ((error == 0) && (flags & TH_FIN))
14326                 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN);
14327         if ((error == 0) && (flags & TH_RST))
14328                 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
14329         if (((flags & (TH_RST | TH_SYN | TH_FIN)) == 0) && tot_len) {
14330                 /*
14331                  * Calculate/Re-Calculate the hptsi slot in usecs based on
14332                  * what we have sent so far
14333                  */
14334                 slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0);
14335                 if (bbr->rc_no_pacing)
14336                         slot = 0;
14337         }
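        /*
         * As a rough, illustrative calculation (ignoring the gain and the
         * overhead accounting that bbr_get_pacing_delay() applies): pacing
         * tot_len = 43800 bytes against a 100 Mbps delivery-rate estimate
         * yields a slot of roughly 43800 * 8 / 100 = 3504 usecs.
         */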
14338         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
14339 enobufs:
14340         if (bbr->rc_use_google == 0)
14341                 bbr_check_bbr_for_state(bbr, cts, __LINE__, 0);
14342         bbr_cwnd_limiting(tp, bbr, ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
14343                                                         bbr->r_ctl.rc_lost_bytes)));
14344         bbr->rc_output_starts_timer = 1;
14345         if (bbr->bbr_use_rack_cheat &&
14346             (more_to_rxt ||
14347              ((bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts)) != NULL))) {
14348                 /* Rack cheats and shotguns out all rxt's 1ms apart */
14349                 if (slot > 1000)
14350                         slot = 1000;
14351         }
14352         if (bbr->bbr_hdrw_pacing && (bbr->hw_pacing_set == 0)) {
14353                 /*
14354                  * We don't change the TSO size until some number of
14355                  * sends have gone by, to give the hardware commands
14356                  * time to get down to the interface.
14357                  */
14358                 bbr->r_ctl.bbr_hdwr_cnt_noset_snt++;
14359                 if (bbr->r_ctl.bbr_hdwr_cnt_noset_snt >= bbr_hdwr_pacing_delay_cnt) {
14360                         bbr->hw_pacing_set = 1;
14361                         tcp_bbr_tso_size_check(bbr, cts);
14362                 }
14363         }
14364         bbr_start_hpts_timer(bbr, tp, cts, 12, slot, tot_len);
14365         if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
14366                 /* Make sure snd_nxt is dragged up */
14367                 tp->snd_nxt = tp->snd_max;
14368         }
14369         return (error);
14371 }
14372
14373 /*
14374  * See bbr_output_wtime() for return values.
14375  */
14376 static int
14377 bbr_output(struct tcpcb *tp)
14378 {
14379         int32_t ret;
14380         struct timeval tv;
14381         struct tcp_bbr *bbr;
14382
14383         NET_EPOCH_ASSERT();
14384
14385         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
14386         INP_WLOCK_ASSERT(tp->t_inpcb);
14387         (void)tcp_get_usecs(&tv);
14388         ret = bbr_output_wtime(tp, &tv);
14389         return (ret);
14390 }
14391
14392 static void
14393 bbr_mtu_chg(struct tcpcb *tp)
14394 {
14395         struct tcp_bbr *bbr;
14396         struct bbr_sendmap *rsm, *frsm = NULL;
14397         uint32_t maxseg;
14398
14399         /*
14400          * The MTU has changed: a) Clear the sack filter. b) Mark everything
14401          * over the current size as SACK_PASSED so a retransmit will occur.
14402          */
14403
14404         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
14405         maxseg = tp->t_maxseg - bbr->rc_last_options;
14406         sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una);
14407         TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
14408                 /* Don't mess with ones acked (by sack?) */
14409                 if (rsm->r_flags & BBR_ACKED)
14410                         continue;
14411                 if ((rsm->r_end - rsm->r_start) > maxseg) {
14412                         /*
14413                          * We mark sack-passed on all the previous large
14414                          * sends we did, forcing them to be retransmitted.
14415                          */
14416                         rsm->r_flags |= BBR_SACK_PASSED;
14417                         if (((rsm->r_flags & BBR_MARKED_LOST) == 0) &&
14418                             bbr_is_lost(bbr, rsm, bbr->r_ctl.rc_rcvtime)) {
14419                                 bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start;
14420                                 bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start;
14421                                 rsm->r_flags |= BBR_MARKED_LOST;
14422                         }
14423                         if (frsm == NULL)
14424                                 frsm = rsm;
14425                 }
14426         }
14427         if (frsm) {
14428                 bbr->r_ctl.rc_resend = frsm;
14429         }
14430 }
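/*
 * For context, a hedged sketch of the caller side (not in this file): the
 * generic path-MTU machinery reaches this routine through the function
 * block, roughly as follows (new_mss is illustrative):
 *
 *	tp->t_maxseg = new_mss;
 *	if (tp->t_fb->tfb_tcp_mtu_chg != NULL)
 *		tp->t_fb->tfb_tcp_mtu_chg(tp);
 */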
14431
14432 /*
14433  * bbr_ctloutput() must drop the inpcb lock before performing copyin on
14434  * socket option arguments.  When it re-acquires the lock after the copy, it
14435  * has to revalidate that the connection is still valid for the socket
14436  * option.
14437  */
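/*
 * A minimal sketch of that pattern, as implemented below (the real routine
 * interleaves it with the per-option handling):
 *
 *	INP_WUNLOCK(inp);
 *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 *	if (error)
 *		return (error);
 *	INP_WLOCK(inp);
 *	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 *		INP_WUNLOCK(inp);
 *		return (ECONNRESET);
 *	}
 *	tp = intotcpcb(inp);	(revalidate: the tcpcb may have been replaced)
 */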
14438 static int
14439 bbr_set_sockopt(struct socket *so, struct sockopt *sopt,
14440                 struct inpcb *inp, struct tcpcb *tp, struct tcp_bbr *bbr)
14441 {
14442         struct epoch_tracker et;
14443         int32_t error = 0, optval;
14444
14445         switch (sopt->sopt_name) {
14446         case TCP_RACK_PACE_MAX_SEG:
14447         case TCP_RACK_MIN_TO:
14448         case TCP_RACK_REORD_THRESH:
14449         case TCP_RACK_REORD_FADE:
14450         case TCP_RACK_TLP_THRESH:
14451         case TCP_RACK_PKT_DELAY:
14452         case TCP_BBR_ALGORITHM:
14453         case TCP_BBR_TSLIMITS:
14454         case TCP_BBR_IWINTSO:
14455         case TCP_BBR_RECFORCE:
14456         case TCP_BBR_STARTUP_PG:
14457         case TCP_BBR_DRAIN_PG:
14458         case TCP_BBR_RWND_IS_APP:
14459         case TCP_BBR_PROBE_RTT_INT:
14460         case TCP_BBR_PROBE_RTT_GAIN:
14461         case TCP_BBR_PROBE_RTT_LEN:
14462         case TCP_BBR_STARTUP_LOSS_EXIT:
14463         case TCP_BBR_USEDEL_RATE:
14464         case TCP_BBR_MIN_RTO:
14465         case TCP_BBR_MAX_RTO:
14466         case TCP_BBR_PACE_PER_SEC:
14467         case TCP_DELACK:
14468         case TCP_BBR_PACE_DEL_TAR:
14469         case TCP_BBR_SEND_IWND_IN_TSO:
14470         case TCP_BBR_EXTRA_STATE:
14471         case TCP_BBR_UTTER_MAX_TSO:
14472         case TCP_BBR_MIN_TOPACEOUT:
14473         case TCP_BBR_FLOOR_MIN_TSO:
14474         case TCP_BBR_TSTMP_RAISES:
14475         case TCP_BBR_POLICER_DETECT:
14476         case TCP_BBR_USE_RACK_CHEAT:
14477         case TCP_DATA_AFTER_CLOSE:
14478         case TCP_BBR_HDWR_PACE:
14479         case TCP_BBR_PACE_SEG_MAX:
14480         case TCP_BBR_PACE_SEG_MIN:
14481         case TCP_BBR_PACE_CROSS:
14482         case TCP_BBR_PACE_OH:
14483 #ifdef NETFLIX_PEAKRATE
14484         case TCP_MAXPEAKRATE:
14485 #endif
14486         case TCP_BBR_TMR_PACE_OH:
14487         case TCP_BBR_RACK_RTT_USE:
14488         case TCP_BBR_RETRAN_WTSO:
14489                 break;
14490         default:
14491                 return (tcp_default_ctloutput(so, sopt, inp, tp));
14492                 break;
14493         }
14494         INP_WUNLOCK(inp);
14495         error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
14496         if (error)
14497                 return (error);
14498         INP_WLOCK(inp);
14499         if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
14500                 INP_WUNLOCK(inp);
14501                 return (ECONNRESET);
14502         }
14503         tp = intotcpcb(inp);
14504         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
14505         switch (sopt->sopt_name) {
14506         case TCP_BBR_PACE_PER_SEC:
14507                 BBR_OPTS_INC(tcp_bbr_pace_per_sec);
14508                 bbr->r_ctl.bbr_hptsi_per_second = optval;
14509                 break;
14510         case TCP_BBR_PACE_DEL_TAR:
14511                 BBR_OPTS_INC(tcp_bbr_pace_del_tar);
14512                 bbr->r_ctl.bbr_hptsi_segments_delay_tar = optval;
14513                 break;
14514         case TCP_BBR_PACE_SEG_MAX:
14515                 BBR_OPTS_INC(tcp_bbr_pace_seg_max);
14516                 bbr->r_ctl.bbr_hptsi_segments_max = optval;
14517                 break;
14518         case TCP_BBR_PACE_SEG_MIN:
14519                 BBR_OPTS_INC(tcp_bbr_pace_seg_min);
14520                 bbr->r_ctl.bbr_hptsi_bytes_min = optval;
14521                 break;
14522         case TCP_BBR_PACE_CROSS:
14523                 BBR_OPTS_INC(tcp_bbr_pace_cross);
14524                 bbr->r_ctl.bbr_cross_over = optval;
14525                 break;
14526         case TCP_BBR_ALGORITHM:
14527                 BBR_OPTS_INC(tcp_bbr_algorithm);
14528                 if (optval && (bbr->rc_use_google == 0)) {
14529                         /* Turn on the google mode */
14530                         bbr_google_mode_on(bbr);
14531                         if ((optval > 3) && (optval < 500)) {
14532                                 /*
14533                                  * In units of 0.1%: must be greater
14534                                  * than 0.3% and less than 50.0%.
14535                                  */
14536                                 bbr->r_ctl.bbr_google_discount = optval;
14537                         }
14538                 } else if ((optval == 0) && (bbr->rc_use_google == 1)) {
14539                         /* Turn off the google mode */
14540                         bbr_google_mode_off(bbr);
14541                 }
14542                 break;
14543         case TCP_BBR_TSLIMITS:
14544                 BBR_OPTS_INC(tcp_bbr_tslimits);
14545                 if (optval == 1)
14546                         bbr->rc_use_ts_limit = 1;
14547                 else if (optval == 0)
14548                         bbr->rc_use_ts_limit = 0;
14549                 else
14550                         error = EINVAL;
14551                 break;
14552
14553         case TCP_BBR_IWINTSO:
14554                 BBR_OPTS_INC(tcp_bbr_iwintso);
14555                 if ((optval >= 0) && (optval < 128)) {
14556                         uint32_t twin;
14557
14558                         bbr->rc_init_win = optval;
14559                         twin = bbr_initial_cwnd(bbr, tp);
14560                         if ((bbr->rc_past_init_win == 0) && (twin > tp->snd_cwnd))
14561                                 tp->snd_cwnd = twin;
14562                         else
14563                                 error = EBUSY;
14564                 } else
14565                         error = EINVAL;
14566                 break;
14567         case TCP_BBR_STARTUP_PG:
14568                 BBR_OPTS_INC(tcp_bbr_startup_pg);
14569                 if ((optval > 0) && (optval < BBR_MAX_GAIN_VALUE)) {
14570                         bbr->r_ctl.rc_startup_pg = optval;
14571                         if (bbr->rc_bbr_state == BBR_STATE_STARTUP) {
14572                                 bbr->r_ctl.rc_bbr_hptsi_gain = optval;
14573                         }
14574                 } else
14575                         error = EINVAL;
14576                 break;
14577         case TCP_BBR_DRAIN_PG:
14578                 BBR_OPTS_INC(tcp_bbr_drain_pg);
14579                 if ((optval > 0) && (optval < BBR_MAX_GAIN_VALUE))
14580                         bbr->r_ctl.rc_drain_pg = optval;
14581                 else
14582                         error = EINVAL;
14583                 break;
14584         case TCP_BBR_PROBE_RTT_LEN:
14585                 BBR_OPTS_INC(tcp_bbr_probertt_len);
14586                 if ((optval >= 0) && (optval <= 1))
14587                         reset_time_small(&bbr->r_ctl.rc_rttprop, (optval * USECS_IN_SECOND));
14588                 else
14589                         error = EINVAL;
14590                 break;
14591         case TCP_BBR_PROBE_RTT_GAIN:
14592                 BBR_OPTS_INC(tcp_bbr_probertt_gain);
14593                 if ((optval >= 0) && (optval <= BBR_UNIT))
14594                         bbr->r_ctl.bbr_rttprobe_gain_val = optval;
14595                 else
14596                         error = EINVAL;
14597                 break;
14598         case TCP_BBR_PROBE_RTT_INT:
14599                 BBR_OPTS_INC(tcp_bbr_probe_rtt_int);
14600                 if (optval > 1000)
14601                         bbr->r_ctl.rc_probertt_int = optval;
14602                 else
14603                         error = EINVAL;
14604                 break;
14605         case TCP_BBR_MIN_TOPACEOUT:
14606                 BBR_OPTS_INC(tcp_bbr_topaceout);
14607                 if (optval == 0) {
14608                         bbr->no_pacing_until = 0;
14609                         bbr->rc_no_pacing = 0;
14610                 } else if (optval <= 0x00ff) {
14611                         bbr->no_pacing_until = optval;
14612                         if ((bbr->r_ctl.rc_pkt_epoch < bbr->no_pacing_until) &&
14613                             (bbr->rc_bbr_state == BBR_STATE_STARTUP)){
14614                                 /* Turn on no pacing */
14615                                 bbr->rc_no_pacing = 1;
14616                         }
14617                 } else
14618                         error = EINVAL;
14619                 break;
14620         case TCP_BBR_STARTUP_LOSS_EXIT:
14621                 BBR_OPTS_INC(tcp_bbr_startup_loss_exit);
14622                 bbr->rc_loss_exit = optval;
14623                 break;
14624         case TCP_BBR_USEDEL_RATE:
14625                 error = EINVAL;
14626                 break;
14627         case TCP_BBR_MIN_RTO:
14628                 BBR_OPTS_INC(tcp_bbr_min_rto);
14629                 bbr->r_ctl.rc_min_rto_ms = optval;
14630                 break;
14631         case TCP_BBR_MAX_RTO:
14632                 BBR_OPTS_INC(tcp_bbr_max_rto);
14633                 bbr->rc_max_rto_sec = optval;
14634                 break;
14635         case TCP_RACK_MIN_TO:
14636                 /* Minimum time between rack t-o's in ms */
14637                 BBR_OPTS_INC(tcp_rack_min_to);
14638                 bbr->r_ctl.rc_min_to = optval;
14639                 break;
14640         case TCP_RACK_REORD_THRESH:
14641                 /* RACK reorder threshold (shift amount) */
14642                 BBR_OPTS_INC(tcp_rack_reord_thresh);
14643                 if ((optval > 0) && (optval < 31))
14644                         bbr->r_ctl.rc_reorder_shift = optval;
14645                 else
14646                         error = EINVAL;
14647                 break;
14648         case TCP_RACK_REORD_FADE:
14649                 /* Does reordering fade after ms time */
14650                 BBR_OPTS_INC(tcp_rack_reord_fade);
14651                 bbr->r_ctl.rc_reorder_fade = optval;
14652                 break;
14653         case TCP_RACK_TLP_THRESH:
14654                 /* RACK TLP threshold i.e. srtt+(srtt/N) */
14655                 BBR_OPTS_INC(tcp_rack_tlp_thresh);
14656                 if (optval)
14657                         bbr->rc_tlp_threshold = optval;
14658                 else
14659                         error = EINVAL;
14660                 break;
14661         case TCP_BBR_USE_RACK_CHEAT:
14662                 BBR_OPTS_INC(tcp_use_rackcheat);
14663                 if (bbr->rc_use_google) {
14664                         error = EINVAL;
14665                         break;
14666                 }
14667                 BBR_OPTS_INC(tcp_rack_cheat);
14668                 if (optval)
14669                         bbr->bbr_use_rack_cheat = 1;
14670                 else
14671                         bbr->bbr_use_rack_cheat = 0;
14672                 break;
14673         case TCP_BBR_FLOOR_MIN_TSO:
14674                 BBR_OPTS_INC(tcp_utter_max_tso);
14675                 if ((optval >= 0) && (optval < 40))
14676                         bbr->r_ctl.bbr_hptsi_segments_floor = optval;
14677                 else
14678                         error = EINVAL;
14679                 break;
14680         case TCP_BBR_UTTER_MAX_TSO:
14681                 BBR_OPTS_INC(tcp_utter_max_tso);
14682                 if ((optval >= 0) && (optval < 0xffff))
14683                         bbr->r_ctl.bbr_utter_max = optval;
14684                 else
14685                         error = EINVAL;
14686                 break;
14687
14688         case TCP_BBR_EXTRA_STATE:
14689                 BBR_OPTS_INC(tcp_extra_state);
14690                 if (optval)
14691                         bbr->rc_use_idle_restart = 1;
14692                 else
14693                         bbr->rc_use_idle_restart = 0;
14694                 break;
14695         case TCP_BBR_SEND_IWND_IN_TSO:
14696                 BBR_OPTS_INC(tcp_iwnd_tso);
14697                 if (optval) {
14698                         bbr->bbr_init_win_cheat = 1;
14699                         if (bbr->rc_past_init_win == 0) {
14700                                 uint32_t cts;
14701                                 cts = tcp_get_usecs(&bbr->rc_tv);
14702                                 tcp_bbr_tso_size_check(bbr, cts);
14703                         }
14704                 } else
14705                         bbr->bbr_init_win_cheat = 0;
14706                 break;
14707         case TCP_BBR_HDWR_PACE:
14708                 BBR_OPTS_INC(tcp_hdwr_pacing);
14709                 if (optval){
14710                         bbr->bbr_hdw_pace_ena = 1;
14711                         bbr->bbr_attempt_hdwr_pace = 0;
14712                 } else {
14713                         bbr->bbr_hdw_pace_ena = 0;
14714 #ifdef RATELIMIT
14715                         if (bbr->bbr_hdrw_pacing) {
14716                                 bbr->bbr_hdrw_pacing = 0;
14717                                 in_pcbdetach_txrtlmt(bbr->rc_inp);
14718                         }
14719 #endif
14720                 }
14721                 break;
14722
14723         case TCP_DELACK:
14724                 BBR_OPTS_INC(tcp_delack);
14725                 if ((optval >= 0) && (optval < 100)) {
14726                         if (optval == 0) /* off */
14727                                 tp->t_delayed_ack = 0;
14728                         else if (optval == 1) /* "on" maps to 2 */
14729                                 tp->t_delayed_ack = 2;
14730                         else /* higher than 2 and less than 100 */
14731                                 tp->t_delayed_ack = optval;
14732                         if (tp->t_flags & TF_DELACK) {
14733                                 tp->t_flags &= ~TF_DELACK;
14734                                 tp->t_flags |= TF_ACKNOW;
14735                                 NET_EPOCH_ENTER(et);
14736                                 bbr_output(tp);
14737                                 NET_EPOCH_EXIT(et);
14738                         }
14739                 } else
14740                         error = EINVAL;
14741                 break;
14742         case TCP_RACK_PKT_DELAY:
14743                 /* RACK added ms i.e. rack-rtt + reord + N */
14744                 BBR_OPTS_INC(tcp_rack_pkt_delay);
14745                 bbr->r_ctl.rc_pkt_delay = optval;
14746                 break;
14747 #ifdef NETFLIX_PEAKRATE
14748         case TCP_MAXPEAKRATE:
14749                 BBR_OPTS_INC(tcp_maxpeak);
14750                 error = tcp_set_maxpeakrate(tp, optval);
14751                 if (!error)
14752                         tp->t_peakrate_thr = tp->t_maxpeakrate;
14753                 break;
14754 #endif
14755         case TCP_BBR_RETRAN_WTSO:
14756                 BBR_OPTS_INC(tcp_retran_wtso);
14757                 if (optval)
14758                         bbr->rc_resends_use_tso = 1;
14759                 else
14760                         bbr->rc_resends_use_tso = 0;
14761                 break;
14762         case TCP_DATA_AFTER_CLOSE:
14763                 BBR_OPTS_INC(tcp_data_ac);
14764                 if (optval)
14765                         bbr->rc_allow_data_af_clo = 1;
14766                 else
14767                         bbr->rc_allow_data_af_clo = 0;
14768                 break;
14769         case TCP_BBR_POLICER_DETECT:
14770                 BBR_OPTS_INC(tcp_policer_det);
14771                 if (bbr->rc_use_google == 0)
14772                         error = EINVAL;
14773                 else if (optval)
14774                         bbr->r_use_policer = 1;
14775                 else
14776                         bbr->r_use_policer = 0;
14777                 break;
14778
14779         case TCP_BBR_TSTMP_RAISES:
14780                 BBR_OPTS_INC(tcp_ts_raises);
14781                 if (optval)
14782                         bbr->ts_can_raise = 1;
14783                 else
14784                         bbr->ts_can_raise = 0;
14785                 break;
14786         case TCP_BBR_TMR_PACE_OH:
14787                 BBR_OPTS_INC(tcp_pacing_oh_tmr);
14788                 if (bbr->rc_use_google) {
14789                         error = EINVAL;
14790                 } else {
14791                         if (optval)
14792                                 bbr->r_ctl.rc_incr_tmrs = 1;
14793                         else
14794                                 bbr->r_ctl.rc_incr_tmrs = 0;
14795                 }
14796                 break;
14797         case TCP_BBR_PACE_OH:
14798                 BBR_OPTS_INC(tcp_pacing_oh);
14799                 if (bbr->rc_use_google) {
14800                         error = EINVAL;
14801                 } else {
14802                         if (optval > (BBR_INCL_TCP_OH|
14803                                       BBR_INCL_IP_OH|
14804                                       BBR_INCL_ENET_OH)) {
14805                                 error = EINVAL;
14806                                 break;
14807                         }
14808                         if (optval & BBR_INCL_TCP_OH)
14809                                 bbr->r_ctl.rc_inc_tcp_oh = 1;
14810                         else
14811                                 bbr->r_ctl.rc_inc_tcp_oh = 0;
14812                         if (optval & BBR_INCL_IP_OH)
14813                                 bbr->r_ctl.rc_inc_ip_oh = 1;
14814                         else
14815                                 bbr->r_ctl.rc_inc_ip_oh = 0;
14816                         if (optval & BBR_INCL_ENET_OH)
14817                                 bbr->r_ctl.rc_inc_enet_oh = 1;
14818                         else
14819                                 bbr->r_ctl.rc_inc_enet_oh = 0;
14820                 }
14821                 break;
14822         default:
14823                 return (tcp_default_ctloutput(so, sopt, inp, tp));
14824                 break;
14825         }
14826 #ifdef NETFLIX_STATS
14827         tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
14828 #endif
14829         INP_WUNLOCK(inp);
14830         return (error);
14831 }
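/*
 * Illustrative userland usage (not part of this file): once a connection is
 * on this stack, the options above are plain integers at the IPPROTO_TCP
 * level, e.g. (s is a hypothetical connected socket):
 *
 *	int optval = 1;
 *
 *	if (setsockopt(s, IPPROTO_TCP, TCP_BBR_HDWR_PACE,
 *	    &optval, sizeof(optval)) == -1)
 *		err(1, "TCP_BBR_HDWR_PACE");
 */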
14832
14833 /*
14834  * return 0 on success, error-num on failure
14835  */
14836 static int
14837 bbr_get_sockopt(struct socket *so, struct sockopt *sopt,
14838     struct inpcb *inp, struct tcpcb *tp, struct tcp_bbr *bbr)
14839 {
14840         int32_t error, optval;
14841
14842         /*
14843          * Because all our options are either boolean or an int, we can just
14844          * pull everything into optval and then unlock and copy. If we ever
14845          * add an option that is not an int, then this will have quite an
14846          * impact on this routine.
14847          */
14848         switch (sopt->sopt_name) {
14849         case TCP_BBR_PACE_PER_SEC:
14850                 optval = bbr->r_ctl.bbr_hptsi_per_second;
14851                 break;
14852         case TCP_BBR_PACE_DEL_TAR:
14853                 optval = bbr->r_ctl.bbr_hptsi_segments_delay_tar;
14854                 break;
14855         case TCP_BBR_PACE_SEG_MAX:
14856                 optval = bbr->r_ctl.bbr_hptsi_segments_max;
14857                 break;
14858         case TCP_BBR_MIN_TOPACEOUT:
14859                 optval = bbr->no_pacing_until;
14860                 break;
14861         case TCP_BBR_PACE_SEG_MIN:
14862                 optval = bbr->r_ctl.bbr_hptsi_bytes_min;
14863                 break;
14864         case TCP_BBR_PACE_CROSS:
14865                 optval = bbr->r_ctl.bbr_cross_over;
14866                 break;
14867         case TCP_BBR_ALGORITHM:
14868                 optval = bbr->rc_use_google;
14869                 break;
14870         case TCP_BBR_TSLIMITS:
14871                 optval = bbr->rc_use_ts_limit;
14872                 break;
14873         case TCP_BBR_IWINTSO:
14874                 optval = bbr->rc_init_win;
14875                 break;
14876         case TCP_BBR_STARTUP_PG:
14877                 optval = bbr->r_ctl.rc_startup_pg;
14878                 break;
14879         case TCP_BBR_DRAIN_PG:
14880                 optval = bbr->r_ctl.rc_drain_pg;
14881                 break;
14882         case TCP_BBR_PROBE_RTT_INT:
14883                 optval = bbr->r_ctl.rc_probertt_int;
14884                 break;
14885         case TCP_BBR_PROBE_RTT_LEN:
14886                 optval = (bbr->r_ctl.rc_rttprop.cur_time_limit / USECS_IN_SECOND);
14887                 break;
14888         case TCP_BBR_PROBE_RTT_GAIN:
14889                 optval = bbr->r_ctl.bbr_rttprobe_gain_val;
14890                 break;
14891         case TCP_BBR_STARTUP_LOSS_EXIT:
14892                 optval = bbr->rc_loss_exit;
14893                 break;
14894         case TCP_BBR_USEDEL_RATE:
14895                 INP_WUNLOCK(inp);
14896                 return (EINVAL);
14897         case TCP_BBR_MIN_RTO:
14898                 optval = bbr->r_ctl.rc_min_rto_ms;
14899                 break;
14900         case TCP_BBR_MAX_RTO:
14901                 optval = bbr->rc_max_rto_sec;
14902                 break;
14903         case TCP_RACK_PACE_MAX_SEG:
14904                 /* Max segments in a pace */
14905                 optval = bbr->r_ctl.rc_pace_max_segs;
14906                 break;
14907         case TCP_RACK_MIN_TO:
14908                 /* Minimum time between rack t-o's in ms */
14909                 optval = bbr->r_ctl.rc_min_to;
14910                 break;
14911         case TCP_RACK_REORD_THRESH:
14912                 /* RACK reorder threshold (shift amount) */
14913                 optval = bbr->r_ctl.rc_reorder_shift;
14914                 break;
14915         case TCP_RACK_REORD_FADE:
14916                 /* Does reordering fade after ms time */
14917                 optval = bbr->r_ctl.rc_reorder_fade;
14918                 break;
14919         case TCP_BBR_USE_RACK_CHEAT:
14920                 /* Do we use the rack cheat for rxt */
14921                 optval = bbr->bbr_use_rack_cheat;
14922                 break;
14923         case TCP_BBR_FLOOR_MIN_TSO:
14924                 optval = bbr->r_ctl.bbr_hptsi_segments_floor;
14925                 break;
14926         case TCP_BBR_UTTER_MAX_TSO:
14927                 optval = bbr->r_ctl.bbr_utter_max;
14928                 break;
14929         case TCP_BBR_SEND_IWND_IN_TSO:
14930                 /* Do we send TSO-sized segments initially */
14931                 optval = bbr->bbr_init_win_cheat;
14932                 break;
14933         case TCP_BBR_EXTRA_STATE:
14934                 optval = bbr->rc_use_idle_restart;
14935                 break;
14936         case TCP_RACK_TLP_THRESH:
14937                 /* RACK TLP threshold i.e. srtt+(srtt/N) */
14938                 optval = bbr->rc_tlp_threshold;
14939                 break;
14940         case TCP_RACK_PKT_DELAY:
14941                 /* RACK added ms i.e. rack-rtt + reord + N */
14942                 optval = bbr->r_ctl.rc_pkt_delay;
14943                 break;
14944         case TCP_BBR_RETRAN_WTSO:
14945                 optval = bbr->rc_resends_use_tso;
14946                 break;
14947         case TCP_DATA_AFTER_CLOSE:
14948                 optval = bbr->rc_allow_data_af_clo;
14949                 break;
14950         case TCP_DELACK:
14951                 optval = tp->t_delayed_ack;
14952                 break;
14953         case TCP_BBR_HDWR_PACE:
14954                 optval = bbr->bbr_hdw_pace_ena;
14955                 break;
14956         case TCP_BBR_POLICER_DETECT:
14957                 optval = bbr->r_use_policer;
14958                 break;
14959         case TCP_BBR_TSTMP_RAISES:
14960                 optval = bbr->ts_can_raise;
14961                 break;
14962         case TCP_BBR_TMR_PACE_OH:
14963                 optval = bbr->r_ctl.rc_incr_tmrs;
14964                 break;
14965         case TCP_BBR_PACE_OH:
14966                 optval = 0;
14967                 if (bbr->r_ctl.rc_inc_tcp_oh)
14968                         optval |= BBR_INCL_TCP_OH;
14969                 if (bbr->r_ctl.rc_inc_ip_oh)
14970                         optval |= BBR_INCL_IP_OH;
14971                 if (bbr->r_ctl.rc_inc_enet_oh)
14972                         optval |= BBR_INCL_ENET_OH;
14973                 break;
14974         default:
14975                 return (tcp_default_ctloutput(so, sopt, inp, tp));
14976                 break;
14977         }
14978         INP_WUNLOCK(inp);
14979         error = sooptcopyout(sopt, &optval, sizeof optval);
14980         return (error);
14981 }
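/*
 * The matching read-back from userland is likewise a single int, e.g.
 * (again with a hypothetical socket s):
 *
 *	int optval;
 *	socklen_t optlen = sizeof(optval);
 *
 *	if (getsockopt(s, IPPROTO_TCP, TCP_BBR_PACE_PER_SEC,
 *	    &optval, &optlen) == -1)
 *		err(1, "TCP_BBR_PACE_PER_SEC");
 */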
14982
14983 /*
14984  * return 0 on success, error-num on failure
14985  */
14986 static int
14987 bbr_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
14988 {
14989         int32_t error = EINVAL;
14990         struct tcp_bbr *bbr;
14991
14992         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
14993         if (bbr == NULL) {
14994                 /* Huh? */
14995                 goto out;
14996         }
14997         if (sopt->sopt_dir == SOPT_SET) {
14998                 return (bbr_set_sockopt(so, sopt, inp, tp, bbr));
14999         } else if (sopt->sopt_dir == SOPT_GET) {
15000                 return (bbr_get_sockopt(so, sopt, inp, tp, bbr));
15001         }
15002 out:
15003         INP_WUNLOCK(inp);
15004         return (error);
15005 }
15006
15007 static int
15008 bbr_pru_options(struct tcpcb *tp, int flags)
15009 {
15010         if (flags & PRUS_OOB)
15011                 return (EOPNOTSUPP);
15012         return (0);
15013 }
15014
15015 struct tcp_function_block __tcp_bbr = {
15016         .tfb_tcp_block_name = __XSTRING(STACKNAME),
15017         .tfb_tcp_output = bbr_output,
15018         .tfb_do_queued_segments = ctf_do_queued_segments,
15019         .tfb_do_segment_nounlock = bbr_do_segment_nounlock,
15020         .tfb_tcp_do_segment = bbr_do_segment,
15021         .tfb_tcp_ctloutput = bbr_ctloutput,
15022         .tfb_tcp_fb_init = bbr_init,
15023         .tfb_tcp_fb_fini = bbr_fini,
15024         .tfb_tcp_timer_stop_all = bbr_stopall,
15025         .tfb_tcp_timer_activate = bbr_timer_activate,
15026         .tfb_tcp_timer_active = bbr_timer_active,
15027         .tfb_tcp_timer_stop = bbr_timer_stop,
15028         .tfb_tcp_rexmit_tmr = bbr_remxt_tmr,
15029         .tfb_tcp_handoff_ok = bbr_handoff_ok,
15030         .tfb_tcp_mtu_chg = bbr_mtu_chg,
15031         .tfb_pru_options = bbr_pru_options,
15032 };
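/*
 * A hedged sketch of how a connection ends up on this function block (with
 * the names registered below): either system-wide via
 * "sysctl net.inet.tcp.functions_default=bbr", or per socket with the
 * TCP_FUNCTION_BLK option, roughly:
 *
 *	struct tcp_function_set tfs;
 *
 *	memset(&tfs, 0, sizeof(tfs));
 *	strlcpy(tfs.function_set_name, "bbr", sizeof(tfs.function_set_name));
 *	setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &tfs, sizeof(tfs));
 */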
15033
15034 static const char *bbr_stack_names[] = {
15035         __XSTRING(STACKNAME),
15036 #ifdef STACKALIAS
15037         __XSTRING(STACKALIAS),
15038 #endif
15039 };
15040
15041 static bool bbr_mod_inited = false;
15042
15043 static int
15044 tcp_addbbr(module_t mod, int32_t type, void *data)
15045 {
15046         int32_t err = 0;
15047         int num_stacks;
15048
15049         switch (type) {
15050         case MOD_LOAD:
15051                 printf("Attempting to load " __XSTRING(MODNAME) "\n");
15052                 bbr_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
15053                     sizeof(struct bbr_sendmap),
15054                     NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
15055                 bbr_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
15056                     sizeof(struct tcp_bbr),
15057                     NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
15058                 sysctl_ctx_init(&bbr_sysctl_ctx);
15059                 bbr_sysctl_root = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
15060                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
15061                     OID_AUTO,
15062 #ifdef STACKALIAS
15063                     __XSTRING(STACKALIAS),
15064 #else
15065                     __XSTRING(STACKNAME),
15066 #endif
15067                     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
15068                     "");
15069                 if (bbr_sysctl_root == NULL) {
15070                         printf("Failed to add sysctl node\n");
15071                         err = EFAULT;
15072                         goto free_uma;
15073                 }
15074                 bbr_init_sysctls();
15075                 num_stacks = nitems(bbr_stack_names);
15076                 err = register_tcp_functions_as_names(&__tcp_bbr, M_WAITOK,
15077                     bbr_stack_names, &num_stacks);
15078                 if (err) {
15079                         printf("Failed to register %s stack name for "
15080                             "%s module\n", bbr_stack_names[num_stacks],
15081                             __XSTRING(MODNAME));
15082                         sysctl_ctx_free(&bbr_sysctl_ctx);
15083         free_uma:
15084                         uma_zdestroy(bbr_zone);
15085                         uma_zdestroy(bbr_pcb_zone);
15086                         bbr_counter_destroy();
15087                         printf("Failed to register " __XSTRING(MODNAME)
15088                             " module err:%d\n", err);
15089                         return (err);
15090                 }
15091                 tcp_lro_reg_mbufq();
15092                 bbr_mod_inited = true;
15093                 printf(__XSTRING(MODNAME) " is now available\n");
15094                 break;
15095         case MOD_QUIESCE:
15096                 err = deregister_tcp_functions(&__tcp_bbr, true, false);
15097                 break;
15098         case MOD_UNLOAD:
15099                 err = deregister_tcp_functions(&__tcp_bbr, false, true);
15100                 if (err == EBUSY)
15101                         break;
15102                 if (bbr_mod_inited) {
15103                         uma_zdestroy(bbr_zone);
15104                         uma_zdestroy(bbr_pcb_zone);
15105                         sysctl_ctx_free(&bbr_sysctl_ctx);
15106                         bbr_counter_destroy();
15107                         printf(__XSTRING(MODNAME)
15108                             " is now no longer available\n");
15109                         bbr_mod_inited = false;
15110                 }
15111                 tcp_lro_dereg_mbufq();
15112                 err = 0;
15113                 break;
15114         default:
15115                 return (EOPNOTSUPP);
15116         }
15117         return (err);
15118 }
15119
15120 static moduledata_t tcp_bbr = {
15121         .name = __XSTRING(MODNAME),
15122         .evhand = tcp_addbbr,
15123         .priv = 0
15124 };
15125
15126 MODULE_VERSION(MODNAME, 1);
15127 DECLARE_MODULE(MODNAME, tcp_bbr, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
15128 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
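/*
 * Operationally (illustrative): the module is loaded with "kldload tcp_bbr",
 * after which "bbr" appears in net.inet.tcp.functions_available.
 */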