/*-
 * Copyright (c) 2016-9
 *      Netflix Inc.
 *      All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
/**
 * Author: Randall Stewart <rrs@netflix.com>
 * This work is based on the ACM Queue paper
 * "BBR: Congestion-Based Congestion Control"
 * and also numerous discussions with Neal, Yuchung and Van.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include "opt_kern_tls.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#ifdef KERN_TLS
#include <sys/ktls.h>
#endif
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/eventhandler.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/tim_filter.h>
#include <sys/time.h>
#include <vm/uma.h>
#include <sys/kern_prefetch.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>

#define TCPSTATES               /* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>    /* required for icmp_var.h */
#include <netinet/icmp_var.h>   /* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#define TCPOUTFLAGS
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_hpts.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_lro.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif                          /* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcp_fastopen.h>

#include <netipsec/ipsec_support.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/ethernet.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif                          /* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif

#include "sack_filter.h"
#include "tcp_bbr.h"
#include "rack_bbr_common.h"
uma_zone_t bbr_zone;
uma_zone_t bbr_pcb_zone;

struct sysctl_ctx_list bbr_sysctl_ctx;
struct sysctl_oid *bbr_sysctl_root;

#define TCPT_RANGESET_NOSLOP(tv, value, tvmin, tvmax) do { \
        (tv) = (value); \
        if ((u_long)(tv) < (u_long)(tvmin)) \
                (tv) = (tvmin); \
        if ((u_long)(tv) > (u_long)(tvmax)) \
                (tv) = (tvmax); \
} while(0)
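
/*
 * Illustrative use (hypothetical values, not from this file's callers):
 * the macro clamps "value" into [tvmin, tvmax] with no slop applied.
 * For instance, with the persist bounds declared below:
 *
 *      uint32_t to;
 *      TCPT_RANGESET_NOSLOP(to, 125000, bbr_persist_min, bbr_persist_max);
 *      -> to == 250000, raised to bbr_persist_min
 *      TCPT_RANGESET_NOSLOP(to, 5000000, bbr_persist_min, bbr_persist_max);
 *      -> to == 1000000, capped at bbr_persist_max
 */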

/*#define BBR_INVARIANT 1*/

/*
 * initial window
 */
static uint32_t bbr_def_init_win = 10;
static int32_t bbr_persist_min = 250000;        /* 250ms */
static int32_t bbr_persist_max = 1000000;       /* 1 Second */
static int32_t bbr_cwnd_may_shrink = 0;
static int32_t bbr_cwndtarget_rtt_touse = BBR_RTT_PROP;
static int32_t bbr_num_pktepo_for_del_limit = BBR_NUM_RTTS_FOR_DEL_LIMIT;
static int32_t bbr_hardware_pacing_limit = 8000;
static int32_t bbr_quanta = 3;  /* How much extra quanta do we get? */
static int32_t bbr_no_retran = 0;


static int32_t bbr_error_base_paceout = 10000; /* usec to pace */
static int32_t bbr_max_net_error_cnt = 10;
/* Should the following be dynamic too -- loss wise */
static int32_t bbr_rtt_gain_thresh = 0;
/* Measurement controls */
static int32_t bbr_use_google_algo = 1;
static int32_t bbr_ts_limiting = 1;
static int32_t bbr_ts_can_raise = 0;
static int32_t bbr_do_red = 600;
static int32_t bbr_red_scale = 20000;
static int32_t bbr_red_mul = 1;
static int32_t bbr_red_div = 2;
static int32_t bbr_red_growth_restrict = 1;
static int32_t  bbr_target_is_bbunit = 0;
static int32_t bbr_drop_limit = 0;
/*
 * How much gain do we need to see to
 * stay in startup?
 */
static int32_t bbr_marks_rxt_sack_passed = 0;
static int32_t bbr_start_exit = 25;
static int32_t bbr_low_start_exit = 25; /* When we are in reduced gain */
static int32_t bbr_startup_loss_thresh = 2000;  /* 20.00% loss */
static int32_t bbr_hptsi_max_mul = 1;   /* These two mul/div assure a min pacing */
static int32_t bbr_hptsi_max_div = 2;   /* time, 0 means turned off. We need this
                                         * if we go back ever to where the pacer
                                         * has priority over timers.
                                         */
static int32_t bbr_policer_call_from_rack_to = 0;
static int32_t bbr_policer_detection_enabled = 1;
static int32_t bbr_min_measurements_req = 1;    /* We need at least 2
                                                 * measurements before we are
                                                 * "good"; note that 2 == 1.
                                                 * This is because we use a >
                                                 * comparison. This means if
                                                 * min_measure was 0, it takes
                                                 * num-measures > min(0) and
                                                 * you get 1 measurement and
                                                 * you are good. Set to 1, you
                                                 * have to have two
                                                 * measurements (this is done
                                                 * to prevent it from being ok
                                                 * to have no measurements). */
static int32_t bbr_no_pacing_until = 4;

static int32_t bbr_min_usec_delta = 20000;      /* 20,000 usecs */
static int32_t bbr_min_peer_delta = 20;         /* 20 units */
static int32_t bbr_delta_percent = 150;         /* 15.0 % */

static int32_t bbr_target_cwnd_mult_limit = 8;
/*
 * bbr_cwnd_min_val is the number of
 * segments we hold to in the RTT probe
 * state, typically 4.
 */
static int32_t bbr_cwnd_min_val = BBR_PROBERTT_NUM_MSS;


static int32_t bbr_cwnd_min_val_hs = BBR_HIGHSPEED_NUM_MSS;

static int32_t bbr_gain_to_target = 1;
static int32_t bbr_gain_gets_extra_too = 1;
/*
 * bbr_high_gain is the 2/ln(2) value we need
 * to double the sending rate in startup. This
 * is used for both cwnd and hptsi gains.
 */
static int32_t bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
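/*
 * Worked numbers (illustrative): 2/ln(2) ~= 2.885, so bbr_high_gain
 * above encodes that ratio in fixed point, i.e.
 * bbr_high_gain / BBR_UNIT ~= 2.885 (the +1 rounds the integer
 * division up). Pacing at ~2.885x the estimated bandwidth is what
 * allows startup to double the delivery rate each round trip.
 */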
static int32_t bbr_startup_lower = BBR_UNIT * 1500 / 1000 + 1;
static int32_t bbr_use_lower_gain_in_startup = 1;

/* thresholds for reduction on drain in sub-states/drain */
static int32_t bbr_drain_rtt = BBR_SRTT;
static int32_t bbr_drain_floor = 88;
static int32_t google_allow_early_out = 1;
static int32_t google_consider_lost = 1;
static int32_t bbr_drain_drop_mul = 4;
static int32_t bbr_drain_drop_div = 5;
static int32_t bbr_rand_ot = 50;
static int32_t bbr_can_force_probertt = 0;
static int32_t bbr_can_adjust_probertt = 1;
static int32_t bbr_probertt_sets_rtt = 0;
static int32_t bbr_can_use_ts_for_rtt = 1;
static int32_t bbr_is_ratio = 0;
static int32_t bbr_sub_drain_app_limit = 1;
static int32_t bbr_prtt_slam_cwnd = 1;
static int32_t bbr_sub_drain_slam_cwnd = 1;
static int32_t bbr_slam_cwnd_in_main_drain = 1;
static int32_t bbr_filter_len_sec = 6;  /* How long does the rttProp filter
                                         * hold */
static uint32_t bbr_rtt_probe_limit = (USECS_IN_SECOND * 4);
/*
 * bbr_drain_gain is the reverse of the high_gain,
 * designed to drain back out the standing queue
 * that is formed in startup (by its larger
 * hptsi gain), thus draining the packets
 * in flight.
 */
static int32_t bbr_drain_gain = BBR_UNIT * 1000 / 2885;
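/*
 * Worked numbers (illustrative, assuming BBR_UNIT == 256 per
 * tcp_bbr.h): bbr_drain_gain = 256 * 1000 / 2885 == 88, which is the
 * reciprocal of bbr_high_gain (739): 88 * 739 / 256 ~= 254 ~= BBR_UNIT,
 * so one drain round at this gain roughly undoes the queue built by
 * one startup round at the high gain.
 */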
static int32_t bbr_rttprobe_gain = 192;

/*
 * The cwnd_gain is the default cwnd gain applied when
 * calculating a target cwnd. Note that the cwnd is
 * a secondary factor in the way BBR works (see the
 * paper and think about it, it will take some time).
 * Basically the hptsi_gain spreads the packets out
 * so you never get more than BDP to the peer even
 * if the cwnd is high. In our implementation that
 * means in non-recovery/retransmission scenarios
 * cwnd will never be reached by the flight-size.
 */
static int32_t bbr_cwnd_gain = BBR_UNIT * 2;
static int32_t bbr_tlp_type_to_use = BBR_SRTT;
static int32_t bbr_delack_time = 100000;        /* 100ms in usecs */
static int32_t bbr_sack_not_required = 0;       /* set to one to allow non-sack to use bbr */
static int32_t bbr_initial_bw_bps = 62500;      /* 500kbps in bytes per second */
static int32_t bbr_ignore_data_after_close = 1;
static int16_t bbr_hptsi_gain[] = {
        (BBR_UNIT * 5 / 4),
        (BBR_UNIT * 3 / 4),
        BBR_UNIT,
        BBR_UNIT,
        BBR_UNIT,
        BBR_UNIT,
        BBR_UNIT,
        BBR_UNIT
};
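/*
 * The eight bbr_hptsi_gain[] entries above appear to follow the
 * PROBE_BW gain cycle from the BBR paper: one substate paces at 5/4
 * to probe for more bandwidth, the next at 3/4 to drain any queue
 * that probe built, and the remaining six cruise at 1.0 (BBR_UNIT).
 */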
int32_t bbr_use_rack_resend_cheat = 1;
int32_t bbr_sends_full_iwnd = 1;

#define BBR_HPTSI_GAIN_MAX 8
/*
 * The BBR module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Yuchung Cheng's RACK TCP (for which it's named) that
 *    will stop us using the number of dup acks and instead
 *    use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *    of Dukkipati et al.
 * - Van Jacobson et al.'s BBR.
 *
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement BBR and RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports SACK
 * initially and then assure that the RACK state matches the connection
 * state before calling the state's do_segment function. Data processing
 * of inbound segments also now happens in the hpts_do_segment in general
 * with only one exception. This is so we can keep the connection on
 * a single CPU.
 *
 * Each state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also over-written with a new version since it
 * must maintain the new rack scoreboard and has had hptsi
 * integrated as a requirement. Still to do is to eliminate the
 * use of the callout_() system and use the hpts for all
 * timers as well.
 */
static uint32_t bbr_rtt_probe_time = 200000;    /* 200ms in microseconds */
static uint32_t bbr_rtt_probe_cwndtarg = 4;     /* How many mss's outstanding */
static const int32_t bbr_min_req_free = 2;      /* The min we must have on the
                                                 * free list */
static int32_t bbr_tlp_thresh = 1;
static int32_t bbr_reorder_thresh = 2;
static int32_t bbr_reorder_fade = 60000000;     /* 0 - never fade, def
                                                 * 60,000,000 - 60 seconds */
static int32_t bbr_pkt_delay = 1000;
static int32_t bbr_min_to = 1000;       /* Number of usec's minimum timeout */
static int32_t bbr_incr_timers = 1;

static int32_t bbr_tlp_min = 10000;     /* 10ms in usecs */
static int32_t bbr_delayed_ack_time = 200000;   /* 200ms in usecs */
static int32_t bbr_exit_startup_at_loss = 1;

/*
 * bbr_lt_bw_ratio is 1/8th
 * bbr_lt_bw_diff is  < 4 Kbit/sec
 */
static uint64_t bbr_lt_bw_diff = 4000 / 8;      /* In bytes per second */
static uint64_t bbr_lt_bw_ratio = 8;    /* For 1/8th */
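/*
 * Worked numbers (illustrative): bbr_lt_bw_diff = 4000 / 8 = 500
 * bytes/sec, i.e. 4 Kbit/sec. Roughly, the long-term (policed)
 * bandwidth sampling treats two interval samples as "the same" when
 * they differ by less than 1/8th (bw / bbr_lt_bw_ratio) or by less
 * than that 500 byte/sec floor.
 */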
static uint32_t bbr_lt_bw_max_rtts = 48;        /* How many rtt's do we use
                                                 * the lt_bw for */
static uint32_t bbr_lt_intvl_min_rtts = 4;      /* Min num of RTT's to measure
                                                 * lt_bw */
static int32_t bbr_lt_intvl_fp = 0;             /* False positive epoch diff */
static int32_t bbr_lt_loss_thresh = 196;        /* Lost vs delivered % */
static int32_t bbr_lt_fd_thresh = 100;          /* false detection % */

static int32_t bbr_verbose_logging = 0;
/*
 * Currently regular tcp has a rto_min of 30ms;
 * the backoff goes 12 times, so that ends up
 * being a total of 122.850 seconds before a
 * connection is killed.
 */
static int32_t bbr_rto_min_ms = 30;     /* 30ms same as main freebsd */
static int32_t bbr_rto_max_sec = 4;     /* 4 seconds */

/****************************************************/
/* DEFAULT TSO SIZING  (cpu performance impacting)  */
/****************************************************/
/* What amount is our formula using to get TSO size */
static int32_t bbr_hptsi_per_second = 1000;

/*
 * For hptsi on connections under bbr_cross_over, a delay
 * target of 7ms (in usec) combined with a seg_max of 2
 * gets us close to identical google behavior in
 * TSO size selection (possibly more 1MSS sends).
 */
static int32_t bbr_hptsi_segments_delay_tar = 7000;

/* Does pacing delay include overheads in its time calculations? */
static int32_t bbr_include_enet_oh = 0;
static int32_t bbr_include_ip_oh = 1;
static int32_t bbr_include_tcp_oh = 1;
static int32_t bbr_google_discount = 10;

/* Do we use (nf mode) pkt-epoch to drive us or rttProp? */
static int32_t bbr_state_is_pkt_epoch = 0;
static int32_t bbr_state_drain_2_tar = 1;
/* What is the max the 0 - bbr_cross_over MBPS TSO target
 * can reach using our delay target. Note that this
 * value becomes the floor for the cross over
 * algorithm.
 */
static int32_t bbr_hptsi_segments_max = 2;
static int32_t bbr_hptsi_segments_floor = 1;
static int32_t bbr_hptsi_utter_max = 0;

/* What is the min the 0 - bbr_cross_over MBPS TSO target can be */
static int32_t bbr_hptsi_bytes_min = 1460;
static int32_t bbr_all_get_min = 0;

/* Cross over point from algo-a to algo-b */
static uint32_t bbr_cross_over = TWENTY_THREE_MBPS;

/* Do we deal with our restart state? */
static int32_t bbr_uses_idle_restart = 0;
static int32_t bbr_idle_restart_threshold = 100000;     /* 100ms in usecs */

/* Do we allow hardware pacing? */
static int32_t bbr_allow_hdwr_pacing = 0;
static int32_t bbr_hdwr_pace_adjust = 2;        /* multiplier when we calc the tso size */
static int32_t bbr_hdwr_pace_floor = 1;
static int32_t bbr_hdwr_pacing_delay_cnt = 10;

/****************************************************/
static int32_t bbr_resends_use_tso = 0;
static int32_t bbr_tlp_max_resend = 2;
static int32_t bbr_sack_block_limit = 128;

#define  BBR_MAX_STAT 19
counter_u64_t bbr_state_time[BBR_MAX_STAT];
counter_u64_t bbr_state_lost[BBR_MAX_STAT];
counter_u64_t bbr_state_resend[BBR_MAX_STAT];
counter_u64_t bbr_stat_arry[BBR_STAT_SIZE];
counter_u64_t bbr_opts_arry[BBR_OPTS_SIZE];
counter_u64_t bbr_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t bbr_flows_whdwr_pacing;
counter_u64_t bbr_flows_nohdwr_pacing;

counter_u64_t bbr_nohdwr_pacing_enobuf;
counter_u64_t bbr_hdwr_pacing_enobuf;

static inline uint64_t bbr_get_bw(struct tcp_bbr *bbr);

/*
 * Static definitions we need for forward declarations.
 */
static uint32_t
bbr_get_pacing_length(struct tcp_bbr *bbr, uint16_t gain,
    uint32_t useconds_time, uint64_t bw);
static uint32_t
bbr_get_a_state_target(struct tcp_bbr *bbr, uint32_t gain);
static void
     bbr_set_state(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t win);
static void
bbr_set_probebw_gains(struct tcp_bbr *bbr,  uint32_t cts, uint32_t losses);
static void
bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int line,
                    int dolog);
static uint32_t
bbr_get_target_cwnd(struct tcp_bbr *bbr, uint64_t bw, uint32_t gain);
static void
bbr_state_change(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch,
                 int32_t pkt_epoch, uint32_t losses);
static uint32_t
bbr_calc_thresh_rack(struct tcp_bbr *bbr, uint32_t srtt, uint32_t cts, struct bbr_sendmap *rsm);
static uint32_t bbr_initial_cwnd(struct tcp_bbr *bbr, struct tcpcb *tp);
static uint32_t
bbr_calc_thresh_tlp(struct tcpcb *tp, struct tcp_bbr *bbr,
    struct bbr_sendmap *rsm, uint32_t srtt,
    uint32_t cts);
static void
bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts,
    int32_t line);
static void
     bbr_set_state_target(struct tcp_bbr *bbr, int line);
static void
     bbr_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts, int32_t line);

static void
     bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line);

static void
     tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t cts);

static void
     bbr_setup_red_bw(struct tcp_bbr *bbr, uint32_t cts);

static void
     bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied, uint32_t rtt,
                         uint32_t line, uint8_t is_start, uint16_t set);

static struct bbr_sendmap *
            bbr_find_lowest_rsm(struct tcp_bbr *bbr);
static __inline uint32_t
bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type);
static void
     bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which);

static void
bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, uint32_t time_since_sent, uint32_t srtt,
    uint32_t thresh, uint32_t to);
static void
     bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag);

static void
bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot,
    uint32_t del_by, uint32_t cts, uint32_t sloton, uint32_t prev_delay);

static void
bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr,
    uint32_t cts, int32_t line);
static void
     bbr_stop_all_timers(struct tcpcb *tp);
static void
     bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts);
static void
     bbr_check_probe_rtt_limits(struct tcp_bbr *bbr, uint32_t cts);
static void
     bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts);


static void
bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len,
    uint32_t cts, uint32_t usecs, uint64_t bw, uint32_t override, int mod);

static inline uint8_t
bbr_state_val(struct tcp_bbr *bbr)
{
        return(bbr->rc_bbr_substate);
}

static inline uint32_t
get_min_cwnd(struct tcp_bbr *bbr)
{
        int mss;

        mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs);
        if (bbr_get_rtt(bbr, BBR_RTT_PROP) < BBR_HIGH_SPEED)
                return (bbr_cwnd_min_val_hs * mss);
        else
                return (bbr_cwnd_min_val * mss);
}
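
/*
 * Example (hypothetical numbers): with t_maxseg = 1460,
 * rc_last_options = 12 (timestamps) and a larger rc_pace_max_segs,
 * mss = 1448. A low-rttProp (high speed) path floors cwnd at
 * bbr_cwnd_min_val_hs * 1448 bytes; otherwise the probe-rtt floor of
 * bbr_cwnd_min_val (typically 4) * 1448 = 5792 bytes applies.
 */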

static uint32_t
bbr_get_persists_timer_val(struct tcpcb *tp, struct tcp_bbr *bbr)
{
        uint64_t srtt, var;
        uint64_t ret_val;

        bbr->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
        if (tp->t_srtt == 0) {
                srtt = (uint64_t)BBR_INITIAL_RTO;
                var = 0;
        } else {
                srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
                var = ((uint64_t)TICKS_2_USEC(tp->t_rttvar) >> TCP_RTT_SHIFT);
        }
        TCPT_RANGESET_NOSLOP(ret_val, ((srtt + var) * tcp_backoff[tp->t_rxtshift]),
            bbr_persist_min, bbr_persist_max);
        return ((uint32_t)ret_val);
}
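
/*
 * Example (hypothetical numbers): a smoothed RTT of 50ms and a
 * variance of 10ms give (50000 + 10000) usecs post-shift; with
 * t_rxtshift = 0 the backoff multiplier is 1, so 60000 is clamped up
 * to bbr_persist_min and the persist timer fires in 250ms. Only after
 * enough backoffs does the raw value land inside the [250ms, 1s] window.
 */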

static uint32_t
bbr_timer_start(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
{
        /*
         * Start the FR timer. We do this based on getting the first one in
         * the rc_tmap. Note that if it's NULL we must stop the timer. In all
         * events we need to stop the running timer (if it's running) before
         * starting the new one.
         */
        uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse;
        int32_t idx;
        int32_t is_tlp_timer = 0;
        struct bbr_sendmap *rsm;

        if (bbr->rc_all_timers_stopped) {
                /* All timers have been stopped, none are to run */
                return (0);
        }
        if (bbr->rc_in_persist) {
                /* We can't start any timer in persists */
                return (bbr_get_persists_timer_val(tp, bbr));
        }
        rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
        if ((rsm == NULL) ||
            ((tp->t_flags & TF_SACK_PERMIT) == 0) ||
            (tp->t_state < TCPS_ESTABLISHED)) {
                /* Nothing on the send map */
activate_rxt:
                if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
                        uint64_t tov;

                        time_since_sent = 0;
                        rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
                        if (rsm) {
                                idx = rsm->r_rtr_cnt - 1;
                                if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], bbr->r_ctl.rc_tlp_rxt_last_time))
                                        tstmp_touse = rsm->r_tim_lastsent[idx];
                                else
                                        tstmp_touse = bbr->r_ctl.rc_tlp_rxt_last_time;
                                if (TSTMP_GT(tstmp_touse, cts))
                                    time_since_sent = cts - tstmp_touse;
                        }
                        bbr->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
                        if (tp->t_srtt == 0)
                                tov = BBR_INITIAL_RTO;
                        else
                                tov = ((uint64_t)(TICKS_2_USEC(tp->t_srtt) +
                                    ((uint64_t)TICKS_2_USEC(tp->t_rttvar) * (uint64_t)4)) >> TCP_RTT_SHIFT);
                        if (tp->t_rxtshift)
                                tov *= tcp_backoff[tp->t_rxtshift];
                        if (tov > time_since_sent)
                                tov -= time_since_sent;
                        else
                                tov = bbr->r_ctl.rc_min_to;
                        TCPT_RANGESET_NOSLOP(to, tov,
                            (bbr->r_ctl.rc_min_rto_ms * MS_IN_USEC),
                            (bbr->rc_max_rto_sec * USECS_IN_SECOND));
                        bbr_log_timer_var(bbr, 2, cts, 0, srtt, 0, to);
                        return (to);
                }
                return (0);
        }
        if (rsm->r_flags & BBR_ACKED) {
                rsm = bbr_find_lowest_rsm(bbr);
                if (rsm == NULL) {
                        /* No lowest? */
                        goto activate_rxt;
                }
        }
        /* Convert from ms to usecs */
        if (rsm->r_flags & BBR_SACK_PASSED) {
                if ((tp->t_flags & TF_SENTFIN) &&
                    ((tp->snd_max - tp->snd_una) == 1) &&
                    (rsm->r_flags & BBR_HAS_FIN)) {
                        /*
                         * We don't start a bbr rack timer if all we have is
                         * a FIN outstanding.
                         */
                        goto activate_rxt;
                }
                srtt = bbr_get_rtt(bbr, BBR_RTT_RACK);
                thresh = bbr_calc_thresh_rack(bbr, srtt, cts, rsm);
                idx = rsm->r_rtr_cnt - 1;
                exp = rsm->r_tim_lastsent[idx] + thresh;
                if (SEQ_GEQ(exp, cts)) {
                        to = exp - cts;
                        if (to < bbr->r_ctl.rc_min_to) {
                                to = bbr->r_ctl.rc_min_to;
                        }
                } else {
                        to = bbr->r_ctl.rc_min_to;
                }
        } else {
                /* Ok we need to do a TLP not RACK */
                if (bbr->rc_tlp_in_progress != 0) {
                        /*
                         * The previous send was a TLP.
                         */
                        goto activate_rxt;
                }
                rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_tmap, bbr_sendmap, r_tnext);
                if (rsm == NULL) {
                        /* We found no rsm to TLP with. */
                        goto activate_rxt;
                }
                if (rsm->r_flags & BBR_HAS_FIN) {
                        /* If it's a FIN we don't do TLP */
                        rsm = NULL;
                        goto activate_rxt;
                }
                time_since_sent = 0;
                idx = rsm->r_rtr_cnt - 1;
                if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], bbr->r_ctl.rc_tlp_rxt_last_time))
                        tstmp_touse = rsm->r_tim_lastsent[idx];
                else
                        tstmp_touse = bbr->r_ctl.rc_tlp_rxt_last_time;
                if (TSTMP_GT(tstmp_touse, cts))
                    time_since_sent = cts - tstmp_touse;
                is_tlp_timer = 1;
                srtt = bbr_get_rtt(bbr, bbr_tlp_type_to_use);
                thresh = bbr_calc_thresh_tlp(tp, bbr, rsm, srtt, cts);
                if (thresh > time_since_sent)
                        to = thresh - time_since_sent;
                else
                        to = bbr->r_ctl.rc_min_to;
                if (to > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) {
                        /*
                         * If the TLP time works out to larger than the max
                         * RTO, let's not do TLP, just RTO.
                         */
                        goto activate_rxt;
                }
                if ((bbr->rc_tlp_rtx_out == 1) &&
                    (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq)) {
                        /*
                         * Second retransmit of the same TLP,
                         * let's not.
                         */
                        bbr->rc_tlp_rtx_out = 0;
                        goto activate_rxt;
                }
                if (rsm->r_start != bbr->r_ctl.rc_last_tlp_seq) {
                        /*
                         * The tail is no longer the last one I did a probe
                         * on
                         */
                        bbr->r_ctl.rc_tlp_seg_send_cnt = 0;
                        bbr->r_ctl.rc_last_tlp_seq = rsm->r_start;
                }
        }
        if (is_tlp_timer == 0) {
                BBR_STAT_INC(bbr_to_arm_rack);
                bbr->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
        } else {
                bbr_log_timer_var(bbr, 1, cts, time_since_sent, srtt, thresh, to);
                if (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend) {
                        /*
                         * We have exceeded how many times we can retransmit
                         * the current TLP timer, switch to the RTO timer.
                         */
                        goto activate_rxt;
                } else {
                        BBR_STAT_INC(bbr_to_arm_tlp);
                        bbr->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
                }
        }
        return (to);
}

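/*
 * Descriptive note (interpretation of fields declared elsewhere): this
 * returns the smallest amount we will pace out in one send, the
 * configured minimum pacing segment size less the option overhead of
 * the last segment sent.
 */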
static inline int32_t
bbr_minseg(struct tcp_bbr *bbr)
{
        return (bbr->r_ctl.rc_pace_min_segs - bbr->rc_last_options);
}

static void
bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t slot, uint32_t tot_len)
{
        struct inpcb *inp;
        struct hpts_diag diag;
        uint32_t delayed_ack = 0;
        uint32_t left = 0;
        uint32_t hpts_timeout;
        uint8_t stopped;
        int32_t delay_calc = 0;
        uint32_t prev_delay = 0;

        inp = tp->t_inpcb;
        if (inp->inp_in_hpts) {
                /* A previous call is already set up */
                return;
        }
        if ((tp->t_state == TCPS_CLOSED) ||
            (tp->t_state == TCPS_LISTEN)) {
                return;
        }
        stopped = bbr->rc_tmr_stopped;
        if (stopped && TSTMP_GT(bbr->r_ctl.rc_timer_exp, cts)) {
                left = bbr->r_ctl.rc_timer_exp - cts;
        }
        bbr->r_ctl.rc_hpts_flags = 0;
        bbr->r_ctl.rc_timer_exp = 0;
        prev_delay = bbr->r_ctl.rc_last_delay_val;
        if (bbr->r_ctl.rc_last_delay_val &&
            (slot == 0)) {
                /*
                 * If a previous pacer delay was in place we
                 * are not coming from the output side (where
                 * we calculate a delay, more likely a timer).
                 */
                slot = bbr->r_ctl.rc_last_delay_val;
                if (TSTMP_GT(cts, bbr->rc_pacer_started)) {
                        /* Compensate for time passed  */
                        delay_calc = cts - bbr->rc_pacer_started;
                        if (delay_calc <= slot)
                                slot -= delay_calc;
                }
        }
        /* Do we have an early send to make up for by pushing out the pacing time? */
        if (bbr->r_agg_early_set) {
                bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, slot, 0, bbr->r_agg_early_set, 2);
                slot += bbr->r_ctl.rc_agg_early;
                bbr->r_ctl.rc_agg_early = 0;
                bbr->r_agg_early_set = 0;
        }
        /* Are we running a total debt that needs to be compensated for? */
        if (bbr->r_ctl.rc_hptsi_agg_delay) {
                if (slot > bbr->r_ctl.rc_hptsi_agg_delay) {
                        /* We nuke the delay */
                        slot -= bbr->r_ctl.rc_hptsi_agg_delay;
                        bbr->r_ctl.rc_hptsi_agg_delay = 0;
                } else {
                        /* We nuke some of the delay, put in a minimal 100usecs */
                        bbr->r_ctl.rc_hptsi_agg_delay -= slot;
                        bbr->r_ctl.rc_last_delay_val = slot = 100;
                }
        }
        bbr->r_ctl.rc_last_delay_val = slot;
        hpts_timeout = bbr_timer_start(tp, bbr, cts);
        if (tp->t_flags & TF_DELACK) {
                if (bbr->rc_in_persist == 0) {
                        delayed_ack = bbr_delack_time;
                } else {
                        /*
                         * We are in persists and have
                         * gotten a new data element.
                         */
                        if (hpts_timeout > bbr_delack_time) {
                                /*
                                 * Lets make the persists timer (which acks)
                                 * be the smaller of hpts_timeout and bbr_delack_time.
                                 */
                                hpts_timeout = bbr_delack_time;
                        }
                }
        }
        if (delayed_ack &&
            ((hpts_timeout == 0) ||
             (delayed_ack < hpts_timeout))) {
                /* We need a Delayed ack timer */
                bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
                hpts_timeout = delayed_ack;
        }
        if (slot) {
                /* Mark that we have a pacing timer up */
                BBR_STAT_INC(bbr_paced_segments);
                bbr->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
        }
        /*
         * If no timers are going to run and we will fall off the hptsi
         * wheel, we resort to a keep-alive timer if it's configured.
         */
        if ((hpts_timeout == 0) &&
            (slot == 0)) {
                if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
                    (tp->t_state <= TCPS_CLOSING)) {
                        /*
                         * Ok we have no timer (persists, rack, tlp, rxt or
                         * del-ack), we don't have segments being paced. So
                         * all that is left is the keepalive timer.
                         */
                        if (TCPS_HAVEESTABLISHED(tp->t_state)) {
                                hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp));
                        } else {
                                hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp));
                        }
                        bbr->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
                }
        }
        if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
            (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
                /*
                 * RACK, TLP, persists and RXT timers all are restartable
                 * based on actions input, i.e. we received a packet (ack
                 * or sack) and that changes things (rw, or snd_una etc).
                 * Thus we can restart them with a new value. For
                 * keep-alive, delayed_ack we keep track of what was left
                 * and restart the timer with a smaller value.
                 */
                if (left < hpts_timeout)
                        hpts_timeout = left;
        }
        if (bbr->r_ctl.rc_incr_tmrs && slot &&
            (bbr->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
                /*
                 * If configured to do so, and the timer is either
                 * the TLP or RXT timer, we need to increase the timeout
                 * by the pacing time. Consider the bottleneck at my
                 * machine as an example, we are sending something
                 * to start a TLP on. The last packet won't be emitted
                 * fully until the pacing time (the bottleneck will hold
                 * the data in place). Once the packet is emitted that
                 * is when we want to start waiting for the TLP. This
                 * is most evident with hardware pacing (where the nic
                 * is holding the packet(s) before emitting). But it
                 * can also show up in the network so we do it for all
                 * cases. Technically we would take off one packet from
                 * this extra delay but this is easier and being more
                 * conservative is probably better.
                 */
                hpts_timeout += slot;
        }
        if (hpts_timeout) {
                /*
                 * Hack alert: for now we can't time-out over 2147 seconds (a
                 * bit more than 35min).
                 */
                if (hpts_timeout > 0x7ffffffe)
                        hpts_timeout = 0x7ffffffe;
                bbr->r_ctl.rc_timer_exp = cts + hpts_timeout;
        } else
                bbr->r_ctl.rc_timer_exp = 0;
        if ((slot) &&
            (bbr->rc_use_google ||
             bbr->output_error_seen ||
             (slot <= hpts_timeout))  ) {
                /*
                 * Tell LRO that it can queue packets while
                 * we pace.
                 */
                bbr->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
                if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
                    (bbr->rc_cwnd_limited == 0)) {
                        /*
                         * If we are not cwnd limited and we
                         * are running a rack timer, we put on
                         * the do-not-disturb even for sack.
                         */
                        inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
                } else
                        inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
                bbr->rc_pacer_started = cts;

                (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot),
                                           __LINE__, &diag);
                bbr->rc_timer_first = 0;
                bbr->bbr_timer_src = frm;
                bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1);
                bbr_log_hpts_diag(bbr, cts, &diag);
        } else if (hpts_timeout) {
                (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout),
                                           __LINE__, &diag);
                /*
                 * We add the flag here as well if the slot is set,
                 * since hpts will call in to clear the queue first before
                 * calling the output routine (which does our timers).
                 * We don't want to set the flag if it's just a timer,
                 * else the arrival of data (that causes us to send more)
                 * might get delayed. Imagine being on a keep-alive timer
                 * and a request comes in for more data.
                 */
                if (slot)
                        bbr->rc_pacer_started = cts;
                if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
                    (bbr->rc_cwnd_limited == 0)) {
                        /*
                         * For a rack timer, don't wake us even
                         * if a sack arrives as long as we are
                         * not cwnd limited.
                         */
                        bbr->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
                        inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
                } else {
                        /* All other timers wake us up */
                        bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
                        inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
                }
                bbr->bbr_timer_src = frm;
                bbr_log_to_start(bbr, cts, hpts_timeout, slot, 0);
                bbr_log_hpts_diag(bbr, cts, &diag);
                bbr->rc_timer_first = 1;
        }
        bbr->rc_tmr_stopped = 0;
        bbr_log_type_bbrsnd(bbr, tot_len, slot, delay_calc, cts, frm, prev_delay);
}

static void
bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sockbuf *sb)
{
        /*
         * We received an ack, and then did not call send or were bounced
         * out because the hpts was running. Now a timer is up as well, is it
         * the right timer?
         */
        struct inpcb *inp;
        struct bbr_sendmap *rsm;
        uint32_t hpts_timeout;
        int tmr_up;

        tmr_up = bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
        if (bbr->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
                return;
        rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
        if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
            (tmr_up == PACE_TMR_RXT)) {
                /* Should be an RXT */
                return;
        }
        inp = bbr->rc_inp;
        if (rsm == NULL) {
                /* Nothing outstanding? */
                if (tp->t_flags & TF_DELACK) {
                        if (tmr_up == PACE_TMR_DELACK)
                                /*
                                 * We are supposed to have delayed ack up
                                 * and we do
                                 */
                                return;
                } else if (sbavail(&inp->inp_socket->so_snd) &&
                    (tmr_up == PACE_TMR_RXT)) {
                        /*
                         * if we hit enobufs then we would expect the
                         * possibility of nothing outstanding and the RXT up
                         * (and the hptsi timer).
                         */
                        return;
                } else if (((V_tcp_always_keepalive ||
                            inp->inp_socket->so_options & SO_KEEPALIVE) &&
                            (tp->t_state <= TCPS_CLOSING)) &&
                            (tmr_up == PACE_TMR_KEEP) &&
                    (tp->snd_max == tp->snd_una)) {
                        /* We should have keep alive up and we do */
                        return;
                }
        }
        if (rsm && (rsm->r_flags & BBR_SACK_PASSED)) {
                if ((tp->t_flags & TF_SENTFIN) &&
                    ((tp->snd_max - tp->snd_una) == 1) &&
                    (rsm->r_flags & BBR_HAS_FIN)) {
                        /* needs to be a RXT */
                        if (tmr_up == PACE_TMR_RXT)
                                return;
                        else
                                goto wrong_timer;
                } else if (tmr_up == PACE_TMR_RACK)
                        return;
                else
                        goto wrong_timer;
        } else if (rsm && (tmr_up == PACE_TMR_RACK)) {
                /* Rack timer has priority if we have data out */
                return;
        } else if (SEQ_GT(tp->snd_max, tp->snd_una) &&
                    ((tmr_up == PACE_TMR_TLP) ||
            (tmr_up == PACE_TMR_RXT))) {
                /*
                 * Either a TLP or RXT is fine if no sack-passed is in place
                 * and data is outstanding.
                 */
                return;
        } else if (tmr_up == PACE_TMR_DELACK) {
                /*
                 * If the delayed ack was going to go off before the
                 * rtx/tlp/rack timer were going to expire, then that would
                 * be the timer in control. Note we don't check the time
                 * here, trusting the code is correct.
                 */
                return;
        }
        if (SEQ_GT(tp->snd_max, tp->snd_una) &&
            ((tmr_up == PACE_TMR_RXT) ||
             (tmr_up == PACE_TMR_TLP) ||
             (tmr_up == PACE_TMR_RACK))) {
                /*
                 * We have outstanding data and
                 * we *do* have a RACK, TLP or RXT
                 * timer running. We won't restart
                 * anything here since that's probably ok; we
                 * will get called with some timer here shortly.
                 */
                return;
        }
        /*
         * Ok the timer originally started is not what we want now. We will
         * force the hpts to be stopped if any, and restart with the slot
         * set to what was in the saved slot.
         */
wrong_timer:
        if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) {
                if (inp->inp_in_hpts)
                        tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
                bbr_timer_cancel(bbr, __LINE__, cts);
                bbr_start_hpts_timer(bbr, tp, cts, 1, bbr->r_ctl.rc_last_delay_val,
                    0);
        } else {
                /*
                 * Output is hptsi so we just need to switch the type of
                 * timer. We don't bother with keep-alive, since when we
                 * jump through the output, it will start the keep-alive if
                 * nothing is sent.
                 *
                 * We only need a delayed-ack added and or the hpts_timeout.
                 */
                hpts_timeout = bbr_timer_start(tp, bbr, cts);
                if (tp->t_flags & TF_DELACK) {
                        if (hpts_timeout == 0) {
                                hpts_timeout = bbr_delack_time;
                                bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
                        }
                        else if (hpts_timeout > bbr_delack_time) {
                                hpts_timeout = bbr_delack_time;
                                bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
                        }
                }
                if (hpts_timeout) {
                        if (hpts_timeout > 0x7ffffffe)
                                hpts_timeout = 0x7ffffffe;
                        bbr->r_ctl.rc_timer_exp = cts + hpts_timeout;
                }
        }
}

int32_t bbr_clear_lost = 0;

/*
 * Considers the two time values now (cts) and earlier.
 * If cts is smaller than earlier, we could have
 * had a sequence wrap (our counter wraps every
 * 70 min or so) or it could be just clock skew
 * getting us two different time values. Clock skew
 * will show up within 10ms or so. So in such
 * a case (where cts is behind earlier time by
 * less than 10ms) we return 0. Otherwise we
 * return the true difference between them.
 */
static inline uint32_t
bbr_calc_time(uint32_t cts, uint32_t earlier_time) {
        /*
         * Given two timestamps, the current time stamp cts, and some other
         * time-stamp taken in theory earlier, return the difference. The
         * trick here is sometimes locking will get the other timestamp
         * after the cts. If this occurs we need to return 0.
         */
        if (TSTMP_GEQ(cts, earlier_time))
                return (cts - earlier_time);
        /*
         * cts is behind earlier_time; if it's less than 10ms consider it 0.
         * If it's more than 10ms difference then we had a time wrap. Else
         * it's just the normal locking foo. I wonder if we should not go to
         * 64bit TS and get rid of this issue.
         */
        if (TSTMP_GEQ((cts + 10000), earlier_time))
                return (0);
        /*
         * Ok the time must have wrapped. So we need to answer a large
         * amount of time, which the normal subtraction should do.
         */
        return (cts - earlier_time);
}
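
/*
 * Illustrative results (usecond timestamps, uint32_t arithmetic):
 *      bbr_calc_time(1000500, 1000000) -> 500  normal forward difference
 *      bbr_calc_time(1000000, 1005000) -> 0    earlier taken "late" by
 *                                              < 10ms, treated as skew
 *      bbr_calc_time(5, 0xfffffff0)    -> 21   the counter wrapped; the
 *                                              unsigned subtraction still
 *                                              yields the true elapsed time
 */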

static int
sysctl_bbr_clear_lost(SYSCTL_HANDLER_ARGS)
{
        uint32_t stat;
        int32_t error;

        error = SYSCTL_OUT(req, &bbr_clear_lost, sizeof(uint32_t));
        if (error || req->newptr == NULL)
                return error;

        error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
        if (error)
                return (error);
        if (stat == 1) {
#ifdef BBR_INVARIANTS
                printf("Clearing BBR lost counters\n");
#endif
                COUNTER_ARRAY_ZERO(bbr_state_lost, BBR_MAX_STAT);
                COUNTER_ARRAY_ZERO(bbr_state_time, BBR_MAX_STAT);
                COUNTER_ARRAY_ZERO(bbr_state_resend, BBR_MAX_STAT);
        } else if (stat == 2) {
#ifdef BBR_INVARIANTS
                printf("Clearing BBR option counters\n");
#endif
                COUNTER_ARRAY_ZERO(bbr_opts_arry, BBR_OPTS_SIZE);
        } else if (stat == 3) {
#ifdef BBR_INVARIANTS
                printf("Clearing BBR stats counters\n");
#endif
                COUNTER_ARRAY_ZERO(bbr_stat_arry, BBR_STAT_SIZE);
        } else if (stat == 4) {
#ifdef BBR_INVARIANTS
                printf("Clearing BBR out-size counters\n");
#endif
                COUNTER_ARRAY_ZERO(bbr_out_size, TCP_MSS_ACCT_SIZE);
        }
        bbr_clear_lost = 0;
        return (0);
}
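
/*
 * Usage note: writing to the sysctl backed by this handler zeroes one
 * counter group: 1 clears the per-state time/lost/resend arrays, 2 the
 * option counters, 3 the stat counters and 4 the out-size counters.
 * Any other value clears nothing; bbr_clear_lost itself always resets
 * to 0 afterwards.
 */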
1176
1177 static void
1178 bbr_init_sysctls(void)
1179 {
1180         struct sysctl_oid *bbr_probertt;
1181         struct sysctl_oid *bbr_hptsi;
1182         struct sysctl_oid *bbr_measure;
1183         struct sysctl_oid *bbr_cwnd;
1184         struct sysctl_oid *bbr_timeout;
1185         struct sysctl_oid *bbr_states;
1186         struct sysctl_oid *bbr_startup;
1187         struct sysctl_oid *bbr_policer;
1188
1189         /* Probe rtt controls */
1190         bbr_probertt = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
1191             SYSCTL_CHILDREN(bbr_sysctl_root),
1192             OID_AUTO,
1193             "probertt",
1194             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1195             "");
1196         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1197             SYSCTL_CHILDREN(bbr_probertt),
1198             OID_AUTO, "gain", CTLFLAG_RW,
1199             &bbr_rttprobe_gain, 192,
1200             "What is the filter gain drop in probe_rtt (0=disable)?");
1201         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1202             SYSCTL_CHILDREN(bbr_probertt),
1203             OID_AUTO, "cwnd", CTLFLAG_RW,
1204             &bbr_rtt_probe_cwndtarg, 4,
1205             "How many mss's are outstanding during probe-rtt");
1206         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1207             SYSCTL_CHILDREN(bbr_probertt),
1208             OID_AUTO, "int", CTLFLAG_RW,
1209             &bbr_rtt_probe_limit, 4000000,
1210             "If RTT has not shrank in this many micro-seconds enter probe-rtt");
1211         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1212             SYSCTL_CHILDREN(bbr_probertt),
1213             OID_AUTO, "mintime", CTLFLAG_RW,
1214             &bbr_rtt_probe_time, 200000,
1215             "How many microseconds in probe-rtt");
1216         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1217             SYSCTL_CHILDREN(bbr_probertt),
1218             OID_AUTO, "filter_len_sec", CTLFLAG_RW,
1219             &bbr_filter_len_sec, 6,
1220             "How long in seconds does the rttProp filter run?");
1221         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1222             SYSCTL_CHILDREN(bbr_probertt),
1223             OID_AUTO, "drain_rtt", CTLFLAG_RW,
1224             &bbr_drain_rtt, BBR_SRTT,
1225             "What is the drain rtt to use in probeRTT (rtt_prop=0, rtt_rack=1, rtt_pkt=2, rtt_srtt=3?");
1226         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1227             SYSCTL_CHILDREN(bbr_probertt),
1228             OID_AUTO, "can_force", CTLFLAG_RW,
1229             &bbr_can_force_probertt, 0,
1230             "If we keep setting new low rtt's but delay going in probe-rtt can we force in??");
1231         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1232             SYSCTL_CHILDREN(bbr_probertt),
1233             OID_AUTO, "enter_sets_force", CTLFLAG_RW,
1234             &bbr_probertt_sets_rtt, 0,
1235             "In NF mode, do we imitate google_mode and set the rttProp on entry to probe-rtt?");
1236         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1237             SYSCTL_CHILDREN(bbr_probertt),
1238             OID_AUTO, "can_adjust", CTLFLAG_RW,
1239             &bbr_can_adjust_probertt, 1,
1240             "Can we dynamically adjust the probe-rtt limits and times?");
1241         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1242             SYSCTL_CHILDREN(bbr_probertt),
1243             OID_AUTO, "is_ratio", CTLFLAG_RW,
1244             &bbr_is_ratio, 0,
1245             "is the limit to filter a ratio?");
1246         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1247             SYSCTL_CHILDREN(bbr_probertt),
1248             OID_AUTO, "use_cwnd", CTLFLAG_RW,
1249             &bbr_prtt_slam_cwnd, 0,
1250             "Should we set/recover cwnd?");
1251         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1252             SYSCTL_CHILDREN(bbr_probertt),
1253             OID_AUTO, "can_use_ts", CTLFLAG_RW,
1254             &bbr_can_use_ts_for_rtt, 1,
1255             "Can we use the ms timestamp if available for retransmistted rtt calculations?");
1256
1257         /* Pacing controls */
1258         bbr_hptsi = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
1259             SYSCTL_CHILDREN(bbr_sysctl_root),
1260             OID_AUTO,
1261             "pacing",
1262             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1263             "");
1264         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1265             SYSCTL_CHILDREN(bbr_hptsi),
1266             OID_AUTO, "hw_pacing", CTLFLAG_RW,
1267             &bbr_allow_hdwr_pacing, 1,
1268             "Do we allow hardware pacing?");
1269         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1270             SYSCTL_CHILDREN(bbr_hptsi),
1271             OID_AUTO, "hw_pacing_limit", CTLFLAG_RW,
1272             &bbr_hardware_pacing_limit, 4000,
1273             "Do we have a limited number of connections for Chelsio pacing (0=no limit)?");
1274         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1275             SYSCTL_CHILDREN(bbr_hptsi),
1276             OID_AUTO, "hw_pacing_adj", CTLFLAG_RW,
1277             &bbr_hdwr_pace_adjust, 2,
1278             "Multiplier to the calculated TSO size?");
1279         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1280             SYSCTL_CHILDREN(bbr_hptsi),
1281             OID_AUTO, "hw_pacing_floor", CTLFLAG_RW,
1282             &bbr_hdwr_pace_floor, 1,
1283             "Do we invoke the hardware pacing floor?");
1284         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1285             SYSCTL_CHILDREN(bbr_hptsi),
1286             OID_AUTO, "hw_pacing_delay_cnt", CTLFLAG_RW,
1287             &bbr_hdwr_pacing_delay_cnt, 10,
1288             "How many packets must be sent after hdwr pacing is enabled");
1289         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1290             SYSCTL_CHILDREN(bbr_hptsi),
1291             OID_AUTO, "bw_cross", CTLFLAG_RW,
1292             &bbr_cross_over, 3000000,
1293             "What is the point where we cross over to a Linux-like TSO size?");
1294         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1295             SYSCTL_CHILDREN(bbr_hptsi),
1296             OID_AUTO, "seg_deltarg", CTLFLAG_RW,
1297             &bbr_hptsi_segments_delay_tar, 7000,
1298             "What is the worst-case delay target for hptsi on < 48Mbps connections?");
1299         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1300             SYSCTL_CHILDREN(bbr_hptsi),
1301             OID_AUTO, "enet_oh", CTLFLAG_RW,
1302             &bbr_include_enet_oh, 0,
1303             "Do we include the ethernet overhead in calculating pacing delay?");
1304         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1305             SYSCTL_CHILDREN(bbr_hptsi),
1306             OID_AUTO, "ip_oh", CTLFLAG_RW,
1307             &bbr_include_ip_oh, 1,
1308             "Do we include the IP overhead in calculating pacing delay?");
1309         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1310             SYSCTL_CHILDREN(bbr_hptsi),
1311             OID_AUTO, "tcp_oh", CTLFLAG_RW,
1312             &bbr_include_tcp_oh, 0,
1313             "Do we include the TCP overhead in calculating pacing delay?");
1314         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1315             SYSCTL_CHILDREN(bbr_hptsi),
1316             OID_AUTO, "google_discount", CTLFLAG_RW,
1317             &bbr_google_discount, 10,
1318             "What is the default google discount percentage wise for pacing (11 = 1.1%%)?");
1319         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1320             SYSCTL_CHILDREN(bbr_hptsi),
1321             OID_AUTO, "all_get_min", CTLFLAG_RW,
1322             &bbr_all_get_min, 0,
1323             "If you are less than an MSS, do you just get the min?");
1324         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1325             SYSCTL_CHILDREN(bbr_hptsi),
1326             OID_AUTO, "tso_min", CTLFLAG_RW,
1327             &bbr_hptsi_bytes_min, 1460,
1328             "For 0 -> 24Mbps, what is the floor number of segments for TSO?");
1329         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1330             SYSCTL_CHILDREN(bbr_hptsi),
1331             OID_AUTO, "seg_tso_max", CTLFLAG_RW,
1332             &bbr_hptsi_segments_max, 6,
1333             "For 0 -> 24Mbps, what is the top number of segments for TSO?");
1334         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1335             SYSCTL_CHILDREN(bbr_hptsi),
1336             OID_AUTO, "seg_floor", CTLFLAG_RW,
1337             &bbr_hptsi_segments_floor, 1,
1338             "Minimum TSO size we will fall to, in segments");
1339         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1340             SYSCTL_CHILDREN(bbr_hptsi),
1341             OID_AUTO, "utter_max", CTLFLAG_RW,
1342             &bbr_hptsi_utter_max, 0,
1343             "The absolute maximum that any pacing (outside of hardware) can be");
1344         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1345             SYSCTL_CHILDREN(bbr_hptsi),
1346             OID_AUTO, "seg_divisor", CTLFLAG_RW,
1347             &bbr_hptsi_per_second, 100,
1348             "What is the divisor in our hptsi TSO calculation for rates between 24Mbps and 512Mbps?");
1349         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1350             SYSCTL_CHILDREN(bbr_hptsi),
1351             OID_AUTO, "srtt_mul", CTLFLAG_RW,
1352             &bbr_hptsi_max_mul, 1,
1353             "The multiplier for pace len max");
1354         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1355             SYSCTL_CHILDREN(bbr_hptsi),
1356             OID_AUTO, "srtt_div", CTLFLAG_RW,
1357             &bbr_hptsi_max_div, 2,
1358             "The divisor for pace len max");
1359         /* Measurement controls */
1360         bbr_measure = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
1361             SYSCTL_CHILDREN(bbr_sysctl_root),
1362             OID_AUTO,
1363             "measure",
1364             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1365             "Measurement controls");
1366         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1367             SYSCTL_CHILDREN(bbr_measure),
1368             OID_AUTO, "min_i_bw", CTLFLAG_RW,
1369             &bbr_initial_bw_bps, 62500,
1370             "Minimum initial b/w in bytes per second");
1371         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1372             SYSCTL_CHILDREN(bbr_measure),
1373             OID_AUTO, "no_sack_needed", CTLFLAG_RW,
1374             &bbr_sack_not_required, 0,
1375             "Do we allow bbr to run on connections not supporting SACK?");
1376         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1377             SYSCTL_CHILDREN(bbr_measure),
1378             OID_AUTO, "use_google", CTLFLAG_RW,
1379             &bbr_use_google_algo, 0,
1380             "Run as close to google V1.0 as possible?");
1381         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1382             SYSCTL_CHILDREN(bbr_measure),
1383             OID_AUTO, "ts_limiting", CTLFLAG_RW,
1384             &bbr_ts_limiting, 1,
1385             "Do we attempt to use the peer's timestamp to limit b/w calculations?");
1386         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1387             SYSCTL_CHILDREN(bbr_measure),
1388             OID_AUTO, "ts_can_raise", CTLFLAG_RW,
1389             &bbr_ts_can_raise, 0,
1390             "Can we raise the b/w via timestamp b/w calculation?");
1391         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1392             SYSCTL_CHILDREN(bbr_measure),
1393             OID_AUTO, "ts_delta", CTLFLAG_RW,
1394             &bbr_min_usec_delta, 20000,
1395             "How long in usec between ts of our sends in ts validation code?");
1396         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1397             SYSCTL_CHILDREN(bbr_measure),
1398             OID_AUTO, "ts_peer_delta", CTLFLAG_RW,
1399             &bbr_min_peer_delta, 20,
1400             "What is the minimum numerical difference required between peer timestamp deltas?");
1401         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1402             SYSCTL_CHILDREN(bbr_measure),
1403             OID_AUTO, "ts_delta_percent", CTLFLAG_RW,
1404             &bbr_delta_percent, 150,
1405             "What percentage (150 = 15.0) do we allow variance for?");
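        /*
         * The three ts_* knobs above work together, as their descriptions
         * suggest: ts_delta spaces our own sends, ts_peer_delta bounds the
         * peer's timestamp deltas, and ts_delta_percent (in tenths of a
         * percent) caps the variance allowed between the two before a
         * timestamp-based b/w sample is trusted (see ts_limiting above).
         */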
1406         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1407             SYSCTL_CHILDREN(bbr_measure),
1408             OID_AUTO, "min_measure_good_bw", CTLFLAG_RW,
1409             &bbr_min_measurements_req, 1,
1410             "What is the minimum measurement count we need before we switch to our b/w estimate?");
1411         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1412             SYSCTL_CHILDREN(bbr_measure),
1413             OID_AUTO, "min_measure_before_pace", CTLFLAG_RW,
1414             &bbr_no_pacing_until, 4,
1415             "How many pkt-epochs (0 is off) do we need before pacing is on?");
1416         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1417             SYSCTL_CHILDREN(bbr_measure),
1418             OID_AUTO, "quanta", CTLFLAG_RW,
1419             &bbr_quanta, 2,
1420             "Extra quanta to add when calculating the target (ID section 4.2.3.2).");
1421         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1422             SYSCTL_CHILDREN(bbr_measure),
1423             OID_AUTO, "noretran", CTLFLAG_RW,
1424             &bbr_no_retran, 0,
1425             "Should google mode not use retransmission measurements for the b/w estimation?");
1426         /* State controls */
1427         bbr_states = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
1428             SYSCTL_CHILDREN(bbr_sysctl_root),
1429             OID_AUTO,
1430             "states",
1431             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1432             "State controls");
1433         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1434             SYSCTL_CHILDREN(bbr_states),
1435             OID_AUTO, "idle_restart", CTLFLAG_RW,
1436             &bbr_uses_idle_restart, 0,
1437             "Do we use a new special idle_restart state to ramp back up quickly?");
1438         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1439             SYSCTL_CHILDREN(bbr_states),
1440             OID_AUTO, "idle_restart_threshold", CTLFLAG_RW,
1441             &bbr_idle_restart_threshold, 100000,
1442             "How long must we be idle before we restart?");
1443         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1444             SYSCTL_CHILDREN(bbr_states),
1445             OID_AUTO, "use_pkt_epoch", CTLFLAG_RW,
1446             &bbr_state_is_pkt_epoch, 0,
1447             "Do we use a pkt-epoch for substate if rttProp is 0?");
1448         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1449             SYSCTL_CHILDREN(bbr_states),
1450             OID_AUTO, "startup_rtt_gain", CTLFLAG_RW,
1451             &bbr_rtt_gain_thresh, 0,
1452             "What increase in RTT triggers us to stop ignoring no-loss and possibly exit startup?");
1453         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1454             SYSCTL_CHILDREN(bbr_states),
1455             OID_AUTO, "drain_floor", CTLFLAG_RW,
1456             &bbr_drain_floor, 88,
1457             "What is the lowest we can drain (pg) to?");
1458         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1459             SYSCTL_CHILDREN(bbr_states),
1460             OID_AUTO, "drain_2_target", CTLFLAG_RW,
1461             &bbr_state_drain_2_tar, 1,
1462             "Do we drain to target in drain substate?");
1463         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1464             SYSCTL_CHILDREN(bbr_states),
1465             OID_AUTO, "gain_2_target", CTLFLAG_RW,
1466             &bbr_gain_to_target, 1,
1467             "Does probe-bw gain to target?");
1468         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1469             SYSCTL_CHILDREN(bbr_states),
1470             OID_AUTO, "gain_extra_time", CTLFLAG_RW,
1471             &bbr_gain_gets_extra_too, 1,
1472             "Does probe bw gain get the extra time too?");
1473         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1474             SYSCTL_CHILDREN(bbr_states),
1475             OID_AUTO, "ld_div", CTLFLAG_RW,
1476             &bbr_drain_drop_div, 5,
1477             "Long drain drop divisor?");
1478         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1479             SYSCTL_CHILDREN(bbr_states),
1480             OID_AUTO, "ld_mul", CTLFLAG_RW,
1481             &bbr_drain_drop_mul, 4,
1482             "Long drain drop multiplier?");
1483         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1484             SYSCTL_CHILDREN(bbr_states),
1485             OID_AUTO, "rand_ot_disc", CTLFLAG_RW,
1486             &bbr_rand_ot, 50,
1487             "Random discount of the ot?");
1488         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1489             SYSCTL_CHILDREN(bbr_states),
1490             OID_AUTO, "dr_filter_life", CTLFLAG_RW,
1491             &bbr_num_pktepo_for_del_limit, BBR_NUM_RTTS_FOR_DEL_LIMIT,
1492             "How many packet-epochs does the b/w delivery rate last?");
1493         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1494             SYSCTL_CHILDREN(bbr_states),
1495             OID_AUTO, "subdrain_applimited", CTLFLAG_RW,
1496             &bbr_sub_drain_app_limit, 0,
1497             "Does our sub-state drain invoke app limited if it's long?");
1498         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1499             SYSCTL_CHILDREN(bbr_states),
1500             OID_AUTO, "use_cwnd_subdrain", CTLFLAG_RW,
1501             &bbr_sub_drain_slam_cwnd, 0,
1502             "Should we set/recover cwnd for sub-state drain?");
1503         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1504             SYSCTL_CHILDREN(bbr_states),
1505             OID_AUTO, "use_cwnd_maindrain", CTLFLAG_RW,
1506             &bbr_slam_cwnd_in_main_drain, 0,
1507             "Should we set/recover cwnd for main-state drain?");
1508         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1509             SYSCTL_CHILDREN(bbr_states),
1510             OID_AUTO, "google_gets_earlyout", CTLFLAG_RW,
1511             &google_allow_early_out, 1,
1512             "Should we allow google probe-bw/drain to exit early at flight target?");
1513         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1514             SYSCTL_CHILDREN(bbr_states),
1515             OID_AUTO, "google_exit_loss", CTLFLAG_RW,
1516             &google_consider_lost, 1,
1517             "Should losses exit the gain phase of probe-bw in google mode?");
1518         /* Startup controls */
1519         bbr_startup = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
1520             SYSCTL_CHILDREN(bbr_sysctl_root),
1521             OID_AUTO,
1522             "startup",
1523             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1524             "Startup controls");
1525         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1526             SYSCTL_CHILDREN(bbr_startup),
1527             OID_AUTO, "cheat_iwnd", CTLFLAG_RW,
1528             &bbr_sends_full_iwnd, 1,
1529             "Do we not pace but burst out the initial window as our TSO size?");
1530         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1531             SYSCTL_CHILDREN(bbr_startup),
1532             OID_AUTO, "loss_threshold", CTLFLAG_RW,
1533             &bbr_startup_loss_thresh, 2000,
1534             "In startup, what is the loss threshold in a packet-epoch that will exit us from startup?");
1535         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1536             SYSCTL_CHILDREN(bbr_startup),
1537             OID_AUTO, "use_lowerpg", CTLFLAG_RW,
1538             &bbr_use_lower_gain_in_startup, 1,
1539             "Should we use a lower hptsi gain if we see loss in startup?");
1540         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1541             SYSCTL_CHILDREN(bbr_startup),
1542             OID_AUTO, "gain", CTLFLAG_RW,
1543             &bbr_start_exit, 25,
1544             "What gain percent do we need to see to stay in startup?");
1545         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1546             SYSCTL_CHILDREN(bbr_startup),
1547             OID_AUTO, "low_gain", CTLFLAG_RW,
1548             &bbr_low_start_exit, 15,
1549             "What gain percent do we need to see to stay in the lower gain startup?");
1550         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1551             SYSCTL_CHILDREN(bbr_startup),
1552             OID_AUTO, "loss_exit", CTLFLAG_RW,
1553             &bbr_exit_startup_at_loss, 1,
1554             "Should we exit startup at loss in an epoch if we are not gaining?");
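        /*
         * Taken together, the startup knobs above encode the classic BBR
         * startup-exit test: with "gain" = 25 the measured b/w must keep
         * growing by at least 25% (1.25x) to remain in startup, 15% once
         * the lower pacing gain is in use, and "loss_exit" lets an epoch
         * with loss and no such gain end startup early.
         */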
1555         /* CWND controls */
1556         bbr_cwnd = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
1557             SYSCTL_CHILDREN(bbr_sysctl_root),
1558             OID_AUTO,
1559             "cwnd",
1560             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1561             "Cwnd controls");
1562         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1563             SYSCTL_CHILDREN(bbr_cwnd),
1564             OID_AUTO, "tar_rtt", CTLFLAG_RW,
1565             &bbr_cwndtarget_rtt_touse, 0,
1566             "Target cwnd rtt measurement to use (0=rtt_prop, 1=rtt_rack, 2=pkt_rtt, 3=srtt)?");
1567         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1568             SYSCTL_CHILDREN(bbr_cwnd),
1569             OID_AUTO, "may_shrink", CTLFLAG_RW,
1570             &bbr_cwnd_may_shrink, 0,
1571             "Can the cwnd shrink if it would grow to more than the target?");
1572         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1573             SYSCTL_CHILDREN(bbr_cwnd),
1574             OID_AUTO, "max_target_limit", CTLFLAG_RW,
1575             &bbr_target_cwnd_mult_limit, 8,
1576             "Do we limit the cwnd to some multiple of the cwnd target if cwnd can't shrink (0=no)?");
1577         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1578             SYSCTL_CHILDREN(bbr_cwnd),
1579             OID_AUTO, "highspeed_min", CTLFLAG_RW,
1580             &bbr_cwnd_min_val_hs, BBR_HIGHSPEED_NUM_MSS,
1581             "What is the high-speed min cwnd (rttProp under 1ms)");
1582         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1583             SYSCTL_CHILDREN(bbr_cwnd),
1584             OID_AUTO, "lowspeed_min", CTLFLAG_RW,
1585             &bbr_cwnd_min_val, BBR_PROBERTT_NUM_MSS,
1586             "What is the min cwnd (rttProp > 1ms)");
1587         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1588             SYSCTL_CHILDREN(bbr_cwnd),
1589             OID_AUTO, "initwin", CTLFLAG_RW,
1590             &bbr_def_init_win, 10,
1591             "What is the BBR initial window, if 0 use tcp version");
1592         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1593             SYSCTL_CHILDREN(bbr_cwnd),
1594             OID_AUTO, "do_loss_red", CTLFLAG_RW,
1595             &bbr_do_red, 600,
1596             "Do we reduce the b/w at exit from recovery based on ratio of prop/srtt (800=80.0, 0=off)?");
1597         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1598             SYSCTL_CHILDREN(bbr_cwnd),
1599             OID_AUTO, "red_scale", CTLFLAG_RW,
1600             &bbr_red_scale, 20000,
1601             "What RTT do we scale with?");
1602         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1603             SYSCTL_CHILDREN(bbr_cwnd),
1604             OID_AUTO, "red_growslow", CTLFLAG_RW,
1605             &bbr_red_growth_restrict, 1,
1606             "Do we restrict cwnd growth to what's in flight?");
1607         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1608             SYSCTL_CHILDREN(bbr_cwnd),
1609             OID_AUTO, "red_div", CTLFLAG_RW,
1610             &bbr_red_div, 2,
1611             "If we reduce, what's the divisor?");
1612         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1613             SYSCTL_CHILDREN(bbr_cwnd),
1614             OID_AUTO, "red_mul", CTLFLAG_RW,
1615             &bbr_red_mul, 1,
1616             "If we reduce, what's the multiplier?");
1617         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1618             SYSCTL_CHILDREN(bbr_cwnd),
1619             OID_AUTO, "target_is_unit", CTLFLAG_RW,
1620             &bbr_target_is_bbunit, 0,
1621             "Is the state target the pacing_gain or BBR_UNIT?");
1622         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1623             SYSCTL_CHILDREN(bbr_cwnd),
1624             OID_AUTO, "drop_limit", CTLFLAG_RW,
1625             &bbr_drop_limit, 0,
1626             "Number of segments limit for drop (0=use min_cwnd w/flight)?");
1627
1628         /* Timeout controls */
1629         bbr_timeout = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
1630             SYSCTL_CHILDREN(bbr_sysctl_root),
1631             OID_AUTO,
1632             "timeout",
1633             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1634             "Time out controls");
1635         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1636             SYSCTL_CHILDREN(bbr_timeout),
1637             OID_AUTO, "delack", CTLFLAG_RW,
1638             &bbr_delack_time, 100000,
1639             "BBR's delayed ack time");
1640         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1641             SYSCTL_CHILDREN(bbr_timeout),
1642             OID_AUTO, "tlp_uses", CTLFLAG_RW,
1643             &bbr_tlp_type_to_use, 3,
1644             "RTT that TLP uses in its calculations, 0=rttProp, 1=Rack_rtt, 2=pkt_rtt and 3=srtt");
1645         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1646             SYSCTL_CHILDREN(bbr_timeout),
1647             OID_AUTO, "persmin", CTLFLAG_RW,
1648             &bbr_persist_min, 250000,
1649             "What is the minimum time in microseconds between persists");
1650         SYSCTL_ADD_U32(&bbr_sysctl_ctx,
1651             SYSCTL_CHILDREN(bbr_timeout),
1652             OID_AUTO, "persmax", CTLFLAG_RW,
1653             &bbr_persist_max, 1000000,
1654             "What is the largest delay in microseconds between persists");
1655         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1656             SYSCTL_CHILDREN(bbr_timeout),
1657             OID_AUTO, "tlp_minto", CTLFLAG_RW,
1658             &bbr_tlp_min, 10000,
1659             "TLP Min timeout in usecs");
1660         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1661             SYSCTL_CHILDREN(bbr_timeout),
1662             OID_AUTO, "tlp_dack_time", CTLFLAG_RW,
1663             &bbr_delayed_ack_time, 200000,
1664             "TLP delayed ack compensation value");
1665         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1666             SYSCTL_CHILDREN(bbr_timeout),
1667             OID_AUTO, "minrto", CTLFLAG_RW,
1668             &bbr_rto_min_ms, 30,
1669             "Minimum RTO in ms");
1670         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1671             SYSCTL_CHILDREN(bbr_timeout),
1672             OID_AUTO, "maxrto", CTLFLAG_RW,
1673             &bbr_rto_max_sec, 4,
1674             "Maximum RTO in seconds -- should be at least as large as min_rto");
1675         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1676             SYSCTL_CHILDREN(bbr_timeout),
1677             OID_AUTO, "tlp_retry", CTLFLAG_RW,
1678             &bbr_tlp_max_resend, 2,
1679             "How many times does TLP retry a single segment or multiple with no ACK");
1680         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1681             SYSCTL_CHILDREN(bbr_timeout),
1682             OID_AUTO, "minto", CTLFLAG_RW,
1683             &bbr_min_to, 1000,
1684             "Minimum rack timeout in useconds");
1685         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1686             SYSCTL_CHILDREN(bbr_timeout),
1687             OID_AUTO, "pktdelay", CTLFLAG_RW,
1688             &bbr_pkt_delay, 1000,
1689             "Extra RACK time (in useconds) besides reordering thresh");
1690         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1691             SYSCTL_CHILDREN(bbr_timeout),
1692             OID_AUTO, "incr_tmrs", CTLFLAG_RW,
1693             &bbr_incr_timers, 1,
1694             "Increase the RXT/TLP timer by the pacing time used?");
1695         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1696             SYSCTL_CHILDREN(bbr_timeout),
1697             OID_AUTO, "rxtmark_sackpassed", CTLFLAG_RW,
1698             &bbr_marks_rxt_sack_passed, 0,
1699             "Mark sack passed on all those not ack'd when a RXT hits?");
1700         /* Policer controls */
1701         bbr_policer = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
1702             SYSCTL_CHILDREN(bbr_sysctl_root),
1703             OID_AUTO,
1704             "policer",
1705             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1706             "Policer controls");
1707         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1708             SYSCTL_CHILDREN(bbr_policer),
1709             OID_AUTO, "detect_enable", CTLFLAG_RW,
1710             &bbr_policer_detection_enabled, 1,
1711             "Is policer detection enabled?");
1712         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1713             SYSCTL_CHILDREN(bbr_policer),
1714             OID_AUTO, "min_pes", CTLFLAG_RW,
1715             &bbr_lt_intvl_min_rtts, 4,
1716             "Minimum number of PEs (packet-epochs)?");
1717         SYSCTL_ADD_U64(&bbr_sysctl_ctx,
1718             SYSCTL_CHILDREN(bbr_policer),
1719             OID_AUTO, "bwdiff", CTLFLAG_RW,
1720             &bbr_lt_bw_diff, (4000/8),
1721             "Minimal b/w diff?");
1722         SYSCTL_ADD_U64(&bbr_sysctl_ctx,
1723             SYSCTL_CHILDREN(bbr_policer),
1724             OID_AUTO, "bwratio", CTLFLAG_RW,
1725             &bbr_lt_bw_ratio, 8,
1726             "Minimal b/w ratio?");
1727         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1728             SYSCTL_CHILDREN(bbr_policer),
1729             OID_AUTO, "from_rack_rxt", CTLFLAG_RW,
1730             &bbr_policer_call_from_rack_to, 0,
1731             "Do we call the policer detection code from a rack-timeout?");
1732         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1733             SYSCTL_CHILDREN(bbr_policer),
1734             OID_AUTO, "false_postive", CTLFLAG_RW,
1735             &bbr_lt_intvl_fp, 0,
1736             "What packet epoch do we do false-positive detection at (0=no)?");
1737         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1738             SYSCTL_CHILDREN(bbr_policer),
1739             OID_AUTO, "loss_thresh", CTLFLAG_RW,
1740             &bbr_lt_loss_thresh, 196,
1741             "Loss threshold 196 = 19.6%?");
1742         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1743             SYSCTL_CHILDREN(bbr_policer),
1744             OID_AUTO, "false_postive_thresh", CTLFLAG_RW,
1745             &bbr_lt_fd_thresh, 100,
1746             "What percentage is the false detection threshold (150=15.0)?");
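        /*
         * Units note for the two thresholds above: both are expressed in
         * tenths of a percent, as the descriptions spell out (196 = 19.6%,
         * 150 = 15.0%).
         */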
1747         /* All the rest */
1748         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1749             SYSCTL_CHILDREN(bbr_sysctl_root),
1750             OID_AUTO, "cheat_rxt", CTLFLAG_RW,
1751             &bbr_use_rack_resend_cheat, 0,
1752             "Do we burst 1ms between sends on retransmissions (like rack)?");
1753         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1754             SYSCTL_CHILDREN(bbr_sysctl_root),
1755             OID_AUTO, "error_paceout", CTLFLAG_RW,
1756             &bbr_error_base_paceout, 10000,
1757             "When we hit an error, what is the min to pace out in usecs?");
1758         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1759             SYSCTL_CHILDREN(bbr_sysctl_root),
1760             OID_AUTO, "kill_paceout", CTLFLAG_RW,
1761             &bbr_max_net_error_cnt, 10,
1762             "When we hit this many errors in a row, kill the session?");
1763         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1764             SYSCTL_CHILDREN(bbr_sysctl_root),
1765             OID_AUTO, "data_after_close", CTLFLAG_RW,
1766             &bbr_ignore_data_after_close, 1,
1767             "Do we hold off sending a RST until all pending data is ack'd");
1768         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1769             SYSCTL_CHILDREN(bbr_sysctl_root),
1770             OID_AUTO, "resend_use_tso", CTLFLAG_RW,
1771             &bbr_resends_use_tso, 0,
1772             "Can resends use TSO?");
1773         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1774             SYSCTL_CHILDREN(bbr_sysctl_root),
1775             OID_AUTO, "sblklimit", CTLFLAG_RW,
1776             &bbr_sack_block_limit, 128,
1777             "When do we start ignoring small sack blocks");
1778         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1779             SYSCTL_CHILDREN(bbr_sysctl_root),
1780             OID_AUTO, "bb_verbose", CTLFLAG_RW,
1781             &bbr_verbose_logging, 0,
1782             "Should BBR black box logging be verbose");
1783         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1784             SYSCTL_CHILDREN(bbr_sysctl_root),
1785             OID_AUTO, "reorder_thresh", CTLFLAG_RW,
1786             &bbr_reorder_thresh, 2,
1787             "What factor for rack will be added when seeing reordering (shift right)");
1788         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1789             SYSCTL_CHILDREN(bbr_sysctl_root),
1790             OID_AUTO, "reorder_fade", CTLFLAG_RW,
1791             &bbr_reorder_fade, 0,
1792             "Does reorder detection fade, if so how many ms (0 means never)");
1793         SYSCTL_ADD_S32(&bbr_sysctl_ctx,
1794             SYSCTL_CHILDREN(bbr_sysctl_root),
1795             OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
1796             &bbr_tlp_thresh, 1,
1797             "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt, etc)?");
1798         /* Stats and counters */
1799         /* The pacing counters for hdwr/software can't be in the array */
1800         bbr_nohdwr_pacing_enobuf = counter_u64_alloc(M_WAITOK);
1801         bbr_hdwr_pacing_enobuf = counter_u64_alloc(M_WAITOK);
1802         SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
1803             SYSCTL_CHILDREN(bbr_sysctl_root),
1804             OID_AUTO, "enob_hdwr_pacing", CTLFLAG_RD,
1805             &bbr_hdwr_pacing_enobuf,
1806             "Total number of enobufs for hardware paced flows");
1807         SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
1808             SYSCTL_CHILDREN(bbr_sysctl_root),
1809             OID_AUTO, "enob_no_hdwr_pacing", CTLFLAG_RD,
1810             &bbr_nohdwr_pacing_enobuf,
1811             "Total number of enobufs for non-hardware paced flows");
1812
1813
1814         bbr_flows_whdwr_pacing = counter_u64_alloc(M_WAITOK);
1815         SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
1816             SYSCTL_CHILDREN(bbr_sysctl_root),
1817             OID_AUTO, "hdwr_pacing", CTLFLAG_RD,
1818             &bbr_flows_whdwr_pacing,
1819             "Total number of hardware paced flows");
1820         bbr_flows_nohdwr_pacing = counter_u64_alloc(M_WAITOK);
1821         SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
1822             SYSCTL_CHILDREN(bbr_sysctl_root),
1823             OID_AUTO, "software_pacing", CTLFLAG_RD,
1824             &bbr_flows_nohdwr_pacing,
1825             "Total number of software paced flows");
1826         COUNTER_ARRAY_ALLOC(bbr_stat_arry, BBR_STAT_SIZE, M_WAITOK);
1827         SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
1828             OID_AUTO, "stats", CTLFLAG_RD,
1829             bbr_stat_arry, BBR_STAT_SIZE, "BBR Stats");
1830         COUNTER_ARRAY_ALLOC(bbr_opts_arry, BBR_OPTS_SIZE, M_WAITOK);
1831         SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
1832             OID_AUTO, "opts", CTLFLAG_RD,
1833             bbr_opts_arry, BBR_OPTS_SIZE, "BBR Option Stats");
1834         COUNTER_ARRAY_ALLOC(bbr_state_lost, BBR_MAX_STAT, M_WAITOK);
1835         SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
1836             OID_AUTO, "lost", CTLFLAG_RD,
1837             bbr_state_lost, BBR_MAX_STAT, "Stats of when losses occur");
1838         COUNTER_ARRAY_ALLOC(bbr_state_resend, BBR_MAX_STAT, M_WAITOK);
1839         SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
1840             OID_AUTO, "stateresend", CTLFLAG_RD,
1841             bbr_state_resend, BBR_MAX_STAT, "Stats of what states resend");
1842         COUNTER_ARRAY_ALLOC(bbr_state_time, BBR_MAX_STAT, M_WAITOK);
1843         SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
1844             OID_AUTO, "statetime", CTLFLAG_RD,
1845             bbr_state_time, BBR_MAX_STAT, "Stats of time spent in the states");
1846         COUNTER_ARRAY_ALLOC(bbr_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
1847         SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
1848             OID_AUTO, "outsize", CTLFLAG_RD,
1849             bbr_out_size, TCP_MSS_ACCT_SIZE, "Size of output calls");
1850         SYSCTL_ADD_PROC(&bbr_sysctl_ctx,
1851             SYSCTL_CHILDREN(bbr_sysctl_root),
1852             OID_AUTO, "clrlost", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1853             &bbr_clear_lost, 0, sysctl_bbr_clear_lost, "IU", "Clear lost counters");
1854 }
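/*
 * A sketch of runtime tuning for the tree built above, assuming the
 * stack's root node is attached under net.inet.tcp.bbr (the usual
 * attachment point for this stack):
 *
 *      sysctl net.inet.tcp.bbr.probertt.int           # read a knob
 *      sysctl net.inet.tcp.bbr.timeout.minrto=30      # write a knob
 *      sysctl net.inet.tcp.bbr.clrlost=1              # run the clrlost handler
 */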
1855
1856 static inline int32_t
1857 bbr_progress_timeout_check(struct tcp_bbr *bbr)
1858 {
1859         if (bbr->rc_tp->t_maxunacktime && bbr->rc_tp->t_acktime &&
1860             TSTMP_GT(ticks, bbr->rc_tp->t_acktime)) {
1861                 if ((((uint32_t)ticks - bbr->rc_tp->t_acktime)) >= bbr->rc_tp->t_maxunacktime) {
1862                         /*
1863                          * There is an assumption here that the caller will
1864                          * drop the connection, so we increment the
1865                          * statistics.
1866                          */
1867                         bbr_log_progress_event(bbr, bbr->rc_tp, ticks, PROGRESS_DROP, __LINE__);
1868                         BBR_STAT_INC(bbr_progress_drops);
1869 #ifdef NETFLIX_STATS
1870                         KMOD_TCPSTAT_INC(tcps_progdrops);
1871 #endif
1872                         return (1);
1873                 }
1874         }
1875         return (0);
1876 }
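/*
 * A minimal sketch (not compiled) of how a timer handler would consume
 * the check above; the real callers sit with the timeout handlers later
 * in this file:
 */
#if 0
	if (bbr_progress_timeout_check(bbr)) {
		/* No ACK progress within t_maxunacktime: drop the session. */
		tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
		return (1);
	}
#endif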
1877
1878 static void
1879 bbr_counter_destroy(void)
1880 {
1881         COUNTER_ARRAY_FREE(bbr_stat_arry, BBR_STAT_SIZE);
1882         COUNTER_ARRAY_FREE(bbr_opts_arry, BBR_OPTS_SIZE);
1883         COUNTER_ARRAY_FREE(bbr_out_size, TCP_MSS_ACCT_SIZE);
1884         COUNTER_ARRAY_FREE(bbr_state_lost, BBR_MAX_STAT);
1885         COUNTER_ARRAY_FREE(bbr_state_time, BBR_MAX_STAT);
1886         COUNTER_ARRAY_FREE(bbr_state_resend, BBR_MAX_STAT);
1887         counter_u64_free(bbr_flows_whdwr_pacing);
1888         counter_u64_free(bbr_flows_nohdwr_pacing);
        /*
         * Also release the enobuf counters allocated in bbr_init_sysctls();
         * without these frees they leak on module unload.
         */
        counter_u64_free(bbr_nohdwr_pacing_enobuf);
        counter_u64_free(bbr_hdwr_pacing_enobuf);
1890 }
1891
1892 static __inline void
1893 bbr_fill_in_logging_data(struct tcp_bbr *bbr, struct tcp_log_bbr *l, uint32_t cts)
1894 {
1895         memset(l, 0, sizeof(union tcp_log_stackspecific));
1896         l->cur_del_rate = bbr->r_ctl.rc_bbr_cur_del_rate;
1897         l->delRate = get_filter_value(&bbr->r_ctl.rc_delrate);
1898         l->rttProp = get_filter_value_small(&bbr->r_ctl.rc_rttprop);
1899         l->bw_inuse = bbr_get_bw(bbr);
1900         l->inflight = ctf_flight_size(bbr->rc_tp,
1901                           (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
1902         l->applimited = bbr->r_ctl.r_app_limited_until;
1903         l->delivered = bbr->r_ctl.rc_delivered;
1904         l->timeStamp = cts;
1905         l->lost = bbr->r_ctl.rc_lost;
1906         l->bbr_state = bbr->rc_bbr_state;
1907         l->bbr_substate = bbr_state_val(bbr);
1908         l->epoch = bbr->r_ctl.rc_rtt_epoch;
1909         l->lt_epoch = bbr->r_ctl.rc_lt_epoch;
1910         l->pacing_gain = bbr->r_ctl.rc_bbr_hptsi_gain;
1911         l->cwnd_gain = bbr->r_ctl.rc_bbr_cwnd_gain;
1912         l->inhpts = bbr->rc_inp->inp_in_hpts;
1913         l->ininput = bbr->rc_inp->inp_in_input;
1914         l->use_lt_bw = bbr->rc_lt_use_bw;
1915         l->pkts_out = bbr->r_ctl.rc_flight_at_input;
1916         l->pkt_epoch = bbr->r_ctl.rc_pkt_epoch;
1917 }
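/*
 * Every bbr_log_* helper below follows the same shape: populate a
 * union tcp_log_stackspecific via bbr_fill_in_logging_data(), stash the
 * event-specific payload in the flex1..flex8 fields, and emit it with
 * TCP_LOG_EVENTP(). A minimal sketch (not compiled; bbr_log_example is
 * hypothetical) of a new event reusing an existing event id:
 */
#if 0
static void
bbr_log_example(struct tcp_bbr *bbr, uint32_t cts, uint32_t val)
{
	if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
		log.u_bbr.flex1 = val;	/* event-specific payload */
		TCP_LOG_EVENTP(bbr->rc_tp, NULL,
		    &bbr->rc_inp->inp_socket->so_rcv,
		    &bbr->rc_inp->inp_socket->so_snd,
		    BBR_LOG_TIME_EPOCH, 0,
		    0, &log, false, &bbr->rc_tv);
	}
}
#endif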
1918
1919 static void
1920 bbr_log_type_bw_reduce(struct tcp_bbr *bbr, int reason)
1921 {
1922         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1923                 union tcp_log_stackspecific log;
1924
1925                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
1926                 log.u_bbr.flex1 = 0;
1927                 log.u_bbr.flex2 = 0;
1928                 log.u_bbr.flex3 = 0;
1929                 log.u_bbr.flex4 = bbr->r_ctl.rc_pkt_epoch_loss_rate;
1930                 log.u_bbr.flex5 = 0;
1931                 log.u_bbr.flex6 = bbr->r_ctl.rc_bbr_enters_probertt;
1932                 log.u_bbr.flex7 = reason;
1933                 log.u_bbr.flex8 = 0;
1934                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
1935                     &bbr->rc_inp->inp_socket->so_rcv,
1936                     &bbr->rc_inp->inp_socket->so_snd,
1937                     BBR_LOG_BW_RED_EV, 0,
1938                     0, &log, false, &bbr->rc_tv);
1939         }
1940 }
1941
1942 static void
1943 bbr_log_type_rwnd_collapse(struct tcp_bbr *bbr, int seq, int mode, uint32_t count)
1944 {
1945         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1946                 union tcp_log_stackspecific log;
1947
1948                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
1949                 log.u_bbr.flex1 = seq;
1950                 log.u_bbr.flex2 = count;
1951                 log.u_bbr.flex8 = mode;
1952                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
1953                     &bbr->rc_inp->inp_socket->so_rcv,
1954                     &bbr->rc_inp->inp_socket->so_snd,
1955                     BBR_LOG_LOWGAIN, 0,
1956                     0, &log, false, &bbr->rc_tv);
1957         }
1958 }
1959
1960
1961
1962 static void
1963 bbr_log_type_just_return(struct tcp_bbr *bbr, uint32_t cts, uint32_t tlen, uint8_t hpts_calling,
1964     uint8_t reason, uint32_t p_maxseg, int len)
1965 {
1966         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1967                 union tcp_log_stackspecific log;
1968
1969                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
1970                 log.u_bbr.flex1 = p_maxseg;
1971                 log.u_bbr.flex2 = bbr->r_ctl.rc_hpts_flags;
1972                 log.u_bbr.flex3 = bbr->r_ctl.rc_timer_exp;
1973                 log.u_bbr.flex4 = reason;
1974                 log.u_bbr.flex5 = bbr->rc_in_persist;
1975                 log.u_bbr.flex6 = bbr->r_ctl.rc_last_delay_val;
1976                 log.u_bbr.flex7 = p_maxseg;
1977                 log.u_bbr.flex8 = bbr->rc_in_persist;
1978                 log.u_bbr.pkts_out = 0;
1979                 log.u_bbr.applimited = len;
1980                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
1981                     &bbr->rc_inp->inp_socket->so_rcv,
1982                     &bbr->rc_inp->inp_socket->so_snd,
1983                     BBR_LOG_JUSTRET, 0,
1984                     tlen, &log, false, &bbr->rc_tv);
1985         }
1986 }
1987
1988
1989 static void
1990 bbr_log_type_enter_rec(struct tcp_bbr *bbr, uint32_t seq)
1991 {
1992         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1993                 union tcp_log_stackspecific log;
1994
1995                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
1996                 log.u_bbr.flex1 = seq;
1997                 log.u_bbr.flex2 = bbr->r_ctl.rc_cwnd_on_ent;
1998                 log.u_bbr.flex3 = bbr->r_ctl.rc_recovery_start;
1999                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2000                     &bbr->rc_inp->inp_socket->so_rcv,
2001                     &bbr->rc_inp->inp_socket->so_snd,
2002                     BBR_LOG_ENTREC, 0,
2003                     0, &log, false, &bbr->rc_tv);
2004         }
2005 }
2006
2007 static void
2008 bbr_log_msgsize_fail(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t len, uint32_t maxseg, uint32_t mtu, int32_t csum_flags, int32_t tso, uint32_t cts)
2009 {
2010         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
2011                 union tcp_log_stackspecific log;
2012
2013                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2014                 log.u_bbr.flex1 = tso;
2015                 log.u_bbr.flex2 = maxseg;
2016                 log.u_bbr.flex3 = mtu;
2017                 log.u_bbr.flex4 = csum_flags;
2018                 TCP_LOG_EVENTP(tp, NULL,
2019                     &bbr->rc_inp->inp_socket->so_rcv,
2020                     &bbr->rc_inp->inp_socket->so_snd,
2021                     BBR_LOG_MSGSIZE, 0,
2022                     0, &log, false, &bbr->rc_tv);
2023         }
2024 }
2025
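/*
 * Unlike the helpers above, bbr_log_flowend() guards against a NULL
 * socket, presumably because it fires at connection teardown when the
 * inpcb may already be losing its socket.
 */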
2026 static void
2027 bbr_log_flowend(struct tcp_bbr *bbr)
2028 {
2029         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2030                 union tcp_log_stackspecific log;
2031                 struct sockbuf *r, *s;
2032                 struct timeval tv;
2033
2034                 if (bbr->rc_inp->inp_socket) {
2035                         r = &bbr->rc_inp->inp_socket->so_rcv;
2036                         s = &bbr->rc_inp->inp_socket->so_snd;
2037                 } else {
2038                         r = s = NULL;
2039                 }
2040                 bbr_fill_in_logging_data(bbr, &log.u_bbr, tcp_get_usecs(&tv));
2041                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2042                     r, s,
2043                     TCP_LOG_FLOWEND, 0,
2044                     0, &log, false, &tv);
2045         }
2046 }
2047
2048 static void
2049 bbr_log_pkt_epoch(struct tcp_bbr *bbr, uint32_t cts, uint32_t line,
2050     uint32_t lost, uint32_t del)
2051 {
2052         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2053                 union tcp_log_stackspecific log;
2054
2055                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2056                 log.u_bbr.flex1 = lost;
2057                 log.u_bbr.flex2 = del;
2058                 log.u_bbr.flex3 = bbr->r_ctl.rc_bbr_lastbtlbw;
2059                 log.u_bbr.flex4 = bbr->r_ctl.rc_pkt_epoch_rtt;
2060                 log.u_bbr.flex5 = bbr->r_ctl.rc_bbr_last_startup_epoch;
2061                 log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup;
2062                 log.u_bbr.flex7 = line;
2063                 log.u_bbr.flex8 = 0;
2064                 log.u_bbr.inflight = bbr->r_ctl.r_measurement_count;
2065                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2066                     &bbr->rc_inp->inp_socket->so_rcv,
2067                     &bbr->rc_inp->inp_socket->so_snd,
2068                     BBR_LOG_PKT_EPOCH, 0,
2069                     0, &log, false, &bbr->rc_tv);
2070         }
2071 }
2072
2073 static void
2074 bbr_log_time_epoch(struct tcp_bbr *bbr, uint32_t cts, uint32_t line, uint32_t epoch_time)
2075 {
2076         if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2077                 union tcp_log_stackspecific log;
2078
2079                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2080                 log.u_bbr.flex1 = bbr->r_ctl.rc_lost;
2081                 log.u_bbr.flex2 = bbr->rc_inp->inp_socket->so_snd.sb_lowat;
2082                 log.u_bbr.flex3 = bbr->rc_inp->inp_socket->so_snd.sb_hiwat;
2083                 log.u_bbr.flex7 = line;
2084                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2085                     &bbr->rc_inp->inp_socket->so_rcv,
2086                     &bbr->rc_inp->inp_socket->so_snd,
2087                     BBR_LOG_TIME_EPOCH, 0,
2088                     0, &log, false, &bbr->rc_tv);
2089         }
2090 }
2091
2092 static void
2093 bbr_log_set_of_state_target(struct tcp_bbr *bbr, uint32_t new_tar, int line, int meth)
2094 {
2095         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2096                 union tcp_log_stackspecific log;
2097
2098                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
2099                 log.u_bbr.flex1 = bbr->r_ctl.rc_target_at_state;
2100                 log.u_bbr.flex2 = new_tar;
2101                 log.u_bbr.flex3 = line;
2102                 log.u_bbr.flex4 = bbr->r_ctl.rc_pace_max_segs;
2103                 log.u_bbr.flex5 = bbr_quanta;
2104                 log.u_bbr.flex6 = bbr->r_ctl.rc_pace_min_segs;
2105                 log.u_bbr.flex7 = bbr->rc_last_options;
2106                 log.u_bbr.flex8 = meth;
2107                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2108                     &bbr->rc_inp->inp_socket->so_rcv,
2109                     &bbr->rc_inp->inp_socket->so_snd,
2110                     BBR_LOG_STATE_TARGET, 0,
2111                     0, &log, false, &bbr->rc_tv);
2112         }
2113
2114 }
2115
2116 static void
2117 bbr_log_type_statechange(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
2118 {
2119         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2120                 union tcp_log_stackspecific log;
2121
2122                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2123                 log.u_bbr.flex1 = line;
2124                 log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks;
2125                 log.u_bbr.flex3 = bbr->r_ctl.rc_probertt_int;
2126                 if (bbr_state_is_pkt_epoch)
2127                         log.u_bbr.flex4 = bbr_get_rtt(bbr, BBR_RTT_PKTRTT);
2128                 else
2129                         log.u_bbr.flex4 = bbr_get_rtt(bbr, BBR_RTT_PROP);
2130                 log.u_bbr.flex5 = bbr->r_ctl.rc_bbr_last_startup_epoch;
2131                 log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup;
2132                 log.u_bbr.flex7 = (bbr->r_ctl.rc_target_at_state/1000);
2133                 log.u_bbr.lt_epoch = bbr->r_ctl.rc_level_state_extra;
2134                 log.u_bbr.pkts_out = bbr->r_ctl.rc_target_at_state;
2135                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2136                     &bbr->rc_inp->inp_socket->so_rcv,
2137                     &bbr->rc_inp->inp_socket->so_snd,
2138                     BBR_LOG_STATE, 0,
2139                     0, &log, false, &bbr->rc_tv);
2140         }
2141 }
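/*
 * Note that flex4 above mirrors the state machine's clock source: with
 * bbr_state_is_pkt_epoch set (the "use_pkt_epoch" sysctl) the packet
 * RTT is logged, otherwise rttProp is.
 */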
2142
2143 static void
2144 bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied,
2145                     uint32_t rtt, uint32_t line, uint8_t reas, uint16_t cond)
2146 {
2147         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2148                 union tcp_log_stackspecific log;
2149
2150                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2151                 log.u_bbr.flex1 = line;
2152                 log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks;
2153                 log.u_bbr.flex3 = bbr->r_ctl.last_in_probertt;
2154                 log.u_bbr.flex4 = applied;
2155                 log.u_bbr.flex5 = rtt;
2156                 log.u_bbr.flex6 = bbr->r_ctl.rc_target_at_state;
2157                 log.u_bbr.flex7 = cond;
2158                 log.u_bbr.flex8 = reas;
2159                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2160                     &bbr->rc_inp->inp_socket->so_rcv,
2161                     &bbr->rc_inp->inp_socket->so_snd,
2162                     BBR_LOG_RTT_SHRINKS, 0,
2163                     0, &log, false, &bbr->rc_tv);
2164         }
2165 }
2166
2167 static void
2168 bbr_log_type_exit_rec(struct tcp_bbr *bbr)
2169 {
2170         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2171                 union tcp_log_stackspecific log;
2172
2173                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
2174                 log.u_bbr.flex1 = bbr->r_ctl.rc_recovery_start;
2175                 log.u_bbr.flex2 = bbr->r_ctl.rc_cwnd_on_ent;
2176                 log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
2177                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2178                     &bbr->rc_inp->inp_socket->so_rcv,
2179                     &bbr->rc_inp->inp_socket->so_snd,
2180                     BBR_LOG_EXITREC, 0,
2181                     0, &log, false, &bbr->rc_tv);
2182         }
2183 }
2184
2185 static void
2186 bbr_log_type_cwndupd(struct tcp_bbr *bbr, uint32_t bytes_this_ack, uint32_t chg,
2187     uint32_t prev_acked, int32_t meth, uint32_t target, uint32_t th_ack, int32_t line)
2188 {
2189         if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2190                 union tcp_log_stackspecific log;
2191
2192                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
2193                 log.u_bbr.flex1 = line;
2194                 log.u_bbr.flex2 = prev_acked;
2195                 log.u_bbr.flex3 = bytes_this_ack;
2196                 log.u_bbr.flex4 = chg;
2197                 log.u_bbr.flex5 = th_ack;
2198                 log.u_bbr.flex6 = target;
2199                 log.u_bbr.flex8 = meth;
2200                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2201                     &bbr->rc_inp->inp_socket->so_rcv,
2202                     &bbr->rc_inp->inp_socket->so_snd,
2203                     BBR_LOG_CWND, 0,
2204                     0, &log, false, &bbr->rc_tv);
2205         }
2206 }
2207
2208 static void
2209 bbr_log_rtt_sample(struct tcp_bbr *bbr, uint32_t rtt, uint32_t tsin)
2210 {
2211         /*
2212          * Log the rtt sample we are applying to the srtt algorithm in
2213          * useconds.
2214          */
2215         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2216                 union tcp_log_stackspecific log;
2217
2218                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
2219                 log.u_bbr.flex1 = rtt;
2220                 log.u_bbr.flex2 = bbr->r_ctl.rc_bbr_state_time;
2221                 log.u_bbr.flex3 = bbr->r_ctl.rc_ack_hdwr_delay;
2222                 log.u_bbr.flex4 = bbr->rc_tp->ts_offset;
2223                 log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
2224                 log.u_bbr.pkts_out = tcp_tv_to_mssectick(&bbr->rc_tv);
2225                 log.u_bbr.flex6 = tsin;
2226                 log.u_bbr.flex7 = 0;
2227                 log.u_bbr.flex8 = bbr->rc_ack_was_delayed;
2228                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2229                     &bbr->rc_inp->inp_socket->so_rcv,
2230                     &bbr->rc_inp->inp_socket->so_snd,
2231                     TCP_LOG_RTT, 0,
2232                     0, &log, false, &bbr->rc_tv);
2233         }
2234 }
2235
2236 static void
2237 bbr_log_type_pesist(struct tcp_bbr *bbr, uint32_t cts, uint32_t time_in, int32_t line, uint8_t enter_exit)
2238 {
2239         if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2240                 union tcp_log_stackspecific log;
2241
2242                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2243                 log.u_bbr.flex1 = time_in;
2244                 log.u_bbr.flex2 = line;
2245                 log.u_bbr.flex8 = enter_exit;
2246                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2247                     &bbr->rc_inp->inp_socket->so_rcv,
2248                     &bbr->rc_inp->inp_socket->so_snd,
2249                     BBR_LOG_PERSIST, 0,
2250                     0, &log, false, &bbr->rc_tv);
2251         }
2252 }

2253 static void
2254 bbr_log_ack_clear(struct tcp_bbr *bbr, uint32_t cts)
2255 {
2256         if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2257                 union tcp_log_stackspecific log;
2258
2259                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2260                 log.u_bbr.flex1 = bbr->rc_tp->ts_recent_age;
2261                 log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks;
2262                 log.u_bbr.flex3 = bbr->r_ctl.rc_probertt_int;
2263                 log.u_bbr.flex4 = bbr->r_ctl.rc_went_idle_time;
2264                 log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
2265                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2266                     &bbr->rc_inp->inp_socket->so_rcv,
2267                     &bbr->rc_inp->inp_socket->so_snd,
2268                     BBR_LOG_ACKCLEAR, 0,
2269                     0, &log, false, &bbr->rc_tv);
2270         }
2271 }
2272
2273 static void
2274 bbr_log_ack_event(struct tcp_bbr *bbr, struct tcphdr *th, struct tcpopt *to, uint32_t tlen,
2275                   uint16_t nsegs, uint32_t cts, int32_t nxt_pkt, struct mbuf *m)
2276 {
2277         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2278                 union tcp_log_stackspecific log;
2279                 struct timeval tv;
2280
2281                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2282                 log.u_bbr.flex1 = nsegs;
2283                 log.u_bbr.flex2 = bbr->r_ctl.rc_lost_bytes;
2284                 if (m) {
2285                         struct timespec ts;
2286
2287                         log.u_bbr.flex3 = m->m_flags;
2288                         if (m->m_flags & M_TSTMP) {
2289                                 mbuf_tstmp2timespec(m, &ts);
2290                                 tv.tv_sec = ts.tv_sec;
2291                                 tv.tv_usec = ts.tv_nsec / 1000;
2292                                 log.u_bbr.lt_epoch = tcp_tv_to_usectick(&tv);
2293                         } else {
2294                                 log.u_bbr.lt_epoch = 0;
2295                         }
2296                         if (m->m_flags & M_TSTMP_LRO) {
2297                                 tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
2298                                 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000;
2299                                 log.u_bbr.flex5 = tcp_tv_to_usectick(&tv);
2300                         } else {
2301                                 /* No arrival timestamp */
2302                                 log.u_bbr.flex5 = 0;
2303                         }
2304
2305                         log.u_bbr.pkts_out = tcp_get_usecs(&tv);
2306                 } else {
2307                         log.u_bbr.flex3 = 0;
2308                         log.u_bbr.flex5 = 0;
2309                         log.u_bbr.flex6 = 0;
2310                         log.u_bbr.pkts_out = 0;
2311                 }
2312                 log.u_bbr.flex4 = bbr->r_ctl.rc_target_at_state;
2313                 log.u_bbr.flex7 = bbr->r_wanted_output;
2314                 log.u_bbr.flex8 = bbr->rc_in_persist;
2315                 TCP_LOG_EVENTP(bbr->rc_tp, th,
2316                     &bbr->rc_inp->inp_socket->so_rcv,
2317                     &bbr->rc_inp->inp_socket->so_snd,
2318                     TCP_LOG_IN, 0,
2319                     tlen, &log, true, &bbr->rc_tv);
2320         }
2321 }
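/*
 * Timestamp arithmetic above: M_TSTMP is recovered as a nanosecond
 * timespec and M_TSTMP_LRO carries rcv_tstmp in raw nanoseconds; both
 * collapse to microsecond "usectick" values for the log. For example,
 * rcv_tstmp = 1500000123 ns yields tv_sec = 1 and tv_usec = 500000.
 */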
2322
2323 static void
2324 bbr_log_doseg_done(struct tcp_bbr *bbr, uint32_t cts, int32_t nxt_pkt, int32_t did_out)
2325 {
2326         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2327                 union tcp_log_stackspecific log;
2328
2329                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2330                 log.u_bbr.flex1 = did_out;
2331                 log.u_bbr.flex2 = nxt_pkt;
2332                 log.u_bbr.flex3 = bbr->r_ctl.rc_last_delay_val;
2333                 log.u_bbr.flex4 = bbr->r_ctl.rc_hpts_flags;
2334                 log.u_bbr.flex5 = bbr->r_ctl.rc_timer_exp;
2335                 log.u_bbr.flex6 = bbr->r_ctl.rc_lost_bytes;
2336                 log.u_bbr.flex7 = bbr->r_wanted_output;
2337                 log.u_bbr.flex8 = bbr->rc_in_persist;
2338                 log.u_bbr.pkts_out = bbr->r_ctl.highest_hdwr_delay;
2339                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2340                     &bbr->rc_inp->inp_socket->so_rcv,
2341                     &bbr->rc_inp->inp_socket->so_snd,
2342                     BBR_LOG_DOSEG_DONE, 0,
2343                     0, &log, true, &bbr->rc_tv);
2344         }
2345 }
2346
2347 static void
2348 bbr_log_enobuf_jmp(struct tcp_bbr *bbr, uint32_t len, uint32_t cts,
2349     int32_t line, uint32_t o_len, uint32_t segcnt, uint32_t segsiz)
2350 {
2351         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2352                 union tcp_log_stackspecific log;
2353
2354                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2355                 log.u_bbr.flex1 = line;
2356                 log.u_bbr.flex2 = o_len;
2357                 log.u_bbr.flex3 = segcnt;
2358                 log.u_bbr.flex4 = segsiz;
2359                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2360                     &bbr->rc_inp->inp_socket->so_rcv,
2361                     &bbr->rc_inp->inp_socket->so_snd,
2362                     BBR_LOG_ENOBUF_JMP, ENOBUFS,
2363                     len, &log, true, &bbr->rc_tv);
2364         }
2365 }
2366
2367 static void
2368 bbr_log_to_processing(struct tcp_bbr *bbr, uint32_t cts, int32_t ret, int32_t timers, uint8_t hpts_calling)
2369 {
2370         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2371                 union tcp_log_stackspecific log;
2372
2373                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2374                 log.u_bbr.flex1 = timers;
2375                 log.u_bbr.flex2 = ret;
2376                 log.u_bbr.flex3 = bbr->r_ctl.rc_timer_exp;
2377                 log.u_bbr.flex4 = bbr->r_ctl.rc_hpts_flags;
2378                 log.u_bbr.flex5 = cts;
2379                 log.u_bbr.flex6 = bbr->r_ctl.rc_target_at_state;
2380                 log.u_bbr.flex8 = hpts_calling;
2381                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2382                     &bbr->rc_inp->inp_socket->so_rcv,
2383                     &bbr->rc_inp->inp_socket->so_snd,
2384                     BBR_LOG_TO_PROCESS, 0,
2385                     0, &log, false, &bbr->rc_tv);
2386         }
2387 }
2388
2389 static void
2390 bbr_log_to_event(struct tcp_bbr *bbr, uint32_t cts, int32_t to_num)
2391 {
2392         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2393                 union tcp_log_stackspecific log;
2394                 uint64_t ar;
2395
2396                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2397                 log.u_bbr.flex1 = bbr->bbr_timer_src;
2398                 log.u_bbr.flex2 = 0;
2399                 log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
2400                 ar = (uint64_t)(bbr->r_ctl.rc_resend);
2401                 ar >>= 32;
2402                 ar &= 0x00000000ffffffff;
2403                 log.u_bbr.flex4 = (uint32_t)ar;
2404                 ar = (uint64_t)bbr->r_ctl.rc_resend;
2405                 ar &= 0x00000000ffffffff;
2406                 log.u_bbr.flex5 = (uint32_t)ar;
2407                 log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
2408                 log.u_bbr.flex8 = to_num;
2409                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2410                     &bbr->rc_inp->inp_socket->so_rcv,
2411                     &bbr->rc_inp->inp_socket->so_snd,
2412                     BBR_LOG_RTO, 0,
2413                     0, &log, false, &bbr->rc_tv);
2414         }
2415 }
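/*
 * The rc_resend pointer above is logged as two 32-bit halves (flex4 =
 * upper word, flex5 = lower) so a 64-bit kernel pointer survives the
 * 32-bit flex fields; post-processing can reassemble it as
 * ((uint64_t)flex4 << 32) | flex5.
 */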
2416
2417 static void
2418 bbr_log_startup_event(struct tcp_bbr *bbr, uint32_t cts, uint32_t flex1, uint32_t flex2, uint32_t flex3, uint8_t reason)
2419 {
2420         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2421                 union tcp_log_stackspecific log;
2422
2423                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2424                 log.u_bbr.flex1 = flex1;
2425                 log.u_bbr.flex2 = flex2;
2426                 log.u_bbr.flex3 = flex3;
2427                 log.u_bbr.flex4 = 0;
2428                 log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
2429                 log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup;
2430                 log.u_bbr.flex8 = reason;
2431                 log.u_bbr.cur_del_rate = bbr->r_ctl.rc_bbr_lastbtlbw;
2432                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2433                     &bbr->rc_inp->inp_socket->so_rcv,
2434                     &bbr->rc_inp->inp_socket->so_snd,
2435                     BBR_LOG_REDUCE, 0,
2436                     0, &log, false, &bbr->rc_tv);
2437         }
2438 }
2439
2440 static void
2441 bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag)
2442 {
2443         if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2444                 union tcp_log_stackspecific log;
2445
2446                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2447                 log.u_bbr.flex1 = diag->p_nxt_slot;
2448                 log.u_bbr.flex2 = diag->p_cur_slot;
2449                 log.u_bbr.flex3 = diag->slot_req;
2450                 log.u_bbr.flex4 = diag->inp_hptsslot;
2451                 log.u_bbr.flex5 = diag->slot_remaining;
2452                 log.u_bbr.flex6 = diag->need_new_to;
2453                 log.u_bbr.flex7 = diag->p_hpts_active;
2454                 log.u_bbr.flex8 = diag->p_on_min_sleep;
2455                 /* Hijack other fields as needed  */
2456                 log.u_bbr.epoch = diag->have_slept;
2457                 log.u_bbr.lt_epoch = diag->yet_to_sleep;
2458                 log.u_bbr.pkts_out = diag->co_ret;
2459                 log.u_bbr.applimited = diag->hpts_sleep_time;
2460                 log.u_bbr.delivered = diag->p_prev_slot;
2461                 log.u_bbr.inflight = diag->p_runningtick;
2462                 log.u_bbr.bw_inuse = diag->wheel_tick;
2463                 log.u_bbr.rttProp = diag->wheel_cts;
2464                 log.u_bbr.delRate = diag->maxticks;
2465                 log.u_bbr.cur_del_rate = diag->p_curtick;
2466                 log.u_bbr.cur_del_rate <<= 32;
2467                 log.u_bbr.cur_del_rate |= diag->p_lasttick;
2468                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2469                     &bbr->rc_inp->inp_socket->so_rcv,
2470                     &bbr->rc_inp->inp_socket->so_snd,
2471                     BBR_LOG_HPTSDIAG, 0,
2472                     0, &log, false, &bbr->rc_tv);
2473         }
2474 }
2475
2476 static void
2477 bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, uint32_t time_since_sent, uint32_t srtt,
2478     uint32_t thresh, uint32_t to)
2479 {
2480         if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2481                 union tcp_log_stackspecific log;
2482
2483                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2484                 log.u_bbr.flex1 = bbr->rc_tp->t_rttvar;
2485                 log.u_bbr.flex2 = time_since_sent;
2486                 log.u_bbr.flex3 = srtt;
2487                 log.u_bbr.flex4 = thresh;
2488                 log.u_bbr.flex5 = to;
2489                 log.u_bbr.flex6 = bbr->rc_tp->t_srtt;
2490                 log.u_bbr.flex8 = mode;
2491                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2492                     &bbr->rc_inp->inp_socket->so_rcv,
2493                     &bbr->rc_inp->inp_socket->so_snd,
2494                     BBR_LOG_TIMERPREP, 0,
2495                     0, &log, false, &bbr->rc_tv);
2496         }
2497 }
2498
2499 static void
2500 bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len,
2501     uint32_t cts, uint32_t usecs, uint64_t bw, uint32_t override, int mod)
2502 {
2503         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2504                 union tcp_log_stackspecific log;
2505
2506                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2507                 log.u_bbr.flex1 = usecs;
2508                 log.u_bbr.flex2 = len;
2509                 log.u_bbr.flex3 = (uint32_t)((bw >> 32) & 0x00000000ffffffff);
2510                 log.u_bbr.flex4 = (uint32_t)(bw & 0x00000000ffffffff);
2511                 if (override)
2512                         log.u_bbr.flex5 = (1 << 2);
2513                 else
2514                         log.u_bbr.flex5 = 0;
2515                 log.u_bbr.flex6 = override;
2516                 log.u_bbr.flex7 = gain;
2517                 log.u_bbr.flex8 = mod;
2518                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2519                     &bbr->rc_inp->inp_socket->so_rcv,
2520                     &bbr->rc_inp->inp_socket->so_snd,
2521                     BBR_LOG_HPTSI_CALC, 0,
2522                     len, &log, false, &bbr->rc_tv);
2523         }
2524 }
2525
2526 static void
2527 bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
2528 {
2529         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2530                 union tcp_log_stackspecific log;
2531
2532                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2533
2534                 log.u_bbr.flex1 = bbr->bbr_timer_src;
2535                 log.u_bbr.flex2 = to;
2536                 log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
2537                 log.u_bbr.flex4 = slot;
2538                 log.u_bbr.flex5 = bbr->rc_inp->inp_hptsslot;
2539                 log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
2540                 log.u_bbr.pkts_out = bbr->rc_inp->inp_flags2;
2541                 log.u_bbr.flex8 = which;
2542                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2543                     &bbr->rc_inp->inp_socket->so_rcv,
2544                     &bbr->rc_inp->inp_socket->so_snd,
2545                     BBR_LOG_TIMERSTAR, 0,
2546                     0, &log, false, &bbr->rc_tv);
2547         }
2548 }
2549
2550 static void
2551 bbr_log_thresh_choice(struct tcp_bbr *bbr, uint32_t cts, uint32_t thresh, uint32_t lro, uint32_t srtt, struct bbr_sendmap *rsm, uint8_t frm)
2552 {
2553         if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2554                 union tcp_log_stackspecific log;
2555
2556                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2557                 log.u_bbr.flex1 = thresh;
2558                 log.u_bbr.flex2 = lro;
2559                 log.u_bbr.flex3 = bbr->r_ctl.rc_reorder_ts;
2560                 log.u_bbr.flex4 = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
2561                 log.u_bbr.flex5 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
2562                 log.u_bbr.flex6 = srtt;
2563                 log.u_bbr.flex7 = bbr->r_ctl.rc_reorder_shift;
2564                 log.u_bbr.flex8 = frm;
2565                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2566                     &bbr->rc_inp->inp_socket->so_rcv,
2567                     &bbr->rc_inp->inp_socket->so_snd,
2568                     BBR_LOG_THRESH_CALC, 0,
2569                     0, &log, false, &bbr->rc_tv);
2570         }
2571 }
2572
2573 static void
2574 bbr_log_to_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts, uint8_t hpts_removed)
2575 {
2576         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2577                 union tcp_log_stackspecific log;
2578
2579                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2580                 log.u_bbr.flex1 = line;
2581                 log.u_bbr.flex2 = bbr->bbr_timer_src;
2582                 log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
2583                 log.u_bbr.flex4 = bbr->rc_in_persist;
2584                 log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
2585                 log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
2586                 log.u_bbr.flex8 = hpts_removed;
2587                 log.u_bbr.pkts_out = bbr->rc_pacer_started;
2588                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2589                     &bbr->rc_inp->inp_socket->so_rcv,
2590                     &bbr->rc_inp->inp_socket->so_snd,
2591                     BBR_LOG_TIMERCANC, 0,
2592                     0, &log, false, &bbr->rc_tv);
2593         }
2594 }
2595
2596
2597 static void
2598 bbr_log_tstmp_validation(struct tcp_bbr *bbr, uint64_t peer_delta, uint64_t delta)
2599 {
2600         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2601                 union tcp_log_stackspecific log;
2602
2603                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
2604                 log.u_bbr.flex1 = bbr->r_ctl.bbr_peer_tsratio;
2605                 log.u_bbr.flex2 = (peer_delta >> 32);
2606                 log.u_bbr.flex3 = (peer_delta & 0x00000000ffffffff);
2607                 log.u_bbr.flex4 = (delta >> 32);
2608                 log.u_bbr.flex5 = (delta & 0x00000000ffffffff);
2609                 log.u_bbr.flex7 = bbr->rc_ts_clock_set;
2610                 log.u_bbr.flex8 = bbr->rc_ts_cant_be_used;
2611                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2612                     &bbr->rc_inp->inp_socket->so_rcv,
2613                     &bbr->rc_inp->inp_socket->so_snd,
2614                     BBR_LOG_TSTMP_VAL, 0,
2615                     0, &log, false, &bbr->rc_tv);
2616
2617         }
2618 }
2619
2620 static void
2621 bbr_log_type_tsosize(struct tcp_bbr *bbr, uint32_t cts, uint32_t tsosz, uint32_t tls, uint32_t old_val, uint32_t maxseg, int hdwr)
2622 {
2623         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2624                 union tcp_log_stackspecific log;
2625
2626                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2627                 log.u_bbr.flex1 = tsosz;
2628                 log.u_bbr.flex2 = tls;
2629                 log.u_bbr.flex3 = tcp_min_hptsi_time;
2630                 log.u_bbr.flex4 = bbr->r_ctl.bbr_hptsi_bytes_min;
2631                 log.u_bbr.flex5 = old_val;
2632                 log.u_bbr.flex6 = maxseg;
2633                 log.u_bbr.flex7 = bbr->rc_no_pacing;
2634                 log.u_bbr.flex7 <<= 1;
2635                 log.u_bbr.flex7 |= bbr->rc_past_init_win;
2636                 if (hdwr)
2637                         log.u_bbr.flex8 = 0x80 | bbr->rc_use_google;
2638                 else
2639                         log.u_bbr.flex8 = bbr->rc_use_google;
2640                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2641                     &bbr->rc_inp->inp_socket->so_rcv,
2642                     &bbr->rc_inp->inp_socket->so_snd,
2643                     BBR_LOG_BBRTSO, 0,
2644                     0, &log, false, &bbr->rc_tv);
2645         }
2646 }
2647
2648 static void
2649 bbr_log_type_rsmclear(struct tcp_bbr *bbr, uint32_t cts, struct bbr_sendmap *rsm,
2650                       uint32_t flags, uint32_t line)
2651 {
2652         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2653                 union tcp_log_stackspecific log;
2654
2655                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2656                 log.u_bbr.flex1 = line;
2657                 log.u_bbr.flex2 = rsm->r_start;
2658                 log.u_bbr.flex3 = rsm->r_end;
2659                 log.u_bbr.flex4 = rsm->r_delivered;
2660                 log.u_bbr.flex5 = rsm->r_rtr_cnt;
2661                 log.u_bbr.flex6 = rsm->r_dupack;
2662                 log.u_bbr.flex7 = rsm->r_tim_lastsent[0];
2663                 log.u_bbr.flex8 = rsm->r_flags;
2664                 /* Hijack the applimited field */
2665                 log.u_bbr.applimited = flags;
2666                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2667                     &bbr->rc_inp->inp_socket->so_rcv,
2668                     &bbr->rc_inp->inp_socket->so_snd,
2669                     BBR_RSM_CLEARED, 0,
2670                     0, &log, false, &bbr->rc_tv);
2671         }
2672 }
2673
2674 static void
2675 bbr_log_type_bbrupd(struct tcp_bbr *bbr, uint8_t flex8, uint32_t cts,
2676     uint32_t flex3, uint32_t flex2, uint32_t flex5,
2677     uint32_t flex6, uint32_t pkts_out, int flex7,
2678     uint32_t flex4, uint32_t flex1)
2679 {
2680
2681         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2682                 union tcp_log_stackspecific log;
2683
2684                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2685                 log.u_bbr.flex1 = flex1;
2686                 log.u_bbr.flex2 = flex2;
2687                 log.u_bbr.flex3 = flex3;
2688                 log.u_bbr.flex4 = flex4;
2689                 log.u_bbr.flex5 = flex5;
2690                 log.u_bbr.flex6 = flex6;
2691                 log.u_bbr.flex7 = flex7;
2692                 /* Hijack the pkts_out field */
2693                 log.u_bbr.pkts_out = pkts_out;
2694                 log.u_bbr.flex8 = flex8;
2695                 if (bbr->rc_ack_was_delayed)
2696                         log.u_bbr.epoch = bbr->r_ctl.rc_ack_hdwr_delay;
2697                 else
2698                         log.u_bbr.epoch = 0;
2699                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2700                     &bbr->rc_inp->inp_socket->so_rcv,
2701                     &bbr->rc_inp->inp_socket->so_snd,
2702                     BBR_LOG_BBRUPD, 0,
2703                     flex2, &log, false, &bbr->rc_tv);
2704         }
2705 }
2706
2707
2708 static void
2709 bbr_log_type_ltbw(struct tcp_bbr *bbr, uint32_t cts, int32_t reason,
2710         uint32_t newbw, uint32_t obw, uint32_t diff,
2711         uint32_t tim)
2712 {
2713         if (/*bbr_verbose_logging && */(bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2714                 union tcp_log_stackspecific log;
2715
2716                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2717                 log.u_bbr.flex1 = reason;
2718                 log.u_bbr.flex2 = newbw;
2719                 log.u_bbr.flex3 = obw;
2720                 log.u_bbr.flex4 = diff;
2721                 log.u_bbr.flex5 = bbr->r_ctl.rc_lt_lost;
2722                 log.u_bbr.flex6 = bbr->r_ctl.rc_lt_del;
2723                 log.u_bbr.flex7 = bbr->rc_lt_is_sampling;
2724                 log.u_bbr.pkts_out = tim;
2725                 log.u_bbr.bw_inuse = bbr->r_ctl.rc_lt_bw;
2726                 if (bbr->rc_lt_use_bw == 0)
2727                         log.u_bbr.epoch = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch;
2728                 else
2729                         log.u_bbr.epoch = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch_use;
2730                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2731                     &bbr->rc_inp->inp_socket->so_rcv,
2732                     &bbr->rc_inp->inp_socket->so_snd,
2733                     BBR_LOG_BWSAMP, 0,
2734                     0, &log, false, &bbr->rc_tv);
2735         }
2736 }
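
/*
 * The "reason" values passed to bbr_log_type_ltbw() throughout this
 * file: 0 stop using lt-bw, 1 begin sampling, 2 reset due to being
 * app-limited, 3 reset because the sample ran too long, 4 start
 * considering ourselves policed, 5 record a first sample, 6 not enough
 * time or no loss, 7 false-positive check stayed below threshold,
 * 8 false positive detected.
 */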
2737
2738 static inline void
2739 bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line)
2740 {
2741         if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2742                 union tcp_log_stackspecific log;
2743
2744                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
2745                 log.u_bbr.flex1 = line;
2746                 log.u_bbr.flex2 = tick;
2747                 log.u_bbr.flex3 = tp->t_maxunacktime;
2748                 log.u_bbr.flex4 = tp->t_acktime;
2749                 log.u_bbr.flex8 = event;
2750                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2751                     &bbr->rc_inp->inp_socket->so_rcv,
2752                     &bbr->rc_inp->inp_socket->so_snd,
2753                     BBR_LOG_PROGRESS, 0,
2754                     0, &log, false, &bbr->rc_tv);
2755         }
2756 }
2757
2758 static void
2759 bbr_type_log_hdwr_pacing(struct tcp_bbr *bbr, const struct ifnet *ifp,
2760                          uint64_t rate, uint64_t hw_rate, int line, uint32_t cts,
2761                          int error)
2762 {
2763         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2764                 union tcp_log_stackspecific log;
2765
2766                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2767                 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff);
2768                 log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff);
2769                 log.u_bbr.flex3 = (((uint64_t)ifp  >> 32) & 0x00000000ffffffff);
2770                 log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff);
2771                 log.u_bbr.bw_inuse = rate;
2772                 log.u_bbr.flex5 = line;
2773                 log.u_bbr.flex6 = error;
2774                 log.u_bbr.flex8 = bbr->skip_gain;
2775                 log.u_bbr.flex8 <<= 1;
2776                 log.u_bbr.flex8 |= bbr->gain_is_limited;
2777                 log.u_bbr.flex8 <<= 1;
2778                 log.u_bbr.flex8 |= bbr->bbr_hdrw_pacing;
2779                 log.u_bbr.pkts_out = bbr->rc_tp->t_maxseg;
2780                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2781                     &bbr->rc_inp->inp_socket->so_rcv,
2782                     &bbr->rc_inp->inp_socket->so_snd,
2783                     BBR_LOG_HDWR_PACE, 0,
2784                     0, &log, false, &bbr->rc_tv);
2785         }
2786 }
2787
2788 static void
2789 bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay)
2790 {
2791         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2792                 union tcp_log_stackspecific log;
2793
2794                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2795                 log.u_bbr.flex1 = slot;
2796                 log.u_bbr.flex2 = del_by;
2797                 log.u_bbr.flex3 = prev_delay;
2798                 log.u_bbr.flex4 = line;
2799                 log.u_bbr.flex5 = bbr->r_ctl.rc_last_delay_val;
2800                 log.u_bbr.flex6 = bbr->r_ctl.rc_hptsi_agg_delay;
2801                 log.u_bbr.flex7 = (0x0000ffff & bbr->r_ctl.rc_hpts_flags);
2802                 log.u_bbr.flex8 = bbr->rc_in_persist;
2803                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2804                     &bbr->rc_inp->inp_socket->so_rcv,
2805                     &bbr->rc_inp->inp_socket->so_snd,
2806                     BBR_LOG_BBRSND, 0,
2807                     len, &log, false, &bbr->rc_tv);
2808         }
2809 }
2810
2811 static void
2812 bbr_log_type_bbrrttprop(struct tcp_bbr *bbr, uint32_t t, uint32_t end, uint32_t tsconv, uint32_t cts, int32_t match, uint32_t seq, uint8_t flags)
2813 {
2814         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2815                 union tcp_log_stackspecific log;
2816
2817                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2818                 log.u_bbr.flex1 = bbr->r_ctl.rc_delivered;
2819                 log.u_bbr.flex2 = 0;
2820                 log.u_bbr.flex3 = bbr->r_ctl.rc_lowest_rtt;
2821                 log.u_bbr.flex4 = end;
2822                 log.u_bbr.flex5 = seq;
2823                 log.u_bbr.flex6 = t;
2824                 log.u_bbr.flex7 = match;
2825                 log.u_bbr.flex8 = flags;
2826                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2827                     &bbr->rc_inp->inp_socket->so_rcv,
2828                     &bbr->rc_inp->inp_socket->so_snd,
2829                     BBR_LOG_BBRRTT, 0,
2830                     0, &log, false, &bbr->rc_tv);
2831         }
2832 }
2833
2834 static void
2835 bbr_log_exit_gain(struct tcp_bbr *bbr, uint32_t cts, int32_t entry_method)
2836 {
2837         if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2838                 union tcp_log_stackspecific log;
2839
2840                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
2841                 log.u_bbr.flex1 = bbr->r_ctl.rc_target_at_state;
2842                 log.u_bbr.flex2 = (bbr->rc_tp->t_maxseg - bbr->rc_last_options);
2843                 log.u_bbr.flex3 = bbr->r_ctl.gain_epoch;
2844                 log.u_bbr.flex4 = bbr->r_ctl.rc_pace_max_segs;
2845                 log.u_bbr.flex5 = bbr->r_ctl.rc_pace_min_segs;
2846                 log.u_bbr.flex6 = bbr->r_ctl.rc_bbr_state_atflight;
2847                 log.u_bbr.flex7 = 0;
2848                 log.u_bbr.flex8 = entry_method;
2849                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2850                     &bbr->rc_inp->inp_socket->so_rcv,
2851                     &bbr->rc_inp->inp_socket->so_snd,
2852                     BBR_LOG_EXIT_GAIN, 0,
2853                     0, &log, false, &bbr->rc_tv);
2854         }
2855 }
2856
2857 static void
2858 bbr_log_settings_change(struct tcp_bbr *bbr, int settings_desired)
2859 {
2860         if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2861                 union tcp_log_stackspecific log;
2862
2863                 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
2864                 /* R-HU */
2865                 log.u_bbr.flex1 = 0;
2866                 log.u_bbr.flex2 = 0;
2867                 log.u_bbr.flex3 = 0;
2868                 log.u_bbr.flex4 = 0;
2869                 log.u_bbr.flex7 = 0;
2870                 log.u_bbr.flex8 = settings_desired;
2871
2872                 TCP_LOG_EVENTP(bbr->rc_tp, NULL,
2873                     &bbr->rc_inp->inp_socket->so_rcv,
2874                     &bbr->rc_inp->inp_socket->so_snd,
2875                     BBR_LOG_SETTINGS_CHG, 0,
2876                     0, &log, false, &bbr->rc_tv);
2877         }
2878 }
2879
2880 /*
2881  * Returns the bw from our delivery rate filter.
2882  */
2883 static inline uint64_t
2884 bbr_get_full_bw(struct tcp_bbr *bbr)
2885 {
2886         uint64_t bw;
2887
2888         bw = get_filter_value(&bbr->r_ctl.rc_delrate);
2889
2890         return (bw);
2891 }
2892
2893 static inline void
2894 bbr_set_pktepoch(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
2895 {
2896         uint64_t calclr;
2897         uint32_t lost, del;
2898
2899         if (bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_pktepoch)
2900                 lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lost_at_pktepoch;
2901         else
2902                 lost = 0;
2903         del = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_pkt_epoch_del;
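        /*
         * The pkt-epoch loss rate is kept in tenths of a percent:
         * lost * 1000 / delivered, e.g. 5 lost out of 250 delivered
         * yields 20 (2.0%), and 1000 means nothing was delivered.
         */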
2904         if (lost == 0)  {
2905                 calclr = 0;
2906         } else if (del) {
2907                 calclr = lost;
2908                 calclr *= (uint64_t)1000;
2909                 calclr /= (uint64_t)del;
2910         } else {
2911                 /* Nothing delivered? 100.0% loss */
2912                 calclr = 1000;
2913         }
2914         bbr->r_ctl.rc_pkt_epoch_loss_rate =  (uint32_t)calclr;
2915         if (IN_RECOVERY(bbr->rc_tp->t_flags))
2916                 bbr->r_ctl.recovery_lr += (uint32_t)calclr;
2917         bbr->r_ctl.rc_pkt_epoch++;
2918         if (bbr->rc_no_pacing &&
2919             (bbr->r_ctl.rc_pkt_epoch >= bbr->no_pacing_until)) {
2920                 bbr->rc_no_pacing = 0;
2921                 tcp_bbr_tso_size_check(bbr, cts);
2922         }
2923         bbr->r_ctl.rc_pkt_epoch_rtt = bbr_calc_time(cts, bbr->r_ctl.rc_pkt_epoch_time);
2924         bbr->r_ctl.rc_pkt_epoch_time = cts;
2925         /* What was our loss rate */
2926         bbr_log_pkt_epoch(bbr, cts, line, lost, del);
2927         bbr->r_ctl.rc_pkt_epoch_del = bbr->r_ctl.rc_delivered;
2928         bbr->r_ctl.rc_lost_at_pktepoch = bbr->r_ctl.rc_lost;
2929 }
2930
2931 static inline void
2932 bbr_set_epoch(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
2933 {
2934         uint32_t epoch_time;
2935
2936         /* Tick the RTT clock */
2937         bbr->r_ctl.rc_rtt_epoch++;
2938         epoch_time = cts - bbr->r_ctl.rc_rcv_epoch_start;
2939         bbr_log_time_epoch(bbr, cts, line, epoch_time);
2940         bbr->r_ctl.rc_rcv_epoch_start = cts;
2941 }
2942
2943
2944 static inline void
2945 bbr_isit_a_pkt_epoch(struct tcp_bbr *bbr, uint32_t cts, struct bbr_sendmap *rsm, int32_t line, int32_t cum_acked)
2946 {
2947         if (SEQ_GEQ(rsm->r_delivered, bbr->r_ctl.rc_pkt_epoch_del)) {
2948                 bbr->rc_is_pkt_epoch_now = 1;
2949         }
2950 }
2951
2952 /*
2953  * Returns the bw from either the b/w filter
2954  * or from the lt_bw (if the connection is being
2955  * policed).
2956  */
2957 static inline uint64_t
2958 __bbr_get_bw(struct tcp_bbr *bbr)
2959 {
2960         uint64_t bw, min_bw;
2961         uint64_t rtt;
2962         int gm_measure_cnt = 1;
2963
2964         /*
2965          * For startup we make, like google, a
2966          * minimum b/w. This is generated from the
2967          * IW and the rttProp. We do fall back to srtt
2968          * if for some reason (initial handshake) we don't
2969          * have a rttProp. We, in the worst case, fall back
2970          * to the configured min_bw (rc_initial_hptsi_bw).
2971          */
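        /*
         * The guess is simply IW / rttProp converted to bytes per
         * second, e.g. a 14600 byte initial window over a 10000 usec
         * rttProp gives 14600 * 1000000 / 10000 = 1,460,000 bytes/sec.
         */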
2972         if (bbr->rc_bbr_state == BBR_STATE_STARTUP) {
2973                 /* Attempt first to use rttProp */
2974                 rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop);
2975                 if (rtt && (rtt < 0xffffffff)) {
2976 measure:
2977                         min_bw = (uint64_t)(bbr_initial_cwnd(bbr, bbr->rc_tp)) *
2978                                 ((uint64_t)1000000);
2979                         min_bw /= rtt;
2980                         if (min_bw < bbr->r_ctl.rc_initial_hptsi_bw) {
2981                                 min_bw = bbr->r_ctl.rc_initial_hptsi_bw;
2982                         }
2983
2984                 } else if (bbr->rc_tp->t_srtt != 0) {
2985                         /* No rttProp, use srtt? */
2986                         rtt = bbr_get_rtt(bbr, BBR_SRTT);
2987                         goto measure;
2988                 } else {
2989                         min_bw = bbr->r_ctl.rc_initial_hptsi_bw;
2990                 }
2991         } else
2992                 min_bw = 0;
2993
2994         if ((bbr->rc_past_init_win == 0) &&
2995             (bbr->r_ctl.rc_delivered > bbr_initial_cwnd(bbr, bbr->rc_tp)))
2996                 bbr->rc_past_init_win = 1;
2997         if ((bbr->rc_use_google)  && (bbr->r_ctl.r_measurement_count >= 1))
2998                 gm_measure_cnt = 0;
2999         if (gm_measure_cnt &&
3000             ((bbr->r_ctl.r_measurement_count < bbr_min_measurements_req) ||
3001              (bbr->rc_past_init_win == 0))) {
3002                 /* For google we use our guess rate until we get 1 measurement */
3003
3004 use_initial_window:
3005                 rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop);
3006                 if (rtt && (rtt < 0xffffffff)) {
3007                         /*
3008                          * We have an RTT measurement. Use that in
3009                          * combination with our initial window to calculate
3010                          * a b/w.
3011                          */
3012                         bw = (uint64_t)(bbr_initial_cwnd(bbr, bbr->rc_tp)) *
3013                                 ((uint64_t)1000000);
3014                         bw /= rtt;
3015                         if (bw < bbr->r_ctl.rc_initial_hptsi_bw) {
3016                                 bw = bbr->r_ctl.rc_initial_hptsi_bw;
3017                         }
3018                 } else {
3019                         /* No usable rttProp yet; punt to the configured default */
3020                         bw = bbr->r_ctl.rc_initial_hptsi_bw;
3021                 }
3022                 if (bw < 1)
3023                         /* Probably should panic */
3024                         bw = 1;
3025                 if (bw > min_bw)
3026                         return (bw);
3027                 else
3028                         return (min_bw);
3029         }
3030         if (bbr->rc_lt_use_bw)
3031                 bw = bbr->r_ctl.rc_lt_bw;
3032         else if (bbr->r_recovery_bw && (bbr->rc_use_google == 0))
3033                 bw = bbr->r_ctl.red_bw;
3034         else
3035                 bw = get_filter_value(&bbr->r_ctl.rc_delrate);
3036         if (bbr->rc_tp->t_peakrate_thr && (bbr->rc_use_google == 0)) {
3037                 /*
3038                  * Enforce user set rate limit, keep in mind that
3039                  * t_peakrate_thr is in B/s already
3040                  */
3041                 bw = uqmin((uint64_t)bbr->rc_tp->t_peakrate_thr, bw);
3042         }
3043         if (bw == 0) {
3044                 /* We should not be at 0, go to the initial window then  */
3045                 goto use_initial_window;
3046         }
3047         if (bw < 1)
3048                 /* Probably should panic */
3049                 bw = 1;
3050         if (bw < min_bw)
3051                 bw = min_bw;
3052         return (bw);
3053 }
3054
3055 static inline uint64_t
3056 bbr_get_bw(struct tcp_bbr *bbr)
3057 {
3058         uint64_t bw;
3059
3060         bw = __bbr_get_bw(bbr);
3061         return (bw);
3062 }
3063
3064 static inline void
3065 bbr_reset_lt_bw_interval(struct tcp_bbr *bbr, uint32_t cts)
3066 {
3067         bbr->r_ctl.rc_lt_epoch = bbr->r_ctl.rc_pkt_epoch;
3068         bbr->r_ctl.rc_lt_time = bbr->r_ctl.rc_del_time;
3069         bbr->r_ctl.rc_lt_del = bbr->r_ctl.rc_delivered;
3070         bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
3071 }
3072
3073 static inline void
3074 bbr_reset_lt_bw_sampling(struct tcp_bbr *bbr, uint32_t cts)
3075 {
3076         bbr->rc_lt_is_sampling = 0;
3077         bbr->rc_lt_use_bw = 0;
3078         bbr->r_ctl.rc_lt_bw = 0;
3079         bbr_reset_lt_bw_interval(bbr, cts);
3080 }
3081
3082 static inline void
3083 bbr_lt_bw_samp_done(struct tcp_bbr *bbr, uint64_t bw, uint32_t cts, uint32_t timin)
3084 {
3085         uint64_t diff;
3086
3087         /* Do we have a previous sample? */
3088         if (bbr->r_ctl.rc_lt_bw) {
3089                 /* Get the diff in bytes per second */
3090                 if (bbr->r_ctl.rc_lt_bw > bw)
3091                         diff = bbr->r_ctl.rc_lt_bw - bw;
3092                 else
3093                         diff = bw - bbr->r_ctl.rc_lt_bw;
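                /*
                 * Two consecutive interval samples that agree to within
                 * bbr_lt_bw_diff bytes/sec, or to within 1/bbr_lt_bw_ratio
                 * of the previous sample, are taken as evidence of a
                 * policer enforcing a fixed rate.
                 */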
3094                 if ((diff <= bbr_lt_bw_diff) ||
3095                     (diff <= (bbr->r_ctl.rc_lt_bw / bbr_lt_bw_ratio))) {
3096                         /* Consider us policed */
3097                         uint32_t saved_bw;
3098
3099                         saved_bw = (uint32_t)bbr->r_ctl.rc_lt_bw;
3100                         bbr->r_ctl.rc_lt_bw = (bw + bbr->r_ctl.rc_lt_bw) / 2;   /* average of two */
3101                         bbr->rc_lt_use_bw = 1;
3102                         bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
3103                         /*
3104                          * Use the packet-based epoch to measure how
3105                          * long the policer has been up.
3106                          */
3107                         bbr->r_ctl.rc_lt_epoch_use = bbr->r_ctl.rc_pkt_epoch;
3108                         /*
3109                          * reason 4 is that we need to start considering
3110                          * ourselves policed.
3111                          */
3112                         bbr_log_type_ltbw(bbr, cts, 4, (uint32_t)bw, saved_bw, (uint32_t)diff, timin);
3113                         return;
3114                 }
3115         }
3116         bbr->r_ctl.rc_lt_bw = bw;
3117         bbr_reset_lt_bw_interval(bbr, cts);
3118         bbr_log_type_ltbw(bbr, cts, 5, 0, (uint32_t)bw, 0, timin);
3119 }
3120
3121 /*
3122  * RRS: Copied from user space!
3123  * Calculate a uniformly distributed random number less than upper_bound
3124  * avoiding "modulo bias".
3125  *
3126  * Uniformity is achieved by generating new random numbers until the one
3127  * returned is outside the range [0, 2**32 % upper_bound).  This
3128  * guarantees the selected random number will be inside
3129  * [2**32 % upper_bound, 2**32) which maps back to [0, upper_bound)
3130  * after reduction modulo upper_bound.
3131  */
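/*
 * For example, with upper_bound = 3: 2**32 % 3 == 1, so min == 1 and
 * only r == 0 is rejected; every surviving r maps uniformly onto
 * {0, 1, 2} via r % 3.
 */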
3132 static uint32_t
3133 arc4random_uniform(uint32_t upper_bound)
3134 {
3135         uint32_t r, min;
3136
3137         if (upper_bound < 2)
3138                 return 0;
3139
3140         /* 2**32 % x == (2**32 - x) % x */
3141         min = -upper_bound % upper_bound;
3142
3143         /*
3144          * This could theoretically loop forever but each retry has
3145          * p > 0.5 (worst case, usually far better) of selecting a
3146          * number inside the range we need, so it should rarely need
3147          * to re-roll.
3148          */
3149         for (;;) {
3150                 r = arc4random();
3151                 if (r >= min)
3152                         break;
3153         }
3154
3155         return r % upper_bound;
3156 }
3157
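/*
 * Shave a random amount off of the extra time we add to a probe_bw
 * state: pick ran uniformly in [0, bbr_rand_ot) and, when it is
 * non-zero, deduct rc_level_state_extra / ran.
 */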
3158 static void
3159 bbr_randomize_extra_state_time(struct tcp_bbr *bbr)
3160 {
3161         uint32_t ran, deduct;
3162
3163         ran = arc4random_uniform(bbr_rand_ot);
3164         if (ran) {
3165                 deduct = bbr->r_ctl.rc_level_state_extra / ran;
3166                 bbr->r_ctl.rc_level_state_extra -= deduct;
3167         }
3168 }
3169 /*
3170  * Return randomly the starting state
3171  * to use in probebw.
3172  */
3173 static uint8_t
3174 bbr_pick_probebw_substate(struct tcp_bbr *bbr, uint32_t cts)
3175 {
3176         uint32_t ran;
3177         uint8_t ret_val;
3178
3179         /* Initialize the offset to 0 */
3180         bbr->r_ctl.rc_exta_time_gd = 0;
3181         bbr->rc_hit_state_1 = 0;
3182         bbr->r_ctl.rc_level_state_extra = 0;
3183         ran = arc4random_uniform((BBR_SUBSTATE_COUNT-1));
3184         /*
3185          * The math works funny here :) the return value is used to set the
3186          * substate and then the state change is called which increments by
3187          * one. So if we return 1 (DRAIN) we will increment to 2 (LEVEL1) when
3188          * we fully enter the state. Note that the (8 - 1 - ran) assures that
3189          * we return 1 - 7, so we don't return 0 and end up starting in
3190          * state 1 (DRAIN).
3191          */
3192         ret_val = BBR_SUBSTATE_COUNT - 1 - ran;
3193         /* Set an epoch */
3194         if ((cts - bbr->r_ctl.rc_rcv_epoch_start) >= bbr_get_rtt(bbr, BBR_RTT_PROP))
3195                 bbr_set_epoch(bbr, cts, __LINE__);
3196
3197         bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
3198         return (ret_val);
3199 }
3200
3201 static void
3202 bbr_lt_bw_sampling(struct tcp_bbr *bbr, uint32_t cts, int32_t loss_detected)
3203 {
3204         uint32_t diff, d_time;
3205         uint64_t del_time, bw, lost, delivered;
3206
3207         if (bbr->r_use_policer == 0)
3208                 return;
3209         if (bbr->rc_lt_use_bw) {
3210                 /* We are using lt bw do we stop yet? */
3211                 diff = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch_use;
3212                 if (diff > bbr_lt_bw_max_rtts) {
3213                         /* Reset it all */
3214 reset_all:
3215                         bbr_reset_lt_bw_sampling(bbr, cts);
3216                         if (bbr->rc_filled_pipe) {
3217                                 bbr_set_epoch(bbr, cts, __LINE__);
3218                                 bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts);
3219                                 bbr_substate_change(bbr, cts, __LINE__, 0);
3220                                 bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
3221                                 bbr_log_type_statechange(bbr, cts, __LINE__);
3222                         } else {
3223                                 /*
3224                                  * This should not happen really
3225                                  * unless we remove the startup/drain
3226                                  * restrictions above.
3227                                  */
3228                                 bbr->rc_bbr_state = BBR_STATE_STARTUP;
3229                                 bbr_set_epoch(bbr, cts, __LINE__);
3230                                 bbr->r_ctl.rc_bbr_state_time = cts;
3231                                 bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
3232                                 bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg;
3233                                 bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg;
3234                                 bbr_set_state_target(bbr, __LINE__);
3235                                 bbr_log_type_statechange(bbr, cts, __LINE__);
3236                         }
3237                         /* reason 0 is to stop using lt-bw */
3238                         bbr_log_type_ltbw(bbr, cts, 0, 0, 0, 0, 0);
3239                         return;
3240                 }
3241                 if (bbr_lt_intvl_fp == 0) {
3242                         /* Not doing false-positive detection */
3243                         return;
3244                 }
3245                 /* False positive detection */
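                /*
                 * At bbr_lt_intvl_fp epochs into lt-bw use we snapshot
                 * the loss and delivery counters; bbr_lt_intvl_min_rtts
                 * epochs after that we re-check the loss rate, and if it
                 * is still at or above bbr_lt_fd_thresh the earlier
                 * policer detection is judged a false positive and reset.
                 */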
3246                 if (diff == bbr_lt_intvl_fp) {
3247                         /* At bbr_lt_intvl_fp we record the lost */
3248                         bbr->r_ctl.rc_lt_del = bbr->r_ctl.rc_delivered;
3249                         bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
3250                 } else if (diff > (bbr_lt_intvl_min_rtts + bbr_lt_intvl_fp)) {
3251                         /* Now is our loss rate still high? */
3252                         lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lt_lost;
3253                         delivered = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_lt_del;
3254                         if ((delivered == 0) ||
3255                             (((lost * 1000)/delivered) < bbr_lt_fd_thresh)) {
3256                                 /* No, still below our threshold */
3257                                 bbr_log_type_ltbw(bbr, cts, 7, lost, delivered, 0, 0);
3258                         } else {
3259                                 /* Yikes, it's still high; it must be a false positive */
3260                                 bbr_log_type_ltbw(bbr, cts, 8, lost, delivered, 0, 0);
3261                                 goto reset_all;
3262                         }
3263                 }
3264                 return;
3265         }
3266         /*
3267          * Wait for the first loss before sampling, to let the policer
3268          * exhaust its tokens and estimate the steady-state rate allowed by
3269          * the policer. Starting samples earlier includes bursts that
3270          * over-estimate the bw.
3271          */
3272         if (bbr->rc_lt_is_sampling == 0) {
3273                 /* reason 1 is to begin doing the sampling  */
3274                 if (loss_detected == 0)
3275                         return;
3276                 bbr_reset_lt_bw_interval(bbr, cts);
3277                 bbr->rc_lt_is_sampling = 1;
3278                 bbr_log_type_ltbw(bbr, cts, 1, 0, 0, 0, 0);
3279                 return;
3280         }
3281         /* Now, how long has this long-term sampling interval been running? */
3282         if (TSTMP_GEQ(bbr->r_ctl.rc_del_time, bbr->r_ctl.rc_lt_time))
3283                 d_time = bbr->r_ctl.rc_del_time - bbr->r_ctl.rc_lt_time;
3284         else
3285                 d_time = 0;
3286
3287         /* To avoid underestimates, reset sampling if we run out of data. */
3288         if (bbr->r_ctl.r_app_limited_until) {
3289                 /* Can not measure in app-limited state */
3290                 bbr_reset_lt_bw_sampling(bbr, cts);
3291                 /* reason 2 is to reset sampling due to app limits  */
3292                 bbr_log_type_ltbw(bbr, cts, 2, 0, 0, 0, d_time);
3293                 return;
3294         }
3295         diff = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch;
3296         if (diff < bbr_lt_intvl_min_rtts) {
3297                 /*
3298                  * need more samples (we don't
3299                  * start on a round like Linux so
3300                  * we need 1 more).
3301                  */
3302                 /* 6 is not_enough time or no-loss */
3303                 bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time);
3304                 return;
3305         }
3306         if (diff > (4 * bbr_lt_intvl_min_rtts)) {
3307                 /*
3308                  * For now, if we wait too long, reset all sampling. We need
3309                  * to do some research here; it's possible that we should
3310                  * base this on how much loss has occurred, something like:
3311                  * if it's under 10% (or some threshold) reset all, otherwise
3312                  * don't. That's for phase II, I guess.
3313                  */
3314                 bbr_reset_lt_bw_sampling(bbr, cts);
3315                 /* reason 3 is to reset sampling due to too long a sampling period */
3316                 bbr_log_type_ltbw(bbr, cts, 3, 0, 0, 0, d_time);
3317                 return;
3318         }
3319         /*
3320          * End sampling interval when a packet is lost, so we estimate the
3321          * policer tokens were exhausted. Stopping the sampling before the
3322          * tokens are exhausted under-estimates the policed rate.
3323          */
3324         if (loss_detected == 0) {
3325                 /* 6 is not_enough time or no-loss */
3326                 bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time);
3327                 return;
3328         }
3329         /* Calculate packets lost and delivered in sampling interval. */
3330         lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lt_lost;
3331         delivered = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_lt_del;
3332         if ((delivered == 0) ||
3333             (((lost * 1000)/delivered) < bbr_lt_loss_thresh)) {
3334                 bbr_log_type_ltbw(bbr, cts, 6, lost, delivered, 0, d_time);
3335                 return;
3336         }
3337         if (d_time < 1000) {
3338                 /* Not enough time; wait. */
3339                 /* 6 is not_enough time or no-loss */
3340                 bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time);
3341                 return;
3342         }
3343         if (d_time >= (0xffffffff / USECS_IN_MSEC)) {
3344                 /* Too long */
3345                 bbr_reset_lt_bw_sampling(bbr, cts);
3346                 /* reason 3 is to reset sampling due to too long a sampling period */
3347                 bbr_log_type_ltbw(bbr, cts, 3, 0, 0, 0, d_time);
3348                 return;
3349         }
3350         del_time = d_time;
3351         bw = delivered;
3352         bw *= (uint64_t)USECS_IN_SECOND;
3353         bw /= del_time;
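        /* Convert to bytes/sec: delivered bytes over d_time usecs. */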
3354         bbr_lt_bw_samp_done(bbr, bw, cts, d_time);
3355 }
3356
3357 /*
3358  * Allocate a sendmap from our zone.
3359  */
3360 static struct bbr_sendmap *
3361 bbr_alloc(struct tcp_bbr *bbr)
3362 {
3363         struct bbr_sendmap *rsm;
3364
3365         BBR_STAT_INC(bbr_to_alloc);
3366         rsm = uma_zalloc(bbr_zone, (M_NOWAIT | M_ZERO));
3367         if (rsm) {
3368                 bbr->r_ctl.rc_num_maps_alloced++;
3369                 return (rsm);
3370         }
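        /*
         * The zone allocation failed; fall back to the small
         * per-connection free list that bbr_free() maintains for
         * exactly this emergency.
         */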
3371         if (bbr->r_ctl.rc_free_cnt) {
3372                 BBR_STAT_INC(bbr_to_alloc_emerg);
3373                 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free);
3374                 TAILQ_REMOVE(&bbr->r_ctl.rc_free, rsm, r_next);
3375                 bbr->r_ctl.rc_free_cnt--;
3376                 return (rsm);
3377         }
3378         BBR_STAT_INC(bbr_to_alloc_failed);
3379         return (NULL);
3380 }
3381
3382 static struct bbr_sendmap *
3383 bbr_alloc_full_limit(struct tcp_bbr *bbr)
3384 {
3385         if ((V_tcp_map_entries_limit > 0) &&
3386             (bbr->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
3387                 BBR_STAT_INC(bbr_alloc_limited);
3388                 if (!bbr->alloc_limit_reported) {
3389                         bbr->alloc_limit_reported = 1;
3390                         BBR_STAT_INC(bbr_alloc_limited_conns);
3391                 }
3392                 return (NULL);
3393         }
3394         return (bbr_alloc(bbr));
3395 }
3396
3397
3398 /* wrapper to allocate a sendmap entry, subject to a specific limit */
3399 static struct bbr_sendmap *
3400 bbr_alloc_limit(struct tcp_bbr *bbr, uint8_t limit_type)
3401 {
3402         struct bbr_sendmap *rsm;
3403
3404         if (limit_type) {
3405                 /* currently there is only one limit type */
3406                 if (V_tcp_map_split_limit > 0 &&
3407                     bbr->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) {
3408                         BBR_STAT_INC(bbr_split_limited);
3409                         if (!bbr->alloc_limit_reported) {
3410                                 bbr->alloc_limit_reported = 1;
3411                                 BBR_STAT_INC(bbr_alloc_limited_conns);
3412                         }
3413                         return (NULL);
3414                 }
3415         }
3416
3417         /* allocate and mark in the limit type, if set */
3418         rsm = bbr_alloc(bbr);
3419         if (rsm != NULL && limit_type) {
3420                 rsm->r_limit_type = limit_type;
3421                 bbr->r_ctl.rc_num_split_allocs++;
3422         }
3423         return (rsm);
3424 }
3425
3426 static void
3427 bbr_free(struct tcp_bbr *bbr, struct bbr_sendmap *rsm)
3428 {
3429         if (rsm->r_limit_type) {
3430                 /* currently there is only one limit type */
3431                 bbr->r_ctl.rc_num_split_allocs--;
3432         }
3433         if (rsm->r_is_smallmap)
3434                 bbr->r_ctl.rc_num_small_maps_alloced--;
3435         if (bbr->r_ctl.rc_tlp_send == rsm)
3436                 bbr->r_ctl.rc_tlp_send = NULL;
3437         if (bbr->r_ctl.rc_resend == rsm) {
3438                 bbr->r_ctl.rc_resend = NULL;
3439         }
3440         if (bbr->r_ctl.rc_next == rsm)
3441                 bbr->r_ctl.rc_next = NULL;
3442         if (bbr->r_ctl.rc_sacklast == rsm)
3443                 bbr->r_ctl.rc_sacklast = NULL;
3444         if (bbr->r_ctl.rc_free_cnt < bbr_min_req_free) {
3445                 memset(rsm, 0, sizeof(struct bbr_sendmap));
3446                 TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next);
3447                 rsm->r_limit_type = 0;
3448                 bbr->r_ctl.rc_free_cnt++;
3449                 return;
3450         }
3451         bbr->r_ctl.rc_num_maps_alloced--;
3452         uma_zfree(bbr_zone, rsm);
3453 }
3454
3455 /*
3456  * Returns the BDP.
3457  */
3458 static uint64_t
3459 bbr_get_bw_delay_prod(uint64_t rtt, uint64_t bw)
{
3460         /*
3461          * Calculate the bytes in flight needed given the bw (in bytes per
3462          * second) and the specified rtt in useconds. We need to put out the
3463          * returned value per RTT to match that rate. Gain will normally
3464          * raise it up from there.
3465          *
3466          * This should not overflow as long as the bandwidth is below 1
3467          * TByte per second (bw < 10**12 = 2**40) and the rtt is smaller
3468          * than 1000 seconds (rtt < 10**3 * 10**6 = 10**9 = 2**30).
3469          */
3470         uint64_t usec_per_sec;
3471
3472         usec_per_sec = USECS_IN_SECOND;
3473         return ((rtt * bw) / usec_per_sec);
3474 }
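
/*
 * For example, 1.25 MB/s (1250000 bytes/sec) at an rtt of 40000 usecs
 * gives a BDP of 1250000 * 40000 / 1000000 = 50000 bytes in flight.
 */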
3475
3476 /*
3477  * Return the initial cwnd.
3478  */
3479 static uint32_t
3480 bbr_initial_cwnd(struct tcp_bbr *bbr, struct tcpcb *tp)
3481 {
3482         uint32_t i_cwnd;
3483
3484         if (bbr->rc_init_win) {
3485                 i_cwnd = bbr->rc_init_win * tp->t_maxseg;
3486         } else if (V_tcp_initcwnd_segments)
3487                 i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg),
3488                     max(2 * tp->t_maxseg, 14600));
3489         else if (V_tcp_do_rfc3390)
3490                 i_cwnd = min(4 * tp->t_maxseg,
3491                     max(2 * tp->t_maxseg, 4380));
3492         else {
3493                 /* Per RFC5681 Section 3.1 */
3494                 if (tp->t_maxseg > 2190)
3495                         i_cwnd = 2 * tp->t_maxseg;
3496                 else if (tp->t_maxseg > 1095)
3497                         i_cwnd = 3 * tp->t_maxseg;
3498                 else
3499                         i_cwnd = 4 * tp->t_maxseg;
3500         }
3501         return (i_cwnd);
3502 }
3503
3504 /*
3505  * Given a specified gain, return the target
3506  * cwnd based on that gain.
3507  */
3508 static uint32_t
3509 bbr_get_raw_target_cwnd(struct tcp_bbr *bbr, uint32_t gain, uint64_t bw)
3510 {
3511         uint64_t bdp, rtt;
3512         uint32_t cwnd;
3513
3514         if ((get_filter_value_small(&bbr->r_ctl.rc_rttprop) == 0xffffffff) ||
3515             (bbr_get_full_bw(bbr) == 0)) {
3516                 /* No measurements yet */
3517                 return (bbr_initial_cwnd(bbr, bbr->rc_tp));
3518         }
3519         /*
3520          * Get bytes per RTT needed (rttProp is normally in
3521          * bbr_cwndtarget_rtt_touse)
3522          */
3523         rtt = bbr_get_rtt(bbr, bbr_cwndtarget_rtt_touse);
3524         /* Get the bdp from the two values */
3525         bdp = bbr_get_bw_delay_prod(rtt, bw);
3526         /* Now apply the gain */
3527         cwnd = (uint32_t)(((bdp * ((uint64_t)gain)) + (uint64_t)(BBR_UNIT - 1)) / ((uint64_t)BBR_UNIT));
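        /*
         * gain is scaled by BBR_UNIT, so a gain of 2 * BBR_UNIT doubles
         * the BDP; the + (BBR_UNIT - 1) term makes the division round up.
         */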
3528
3529         return (cwnd);
3530 }
3531
3532 static uint32_t
3533 bbr_get_target_cwnd(struct tcp_bbr *bbr, uint64_t bw, uint32_t gain)
3534 {
3535         uint32_t cwnd, mss;
3536
3537         mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs);
3538         /* Get the base cwnd with gain rounded to a mss */
3539         cwnd = roundup(bbr_get_raw_target_cwnd(bbr, gain, bw), mss);
3540         /*
3541          * Add in N (2 default since we do not have a
3542          * fq layer to trap packets in) quanta per the I-D
3543          * section 4.2.3.2 quanta adjust.
3544          */
3545         cwnd += (bbr_quanta * bbr->r_ctl.rc_pace_max_segs);
3546         if (bbr->rc_use_google) {
3547                 if((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) &&
3548                    (bbr_state_val(bbr) == BBR_SUB_GAIN)) {
3549                         /*
3550                          * The Linux implementation adds an extra
3551                          * 2 x mss in the gain cycle, which is documented
3552                          * nowhere except in the code, so we do the same
3553                          * for this undocumented feature from Neal.
3554                          */
3555                         cwnd += 2 * mss;
3556                 }
3557                 if ((cwnd / mss) & 0x1) {
3558                         /* Round up for odd num mss */
3559                         cwnd += mss;
3560                 }
3561         }
3562         /* Are we below the min cwnd? */
3563         if (cwnd < get_min_cwnd(bbr))
3564                 return (get_min_cwnd(bbr));
3565         return (cwnd);
3566 }
3567
3568 static uint16_t
3569 bbr_gain_adjust(struct tcp_bbr *bbr, uint16_t gain)
3570 {
3571         if (gain < 1)
3572                 gain = 1;
3573         return (gain);
3574 }
3575
3576 static uint32_t
3577 bbr_get_header_oh(struct tcp_bbr *bbr)
3578 {
3579         int seg_oh;
3580
3581         seg_oh = 0;
3582         if (bbr->r_ctl.rc_inc_tcp_oh) {
3583                 /* Do we include TCP overhead? */
3584                 seg_oh = (bbr->rc_last_options + sizeof(struct tcphdr));
3585         }
3586         if (bbr->r_ctl.rc_inc_ip_oh) {
3587                 /* Do we include IP overhead? */
3588 #ifdef INET6
3589                 if (bbr->r_is_v6)
3590                         seg_oh += sizeof(struct ip6_hdr);
3591                 else
3592 #endif
3593 #ifdef INET
3594                         seg_oh += sizeof(struct ip);
3595 #endif
3596         }
3597         if (bbr->r_ctl.rc_inc_enet_oh) {
3598                 /* Do we include the ethernet overhead?  */
3599                 seg_oh += sizeof(struct ether_header);
3600         }
3601         return (seg_oh);
3602 }
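
/*
 * E.g. with all three overhead knobs on, 12 bytes of TCP options and
 * IPv4, each segment is charged (12 + 20) + 20 + 14 = 66 bytes of
 * header overhead.
 */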
3603
3604
3605 static uint32_t
3606 bbr_get_pacing_length(struct tcp_bbr *bbr, uint16_t gain, uint32_t useconds_time, uint64_t bw)
3607 {
3608         uint64_t divor, res, tim;
3609
3610         if (useconds_time == 0)
3611                 return (0);
3612         gain = bbr_gain_adjust(bbr, gain);
3613         divor = (uint64_t)USECS_IN_SECOND * (uint64_t)BBR_UNIT;
3614         tim = useconds_time;
3615         res = (tim * bw * gain) / divor;
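        /*
         * res is the inverse of the pacing-delay calculation: the bytes
         * that can be sent in useconds_time at bw scaled by gain/BBR_UNIT.
         */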
3616         if (res == 0)
3617                 res = 1;
3618         return ((uint32_t)res);
3619 }
3620
3621 /*
3622  * Given a gain and a length return the delay in useconds that
3623  * should be used to evenly space out packets
3624  * on the connection (based on the gain factor).
3625  */
3626 static uint32_t
3627 bbr_get_pacing_delay(struct tcp_bbr *bbr, uint16_t gain, int32_t len, uint32_t cts, int nolog)
3628 {
3629         uint64_t bw, lentim, res;
3630         uint32_t usecs, srtt, over = 0;
3631         uint32_t seg_oh, num_segs, maxseg;
3632
3633         if (len == 0)
3634                 return (0);
3635
3636         maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
3637         num_segs = (len + maxseg - 1) / maxseg;
3638         if (bbr->rc_use_google == 0) {
3639                 seg_oh = bbr_get_header_oh(bbr);
3640                 len += (num_segs * seg_oh);
3641         }
3642         gain = bbr_gain_adjust(bbr, gain);
3643         bw = bbr_get_bw(bbr);
3644         if (bbr->rc_use_google) {
3645                 uint64_t cbw;
3646
3647                 /*
3648                  * Reduce the b/w by the google discount
3649                  * factor 10 = 1%.
3650                  */
3651                 cbw = bw *  (uint64_t)(1000 - bbr->r_ctl.bbr_google_discount);
3652                 cbw /= (uint64_t)1000;
3653                 /* We don't apply a discount if it results in 0 */
3654                 if (cbw > 0)
3655                         bw = cbw;
3656         }
3657         lentim = ((uint64_t)len *
3658                   (uint64_t)USECS_IN_SECOND *
3659                   (uint64_t)BBR_UNIT);
3660         res = lentim / ((uint64_t)gain * bw);
3661         if (res == 0)
3662                 res = 1;
3663         usecs = (uint32_t)res;
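        /*
         * I.e. delay = len * USECS_IN_SECOND * BBR_UNIT / (gain * bw);
         * at gain = BBR_UNIT and bw = 1250000 bytes/sec, a 1448 byte
         * send works out to 1448 * 1000000 / 1250000 = 1158 usecs.
         */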
3664         srtt = bbr_get_rtt(bbr, BBR_SRTT);
3665         if (bbr_hptsi_max_mul && bbr_hptsi_max_div &&
3666             (bbr->rc_use_google == 0) &&
3667             (usecs > ((srtt * bbr_hptsi_max_mul) / bbr_hptsi_max_div))) {
3668                 /*
3669                  * We cannot let the delay be more than 1/2 the srtt time.
3670                  * Otherwise we cannot pace out or send properly.
3671                  */
3672                 over = usecs = (srtt * bbr_hptsi_max_mul) / bbr_hptsi_max_div;
3673                 BBR_STAT_INC(bbr_hpts_min_time);
3674         }
3675         if (!nolog)
3676                 bbr_log_pacing_delay_calc(bbr, gain, len, cts, usecs, bw, over, 1);
3677         return (usecs);
3678 }
3679
3680 static void
3681 bbr_ack_received(struct tcpcb *tp, struct tcp_bbr *bbr, struct tcphdr *th, uint32_t bytes_this_ack,
3682                  uint32_t sack_changed, uint32_t prev_acked, int32_t line, uint32_t losses)
3683 {
3684         uint64_t bw;
3685         uint32_t cwnd, target_cwnd, saved_bytes, maxseg;
3686         int32_t meth;
3687         INP_WLOCK_ASSERT(tp->t_inpcb);
3688
3689 #ifdef STATS
3690         if ((tp->t_flags & TF_GPUTINPROG) &&
3691             SEQ_GEQ(th->th_ack, tp->gput_ack)) {
3692                 /*
3693                  * Stretch acks and compressed acks will cause this to
3694                  * oscillate, but we are doing it the same way as the main
3695                  * stack so it will be comparable (though possibly not
3696                  * ideal).
3697                  */
3698                 int32_t cgput;
3699                 int64_t gput, time_stamp;
3700
3701                 gput = (int64_t) (th->th_ack - tp->gput_seq) * 8;
3702                 time_stamp = max(1, ((bbr->r_ctl.rc_rcvtime - tp->gput_ts) / 1000));
3703                 cgput = gput / time_stamp;
3704                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
3705                                          cgput);
3706                 if (tp->t_stats_gput_prev > 0)
3707                         stats_voi_update_abs_s32(tp->t_stats,
3708                                                  VOI_TCP_GPUT_ND,
3709                                                  ((gput - tp->t_stats_gput_prev) * 100) /
3710                                                  tp->t_stats_gput_prev);
3711                 tp->t_flags &= ~TF_GPUTINPROG;
3712                 tp->t_stats_gput_prev = cgput;
3713         }
3714 #endif
3715         if ((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) &&
3716             ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google)) {
3717                 /* We don't change anything in probe-rtt */
3718                 return;
3719         }
3720         maxseg = tp->t_maxseg - bbr->rc_last_options;
3721         saved_bytes = bytes_this_ack;
3722         bytes_this_ack += sack_changed;
3723         if (bytes_this_ack > prev_acked) {
3724                 bytes_this_ack -= prev_acked;
3725                 /*
3726                  * Any byte ack'd counts as a full mss,
3727                  * to be like Linux, i.e. they count packets.
3728                  */
3729                 if ((bytes_this_ack < maxseg) && bbr->rc_use_google)
3730                         bytes_this_ack = maxseg;
3731         } else {
3732                 /* Unlikely */
3733                 bytes_this_ack = 0;
3734         }
3735         cwnd = tp->snd_cwnd;
3736         bw = get_filter_value(&bbr->r_ctl.rc_delrate);
3737         if (bw)
3738                 target_cwnd = bbr_get_target_cwnd(bbr,
3739                                                   bw,
3740                                                   (uint32_t)bbr->r_ctl.rc_bbr_cwnd_gain);
3741         else
3742                 target_cwnd = bbr_initial_cwnd(bbr, bbr->rc_tp);
3743         if (IN_RECOVERY(tp->t_flags) &&
3744             (bbr->bbr_prev_in_rec == 0)) {
3745                 /*
3746                  * We are entering recovery and
3747                  * thus packet conservation.
3748                  */
3749                 bbr->pkt_conservation = 1;
3750                 bbr->r_ctl.rc_recovery_start = bbr->r_ctl.rc_rcvtime;
3751                 cwnd = ctf_flight_size(tp,
3752                                        (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) +
3753                         bytes_this_ack;
3754         }
3755         if (IN_RECOVERY(tp->t_flags)) {
3756                 uint32_t flight;
3757
3758                 bbr->bbr_prev_in_rec = 1;
3759                 if (cwnd > losses) {
3760                         cwnd -= losses;
3761                         if (cwnd < maxseg)
3762                                 cwnd = maxseg;
3763                 } else
3764                         cwnd = maxseg;
3765                 flight = ctf_flight_size(tp,
3766                                          (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
3767                 bbr_log_type_cwndupd(bbr, flight, 0,
3768                                      losses, 10, 0, 0, line);
3769                 if (bbr->pkt_conservation) {
3770                         uint32_t time_in;
3771
3772                         if (TSTMP_GEQ(bbr->r_ctl.rc_rcvtime, bbr->r_ctl.rc_recovery_start))
3773                                 time_in = bbr->r_ctl.rc_rcvtime - bbr->r_ctl.rc_recovery_start;
3774                         else
3775                                 time_in = 0;
3776
3777                         if (time_in >= bbr_get_rtt(bbr, BBR_RTT_PROP)) {
3778                                 /* Clear packet conservation after an rttProp */
3779                                 bbr->pkt_conservation = 0;
3780                         } else {
3781                                 if ((flight + bytes_this_ack) > cwnd)
3782                                         cwnd = flight + bytes_this_ack;
3783                                 if (cwnd < get_min_cwnd(bbr))
3784                                         cwnd = get_min_cwnd(bbr);
3785                                 tp->snd_cwnd = cwnd;
3786                                 bbr_log_type_cwndupd(bbr, saved_bytes, sack_changed,
3787                                                      prev_acked, 1, target_cwnd, th->th_ack, line);
3788                                 return;
3789                         }
3790                 }
3791         } else
3792                 bbr->bbr_prev_in_rec = 0;
3793         if ((bbr->rc_use_google == 0) && bbr->r_ctl.restrict_growth) {
3794                 bbr->r_ctl.restrict_growth--;
3795                 if (bytes_this_ack > maxseg)
3796                         bytes_this_ack = maxseg;
3797         }
3798         if (bbr->rc_filled_pipe) {
3799                 /*
3800                  * Here we have exited startup and filled the pipe. We will
3801                  * thus allow the cwnd to shrink to the target. This is the
3802                  * common case.
3803                  */
3804                 uint32_t s_cwnd;
3805
3806                 meth = 2;
3807                 s_cwnd = min((cwnd + bytes_this_ack), target_cwnd);
3808                 if (s_cwnd > cwnd)
3809                         cwnd = s_cwnd;
3810                 else if (bbr_cwnd_may_shrink || bbr->rc_use_google || bbr->rc_no_pacing)
3811                         cwnd = s_cwnd;
3812         } else {
3813                 /*
3814                  * Here we are still in startup, we increase cwnd by what
3815                  * has been acked.
3816                  */
3817                 if ((cwnd < target_cwnd) ||
3818                     (bbr->rc_past_init_win == 0)) {
3819                         meth = 3;
3820                         cwnd += bytes_this_ack;
3821                 } else {
3822                         /*
3823                          * Method 4 means we are at the target, so no gain:
3824                          * still in startup but past the initial window.
3825                          */
3826                         meth = 4;
3827                 }
3828         }
3829         tp->snd_cwnd = max(cwnd, get_min_cwnd(bbr));
3830         bbr_log_type_cwndupd(bbr, saved_bytes, sack_changed, prev_acked, meth, target_cwnd, th->th_ack, line);
3831 }
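
/*
 * Illustrative sketch of the growth decision above (assumed numbers):
 * in startup (rc_filled_pipe == 0) with cwnd = 20,000 bytes and
 * target_cwnd = 40,000, method 3 applies and cwnd grows by
 * bytes_this_ack. Once the pipe is filled, method 2 caps growth at the
 * target, e.g. min(20000 + 1448, 40000) = 21448, and the cwnd may only
 * shrink toward the target when shrinking is permitted.
 */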
3832
3833 static void
3834 tcp_bbr_partialack(struct tcpcb *tp)
3835 {
3836         struct tcp_bbr *bbr;
3837
3838         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
3839         INP_WLOCK_ASSERT(tp->t_inpcb);
3840         if (ctf_flight_size(tp,
3841                 (bbr->r_ctl.rc_sacked  + bbr->r_ctl.rc_lost_bytes)) <=
3842             tp->snd_cwnd) {
3843                 bbr->r_wanted_output = 1;
3844         }
3845 }
3846
3847 static void
3848 bbr_post_recovery(struct tcpcb *tp)
3849 {
3850         struct tcp_bbr *bbr;
3851         uint32_t  flight;
3852
3853         INP_WLOCK_ASSERT(tp->t_inpcb);
3854         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
3855         /*
3856          * Here we just exit recovery.
3857          */
3858         EXIT_RECOVERY(tp->t_flags);
3859         /* Lock in our b/w reduction for the specified number of pkt-epochs */
3860         bbr->r_recovery_bw = 0;
3861         tp->snd_recover = tp->snd_una;
3862         tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime);
3863         bbr->pkt_conservation = 0;
3864         if (bbr->rc_use_google == 0) {
3865                 /*
3866                  * For non-google mode let's
3867                  * go ahead and make sure we clear
3868                  * the recovery state, so that if we
3869                  * bounce back into recovery we
3870                  * will do packet conservation (PC).
3871                  */
3872                 bbr->bbr_prev_in_rec = 0;
3873         }
3874         bbr_log_type_exit_rec(bbr);
3875         if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) {
3876                 tp->snd_cwnd = max(tp->snd_cwnd, bbr->r_ctl.rc_cwnd_on_ent);
3877                 bbr_log_type_cwndupd(bbr, 0, 0, 0, 15, 0, 0, __LINE__);
3878         } else {
3879                 /* For the probe-rtt case let's fix up its saved_cwnd */
3880                 if (bbr->r_ctl.rc_saved_cwnd < bbr->r_ctl.rc_cwnd_on_ent) {
3881                         bbr->r_ctl.rc_saved_cwnd = bbr->r_ctl.rc_cwnd_on_ent;
3882                         bbr_log_type_cwndupd(bbr, 0, 0, 0, 16, 0, 0, __LINE__);
3883                 }
3884         }
3885         flight = ctf_flight_size(tp,
3886                      (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
3887         if ((bbr->rc_use_google == 0) &&
3888             bbr_do_red) {
3889                 uint64_t val, lr2use;
3890                 uint32_t maxseg, newcwnd, acks_inflight, ratio, cwnd;
3891                 uint32_t *cwnd_p;
3892
3893                 if (bbr_get_rtt(bbr, BBR_SRTT)) {
3894                         val = ((uint64_t)bbr_get_rtt(bbr, BBR_RTT_PROP) * (uint64_t)1000);
3895                         val /= bbr_get_rtt(bbr, BBR_SRTT);
3896                         ratio = (uint32_t)val;
3897                 } else
3898                         ratio = 1000;
3899
3900                 bbr_log_type_cwndupd(bbr, bbr_red_mul, bbr_red_div,
3901                                      bbr->r_ctl.recovery_lr, 21,
3902                                      ratio,
3903                                      bbr->r_ctl.rc_red_cwnd_pe,
3904                                      __LINE__);
3905                 if ((ratio < bbr_do_red) || (bbr_do_red == 0))
3906                         goto done;
3907                 if (((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) &&
3908                      bbr_prtt_slam_cwnd) ||
3909                     (bbr_sub_drain_slam_cwnd &&
3910                      (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) &&
3911                      bbr->rc_hit_state_1 &&
3912                      (bbr_state_val(bbr) == BBR_SUB_DRAIN)) ||
3913                     ((bbr->rc_bbr_state == BBR_STATE_DRAIN) &&
3914                      bbr_slam_cwnd_in_main_drain)) {
3915                         /*
3916                          * Here we must poke at the saved cwnd
3917                          * as well as the cwnd.
3918                          */
3919                         cwnd = bbr->r_ctl.rc_saved_cwnd;
3920                         cwnd_p = &bbr->r_ctl.rc_saved_cwnd;
3921                 } else {
3922                         cwnd = tp->snd_cwnd;
3923                         cwnd_p = &tp->snd_cwnd;
3924                 }
3925                 maxseg = tp->t_maxseg - bbr->rc_last_options;
3926                 /* Add the overall lr with the recovery lr */
3927                 if (bbr->r_ctl.rc_lost == 0)
3928                         lr2use = 0;
3929                 else if (bbr->r_ctl.rc_delivered == 0)
3930                         lr2use = 1000;
3931                 else {
3932                         lr2use = bbr->r_ctl.rc_lost * 1000;
3933                         lr2use /= bbr->r_ctl.rc_delivered;
3934                 }
3935                 lr2use += bbr->r_ctl.recovery_lr;
3936                 acks_inflight = (flight / (maxseg * 2));
3937                 if (bbr_red_scale) {
3938                         lr2use *= bbr_get_rtt(bbr, BBR_SRTT);
3939                         lr2use /= bbr_red_scale;
3940                         if ((bbr_red_growth_restrict) &&
3941                             ((bbr_get_rtt(bbr, BBR_SRTT)/bbr_red_scale) > 1))
3942                             bbr->r_ctl.restrict_growth += acks_inflight;
3943                 }
3944                 if (lr2use) {
3945                         val = (uint64_t)cwnd * lr2use;
3946                         val /= 1000;
3947                         if (cwnd > val)
3948                                 newcwnd = roundup((cwnd - val), maxseg);
3949                         else
3950                                 newcwnd = maxseg;
3951                 } else {
3952                         val = (uint64_t)cwnd * (uint64_t)bbr_red_mul;
3953                         val /= (uint64_t)bbr_red_div;
3954                         newcwnd = roundup((uint32_t)val, maxseg);
3955                 }
3956                 /* with standard delayed acks how many acks can I expect? */
3957                 if (bbr_drop_limit == 0) {
3958                         /*
3959                          * Anticipate how much we will
3960                          * raise the cwnd based on the acks.
3961                          */
3962                         if ((newcwnd + (acks_inflight * maxseg)) < get_min_cwnd(bbr)) {
3963                                 /* We do enforce the min (with the acks) */
3964                                 newcwnd = (get_min_cwnd(bbr) - acks_inflight);
3965                         }
3966                 } else {
3967                         /*
3968                          * A strict drop limit of N is in place
3969                          */
3970                         if (newcwnd < (bbr_drop_limit * maxseg)) {
3971                                 newcwnd = bbr_drop_limit * maxseg;
3972                         }
3973                 }
3974                 /* For the next N acks do we restrict the growth */
3975                 *cwnd_p = newcwnd;
3976                 if (tp->snd_cwnd > newcwnd)
3977                         tp->snd_cwnd = newcwnd;
3978                 bbr_log_type_cwndupd(bbr, bbr_red_mul, bbr_red_div, val, 22,
3979                                      (uint32_t)lr2use,
3980                                      bbr_get_rtt(bbr, BBR_SRTT), __LINE__);
3981                 bbr->r_ctl.rc_red_cwnd_pe = bbr->r_ctl.rc_pkt_epoch;
3982         }
3983 done:
3984         bbr->r_ctl.recovery_lr = 0;
3985         if (flight <= tp->snd_cwnd) {
3986                 bbr->r_wanted_output = 1;
3987         }
3988         tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime);
3989 }
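
/*
 * Worked example of the RED-style reduction above (assumed numbers):
 * with cwnd = 100,000 bytes, maxseg = 1448, rc_lost = 3000,
 * rc_delivered = 97,000 and recovery_lr = 20,
 *
 *      lr2use  = (3000 * 1000) / 97000 + 20 = 50    (~5% loss)
 *      val     = (100000 * 50) / 1000 = 5000
 *      newcwnd = roundup(100000 - 5000, 1448) = 95568
 *
 * subject to the drop-limit / min-cwnd floors applied afterwards.
 */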
3990
3991 static void
3992 bbr_setup_red_bw(struct tcp_bbr *bbr, uint32_t cts)
3993 {
3994         bbr->r_ctl.red_bw = get_filter_value(&bbr->r_ctl.rc_delrate);
3995         /* Limit the drop in b/w to 1/2 our current filter. */
3996         if (bbr->r_ctl.red_bw > bbr->r_ctl.rc_bbr_cur_del_rate)
3997                 bbr->r_ctl.red_bw = bbr->r_ctl.rc_bbr_cur_del_rate;
3998         if (bbr->r_ctl.red_bw < (get_filter_value(&bbr->r_ctl.rc_delrate) / 2))
3999                 bbr->r_ctl.red_bw = get_filter_value(&bbr->r_ctl.rc_delrate) / 2;
4000         tcp_bbr_tso_size_check(bbr, cts);
4001 }
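
/*
 * In effect red_bw is clamped to
 * max(min(filter, rc_bbr_cur_del_rate), filter / 2), so the reduced
 * bandwidth we pace at never falls below half of the current
 * delivery-rate filter value.
 */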
4002
4003 static void
4004 bbr_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type, struct bbr_sendmap *rsm)
4005 {
4006         struct tcp_bbr *bbr;
4007
4008         INP_WLOCK_ASSERT(tp->t_inpcb);
4009         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
4010         switch (type) {
4011         case CC_NDUPACK:
4012                 if (!IN_RECOVERY(tp->t_flags)) {
4013                         tp->snd_recover = tp->snd_max;
4014                         /* Start a new epoch */
4015                         bbr_set_pktepoch(bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
4016                         if (bbr->rc_lt_is_sampling || bbr->rc_lt_use_bw) {
4017                                 /*
4018                                  * Move forward the lt epoch
4019                                  * so it won't count the truncated
4020                                  * epoch.
4021                                  */
4022                                 bbr->r_ctl.rc_lt_epoch++;
4023                         }
4024                         if (bbr->rc_bbr_state == BBR_STATE_STARTUP) {
4025                                 /*
4026                                  * Just like the policer detection code
4027                                  * if we are in startup we must push
4028                                  * forward the last startup epoch
4029                                  * to hide the truncated PE.
4030                                  */
4031                                 bbr->r_ctl.rc_bbr_last_startup_epoch++;
4032                         }
4033                         bbr->r_ctl.rc_cwnd_on_ent = tp->snd_cwnd;
4034                         ENTER_RECOVERY(tp->t_flags);
4035                         bbr->rc_tlp_rtx_out = 0;
4036                         bbr->r_ctl.recovery_lr = bbr->r_ctl.rc_pkt_epoch_loss_rate;
4037                         tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime);
4038                         if (bbr->rc_inp->inp_in_hpts &&
4039                             ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) == 0)) {
4040                                 /*
4041                                  * When we enter recovery, we need to restart
4042                                  * any timers. This may mean we gain an agg
4043                                  * early, which will be made up for at the last
4044                                  * rxt out.
4045                                  */
4046                                 bbr->rc_timer_first = 1;
4047                                 bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
4048                         }
4049                         /*
4050                          * Calculate a new cwnd based on to the current
4051                          * delivery rate with no gain. We get the bdp
4052                          * without gaining it up like we normally would and
4053                          * we use the last cur_del_rate.
4054                          */
4055                         if ((bbr->rc_use_google == 0) &&
4056                             (bbr->r_ctl.bbr_rttprobe_gain_val ||
4057                              (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT))) {
4058                                 tp->snd_cwnd = ctf_flight_size(tp,
4059                                                    (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) +
4060                                         (tp->t_maxseg - bbr->rc_last_options);
4061                                 if (tp->snd_cwnd < get_min_cwnd(bbr)) {
4062                                         /* We always gate to min cwnd */
4063                                         tp->snd_cwnd = get_min_cwnd(bbr);
4064                                 }
4065                                 bbr_log_type_cwndupd(bbr, 0, 0, 0, 14, 0, 0, __LINE__);
4066                         }
4067                         bbr_log_type_enter_rec(bbr, rsm->r_start);
4068                 }
4069                 break;
4070         case CC_RTO_ERR:
4071                 KMOD_TCPSTAT_INC(tcps_sndrexmitbad);
4072                 /* RTO was unnecessary, so reset everything. */
4073                 bbr_reset_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime);
4074                 if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) {
4075                         tp->snd_cwnd = tp->snd_cwnd_prev;
4076                         tp->snd_ssthresh = tp->snd_ssthresh_prev;
4077                         tp->snd_recover = tp->snd_recover_prev;
4078                         tp->snd_cwnd = max(tp->snd_cwnd, bbr->r_ctl.rc_cwnd_on_ent);
4079                         bbr_log_type_cwndupd(bbr, 0, 0, 0, 13, 0, 0, __LINE__);
4080                 }
4081                 tp->t_badrxtwin = 0;
4082                 break;
4083         }
4084 }
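
/*
 * Example of the recovery cwnd computed above (assumed numbers): with
 * 60,000 bytes in flight and an effective maxseg of 1448, entering
 * recovery sets snd_cwnd = 60000 + 1448 = 61448 bytes (floored at the
 * min cwnd), i.e. approximately the un-gained bdp described in the
 * comment above.
 */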
4085
4086 /*
4087  * Indicate whether this ack should be delayed.  We can delay the ack if
4088  * following conditions are met:
4089  *      - There is no delayed ack timer in progress.
4090  *      - Our last ack wasn't a 0-sized window. We never want to delay
4091  *        the ack that opens up a 0-sized window.
4092  *      - LRO wasn't used for this segment. We make sure by checking that the
4093  *        segment size is not larger than the MSS.
4094  *      - Delayed acks are enabled or this is a half-synchronized T/TCP
4095  *        connection.
4096  *      - The data being acked is less than a full segment (a stretch ack
4097  *        of more than a segment should be acked immediately).
4098  *      - nsegs is 1 (if it's more than that we received more than 1 ack).
4099  */
4100 #define DELAY_ACK(tp, bbr, nsegs)                               \
4101         (((tp->t_flags & TF_RXWIN0SENT) == 0) &&                \
4102          ((bbr->bbr_segs_rcvd + nsegs) < tp->t_delayed_ack) &&  \
4103          (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
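
/*
 * Note that only the zero-window, received-segment-count and
 * delayed-ack checks are encoded in the macro itself; the remaining
 * conditions above are covered by the callers, with LRO showing up as
 * an nsegs value larger than one. A sketch of typical use
 * (illustrative only):
 *
 *      if (DELAY_ACK(tp, bbr, nsegs)) {
 *              bbr->bbr_segs_rcvd += nsegs;
 *              tp->t_flags |= TF_DELACK;
 *      } else
 *              tp->t_flags |= TF_ACKNOW;
 */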
4104
4105 /*
4106  * Return the lowest RSM in the map of
4107  * packets still in flight that is not acked.
4108  * This should normally find one on the first try,
4109  * since we remove packets from the send
4110  * map after they are marked ACKED.
4111  */
4112 static struct bbr_sendmap *
4113 bbr_find_lowest_rsm(struct tcp_bbr *bbr)
4114 {
4115         struct bbr_sendmap *rsm;
4116
4117         /*
4118          * Walk the time-order transmitted list looking for an rsm that is
4119          * not acked. This will be the one that was sent the longest time
4120          * ago that is still outstanding.
4121          */
4122         TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_tmap, r_tnext) {
4123                 if (rsm->r_flags & BBR_ACKED) {
4124                         continue;
4125                 }
4126                 goto finish;
4127         }
4128 finish:
4129         return (rsm);
4130 }
4131
4132 static struct bbr_sendmap *
4133 bbr_find_high_nonack(struct tcp_bbr *bbr, struct bbr_sendmap *rsm)
4134 {
4135         struct bbr_sendmap *prsm;
4136
4137         /*
4138          * Walk the sequence-ordered list backward until we arrive at
4139          * the highest seq not acked. In theory, when this is called, it
4140          * should be the last segment (but it was not).
4141          */
4142         prsm = rsm;
4143         TAILQ_FOREACH_REVERSE_FROM(prsm, &bbr->r_ctl.rc_map, bbr_head, r_next) {
4144                 if (prsm->r_flags & (BBR_ACKED | BBR_HAS_FIN)) {
4145                         continue;
4146                 }
4147                 return (prsm);
4148         }
4149         return (NULL);
4150 }
4151
4152 /*
4153  * Returns to the caller the number of microseconds that
4154  * the packet can be outstanding before we think we
4155  * should have had an ack returned.
4156  */
4157 static uint32_t
4158 bbr_calc_thresh_rack(struct tcp_bbr *bbr, uint32_t srtt, uint32_t cts, struct bbr_sendmap *rsm)
4159 {
4160         /*
4161          * lro is the flag we use to determine if we have seen reordering.
4162          * If it gets set we have seen reordering. The reorder logic either
4163          * works in one of two ways:
4164          *
4165          * If reorder-fade is configured, then we track the last time we saw
4166          * re-ordering occur. If we reach the point where enough time has
4167          * passed we no longer consider reordering to be occurring.
4168          *
4169          * Or if reorder-fade is 0, then once we see reordering we consider
4170          * the connection to always be subject to reordering and just set lro
4171          * to 1.
4172          *
4173          * In the end if lro is non-zero we add the extra time for
4174          * reordering in.
4175          */
4176         int32_t lro;
4177         uint32_t thresh, t_rxtcur;
4178
4179         if (srtt == 0)
4180                 srtt = 1;
4181         if (bbr->r_ctl.rc_reorder_ts) {
4182                 if (bbr->r_ctl.rc_reorder_fade) {
4183                         if (SEQ_GEQ(cts, bbr->r_ctl.rc_reorder_ts)) {
4184                                 lro = cts - bbr->r_ctl.rc_reorder_ts;
4185                                 if (lro == 0) {
4186                                         /*
4187                                          * No time has passed since the last
4188                                          * reorder, mark it as reordering.
4189                                          */
4190                                         lro = 1;
4191                                 }
4192                         } else {
4193                                 /* Negative time? */
4194                                 lro = 0;
4195                         }
4196                         if (lro > bbr->r_ctl.rc_reorder_fade) {
4197                                 /* Turn off reordering seen too */
4198                                 bbr->r_ctl.rc_reorder_ts = 0;
4199                                 lro = 0;
4200                         }
4201                 } else {
4202                         /* Reordering does not fade */
4203                         lro = 1;
4204                 }
4205         } else {
4206                 lro = 0;
4207         }
4208         thresh = srtt + bbr->r_ctl.rc_pkt_delay;
4209         if (lro) {
4210                 /* Use the reorder shift if set, otherwise 1/4 rtt */
4211                 if (bbr->r_ctl.rc_reorder_shift)
4212                         thresh += (srtt >> bbr->r_ctl.rc_reorder_shift);
4213                 else
4214                         thresh += (srtt >> 2);
4215         } else {
4216                 thresh += 1000;
4217         }
4218         /* We don't let the rack timeout be above an RTO */
4219         if ((bbr->rc_tp)->t_srtt == 0)
4220                 t_rxtcur = BBR_INITIAL_RTO;
4221         else
4222                 t_rxtcur = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
4223         if (thresh > t_rxtcur) {
4224                 thresh = t_rxtcur;
4225         }
4226         /* And we don't want it above the RTO max either */
4227         if (thresh > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) {
4228                 thresh = (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND);
4229         }
4230         bbr_log_thresh_choice(bbr, cts, thresh, lro, srtt, rsm, BBR_TO_FRM_RACK);
4231         return (thresh);
4232 }
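
/*
 * Worked example (assumed numbers): with srtt = 40,000 usecs,
 * rc_pkt_delay = 1,000 and no reordering seen (lro == 0),
 *
 *      thresh = 40000 + 1000 + 1000 = 42000 usecs
 *
 * With reordering seen and rc_reorder_shift = 2, the final term is
 * srtt >> 2 = 10000 instead, giving 51000 usecs; both results are then
 * capped at t_rxtcur and at the RTO maximum.
 */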
4233
4234 /*
4235  * Return to the caller the amount of time in microseconds
4236  * that should be used for the TLP timer from the last
4237  * send time of this packet.
4238  */
4239 static uint32_t
4240 bbr_calc_thresh_tlp(struct tcpcb *tp, struct tcp_bbr *bbr,
4241     struct bbr_sendmap *rsm, uint32_t srtt,
4242     uint32_t cts)
4243 {
4244         uint32_t thresh, len, maxseg, t_rxtcur;
4245         struct bbr_sendmap *prsm;
4246
4247         if (srtt == 0)
4248                 srtt = 1;
4249         if (bbr->rc_tlp_threshold)
4250                 thresh = srtt + (srtt / bbr->rc_tlp_threshold);
4251         else
4252                 thresh = (srtt * 2);
4253         maxseg = tp->t_maxseg - bbr->rc_last_options;
4254         /* Get the previous sent packet, if any  */
4255         len = rsm->r_end - rsm->r_start;
4256
4257         /* 2.1 behavior */
4258         prsm = TAILQ_PREV(rsm, bbr_head, r_tnext);
4259         if (prsm && (len <= maxseg)) {
4260                 /*
4261                  * Two packets outstanding, thresh should be (2*srtt) +
4262                  * possible inter-packet delay (if any).
4263                  */
4264                 uint32_t inter_gap = 0;
4265                 int idx, nidx;
4266
4267                 idx = rsm->r_rtr_cnt - 1;
4268                 nidx = prsm->r_rtr_cnt - 1;
4269                 if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) {
4270                         /* Yes it was sent later (or at the same time) */
4271                         inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
4272                 }
4273                 thresh += inter_gap;
4274         } else if (len <= maxseg) {
4275                 /*
4276                  * Possibly compensate for delayed-ack.
4277                  */
4278                 uint32_t alt_thresh;
4279
4280                 alt_thresh = srtt + (srtt / 2) + bbr_delayed_ack_time;
4281                 if (alt_thresh > thresh)
4282                         thresh = alt_thresh;
4283         }
4284         /* Not above the current RTO */
4285         if (tp->t_srtt == 0)
4286                 t_rxtcur = BBR_INITIAL_RTO;
4287         else
4288                 t_rxtcur = TICKS_2_USEC(tp->t_rxtcur);
4289
4290         bbr_log_thresh_choice(bbr, cts, thresh, t_rxtcur, srtt, rsm, BBR_TO_FRM_TLP);
4291         /* Not above an RTO */
4292         if (thresh > t_rxtcur) {
4293                 thresh = t_rxtcur;
4294         }
4295         /* Not above an RTO max */
4296         if (thresh > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) {
4297                 thresh = (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND);
4298         }
4299         /* And now apply the user TLP min */
4300         if (thresh < bbr_tlp_min) {
4301                 thresh = bbr_tlp_min;
4302         }
4303         return (thresh);
4304 }
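
/*
 * Worked example (assumed numbers): with srtt = 40,000 usecs and
 * rc_tlp_threshold = 0, the base is thresh = 2 * srtt = 80,000. For a
 * single small outstanding segment the delayed-ack alternative,
 *
 *      alt_thresh = 40000 + 20000 + bbr_delayed_ack_time
 *
 * replaces the base whenever it is larger, after which the RTO,
 * RTO-max and bbr_tlp_min bounds are applied.
 */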
4305
4306 /*
4307  * Return one of three RTTs to use (in microseconds).
4308  */
4309 static __inline uint32_t
4310 bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type)
4311 {
4312         uint32_t f_rtt;
4313         uint32_t srtt;
4314
4315         f_rtt = get_filter_value_small(&bbr->r_ctl.rc_rttprop);
4316         if (get_filter_value_small(&bbr->r_ctl.rc_rttprop) == 0xffffffff) {
4317                 /* We have no rtt at all */
4318                 if (bbr->rc_tp->t_srtt == 0)
4319                         f_rtt = BBR_INITIAL_RTO;
4320                 else
4321                         f_rtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT);
4322                 /*
4323                  * Since we don't know how good the rtt is, apply a
4324                  * delayed-ack min.
4325                  */
4326                 if (f_rtt < bbr_delayed_ack_time) {
4327                         f_rtt = bbr_delayed_ack_time;
4328                 }
4329         }
4330         /* Take the filter version or last measured pkt-rtt */
4331         if (rtt_type == BBR_RTT_PROP) {
4332                 srtt = f_rtt;
4333         } else if (rtt_type == BBR_RTT_PKTRTT) {
4334                 if (bbr->r_ctl.rc_pkt_epoch_rtt) {
4335                         srtt = bbr->r_ctl.rc_pkt_epoch_rtt;
4336                 } else {
4337                         /* No pkt rtt yet */
4338                         srtt = f_rtt;
4339                 }
4340         } else if (rtt_type == BBR_RTT_RACK) {
4341                 srtt = bbr->r_ctl.rc_last_rtt;
4342                 /* We need to add in any internal delay for our timer */
4343                 if (bbr->rc_ack_was_delayed)
4344                         srtt += bbr->r_ctl.rc_ack_hdwr_delay;
4345         } else if (rtt_type == BBR_SRTT) {
4346                 srtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT);
4347         } else {
4348                 /* TSNH */
4349                 srtt = f_rtt;
4350 #ifdef BBR_INVARIANTS
4351                 panic("Unknown rtt request type %d", rtt_type);
4352 #endif
4353         }
4354         return (srtt);
4355 }
4356
4357 static int
4358 bbr_is_lost(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t cts)
4359 {
4360         uint32_t thresh;
4361
4362
4363         thresh = bbr_calc_thresh_rack(bbr, bbr_get_rtt(bbr, BBR_RTT_RACK),
4364                                       cts, rsm);
4365         if ((cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) >= thresh) {
4366                 /* It is lost (past time) */
4367                 return (1);
4368         }
4369         return (0);
4370 }
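
/*
 * Example (assumed numbers): a segment last sent at cts 100,000 with a
 * rack threshold of 42,000 usecs is declared lost once cts reaches
 * 142,000, since 142000 - 100000 >= 42000.
 */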
4371
4372 /*
4373  * Return a sendmap if we need to retransmit something.
4374  */
4375 static struct bbr_sendmap *
4376 bbr_check_recovery_mode(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
4377 {
4378         /*
4379          * Check to see that we don't need to fall into recovery. We will
4380          * need to do so if our oldest transmit is past the time we should
4381          * have had an ack.
4382          */
4383
4384         struct bbr_sendmap *rsm;
4385         int32_t idx;
4386
4387         if (TAILQ_EMPTY(&bbr->r_ctl.rc_map)) {
4388                 /* Nothing outstanding that we know of */
4389                 return (NULL);
4390         }
4391         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
4392         if (rsm == NULL) {
4393                 /* Nothing in the transmit map */
4394                 return (NULL);
4395         }
4396         if (tp->t_flags & TF_SENTFIN) {
4397                 /* Fin restricted, don't find anything once a fin is sent */
4398                 return (NULL);
4399         }
4400         if (rsm->r_flags & BBR_ACKED) {
4401                 /*
4402                  * Ok the first one is acked (this really should not happen
4403                  * since we remove them from the tmap once they are acked)
4404                  */
4405                 rsm = bbr_find_lowest_rsm(bbr);
4406                 if (rsm == NULL)
4407                         return (NULL);
4408         }
4409         idx = rsm->r_rtr_cnt - 1;
4410         if (SEQ_LEQ(cts, rsm->r_tim_lastsent[idx])) {
4411                 /* Send timestamp is the same or less? can't be ready */
4412                 return (NULL);
4413         }
4414         /* Get our RTT time */
4415         if (bbr_is_lost(bbr, rsm, cts) &&
4416             ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
4417              (rsm->r_flags & BBR_SACK_PASSED))) {
4418                 if ((rsm->r_flags & BBR_MARKED_LOST) == 0) {
4419                         rsm->r_flags |= BBR_MARKED_LOST;
4420                         bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start;
4421                         bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start;
4422                 }
4423                 bbr_cong_signal(tp, NULL, CC_NDUPACK, rsm);
4424 #ifdef BBR_INVARIANTS
4425                 if ((rsm->r_end - rsm->r_start) == 0)
4426                         panic("tp:%p bbr:%p rsm:%p length is 0?", tp, bbr, rsm);
4427 #endif
4428                 return (rsm);
4429         }
4430         return (NULL);
4431 }
4432
4433 /*
4434  * RACK Timer, here we simply do logging and housekeeping.
4435  * The normal bbr_output_wtime() function will call the
4436  * appropriate thing to check if we need to do a RACK retransmit.
4437  * We return 1, saying don't proceed with bbr_output_wtime only
4438  * when all timers have been stopped (destroyed PCB?).
4439  */
4440 static int
4441 bbr_timeout_rack(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
4442 {
4443         /*
4444          * This timer simply provides an internal trigger to send out data.
4445          * The check_recovery_mode call will see if there are needed
4446          * retransmissions, if so we will enter fast-recovery. The output
4447          * call may or may not do the same thing depending on sysctl
4448          * settings.
4449          */
4450         uint32_t lost;
4451
4452         if (bbr->rc_all_timers_stopped) {
4453                 return (1);
4454         }
4455         if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) {
4456                 /* It's not time yet */
4457                 return (0);
4458         }
4459         BBR_STAT_INC(bbr_to_tot);
4460         lost = bbr->r_ctl.rc_lost;
4461         if (bbr->r_state && (bbr->r_state != tp->t_state))
4462                 bbr_set_state(tp, bbr, 0);
4463         bbr_log_to_event(bbr, cts, BBR_TO_FRM_RACK);
4464         if (bbr->r_ctl.rc_resend == NULL) {
4465                 /* Let's do the check here */
4466                 bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts);
4467         }
4468         if (bbr_policer_call_from_rack_to)
4469                 bbr_lt_bw_sampling(bbr, cts, (bbr->r_ctl.rc_lost > lost));
4470         bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
4471         return (0);
4472 }
4473
4474 static __inline void
4475 bbr_clone_rsm(struct tcp_bbr *bbr, struct bbr_sendmap *nrsm, struct bbr_sendmap *rsm, uint32_t start)
4476 {
4477         int idx;
4478
4479         nrsm->r_start = start;
4480         nrsm->r_end = rsm->r_end;
4481         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
4482         nrsm->r_flags = rsm->r_flags;
4483         /* We don't transfer forward the SYN flag */
4484         nrsm->r_flags &= ~BBR_HAS_SYN;
4485         /* We move forward the FIN flag, not that this should happen */
4486         rsm->r_flags &= ~BBR_HAS_FIN;
4487         nrsm->r_dupack = rsm->r_dupack;
4488         nrsm->r_rtr_bytes = 0;
4489         nrsm->r_is_gain = rsm->r_is_gain;
4490         nrsm->r_is_drain = rsm->r_is_drain;
4491         nrsm->r_delivered = rsm->r_delivered;
4492         nrsm->r_ts_valid = rsm->r_ts_valid;
4493         nrsm->r_del_ack_ts = rsm->r_del_ack_ts;
4494         nrsm->r_del_time = rsm->r_del_time;
4495         nrsm->r_app_limited = rsm->r_app_limited;
4496         nrsm->r_first_sent_time = rsm->r_first_sent_time;
4497         nrsm->r_flight_at_send = rsm->r_flight_at_send;
4498         /* When we split a piece, the lower section loses any just_ret flag. */
4499         nrsm->r_bbr_state = rsm->r_bbr_state;
4500         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
4501                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
4502         }
4503         rsm->r_end = nrsm->r_start;
4504         idx = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs);
4505         idx /= 8;
4506         /* Check if we got too small */
4507         if ((rsm->r_is_smallmap == 0) &&
4508             ((rsm->r_end - rsm->r_start) <= idx)) {
4509                 bbr->r_ctl.rc_num_small_maps_alloced++;
4510                 rsm->r_is_smallmap = 1;
4511         }
4512         /* Check the new one as well */
4513         if ((nrsm->r_end - nrsm->r_start) <= idx) {
4514                 bbr->r_ctl.rc_num_small_maps_alloced++;
4515                 nrsm->r_is_smallmap = 1;
4516         }
4517 }
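
/*
 * Example of a split (assumed sequence space): cloning rsm
 * [1000, 4000) at start = 2448 leaves rsm covering [1000, 2448) and
 * nrsm covering [2448, 4000), with nrsm inheriting the retransmit
 * times and flags (minus any SYN, plus any FIN) as above.
 */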
4518
4519 static int
4520 bbr_sack_mergable(struct bbr_sendmap *at,
4521                   uint32_t start, uint32_t end)
4522 {
4523         /*
4524          * Given a sack block defined by
4525          * start and end, and a current position
4526          * at, return 1 if either side of at
4527          * would show that the block is mergeable
4528          * to that side. To be mergeable a block
4529          * must overlap with the start/end
4530          * and be in the SACK'd state.
4531          */
4532         struct bbr_sendmap *l_rsm;
4533         struct bbr_sendmap *r_rsm;
4534
4535         /* first get the either side blocks */
4536         l_rsm = TAILQ_PREV(at, bbr_head, r_next);
4537         r_rsm = TAILQ_NEXT(at, r_next);
4538         if (l_rsm && (l_rsm->r_flags & BBR_ACKED)) {
4539                 /* Potentially mergeable */
4540                 if ((l_rsm->r_end == start) ||
4541                     (SEQ_LT(start, l_rsm->r_end) &&
4542                      SEQ_GT(end, l_rsm->r_end))) {
4543                             /*
4544                              * map blk   |------|
4545                              * sack blk         |------|
4546                              * <or>
4547                              * map blk   |------|
4548                              * sack blk      |------|
4549                              */
4550                             return (1);
4551                     }
4552         }
4553         if (r_rsm && (r_rsm->r_flags & BBR_ACKED)) {
4554                 /* Potentially mergeable */
4555                 if ((r_rsm->r_start == end) ||
4556                     (SEQ_LT(start, r_rsm->r_start) &&
4557                      SEQ_GT(end, r_rsm->r_start))) {
4558                         /*
4559                          * map blk          |---------|
4560                          * sack blk    |----|
4561                          * <or>
4562                          * map blk          |---------|
4563                          * sack blk    |-------|
4564                          */
4565                         return (1);
4566                 }
4567         }
4568         return (0);
4569 }
4570
4571 static struct bbr_sendmap *
4572 bbr_merge_rsm(struct tcp_bbr *bbr,
4573               struct bbr_sendmap *l_rsm,
4574               struct bbr_sendmap *r_rsm)
4575 {
4576         /*
4577          * We are merging two ack'd RSM's,
4578          * the l_rsm is on the left (lower seq
4579          * values) and the r_rsm is on the right
4580          * (higher seq value). The simplest way
4581          * to merge these is to move the right
4582          * one into the left. I don't think there
4583          * is any reason we need to try to find
4584          * the oldest (or last oldest retransmitted).
4585          */
4586         l_rsm->r_end = r_rsm->r_end;
4587         if (l_rsm->r_dupack < r_rsm->r_dupack)
4588                 l_rsm->r_dupack = r_rsm->r_dupack;
4589         if (r_rsm->r_rtr_bytes)
4590                 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
4591         if (r_rsm->r_in_tmap) {
4592                 /* This really should not happen */
4593                 TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, r_rsm, r_tnext);
4594         }
4595         if (r_rsm->r_app_limited)
4596                 l_rsm->r_app_limited = r_rsm->r_app_limited;
4597         /* Now the flags */
4598         if (r_rsm->r_flags & BBR_HAS_FIN)
4599                 l_rsm->r_flags |= BBR_HAS_FIN;
4600         if (r_rsm->r_flags & BBR_TLP)
4601                 l_rsm->r_flags |= BBR_TLP;
4602         if (r_rsm->r_flags & BBR_RWND_COLLAPSED)
4603                 l_rsm->r_flags |= BBR_RWND_COLLAPSED;
4604         if (r_rsm->r_flags & BBR_MARKED_LOST) {
4605                 /* This really should not happen */
4606                 bbr->r_ctl.rc_lost_bytes -= r_rsm->r_end - r_rsm->r_start;
4607         }
4608         TAILQ_REMOVE(&bbr->r_ctl.rc_map, r_rsm, r_next);
4609         if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
4610                 /* Transfer the split limit to the map we free */
4611                 r_rsm->r_limit_type = l_rsm->r_limit_type;
4612                 l_rsm->r_limit_type = 0;
4613         }
4614         bbr_free(bbr, r_rsm);
4615         return(l_rsm);
4616 }
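
/*
 * Example (assumed sequence space): merging l_rsm [1000, 2000) with
 * r_rsm [2000, 3000) yields a single SACK'd l_rsm covering
 * [1000, 3000); r_rsm's dup-ack count, retransmitted bytes and flags
 * are folded into l_rsm before r_rsm is freed.
 */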
4617
4618 /*
4619  * TLP Timer, here we simply setup what segment we want to
4620  * have the TLP expire on, the normal bbr_output_wtime() will then
4621  * send it out.
4622  *
4623  * We return 1, saying don't proceed with bbr_output_wtime only
4624  * when all timers have been stopped (destroyed PCB?).
4625  */
4626 static int
4627 bbr_timeout_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
4628 {
4629         /*
4630          * Tail Loss Probe.
4631          */
4632         struct bbr_sendmap *rsm = NULL;
4633         struct socket *so;
4634         uint32_t amm;
4635         uint32_t out, avail;
4636         uint32_t maxseg;
4637         int collapsed_win = 0;
4638
4639         if (bbr->rc_all_timers_stopped) {
4640                 return (1);
4641         }
4642         if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) {
4643                 /* It's not time yet */
4644                 return (0);
4645         }
4646         if (bbr_progress_timeout_check(bbr)) {
4647                 tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
4648                 return (1);
4649         }
4650         /* Did we somehow get into persists? */
4651         if (bbr->rc_in_persist) {
4652                 return (0);
4653         }
4654         if (bbr->r_state && (bbr->r_state != tp->t_state))
4655                 bbr_set_state(tp, bbr, 0);
4656         BBR_STAT_INC(bbr_tlp_tot);
4657         maxseg = tp->t_maxseg - bbr->rc_last_options;
4658 #ifdef KERN_TLS
4659         if (bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
4660                 /*
4661                  * For hardware TLS we do *not* want to send
4662                  * new data.
4663                  */
4664                 goto need_retran;
4665         }
4666 #endif
4667         /*
4668          * A TLP timer has expired. We have been idle for 2 rtts. So we now
4669          * need to figure out how to force a full MSS segment out.
4670          */
4671         so = tp->t_inpcb->inp_socket;
4672         avail = sbavail(&so->so_snd);
4673         out = ctf_outstanding(tp);
4674         if (out > tp->snd_wnd) {
4675                 /* special case, we need a retransmission */
4676                 collapsed_win = 1;
4677                 goto need_retran;
4678         }
4679         if (avail > out) {
4680                 /* New data is available */
4681                 amm = avail - out;
4682                 if (amm > maxseg) {
4683                         amm = maxseg;
4684                 } else if ((amm < maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) {
4685                         /* not enough to fill a MTU and no-delay is off */
4686                         goto need_retran;
4687                 }
4688                 /* Set the send-new override */
4689                 if ((out + amm) <= tp->snd_wnd) {
4690                         bbr->rc_tlp_new_data = 1;
4691                 } else {
4692                         goto need_retran;
4693                 }
4694                 bbr->r_ctl.rc_tlp_seg_send_cnt = 0;
4695                 bbr->r_ctl.rc_last_tlp_seq = tp->snd_max;
4696                 bbr->r_ctl.rc_tlp_send = NULL;
4697                 /* cap any slots */
4698                 BBR_STAT_INC(bbr_tlp_newdata);
4699                 goto send;
4700         }
4701 need_retran:
4702         /*
4703          * Ok we need to arrange the last un-acked segment to be re-sent, or
4704          * optionally the first un-acked segment.
4705          */
4706         if (collapsed_win == 0) {
4707                 rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next);
4708                 if (rsm && (rsm->r_flags & (BBR_ACKED | BBR_HAS_FIN))) {
4709                         rsm = bbr_find_high_nonack(bbr, rsm);
4710                 }
4711                 if (rsm == NULL) {
4712                         goto restore;
4713                 }
4714         } else {
4715                 /*
4716                  * We must find the last segment
4717                  * that was acceptable to the client.
4718                  */
4719                 TAILQ_FOREACH_REVERSE(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) {
4720                         if ((rsm->r_flags & BBR_RWND_COLLAPSED) == 0) {
4721                                 /* Found one */
4722                                 break;
4723                         }
4724                 }
4725                 if (rsm == NULL) {
4726                         /* None? if so send the first */
4727                         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
4728                         if (rsm == NULL)
4729                                 goto restore;
4730                 }
4731         }
4732         if ((rsm->r_end - rsm->r_start) > maxseg) {
4733                 /*
4734                  * We need to split this last segment in two.
4735                  */
4736                 struct bbr_sendmap *nrsm;
4737
4738                 nrsm = bbr_alloc_full_limit(bbr);
4739                 if (nrsm == NULL) {
4740                          * We can't get memory for the split; we can either
4741                          * not split it, or retransmit the whole piece. Let's
4742                          * do the large send (BTLP :-) ).
4743                          * do the large send (BTLP :-) ).
4744                          */
4745                         goto go_for_it;
4746                 }
4747                 bbr_clone_rsm(bbr, nrsm, rsm, (rsm->r_end - maxseg));
4748                 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
4749                 if (rsm->r_in_tmap) {
4750                         TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
4751                         nrsm->r_in_tmap = 1;
4752                 }
4753                 rsm->r_flags &= (~BBR_HAS_FIN);
4754                 rsm = nrsm;
4755         }
4756 go_for_it:
4757         bbr->r_ctl.rc_tlp_send = rsm;
4758         bbr->rc_tlp_rtx_out = 1;
4759         if (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq) {
4760                 bbr->r_ctl.rc_tlp_seg_send_cnt++;
4761                 tp->t_rxtshift++;
4762         } else {
4763                 bbr->r_ctl.rc_last_tlp_seq = rsm->r_start;
4764                 bbr->r_ctl.rc_tlp_seg_send_cnt = 1;
4765         }
4766 send:
4767         if (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend) {
4768                 /*
4769                  * Can't [re]transmit a segment we have retransmitted the
4770                  * max times. We need the retransmit timer to take over.
4771                  */
4772 restore:
4773                 bbr->rc_tlp_new_data = 0;
4774                 bbr->r_ctl.rc_tlp_send = NULL;
4775                 if (rsm)
4776                         rsm->r_flags &= ~BBR_TLP;
4777                 BBR_STAT_INC(bbr_tlp_retran_fail);
4778                 return (0);
4779         } else if (rsm) {
4780                 rsm->r_flags |= BBR_TLP;
4781         }
4782         if (rsm && (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq) &&
4783             (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend)) {
4784                 /*
4785                  * We have retransmitted too many times for TLP. Switch to
4786                  * the regular RTO timer
4787                  */
4788                 goto restore;
4789         }
4790         bbr_log_to_event(bbr, cts, BBR_TO_FRM_TLP);
4791         bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
4792         return (0);
4793 }
4794
4795 /*
4796  * Delayed ack Timer, here we simply need to setup the
4797  * ACK_NOW flag and remove the DELACK flag. From there
4798  * the output routine will send the ack out.
4799  *
4800  * We only return 1, saying don't proceed, if all timers
4801  * are stopped (destroyed PCB?).
4802  */
4803 static int
4804 bbr_timeout_delack(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
4805 {
4806         if (bbr->rc_all_timers_stopped) {
4807                 return (1);
4808         }
4809         bbr_log_to_event(bbr, cts, BBR_TO_FRM_DELACK);
4810         tp->t_flags &= ~TF_DELACK;
4811         tp->t_flags |= TF_ACKNOW;
4812         KMOD_TCPSTAT_INC(tcps_delack);
4813         bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
4814         return (0);
4815 }
4816
4817 /*
4818  * Persists timer, here we simply need to setup the
4819  * FORCE-DATA flag the output routine will send
4820  * the one byte send.
4821  *
4822  * We only return 1, saying don't proceed, if all timers
4823  * are stopped (destroyed PCB?).
4824  */
4825 static int
4826 bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
4827 {
4828         struct tcptemp *t_template;
4829         int32_t retval = 1;
4830
4831         if (bbr->rc_all_timers_stopped) {
4832                 return (1);
4833         }
4834         if (bbr->rc_in_persist == 0)
4835                 return (0);
4836         KASSERT(tp->t_inpcb != NULL,
4837             ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
4838         /*
4839          * Persistence timer into zero window. Force a byte to be output, if
4840          * possible.
4841          */
4842         bbr_log_to_event(bbr, cts, BBR_TO_FRM_PERSIST);
4843         bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
4844         KMOD_TCPSTAT_INC(tcps_persisttimeo);
4845         /*
4846          * Have we exceeded the user specified progress time?
4847          */
4848         if (bbr_progress_timeout_check(bbr)) {
4849                 tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
4850                 goto out;
4851         }
4852         /*
4853          * Hack: if the peer is dead/unreachable, we do not time out if the
4854          * window is closed.  After a full backoff, drop the connection if
4855          * the idle time (no responses to probes) reaches the maximum
4856          * backoff that we would use if retransmitting.
4857          */
4858         if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
4859             (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
4860             ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
4861                 KMOD_TCPSTAT_INC(tcps_persistdrop);
4862                 tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
4863                 goto out;
4864         }
4865         if ((sbavail(&bbr->rc_inp->inp_socket->so_snd) == 0) &&
4866             tp->snd_una == tp->snd_max) {
4867                 bbr_exit_persist(tp, bbr, cts, __LINE__);
4868                 retval = 0;
4869                 goto out;
4870         }
4871         /*
4872          * If the user has closed the socket then drop a persisting
4873          * connection after a much reduced timeout.
4874          */
4875         if (tp->t_state > TCPS_CLOSE_WAIT &&
4876             (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
4877                 KMOD_TCPSTAT_INC(tcps_persistdrop);
4878                 tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
4879                 goto out;
4880         }
4881         t_template = tcpip_maketemplate(bbr->rc_inp);
4882         if (t_template) {
4883                 tcp_respond(tp, t_template->tt_ipgen,
4884                             &t_template->tt_t, (struct mbuf *)NULL,
4885                             tp->rcv_nxt, tp->snd_una - 1, 0);
4886                 /* This sends an ack */
4887                 if (tp->t_flags & TF_DELACK)
4888                         tp->t_flags &= ~TF_DELACK;
4889                 free(t_template, M_TEMP);
4890         }
4891         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
4892                 tp->t_rxtshift++;
4893         bbr_start_hpts_timer(bbr, tp, cts, 3, 0, 0);
4894 out:
4895         return (retval);
4896 }
4897
4898 /*
4899  * If a keepalive goes off, we had no other timers
4900  * happening. We always return 1 here since this
4901  * routine either drops the connection or sends
4902  * out a segment via tcp_respond().
4903  */
4904 static int
4905 bbr_timeout_keepalive(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
4906 {
4907         struct tcptemp *t_template;
4908         struct inpcb *inp;
4909
4910         if (bbr->rc_all_timers_stopped) {
4911                 return (1);
4912         }
4913         bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
4914         inp = tp->t_inpcb;
4915         bbr_log_to_event(bbr, cts, BBR_TO_FRM_KEEP);
4916         /*
4917          * Keep-alive timer went off; send something or drop connection if
4918          * idle for too long.
4919          */
4920         KMOD_TCPSTAT_INC(tcps_keeptimeo);
4921         if (tp->t_state < TCPS_ESTABLISHED)
4922                 goto dropit;
4923         if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
4924             tp->t_state <= TCPS_CLOSING) {
4925                 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
4926                         goto dropit;
4927                 /*
4928                  * Send a packet designed to force a response if the peer is
4929                  * up and reachable: either an ACK if the connection is
4930                  * still alive, or an RST if the peer has closed the
4931                  * connection due to timeout or reboot. Using sequence
4932                  * number tp->snd_una-1 causes the transmitted zero-length
4933                  * segment to lie outside the receive window; by the
4934                  * protocol spec, this requires the correspondent TCP to
4935                  * respond.
4936                  */
4937                 KMOD_TCPSTAT_INC(tcps_keepprobe);
4938                 t_template = tcpip_maketemplate(inp);
4939                 if (t_template) {
4940                         tcp_respond(tp, t_template->tt_ipgen,
4941                             &t_template->tt_t, (struct mbuf *)NULL,
4942                             tp->rcv_nxt, tp->snd_una - 1, 0);
4943                         free(t_template, M_TEMP);
4944                 }
4945         }
4946         bbr_start_hpts_timer(bbr, tp, cts, 4, 0, 0);
4947         return (1);
4948 dropit:
4949         KMOD_TCPSTAT_INC(tcps_keepdrops);
4950         tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
4951         return (1);
4952 }
4953
4954 /*
4955  * Retransmit helper function, clear up all the ack
4956  * flags and take care of important book keeping.
4957  */
4958 static void
4959 bbr_remxt_tmr(struct tcpcb *tp)
4960 {
4961         /*
4962          * The retransmit timer went off, all sack'd blocks must be
4963          * un-acked.
4964          */
4965         struct bbr_sendmap *rsm, *trsm = NULL;
4966         struct tcp_bbr *bbr;
4967         uint32_t cts, lost;
4968
4969         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
4970         cts = tcp_get_usecs(&bbr->rc_tv);
4971         lost = bbr->r_ctl.rc_lost;
4972         if (bbr->r_state && (bbr->r_state != tp->t_state))
4973                 bbr_set_state(tp, bbr, 0);
4974
4975         TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
4976                 if (rsm->r_flags & BBR_ACKED) {
4977                         uint32_t old_flags;
4978
4979                         rsm->r_dupack = 0;
4980                         if (rsm->r_in_tmap == 0) {
4981                                 /* We must re-add it back to the tlist */
4982                                 if (trsm == NULL) {
4983                                         TAILQ_INSERT_HEAD(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
4984                                 } else {
4985                                         TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, trsm, rsm, r_tnext);
4986                                 }
4987                                 rsm->r_in_tmap = 1;
4988                         }
4989                         old_flags = rsm->r_flags;
4990                         rsm->r_flags |= BBR_RXT_CLEARED;
4991                         rsm->r_flags &= ~(BBR_ACKED | BBR_SACK_PASSED | BBR_WAS_SACKPASS);
4992                         bbr_log_type_rsmclear(bbr, cts, rsm, old_flags, __LINE__);
4993                 } else {
4994                         if ((rsm->r_flags & BBR_MARKED_LOST) == 0) {
4995                                 bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start;
4996                                 bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start;
4997                         }
4998                         if (bbr_marks_rxt_sack_passed) {
4999                                 /*
5000                                  * With this option, we will rack out the rest
5001                                  * of the packets in 1ms increments.
5002                                  */
5003                                 rsm->r_flags |= BBR_SACK_PASSED | BBR_MARKED_LOST;
5004                                 rsm->r_flags &= ~BBR_WAS_SACKPASS;
5005                         } else {
5006                                 /*
5007                                  * With this option we only mark them lost
5008                                  * and remove all sack'd markings. We will run
5009                                  * another RXT or a TLP. This will cause
5010                                  * us to eventually send more based on what
5011                                  * ACKs come in.
5012                                  */
5013                                 rsm->r_flags |= BBR_MARKED_LOST;
5014                                 rsm->r_flags &= ~BBR_WAS_SACKPASS;
5015                                 rsm->r_flags &= ~BBR_SACK_PASSED;
5016                         }
5017                 }
5018                 trsm = rsm;
5019         }
5020         bbr->r_ctl.rc_resend = TAILQ_FIRST(&bbr->r_ctl.rc_map);
5021         /* Clear the count (we just un-acked them) */
5022         bbr_log_to_event(bbr, cts, BBR_TO_FRM_TMR);
5023         bbr->rc_tlp_new_data = 0;
5024         bbr->r_ctl.rc_tlp_seg_send_cnt = 0;
5025         /* zap the behindness on a rxt */
5026         bbr->r_ctl.rc_hptsi_agg_delay = 0;
5027         bbr->r_agg_early_set = 0;
5028         bbr->r_ctl.rc_agg_early = 0;
5029         bbr->rc_tlp_rtx_out = 0;
5030         bbr->r_ctl.rc_sacked = 0;
5031         bbr->r_ctl.rc_sacklast = NULL;
5032         bbr->r_timer_override = 1;
5033         bbr_lt_bw_sampling(bbr, cts, (bbr->r_ctl.rc_lost > lost));
5034 }
5035
5036 /*
5037  * Re-transmit timeout! If we drop the PCB we will return 1, otherwise
5038  * we will setup to retransmit the lowest seq number outstanding.
5039  */
5040 static int
5041 bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
5042 {
5043         int32_t rexmt;
5044         int32_t retval = 0;
5045         bool isipv6;
5046
5047         bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
5048         if (bbr->rc_all_timers_stopped) {
5049                 return (1);
5050         }
5051         if (TCPS_HAVEESTABLISHED(tp->t_state) &&
5052             (tp->snd_una == tp->snd_max)) {
5053                 /* Nothing outstanding .. nothing to do */
5054                 return (0);
5055         }
5056         /*
5057          * Retransmission timer went off.  Message has not been acked within
5058          * retransmit interval.  Back off to a longer retransmit interval
5059          * and retransmit one segment.
5060          */
5061         if (bbr_progress_timeout_check(bbr)) {
5062                 retval = 1;
5063                 tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
5064                 goto out;
5065         }
5066         bbr_remxt_tmr(tp);
5067         if ((bbr->r_ctl.rc_resend == NULL) ||
5068             ((bbr->r_ctl.rc_resend->r_flags & BBR_RWND_COLLAPSED) == 0)) {
5069                 /*
5070                  * If the rwnd collapsed on
5071                  * the one we are retransmitting,
5072                  * it does not count against the
5073                  * rxt count.
5074                  */
5075                 tp->t_rxtshift++;
5076         }
5077         if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
5078                 tp->t_rxtshift = TCP_MAXRXTSHIFT;
5079                 KMOD_TCPSTAT_INC(tcps_timeoutdrop);
5080                 retval = 1;
5081                 tcp_set_inp_to_drop(bbr->rc_inp,
5082                     (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
5083                 goto out;
5084         }
5085         if (tp->t_state == TCPS_SYN_SENT) {
5086                 /*
5087                  * If the SYN was retransmitted, indicate CWND to be limited
5088                  * to 1 segment in cc_conn_init().
5089                  */
5090                 tp->snd_cwnd = 1;
5091         } else if (tp->t_rxtshift == 1) {
5092                 /*
5093                  * first retransmit; record ssthresh and cwnd so they can be
5094                  * recovered if this turns out to be a "bad" retransmit. A
5095                  * retransmit is considered "bad" if an ACK for this segment
5096                  * is received within RTT/2 interval; the assumption here is
5097                  * that the ACK was already in flight.  See "On Estimating
5098                  * End-to-End Network Path Properties" by Allman and Paxson
5099                  * for more details.
5100                  */
5101                 tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options;
5102                 if (!IN_RECOVERY(tp->t_flags)) {
5103                         tp->snd_cwnd_prev = tp->snd_cwnd;
5104                         tp->snd_ssthresh_prev = tp->snd_ssthresh;
5105                         tp->snd_recover_prev = tp->snd_recover;
5106                         tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
5107                         tp->t_flags |= TF_PREVVALID;
5108                 } else {
5109                         tp->t_flags &= ~TF_PREVVALID;
5110                 }
5111                 tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options;
5112         } else {
5113                 tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options;
5114                 tp->t_flags &= ~TF_PREVVALID;
5115         }
5116         KMOD_TCPSTAT_INC(tcps_rexmttimeo);
5117         if ((tp->t_state == TCPS_SYN_SENT) ||
5118             (tp->t_state == TCPS_SYN_RECEIVED))
5119                 rexmt = USEC_2_TICKS(BBR_INITIAL_RTO) * tcp_backoff[tp->t_rxtshift];
5120         else
5121                 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
5122         TCPT_RANGESET(tp->t_rxtcur, rexmt,
5123             MSEC_2_TICKS(bbr->r_ctl.rc_min_rto_ms),
5124             MSEC_2_TICKS(((uint32_t)bbr->rc_max_rto_sec) * 1000));
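        /*
         * Worked example of the backoff above (illustrative values
         * only): with a TCP_REXMTVAL() of 200ms and t_rxtshift == 3,
         * tcp_backoff[3] is 8, so rexmt becomes 1600ms before
         * TCPT_RANGESET() clamps it into the range
         * [rc_min_rto_ms, rc_max_rto_sec * 1000].
         */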
5125         /*
5126          * We enter the path for PLMTUD if the connection is established or
5127          * in FIN_WAIT_1 state. The reason for the latter is that if the
5128          * amount of data we send is very small, we could send it in a couple
5129          * of packets and proceed straight to FIN. In that case we won't
5130          * catch the ESTABLISHED state.
5131          */
5132 #ifdef INET6
5133         isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false;
5134 #else
5135         isipv6 = false;
5136 #endif
5137         if (((V_tcp_pmtud_blackhole_detect == 1) ||
5138             (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
5139             (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
5140             ((tp->t_state == TCPS_ESTABLISHED) ||
5141             (tp->t_state == TCPS_FIN_WAIT_1))) {
5142
5143                 /*
5144                  * The idea here is that each stage of the mtu probe (usually
5145                  * 1448 -> 1188 -> 524) should be given 2 chances to recover
5146                  * before we clamp down further. 'tp->t_rxtshift % 2 == 0'
5147                  * should take care of that.
5148                  */
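                /*
                 * For illustration: t_rxtshift == 2 triggers the first
                 * clamp (e.g. 1448 -> 1188), t_rxtshift == 4 the second
                 * (1188 -> 524), and once t_rxtshift reaches 6 the else
                 * branch below gives up and restores the saved MSS.
                 */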
5149                 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
5150                     (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
5151                     (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
5152                     tp->t_rxtshift % 2 == 0)) {
5153                         /*
5154                          * Enter Path MTU Black-hole Detection mechanism:
5155                          * - Disable Path MTU Discovery (IP "DF" bit).
5156                          * - Reduce MTU to a lower value than what we
5157                          *   negotiated with the peer.
5158                          */
5159                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
5160                                 /*
5161                                  * Record that we may have found a black
5162                                  * hole.
5163                                  */
5164                                 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
5165                                 /* Keep track of previous MSS. */
5166                                 tp->t_pmtud_saved_maxseg = tp->t_maxseg;
5167                         }
5168                         /*
5169                          * Reduce the MSS to the blackhole value or to the
5170                          * default in an attempt to retransmit.
5171                          */
5172 #ifdef INET6
5173                         isipv6 = bbr->r_is_v6;
5174                         if (isipv6 &&
5175                             tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
5176                                 /* Use the sysctl tuneable blackhole MSS. */
5177                                 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
5178                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
5179                         } else if (isipv6) {
5180                                 /* Use the default MSS. */
5181                                 tp->t_maxseg = V_tcp_v6mssdflt;
5182                                 /*
5183                                  * Disable Path MTU Discovery when we switch
5184                                  * to minmss.
5185                                  */
5186                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
5187                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
5188                         }
5189 #endif
5190 #if defined(INET6) && defined(INET)
5191                         else
5192 #endif
5193 #ifdef INET
5194                         if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
5195                                 /* Use the sysctl tuneable blackhole MSS. */
5196                                 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
5197                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
5198                         } else {
5199                                 /* Use the default MSS. */
5200                                 tp->t_maxseg = V_tcp_mssdflt;
5201                                 /*
5202                                  * Disable Path MTU Discovery when we switch
5203                                  * to minmss.
5204                                  */
5205                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
5206                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
5207                         }
5208 #endif
5209                 } else {
5210                         /*
5211                          * If further retransmissions are still unsuccessful
5212                          * with a lowered MTU, maybe this isn't a blackhole
5213                          * and we restore the previous MSS and blackhole
5214                          * detection flags. The limit '6' is determined by
5215                          * giving each probe stage (1448, 1188, 524) 2
5216                          * chances to recover.
5217                          */
5218                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
5219                             (tp->t_rxtshift >= 6)) {
5220                                 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
5221                                 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
5222                                 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
5223                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed);
5224                         }
5225                 }
5226         }
5227         /*
5228          * Disable RFC1323 and SACK if we haven't got any response to our
5229          * third SYN to work-around some broken terminal servers (most of
5230          * which have hopefully been retired) that have bad VJ header
5231          * compression code which trashes TCP segments containing
5232          * unknown-to-them TCP options.
5233          */
5234         if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
5235             (tp->t_rxtshift == 3))
5236                 tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT);
5237         /*
5238          * If we backed off this far, our srtt estimate is probably bogus.
5239          * Clobber it so we'll take the next rtt measurement as our srtt;
5240          * move the current srtt into rttvar to keep the current retransmit
5241          * times until then.
5242          */
5243         if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
5244 #ifdef INET6
5245                 if (bbr->r_is_v6)
5246                         in6_losing(tp->t_inpcb);
5247                 else
5248 #endif
5249                         in_losing(tp->t_inpcb);
5250                 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
5251                 tp->t_srtt = 0;
5252         }
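        /*
         * Illustrative numbers (assumed): with TCP_RTT_SHIFT == 5, a
         * scaled t_srtt of 3200 represents ~100 ticks of srtt; the code
         * above folds those 100 ticks into t_rttvar before zeroing
         * t_srtt, which is what keeps the retransmit timer from
         * collapsing until a fresh RTT sample arrives.
         */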
5253         sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una);
5254         tp->snd_recover = tp->snd_max;
5255         tp->t_flags |= TF_ACKNOW;
5256         tp->t_rtttime = 0;
5257 out:
5258         return (retval);
5259 }
5260
5261 static int
5262 bbr_process_timers(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, uint8_t hpts_calling)
5263 {
5264         int32_t ret = 0;
5265         int32_t timers = (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
5266
5267         if (timers == 0) {
5268                 return (0);
5269         }
5270         if (tp->t_state == TCPS_LISTEN) {
5271                 /* no timers on listen sockets */
5272                 if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
5273                         return (0);
5274                 return (1);
5275         }
5276         if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) {
5277                 uint32_t left;
5278
5279                 if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
5280                         ret = -1;
5281                         bbr_log_to_processing(bbr, cts, ret, 0, hpts_calling);
5282                         return (0);
5283                 }
5284                 if (hpts_calling == 0) {
5285                         ret = -2;
5286                         bbr_log_to_processing(bbr, cts, ret, 0, hpts_calling);
5287                         return (0);
5288                 }
5289                 /*
5290                  * Ok, our timer went off early and we are not paced: a false
5291                  * alarm, go back to sleep.
5292                  */
5293                 left = bbr->r_ctl.rc_timer_exp - cts;
5294                 ret = -3;
5295                 bbr_log_to_processing(bbr, cts, ret, left, hpts_calling);
5296                 tcp_hpts_insert(tp->t_inpcb, HPTS_USEC_TO_SLOTS(left));
5297                 return (1);
5298         }
5299         bbr->rc_tmr_stopped = 0;
5300         bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
5301         if (timers & PACE_TMR_DELACK) {
5302                 ret = bbr_timeout_delack(tp, bbr, cts);
5303         } else if (timers & PACE_TMR_PERSIT) {
5304                 ret = bbr_timeout_persist(tp, bbr, cts);
5305         } else if (timers & PACE_TMR_RACK) {
5306                 bbr->r_ctl.rc_tlp_rxt_last_time = cts;
5307                 ret = bbr_timeout_rack(tp, bbr, cts);
5308         } else if (timers & PACE_TMR_TLP) {
5309                 bbr->r_ctl.rc_tlp_rxt_last_time = cts;
5310                 ret = bbr_timeout_tlp(tp, bbr, cts);
5311         } else if (timers & PACE_TMR_RXT) {
5312                 bbr->r_ctl.rc_tlp_rxt_last_time = cts;
5313                 ret = bbr_timeout_rxt(tp, bbr, cts);
5314         } else if (timers & PACE_TMR_KEEP) {
5315                 ret = bbr_timeout_keepalive(tp, bbr, cts);
5316         }
5317         bbr_log_to_processing(bbr, cts, ret, timers, hpts_calling);
5318         return (ret);
5319 }
5320
5321 static void
5322 bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts)
5323 {
5324         if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
5325                 uint8_t hpts_removed = 0;
5326
5327                 if (bbr->rc_inp->inp_in_hpts &&
5328                     (bbr->rc_timer_first == 1)) {
5329                         /*
5330                          * If we are canceling timers while the timer is
5331                          * ahead of the output being paced, we must also
5332                          * remove ourselves from the hpts.
5333                          */
5334                         hpts_removed = 1;
5335                         tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT);
5336                         if (bbr->r_ctl.rc_last_delay_val) {
5337                                 /* Update the last hptsi delay too */
5338                                 uint32_t time_since_send;
5339
5340                                 if (TSTMP_GT(cts, bbr->rc_pacer_started))
5341                                         time_since_send = cts - bbr->rc_pacer_started;
5342                                 else
5343                                         time_since_send = 0;
5344                                 if (bbr->r_ctl.rc_last_delay_val > time_since_send) {
5345                                         /* Cut down our slot time */
5346                                         bbr->r_ctl.rc_last_delay_val -= time_since_send;
5347                                 } else {
5348                                         bbr->r_ctl.rc_last_delay_val = 0;
5349                                 }
5350                                 bbr->rc_pacer_started = cts;
5351                         }
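                        /*
                         * Example of the adjustment above (assumed
                         * values): if the pacer was started 2000us ago
                         * and rc_last_delay_val was 5000us, we still
                         * owe 3000us of the slot, so the delay is cut
                         * to 3000 and the pacer clock restarts at cts.
                         */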
5352                 }
5353                 bbr->rc_timer_first = 0;
5354                 bbr_log_to_cancel(bbr, line, cts, hpts_removed);
5355                 bbr->rc_tmr_stopped = bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
5356                 bbr->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
5357         }
5358 }
5359
5360 static void
5361 bbr_timer_stop(struct tcpcb *tp, uint32_t timer_type)
5362 {
5363         struct tcp_bbr *bbr;
5364
5365         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
5366         bbr->rc_all_timers_stopped = 1;
5367         return;
5368 }
5369
5370 /*
5371  * Stop all timers; always returns 0.
5372  */
5373 static int
5374 bbr_stopall(struct tcpcb *tp)
5375 {
5376         return (0);
5377 }
5378
5379 static void
5380 bbr_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
5381 {
5382         return;
5383 }
5384
5385 /*
5386  * return true if a bbr timer (rack or tlp) is active.
5387  */
5388 static int
5389 bbr_timer_active(struct tcpcb *tp, uint32_t timer_type)
5390 {
5391         return (0);
5392 }
5393
5394 static uint32_t
5395 bbr_get_earliest_send_outstanding(struct tcp_bbr *bbr, struct bbr_sendmap *u_rsm, uint32_t cts)
5396 {
5397         struct bbr_sendmap *rsm;
5398
5399         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
5400         if ((rsm == NULL) || (u_rsm == rsm))
5401                 return (cts);
5402         return(rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]);
5403 }
5404
5405 static void
5406 bbr_update_rsm(struct tcpcb *tp, struct tcp_bbr *bbr,
5407      struct bbr_sendmap *rsm, uint32_t cts, uint32_t pacing_time)
5408 {
5409         int32_t idx;
5410
5411         rsm->r_rtr_cnt++;
5412         rsm->r_dupack = 0;
5413         if (rsm->r_rtr_cnt > BBR_NUM_OF_RETRANS) {
5414                 rsm->r_rtr_cnt = BBR_NUM_OF_RETRANS;
5415                 rsm->r_flags |= BBR_OVERMAX;
5416         }
5417         if (rsm->r_flags & BBR_RWND_COLLAPSED) {
5418                 /* Take off the collapsed flag at rxt */
5419                 rsm->r_flags &= ~BBR_RWND_COLLAPSED;
5420         }
5421         if (rsm->r_flags & BBR_MARKED_LOST) {
5422                 /* We have retransmitted; it's no longer lost */
5423                 rsm->r_flags &= ~BBR_MARKED_LOST;
5424                 bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
5425         }
5426         if (rsm->r_flags & BBR_RXT_CLEARED) {
5427                 /*
5428                  * We hit a RXT timer on it and
5429                  * we cleared the "acked" flag.
5430                  * We now have it going back into
5431                  * flight, we can remove the cleared
5432                  * flag and possibly do accounting on
5433                  * this piece.
5434                  */
5435                 rsm->r_flags &= ~BBR_RXT_CLEARED;
5436         }
5437         if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & BBR_TLP) == 0)) {
5438                 bbr->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
5439                 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
5440         }
5441         idx = rsm->r_rtr_cnt - 1;
5442         rsm->r_tim_lastsent[idx] = cts;
5443         rsm->r_pacing_delay = pacing_time;
5444         rsm->r_delivered = bbr->r_ctl.rc_delivered;
5445         rsm->r_ts_valid = bbr->rc_ts_valid;
5446         if (bbr->rc_ts_valid)
5447                 rsm->r_del_ack_ts = bbr->r_ctl.last_inbound_ts;
5448         if (bbr->r_ctl.r_app_limited_until)
5449                 rsm->r_app_limited = 1;
5450         else
5451                 rsm->r_app_limited = 0;
5452         if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW)
5453                 rsm->r_bbr_state = bbr_state_val(bbr);
5454         else
5455                 rsm->r_bbr_state = 8;
5456         if (rsm->r_flags & BBR_ACKED) {
5457                 /* Probably MTU discovery messing with us */
5458                 uint32_t old_flags;
5459
5460                 old_flags = rsm->r_flags;
5461                 rsm->r_flags &= ~BBR_ACKED;
5462                 bbr_log_type_rsmclear(bbr, cts, rsm, old_flags, __LINE__);
5463                 bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
5464                 if (bbr->r_ctl.rc_sacked == 0)
5465                         bbr->r_ctl.rc_sacklast = NULL;
5466         }
5467         if (rsm->r_in_tmap) {
5468                 TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
5469         }
5470         TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
5471         rsm->r_in_tmap = 1;
5472         if (rsm->r_flags & BBR_SACK_PASSED) {
5473                 /* We have retransmitted due to the SACK pass */
5474                 rsm->r_flags &= ~BBR_SACK_PASSED;
5475                 rsm->r_flags |= BBR_WAS_SACKPASS;
5476         }
5477         rsm->r_first_sent_time = bbr_get_earliest_send_outstanding(bbr, rsm, cts);
5478         rsm->r_flight_at_send = ctf_flight_size(bbr->rc_tp,
5479                                                 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
5480         bbr->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
5481         if (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT) {
5482                 rsm->r_is_gain = 1;
5483                 rsm->r_is_drain = 0;
5484         } else if (bbr->r_ctl.rc_bbr_hptsi_gain < BBR_UNIT) {
5485                 rsm->r_is_drain = 1;
5486                 rsm->r_is_gain = 0;
5487         } else {
5488                 rsm->r_is_drain = 0;
5489                 rsm->r_is_gain = 0;
5490         }
5491         rsm->r_del_time = bbr->r_ctl.rc_del_time; /* TEMP GOOGLE CODE */
5492 }
5493
5494 /*
5495  * Returns 0, or the sequence where we stopped
5496  * updating. We also update *lenp to be the amount
5497  * of data left.
5498  */
5499
5500 static uint32_t
5501 bbr_update_entry(struct tcpcb *tp, struct tcp_bbr *bbr,
5502     struct bbr_sendmap *rsm, uint32_t cts, int32_t *lenp, uint32_t pacing_time)
5503 {
5504         /*
5505          * We (re-)transmitted starting at rsm->r_start for some length
5506          * (possibly less than r_end).
5507          */
5508         struct bbr_sendmap *nrsm;
5509         uint32_t c_end;
5510         int32_t len;
5511
5512         len = *lenp;
5513         c_end = rsm->r_start + len;
5514         if (SEQ_GEQ(c_end, rsm->r_end)) {
5515                 /*
5516                  * We retransmitted the whole piece, or more than the whole
5517                  * piece, slopping over into the next rsm.
5518                  */
5519                 bbr_update_rsm(tp, bbr, rsm, cts, pacing_time);
5520                 if (c_end == rsm->r_end) {
5521                         *lenp = 0;
5522                         return (0);
5523                 } else {
5524                         int32_t act_len;
5525
5526                         /* Hangs over the end; return what's left */
5527                         act_len = rsm->r_end - rsm->r_start;
5528                         *lenp = (len - act_len);
5529                         return (rsm->r_end);
5530                 }
5531                 /* We don't get out of this block. */
5532         }
5533         /*
5534          * Here we retransmitted less than the whole thing which means we
5535          * have to split this into what was transmitted and what was not.
5536          */
5537         nrsm = bbr_alloc_full_limit(bbr);
5538         if (nrsm == NULL) {
5539                 *lenp = 0;
5540                 return (0);
5541         }
5542         /*
5543          * So here we are going to take the original rsm and make it what we
5544          * retransmitted. nrsm will be the tail portion we did not
5545          * retransmit. For example say the chunk was 1, 11 (10 bytes). And
5546          * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
5547          * 1, 6 and the new piece will be 6, 11.
5548          */
5549         bbr_clone_rsm(bbr, nrsm, rsm, c_end);
5550         TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
5551         nrsm->r_dupack = 0;
5552         if (rsm->r_in_tmap) {
5553                 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
5554                 nrsm->r_in_tmap = 1;
5555         }
5556         rsm->r_flags &= (~BBR_HAS_FIN);
5557         bbr_update_rsm(tp, bbr, rsm, cts, pacing_time);
5558         *lenp = 0;
5559         return (0);
5560 }
5561
5562 static uint64_t
5563 bbr_get_hardware_rate(struct tcp_bbr *bbr)
5564 {
5565         uint64_t bw;
5566
5567         bw = bbr_get_bw(bbr);
5568         bw *= (uint64_t)bbr_hptsi_gain[BBR_SUB_GAIN];
5569         bw /= (uint64_t)BBR_UNIT;
5570         return(bw);
5571 }
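/*
 * For illustration (assumed values; BBR_UNIT taken to be 256 here): with a
 * delivery-rate estimate of 12,500,000 bytes/sec and a sub-gain of 288,
 * bbr_get_hardware_rate() would request 12500000 * 288 / 256, i.e. about
 * 14,062,500 bytes/sec from the NIC.
 */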
5572
5573 static void
5574 bbr_setup_less_of_rate(struct tcp_bbr *bbr, uint32_t cts,
5575                        uint64_t act_rate, uint64_t rate_wanted)
5576 {
5577         /*
5578          * We could not get a full gain's worth
5579          * of rate.
5580          */
5581         if (get_filter_value(&bbr->r_ctl.rc_delrate) >= act_rate) {
5582                 /* we can't even get the real rate */
5583                 uint64_t red;
5584
5585                 bbr->skip_gain = 1;
5586                 bbr->gain_is_limited = 0;
5587                 red = get_filter_value(&bbr->r_ctl.rc_delrate) - act_rate;
5588                 if (red)
5589                         filter_reduce_by(&bbr->r_ctl.rc_delrate, red, cts);
5590         } else {
5591                 /* We can use a lower gain */
5592                 bbr->skip_gain = 0;
5593                 bbr->gain_is_limited = 1;
5594         }
5595 }
5596
5597 static void
5598 bbr_update_hardware_pacing_rate(struct tcp_bbr *bbr, uint32_t cts)
5599 {
5600         const struct tcp_hwrate_limit_table *nrte;
5601         int error, rate = -1;
5602
5603         if (bbr->r_ctl.crte == NULL)
5604                 return;
5605         if ((bbr->rc_inp->inp_route.ro_nh == NULL) ||
5606             (bbr->rc_inp->inp_route.ro_nh->nh_ifp == NULL)) {
5607                 /* Lost our routes? */
5608                 /* Clear the way for a re-attempt */
5609                 bbr->bbr_attempt_hdwr_pace = 0;
5610 lost_rate:
5611                 bbr->gain_is_limited = 0;
5612                 bbr->skip_gain = 0;
5613                 bbr->bbr_hdrw_pacing = 0;
5614                 counter_u64_add(bbr_flows_whdwr_pacing, -1);
5615                 counter_u64_add(bbr_flows_nohdwr_pacing, 1);
5616                 tcp_bbr_tso_size_check(bbr, cts);
5617                 return;
5618         }
5619         rate = bbr_get_hardware_rate(bbr);
5620         nrte = tcp_chg_pacing_rate(bbr->r_ctl.crte,
5621                                    bbr->rc_tp,
5622                                    bbr->rc_inp->inp_route.ro_nh->nh_ifp,
5623                                    rate,
5624                                    (RS_PACING_GEQ|RS_PACING_SUB_OK),
5625                                    &error);
5626         if (nrte == NULL) {
5627                 goto lost_rate;
5628         }
5629         if (nrte != bbr->r_ctl.crte) {
5630                 bbr->r_ctl.crte = nrte;
5631                 if (error == 0)  {
5632                         BBR_STAT_INC(bbr_hdwr_rl_mod_ok);
5633                         if (bbr->r_ctl.crte->rate < rate) {
5634                                 /* We have a problem */
5635                                 bbr_setup_less_of_rate(bbr, cts,
5636                                                        bbr->r_ctl.crte->rate, rate);
5637                         } else {
5638                                 /* We are good */
5639                                 bbr->gain_is_limited = 0;
5640                                 bbr->skip_gain = 0;
5641                         }
5642                 } else {
5643                         /* A failure should release the tag */
5644                         BBR_STAT_INC(bbr_hdwr_rl_mod_fail);
5645                         bbr->gain_is_limited = 0;
5646                         bbr->skip_gain = 0;
5647                         bbr->bbr_hdrw_pacing = 0;
5648                 }
5649                 bbr_type_log_hdwr_pacing(bbr,
5650                                          bbr->r_ctl.crte->ptbl->rs_ifp,
5651                                          rate,
5652                                          ((bbr->r_ctl.crte == NULL) ? 0 : bbr->r_ctl.crte->rate),
5653                                          __LINE__,
5654                                          cts,
5655                                          error);
5656         }
5657 }
5658
5659 static void
5660 bbr_adjust_for_hw_pacing(struct tcp_bbr *bbr, uint32_t cts)
5661 {
5662         /*
5663          * If we have hardware pacing support
5664          * we need to factor that in for our
5665          * TSO size.
5666          */
5667         const struct tcp_hwrate_limit_table *rlp;
5668         uint32_t cur_delay, seg_sz, maxseg, new_tso, delta, hdwr_delay;
5669
5670         if ((bbr->bbr_hdrw_pacing == 0) ||
5671             (IN_RECOVERY(bbr->rc_tp->t_flags)) ||
5672             (bbr->r_ctl.crte == NULL))
5673                 return;
5674         if (bbr->hw_pacing_set == 0) {
5675                 /* Not yet by the hdwr pacing count delay */
5676                 return;
5677         }
5678         if (bbr_hdwr_pace_adjust == 0) {
5679                 /* No adjustment */
5680                 return;
5681         }
5682         rlp = bbr->r_ctl.crte;
5683         if (bbr->rc_tp->t_maxseg > bbr->rc_last_options)
5684                 maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
5685         else
5686                 maxseg = BBR_MIN_SEG - bbr->rc_last_options;
5687         /*
5688          * So let's first get the
5689          * time we will take between
5690          * TSO sized sends currently without
5691          * hardware help.
5692          */
5693         cur_delay = bbr_get_pacing_delay(bbr, BBR_UNIT,
5694                         bbr->r_ctl.rc_pace_max_segs, cts, 1);
5695         hdwr_delay = bbr->r_ctl.rc_pace_max_segs / maxseg;
5696         hdwr_delay *= rlp->time_between;
5697         if (cur_delay > hdwr_delay)
5698                 delta = cur_delay - hdwr_delay;
5699         else
5700                 delta = 0;
5701         bbr_log_type_tsosize(bbr, cts, delta, cur_delay, hdwr_delay,
5702                              (bbr->r_ctl.rc_pace_max_segs / maxseg),
5703                              1);
5704         if (delta &&
5705             (delta < (max(rlp->time_between,
5706                           bbr->r_ctl.bbr_hptsi_segments_delay_tar)))) {
5707                 /*
5708                  * Now let's divide by the pacing
5709                  * time between each segment the
5710                  * hardware sends, rounding up, and
5711                  * derive a byte count from that. We multiply
5712                  * that by bbr_hdwr_pace_adjust to get
5713                  * more bang for our buck.
5714                  *
5715                  * The goal is to have the software pacer
5716                  * waiting no more than an additional
5717                  * pacing delay if we can (without the
5718                  * compensation i.e. x bbr_hdwr_pace_adjust).
5719                  */
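                /*
                 * Worked example (values assumed): cur_delay = 120us,
                 * rlp->time_between = 30us and a 2-segment
                 * rc_pace_max_segs gives max((120 + 30) / 30, 2) = 5
                 * segments; with bbr_hdwr_pace_adjust = 2 that becomes
                 * 10 segments, then 10 * maxseg bytes.
                 */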
5720                 seg_sz = max(((cur_delay + rlp->time_between)/rlp->time_between),
5721                              (bbr->r_ctl.rc_pace_max_segs/maxseg));
5722                 seg_sz *= bbr_hdwr_pace_adjust;
5723                 if (bbr_hdwr_pace_floor &&
5724                     (seg_sz < bbr->r_ctl.crte->ptbl->rs_min_seg)) {
5725                         /* Currently hardware paces
5726                          * out rs_min_seg segments at a time.
5727                          * We need to make sure we always send at least
5728                          * a full burst of bbr_hdwr_pace_floor down.
5729                          */
5730                         seg_sz = bbr->r_ctl.crte->ptbl->rs_min_seg;
5731                 }
5732                 seg_sz *= maxseg;
5733         } else if (delta == 0) {
5734                 /*
5735                  * The highest pacing rate is
5736                  * above our b/w gained. This means
5737                  * we probably are going quite fast at
5738                  * the hardware's highest rate. Let's just multiply
5739                  * the calculated TSO size by the
5740                  * multiplier factor (its probably
5741                  * 4 segments in the default config for
5742                  * mlx).
5743                  */
5744                 seg_sz = bbr->r_ctl.rc_pace_max_segs * bbr_hdwr_pace_adjust;
5745                 if (bbr_hdwr_pace_floor &&
5746                     (seg_sz < bbr->r_ctl.crte->ptbl->rs_min_seg)) {
5747                         /* Currently hardware paces
5748                          * out rs_min_seg segments at a time.
5749                          * We need to make sure we always send at least
5750                          * a full burst of bbr_hdwr_pace_floor down.
5751                          */
5752                         seg_sz = bbr->r_ctl.crte->ptbl->rs_min_seg;
5753                 }
5754         } else {
5755                 /*
5756                  * The pacing time difference is so
5757                  * big that the hardware will
5758                  * pace out more rapidly than we
5759                  * really want and then we
5760                  * will have a long delay. Let's just keep
5761                  * the same TSO size so its as if
5762                  * we were not using hdwr pacing (we
5763                  * just gain a bit of spacing from the
5764                  * hardware if seg_sz > 1).
5765                  */
5766                 seg_sz = bbr->r_ctl.rc_pace_max_segs;
5767         }
5768         if (seg_sz > bbr->r_ctl.rc_pace_max_segs)
5769                 new_tso = seg_sz;
5770         else
5771                 new_tso = bbr->r_ctl.rc_pace_max_segs;
5772         if (new_tso >= (PACE_MAX_IP_BYTES-maxseg))
5773                 new_tso = PACE_MAX_IP_BYTES - maxseg;
5774
5775         if (new_tso != bbr->r_ctl.rc_pace_max_segs) {
5776                 bbr_log_type_tsosize(bbr, cts, new_tso, 0, bbr->r_ctl.rc_pace_max_segs, maxseg, 0);
5777                 bbr->r_ctl.rc_pace_max_segs = new_tso;
5778         }
5779 }
5780
5781 static void
5782 tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t cts)
5783 {
5784         uint64_t bw;
5785         uint32_t old_tso = 0, new_tso;
5786         uint32_t maxseg, bytes;
5787         uint32_t tls_seg=0;
5788         /*
5789          * Google/linux uses the following algorithm to determine
5790          * the TSO size based on the b/w of the link (from Neal Cardwell email 9/27/18):
5791          *
5792          *  bytes = bw_in_bytes_per_second / 1000
5793          *  bytes = min(bytes, 64k)
5794          *  tso_segs = bytes / MSS
5795          *  if (bw < 1.2Mbs)
5796          *      min_tso_segs = 1
5797          *  else
5798          *      min_tso_segs = 2
5799          * tso_segs = max(tso_segs, min_tso_segs)
5800          *
5801          * * Note: apply a device-specific limit (we apply this in
5802          *   tcp_m_copym()).
5803          * Note that before the initial measurement is made google bursts out
5804          * a full iwnd just like new-reno/cubic.
5805          *
5806          * We do not use this algorithm. Instead we
5807          * use a two phased approach:
5808          *
5809          *  if ( bw <= per-tcb-cross-over)
5810          *     goal_tso =  calculate how much with this bw we
5811          *                 can send in goal-time seconds.
5812          *     if (goal_tso > mss)
5813          *         seg = goal_tso / mss
5814          *         tso = seg * mss
5815          *     else
5816          *         tso = mss
5817          *     if (tso > per-tcb-max)
5818          *         tso = per-tcb-max
5819          *  else if ( bw > 512Mbps)
5820          *     tso = max-tso (64k/mss)
5821          *  else
5822          *     goal_tso = bw / per-tcb-divisor
5823          *     seg = (goal_tso + mss-1)/mss
5824          *     tso = seg * mss
5825          *
5826          * if (tso < per-tcb-floor)
5827          *    tso = per-tcb-floor
5828          * if (tso > per-tcb-utter_max)
5829          *    tso = per-tcb-utter_max
5830          *
5831          * Note the default per-tcb-divisor is 1000 (same as google).
5832          * The goal cross-over is 30Mbps however. To recreate google's
5833          * algorithm you need to set:
5834          *
5835          * cross-over = 23,168,000 bps
5836          * goal-time = 18000
5837          * per-tcb-max = 2
5838          * per-tcb-divisor = 1000
5839          * per-tcb-floor = 1
5840          *
5841          * This will get you "google bbr" behavior with respect to tso size.
5842          *
5843          * Note we do not set any TSO size until we are past the initial
5844          * window. Before that we generally use either a single MSS
5845          * or we use the full IW size (so we burst an IW at a time).
5846          * Also note that Hardware-TLS is special and does alternate
5847          * things to minimize PCI Bus Bandwidth use.
5848          */
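        /*
         * Worked example of the second phase above (all values assumed):
         * with bw = 8,000,000 bytes/sec, a per-tcb-divisor of 1000 and an
         * mss of 1460:
         *
         *  goal_tso = 8000000 / 1000 = 8000 bytes
         *  seg = (8000 + 1459) / 1460 = 6 segments
         *  tso = 6 * 1460 = 8760 bytes
         *
         * which the per-tcb floor/utter-max checks may still adjust.
         */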
5849
5850         if (bbr->rc_tp->t_maxseg > bbr->rc_last_options) {
5851                 maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
5852         } else {
5853                 maxseg = BBR_MIN_SEG - bbr->rc_last_options;
5854         }
5855 #ifdef KERN_TLS
5856         if (bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
5857                 tls_seg =  ctf_get_opt_tls_size(bbr->rc_inp->inp_socket, bbr->rc_tp->snd_wnd);
5858                 bbr->r_ctl.rc_pace_min_segs = (tls_seg + bbr->rc_last_options);
5859         }
5860 #endif
5861         old_tso = bbr->r_ctl.rc_pace_max_segs;
5862         if (bbr->rc_past_init_win == 0) {
5863                 /*
5864                  * Not enough data has been acknowledged to make a
5865                  * judgement unless we are hardware TLS. Set up
5866                  * the initial TSO based on whether we are sending a
5867                  * full IW at once or not.
5868                  */
5869                 if (bbr->rc_use_google)
5870                         bbr->r_ctl.rc_pace_max_segs = ((bbr->rc_tp->t_maxseg - bbr->rc_last_options) * 2);
5871                 else if (bbr->bbr_init_win_cheat)
5872                         bbr->r_ctl.rc_pace_max_segs = bbr_initial_cwnd(bbr, bbr->rc_tp);
5873                 else
5874                         bbr->r_ctl.rc_pace_max_segs = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
5875                 if (bbr->r_ctl.rc_pace_min_segs != bbr->rc_tp->t_maxseg)
5876                         bbr->r_ctl.rc_pace_min_segs = bbr->rc_tp->t_maxseg;
5877 #ifdef KERN_TLS
5878                 if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) && tls_seg) {
5879                         /*
5880                          * For hardware TLS we set our min to the tls_seg size.
5881                          */
5882                         bbr->r_ctl.rc_pace_max_segs = tls_seg;
5883                         bbr->r_ctl.rc_pace_min_segs = tls_seg + bbr->rc_last_options;
5884                 }
5885 #endif
5886                 if (bbr->r_ctl.rc_pace_max_segs == 0) {
5887                         bbr->r_ctl.rc_pace_max_segs = maxseg;
5888                 }
5889                 bbr_log_type_tsosize(bbr, cts, bbr->r_ctl.rc_pace_max_segs, tls_seg, old_tso, maxseg, 0);
5890 #ifdef KERN_TLS
5891                 if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) == 0)
5892 #endif
5893                         bbr_adjust_for_hw_pacing(bbr, cts);
5894                 return;
5895         }
5896         /**
5897          * Now let's set the TSO goal based on our delivery rate in
5898          * bytes per second. Note we only do this if
5899          * we have acked at least the initial cwnd worth of data.
5900          */
5901         bw = bbr_get_bw(bbr);
5902         if (IN_RECOVERY(bbr->rc_tp->t_flags) &&
5903              (bbr->rc_use_google == 0)) {
5904                 /* We clamp to one MSS in recovery */
5905                 new_tso = maxseg;
5906         } else if (bbr->rc_use_google) {
5907                 int min_tso_segs;
5908
5909                 /* Google considers the gain too */
5910                 if (bbr->r_ctl.rc_bbr_hptsi_gain != BBR_UNIT) {
5911                         bw *= bbr->r_ctl.rc_bbr_hptsi_gain;
5912                         bw /= BBR_UNIT;
5913                 }
5914                 bytes = bw / 1024;
5915                 if (bytes > (64 * 1024))
5916                         bytes = 64 * 1024;
5917                 new_tso = bytes / maxseg;
5918                 if (bw < ONE_POINT_TWO_MEG)
5919                         min_tso_segs = 1;
5920                 else
5921                         min_tso_segs = 2;
5922                 if (new_tso < min_tso_segs)
5923                         new_tso = min_tso_segs;
5924                 new_tso *= maxseg;
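                /*
                 * E.g. (assumed values): bw = 5,000,000 bytes/sec gives
                 * bytes = 4882 (well under the 64k cap); with maxseg =
                 * 1460 that is 3 segments, above the 2-segment minimum
                 * applied to links faster than 1.2Mbps, so new_tso
                 * ends up at 3 * 1460 = 4380 bytes.
                 */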
5925         } else if (bbr->rc_no_pacing) {
5926                 new_tso = (PACE_MAX_IP_BYTES / maxseg) * maxseg;
5927         } else if (bw <= bbr->r_ctl.bbr_cross_over) {
5928                 /*
5929                  * Calculate the worst case b/w TSO if we are inserting no
5930                  * more than a delay_target number of TSO's.
5931                  */
5932                 uint32_t tso_len, min_tso;
5933
5934                 tso_len = bbr_get_pacing_length(bbr, BBR_UNIT, bbr->r_ctl.bbr_hptsi_segments_delay_tar, bw);
5935                 if (tso_len > maxseg) {
5936                         new_tso = tso_len / maxseg;
5937                         if (new_tso > bbr->r_ctl.bbr_hptsi_segments_max)
5938                                 new_tso = bbr->r_ctl.bbr_hptsi_segments_max;
5939                         new_tso *= maxseg;
5940                 } else {
5941                         /*
5942                          * Less than a full sized frame, yikes.. long rtt or
5943                          * low bw?
5944                          */
5945                         min_tso = bbr_minseg(bbr);
5946                         if ((tso_len > min_tso) && (bbr_all_get_min == 0))
5947                                 new_tso = rounddown(tso_len, min_tso);
5948                         else
5949                                 new_tso = min_tso;
5950                 }
5951         } else if (bw > FIVETWELVE_MBPS) {
5952                 /*
5953                  * This guy is so fast b/w wise that we can use TSO segments
5954                  * as large as the NIC will allow.
5955                  */
5956                 new_tso = rounddown(PACE_MAX_IP_BYTES, maxseg);
5957         } else {
5958                 /*
5959                  * This formula is based on attempting to send a segment or
5960                  * more every bbr_hptsi_per_second. The default is 1000
5961                  * which means you are targeting what you can send every 1ms
5962          * based on the peer's bw.
5963                  *
5964                  * If the number drops to say 500, then you are looking more
5965                  * at 2ms and you will raise how much we send in a single
5966                  * TSO thus saving CPU (less bbr_output_wtime() calls). The
5967                  * trade off of course is you will send more at once and
5968                  * thus tend to clump up the sends into larger "bursts"
5969                  * building a queue.
5970                  */
5971                 bw /= bbr->r_ctl.bbr_hptsi_per_second;
5972                 new_tso = roundup(bw, (uint64_t)maxseg);
5973                 /*
5974                  * Gate the floor to match what our lower than 48Mbps
5975                  * algorithm does. The ceiling (bbr_hptsi_segments_max) thus
5976                  * becomes the floor for this calculation.
5977                  */
5978                 if (new_tso < (bbr->r_ctl.bbr_hptsi_segments_max * maxseg))
5979                         new_tso = (bbr->r_ctl.bbr_hptsi_segments_max * maxseg);
5980         }
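        /*
         * E.g. for this branch (assumed values): bw = 60,000,000
         * bytes/sec with bbr_hptsi_per_second = 1000 targets 60000
         * bytes per 1ms send; roundup(60000, 1460) = 61320 bytes,
         * which is then held to at least bbr_hptsi_segments_max
         * segments by the floor check above.
         */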
5981         if (bbr->r_ctl.bbr_hptsi_segments_floor && (new_tso < (maxseg * bbr->r_ctl.bbr_hptsi_segments_floor)))
5982                 new_tso = maxseg * bbr->r_ctl.bbr_hptsi_segments_floor;
5983         if (new_tso > PACE_MAX_IP_BYTES)
5984                 new_tso = rounddown(PACE_MAX_IP_BYTES, maxseg);
5985         /* Enforce an utter maximum if we are not HW-TLS */
5986 #ifdef KERN_TLS
5987         if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) == 0)
5988 #endif
5989                 if (bbr->r_ctl.bbr_utter_max && (new_tso > (bbr->r_ctl.bbr_utter_max * maxseg))) {
5990                         new_tso = bbr->r_ctl.bbr_utter_max * maxseg;
5991                 }
5992 #ifdef KERN_TLS
5993         if (tls_seg) {
5994                 /*
5995                  * Let's move the output size
5996                  * up to 1 or more TLS record sizes.
5997                  */
5998                 uint32_t temp;
5999
6000                 temp = roundup(new_tso, tls_seg);
6001                 new_tso = temp;
6002                 /* Back down if needed to under a full frame */
6003                 while (new_tso > PACE_MAX_IP_BYTES)
6004                         new_tso -= tls_seg;
6005         }
6006 #endif
6007         if (old_tso != new_tso) {
6008                 /* Only log changes */
6009                 bbr_log_type_tsosize(bbr, cts, new_tso, tls_seg, old_tso, maxseg, 0);
6010                 bbr->r_ctl.rc_pace_max_segs = new_tso;
6011         }
6012 #ifdef KERN_TLS
6013         if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) &&
6014              tls_seg) {
6015                 bbr->r_ctl.rc_pace_min_segs = tls_seg + bbr->rc_last_options;
6016         } else
6017 #endif
6018                 /* We have hardware pacing and not hardware TLS! */
6019                 bbr_adjust_for_hw_pacing(bbr, cts);
6020 }
6021
6022 static void
6023 bbr_log_output(struct tcp_bbr *bbr, struct tcpcb *tp, struct tcpopt *to, int32_t len,
6024     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t cts,
6025     struct mbuf *mb, int32_t * abandon, struct bbr_sendmap *hintrsm, uint32_t delay_calc,
6026     struct sockbuf *sb)
6027 {
6028
6029         struct bbr_sendmap *rsm, *nrsm;
6030         register uint32_t snd_max, snd_una;
6031         uint32_t pacing_time;
6032         /*
6033          * Add to the RACK log of packets in flight or retransmitted. If
6034          * there is a TS option we will use the TS echoed, if not we will
6035          * grab a TS.
6036          *
6037          * Retransmissions will increment the count and move the ts to its
6038          * proper place. Note that if options do not include TS's then we
6039          * won't be able to effectively use the ACK for an RTT on a retran.
6040          *
6041          * Notes about r_start and r_end. Let's consider a send starting at
6042          * sequence 1 for 10 bytes. In such an example the r_start would be
6043          * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
6044          * This means that r_end is actually the first sequence for the next
6045          * slot (11).
6046          *
6047          */
6048         INP_WLOCK_ASSERT(tp->t_inpcb);
6049         if (err) {
6050                 /*
6051                  * We don't log errors -- we could but snd_max does not
6052                  * advance in this case either.
6053                  */
6054                 return;
6055         }
6056         if (th_flags & TH_RST) {
6057                 /*
6058                  * We don't log resets and we return immediately from
6059                  * sending
6060                  */
6061                 *abandon = 1;
6062                 return;
6063         }
6064         snd_una = tp->snd_una;
6065         if (th_flags & (TH_SYN | TH_FIN) && (hintrsm == NULL)) {
6066                 /*
6067                  * The call to bbr_log_output is made before bumping
6068                  * snd_max. This means we must record one extra byte for a SYN
6069                  * or FIN, since each occupies sequence space, whenever seq_out
6070                  * is adding new data (i.e. we are not resending).
6071                  */
6072                 if (th_flags & TH_SYN)
6073                         len++;
6074                 if (th_flags & TH_FIN)
6075                         len++;
6076         }
6077         if (SEQ_LEQ((seq_out + len), snd_una)) {
6078                 /* Are we sending an old segment to induce an ack (keep-alive)? */
6079                 return;
6080         }
6081         if (SEQ_LT(seq_out, snd_una)) {
6082                 /* huh? should we panic? */
6083                 uint32_t end;
6084
6085                 end = seq_out + len;
6086                 seq_out = snd_una;
6087                 len = end - seq_out;
6088         }
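        /*
         * Example of the trim above (assumed numbers): seq_out = 990,
         * len = 20, snd_una = 1000. end = 1010, so we log only the
         * still-unacked span: seq_out becomes 1000 and len becomes 10.
         */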
6089         snd_max = tp->snd_max;
6090         if (len == 0) {
6091                 /* We don't log zero window probes */
6092                 return;
6093         }
6094         pacing_time = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, len, cts, 1);
6095         /* First question is it a retransmission? */
6096         if (seq_out == snd_max) {
6097 again:
6098                 rsm = bbr_alloc(bbr);
6099                 if (rsm == NULL) {
6100                         return;
6101                 }
6102                 rsm->r_flags = 0;
6103                 if (th_flags & TH_SYN)
6104                         rsm->r_flags |= BBR_HAS_SYN;
6105                 if (th_flags & TH_FIN)
6106                         rsm->r_flags |= BBR_HAS_FIN;
6107                 rsm->r_tim_lastsent[0] = cts;
6108                 rsm->r_rtr_cnt = 1;
6109                 rsm->r_rtr_bytes = 0;
6110                 rsm->r_start = seq_out;
6111                 rsm->r_end = rsm->r_start + len;
6112                 rsm->r_dupack = 0;
6113                 rsm->r_delivered = bbr->r_ctl.rc_delivered;
6114                 rsm->r_pacing_delay = pacing_time;
6115                 rsm->r_ts_valid = bbr->rc_ts_valid;
6116                 if (bbr->rc_ts_valid)
6117                         rsm->r_del_ack_ts = bbr->r_ctl.last_inbound_ts;
6118                 rsm->r_del_time = bbr->r_ctl.rc_del_time;
6119                 if (bbr->r_ctl.r_app_limited_until)
6120                         rsm->r_app_limited = 1;
6121                 else
6122                         rsm->r_app_limited = 0;
6123                 rsm->r_first_sent_time = bbr_get_earliest_send_outstanding(bbr, rsm, cts);
6124                 rsm->r_flight_at_send = ctf_flight_size(bbr->rc_tp,
6125                                                 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
6126                 /*
6127                  * Here we must also add in this rsm since snd_max
6128                  * is updated after we return from a new send.
6129                  */
6130                 rsm->r_flight_at_send += len;
6131                 TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_map, rsm, r_next);
6132                 TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
6133                 rsm->r_in_tmap = 1;
6134                 if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW)
6135                         rsm->r_bbr_state = bbr_state_val(bbr);
6136                 else
6137                         rsm->r_bbr_state = 8;
6138                 if (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT) {
6139                         rsm->r_is_gain = 1;
6140                         rsm->r_is_drain = 0;
6141                 } else if (bbr->r_ctl.rc_bbr_hptsi_gain < BBR_UNIT) {
6142                         rsm->r_is_drain = 1;
6143                         rsm->r_is_gain = 0;
6144                 } else {
6145                         rsm->r_is_drain = 0;
6146                         rsm->r_is_gain = 0;
6147                 }
6148                 return;
6149         }
6150         /*
6151          * If we reach here its a retransmission and we need to find it.
6152          */
6153 more:
6154         if (hintrsm && (hintrsm->r_start == seq_out)) {
6155                 rsm = hintrsm;
6156                 hintrsm = NULL;
6157         } else if (bbr->r_ctl.rc_next) {
6158                 /* We have a hint from a previous run */
6159                 rsm = bbr->r_ctl.rc_next;
6160         } else {
6161                 /* No hints sorry */
6162                 rsm = NULL;
6163         }
6164         if ((rsm) && (rsm->r_start == seq_out)) {
6165                 /*
6166                  * We used rc_next or hintrsm to retransmit; hopefully the
6167                  * likely case.
6168                  */
6169                 seq_out = bbr_update_entry(tp, bbr, rsm, cts, &len, pacing_time);
6170                 if (len == 0) {
6171                         return;
6172                 } else {
6173                         goto more;
6174                 }
6175         }
6176         /* Ok, it was not the last pointer; go through the map the hard way. */
6177         TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
6178                 if (rsm->r_start == seq_out) {
6179                         seq_out = bbr_update_entry(tp, bbr, rsm, cts, &len, pacing_time);
6180                         bbr->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
6181                         if (len == 0) {
6182                                 return;
6183                         } else {
6184                                 continue;
6185                         }
6186                 }
6187                 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
6188                         /* Transmitted within this piece */
6189                         /*
6190                          * Ok we must split off the front and then let the
6191                          * update do the rest
6192                          */
6193                         nrsm = bbr_alloc_full_limit(bbr);
6194                         if (nrsm == NULL) {
6195                                 bbr_update_rsm(tp, bbr, rsm, cts, pacing_time);
6196                                 return;
6197                         }
6198                         /*
6199                          * copy rsm to nrsm and then trim the front of rsm
6200                          * to not include this part.
6201                          */
6202                         bbr_clone_rsm(bbr, nrsm, rsm, seq_out);
6203                         TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
6204                         if (rsm->r_in_tmap) {
6205                                 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
6206                                 nrsm->r_in_tmap = 1;
6207                         }
6208                         rsm->r_flags &= (~BBR_HAS_FIN);
6209                         seq_out = bbr_update_entry(tp, bbr, nrsm, cts, &len, pacing_time);
6210                         if (len == 0) {
6211                                 return;
6212                         }
6213                 }
6214         }
6215         /*
6216          * Hmm, not found in the map; did they retransmit both old data and
6217          * on into the new?
6218          */
6219         if (seq_out == tp->snd_max) {
6220                 goto again;
6221         } else if (SEQ_LT(seq_out, tp->snd_max)) {
6222 #ifdef BBR_INVARIANTS
6223                 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
6224                     seq_out, len, tp->snd_una, tp->snd_max);
6225                 printf("Starting Dump of all rack entries\n");
6226                 TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
6227                         printf("rsm:%p start:%u end:%u\n",
6228                             rsm, rsm->r_start, rsm->r_end);
6229                 }
6230                 printf("Dump complete\n");
6231                 panic("seq_out not found rack:%p tp:%p",
6232                     bbr, tp);
6233 #endif
6234         } else {
6235 #ifdef BBR_INVARIANTS
6236                 /*
6237                  * Hmm beyond sndmax? (only if we are using the new rtt-pack
6238                  * flag)
6239                  */
6240                 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
6241                     seq_out, len, tp->snd_max, tp);
6242 #endif
6243         }
6244 }
6245
6246 static void
6247 bbr_collapse_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, int32_t rtt)
6248 {
6249         /*
6250          * Collapse the retransmit backoff; the cum-ack moved.
6251          */
6252         tp->t_rxtshift = 0;
6253         tp->t_softerror = 0;
6254 }
6255
6256
6257 static void
6258 tcp_bbr_xmit_timer(struct tcp_bbr *bbr, uint32_t rtt_usecs, uint32_t rsm_send_time, uint32_t r_start, uint32_t tsin)
6259 {
6260         bbr->rtt_valid = 1;
6261         bbr->r_ctl.cur_rtt = rtt_usecs;
6262         bbr->r_ctl.ts_in = tsin;
6263         if (rsm_send_time)
6264                 bbr->r_ctl.cur_rtt_send_time = rsm_send_time;
6265 }
6266
6267 static void
6268 bbr_make_timestamp_determination(struct tcp_bbr *bbr)
6269 {
6270         /**
6271          * We have in our bbr control:
6272          * 1) The timestamp we started observing cum-acks (bbr->r_ctl.bbr_ts_check_tstmp).
6273          * 2) Our timestamp indicating when we sent that packet (bbr->r_ctl.rsm->bbr_ts_check_our_cts).
6274          * 3) The current timestamp that just came in (bbr->r_ctl.last_inbound_ts)
6275          * 4) The time that the packet that generated that ack was sent (bbr->r_ctl.cur_rtt_send_time)
6276          *
6277          * Now we can calculate the time between the sends by doing:
6278          *
6279          * delta = bbr->r_ctl.cur_rtt_send_time - bbr->r_ctl.bbr_ts_check_our_cts
6280          *
6281          * And the peer's time between receiving them by doing:
6282          *
6283          * peer_delta = bbr->r_ctl.last_inbound_ts - bbr->r_ctl.bbr_ts_check_tstmp
6284          *
6285          * We want to figure out if the timestamp values are in msec, 10msec or usec.
6286          * We also may find that we can't use the timestamps if say we see
6287          * that the peer_delta indicates that though we may have taken 10ms to
6288          * pace out the data, it only saw 1ms between the two packets. This would
6289          * indicate that somewhere on the path is a batching entity that is giving
6290          * out time-slices of the actual b/w. This would mean we could not
6291          * reliably use the peer's timestamps.
6292          *
6293          * We expect delta > peer_delta initially, at least until we figure out
6294          * the timestamp ratio, which we will store in bbr->r_ctl.bbr_peer_tsratio.
6295          * If we place 1000 there then it's a ms clock vs our usec clock. If we
6296          * place 10000 there then it's a 10ms clock vs our usec clock. If the peer
6297          * is running a usec clock we would put a 1 there. If the peer's clock is
6298          * faster than ours, we will disable the use of timestamps (though we could
6299          * revisit this later if we find it to be more than an isolated flow or two).
6300          *
6301          * To detect the batching middle boxes we will come up with our compensation and
6302          * if with it in place, we find the peer is drastically off (by some margin) in
6303          * the smaller direction, then we will assume the worst case and disable use of timestamps.
6304          *
6305          */
6306         uint64_t delta, peer_delta, delta_up;
6307
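        /*
         * A worked example with made-up numbers: if our two sends were
         * 30000 usec apart (delta = 30000) and the peer's echoed
         * timestamps advanced by 30 (peer_delta = 30), then a millisecond
         * peer clock explains the spread exactly (peer_delta * 1000 ==
         * delta), and with the default tolerance the checks below would
         * store bbr_peer_tsratio = 1000.
         */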
6308         delta = bbr->r_ctl.cur_rtt_send_time - bbr->r_ctl.bbr_ts_check_our_cts;
6309         if (delta < bbr_min_usec_delta) {
6310                 /*
6311                  * We have not yet seen the minimum amount
6312                  * of time between our send times, so we
6313                  * cannot yet make a determination of the
6314                  * timestamp granularity.
6315                  */
6316                 return;
6317         }
6318         peer_delta = bbr->r_ctl.last_inbound_ts - bbr->r_ctl.bbr_ts_check_tstmp;
6319         if (peer_delta < bbr_min_peer_delta) {
6320                 /*
6321                  * We may have enough in the form of
6322                  * our delta but the peer's number
6323                  * has not changed that much. It could
6324                  * be its clock ratio is such that
6325                  * we need more data (10ms tick) or
6326                  * there may be other compression scenarios
6327                  * going on. In any event we need the
6328                  * spread to be larger.
6329                  */
6330                 return;
6331         }
6332         /* Ok, let's first see which way our delta is going */
6333         if (peer_delta > delta) {
6334                 /* Very unlikely: even without compensation
6335                  * the peer shows that it saw the two sends
6336                  * arrive further apart than we sent them,
6337                  * in micro-seconds.
6338                  */
6339                 if (peer_delta < (delta + ((delta * (uint64_t)1000)/ (uint64_t)bbr_delta_percent))) {
6340                         /* Well, it looks like the peer is running a micro-second clock. */
6341                         bbr->rc_ts_clock_set = 1;
6342                         bbr->r_ctl.bbr_peer_tsratio = 1;
6343                 } else {
6344                         bbr->rc_ts_cant_be_used = 1;
6345                         bbr->rc_ts_clock_set = 1;
6346                 }
6347                 return;
6348         }
6349         /* Ok we know that the peer_delta is smaller than our send distance */
6350         bbr->rc_ts_clock_set = 1;
6351         /* First question: is it within the tolerance percentage for a usec clock? */
6352         delta_up = (peer_delta * 1000) / (uint64_t)bbr_delta_percent;
6353         if ((peer_delta + delta_up) >= delta) {
6354                 /* Its a usec clock */
6355                 bbr->r_ctl.bbr_peer_tsratio = 1;
6356                 bbr_log_tstmp_validation(bbr, peer_delta, delta);
6357                 return;
6358         }
6359         /* Ok if not usec, what about 10usec (though unlikely)? */
6360         delta_up = (peer_delta * 1000 * 10) / (uint64_t)bbr_delta_percent;
6361         if (((peer_delta * 10) + delta_up) >= delta) {
6362                 bbr->r_ctl.bbr_peer_tsratio = 10;
6363                 bbr_log_tstmp_validation(bbr, peer_delta, delta);
6364                 return;
6365         }
6366         /* And what about 100usec (though again unlikely)? */
6367         delta_up = (peer_delta * 1000 * 100) / (uint64_t)bbr_delta_percent;
6368         if (((peer_delta * 100) + delta_up) >= delta) {
6369                 bbr->r_ctl.bbr_peer_tsratio = 100;
6370                 bbr_log_tstmp_validation(bbr, peer_delta, delta);
6371                 return;
6372         }
6373         /* And how about 1 msec (the most likely one)? */
6374         delta_up = (peer_delta * 1000 * 1000) / (uint64_t)bbr_delta_percent;
6375         if (((peer_delta * 1000) + delta_up) >= delta) {
6376                 bbr->r_ctl.bbr_peer_tsratio = 1000;
6377                 bbr_log_tstmp_validation(bbr, peer_delta, delta);
6378                 return;
6379         }
6380         /* Ok if not msec could it be 10 msec? */
6381         delta_up = (peer_delta * 1000 * 10000) / (uint64_t)bbr_delta_percent;
6382         if (((peer_delta * 10000) + delta_up) >= delta) {
6383                 bbr->r_ctl.bbr_peer_tsratio = 10000;
6384                 return;
6385         }
6386         /* If we fall down to here, the clock ticks so slowly we can't use it */
6387         bbr->rc_ts_cant_be_used = 1;
6388         bbr->r_ctl.bbr_peer_tsratio = 0;
6389         bbr_log_tstmp_validation(bbr, peer_delta, delta);
6390 }
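
/*
 * The five tests above instantiate one pattern per candidate clock ratio.
 * A compact, editorial sketch of that pattern (illustrative only, not the
 * code used above; "ratios" and the return style are invented here):
 *
 *      static const uint64_t ratios[] = { 1, 10, 100, 1000, 10000 };
 *
 *      for (i = 0; i < nitems(ratios); i++) {
 *              scaled = peer_delta * ratios[i];
 *              slack = (scaled * 1000) / (uint64_t)bbr_delta_percent;
 *              if ((scaled + slack) >= delta)
 *                      return (ratios[i]);     <- becomes bbr_peer_tsratio
 *      }
 *      return (0);                             <- clock too slow to be usable
 */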
6391
6392 /*
6393  * Collect new round-trip time estimate
6394  * and update averages and current timeout.
6395  */
6396 static void
6397 tcp_bbr_xmit_timer_commit(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts)
6398 {
6399         int32_t delta;
6400         uint32_t rtt, tsin;
6401         int32_t rtt_ticks;
6402
6403
6404         if (bbr->rtt_valid == 0)
6405                 /* No valid sample */
6406                 return;
6407
6408         rtt = bbr->r_ctl.cur_rtt;
6409         tsin = bbr->r_ctl.ts_in;
6410         if (bbr->rc_prtt_set_ts) {
6411                 /*
6412                  * We are to force feed the rttProp filter due
6413                  * to an entry into PROBE_RTT. This assures
6414                  * that the times are sync'd between when we
6415                  * go into PROBE_RTT and the filter expiration.
6416                  *
6417                  * Google does not use a true filter, so they do
6418                  * this implicitly since they only keep one value
6419                  * and when they enter probe-rtt they update the
6420                  * value to the newest rtt.
6421                  */
6422                 uint32_t rtt_prop;
6423
6424                 bbr->rc_prtt_set_ts = 0;
6425                 rtt_prop = get_filter_value_small(&bbr->r_ctl.rc_rttprop);
6426                 if (rtt > rtt_prop)
6427                         filter_increase_by_small(&bbr->r_ctl.rc_rttprop, (rtt - rtt_prop), cts);
6428                 else
6429                         apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
6430         }
6431         if (bbr->rc_ack_was_delayed)
6432                 rtt += bbr->r_ctl.rc_ack_hdwr_delay;
6433
6434         if (rtt < bbr->r_ctl.rc_lowest_rtt)
6435                 bbr->r_ctl.rc_lowest_rtt = rtt;
6436         bbr_log_rtt_sample(bbr, rtt, tsin);
6437         if (bbr->r_init_rtt) {
6438                 /*
6439                  * The initial rtt is not trusted; nuke it and let's get
6440                  * our first valid measurement in.
6441                  */
6442                 bbr->r_init_rtt = 0;
6443                 tp->t_srtt = 0;
6444         }
6445         if ((bbr->rc_ts_clock_set == 0) && bbr->rc_ts_valid) {
6446                 /*
6447                  * So we have not yet figured out
6448                  * what units the peer's TSTMP value is
6449                  * in (most likely ms). We need a
6450                  * series of cum-ack's to determine
6451                  * this reliably.
6452                  */
6453                 if (bbr->rc_ack_is_cumack) {
6454                         if (bbr->rc_ts_data_set) {
6455                                 /* Lets attempt to determine the timestamp granularity. */
6456                                 bbr_make_timestamp_determination(bbr);
6457                         } else {
6458                                 bbr->rc_ts_data_set = 1;
6459                                 bbr->r_ctl.bbr_ts_check_tstmp = bbr->r_ctl.last_inbound_ts;
6460                                 bbr->r_ctl.bbr_ts_check_our_cts = bbr->r_ctl.cur_rtt_send_time;
6461                         }
6462                 } else {
6463                         /*
6464                          * We have to have consecutive cum-acks;
6465                          * reset any "filled" state to none.
6466                          */
6467                         bbr->rc_ts_data_set = 0;
6468                 }
6469         }
6470         /* Round it up */
6471         rtt_ticks = USEC_2_TICKS((rtt + (USECS_IN_MSEC - 1)));
6472         if (rtt_ticks == 0)
6473                 rtt_ticks = 1;
6474         if (tp->t_srtt != 0) {
6475                 /*
6476                  * srtt is stored as fixed point with 5 bits after the
6477                  * binary point (i.e., scaled by 32).  The following magic is
6478                  * equivalent to the smoothing algorithm in rfc793 with an
6479                  * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point).
6480                  * Adjust rtt to origin 0.
6481                  */
6482
6483                 delta = ((rtt_ticks - 1) << TCP_DELTA_SHIFT)
6484                     - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
6485
6486                 tp->t_srtt += delta;
6487                 if (tp->t_srtt <= 0)
6488                         tp->t_srtt = 1;
6489
6490                 /*
6491                  * We accumulate a smoothed rtt variance (actually, a
6492                  * smoothed mean difference), then set the retransmit timer
6493                  * to smoothed rtt + 4 times the smoothed variance. rttvar
6494                  * is stored as fixed point with 4 bits after the binary
6495                  * point (scaled by 16).  The following is equivalent to
6496                  * rfc793 smoothing with an alpha of .75 (rttvar =
6497                  * rttvar*3/4 + |delta| / 4).  This replaces rfc793's
6498                  * wired-in beta.
6499                  */
6500                 if (delta < 0)
6501                         delta = -delta;
6502                 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
6503                 tp->t_rttvar += delta;
6504                 if (tp->t_rttvar <= 0)
6505                         tp->t_rttvar = 1;
6506                 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
6507                         tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
6508         } else {
6509                 /*
6510                  * No rtt measurement yet - use the unsmoothed rtt. Set the
6511                  * variance to half the rtt (so our first retransmit happens
6512                  * at 3*rtt).
6513                  */
6514                 tp->t_srtt = rtt_ticks << TCP_RTT_SHIFT;
6515                 tp->t_rttvar = rtt_ticks << (TCP_RTTVAR_SHIFT - 1);
6516                 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
6517         }
6518         KMOD_TCPSTAT_INC(tcps_rttupdated);
6519         tp->t_rttupdated++;
6520 #ifdef STATS
6521         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt_ticks));
6522 #endif
6523         /*
6524          * the retransmit should happen at rtt + 4 * rttvar. Because of the
6525          * way we do the smoothing, srtt and rttvar will each average +1/2
6526          * tick of bias.  When we compute the retransmit timer, we want 1/2
6527          * tick of rounding and 1 extra tick because of +-1/2 tick
6528          * uncertainty in the firing of the timer.  The bias will give us
6529          * exactly the 1.5 tick we need.  But, because the bias is
6530          * statistical, we have to test that we don't drop below the minimum
6531          * feasible timer (which is 2 ticks).
6532          */
6533         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
6534             max(MSEC_2_TICKS(bbr->r_ctl.rc_min_rto_ms), rtt_ticks + 2),
6535             MSEC_2_TICKS(((uint32_t)bbr->rc_max_rto_sec) * 1000));
6536
6537         /*
6538          * We received an ack for a packet that wasn't retransmitted; it is
6539          * probably safe to discard any error indications we've received
6540          * recently.  This isn't quite right, but close enough for now (a
6541          * route might have failed after we sent a segment, and the return
6542          * path might not be symmetrical).
6543          */
6544         tp->t_softerror = 0;
6545         rtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT);
6546         if (bbr->r_ctl.bbr_smallest_srtt_this_state > rtt)
6547                 bbr->r_ctl.bbr_smallest_srtt_this_state = rtt;
6548 }
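
/*
 * A worked example of the fixed-point smoothing above, with illustrative
 * numbers: TCP_RTT_SHIFT is 5 and TCP_DELTA_SHIFT is 2, so a stored
 * t_srtt of 3200 represents 100 ticks. If a 120-tick sample arrives:
 *
 *      delta  = ((120 - 1) << 2) - (3200 >> 3) = 476 - 400 = 76
 *      t_srtt = 3200 + 76 = 3276, i.e. ~102.4 ticks
 *
 * The sample pulls the average up by roughly 1/8 of the difference,
 * matching the alpha of .875 described in the comment.
 */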
6549
6550 static void
6551 bbr_earlier_retran(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm,
6552                    uint32_t t, uint32_t cts, int ack_type)
6553 {
6554         /*
6555          * For this RSM, we acknowledged the data from a previous
6556          * transmission, not the last one we made. This means we did a false
6557          * retransmit.
6558          */
6559         if (rsm->r_flags & BBR_HAS_FIN) {
6560                 /*
6561                  * The FIN is often sent multiple times once we
6562                  * have everything outstanding ack'd. We ignore this case
6563                  * since it's over now.
6564                  */
6565                 return;
6566         }
6567         if (rsm->r_flags & BBR_TLP) {
6568                 /*
6569                  * We expect this to occur often with TLPs.
6570                  */
6571                 bbr->rc_tlp_rtx_out = 0;
6572                 return;
6573         }
6574         if (ack_type != BBR_CUM_ACKED) {
6575                 /*
6576                  * If it was not a cum-ack we
6577                  * don't really know for sure since
6578                  * the timestamp could be from some
6579                  * other transmission.
6580                  */
6581                 return;
6582         }
6583
6584         if (rsm->r_flags & BBR_WAS_SACKPASS) {
6585                 /*
6586                  * We retransmitted based on a sack and the earlier
6587                  * transmission was ack'd - reordering is occurring.
6588                  */
6589                 BBR_STAT_INC(bbr_reorder_seen);
6590                 bbr->r_ctl.rc_reorder_ts = cts;
6591         }
6592         /* Back down the loss count */
6593         if (rsm->r_flags & BBR_MARKED_LOST) {
6594                 bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
6595                 bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
6596                 rsm->r_flags &= ~BBR_MARKED_LOST;
6597                 if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
6598                         /* LT sampling also needs adjustment */
6599                         bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
6600         }
6601         /***** RRS HERE ************************/
6602         /* Do we need to do this???            */
6603         /* bbr_reset_lt_bw_sampling(bbr, cts); */
6604         /***** RRS HERE ************************/
6605         BBR_STAT_INC(bbr_badfr);
6606         BBR_STAT_ADD(bbr_badfr_bytes, (rsm->r_end - rsm->r_start));
6607 }
6608
6609
6610 static void
6611 bbr_set_reduced_rtt(struct tcp_bbr *bbr, uint32_t cts, uint32_t line)
6612 {
6613         bbr->r_ctl.rc_rtt_shrinks = cts;
6614         if (bbr_can_force_probertt &&
6615             (TSTMP_GT(cts, bbr->r_ctl.last_in_probertt)) &&
6616             ((cts - bbr->r_ctl.last_in_probertt) > bbr->r_ctl.rc_probertt_int)) {
6617                 /*
6618                  * We should enter probe-rtt; it's been too long
6619                  * since we have been there.
6620                  */
6621                 bbr_enter_probe_rtt(bbr, cts, __LINE__);
6622         } else
6623                 bbr_check_probe_rtt_limits(bbr, cts);
6624 }
6625
6626 static void
6627 tcp_bbr_commit_bw(struct tcp_bbr *bbr, uint32_t cts)
6628 {
6629         uint64_t orig_bw;
6630
6631         if (bbr->r_ctl.rc_bbr_cur_del_rate == 0) {
6632                 /* We never apply a zero measurement */
6633                 bbr_log_type_bbrupd(bbr, 20, cts, 0, 0,
6634                                     0, 0, 0, 0, 0, 0);
6635                 return;
6636         }
6637         if (bbr->r_ctl.r_measurement_count < 0xffffffff)
6638                 bbr->r_ctl.r_measurement_count++;
6639         orig_bw = get_filter_value(&bbr->r_ctl.rc_delrate);
6640         apply_filter_max(&bbr->r_ctl.rc_delrate, bbr->r_ctl.rc_bbr_cur_del_rate, bbr->r_ctl.rc_pkt_epoch);
6641         bbr_log_type_bbrupd(bbr, 21, cts, (uint32_t)orig_bw,
6642                             (uint32_t)get_filter_value(&bbr->r_ctl.rc_delrate),
6643                             0, 0, 0, 0, 0, 0);
6644         if (orig_bw &&
6645             (orig_bw != get_filter_value(&bbr->r_ctl.rc_delrate))) {
6646                 if (bbr->bbr_hdrw_pacing) {
6647                         /*
6648                          * Possibly apply a new rate to the
6649                          * hardware.
6650                          */
6651                         bbr_update_hardware_pacing_rate(bbr, cts);
6652                 }
6653                 bbr_set_state_target(bbr, __LINE__);
6654                 tcp_bbr_tso_size_check(bbr, cts);
6655                 if (bbr->r_recovery_bw)  {
6656                         bbr_setup_red_bw(bbr, cts);
6657                         bbr_log_type_bw_reduce(bbr, BBR_RED_BW_USELRBW);
6658                 }
6659         } else if ((orig_bw == 0) && get_filter_value(&bbr->r_ctl.rc_delrate))
6660                 tcp_bbr_tso_size_check(bbr, cts);
6661 }
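
/*
 * Editorial note: rc_delrate is a time-windowed max filter (see
 * sys/tim_filter.h), advanced here by packet epoch. A commit can
 * therefore only raise the published b/w estimate immediately; lower
 * measurements take effect as older maxima age out of the window.
 * That is why the pacing/TSO/state-target refresh above keys off any
 * change in get_filter_value(&bbr->r_ctl.rc_delrate).
 */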
6662
6663 static void
6664 bbr_nf_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts)
6665 {
6666         if (bbr->rc_in_persist == 0) {
6667                 /* We log only when not in persist */
6668                 /* Translate to a Bytes Per Second */
6669                 uint64_t tim, bw, ts_diff, ts_bw;
6670                 uint32_t upper, lower, delivered;
6671
6672                 if (TSTMP_GT(bbr->r_ctl.rc_del_time, rsm->r_del_time))
6673                         tim = (uint64_t)(bbr->r_ctl.rc_del_time - rsm->r_del_time);
6674                 else
6675                         tim = 1;
6676                 /*
6677                  * Now that we have processed the tim (skipping the sample
6678                  * or possibly updating the time), go ahead and
6679                  * calculate the cdr (current delivery rate).
6680                  */
6681                 delivered = (bbr->r_ctl.rc_delivered - rsm->r_delivered);
6682                 bw = (uint64_t)delivered;
6683                 bw *= (uint64_t)USECS_IN_SECOND;
6684                 bw /= tim;
6685                 if (bw == 0) {
6686                         /* We must have a calculable amount */
6687                         return;
6688                 }
6689                 upper = (bw >> 32) & 0x00000000ffffffff;
6690                 lower = bw & 0x00000000ffffffff;
6691                 /*
6692                  * If we are using this b/w, shove it in now so we
6693                  * can see in the trace viewer if it gets overridden.
6694                  */
6695                 if (rsm->r_ts_valid &&
6696                     bbr->rc_ts_valid &&
6697                     bbr->rc_ts_clock_set &&
6698                     (bbr->rc_ts_cant_be_used == 0) &&
6699                     bbr->rc_use_ts_limit) {
6700                         ts_diff = max((bbr->r_ctl.last_inbound_ts - rsm->r_del_ack_ts), 1);
6701                         ts_diff *= bbr->r_ctl.bbr_peer_tsratio;
6702                         if ((delivered == 0) ||
6703                             (rtt < 1000)) {
6704                                 /* Can't use the ts */
6705                                 bbr_log_type_bbrupd(bbr, 61, cts,
6706                                                     ts_diff,
6707                                                     bbr->r_ctl.last_inbound_ts,
6708                                                     rsm->r_del_ack_ts, 0,
6709                                                     0, 0, 0, delivered);
6710                         } else {
6711                                 ts_bw = (uint64_t)delivered;
6712                                 ts_bw *= (uint64_t)USECS_IN_SECOND;
6713                                 ts_bw /= ts_diff;
6714                                 bbr_log_type_bbrupd(bbr, 62, cts,
6715                                                     (ts_bw >> 32),
6716                                                     (ts_bw & 0xffffffff), 0, 0,
6717                                                     0, 0, ts_diff, delivered);
6718                                 if ((bbr->ts_can_raise) &&
6719                                     (ts_bw > bw)) {
6720                                         bbr_log_type_bbrupd(bbr, 8, cts,
6721                                                             delivered,
6722                                                             ts_diff,
6723                                                             (bw >> 32),
6724                                                             (bw & 0x00000000ffffffff),
6725                                                             0, 0, 0, 0);
6726                                         bw = ts_bw;
6727                                 } else if (ts_bw && (ts_bw < bw)) {
6728                                         bbr_log_type_bbrupd(bbr, 7, cts,
6729                                                             delivered,
6730                                                             ts_diff,
6731                                                             (bw >> 32),
6732                                                             (bw & 0x00000000ffffffff),
6733                                                             0, 0, 0, 0);
6734                                         bw = ts_bw;
6735                                 }
6736                         }
6737                 }
6738                 if (rsm->r_first_sent_time &&
6739                     TSTMP_GT(rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)],rsm->r_first_sent_time)) {
6740                         uint64_t sbw, sti;
6741                         /*
6742                          * We use what was in flight at the time of our
6743                          * send and the size of this send to figure
6744                          * out what we have been sending (the amount).
6745                          * For the time, we take the span from the send
6746                          * of the first outstanding send until this
6747                          * send, plus this send's pacing time. This
6748                          * gives us a good calculation as to the rate
6749                          * we have been sending at.
6750                          */
6751
6752                         sbw = (uint64_t)(rsm->r_flight_at_send);
6753                         sbw *= (uint64_t)USECS_IN_SECOND;
6754                         sti = rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)] - rsm->r_first_sent_time;
6755                         sti += rsm->r_pacing_delay;
6756                         sbw /= sti;
6757                         if (sbw < bw) {
6758                                 bbr_log_type_bbrupd(bbr, 6, cts,
6759                                                     delivered,
6760                                                     (uint32_t)sti,
6761                                                     (bw >> 32),
6762                                                     (uint32_t)bw,
6763                                                     rsm->r_first_sent_time, 0, (sbw >> 32),
6764                                                     (uint32_t)sbw);
6765                                 bw = sbw;
6766                         }
6767                 }
6768                 /* Use the google algorithm for b/w measurements */
6769                 bbr->r_ctl.rc_bbr_cur_del_rate = bw;
6770                 if ((rsm->r_app_limited == 0) ||
6771                     (bw > get_filter_value(&bbr->r_ctl.rc_delrate))) {
6772                         tcp_bbr_commit_bw(bbr, cts);
6773                         bbr_log_type_bbrupd(bbr, 10, cts, (uint32_t)tim, delivered,
6774                                             0, 0, 0, 0,  bbr->r_ctl.rc_del_time,  rsm->r_del_time);
6775                 }
6776         }
6777 }
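
/*
 * A worked example of the delivery-rate math above (made-up numbers):
 * with delivered = 28960 bytes and tim = 20000 usec,
 *
 *      bw = 28960 * 1000000 / 20000 = 1448000 bytes/sec (~11.6 Mbit/s)
 *
 * The send-rate gate can then cap the sample: with r_flight_at_send =
 * 14480 bytes and sti = 12000 usec, sbw = 14480 * 1000000 / 12000 =
 * 1206666 bytes/sec, and since sbw < bw the sample is reduced to sbw.
 */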
6778
6779 static void
6780 bbr_google_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts)
6781 {
6782         if (bbr->rc_in_persist == 0) {
6783                 /* We log only when not in persist */
6784                 /* Translate to a Bytes Per Second */
6785                 uint64_t tim, bw;
6786                 uint32_t upper, lower, delivered;
6787                 int no_apply = 0;
6788
6789                 if (TSTMP_GT(bbr->r_ctl.rc_del_time, rsm->r_del_time))
6790                         tim = (uint64_t)(bbr->r_ctl.rc_del_time - rsm->r_del_time);
6791                 else
6792                         tim = 1;
6793                 /*
6794                  * Now that we have processed the tim (skipping the sample
6795                  * or possibly updating the time), go ahead and
6796                  * calculate the cdr (current delivery rate).
6797                  */
6798                 delivered = (bbr->r_ctl.rc_delivered - rsm->r_delivered);
6799                 bw = (uint64_t)delivered;
6800                 bw *= (uint64_t)USECS_IN_SECOND;
6801                 bw /= tim;
6802                 if (tim < bbr->r_ctl.rc_lowest_rtt) {
6803                         bbr_log_type_bbrupd(bbr, 99, cts, (uint32_t)tim, delivered,
6804                                             tim, bbr->r_ctl.rc_lowest_rtt, 0, 0, 0, 0);
6805
6806                         no_apply = 1;
6807                 }
6808                 upper = (bw >> 32) & 0x00000000ffffffff;
6809                 lower = bw & 0x00000000ffffffff;
6810                 /*
6811                  * If we are using this b/w, shove it in now so we
6812                  * can see in the trace viewer if it gets overridden.
6813                  */
6814                 bbr->r_ctl.rc_bbr_cur_del_rate = bw;
6815                 /* Gate by the sending rate */
6816                 if (rsm->r_first_sent_time &&
6817                     TSTMP_GT(rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)],rsm->r_first_sent_time)) {
6818                         uint64_t sbw, sti;
6819                         /*
6820                          * We use what was in flight at the time of our
6821                          * send and the size of this send to figure
6822                          * out what we have been sending (the amount).
6823                          * For the time, we take the span from the send
6824                          * of the first outstanding send until this
6825                          * send, plus this send's pacing time. This
6826                          * gives us a good calculation as to the rate
6827                          * we have been sending at.
6828                          */
6829
6830                         sbw = (uint64_t)(rsm->r_flight_at_send);
6831                         sbw *= (uint64_t)USECS_IN_SECOND;
6832                         sti = rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)] - rsm->r_first_sent_time;
6833                         sti += rsm->r_pacing_delay;
6834                         sbw /= sti;
6835                         if (sbw < bw) {
6836                                 bbr_log_type_bbrupd(bbr, 6, cts,
6837                                                     delivered,
6838                                                     (uint32_t)sti,
6839                                                     (bw >> 32),
6840                                                     (uint32_t)bw,
6841                                                     rsm->r_first_sent_time, 0, (sbw >> 32),
6842                                                     (uint32_t)sbw);
6843                                 bw = sbw;
6844                         }
6845                         if ((sti > tim) &&
6846                             (sti < bbr->r_ctl.rc_lowest_rtt)) {
6847                                 bbr_log_type_bbrupd(bbr, 99, cts, (uint32_t)tim, delivered,
6848                                                     (uint32_t)sti, bbr->r_ctl.rc_lowest_rtt, 0, 0, 0, 0);
6849                                 no_apply = 1;
6850                         } else
6851                                 no_apply = 0;
6852                 }
6853                 bbr->r_ctl.rc_bbr_cur_del_rate = bw;
6854                 if ((no_apply == 0) &&
6855                     ((rsm->r_app_limited == 0) ||
6856                      (bw > get_filter_value(&bbr->r_ctl.rc_delrate)))) {
6857                         tcp_bbr_commit_bw(bbr, cts);
6858                         bbr_log_type_bbrupd(bbr, 10, cts, (uint32_t)tim, delivered,
6859                                             0, 0, 0, 0, bbr->r_ctl.rc_del_time,  rsm->r_del_time);
6860                 }
6861         }
6862 }
6863
6864
6865 static void
6866 bbr_update_bbr_info(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts, uint32_t tsin,
6867     uint32_t uts, int32_t match, uint32_t rsm_send_time, int32_t ack_type, struct tcpopt *to)
6868 {
6869         uint64_t old_rttprop;
6870
6871         /* Update our delivery time and amount */
6872         bbr->r_ctl.rc_delivered += (rsm->r_end - rsm->r_start);
6873         bbr->r_ctl.rc_del_time = cts;
6874         if (rtt == 0) {
6875                 /*
6876                  * 0 means it's a retransmit; for now we don't use these for
6877                  * the rest of BBR.
6878                  */
6879                 return;
6880         }
6881         if ((bbr->rc_use_google == 0) &&
6882             (match != BBR_RTT_BY_EXACTMATCH) &&
6883             (match != BBR_RTT_BY_TIMESTAMP)){
6884                 /*
6885                  * We get a lot of rtt updates; let's not pay attention to
6886                  * any that are not an exact match. That way we don't have
6887                  * to worry about timestamps and the whole nonsense of being
6888                  * unsure if it's a retransmission etc. (if we ever had the
6889                  * timestamp fixed to always have the last thing sent, this
6890                  * would not be an issue).
6891                  */
6892                 return;
6893         }
6894         if ((bbr_no_retran && bbr->rc_use_google) &&
6895             (match != BBR_RTT_BY_EXACTMATCH) &&
6896             (match != BBR_RTT_BY_TIMESTAMP)){
6897                 /*
6898                  * We only do measurements in google mode
6899                  * with bbr_no_retran on for sure things.
6900                  */
6901                 return;
6902         }
6903         /* Only update srtt if we know by exact match */
6904         tcp_bbr_xmit_timer(bbr, rtt, rsm_send_time, rsm->r_start, tsin);
6905         if (ack_type == BBR_CUM_ACKED)
6906                 bbr->rc_ack_is_cumack = 1;
6907         else
6908                 bbr->rc_ack_is_cumack = 0;
6909         old_rttprop = bbr_get_rtt(bbr, BBR_RTT_PROP);
6910          * Note the following code differs from the original
6911          * BBR spec. It calls for <= not <. However, after a
6912          * long discussion in email with Neal, he acknowledged
6913          * that it should be < so that we will have flows
6914          * going into probe-rtt (we were seeing cases where that
6915          * did not happen and caused ugly things to occur). We
6916          * have added this agreed-upon fix to our code base.
6917          * have added this agreed upon fix to our code base.
6918          */
6919         if (rtt < old_rttprop) {
6920                 /* Update when we last saw a rtt drop */
6921                 bbr_log_rtt_shrinks(bbr, cts, 0, rtt, __LINE__, BBR_RTTS_NEWRTT, 0);
6922                 bbr_set_reduced_rtt(bbr, cts, __LINE__);
6923         }
6924         bbr_log_type_bbrrttprop(bbr, rtt, (rsm ? rsm->r_end : 0), uts, cts,
6925             match, rsm->r_start, rsm->r_flags);
6926         apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
6927         if (old_rttprop != bbr_get_rtt(bbr, BBR_RTT_PROP)) {
6928                 /*
6929                  * The RTT-prop moved, reset the target (may be a
6930                  * nop for some states).
6931                  */
6932                 bbr_set_state_target(bbr, __LINE__);
6933                 if (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT)
6934                         bbr_log_rtt_shrinks(bbr, cts, 0, 0,
6935                                             __LINE__, BBR_RTTS_NEW_TARGET, 0);
6936                 else if (old_rttprop < bbr_get_rtt(bbr, BBR_RTT_PROP))
6937                         /* It went up */
6938                         bbr_check_probe_rtt_limits(bbr, cts);
6939         }
6940         if ((bbr->rc_use_google == 0) &&
6941             (match == BBR_RTT_BY_TIMESTAMP)) {
6942                 /*
6943                  * We don't do b/w update with
6944                  * these since they are not really
6945                  * reliable.
6946                  */
6947                 return;
6948         }
6949         if (bbr->r_ctl.r_app_limited_until &&
6950             (bbr->r_ctl.rc_delivered >= bbr->r_ctl.r_app_limited_until)) {
6951                 /* We are no longer app-limited */
6952                 bbr->r_ctl.r_app_limited_until = 0;
6953         }
6954         if (bbr->rc_use_google) {
6955                 bbr_google_measurement(bbr, rsm, rtt, cts);
6956         } else {
6957                 bbr_nf_measurement(bbr, rsm, rtt, cts);
6958         }
6959 }
6960
6961 /*
6962  * Convert a timestamp that the main stack
6963  * uses (milliseconds) into one that bbr uses
6964  * (microseconds). Return that converted timestamp.
6965  */
6966 static uint32_t
6967 bbr_ts_convert(uint32_t cts) {
6968         uint32_t sec, msec;
6969
6970         sec = cts / MS_IN_USEC;
6971         msec = cts - (MS_IN_USEC * sec);
6972         return ((sec * USECS_IN_SECOND) + (msec * MS_IN_USEC));
6973 }
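
/*
 * For example, bbr_ts_convert(2005) (2.005 seconds expressed in msec)
 * returns 2 * 1000000 + 5 * 1000 = 2005000 usec. The result is
 * arithmetically just cts * 1000; the sec/msec decomposition above
 * merely makes the unit conversion explicit.
 */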
6974
6975 /*
6976  * Return 0 if we did not update the RTT time, return
6977  * 1 if we did.
6978  */
6979 static int
6980 bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr,
6981     struct bbr_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, uint32_t th_ack)
6982 {
6983         int32_t i;
6984         uint32_t t, uts = 0;
6985
6986         if ((rsm->r_flags & BBR_ACKED) ||
6987             (rsm->r_flags & BBR_WAS_RENEGED) ||
6988             (rsm->r_flags & BBR_RXT_CLEARED)) {
6989                 /* Already done */
6990                 return (0);
6991         }
6992         if (rsm->r_rtr_cnt == 1) {
6993                 /*
6994                  * Only one transmit. Hopefully the normal case.
6995                  */
6996                 if (TSTMP_GT(cts, rsm->r_tim_lastsent[0]))
6997                         t = cts - rsm->r_tim_lastsent[0];
6998                 else
6999                         t = 1;
7000                 if ((int)t <= 0)
7001                         t = 1;
7002                 bbr->r_ctl.rc_last_rtt = t;
7003                 bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, 0,
7004                                     BBR_RTT_BY_EXACTMATCH, rsm->r_tim_lastsent[0], ack_type, to);
7005                 return (1);
7006         }
7007         /* Convert to usecs */
7008         if ((bbr_can_use_ts_for_rtt == 1) &&
7009             (bbr->rc_use_google == 1) &&
7010             (ack_type == BBR_CUM_ACKED) &&
7011             (to->to_flags & TOF_TS) &&
7012             (to->to_tsecr != 0)) {
7013
7014                 t = tcp_tv_to_mssectick(&bbr->rc_tv) - to->to_tsecr;
7015                 if (t < 1)
7016                         t = 1;
7017                 t *= MS_IN_USEC;
7018                 bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, 0,
7019                                     BBR_RTT_BY_TIMESTAMP,
7020                                     rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)],
7021                                     ack_type, to);
7022                 return (1);
7023         }
7024         uts = bbr_ts_convert(to->to_tsecr);
7025         if ((to->to_flags & TOF_TS) &&
7026             (to->to_tsecr != 0) &&
7027             (ack_type == BBR_CUM_ACKED) &&
7028             ((rsm->r_flags & BBR_OVERMAX) == 0)) {
7029                 /*
7030                  * Now which timestamp does it match? In this block the ACK
7031                  * may be coming from a previous transmission.
7032                  */
7033                 uint32_t fudge;
7034
7035                 fudge = BBR_TIMER_FUDGE;
7036                 for (i = 0; i < rsm->r_rtr_cnt; i++) {
7037                         if ((SEQ_GEQ(uts, (rsm->r_tim_lastsent[i] - fudge))) &&
7038                             (SEQ_LEQ(uts, (rsm->r_tim_lastsent[i] + fudge)))) {
7039                                 if (TSTMP_GT(cts, rsm->r_tim_lastsent[i]))
7040                                         t = cts - rsm->r_tim_lastsent[i];
7041                                 else
7042                                         t = 1;
7043                                 if ((int)t <= 0)
7044                                         t = 1;
7045                                 bbr->r_ctl.rc_last_rtt = t;
7046                                 bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_TSMATCHING,
7047                                                     rsm->r_tim_lastsent[i], ack_type, to);
7048                                 if ((i + 1) < rsm->r_rtr_cnt) {
7049                                         /* Likely */
7050                                         bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type);
7051                                 } else if (rsm->r_flags & BBR_TLP) {
7052                                         bbr->rc_tlp_rtx_out = 0;
7053                                 }
7054                                 return (1);
7055                         }
7056                 }
7057                 /* Fall through if we can't find a matching timestamp */
7058         }
7059         /*
7060          * Ok, it's a SACK block that we retransmitted, or a Windows
7061          * machine without timestamps. We can tell nothing from the
7062          * timestamp, since it's either not there or reflects the time the
7063          * peer last received a segment that moved its cum-ack point forward.
7064          *
7065          * Let's look at the last retransmit and see what we can tell
7066          * (with BBR, to save space, we keep only 2 send times; note we have
7067          * to keep at least 2 so the map cannot be condensed further).
7068          */
7069         i = rsm->r_rtr_cnt - 1;
7070         if (TSTMP_GT(cts, rsm->r_tim_lastsent[i]))
7071                 t = cts - rsm->r_tim_lastsent[i];
7072         else
7073                 goto not_sure;
7074         if (t < bbr->r_ctl.rc_lowest_rtt) {
7075                 /*
7076                  * We retransmitted and the ack came back in less
7077                  * windowed rtt. We most likely did an improper
7078                  * windowed rtt. We most likey did an improper
7079                  * retransmit as outlined in 4.2 Step 3 point 2 in
7080                  * the rack-draft.
7081                  *
7082                  * Use the prior transmission to update all the
7083                  * information as long as there is only one prior
7084                  * transmission.
7085                  */
7086                 if ((rsm->r_flags & BBR_OVERMAX) == 0) {
7087 #ifdef BBR_INVARIANTS
7088                         if (rsm->r_rtr_cnt == 1)
7089                                 panic("rsm:%p bbr:%p rsm has overmax and only 1 retransmit flags:%x?", rsm, bbr, rsm->r_flags);
7090 #endif
7091                         i = rsm->r_rtr_cnt - 2;
7092                         if (TSTMP_GT(cts, rsm->r_tim_lastsent[i]))
7093                                 t = cts - rsm->r_tim_lastsent[i];
7094                         else
7095                                 t = 1;
7096                         bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_EARLIER_RET,
7097                                             rsm->r_tim_lastsent[i], ack_type, to);
7098                         bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type);
7099                 } else {
7100                         /*
7101                          * Too many prior transmissions, just
7102                          * update the BBR delivered info
7103                          */
7104 not_sure:
7105                         bbr_update_bbr_info(bbr, rsm, 0, cts, to->to_tsecr, uts,
7106                                             BBR_RTT_BY_SOME_RETRAN, 0, ack_type, to);
7107                 }
7108         } else {
7109                 /*
7110                  * We retransmitted it and the retransmit did the
7111                  * job.
7112                  */
7113                 if (rsm->r_flags & BBR_TLP)
7114                         bbr->rc_tlp_rtx_out = 0;
7115                 if ((rsm->r_flags & BBR_OVERMAX) == 0)
7116                         bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts,
7117                                             BBR_RTT_BY_THIS_RETRAN, 0, ack_type, to);
7118                 else
7119                         bbr_update_bbr_info(bbr, rsm, 0, cts, to->to_tsecr, uts,
7120                                             BBR_RTT_BY_SOME_RETRAN, 0, ack_type, to);
7121                 return (1);
7122         }
7123         return (0);
7124 }
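
/*
 * Editorial recap of the matching strategy above:
 *
 *      single transmit                       -> BBR_RTT_BY_EXACTMATCH
 *      google mode, cum-ack carrying a tsecr -> BBR_RTT_BY_TIMESTAMP
 *      tsecr matches a send time +/- fudge   -> BBR_RTT_BY_TSMATCHING
 *      rtt below the windowed lowest rtt     -> BBR_RTT_BY_EARLIER_RET
 *      otherwise                             -> BBR_RTT_BY_THIS_RETRAN, or
 *                                               BBR_RTT_BY_SOME_RETRAN (rtt 0)
 */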
7125
7126 /*
7127  * Mark the SACK_PASSED flag on all entries prior to rsm send wise.
7128  */
7129 static void
7130 bbr_log_sack_passed(struct tcpcb *tp,
7131     struct tcp_bbr *bbr, struct bbr_sendmap *rsm)
7132 {
7133         struct bbr_sendmap *nrsm;
7134
7135         nrsm = rsm;
7136         TAILQ_FOREACH_REVERSE_FROM(nrsm, &bbr->r_ctl.rc_tmap,
7137             bbr_head, r_tnext) {
7138                 if (nrsm == rsm) {
7139                         /* Skip the original segment; it is acked */
7140                         continue;
7141                 }
7142                 if (nrsm->r_flags & BBR_ACKED) {
7143                         /* Skip ack'd segments */
7144                         continue;
7145                 }
7146                 if (nrsm->r_flags & BBR_SACK_PASSED) {
7147                         /*
7148                          * We found one that is already marked
7149                          * passed, we have been here before and
7150                          * so all others below this are marked.
7151                          */
7152                         break;
7153                 }
7154                 BBR_STAT_INC(bbr_sack_passed);
7155                 nrsm->r_flags |= BBR_SACK_PASSED;
7156                 if (((nrsm->r_flags & BBR_MARKED_LOST) == 0) &&
7157                     bbr_is_lost(bbr, nrsm, bbr->r_ctl.rc_rcvtime)) {
7158                         bbr->r_ctl.rc_lost += nrsm->r_end - nrsm->r_start;
7159                         bbr->r_ctl.rc_lost_bytes += nrsm->r_end - nrsm->r_start;
7160                         nrsm->r_flags |= BBR_MARKED_LOST;
7161                 }
7162                 nrsm->r_flags &= ~BBR_WAS_SACKPASS;
7163         }
7164 }
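
/*
 * For example (editorial): with tmap send order A B C D and a SACK that
 * newly acks C, the reverse walk from C marks B and then A with
 * BBR_SACK_PASSED (and possibly BBR_MARKED_LOST when bbr_is_lost() says
 * so), stopping early at the first entry already marked passed, since
 * everything sent before it was handled by a previous pass.
 */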
7165
7166 /*
7167  * Returns the number of bytes that were
7168  * newly ack'd by sack blocks.
7169  */
7170 static uint32_t
7171 bbr_proc_sack_blk(struct tcpcb *tp, struct tcp_bbr *bbr, struct sackblk *sack,
7172     struct tcpopt *to, struct bbr_sendmap **prsm, uint32_t cts)
7173 {
7174         int32_t times = 0;
7175         uint32_t start, end, maxseg, changed = 0;
7176         struct bbr_sendmap *rsm, *nrsm;
7177         int32_t used_ref = 1;
7178         uint8_t went_back = 0, went_fwd = 0;
7179
7180         maxseg = tp->t_maxseg - bbr->rc_last_options;
7181         start = sack->start;
7182         end = sack->end;
7183         rsm = *prsm;
7184         if (rsm == NULL)
7185                 used_ref = 0;
7186
7187         /* Do we locate the block behind where we last were? */
7188         if (rsm && SEQ_LT(start, rsm->r_start)) {
7189                 went_back = 1;
7190                 TAILQ_FOREACH_REVERSE_FROM(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) {
7191                         if (SEQ_GEQ(start, rsm->r_start) &&
7192                             SEQ_LT(start, rsm->r_end)) {
7193                                 goto do_rest_ofb;
7194                         }
7195                 }
7196         }
7197 start_at_beginning:
7198         went_fwd = 1;
7199         /*
7200          * Ok, let's locate the block where this guy is, fwd from rsm (if
7201          * it's set).
7202          */
7203         TAILQ_FOREACH_FROM(rsm, &bbr->r_ctl.rc_map, r_next) {
7204                 if (SEQ_GEQ(start, rsm->r_start) &&
7205                     SEQ_LT(start, rsm->r_end)) {
7206                         break;
7207                 }
7208         }
7209 do_rest_ofb:
7210         if (rsm == NULL) {
7211                 /*
7212                  * This happens when we get duplicate sack blocks with the
7213          * same end. For example, SACK 4: 100 and SACK 3: 100. The sort
7214          * will not change their location, so we would just start at
7215          * the end of the first one and get lost.
7216                  */
7217                 if (tp->t_flags & TF_SENTFIN) {
7218                         /*
7219                          * Check to see if we have not logged the FIN that
7220                          * went out.
7221                          */
7222                         nrsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next);
7223                         if (nrsm && (nrsm->r_end + 1) == tp->snd_max) {
7224                                 /*
7225                                  * Ok we did not get the FIN logged.
7226                                  */
7227                                 nrsm->r_end++;
7228                                 rsm = nrsm;
7229                                 goto do_rest_ofb;
7230                         }
7231                 }
7232                 if (times == 1) {
7233 #ifdef BBR_INVARIANTS
7234                         panic("tp:%p bbr:%p sack:%p to:%p prsm:%p",
7235                             tp, bbr, sack, to, prsm);
7236 #else
7237                         goto out;
7238 #endif
7239                 }
7240                 times++;
7241                 BBR_STAT_INC(bbr_sack_proc_restart);
7242                 rsm = NULL;
7243                 goto start_at_beginning;
7244         }
7245         /* Ok we have an ACK for some piece of rsm */
7246         if (rsm->r_start != start) {
7247                 /*
7248                  * Need to split this into two pieces: the before and the after.
7249                  */
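                /*
                 * Editorial illustration: a SACK block starting mid-rsm.
                 * After the clone below, rsm keeps the un-sacked front and
                 * processing continues on nrsm:
                 *
                 *   rsm  [r_start .. start)   nrsm [start ........ r_end)
                 *
                 * (If the block also ends before r_end, a second split at
                 * 'end' happens further down.)
                 */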
7250                 if (bbr_sack_mergable(rsm, start, end))
7251                         nrsm = bbr_alloc_full_limit(bbr);
7252                 else
7253                         nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT);
7254                 if (nrsm == NULL) {
7255                         /* We could not allocate; ignore the sack */
7256                         struct sackblk blk;
7257
7258                         blk.start = start;
7259                         blk.end = end;
7260                         sack_filter_reject(&bbr->r_ctl.bbr_sf, &blk);
7261                         goto out;
7262                 }
7263                 bbr_clone_rsm(bbr, nrsm, rsm, start);
7264                 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
7265                 if (rsm->r_in_tmap) {
7266                         TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
7267                         nrsm->r_in_tmap = 1;
7268                 }
7269                 rsm->r_flags &= (~BBR_HAS_FIN);
7270                 rsm = nrsm;
7271         }
7272         if (SEQ_GEQ(end, rsm->r_end)) {
7273                 /*
7274                  * The end of this block is either beyond this guy or right
7275                  * at this guy.
7276                  */
7277                 if ((rsm->r_flags & BBR_ACKED) == 0) {
7278                         bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_SACKED, 0);
7279                         changed += (rsm->r_end - rsm->r_start);
7280                         bbr->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
7281                         bbr_log_sack_passed(tp, bbr, rsm);
7282                         if (rsm->r_flags & BBR_MARKED_LOST) {
7283                                 bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
7284                         }
7285                         /* Is reordering occurring? */
7286                         if (rsm->r_flags & BBR_SACK_PASSED) {
7287                                 BBR_STAT_INC(bbr_reorder_seen);
7288                                 bbr->r_ctl.rc_reorder_ts = cts;
7289                                 if (rsm->r_flags & BBR_MARKED_LOST) {
7290                                         bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
7291                                         if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
7292                                                 /* LT sampling also needs adjustment */
7293                                                 bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
7294                                 }
7295                         }
7296                         rsm->r_flags |= BBR_ACKED;
7297                         rsm->r_flags &= ~(BBR_TLP|BBR_WAS_RENEGED|BBR_RXT_CLEARED|BBR_MARKED_LOST);
7298                         if (rsm->r_in_tmap) {
7299                                 TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
7300                                 rsm->r_in_tmap = 0;
7301                         }
7302                 }
7303                 bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_SACKED);
7304                 if (end == rsm->r_end) {
7305                         /* This block only - done */
7306                         goto out;
7307                 }
7308                 /* There is more not covered by this rsm; move on */
7309                 start = rsm->r_end;
7310                 nrsm = TAILQ_NEXT(rsm, r_next);
7311                 rsm = nrsm;
7312                 times = 0;
7313                 goto do_rest_ofb;
7314         }
7315         if (rsm->r_flags & BBR_ACKED) {
7316                 /* Been here done that */
7317                 goto out;
7318         }
7319         /* Ok we need to split off this one at the tail */
7320         if (bbr_sack_mergable(rsm, start, end))
7321                 nrsm = bbr_alloc_full_limit(bbr);
7322         else
7323                 nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT);
7324         if (nrsm == NULL) {
7325                 /* failed XXXrrs what can we do but lose the sack info? */
7326                 struct sackblk blk;
7327
7328                 blk.start = start;
7329                 blk.end = end;
7330                 sack_filter_reject(&bbr->r_ctl.bbr_sf, &blk);
7331                 goto out;
7332         }
7333         /* Clone it */
7334         bbr_clone_rsm(bbr, nrsm, rsm, end);
7335         /* The sack block does not cover this guy fully */
7336         rsm->r_flags &= (~BBR_HAS_FIN);
7337         TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
7338         if (rsm->r_in_tmap) {
7339                 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
7340                 nrsm->r_in_tmap = 1;
7341         }
7342         nrsm->r_dupack = 0;
7343         bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_SACKED, 0);
7344         bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_SACKED);
7345         changed += (rsm->r_end - rsm->r_start);
7346         bbr->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
7347         bbr_log_sack_passed(tp, bbr, rsm);
7348         /* Is reordering occurring? */
7349         if (rsm->r_flags & BBR_MARKED_LOST) {
7350                 bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
7351         }
7352         if (rsm->r_flags & BBR_SACK_PASSED) {
7353                 BBR_STAT_INC(bbr_reorder_seen);
7354                 bbr->r_ctl.rc_reorder_ts = cts;
7355                 if (rsm->r_flags & BBR_MARKED_LOST) {
7356                         bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
7357                         if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
7358                                 /* LT sampling also needs adjustment */
7359                                 bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
7360                 }
7361         }
7362         rsm->r_flags &= ~(BBR_TLP|BBR_WAS_RENEGED|BBR_RXT_CLEARED|BBR_MARKED_LOST);
7363         rsm->r_flags |= BBR_ACKED;
7364         if (rsm->r_in_tmap) {
7365                 TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
7366                 rsm->r_in_tmap = 0;
7367         }
7368 out:
7369         if (rsm && (rsm->r_flags & BBR_ACKED)) {
7370                 /*
7371                  * Now can we merge this newly acked
7372                  * block with either the previous or
7373                  * next block?
7374                  */
7375                 nrsm = TAILQ_NEXT(rsm, r_next);
7376                 if (nrsm &&
7377                     (nrsm->r_flags & BBR_ACKED)) {
7378                         /* yep this and next can be merged */
7379                         rsm = bbr_merge_rsm(bbr, rsm, nrsm);
7380                 }
7381                 /* Now what about the previous? */
7382                 nrsm = TAILQ_PREV(rsm, bbr_head, r_next);
7383                 if (nrsm &&
7384                     (nrsm->r_flags & BBR_ACKED)) {
7385                         /* yep the previous and this can be merged */
7386                         rsm = bbr_merge_rsm(bbr, nrsm, rsm);
7387                 }
7388         }
7389         if (used_ref == 0) {
7390                 BBR_STAT_INC(bbr_sack_proc_all);
7391         } else {
7392                 BBR_STAT_INC(bbr_sack_proc_short);
7393         }
7394         if (went_fwd && went_back) {
7395                 BBR_STAT_INC(bbr_sack_search_both);
7396         } else if (went_fwd) {
7397                 BBR_STAT_INC(bbr_sack_search_fwd);
7398         } else if (went_back) {
7399                 BBR_STAT_INC(bbr_sack_search_back);
7400         }
7401         /* Save off where the next seq is */
7402         if (rsm)
7403                 bbr->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next);
7404         else
7405                 bbr->r_ctl.rc_sacklast = NULL;
7406         *prsm = rsm;
7407         return (changed);
7408 }
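
/*
 * Illustrative sketch (not part of the original file): the tail of the
 * SACK-processing routine above coalesces a newly ACKED map entry with
 * ACKED neighbors so the scoreboard stays compact.  A minimal userland
 * analogue over an array of ranges -- hypothetical types, no SEQ_*
 * wraparound handling -- might look like this:
 */
#if 0
#include <stddef.h>
#include <stdint.h>

struct range { uint32_t start, end; int acked; };

/* Merge adjacent acked, contiguous ranges in place; returns new count. */
static size_t
coalesce_acked(struct range *r, size_t n)
{
        size_t i = 0, j;

        while (i + 1 < n) {
                if (r[i].acked && r[i + 1].acked &&
                    r[i].end == r[i + 1].start) {
                        r[i].end = r[i + 1].end;        /* absorb neighbor */
                        for (j = i + 1; j + 1 < n; j++) /* close the gap */
                                r[j] = r[j + 1];
                        n--;
                } else
                        i++;
        }
        return (n);
}
#endif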
7409
7410
7411 static inline void
7412 bbr_peer_reneges(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, tcp_seq th_ack)
7413 {
7414         struct bbr_sendmap *tmap;
7415
7416         BBR_STAT_INC(bbr_reneges_seen);
7417         tmap = NULL;
7418         while (rsm && (rsm->r_flags & BBR_ACKED)) {
7419                 /* It's no longer sacked, mark it so */
7420                 uint32_t oflags;
7421                 bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
7422 #ifdef BBR_INVARIANTS
7423                 if (rsm->r_in_tmap) {
7424                         panic("bbr:%p rsm:%p flags:0x%x in tmap?",
7425                             bbr, rsm, rsm->r_flags);
7426                 }
7427 #endif
7428                 oflags = rsm->r_flags;
7429                 if (rsm->r_flags & BBR_MARKED_LOST) {
7430                         bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
7431                         bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
7432                         if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
7433                                 /* LT sampling also needs adjustment */
7434                                 bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
7435                 }
7436                 rsm->r_flags &= ~(BBR_ACKED | BBR_SACK_PASSED | BBR_WAS_SACKPASS | BBR_MARKED_LOST);
7437                 rsm->r_flags |= BBR_WAS_RENEGED;
7438                 rsm->r_flags |= BBR_RXT_CLEARED;
7439                 bbr_log_type_rsmclear(bbr, bbr->r_ctl.rc_rcvtime, rsm, oflags, __LINE__);
7440                 /* Rebuild it into our tmap */
7441                 if (tmap == NULL) {
7442                         TAILQ_INSERT_HEAD(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
7443                         tmap = rsm;
7444                 } else {
7445                         TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, tmap, rsm, r_tnext);
7446                         tmap = rsm;
7447                 }
7448                 tmap->r_in_tmap = 1;
7449                 /*
7450                  * XXXrrs Delivered? Should we do anything here?
7451                  *
7452                  * Of course we don't on an rxt timeout, so maybe it's
7453                  * ok that we don't?
7454                  *
7455                  * For now let's not.
7456                  */
7457                 rsm = TAILQ_NEXT(rsm, r_next);
7458         }
7459         /*
7460          * Now lets possibly clear the sack filter so we start recognizing
7461          * sacks that cover this area.
7462          */
7463         sack_filter_clear(&bbr->r_ctl.bbr_sf, th_ack);
7464 }
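
/*
 * Illustrative sketch (not part of the original file): the renege walk
 * above undoes SACK credit entry by entry.  For every previously SACKed
 * range [start, end) it subtracts (end - start) from the sacked-byte
 * total and, when the range had also been counted lost, from the lost
 * totals too.  A hypothetical userland version of that bookkeeping:
 */
#if 0
#include <stdint.h>

struct sack_totals { uint32_t sacked, lost, lost_bytes; };

static void
renege_range(struct sack_totals *t, uint32_t start, uint32_t end,
    int was_marked_lost)
{
        t->sacked -= (end - start);
        if (was_marked_lost) {
                t->lost -= (end - start);
                t->lost_bytes -= (end - start);
        }
}
#endif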
7465
7466 static void
7467 bbr_log_syn(struct tcpcb *tp, struct tcpopt *to)
7468 {
7469         struct tcp_bbr *bbr;
7470         struct bbr_sendmap *rsm;
7471         uint32_t cts;
7472
7473         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
7474         cts = bbr->r_ctl.rc_rcvtime;
7475         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
7476         if (rsm && (rsm->r_flags & BBR_HAS_SYN)) {
7477                 if ((rsm->r_end - rsm->r_start) <= 1) {
7478                         /* Log out the SYN completely */
7479                         bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
7480                         rsm->r_rtr_bytes = 0;
7481                         TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next);
7482                         if (rsm->r_in_tmap) {
7483                                 TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
7484                                 rsm->r_in_tmap = 0;
7485                         }
7486                         if (bbr->r_ctl.rc_next == rsm) {
7487                                 /* scoot along the marker */
7488                                 bbr->r_ctl.rc_next = TAILQ_FIRST(&bbr->r_ctl.rc_map);
7489                         }
7490                         if (to != NULL)
7491                                 bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_CUM_ACKED, 0);
7492                         bbr_free(bbr, rsm);
7493                 } else {
7494                         /* There is more (Fast Open)? Strip out the SYN. */
7495                         rsm->r_flags &= ~BBR_HAS_SYN;
7496                         rsm->r_start++;
7497                 }
7498         }
7499 }
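
/*
 * Illustrative sketch (not part of the original file): the SYN occupies
 * exactly one sequence number, so stripping it from a map entry that
 * also covers fast-open data is just "clear the flag, advance the
 * start": [iss, iss + 1 + len) with the SYN becomes
 * [iss + 1, iss + 1 + len) without it.  Hypothetical helper:
 */
#if 0
#include <stdint.h>

static void
strip_syn(uint32_t *r_start, uint32_t *r_flags, uint32_t has_syn_flag)
{
        *r_flags &= ~has_syn_flag;      /* drop the SYN marking */
        (*r_start)++;                   /* SYN used one sequence number */
}
#endif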
7500
7501 /*
7502  * Returns the number of bytes that were
7503  * acknowledged by SACK blocks.
7504  */
7505
7506 static uint32_t
7507 bbr_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th,
7508     uint32_t *prev_acked)
7509 {
7510         uint32_t changed, last_seq, entered_recovery = 0;
7511         struct tcp_bbr *bbr;
7512         struct bbr_sendmap *rsm;
7513         struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
7514         register uint32_t th_ack;
7515         int32_t i, j, k, new_sb, num_sack_blks = 0;
7516         uint32_t cts, acked, ack_point, sack_changed = 0;
7517         uint32_t p_maxseg, maxseg, p_acked = 0;
7518
7519         INP_WLOCK_ASSERT(tp->t_inpcb);
7520         if (th->th_flags & TH_RST) {
7521                 /* We don't log resets */
7522                 return (0);
7523         }
7524         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
7525         cts = bbr->r_ctl.rc_rcvtime;
7526
7527         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
7528         changed = 0;
7529         maxseg = tp->t_maxseg - bbr->rc_last_options;
7530         p_maxseg = min(bbr->r_ctl.rc_pace_max_segs, maxseg);
7531         th_ack = th->th_ack;
7532         if (SEQ_GT(th_ack, tp->snd_una)) {
7533                 acked = th_ack - tp->snd_una;
7534                 bbr_log_progress_event(bbr, tp, ticks, PROGRESS_UPDATE, __LINE__);
7535                 bbr->rc_tp->t_acktime = ticks;
7536         } else
7537                 acked = 0;
7538         if (SEQ_LEQ(th_ack, tp->snd_una)) {
7539                 /* Only sent here for sack processing */
7540                 goto proc_sack;
7541         }
7542         if (rsm && SEQ_GT(th_ack, rsm->r_start)) {
7543                 changed = th_ack - rsm->r_start;
7544         } else if ((rsm == NULL) && ((th_ack - 1) == tp->iss)) {
7545                 /*
7546                  * For the SYN incoming case we will not have called
7547                  * tcp_output for the sending of the SYN, so there will be
7548                  * no map. All other cases should probably be a panic.
7549                  */
7550                 if ((to->to_flags & TOF_TS) && (to->to_tsecr != 0)) {
7551                         /*
7552                          * We have a timestamp that can be used to generate
7553                          * an initial RTT.
7554                          */
7555                         uint32_t ts, now, rtt;
7556
7557                         ts = bbr_ts_convert(to->to_tsecr);
7558                         now = bbr_ts_convert(tcp_tv_to_mssectick(&bbr->rc_tv));
7559                         rtt = now - ts;
7560                         if (rtt < 1)
7561                                 rtt = 1;
7562                         bbr_log_type_bbrrttprop(bbr, rtt,
7563                                                 tp->iss, 0, cts,
7564                                                 BBR_RTT_BY_TIMESTAMP, tp->iss, 0);
7565                         apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
7566                         changed = 1;
7567                         bbr->r_wanted_output = 1;
7568                         goto out;
7569                 }
7570                 goto proc_sack;
7571         } else if (rsm == NULL) {
7572                 goto out;
7573         }
7574         if (changed) {
7575                 /*
7576                  * The ACK point is advancing to th_ack; we must drop off
7577                  * the packets in the rack log and calculate any eligible
7578                  * RTTs.
7579                  */
7580                 bbr->r_wanted_output = 1;
7581 more:
7582                 if (rsm == NULL) {
7583
7584                         if (tp->t_flags & TF_SENTFIN) {
7585                         /* if we sent a FIN we will not have a map */
7586                                 goto proc_sack;
7587                         }
7588 #ifdef BBR_INVARIANTS
7589                         panic("No rack map tp:%p for th:%p state:%d bbr:%p snd_una:%u snd_max:%u chg:%d\n",
7590                             tp,
7591                             th, tp->t_state, bbr,
7592                             tp->snd_una, tp->snd_max, changed);
7593 #endif
7594                         goto proc_sack;
7595                 }
7596         }
7597         if (SEQ_LT(th_ack, rsm->r_start)) {
7598                 /* Huh map is missing this */
7599 #ifdef BBR_INVARIANTS
7600                 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d bbr:%p\n",
7601                     rsm->r_start,
7602                     th_ack, tp->t_state,
7603                     bbr->r_state, bbr);
7604                 panic("th-ack is bad bbr:%p tp:%p", bbr, tp);
7605 #endif
7606                 goto proc_sack;
7607         } else if (th_ack == rsm->r_start) {
7608                 /* None here to ack */
7609                 goto proc_sack;
7610         }
7611         /*
7612          * Clear the dup ack counter, it will
7613          * either be freed or if there is some
7614          * remaining we need to start it at zero.
7615          */
7616         rsm->r_dupack = 0;
7617         /* Now do we consume the whole thing? */
7618         if (SEQ_GEQ(th_ack, rsm->r_end)) {
7619                 /* It's all consumed. */
7620                 uint32_t left;
7621
7622                 if (rsm->r_flags & BBR_ACKED) {
7623                         /*
7624                          * It was acked on the scoreboard -- remove it from
7625                          * total
7626                          */
7627                         p_acked += (rsm->r_end - rsm->r_start);
7628                         bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
7629                         if (bbr->r_ctl.rc_sacked == 0)
7630                                 bbr->r_ctl.rc_sacklast = NULL;
7631                 } else {
7632                         bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_CUM_ACKED, th_ack);
7633                         if (rsm->r_flags & BBR_MARKED_LOST) {
7634                                 bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
7635                         }
7636                         if (rsm->r_flags & BBR_SACK_PASSED) {
7637                                 /*
7638                                  * There are acked segments ACKED on the
7639                                  * scoreboard further up. We are seeing
7640                                  * reordering.
7641                                  */
7642                                 BBR_STAT_INC(bbr_reorder_seen);
7643                                 bbr->r_ctl.rc_reorder_ts = cts;
7644                                 if (rsm->r_flags & BBR_MARKED_LOST) {
7645                                         bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
7646                                         if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
7647                                                 /* LT sampling also needs adjustment */
7648                                                 bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
7649                                 }
7650                         }
7651                         rsm->r_flags &= ~BBR_MARKED_LOST;
7652                 }
7653                 bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
7654                 rsm->r_rtr_bytes = 0;
7655                 TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next);
7656                 if (rsm->r_in_tmap) {
7657                         TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
7658                         rsm->r_in_tmap = 0;
7659                 }
7660                 if (bbr->r_ctl.rc_next == rsm) {
7661                         /* scoot along the marker */
7662                         bbr->r_ctl.rc_next = TAILQ_FIRST(&bbr->r_ctl.rc_map);
7663                 }
7664                 bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_CUM_ACKED);
7665                 /* Adjust the packet counts */
7666                 left = th_ack - rsm->r_end;
7667                 /* Free back to zone */
7668                 bbr_free(bbr, rsm);
7669                 if (left) {
7670                         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
7671                         goto more;
7672                 }
7673                 goto proc_sack;
7674         }
7675         if (rsm->r_flags & BBR_ACKED) {
7676                 /*
7677                  * It was acked on the scoreboard -- remove it from total
7678                  * for the part being cum-acked.
7679                  */
7680                 p_acked += (rsm->r_end - rsm->r_start);
7681                 bbr->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
7682                 if (bbr->r_ctl.rc_sacked == 0)
7683                         bbr->r_ctl.rc_sacklast = NULL;
7684         } else {
7685                 /*
7686                  * It was acked up to th_ack point for the first time
7687                  */
7688                 struct bbr_sendmap lrsm;
7689
7690                 memcpy(&lrsm, rsm, sizeof(struct bbr_sendmap));
7691                 lrsm.r_end = th_ack;
7692                 bbr_update_rtt(tp, bbr, &lrsm, to, cts, BBR_CUM_ACKED, th_ack);
7693         }
7694         if ((rsm->r_flags & BBR_MARKED_LOST) &&
7695             ((rsm->r_flags & BBR_ACKED) == 0)) {
7696                 /*
7697                  * It was marked lost and partly ack'd now
7698                  * for the first time. We lower the rc_lost_bytes
7699                  * and still leave it MARKED.
7700                  */
7701                 bbr->r_ctl.rc_lost_bytes -= th_ack - rsm->r_start;
7702         }
7703         bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_CUM_ACKED);
7704         bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
7705         rsm->r_rtr_bytes = 0;
7706         /* adjust packet count */
7707         rsm->r_start = th_ack;
7708 proc_sack:
7709         /* Check for reneging */
7710         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
7711         if (rsm && (rsm->r_flags & BBR_ACKED) && (th_ack == rsm->r_start)) {
7712                 /*
7713                  * The peer has moved snd_una up to the edge of this send,
7714                  * i.e. one that it had previously acked. The only way that
7715                  * can be true is if the peer threw away data (space issues)
7716                  * that it had previously sacked (else it would have given
7717                  * us snd_una up to (rsm->r_end). We need to undo the acked
7718                  * markings here.
7719                  *
7720                  * Note we have to look to make sure th_ack is our
7721                  * rsm->r_start in case we get an old ack where th_ack is
7722                  * behind snd_una.
7723                  */
7724                 bbr_peer_reneges(bbr, rsm, th->th_ack);
7725         }
7726         if ((to->to_flags & TOF_SACK) == 0) {
7727                 /* We are done, nothing left to log */
7728                 goto out;
7729         }
7730         rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next);
7731         if (rsm) {
7732                 last_seq = rsm->r_end;
7733         } else {
7734                 last_seq = tp->snd_max;
7735         }
7736         /* Sack block processing */
7737         if (SEQ_GT(th_ack, tp->snd_una))
7738                 ack_point = th_ack;
7739         else
7740                 ack_point = tp->snd_una;
7741         for (i = 0; i < to->to_nsacks; i++) {
7742                 bcopy((to->to_sacks + i * TCPOLEN_SACK),
7743                     &sack, sizeof(sack));
7744                 sack.start = ntohl(sack.start);
7745                 sack.end = ntohl(sack.end);
7746                 if (SEQ_GT(sack.end, sack.start) &&
7747                     SEQ_GT(sack.start, ack_point) &&
7748                     SEQ_LT(sack.start, tp->snd_max) &&
7749                     SEQ_GT(sack.end, ack_point) &&
7750                     SEQ_LEQ(sack.end, tp->snd_max)) {
7751                         if ((bbr->r_ctl.rc_num_small_maps_alloced > bbr_sack_block_limit) &&
7752                             (SEQ_LT(sack.end, last_seq)) &&
7753                             ((sack.end - sack.start) < (p_maxseg / 8))) {
7754                                 /*
7755                          * Not the last piece and it's smaller than
7756                                  * 1/8th of a p_maxseg. We ignore this.
7757                                  */
7758                                 BBR_STAT_INC(bbr_runt_sacks);
7759                                 continue;
7760                         }
7761                         sack_blocks[num_sack_blks] = sack;
7762                         num_sack_blks++;
7763 #ifdef NETFLIX_STATS
7764                 } else if (SEQ_LEQ(sack.start, th_ack) &&
7765                     SEQ_LEQ(sack.end, th_ack)) {
7766                         /*
7767                          * It's a D-SACK block.
7768                          */
7769                         tcp_record_dsack(sack.start, sack.end);
7770 #endif
7771                 }
7772         }
7773         if (num_sack_blks == 0)
7774                 goto out;
7775         /*
7776          * Sort the SACK blocks so we can update the rack scoreboard with
7777          * just one pass.
7778          */
7779         new_sb = sack_filter_blks(&bbr->r_ctl.bbr_sf, sack_blocks,
7780                                   num_sack_blks, th->th_ack);
7781         ctf_log_sack_filter(bbr->rc_tp, new_sb, sack_blocks);
7782         BBR_STAT_ADD(bbr_sack_blocks, num_sack_blks);
7783         BBR_STAT_ADD(bbr_sack_blocks_skip, (num_sack_blks - new_sb));
7784         num_sack_blks = new_sb;
7785         if (num_sack_blks < 2) {
7786                 goto do_sack_work;
7787         }
7788         /* Sort the sacks */
7789         for (i = 0; i < num_sack_blks; i++) {
7790                 for (j = i + 1; j < num_sack_blks; j++) {
7791                         if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
7792                                 sack = sack_blocks[i];
7793                                 sack_blocks[i] = sack_blocks[j];
7794                                 sack_blocks[j] = sack;
7795                         }
7796                 }
7797         }
7798         /*
7799          * Now, are any of the sack block ends the same (yes, some
7800          * implementations send these)?
7801          */
7802 again:
7803         if (num_sack_blks > 1) {
7804                 for (i = 0; i < num_sack_blks; i++) {
7805                         for (j = i + 1; j < num_sack_blks; j++) {
7806                                 if (sack_blocks[i].end == sack_blocks[j].end) {
7807                                         /*
7808                                          * Ok these two have the same end;
7809                                          * keep the smaller start (the
7810                                          * wider block), drop the other
7811                                          * and start again.
7812                                          */
7813                                         if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
7814                                                 /*
7815                                                  * The second block covers
7816                                                  * more area, use that
7817                                                  */
7818                                                 sack_blocks[i].start = sack_blocks[j].start;
7819                                         }
7820                                         /*
7821                                          * Now collapse out the dup-sack and
7822                                          * lower the count
7823                                          */
7824                                         for (k = (j + 1); k < num_sack_blks; k++) {
7825                                                 sack_blocks[j].start = sack_blocks[k].start;
7826                                                 sack_blocks[j].end = sack_blocks[k].end;
7827                                                 j++;
7828                                         }
7829                                         num_sack_blks--;
7830                                         goto again;
7831                                 }
7832                         }
7833                 }
7834         }
7835 do_sack_work:
7836         rsm = bbr->r_ctl.rc_sacklast;
7837         for (i = 0; i < num_sack_blks; i++) {
7838                 acked = bbr_proc_sack_blk(tp, bbr, &sack_blocks[i], to, &rsm, cts);
7839                 if (acked) {
7840                         bbr->r_wanted_output = 1;
7841                         changed += acked;
7842                         sack_changed += acked;
7843                 }
7844         }
7845 out:
7846         *prev_acked = p_acked;
7847         if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) {
7848                 /*
7849                  * Ok, we have a high probability that we need to go into
7850                  * recovery since we have data sack'd.
7851                  */
7852                 struct bbr_sendmap *rsm;
7853
7854                 rsm = bbr_check_recovery_mode(tp, bbr, cts);
7855                 if (rsm) {
7856                         /* Enter recovery */
7857                         entered_recovery = 1;
7858                         bbr->r_wanted_output = 1;
7859                         /*
7860                          * When we enter recovery we need to assure we send
7861                          * one packet.
7862                          */
7863                         if (bbr->r_ctl.rc_resend == NULL) {
7864                                 bbr->r_ctl.rc_resend = rsm;
7865                         }
7866                 }
7867         }
7868         if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) {
7869                 /*
7870                  * See if we need to rack-retransmit anything if so set it
7871                  * up as the thing to resend assuming something else is not
7872                  * already in that position.
7873                  */
7874                 if (bbr->r_ctl.rc_resend == NULL) {
7875                         bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts);
7876                 }
7877         }
7878         /*
7879          * We return the amount that changed via sack, this is used by the
7880          * ack-received code to augment what was changed between th_ack <->
7881          * snd_una.
7882          */
7883         return (sack_changed);
7884 }
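
/*
 * Illustrative sketch (not part of the original file): bbr_log_ack()
 * sorts the (at most TCP_MAX_SACK + 1) blocks ascending by end with a
 * simple exchange sort and then collapses any two blocks that share an
 * end, keeping the wider one.  A standalone userland analogue, without
 * the SEQ_* wraparound macros (so only valid for non-wrapping sequence
 * space):
 */
#if 0
#include <stdint.h>

struct blk { uint32_t start, end; };

static int
sort_and_dedup(struct blk *b, int n)
{
        int i, j, k;
        struct blk t;

        for (i = 0; i < n; i++)
                for (j = i + 1; j < n; j++)
                        if (b[i].end > b[j].end) {
                                t = b[i];
                                b[i] = b[j];
                                b[j] = t;
                        }
again:
        for (i = 0; i < n; i++)
                for (j = i + 1; j < n; j++)
                        if (b[i].end == b[j].end) {
                                if (b[j].start < b[i].start)
                                        b[i].start = b[j].start;
                                for (k = j; k + 1 < n; k++)  /* collapse */
                                        b[k] = b[k + 1];
                                n--;
                                goto again;
                        }
        return (n);
}
#endif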
7885
7886 static void
7887 bbr_strike_dupack(struct tcp_bbr *bbr)
7888 {
7889         struct bbr_sendmap *rsm;
7890
7891         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
7892         if (rsm && (rsm->r_dupack < 0xff)) {
7893                 rsm->r_dupack++;
7894                 if (rsm->r_dupack >= DUP_ACK_THRESHOLD)
7895                         bbr->r_wanted_output = 1;
7896         }
7897 }
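
/*
 * Illustrative sketch (not part of the original file): the strike above
 * bumps a saturating 8-bit per-entry counter on the oldest outstanding
 * send and asks for output once the threshold is met.  Hypothetical
 * userland version (the kernel's threshold is DUP_ACK_THRESHOLD):
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

#define EX_DUP_ACK_THRESHOLD 3  /* placeholder value */

static bool
strike_dupack(uint8_t *r_dupack)
{
        if (*r_dupack < 0xff) {
                (*r_dupack)++;
                return (*r_dupack >= EX_DUP_ACK_THRESHOLD);
        }
        return (false);         /* already saturated, nothing to do */
}
#endif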
7898
7899 /*
7900  * Return value of 1, we do not need to call bbr_process_data().
7901  * Return value of 0, bbr_process_data() can be called.
7902  * For ret_val: if it's 0 the TCB is locked and valid; if it's non-zero
7903  * it's unlocked and probably unsafe to touch the TCB.
7904  */
7905 static int
7906 bbr_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
7907     struct tcpcb *tp, struct tcpopt *to,
7908     uint32_t tiwin, int32_t tlen,
7909     int32_t * ofia, int32_t thflags, int32_t * ret_val)
7910 {
7911         int32_t ourfinisacked = 0;
7912         int32_t acked_amount;
7913         uint16_t nsegs;
7914         int32_t acked;
7915         uint32_t lost, sack_changed = 0;
7916         struct mbuf *mfree;
7917         struct tcp_bbr *bbr;
7918         uint32_t prev_acked = 0;
7919
7920         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
7921         lost = bbr->r_ctl.rc_lost;
7922         nsegs = max(1, m->m_pkthdr.lro_nsegs);
7923         if (SEQ_GT(th->th_ack, tp->snd_max)) {
7924                 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
7925                 bbr->r_wanted_output = 1;
7926                 return (1);
7927         }
7928         if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
7929                 /* Process the ack */
7930                 if (bbr->rc_in_persist)
7931                         tp->t_rxtshift = 0;
7932                 if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd))
7933                         bbr_strike_dupack(bbr);
7934                 sack_changed = bbr_log_ack(tp, to, th, &prev_acked);
7935         }
7936         bbr_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime, (bbr->r_ctl.rc_lost > lost));
7937         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
7938                 /*
7939                  * Old ack, behind the last one rcv'd or a duplicate ack
7940                  * with SACK info.
7941                  */
7942                 if (th->th_ack == tp->snd_una) {
7943                         bbr_ack_received(tp, bbr, th, 0, sack_changed, prev_acked, __LINE__, 0);
7944                         if (bbr->r_state == TCPS_SYN_SENT) {
7945                                 /*
7946                                  * Special case on where we sent SYN. When
7947                                  * the SYN-ACK is processed in syn_sent
7948                                  * state it bumps the snd_una. This causes
7949                                  * us to hit here even though we did ack 1
7950                                  * byte.
7951                                  *
7952                                  * Go through the nothing left case so we
7953                                  * send data.
7954                                  */
7955                                 goto nothing_left;
7956                         }
7957                 }
7958                 return (0);
7959         }
7960         /*
7961          * If we reach this point, ACK is not a duplicate, i.e., it ACKs
7962          * something we sent.
7963          */
7964         if (tp->t_flags & TF_NEEDSYN) {
7965                 /*
7966                  * T/TCP: Connection was half-synchronized, and our SYN has
7967                  * been ACK'd (so connection is now fully synchronized).  Go
7968                  * to non-starred state, increment snd_una for ACK of SYN,
7969                  * and check if we can do window scaling.
7970                  */
7971                 tp->t_flags &= ~TF_NEEDSYN;
7972                 tp->snd_una++;
7973                 /* Do window scaling? */
7974                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
7975                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
7976                         tp->rcv_scale = tp->request_r_scale;
7977                         /* Send window already scaled. */
7978                 }
7979         }
7980         INP_WLOCK_ASSERT(tp->t_inpcb);
7981
7982         acked = BYTES_THIS_ACK(tp, th);
7983         KMOD_TCPSTAT_ADD(tcps_rcvackpack, (int)nsegs);
7984         KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
7985
7986         /*
7987          * If we just performed our first retransmit, and the ACK arrives
7988          * within our recovery window, then it was a mistake to do the
7989          * retransmit in the first place.  Recover our original cwnd and
7990          * ssthresh, and proceed to transmit where we left off.
7991          */
7992         if (tp->t_flags & TF_PREVVALID) {
7993                 tp->t_flags &= ~TF_PREVVALID;
7994                 if (tp->t_rxtshift == 1 &&
7995                     (int)(ticks - tp->t_badrxtwin) < 0)
7996                         bbr_cong_signal(tp, th, CC_RTO_ERR, NULL);
7997         }
7998         SOCKBUF_LOCK(&so->so_snd);
7999         acked_amount = min(acked, (int)sbavail(&so->so_snd));
8000         tp->snd_wnd -= acked_amount;
8001         mfree = sbcut_locked(&so->so_snd, acked_amount);
8002         /* NB: sowwakeup_locked() does an implicit unlock. */
8003         sowwakeup_locked(so);
8004         m_freem(mfree);
8005         if (SEQ_GT(th->th_ack, tp->snd_una)) {
8006                 bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp));
8007         }
8008         tp->snd_una = th->th_ack;
8009         bbr_ack_received(tp, bbr, th, acked, sack_changed, prev_acked, __LINE__, (bbr->r_ctl.rc_lost - lost));
8010         if (IN_RECOVERY(tp->t_flags)) {
8011                 if (SEQ_LT(th->th_ack, tp->snd_recover) &&
8012                     (SEQ_LT(th->th_ack, tp->snd_max))) {
8013                         tcp_bbr_partialack(tp);
8014                 } else {
8015                         bbr_post_recovery(tp);
8016                 }
8017         }
8018         if (SEQ_GT(tp->snd_una, tp->snd_recover)) {
8019                 tp->snd_recover = tp->snd_una;
8020         }
8021         if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
8022                 tp->snd_nxt = tp->snd_max;
8023         }
8024         if (tp->snd_una == tp->snd_max) {
8025                 /* Nothing left outstanding */
8026 nothing_left:
8027                 bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__);
8028                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
8029                         bbr->rc_tp->t_acktime = 0;
8030                 if ((sbused(&so->so_snd) == 0) &&
8031                     (tp->t_flags & TF_SENTFIN)) {
8032                         ourfinisacked = 1;
8033                 }
8034                 bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
8035                 if (bbr->rc_in_persist == 0) {
8036                         bbr->r_ctl.rc_went_idle_time = bbr->r_ctl.rc_rcvtime;
8037                 }
8038                 sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una);
8039                 bbr_log_ack_clear(bbr, bbr->r_ctl.rc_rcvtime);
8040                 /*
8041                  * We invalidate the last ack here since we
8042                  * don't want to transfer forward the time
8043                  * for our sum's calculations.
8044                  */
8045                 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
8046                     (sbavail(&so->so_snd) == 0) &&
8047                     (tp->t_flags2 & TF2_DROP_AF_DATA)) {
8048                         /*
8049                          * The socket was gone and the peer sent data, time
8050                          * to reset him.
8051                          */
8052                         *ret_val = 1;
8053                         tp = tcp_close(tp);
8054                         ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
8055                         BBR_STAT_INC(bbr_dropped_af_data);
8056                         return (1);
8057                 }
8058                 /* Set need output so persist might get set */
8059                 bbr->r_wanted_output = 1;
8060         }
8061         if (ofia)
8062                 *ofia = ourfinisacked;
8063         return (0);
8064 }
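
/*
 * Illustrative sketch (not part of the original file): the accounting
 * above is acked = th_ack - snd_una, clipped to what actually sits in
 * the send buffer before sbcut_locked() trims it.  Worked example,
 * ignoring wraparound: snd_una = 1000, th_ack = 2400 gives acked =
 * 1400; with only 1000 bytes in so_snd, acked_amount = 1000.
 */
#if 0
#include <stdint.h>

static uint32_t
acked_amount(uint32_t th_ack, uint32_t snd_una, uint32_t sb_avail)
{
        uint32_t acked = th_ack - snd_una;

        return (acked < sb_avail ? acked : sb_avail);
}
#endif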
8065
8066 static void
8067 bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line)
8068 {
8069         if (bbr->rc_in_persist == 0) {
8070                 bbr_timer_cancel(bbr, __LINE__, cts);
8071                 bbr->r_ctl.rc_last_delay_val = 0;
8072                 tp->t_rxtshift = 0;
8073                 bbr->rc_in_persist = 1;
8074                 bbr->r_ctl.rc_went_idle_time = cts;
8075                 /* We should be capped when rwnd went to 0, but just in case */
8076                 bbr_log_type_pesist(bbr, cts, 0, line, 1);
8077                 /* Time freezes for the state, so do the accounting now */
8078                 if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
8079                         uint32_t time_in;
8080
8081                         time_in = cts - bbr->r_ctl.rc_bbr_state_time;
8082                         if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) {
8083                                 int32_t idx;
8084
8085                                 idx = bbr_state_val(bbr);
8086                                 counter_u64_add(bbr_state_time[(idx + 5)], time_in);
8087                         } else {
8088                                 counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
8089                         }
8090                 }
8091                 bbr->r_ctl.rc_bbr_state_time = cts;
8092         }
8093 }
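
/*
 * Illustrative sketch (not part of the original file): entering persist
 * freezes state time, so the elapsed interval is charged to a per-state
 * counter first; PROBE_BW substates get their own buckets after the
 * top-level states, hence the "+ 5" offset above.  Hypothetical index
 * helper (names and the PROBE_BW id are placeholders):
 */
#if 0
#define EX_STATE_PROBE_BW 4

static int
state_time_bucket(int bbr_state, int substate)
{
        if (bbr_state == EX_STATE_PROBE_BW)
                return (substate + 5);  /* substate buckets follow */
        return (bbr_state);
}
#endif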
8094
8095 static void
8096 bbr_restart_after_idle(struct tcp_bbr *bbr, uint32_t cts, uint32_t idle_time)
8097 {
8098         /*
8099          * Note that if the idle time does not exceed our
8100          * threshold, we do nothing, continuing the state
8101          * transitions we were last walking through.
8102          */
8103         if (idle_time >= bbr_idle_restart_threshold) {
8104                 if (bbr->rc_use_idle_restart) {
8105                         bbr->rc_bbr_state = BBR_STATE_IDLE_EXIT;
8106                         /*
8107                          * Set our target using BBR_UNIT, so
8108                          * we increase at a dramatic rate but
8109                          * we stop when we get the pipe
8110                          * full again for our current b/w estimate.
8111                          */
8112                         bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
8113                         bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT;
8114                         bbr_set_state_target(bbr, __LINE__);
8115                         /* Now setup our gains to ramp up */
8116                         bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg;
8117                         bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg;
8118                         bbr_log_type_statechange(bbr, cts, __LINE__);
8119                 } else {
8120                         bbr_substate_change(bbr, cts, __LINE__, 1);
8121                 }
8122         }
8123 }
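
/*
 * Illustrative sketch (not part of the original file): the restart path
 * above first sets both gains to BBR_UNIT so the state target is
 * computed at 1.0x of the current b/w estimate, then raises the gains
 * to the startup gain to ramp back toward that target.  Gains are
 * fixed-point multipliers; a hypothetical helper (EX_BBR_UNIT is a
 * placeholder for the unit-gain constant):
 */
#if 0
#include <stdint.h>

#define EX_BBR_UNIT 256         /* gain of 1.0, assumed scale */

static uint64_t
apply_gain(uint64_t value, uint32_t gain)
{
        return ((value * gain) / EX_BBR_UNIT);
}
#endif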
8124
8125 static void
8126 bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line)
8127 {
8128         uint32_t idle_time;
8129
8130         if (bbr->rc_in_persist == 0)
8131                 return;
8132         idle_time = bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time);
8133         bbr->rc_in_persist = 0;
8134         bbr->rc_hit_state_1 = 0;
8135         tp->t_flags &= ~TF_FORCEDATA;
8136         bbr->r_ctl.rc_del_time = cts;
8137         /*
8138          * We invalidate the last ack here since we
8139          * don't want to transfer forward the time
8140          * for our sum's calculations.
8141          */
8142         if (bbr->rc_inp->inp_in_hpts) {
8143                 tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT);
8144                 bbr->rc_timer_first = 0;
8145                 bbr->r_ctl.rc_hpts_flags = 0;
8146                 bbr->r_ctl.rc_last_delay_val = 0;
8147                 bbr->r_ctl.rc_hptsi_agg_delay = 0;
8148                 bbr->r_agg_early_set = 0;
8149                 bbr->r_ctl.rc_agg_early = 0;
8150         }
8151         bbr_log_type_pesist(bbr, cts, idle_time, line, 0);
8152         if (idle_time >= bbr_rtt_probe_time) {
8153                 /*
8154                  * This qualifies as a RTT_PROBE session since we drop the
8155                  * data outstanding to nothing and waited more than
8156                  * bbr_rtt_probe_time.
8157                  */
8158                 bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_PERSIST, 0);
8159                 bbr->r_ctl.last_in_probertt = bbr->r_ctl.rc_rtt_shrinks = cts;
8160         }
8161         tp->t_rxtshift = 0;
8162         /*
8163          * If in probeBW and we have persisted more than an RTT, let's do
8164          * special handling.
8165          */
8166         /* Force a time based epoch */
8167         bbr_set_epoch(bbr, cts, __LINE__);
8168         /*
8169          * Setup the lost so we don't count anything against the guy
8170          * we have been stuck with during persists.
8171          */
8172         bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
8173         /* Time un-freezes for the state */
8174         bbr->r_ctl.rc_bbr_state_time = cts;
8175         if ((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) ||
8176             (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT)) {
8177                 /*
8178                  * If we are going back to probe-bw
8179                  * or probe_rtt, we may need to possibly
8180                  * do a fast restart.
8181                  */
8182                 bbr_restart_after_idle(bbr, cts, idle_time);
8183         }
8184 }
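
/*
 * Illustrative sketch (not part of the original file): leaving persist
 * computes how long we sat idle and, when that meets the probe
 * threshold, treats the whole interval as an RTT-probe session (data
 * outstanding was zero the entire time).  Hypothetical check with
 * monotonic microsecond timestamps:
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
persist_was_rtt_probe(uint32_t cts, uint32_t went_idle_time,
    uint32_t rtt_probe_time)
{
        uint32_t idle_time = cts - went_idle_time;

        return (idle_time >= rtt_probe_time);
}
#endif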
8185
8186 static void
8187 bbr_collapsed_window(struct tcp_bbr *bbr)
8188 {
8189         /*
8190          * Now we must walk the
8191          * send map and divide the
8192          * ones left stranded. These
8193          * can't cause us to abort
8194          * the connection and are really
8195          * "unsent". However, if a buggy
8196          * peer actually did keep some
8197          * of the data (i.e. collapsed the win,
8198          * refused to ack, then opened
8199          * the win and acked that data), we
8200          * would get into an ack war; the
8201          * simpler method of just pretending
8202          * we did not send those segments
8203          * won't work.
8204          */
8205         struct bbr_sendmap *rsm, *nrsm;
8206         tcp_seq max_seq;
8207         uint32_t maxseg;
8208         int can_split = 0;
8209         int fnd = 0;
8210
8211         maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
8212         max_seq = bbr->rc_tp->snd_una + bbr->rc_tp->snd_wnd;
8213         bbr_log_type_rwnd_collapse(bbr, max_seq, 1, 0);
8214         TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
8215                 /* Find the first seq past or at maxseq */
8216                 if (rsm->r_flags & BBR_RWND_COLLAPSED)
8217                         rsm->r_flags &= ~BBR_RWND_COLLAPSED;
8218                 if (SEQ_GEQ(max_seq, rsm->r_start) &&
8219                     SEQ_GEQ(rsm->r_end, max_seq)) {
8220                         fnd = 1;
8221                         break;
8222                 }
8223         }
8224         bbr->rc_has_collapsed = 0;
8225         if (!fnd) {
8226                 /* Nothing to do, strange */
8227                 return;
8228         }
8229         /*
8230          * Now can we split?
8231          *
8232          * We don't want to split if splitting
8233          * would generate too many small segments,
8234          * lest we let an attacker fragment our
8235          * send_map and run us out of memory.
8236          */
8237         if ((max_seq != rsm->r_start) &&
8238             (max_seq != rsm->r_end)){
8239                 /* can we split? */
8240                 int res1, res2;
8241
8242                 res1 = max_seq - rsm->r_start;
8243                 res2 = rsm->r_end - max_seq;
8244                 if ((res1 >= (maxseg/8)) &&
8245                     (res2 >= (maxseg/8))) {
8246                         /* No small pieces here */
8247                         can_split = 1;
8248                 } else if (bbr->r_ctl.rc_num_small_maps_alloced < bbr_sack_block_limit) {
8249                         /* We are under the limit */
8250                         can_split = 1;
8251                 }
8252         }
8253         /* Ok do we need to split this rsm? */
8254         if (max_seq == rsm->r_start) {
8255                 /* It's this guy, no split required */
8256                 nrsm = rsm;
8257         } else if (max_seq == rsm->r_end) {
8258                 /* It's the next one, no split required. */
8259                 nrsm = TAILQ_NEXT(rsm, r_next);
8260                 if (nrsm == NULL) {
8261                         /* Huh? */
8262                         return;
8263                 }
8264         } else if (can_split && SEQ_LT(max_seq, rsm->r_end)) {
8265                 /* yep we need to split it */
8266                 nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT);
8267                 if (nrsm == NULL) {
8268                         /* failed, XXXrrs what can we do but mark the whole? */
8269                         nrsm = rsm;
8270                         goto no_split;
8271                 }
8272                 /* Clone it */
8273                 bbr_log_type_rwnd_collapse(bbr, max_seq, 3, 0);
8274                 bbr_clone_rsm(bbr, nrsm, rsm, max_seq);
8275                 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
8276                 if (rsm->r_in_tmap) {
8277                         TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
8278                         nrsm->r_in_tmap = 1;
8279                 }
8280         } else {
8281                 /*
8282                  * Split not allowed, just start here and
8283                  * use this guy.
8284                  */
8285                 nrsm = rsm;
8286         }
8287 no_split:
8288         BBR_STAT_INC(bbr_collapsed_win);
8289         /* reuse fnd as a count */
8290         fnd = 0;
8291         TAILQ_FOREACH_FROM(nrsm, &bbr->r_ctl.rc_map, r_next) {
8292                 nrsm->r_flags |= BBR_RWND_COLLAPSED;
8293                 fnd++;
8294                 bbr->rc_has_collapsed = 1;
8295         }
8296         bbr_log_type_rwnd_collapse(bbr, max_seq, 4, fnd);
8297 }
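
/*
 * Illustrative sketch (not part of the original file): the split policy
 * above allows a split only when both resulting pieces are at least
 * maxseg/8 bytes, or when we remain under the small-map allocation
 * limit, so a misbehaving peer cannot shred the send map into tiny
 * entries and exhaust memory.  Hypothetical userland predicate:
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
can_split_rsm(uint32_t r_start, uint32_t r_end, uint32_t max_seq,
    uint32_t maxseg, uint32_t small_maps, uint32_t small_map_limit)
{
        uint32_t res1 = max_seq - r_start;
        uint32_t res2 = r_end - max_seq;

        if (res1 >= (maxseg / 8) && res2 >= (maxseg / 8))
                return (true);          /* no small pieces created */
        return (small_maps < small_map_limit);
}
#endif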
8298
8299 static void
8300 bbr_un_collapse_window(struct tcp_bbr *bbr)
8301 {
8302         struct bbr_sendmap *rsm;
8303         int cleared = 0;
8304
8305         TAILQ_FOREACH_REVERSE(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) {
8306                 if (rsm->r_flags & BBR_RWND_COLLAPSED) {
8307                         /* Clear the flag */
8308                         rsm->r_flags &= ~BBR_RWND_COLLAPSED;
8309                         cleared++;
8310                 } else
8311                         break;
8312         }
8313         bbr_log_type_rwnd_collapse(bbr,
8314                                    (bbr->rc_tp->snd_una + bbr->rc_tp->snd_wnd), 0, cleared);
8315         bbr->rc_has_collapsed = 0;
8316 }
8317
8318 /*
8319  * Return value of 1, the TCB is unlocked and most
8320  * likely gone, return value of 0, the TCB is still
8321  * locked.
8322  */
8323 static int
8324 bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
8325     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
8326     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
8327 {
8328         /*
8329          * Update window information. Don't look at window if no ACK: TACs
8330          * send garbage on first SYN.
8331          */
8332         uint16_t nsegs;
8333         int32_t tfo_syn;
8334         struct tcp_bbr *bbr;
8335
8336         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
8337         INP_WLOCK_ASSERT(tp->t_inpcb);
8338         nsegs = max(1, m->m_pkthdr.lro_nsegs);
8339         if ((thflags & TH_ACK) &&
8340             (SEQ_LT(tp->snd_wl1, th->th_seq) ||
8341             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
8342             (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
8343                 /* keep track of pure window updates */
8344                 if (tlen == 0 &&
8345                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
8346                         KMOD_TCPSTAT_INC(tcps_rcvwinupd);
8347                 tp->snd_wnd = tiwin;
8348                 tp->snd_wl1 = th->th_seq;
8349                 tp->snd_wl2 = th->th_ack;
8350                 if (tp->snd_wnd > tp->max_sndwnd)
8351                         tp->max_sndwnd = tp->snd_wnd;
8352                 bbr->r_wanted_output = 1;
8353         } else if (thflags & TH_ACK) {
8354                 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
8355                         tp->snd_wnd = tiwin;
8356                         tp->snd_wl1 = th->th_seq;
8357                         tp->snd_wl2 = th->th_ack;
8358                 }
8359         }
8360         if (tp->snd_wnd < ctf_outstanding(tp))
8361                 /* The peer collapsed its window on us */
8362                 bbr_collapsed_window(bbr);
8363         else if (bbr->rc_has_collapsed)
8364                 bbr_un_collapse_window(bbr);
8365         /* Was persist timer active and now we have window space? */
8366         if ((bbr->rc_in_persist != 0) &&
8367             (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2),
8368                                 bbr_minseg(bbr)))) {
8369                 /*
8370                  * Make the rate persist at end of persist mode if idle long
8371                  * enough
8372                  */
8373                 bbr_exit_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
8374
8375                 /* Make sure we output to start the timer */
8376                 bbr->r_wanted_output = 1;
8377         }
8378         /* Do we need to enter persist? */
8379         if ((bbr->rc_in_persist == 0) &&
8380             (tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) &&
8381             TCPS_HAVEESTABLISHED(tp->t_state) &&
8382             (tp->snd_max == tp->snd_una) &&
8383             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
8384             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
8385                 /* No send window.. we must enter persist */
8386                 bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
8387         }
8388         if (tp->t_flags2 & TF2_DROP_AF_DATA) {
8389                 m_freem(m);
8390                 return (0);
8391         }
8392         /*
8393          * Process segments with URG.
8394          */
8395         if ((thflags & TH_URG) && th->th_urp &&
8396             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
8397                 /*
8398                  * This is a kludge, but if we receive and accept random
8399                  * urgent pointers, we'll crash in soreceive.  It's hard to
8400                  * imagine someone actually wanting to send this much urgent
8401                  * data.
8402                  */
8403                 SOCKBUF_LOCK(&so->so_rcv);
8404                 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
8405                         th->th_urp = 0; /* XXX */
8406                         thflags &= ~TH_URG;     /* XXX */
8407                         SOCKBUF_UNLOCK(&so->so_rcv);    /* XXX */
8408                         goto dodata;    /* XXX */
8409                 }
8410                 /*
8411                  * If this segment advances the known urgent pointer, then
8412                  * mark the data stream.  This should not happen in
8413                  * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a
8414                  * FIN has been received from the remote side. In these
8415                  * states we ignore the URG.
8416                  *
8417                  * According to RFC961 (Assigned Protocols), the urgent
8418                  * pointer points to the last octet of urgent data.  We
8419                  * continue, however, to consider it to indicate the first
8420                  * octet of data past the urgent section as the original
8421                  * spec states (in one of two places).
8422                  */
8423                 if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
8424                         tp->rcv_up = th->th_seq + th->th_urp;
8425                         so->so_oobmark = sbavail(&so->so_rcv) +
8426                             (tp->rcv_up - tp->rcv_nxt) - 1;
8427                         if (so->so_oobmark == 0)
8428                                 so->so_rcv.sb_state |= SBS_RCVATMARK;
8429                         sohasoutofband(so);
8430                         tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
8431                 }
8432                 SOCKBUF_UNLOCK(&so->so_rcv);
8433                 /*
8434                  * Remove out of band data so doesn't get presented to user.
8435                  * This can happen independent of advancing the URG pointer,
8436                  * but if two URG's are pending at once, some out-of-band
8437                  * data may creep in... ick.
8438                  */
8439                 if (th->th_urp <= (uint32_t)tlen &&
8440                     !(so->so_options & SO_OOBINLINE)) {
8441                         /* hdr drop is delayed */
8442                         tcp_pulloutofband(so, th, m, drop_hdrlen);
8443                 }
8444         } else {
8445                 /*
8446                  * If no out of band data is expected, pull receive urgent
8447                  * pointer along with the receive window.
8448                  */
8449                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
8450                         tp->rcv_up = tp->rcv_nxt;
8451         }
8452 dodata:                         /* XXX */
8453         INP_WLOCK_ASSERT(tp->t_inpcb);
8454
8455         /*
8456          * Process the segment text, merging it into the TCP sequencing
8457          * queue, and arranging for acknowledgment of receipt if necessary.
8458          * This process logically involves adjusting tp->rcv_wnd as data is
8459          * presented to the user (this happens in tcp_usrreq.c, case
8460          * PRU_RCVD).  If a FIN has already been received on this connection
8461          * then we just ignore the text.
8462          */
8463         tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
8464                    IS_FASTOPEN(tp->t_flags));
8465         if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
8466             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
8467                 tcp_seq save_start = th->th_seq;
8468                 tcp_seq save_rnxt  = tp->rcv_nxt;
8469                 int     save_tlen  = tlen;
8470
8471                 m_adj(m, drop_hdrlen);  /* delayed header drop */
8472                 /*
8473                  * Insert segment which includes th into TCP reassembly
8474                  * queue with control block tp.  Set thflags to whether
8475                  * reassembly now includes a segment with FIN.  This handles
8476                  * the common case inline (segment is the next to be
8477                  * received on an established connection, and the queue is
8478                  * empty), avoiding linkage into and removal from the queue
8479                  * and repetition of various conversions. Set DELACK for
8480                  * segments received in order, but ack immediately when
8481                  * segments are out of order (so fast retransmit can work).
8482                  */
8483                 if (th->th_seq == tp->rcv_nxt &&
8484                     SEGQ_EMPTY(tp) &&
8485                     (TCPS_HAVEESTABLISHED(tp->t_state) ||
8486                     tfo_syn)) {
8487 #ifdef NETFLIX_SB_LIMITS
8488                         u_int mcnt, appended;
8489
8490                         if (so->so_rcv.sb_shlim) {
8491                                 mcnt = m_memcnt(m);
8492                                 appended = 0;
8493                                 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
8494                                     CFO_NOSLEEP, NULL) == false) {
8495                                         counter_u64_add(tcp_sb_shlim_fails, 1);
8496                                         m_freem(m);
8497                                         return (0);
8498                                 }
8499                         }
8500
8501 #endif
8502                         if (DELAY_ACK(tp, bbr, nsegs) || tfo_syn) {
8503                                 bbr->bbr_segs_rcvd += max(1, nsegs);
8504                                 tp->t_flags |= TF_DELACK;
8505                                 bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
8506                         } else {
8507                                 bbr->r_wanted_output = 1;
8508                                 tp->t_flags |= TF_ACKNOW;
8509                         }
8510                         tp->rcv_nxt += tlen;
8511                         thflags = th->th_flags & TH_FIN;
8512                         KMOD_TCPSTAT_ADD(tcps_rcvpack, (int)nsegs);
8513                         KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
8514                         SOCKBUF_LOCK(&so->so_rcv);
8515                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
8516                                 m_freem(m);
8517                         else
8518 #ifdef NETFLIX_SB_LIMITS
8519                                 appended =
8520 #endif
8521                                         sbappendstream_locked(&so->so_rcv, m, 0);
8522                         /* NB: sorwakeup_locked() does an implicit unlock. */
8523                         sorwakeup_locked(so);
8524 #ifdef NETFLIX_SB_LIMITS
8525                         if (so->so_rcv.sb_shlim && appended != mcnt)
8526                                 counter_fo_release(so->so_rcv.sb_shlim,
8527                                     mcnt - appended);
8528 #endif
8529                 } else {
8530                         /*
8531                          * XXX: Due to the header drop above "th" is
8532                          * theoretically invalid by now.  Fortunately
8533                          * m_adj() doesn't actually free any mbufs when
8534                          * trimming from the head.
8535                          */
8536                         tcp_seq temp = save_start;
8537                         thflags = tcp_reass(tp, th, &temp, &tlen, m);
8538                         tp->t_flags |= TF_ACKNOW;
8539                 }
8540                 if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) {
8541                         if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
8542                                 /*
8543                                  * DSACK actually handled in the fastpath
8544                                  * above.
8545                                  */
8546                                 tcp_update_sack_list(tp, save_start,
8547                                     save_start + save_tlen);
8548                         } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
8549                                 if ((tp->rcv_numsacks >= 1) &&
8550                                     (tp->sackblks[0].end == save_start)) {
8551                                         /*
8552                                          * Partial overlap, recorded at todrop
8553                                          * above.
8554                                          */
8555                                         tcp_update_sack_list(tp,
8556                                             tp->sackblks[0].start,
8557                                             tp->sackblks[0].end);
8558                                 } else {
8559                                         tcp_update_dsack_list(tp, save_start,
8560                                             save_start + save_tlen);
8561                                 }
8562                         } else if (tlen >= save_tlen) {
8563                                 /* Update of sackblks. */
8564                                 tcp_update_dsack_list(tp, save_start,
8565                                     save_start + save_tlen);
8566                         } else if (tlen > 0) {
8567                                 tcp_update_dsack_list(tp, save_start,
8568                                     save_start + tlen);
8569                         }
8570                 }
8571         } else {
8572                 m_freem(m);
8573                 thflags &= ~TH_FIN;
8574         }
8575
8576         /*
8577          * If FIN is received ACK the FIN and let the user know that the
8578          * connection is closing.
8579          */
8580         if (thflags & TH_FIN) {
8581                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
8582                         socantrcvmore(so);
8583                         /*
8584                          * If connection is half-synchronized (i.e. NEEDSYN
8585                          * flag on) then delay ACK, so it may be piggybacked
8586                          * when SYN is sent. Otherwise, since we received a
8587                          * FIN then no more input can be expected, send ACK
8588                          * now.
8589                          */
8590                         if (tp->t_flags & TF_NEEDSYN) {
8591                                 tp->t_flags |= TF_DELACK;
8592                                 bbr_timer_cancel(bbr,
8593                                     __LINE__, bbr->r_ctl.rc_rcvtime);
8594                         } else {
8595                                 tp->t_flags |= TF_ACKNOW;
8596                         }
8597                         tp->rcv_nxt++;
8598                 }
8599                 switch (tp->t_state) {
8600
8601                         /*
8602                          * In SYN_RECEIVED and ESTABLISHED STATES enter the
8603                          * CLOSE_WAIT state.
8604                          */
8605                 case TCPS_SYN_RECEIVED:
8606                         tp->t_starttime = ticks;
8607                         /* FALLTHROUGH */
8608                 case TCPS_ESTABLISHED:
8609                         tcp_state_change(tp, TCPS_CLOSE_WAIT);
8610                         break;
8611
8612                         /*
8613                          * If still in FIN_WAIT_1 STATE FIN has not been
8614                          * acked so enter the CLOSING state.
8615                          */
8616                 case TCPS_FIN_WAIT_1:
8617                         tcp_state_change(tp, TCPS_CLOSING);
8618                         break;
8619
8620                         /*
8621                          * In FIN_WAIT_2 state enter the TIME_WAIT state,
8622                          * starting the time-wait timer, turning off the
8623                          * other standard timers.
8624                          */
8625                 case TCPS_FIN_WAIT_2:
8626                         bbr->rc_timer_first = 1;
8627                         bbr_timer_cancel(bbr,
8628                             __LINE__, bbr->r_ctl.rc_rcvtime);
8629                         INP_WLOCK_ASSERT(tp->t_inpcb);
8630                         tcp_twstart(tp);
8631                         return (1);
8632                 }
8633         }
8634         /*
8635          * Return any desired output.
8636          */
8637         if ((tp->t_flags & TF_ACKNOW) ||
8638             (sbavail(&so->so_snd) > ctf_outstanding(tp))) {
8639                 bbr->r_wanted_output = 1;
8640         }
8641         INP_WLOCK_ASSERT(tp->t_inpcb);
8642         return (0);
8643 }
8644
8645 /*
8646  * Here nothing is really faster; it's just that we
8647  * have broken out the fast-data path, just like
8648  * the fast-ack. Return 1 if we processed the packet;
8649  * return 0 if the caller needs to take the "slow-path".
8650  */
8651 static int
8652 bbr_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
8653     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
8654     uint32_t tiwin, int32_t nxt_pkt)
8655 {
8656         uint16_t nsegs;
8657         int32_t newsize = 0;    /* automatic sockbuf scaling */
8658         struct tcp_bbr *bbr;
8659 #ifdef NETFLIX_SB_LIMITS
8660         u_int mcnt, appended;
8661 #endif
8662 #ifdef TCPDEBUG
8663         /*
8664          * The size of tcp_saveipgen must be the size of the max ip header,
8665          * currently IPv6.
8666          */
8667         u_char tcp_saveipgen[IP6_HDR_LEN];
8668         struct tcphdr tcp_savetcp;
8669         short ostate = 0;
8670
8671 #endif
8672         /* We are on the hpts, and we would have called output. */
8673         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
8674
8675         /*
8676          * If last ACK falls within this segment's sequence numbers, record
8677          * the timestamp. NOTE that the test is modified according to the
8678          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
8679          */
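        /*
         * Fast-path preconditions: bail to the slow path if we are
         * retransmitting, the advertised window changed, a SYN or FIN
         * is still pending, the timestamp fails PAWS, the ACK is not
         * exactly snd_una, or the data will not fit in the receive
         * buffer.
         */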
8680         if (bbr->r_ctl.rc_resend != NULL) {
8681                 return (0);
8682         }
8683         if (tiwin && tiwin != tp->snd_wnd) {
8684                 return (0);
8685         }
8686         if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
8687                 return (0);
8688         }
8689         if (__predict_false((to->to_flags & TOF_TS) &&
8690             (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
8691                 return (0);
8692         }
8693         if (__predict_false((th->th_ack != tp->snd_una))) {
8694                 return (0);
8695         }
8696         if (__predict_false(tlen > sbspace(&so->so_rcv))) {
8697                 return (0);
8698         }
8699         if ((to->to_flags & TOF_TS) != 0 &&
8700             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
8701                 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
8702                 tp->ts_recent = to->to_tsval;
8703         }
8704         /*
8705          * This is a pure, in-sequence data packet with nothing on the
8706          * reassembly queue and we have enough buffer space to take it.
8707          */
8708         nsegs = max(1, m->m_pkthdr.lro_nsegs);
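        /* LRO may have aggregated several wire segments into this mbuf. */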
8709
8710 #ifdef NETFLIX_SB_LIMITS
8711         if (so->so_rcv.sb_shlim) {
8712                 mcnt = m_memcnt(m);
8713                 appended = 0;
8714                 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
8715                     CFO_NOSLEEP, NULL) == false) {
8716                         counter_u64_add(tcp_sb_shlim_fails, 1);
8717                         m_freem(m);
8718                         return (1);
8719                 }
8720         }
8721 #endif
8722         /* Clean receiver SACK report if present */
8723         if (tp->rcv_numsacks)
8724                 tcp_clean_sackreport(tp);
8725         KMOD_TCPSTAT_INC(tcps_preddat);
8726         tp->rcv_nxt += tlen;
8727         /*
8728          * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
8729          */
8730         tp->snd_wl1 = th->th_seq;
8731         /*
8732          * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
8733          */
8734         tp->rcv_up = tp->rcv_nxt;
8735         KMOD_TCPSTAT_ADD(tcps_rcvpack, (int)nsegs);
8736         KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
8737 #ifdef TCPDEBUG
8738         if (so->so_options & SO_DEBUG)
8739                 tcp_trace(TA_INPUT, ostate, tp,
8740                     (void *)tcp_saveipgen, &tcp_savetcp, 0);
8741 #endif
8742         newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
8743
8744         /* Add data to socket buffer. */
8745         SOCKBUF_LOCK(&so->so_rcv);
8746         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
8747                 m_freem(m);
8748         } else {
8749                 /*
8750                  * Set new socket buffer size. Give up when limit is
8751                  * reached.
8752                  */
8753                 if (newsize)
8754                         if (!sbreserve_locked(&so->so_rcv,
8755                             newsize, so, NULL))
8756                                 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
8757                 m_adj(m, drop_hdrlen);  /* delayed header drop */
8758
8759 #ifdef NETFLIX_SB_LIMITS
8760                 appended =
8761 #endif
8762                         sbappendstream_locked(&so->so_rcv, m, 0);
8763                 ctf_calc_rwin(so, tp);
8764         }
8765         /* NB: sorwakeup_locked() does an implicit unlock. */
8766         sorwakeup_locked(so);
8767 #ifdef NETFLIX_SB_LIMITS
8768         if (so->so_rcv.sb_shlim && mcnt != appended)
8769                 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
8770 #endif
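        /*
         * Either arm a delayed ACK, crediting the received segments
         * toward the delack accounting, or force an immediate ACK and
         * request an output pass.
         */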
8771         if (DELAY_ACK(tp, bbr, nsegs)) {
8772                 bbr->bbr_segs_rcvd += max(1, nsegs);
8773                 tp->t_flags |= TF_DELACK;
8774                 bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
8775         } else {
8776                 bbr->r_wanted_output = 1;
8777                 tp->t_flags |= TF_ACKNOW;
8778         }
8779         return (1);
8780 }
8781
8782 /*
8783  * This subfunction is used to try to highly optimize the
8784  * fast path. We again allow window updates that are
8785  * in sequence to remain in the fast-path. We also add
8786  * in the __predict's to attempt to help the compiler.
8787  * Note that if we return a 0, then we can *not* process
8788  * it and the caller should push the packet into the
8789  * slow-path. If we return 1, then all is well and
8790  * the packet is fully processed.
8791  */
8792 static int
8793 bbr_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
8794     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
8795     uint32_t tiwin, int32_t nxt_pkt)
8796 {
8797         int32_t acked;
8798         uint16_t nsegs;
8799         uint32_t sack_changed;
8800 #ifdef TCPDEBUG
8801         /*
8802          * The size of tcp_saveipgen must be the size of the max ip header,
8803          * currently IPv6.
8804          */
8805         u_char tcp_saveipgen[IP6_HDR_LEN];
8806         struct tcphdr tcp_savetcp;
8807         short ostate = 0;
8808
8809 #endif
8810         uint32_t prev_acked = 0;
8811         struct tcp_bbr *bbr;
8812
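        /*
         * Fast-ack preconditions: the ACK must strictly advance snd_una
         * without passing snd_max, carry a non-zero window, pass PAWS,
         * and find us outside recovery, persist, retransmission and any
         * SACKed state; anything else falls back to the slow path.
         */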
8813         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
8814                 /* Old ack, behind (or duplicate to) the last one rcv'd */
8815                 return (0);
8816         }
8817         if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
8818                 /* Above what we have sent? */
8819                 return (0);
8820         }
8821         if (__predict_false(tiwin == 0)) {
8822                 /* zero window */
8823                 return (0);
8824         }
8825         if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
8826                 /* We still need a SYN or a FIN; unlikely. */
8827                 return (0);
8828         }
8829         if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
8830                 /* Timestamp is behind; an old ack with seq wrap? */
8831                 return (0);
8832         }
8833         if (__predict_false(IN_RECOVERY(tp->t_flags))) {
8834                 /* Still recovering */
8835                 return (0);
8836         }
8837         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
8838         if (__predict_false(bbr->r_ctl.rc_resend != NULL)) {
8839                 /* We are retransmitting */
8840                 return (0);
8841         }
8842         if (__predict_false(bbr->rc_in_persist != 0)) {
8843                 /* In persist mode */
8844                 return (0);
8845         }
8846         if (bbr->r_ctl.rc_sacked) {
8847                 /* We have sack holes on our scoreboard */
8848                 return (0);
8849         }
8850         /* Ok if we reach here, we can process a fast-ack */
8851         nsegs = max(1, m->m_pkthdr.lro_nsegs);
8852         sack_changed = bbr_log_ack(tp, to, th, &prev_acked);
8853         /*
8854          * We never detect loss in fast ack [we can't
8855          * have a sack and can't be in recovery so
8856          * we always pass 0 (nothing detected)].
8857          */
8858         bbr_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime, 0);
8859         /* Did the window get updated? */
8860         if (tiwin != tp->snd_wnd) {
8861                 tp->snd_wnd = tiwin;
8862                 tp->snd_wl1 = th->th_seq;
8863                 if (tp->snd_wnd > tp->max_sndwnd)
8864                         tp->max_sndwnd = tp->snd_wnd;
8865         }
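        /*
         * Persist handling below uses min(rc_high_rwnd / 2, bbr_minseg(bbr))
         * as its threshold: half the largest window the peer has offered,
         * never more than one minimum segment.
         */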
8866         /* Do we need to exit persists? */
8867         if ((bbr->rc_in_persist != 0) &&
8868             (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2),
8869                                bbr_minseg(bbr)))) {
8870                 bbr_exit_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
8871                 bbr->r_wanted_output = 1;
8872         }
8873         /* Do we need to enter persists? */
8874         if ((bbr->rc_in_persist == 0) &&
8875             (tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) &&
8876             TCPS_HAVEESTABLISHED(tp->t_state) &&
8877             (tp->snd_max == tp->snd_una) &&
8878             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
8879             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
8880                 /* No send window; we must enter persist. */
8881                 bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
8882         }
8883         /*
8884          * If last ACK falls within this segment's sequence numbers, record
8885          * the timestamp. NOTE that the test is modified according to the
8886          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
8887          */
8888         if ((to->to_flags & TOF_TS) != 0 &&
8889             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
8890                 tp->ts_recent_age = bbr->r_ctl.rc_rcvtime;
8891                 tp->ts_recent = to->to_tsval;
8892         }
8893         /*
8894          * This is a pure ack for outstanding data.
8895          */
8896         KMOD_TCPSTAT_INC(tcps_predack);
8897
8898         /*
8899          * "bad retransmit" recovery.
8900          */
8901         if (tp->t_flags & TF_PREVVALID) {
8902                 tp->t_flags &= ~TF_PREVVALID;
8903                 if (tp->t_rxtshift == 1 &&
8904                     (int)(ticks - tp->t_badrxtwin) < 0)
8905                         bbr_cong_signal(tp, th, CC_RTO_ERR, NULL);
8906         }
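        /*
         * (t_badrxtwin marks the end of the window in which a returning
         * ACK can still prove the last retransmit spurious; within it,
         * CC_RTO_ERR above rewinds the congestion response.)
         */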
8907         /*
8908          * Recalculate the transmit timer / rtt.
8909          *
8910          * Some boxes send broken timestamp replies during the SYN+ACK
8911          * phase, ignore timestamps of 0 or we could calculate a huge RTT
8912          * and blow up the retransmit timer.
8913          */
8914         acked = BYTES_THIS_ACK(tp, th);
8915
8916 #ifdef TCP_HHOOK
8917         /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
8918         hhook_run_tcp_est_in(tp, th, to);
8919 #endif
8920
8921         KMOD_TCPSTAT_ADD(tcps_rcvackpack, (int)nsegs);
8922         KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
8923         sbdrop(&so->so_snd, acked);
8924
8925         if (SEQ_GT(th->th_ack, tp->snd_una))
8926                 bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp));
8927         tp->snd_una = th->th_ack;
8928         if (tp->snd_wnd < ctf_outstanding(tp))
8929                 /* The peer collapsed its window on us */
8930                 bbr_collapsed_window(bbr);
8931         else if (bbr->rc_has_collapsed)
8932                 bbr_un_collapse_window(bbr);
8933
8934         if (SEQ_GT(tp->snd_una, tp->snd_recover)) {
8935                 tp->snd_recover = tp->snd_una;
8936         }
8937         bbr_ack_received(tp, bbr, th, acked, sack_changed, prev_acked, __LINE__, 0);
8938         /*
8939          * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
8940          */
8941         tp->snd_wl2 = th->th_ack;
8942         m_freem(m);
8943         /*
8944          * If all outstanding data are acked, stop retransmit timer,
8945          * otherwise restart timer using current (possibly backed-off)
8946          * value. If process is waiting for space, wakeup/selwakeup/signal.
8947          * If data are ready to send, let tcp_output decide between more
8948          * output or persist.
8949          */
8950 #ifdef TCPDEBUG
8951         if (so->so_options & SO_DEBUG)
8952                 tcp_trace(TA_INPUT, ostate, tp,
8953                     (void *)tcp_saveipgen,
8954                     &tcp_savetcp, 0);
8955 #endif
8956         /* Wake up the socket if we have room to write more */
8957         sowwakeup(so);
8958         if (tp->snd_una == tp->snd_max) {
8959                 /* Nothing left outstanding */
8960                 bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__);
8961                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
8962                         bbr->rc_tp->t_acktime = 0;
8963                 bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
8964                 if (bbr->rc_in_persist == 0) {
8965                         bbr->r_ctl.rc_went_idle_time = bbr->r_ctl.rc_rcvtime;
8966                 }
8967                 sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una);
8968                 bbr_log_ack_clear(bbr, bbr->r_ctl.rc_rcvtime);
8969                 /*
8970                  * We invalidate the last ack here since we
8971                  * don't want to transfer forward the time
8972                  * for our sum's calculations.
8973                  */
8974                 bbr->r_wanted_output = 1;
8975         }
8976         if (sbavail(&so->so_snd)) {
8977                 bbr->r_wanted_output = 1;
8978         }
8979         return (1);
8980 }
8981
8982 /*
8983  * On a return value of 1, the TCB is unlocked and most
8984  * likely gone; on a return value of 0, the TCB is still
8985  * locked.
8986  */
8987 static int
8988 bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
8989     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
8990     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
8991 {
8992         int32_t todrop;
8993         int32_t ourfinisacked = 0;
8994         struct tcp_bbr *bbr;
8995         int32_t ret_val = 0;
8996
8997         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
8998         ctf_calc_rwin(so, tp);
8999         /*
9000          * If the state is SYN_SENT: if the segment contains an ACK, but
9001          * not for our SYN, drop the input. If it contains a RST, drop the
9002          * connection. If it does not contain a SYN, drop it. Otherwise
9003          * this is an acceptable SYN segment: initialize tp->rcv_nxt and
9004          * tp->irs; if the segment contains an ACK, advance tp->snd_una.
9005          * BBR does not support ECN, so we will not say we are capable.
9006          * If the SYN has been ACKed, change to ESTABLISHED, else to
9007          * SYN_RCVD, and arrange for the segment to be ACKed (eventually).
9008          * Continue processing the rest of the data/controls, beginning with URG.
9009          */
9010         if ((thflags & TH_ACK) &&
9011             (SEQ_LEQ(th->th_ack, tp->iss) ||
9012             SEQ_GT(th->th_ack, tp->snd_max))) {
9013                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9014                 return (1);
9015         }
9016         if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
9017                 TCP_PROBE5(connect__refused, NULL, tp,
9018                     mtod(m, const char *), tp, th);
9019                 tp = tcp_drop(tp, ECONNREFUSED);
9020                 ctf_do_drop(m, tp);
9021                 return (1);
9022         }
9023         if (thflags & TH_RST) {
9024                 ctf_do_drop(m, tp);
9025                 return (1);
9026         }
9027         if (!(thflags & TH_SYN)) {
9028                 ctf_do_drop(m, tp);
9029                 return (1);
9030         }
9031         tp->irs = th->th_seq;
9032         tcp_rcvseqinit(tp);
9033         if (thflags & TH_ACK) {
9034                 int tfo_partial = 0;
9035
9036                 KMOD_TCPSTAT_INC(tcps_connects);
9037                 soisconnected(so);
9038 #ifdef MAC
9039                 mac_socketpeer_set_from_mbuf(m, so);
9040 #endif
9041                 /* Do window scaling on this connection? */
9042                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
9043                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
9044                         tp->rcv_scale = tp->request_r_scale;
9045                 }
9046                 tp->rcv_adv += min(tp->rcv_wnd,
9047                     TCP_MAXWIN << tp->rcv_scale);
9048                 /*
9049                  * If not all the data that was sent in the TFO SYN
9050                  * has been acked, resend the remainder right away.
9051                  */
9052                 if (IS_FASTOPEN(tp->t_flags) &&
9053                     (tp->snd_una != tp->snd_max)) {
9054                         tp->snd_nxt = th->th_ack;
9055                         tfo_partial = 1;
9056                 }
9057                 /*
9058                  * If there's data, delay ACK; if there's also a FIN ACKNOW
9059                  * will be turned on later.
9060                  */
9061                 if (DELAY_ACK(tp, bbr, 1) && tlen != 0 && (tfo_partial == 0)) {
9062                         bbr->bbr_segs_rcvd += 1;
9063                         tp->t_flags |= TF_DELACK;
9064                         bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
9065                 } else {
9066                         bbr->r_wanted_output = 1;
9067                         tp->t_flags |= TF_ACKNOW;
9068                 }
9069                 if (SEQ_GT(th->th_ack, tp->iss)) {
9070                         /*
9071                          * The SYN is acked;
9072                          * handle it specially.
9073                          */
9074                         bbr_log_syn(tp, to);
9075                 }
9076                 if (SEQ_GT(th->th_ack, tp->snd_una)) {
9077                         /*
9078                          * We advance snd_una for the
9079                          * fast open case. If th_ack is
9080                          * acknowledging data beyond
9081                          * snd_una we can't just call
9082                          * ack-processing since the
9083                          * data stream in our send-map
9084                          * will start at snd_una + 1 (one
9085                          * beyond the SYN). If it's just
9086                          * equal we don't need to do that
9087                          * and there is no send_map.
9088                          */
9089                         tp->snd_una++;
9090                 }
9091                 /*
9092                  * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
9093                  * SYN_SENT --> ESTABLISHED, SYN_SENT* --> FIN_WAIT_1.
9094                  */
9095                 tp->t_starttime = ticks;
9096                 if (tp->t_flags & TF_NEEDFIN) {
9097                         tcp_state_change(tp, TCPS_FIN_WAIT_1);
9098                         tp->t_flags &= ~TF_NEEDFIN;
9099                         thflags &= ~TH_SYN;
9100                 } else {
9101                         tcp_state_change(tp, TCPS_ESTABLISHED);
9102                         TCP_PROBE5(connect__established, NULL, tp,
9103                             mtod(m, const char *), tp, th);
9104                         cc_conn_init(tp);
9105                 }
9106         } else {
9107                 /*
9108                  * Received initial SYN in SYN-SENT[*] state => simultaneous
9109                  * open.  If the segment contains a CC option and there is a
9110                  * cached CC, apply the TAO test. If it succeeds, the connection
9111                  * is half-synchronized. Otherwise, do the 3-way handshake:
9112                  * SYN-SENT -> SYN-RECEIVED, SYN-SENT* -> SYN-RECEIVED*. If
9113                  * there was no CC option, clear the cached CC value.
9114                  */
9115                 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
9116                 tcp_state_change(tp, TCPS_SYN_RECEIVED);
9117         }
9118         INP_WLOCK_ASSERT(tp->t_inpcb);
9119         /*
9120          * Advance th->th_seq to correspond to first data byte. If data,
9121          * trim to stay within window, dropping FIN if necessary.
9122          */
9123         th->th_seq++;
9124         if (tlen > tp->rcv_wnd) {
9125                 todrop = tlen - tp->rcv_wnd;
9126                 m_adj(m, -todrop);
9127                 tlen = tp->rcv_wnd;
9128                 thflags &= ~TH_FIN;
9129                 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin);
9130                 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
9131         }
9132         tp->snd_wl1 = th->th_seq - 1;
9133         tp->rcv_up = th->th_seq;
9134         /*
9135          * Client side of transaction: already sent SYN and data. If the
9136          * remote host used T/TCP to validate the SYN, our data will be
9137          * ACK'd; if so, enter normal data segment processing in the middle
9138          * of step 5, ack processing. Otherwise, goto step 6.
9139          */
9140         if (thflags & TH_ACK) {
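                /*
                 * A timestamp echo lets us seed the RTT machinery from the
                 * handshake itself: rtt = t - to->to_tsecr in msec ticks,
                 * clamped to at least 1 and scaled to usec by MS_IN_USEC.
                 * For example, t = 5030 and tsecr = 5000 would yield
                 * 30 msec = 30000 usec for the xmit timer and the rttprop
                 * filter.
                 */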
9141                 if ((to->to_flags & TOF_TS) != 0) {
9142                         uint32_t t, rtt;
9143
9144                         t = tcp_tv_to_mssectick(&bbr->rc_tv);
9145                         if (TSTMP_GEQ(t, to->to_tsecr)) {
9146                                 rtt = t - to->to_tsecr;
9147                                 if (rtt == 0) {
9148                                         rtt = 1;
9149                                 }
9150                                 rtt *= MS_IN_USEC;
9151                                 tcp_bbr_xmit_timer(bbr, rtt, 0, 0, 0);
9152                                 apply_filter_min_small(&bbr->r_ctl.rc_rttprop,
9153                                                        rtt, bbr->r_ctl.rc_rcvtime);
9154                         }
9155                 }
9156                 if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
9157                         return (ret_val);
9158                 /* We may have changed to FIN_WAIT_1 above */
9159                 if (tp->t_state == TCPS_FIN_WAIT_1) {
9160                         /*
9161                          * In FIN_WAIT_1 STATE in addition to the processing
9162                          * for the ESTABLISHED state if our FIN is now
9163                          * acknowledged then enter FIN_WAIT_2.
9164                          */
9165                         if (ourfinisacked) {
9166                                 /*
9167                                  * If we can't receive any more data, then
9168                                  * closing user can proceed. Starting the
9169                                  * timer is contrary to the specification,
9170                                  * but if we don't get a FIN we'll hang
9171                                  * forever.
9172                                  *
9173                                  * XXXjl: we should release the tp also, and
9174                                  * use a compressed state.
9175                                  */
9176                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
9177                                         soisdisconnected(so);
9178                                         tcp_timer_activate(tp, TT_2MSL,
9179                                             (tcp_fast_finwait2_recycle ?
9180                                             tcp_finwait2_timeout :
9181                                             TP_MAXIDLE(tp)));
9182                                 }
9183                                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
9184                         }
9185                 }
9186         }
9187         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9188             tiwin, thflags, nxt_pkt));
9189 }
9190
9191 /*
9192  * On a return value of 1, the TCB is unlocked and most
9193  * likely gone; on a return value of 0, the TCB is still
9194  * locked.
9195  */
9196 static int
9197 bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
9198                 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9199                 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
9200 {
9201         int32_t ourfinisacked = 0;
9202         int32_t ret_val;
9203         struct tcp_bbr *bbr;
9204
9205         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
9206         ctf_calc_rwin(so, tp);
9207         if ((thflags & TH_ACK) &&
9208             (SEQ_LEQ(th->th_ack, tp->snd_una) ||
9209              SEQ_GT(th->th_ack, tp->snd_max))) {
9210                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9211                 return (1);
9212         }
9213         if (IS_FASTOPEN(tp->t_flags)) {
9214                 /*
9215                  * When a TFO connection is in SYN_RECEIVED, the only valid
9216                  * packets are the initial SYN, a retransmit/copy of the
9217                  * initial SYN (possibly with a subset of the original
9218                  * data), a valid ACK, a FIN, or a RST.
9219                  */
9220                 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
9221                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9222                         return (1);
9223                 } else if (thflags & TH_SYN) {
9224                         /* non-initial SYN is ignored */
9225                         if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
9226                             (bbr->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
9227                             (bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
9228                                 ctf_do_drop(m, NULL);
9229                                 return (0);
9230                         }
9231                 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
9232                         ctf_do_drop(m, NULL);
9233                         return (0);
9234                 }
9235         }
9236         if ((thflags & TH_RST) ||
9237             (tp->t_fin_is_rst && (thflags & TH_FIN)))
9238                 return (ctf_process_rst(m, th, so, tp));
9239         /*
9240          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
9241          * it's less than ts_recent, drop it.
9242          */
9243         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
9244             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
9245                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
9246                         return (ret_val);
9247         }
9248         /*
9249          * In the SYN-RECEIVED state, validate that the packet belongs to
9250          * this connection before trimming the data to fit the receive
9251          * window.  Check the sequence number versus IRS since we know the
9252          * sequence numbers haven't wrapped.  This is a partial fix for the
9253          * "LAND" DoS attack.
9254          */
9255         if (SEQ_LT(th->th_seq, tp->irs)) {
9256                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9257                 return (1);
9258         }
9259         INP_WLOCK_ASSERT(tp->t_inpcb);
9260         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
9261                 return (ret_val);
9262         }
9263         /*
9264          * If last ACK falls within this segment's sequence numbers, record
9265          * its timestamp. NOTE: 1) That the test incorporates suggestions
9266          * from the latest proposal of the tcplw@cray.com list (Braden
9267          * 1993/04/26). 2) That updating only on newer timestamps interferes
9268          * with our earlier PAWS tests, so this check should be solely
9269          * predicated on the sequence space of this segment. 3) That we
9270          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
9271          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
9272          * SEG.Len. This modified check allows us to overcome RFC1323's
9273          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
9274          * p.869. In such cases, we can still calculate the RTT correctly
9275          * when RCV.NXT == Last.ACK.Sent.
9276          */
9277         if ((to->to_flags & TOF_TS) != 0 &&
9278             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
9279             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
9280                     ((thflags & (TH_SYN | TH_FIN)) != 0))) {
9281                 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
9282                 tp->ts_recent = to->to_tsval;
9283         }
9284         tp->snd_wnd = tiwin;
9285         /*
9286          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
9287          * is on (half-synchronized state), then queue data for later
9288          * processing; else drop segment and return.
9289          */
9290         if ((thflags & TH_ACK) == 0) {
9291                 if (IS_FASTOPEN(tp->t_flags)) {
9292                         cc_conn_init(tp);
9293                 }
9294                 return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9295                                          tiwin, thflags, nxt_pkt));
9296         }
9297         KMOD_TCPSTAT_INC(tcps_connects);
9298         soisconnected(so);
9299         /* Do window scaling? */
9300         if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
9301             (TF_RCVD_SCALE | TF_REQ_SCALE)) {
9302                 tp->rcv_scale = tp->request_r_scale;
9303         }
9304         /*
9305          * OK, for the first time in, let's see if we can use the
9306          * timestamp to figure out what the initial RTT was.
9307          */
9308         if ((to->to_flags & TOF_TS) != 0) {
9309                 uint32_t t, rtt;
9310
9311                 t = tcp_tv_to_mssectick(&bbr->rc_tv);
9312                 if (TSTMP_GEQ(t, to->to_tsecr)) {
9313                         rtt = t - to->to_tsecr;
9314                         if (rtt == 0) {
9315                                 rtt = 1;
9316                         }
9317                         rtt *= MS_IN_USEC;
9318                         tcp_bbr_xmit_timer(bbr, rtt, 0, 0, 0);
9319                         apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, bbr->r_ctl.rc_rcvtime);
9320                 }
9321         }
9322         /* Drop off any SYN in the send map (probably not there)  */
9323         if (thflags & TH_ACK)
9324                 bbr_log_syn(tp, to);
9325         if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
9326
9327                 tcp_fastopen_decrement_counter(tp->t_tfo_pending);
9328                 tp->t_tfo_pending = NULL;
9329         }
9330         /*
9331          * Make transitions: SYN-RECEIVED -> ESTABLISHED, SYN-RECEIVED* ->
9332          * FIN-WAIT-1.
9333          */
9334         tp->t_starttime = ticks;
9335         if (tp->t_flags & TF_NEEDFIN) {
9336                 tcp_state_change(tp, TCPS_FIN_WAIT_1);
9337                 tp->t_flags &= ~TF_NEEDFIN;
9338         } else {
9339                 tcp_state_change(tp, TCPS_ESTABLISHED);
9340                 TCP_PROBE5(accept__established, NULL, tp,
9341                            mtod(m, const char *), tp, th);
9342                 /*
9343                  * TFO connections call cc_conn_init() during SYN
9344                  * processing.  Calling it again here for such connections
9345                  * is not harmless as it would undo the snd_cwnd reduction
9346                  * that occurs when a TFO SYN|ACK is retransmitted.
9347                  */
9348                 if (!IS_FASTOPEN(tp->t_flags))
9349                         cc_conn_init(tp);
9350         }
9351         /*
9352          * Account for the ACK of our SYN prior to
9353          * regular ACK processing below, except for
9354          * simultaneous SYN, which is handled later.
9355          */
9356         if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN))
9357                 tp->snd_una++;
9358         /*
9359          * If segment contains data or ACK, will call tcp_reass() later; if
9360          * not, do so now to pass queued data to user.
9361          */
9362         if (tlen == 0 && (thflags & TH_FIN) == 0)
9363                 (void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
9364                         (struct mbuf *)0);
9365         tp->snd_wl1 = th->th_seq - 1;
9366         if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
9367                 return (ret_val);
9368         }
9369         if (tp->t_state == TCPS_FIN_WAIT_1) {
9370                 /* We could have gone to FIN_WAIT_1 (or EST) above */
9371                 /*
9372                  * In FIN_WAIT_1 STATE in addition to the processing for the
9373                  * ESTABLISHED state if our FIN is now acknowledged then
9374                  * enter FIN_WAIT_2.
9375                  */
9376                 if (ourfinisacked) {
9377                         /*
9378                          * If we can't receive any more data, then closing
9379                          * user can proceed. Starting the timer is contrary
9380                          * to the specification, but if we don't get a FIN
9381                          * we'll hang forever.
9382                          *
9383                          * XXXjl: we should release the tp also, and use a
9384                          * compressed state.
9385                          */
9386                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
9387                                 soisdisconnected(so);
9388                                 tcp_timer_activate(tp, TT_2MSL,
9389                                                    (tcp_fast_finwait2_recycle ?
9390                                                     tcp_finwait2_timeout :
9391                                                     TP_MAXIDLE(tp)));
9392                         }
9393                         tcp_state_change(tp, TCPS_FIN_WAIT_2);
9394                 }
9395         }
9396         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9397                                  tiwin, thflags, nxt_pkt));
9398 }
9399
9400 /*
9401  * On a return value of 1, the TCB is unlocked and most
9402  * likely gone; on a return value of 0, the TCB is still
9403  * locked.
9404  */
9405 static int
9406 bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
9407     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9408     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
9409 {
9410         struct tcp_bbr *bbr;
9411         int32_t ret_val;
9412
9413         /*
9414          * Header prediction: check for the two common cases of a
9415          * uni-directional data xfer.  If the packet has no control flags,
9416          * is in-sequence, the window didn't change and we're not
9417          * retransmitting, it's a candidate.  If the length is zero and the
9418          * ack moved forward, we're the sender side of the xfer.  Just free
9419          * the data acked & wake any higher level process that was blocked
9420          * waiting for space.  If the length is non-zero and the ack didn't
9421          * move, we're the receiver side.  If we're getting packets in-order
9422          * (the reassembly queue is empty), add the data to the socket
9423          * buffer and note that we need a delayed ack. Make sure that the
9424          * hidden state-flags are also off. Since we check for
9425          * TCPS_ESTABLISHED first, it can only be TF_NEEDSYN.
9426          */
9427         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
9428         if (bbr->r_ctl.rc_delivered < (4 * tp->t_maxseg)) {
9429                 /*
9430                  * If we have delivered under 4 segments, increase the initial
9431                  * window if the peer raised it. We use this to determine
9432                  * dynamic and static rwnds at the end of a connection.
9433                  */
9434                 bbr->r_ctl.rc_init_rwnd = max(tiwin, tp->snd_wnd);
9435         }
9436         if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
9437             __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) &&
9438             __predict_true(SEGQ_EMPTY(tp)) &&
9439             __predict_true(th->th_seq == tp->rcv_nxt)) {
9440                 if (tlen == 0) {
9441                         if (bbr_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
9442                             tiwin, nxt_pkt)) {
9443                                 return (0);
9444                         }
9445                 } else {
9446                         if (bbr_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
9447                             tiwin, nxt_pkt)) {
9448                                 return (0);
9449                         }
9450                 }
9451         }
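        /* Neither fast path consumed the segment; take the slow path. */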
9452         ctf_calc_rwin(so, tp);
9453
9454         if ((thflags & TH_RST) ||
9455             (tp->t_fin_is_rst && (thflags & TH_FIN)))
9456                 return (ctf_process_rst(m, th, so, tp));
9457         /*
9458          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
9459          * synchronized state.
9460          */
9461         if (thflags & TH_SYN) {
9462                 ctf_challenge_ack(m, th, tp, &ret_val);
9463                 return (ret_val);
9464         }
9465         /*
9466          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
9467          * it's less than ts_recent, drop it.
9468          */
9469         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
9470             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
9471                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
9472                         return (ret_val);
9473         }
9474         INP_WLOCK_ASSERT(tp->t_inpcb);
9475         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
9476                 return (ret_val);
9477         }
9478         /*
9479          * If last ACK falls within this segment's sequence numbers, record
9480          * its timestamp. NOTE: 1) That the test incorporates suggestions
9481          * from the latest proposal of the tcplw@cray.com list (Braden
9482          * 1993/04/26). 2) That updating only on newer timestamps interferes
9483          * with our earlier PAWS tests, so this check should be solely
9484          * predicated on the sequence space of this segment. 3) That we
9485          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
9486          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
9487          * SEG.Len. This modified check allows us to overcome RFC1323's
9488          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
9489          * p.869. In such cases, we can still calculate the RTT correctly
9490          * when RCV.NXT == Last.ACK.Sent.
9491          */
9492         if ((to->to_flags & TOF_TS) != 0 &&
9493             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
9494             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
9495             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
9496                 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
9497                 tp->ts_recent = to->to_tsval;
9498         }
9499         /*
9500          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
9501          * is on (half-synchronized state), then queue data for later
9502          * processing; else drop segment and return.
9503          */
9504         if ((thflags & TH_ACK) == 0) {
9505                 if (tp->t_flags & TF_NEEDSYN) {
9506                         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9507                             tiwin, thflags, nxt_pkt));
9508                 } else if (tp->t_flags & TF_ACKNOW) {
9509                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
9510                         bbr->r_wanted_output = 1;
9511                         return (ret_val);
9512                 } else {
9513                         ctf_do_drop(m, NULL);
9514                         return (0);
9515                 }
9516         }
9517         /*
9518          * Ack processing.
9519          */
9520         if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
9521                 return (ret_val);
9522         }
9523         if (sbavail(&so->so_snd)) {
9524                 if (bbr_progress_timeout_check(bbr)) {
9525                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9526                         return (1);
9527                 }
9528         }
9529         /* State changes only happen in bbr_process_data() */
9530         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9531             tiwin, thflags, nxt_pkt));
9532 }
9533
9534 /*
9535  * On a return value of 1, the TCB is unlocked and most
9536  * likely gone; on a return value of 0, the TCB is still
9537  * locked.
9538  */
9539 static int
9540 bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
9541     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9542     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
9543 {
9544         struct tcp_bbr *bbr;
9545         int32_t ret_val;
9546
9547         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
9548         ctf_calc_rwin(so, tp);
9549         if ((thflags & TH_RST) ||
9550             (tp->t_fin_is_rst && (thflags & TH_FIN)))
9551                 return (ctf_process_rst(m, th, so, tp));
9552         /*
9553          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
9554          * synchronized state.
9555          */
9556         if (thflags & TH_SYN) {
9557                 ctf_challenge_ack(m, th, tp, &ret_val);
9558                 return (ret_val);
9559         }
9560         /*
9561          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
9562          * it's less than ts_recent, drop it.
9563          */
9564         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
9565             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
9566                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
9567                         return (ret_val);
9568         }
9569         INP_WLOCK_ASSERT(tp->t_inpcb);
9570         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
9571                 return (ret_val);
9572         }
9573         /*
9574          * If last ACK falls within this segment's sequence numbers, record
9575          * its timestamp. NOTE: 1) That the test incorporates suggestions
9576          * from the latest proposal of the tcplw@cray.com list (Braden
9577          * 1993/04/26). 2) That updating only on newer timestamps interferes
9578          * with our earlier PAWS tests, so this check should be solely
9579          * predicated on the sequence space of this segment. 3) That we
9580          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
9581          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
9582          * SEG.Len. This modified check allows us to overcome RFC1323's
9583          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
9584          * p.869. In such cases, we can still calculate the RTT correctly
9585          * when RCV.NXT == Last.ACK.Sent.
9586          */
9587         if ((to->to_flags & TOF_TS) != 0 &&
9588             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
9589             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
9590             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
9591                 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
9592                 tp->ts_recent = to->to_tsval;
9593         }
9594         /*
9595          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
9596          * is on (half-synchronized state), then queue data for later
9597          * processing; else drop segment and return.
9598          */
9599         if ((thflags & TH_ACK) == 0) {
9600                 if (tp->t_flags & TF_NEEDSYN) {
9601                         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9602                             tiwin, thflags, nxt_pkt));
9603                 } else if (tp->t_flags & TF_ACKNOW) {
9604                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
9605                         bbr->r_wanted_output = 1;
9606                         return (ret_val);
9607                 } else {
9608                         ctf_do_drop(m, NULL);
9609                         return (0);
9610                 }
9611         }
9612         /*
9613          * Ack processing.
9614          */
9615         if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
9616                 return (ret_val);
9617         }
9618         if (sbavail(&so->so_snd)) {
9619                 if (bbr_progress_timeout_check(bbr)) {
9620                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9621                         return (1);
9622                 }
9623         }
9624         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9625             tiwin, thflags, nxt_pkt));
9626 }
9627
9628 static int
9629 bbr_check_data_after_close(struct mbuf *m, struct tcp_bbr *bbr,
9630     struct tcpcb *tp, int32_t * tlen, struct tcphdr *th, struct socket *so)
9631 {
9632
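        /*
         * Data arrived after the socket has been closed.  Unless data
         * after close is allowed and we still have unsent data queued,
         * close the connection and answer with a reset; otherwise the
         * data is swallowed (TF2_DROP_AF_DATA) and output is scheduled
         * so a followup reset can go out once our data is acked.
         */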
9633         if (bbr->rc_allow_data_af_clo == 0) {
9634 close_now:
9635                 tp = tcp_close(tp);
9636                 KMOD_TCPSTAT_INC(tcps_rcvafterclose);
9637                 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
9638                 return (1);
9639         }
9640         if (sbavail(&so->so_snd) == 0)
9641                 goto close_now;
9642         /* Ok, we allow the data (it is ignored) and a followup reset */
9643         tp->rcv_nxt = th->th_seq + *tlen;
9644         tp->t_flags2 |= TF2_DROP_AF_DATA;
9645         bbr->r_wanted_output = 1;
9646         *tlen = 0;
9647         return (0);
9648 }
9649
9650 /*
9651  * On a return value of 1, the TCB is unlocked and most
9652  * likely gone; on a return value of 0, the TCB is still
9653  * locked.
9654  */
9655 static int
9656 bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
9657     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9658     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
9659 {
9660         int32_t ourfinisacked = 0;
9661         int32_t ret_val;
9662         struct tcp_bbr *bbr;
9663
9664         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
9665         ctf_calc_rwin(so, tp);
9666         if ((thflags & TH_RST) ||
9667             (tp->t_fin_is_rst && (thflags & TH_FIN)))
9668                 return (ctf_process_rst(m, th, so, tp));
9669         /*
9670          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
9671          * synchronized state.
9672          */
9673         if (thflags & TH_SYN) {
9674                 ctf_challenge_ack(m, th, tp, &ret_val);
9675                 return (ret_val);
9676         }
9677         /*
9678          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
9679          * it's less than ts_recent, drop it.
9680          */
9681         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
9682             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
9683                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
9684                         return (ret_val);
9685         }
9686         INP_WLOCK_ASSERT(tp->t_inpcb);
9687         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
9688                 return (ret_val);
9689         }
9690         /*
9691          * If new data are received on a connection after the user processes
9692          * are gone, then RST the other end.
9693          */
9694         if ((so->so_state & SS_NOFDREF) && tlen) {
9695                 /*
9696                  * We call a separate function so we may continue and set up
9697                  * to send a reset once all data has been acked.
9698                  */
9699                 if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so))
9700                         return (1);
9701         }
9702         /*
9703          * If last ACK falls within this segment's sequence numbers, record
9704          * its timestamp. NOTE: 1) That the test incorporates suggestions
9705          * from the latest proposal of the tcplw@cray.com list (Braden
9706          * 1993/04/26). 2) That updating only on newer timestamps interferes
9707          * with our earlier PAWS tests, so this check should be solely
9708          * predicated on the sequence space of this segment. 3) That we
9709          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
9710          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
9711          * SEG.Len. This modified check allows us to overcome RFC1323's
9712          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
9713          * p.869. In such cases, we can still calculate the RTT correctly
9714          * when RCV.NXT == Last.ACK.Sent.
9715          */
9716         if ((to->to_flags & TOF_TS) != 0 &&
9717             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
9718             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
9719             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
9720                 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
9721                 tp->ts_recent = to->to_tsval;
9722         }
9723         /*
9724          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
9725          * is on (half-synchronized state), then queue data for later
9726          * processing; else drop segment and return.
9727          */
9728         if ((thflags & TH_ACK) == 0) {
9729                 if (tp->t_flags & TF_NEEDSYN) {
9730                         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9731                             tiwin, thflags, nxt_pkt));
9732                 } else if (tp->t_flags & TF_ACKNOW) {
9733                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
9734                         bbr->r_wanted_output = 1;
9735                         return (ret_val);
9736                 } else {
9737                         ctf_do_drop(m, NULL);
9738                         return (0);
9739                 }
9740         }
9741         /*
9742          * Ack processing.
9743          */
9744         if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
9745                 return (ret_val);
9746         }
9747         if (ourfinisacked) {
9748                 /*
9749                  * If we can't receive any more data, then closing user can
9750                  * proceed. Starting the timer is contrary to the
9751                  * specification, but if we don't get a FIN we'll hang
9752                  * forever.
9753                  *
9754                  * XXXjl: we should release the tp also, and use a
9755                  * compressed state.
9756                  */
9757                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
9758                         soisdisconnected(so);
9759                         tcp_timer_activate(tp, TT_2MSL,
9760                             (tcp_fast_finwait2_recycle ?
9761                             tcp_finwait2_timeout :
9762                             TP_MAXIDLE(tp)));
9763                 }
9764                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
9765         }
9766         if (sbavail(&so->so_snd)) {
9767                 if (bbr_progress_timeout_check(bbr)) {
9768                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9769                         return (1);
9770                 }
9771         }
9772         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9773             tiwin, thflags, nxt_pkt));
9774 }
9775
9776 /*
9777  * On a return value of 1, the TCB is unlocked and most
9778  * likely gone; on a return value of 0, the TCB is still
9779  * locked.
9780  */
9781 static int
9782 bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
9783     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9784     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
9785 {
9786         int32_t ourfinisacked = 0;
9787         int32_t ret_val;
9788         struct tcp_bbr *bbr;
9789
9790         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
9791         ctf_calc_rwin(so, tp);
9792         if ((thflags & TH_RST) ||
9793             (tp->t_fin_is_rst && (thflags & TH_FIN)))
9794                 return (ctf_process_rst(m, th, so, tp));
9795         /*
9796          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
9797          * synchronized state.
9798          */
9799         if (thflags & TH_SYN) {
9800                 ctf_challenge_ack(m, th, tp, &ret_val);
9801                 return (ret_val);
9802         }
9803         /*
9804          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
9805          * it's less than ts_recent, drop it.
9806          */
9807         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
9808             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
9809                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
9810                         return (ret_val);
9811         }
9812         INP_WLOCK_ASSERT(tp->t_inpcb);
9813         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
9814                 return (ret_val);
9815         }
9816         /*
9817          * If new data are received on a connection after the user processes
9818          * are gone, then RST the other end.
9819          */
9820         if ((so->so_state & SS_NOFDREF) && tlen) {
9821                 /*
9822                  * We call a new function now so we might continue and set up
9823                  * to reset once all data has been ack'd.
9824                  */
9825                 if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so))
9826                         return (1);
9827         }
9828         /*
9829          * If last ACK falls within this segment's sequence numbers, record
9830          * its timestamp. NOTE: 1) That the test incorporates suggestions
9831          * from the latest proposal of the tcplw@cray.com list (Braden
9832          * 1993/04/26). 2) That updating only on newer timestamps interferes
9833          * with our earlier PAWS tests, so this check should be solely
9834          * predicated on the sequence space of this segment. 3) That we
9835          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
9836          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
9837          * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
9838          * SEG.Len. This modified check allows us to overcome RFC1323's
9839          * p.869. In such cases, we can still calculate the RTT correctly
9840          * when RCV.NXT == Last.ACK.Sent.
9841          */
9842         if ((to->to_flags & TOF_TS) != 0 &&
9843             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
9844             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
9845             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
9846                 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
9847                 tp->ts_recent = to->to_tsval;
9848         }
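        /*
         * Illustrative example of why the modified check matters: for a
         * pure ACK carrying no data (SEG.Len == 0) with SEG.SEQ equal to
         * Last.ACK.Sent, RFC1323's strict Last.ACK.Sent < SEG.SEQ + SEG.Len
         * fails, while Last.ACK.Sent <= SEG.SEQ + SEG.Len succeeds, so
         * ts_recent is still refreshed and the RTT can be measured when
         * RCV.NXT == Last.ACK.Sent.
         */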
9849         /*
9850          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
9851          * is on (half-synchronized state), then queue data for later
9852          * processing; else drop segment and return.
9853          */
9854         if ((thflags & TH_ACK) == 0) {
9855                 if (tp->t_flags & TF_NEEDSYN) {
9856                         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9857                             tiwin, thflags, nxt_pkt));
9858                 } else if (tp->t_flags & TF_ACKNOW) {
9859                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
9860                         bbr->r_wanted_output = 1;
9861                         return (ret_val);
9862                 } else {
9863                         ctf_do_drop(m, NULL);
9864                         return (0);
9865                 }
9866         }
9867         /*
9868          * Ack processing.
9869          */
9870         if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
9871                 return (ret_val);
9872         }
9873         if (ourfinisacked) {
9874                 tcp_twstart(tp);
9875                 m_freem(m);
9876                 return (1);
9877         }
9878         if (sbavail(&so->so_snd)) {
9879                 if (bbr_progress_timeout_check(bbr)) {
9880                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9881                         return (1);
9882                 }
9883         }
9884         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9885             tiwin, thflags, nxt_pkt));
9886 }
9887
9888 /*
9889  * A return value of 1 means the TCB is unlocked and most
9890  * likely gone; a return value of 0 means the TCB is still
9891  * locked.
9892  */
9893 static int
9894 bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
9895     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9896     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
9897 {
9898         int32_t ourfinisacked = 0;
9899         int32_t ret_val;
9900         struct tcp_bbr *bbr;
9901
9902         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
9903         ctf_calc_rwin(so, tp);
9904         if ((thflags & TH_RST) ||
9905             (tp->t_fin_is_rst && (thflags & TH_FIN)))
9906                 return (ctf_process_rst(m, th, so, tp));
9907         /*
9908          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
9909          * synchronized state.
9910          */
9911         if (thflags & TH_SYN) {
9912                 ctf_challenge_ack(m, th, tp, &ret_val);
9913                 return (ret_val);
9914         }
9915         /*
9916          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
9917          * it's less than ts_recent, drop it.
9918          */
9919         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
9920             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
9921                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
9922                         return (ret_val);
9923         }
9924         INP_WLOCK_ASSERT(tp->t_inpcb);
9925         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
9926                 return (ret_val);
9927         }
9928         /*
9929          * If new data are received on a connection after the user processes
9930          * are gone, then RST the other end.
9931          */
9932         if ((so->so_state & SS_NOFDREF) && tlen) {
9933                 /*
9934                  * We call a new function now so we might continue and set up
9935                  * to reset once all data has been ack'd.
9936                  */
9937                 if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so))
9938                         return (1);
9939         }
9940         /*
9941          * If last ACK falls within this segment's sequence numbers, record
9942          * its timestamp. NOTE: 1) That the test incorporates suggestions
9943          * from the latest proposal of the tcplw@cray.com list (Braden
9944          * 1993/04/26). 2) That updating only on newer timestamps interferes
9945          * with our earlier PAWS tests, so this check should be solely
9946          * predicated on the sequence space of this segment. 3) That we
9947          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
9948          * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
9949          * SEG.Len. This modified check allows us to overcome RFC1323's
9950          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
9951          * p.869. In such cases, we can still calculate the RTT correctly
9952          * when RCV.NXT == Last.ACK.Sent.
9953          */
9954         if ((to->to_flags & TOF_TS) != 0 &&
9955             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
9956             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
9957             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
9958                 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
9959                 tp->ts_recent = to->to_tsval;
9960         }
9961         /*
9962          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
9963          * is on (half-synchronized state), then queue data for later
9964          * processing; else drop segment and return.
9965          */
9966         if ((thflags & TH_ACK) == 0) {
9967                 if (tp->t_flags & TF_NEEDSYN) {
9968                         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9969                             tiwin, thflags, nxt_pkt));
9970                 } else if (tp->t_flags & TF_ACKNOW) {
9971                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
9972                         bbr->r_wanted_output = 1;
9973                         return (ret_val);
9974                 } else {
9975                         ctf_do_drop(m, NULL);
9976                         return (0);
9977                 }
9978         }
9979         /*
9980          * Ack processing (LAST_ACK).
9981          */
9982         if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
9983                 return (ret_val);
9984         }
9985         if (ourfinisacked) {
9986                 tp = tcp_close(tp);
9987                 ctf_do_drop(m, tp);
9988                 return (1);
9989         }
9990         if (sbavail(&so->so_snd)) {
9991                 if (bbr_progress_timeout_check(bbr)) {
9992                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9993                         return (1);
9994                 }
9995         }
9996         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
9997             tiwin, thflags, nxt_pkt));
9998 }
9999
10000
10001 /*
10002  * A return value of 1 means the TCB is unlocked and most
10003  * likely gone; a return value of 0 means the TCB is still
10004  * locked.
10005  */
10006 static int
10007 bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
10008     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
10009     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
10010 {
10011         int32_t ourfinisacked = 0;
10012         int32_t ret_val;
10013         struct tcp_bbr *bbr;
10014
10015         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
10016         ctf_calc_rwin(so, tp);
10018         if ((thflags & TH_RST) ||
10019             (tp->t_fin_is_rst && (thflags & TH_FIN)))
10020                 return (ctf_process_rst(m, th, so, tp));
10021
10022         /*
10023          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
10024          * synchronized state.
10025          */
10026         if (thflags & TH_SYN) {
10027                 ctf_challenge_ack(m, th, tp, &ret_val);
10028                 return (ret_val);
10029         }
10030         INP_WLOCK_ASSERT(tp->t_inpcb);
10031         /*
10032          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
10033          * it's less than ts_recent, drop it.
10034          */
10035         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
10036             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
10037                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
10038                         return (ret_val);
10039         }
10040         INP_WLOCK_ASSERT(tp->t_inpcb);
10041         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
10042                 return (ret_val);
10043         }
10044         /*
10045          * If new data are received on a connection after the user processes
10046          * are gone, then we may RST the other end depending on the outcome
10047          * of bbr_check_data_after_close.
10048          */
10049         if ((so->so_state & SS_NOFDREF) &&
10050             tlen) {
10051                 /*
10052                  * We call a new function now so we might continue and set up
10053                  * to reset once all data has been ack'd.
10054                  */
10055                 if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so))
10056                         return (1);
10057         }
10058         INP_WLOCK_ASSERT(tp->t_inpcb);
10059         /*
10060          * If last ACK falls within this segment's sequence numbers, record
10061          * its timestamp. NOTE: 1) That the test incorporates suggestions
10062          * from the latest proposal of the tcplw@cray.com list (Braden
10063          * 1993/04/26). 2) That updating only on newer timestamps interferes
10064          * with our earlier PAWS tests, so this check should be solely
10065          * predicated on the sequence space of this segment. 3) That we
10066          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
10067          * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
10068          * SEG.Len. This modified check allows us to overcome RFC1323's
10069          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
10070          * p.869. In such cases, we can still calculate the RTT correctly
10071          * when RCV.NXT == Last.ACK.Sent.
10072          */
10073         INP_WLOCK_ASSERT(tp->t_inpcb);
10074         if ((to->to_flags & TOF_TS) != 0 &&
10075             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
10076             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
10077             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
10078                 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
10079                 tp->ts_recent = to->to_tsval;
10080         }
10081         /*
10082          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
10083          * is on (half-synchronized state), then queue data for later
10084          * processing; else drop segment and return.
10085          */
10086         if ((thflags & TH_ACK) == 0) {
10087                 if (tp->t_flags & TF_NEEDSYN) {
10088                         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
10089                             tiwin, thflags, nxt_pkt));
10090                 } else if (tp->t_flags & TF_ACKNOW) {
10091                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
10092                         bbr->r_wanted_output = 1;
10093                         return (ret_val);
10094                 } else {
10095                         ctf_do_drop(m, NULL);
10096                         return (0);
10097                 }
10098         }
10099         /*
10100          * Ack processing.
10101          */
10102         INP_WLOCK_ASSERT(tp->t_inpcb);
10103         if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
10104                 return (ret_val);
10105         }
10106         if (sbavail(&so->so_snd)) {
10107                 if (bbr_progress_timeout_check(bbr)) {
10108                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
10109                         return (1);
10110                 }
10111         }
10112         INP_WLOCK_ASSERT(tp->t_inpcb);
10113         return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
10114             tiwin, thflags, nxt_pkt));
10115 }
10116
10117 static void
10118 bbr_stop_all_timers(struct tcpcb *tp)
10119 {
10120         struct tcp_bbr *bbr;
10121
10122         /*
10123          * Ensure no timers are running.
10124          */
10125         if (tcp_timer_active(tp, TT_PERSIST)) {
10126                 /* We are in persist; set the flag appropriately */
10127                 bbr = (struct tcp_bbr *)tp->t_fb_ptr;
10128                 bbr->rc_in_persist = 1;
10129         }
10130         tcp_timer_suspend(tp, TT_PERSIST);
10131         tcp_timer_suspend(tp, TT_REXMT);
10132         tcp_timer_suspend(tp, TT_KEEP);
10133         tcp_timer_suspend(tp, TT_DELACK);
10134 }
10135
10136 static void
10137 bbr_google_mode_on(struct tcp_bbr *bbr)
10138 {
10139         bbr->rc_use_google = 1;
10140         bbr->rc_no_pacing = 0;
10141         bbr->r_ctl.bbr_google_discount = bbr_google_discount;
10142         bbr->r_use_policer = bbr_policer_detection_enabled;
10143         bbr->r_ctl.rc_probertt_int = (USECS_IN_SECOND * 10);
10144         bbr->bbr_use_rack_cheat = 0;
10145         bbr->r_ctl.rc_incr_tmrs = 0;
10146         bbr->r_ctl.rc_inc_tcp_oh = 0;
10147         bbr->r_ctl.rc_inc_ip_oh = 0;
10148         bbr->r_ctl.rc_inc_enet_oh = 0;
10149         reset_time(&bbr->r_ctl.rc_delrate,
10150                    BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT);
10151         reset_time_small(&bbr->r_ctl.rc_rttprop,
10152                          (11 * USECS_IN_SECOND));
10153         tcp_bbr_tso_size_check(bbr, tcp_get_usecs(&bbr->rc_tv));
10154 }
10155
10156 static void
10157 bbr_google_mode_off(struct tcp_bbr *bbr)
10158 {
10159         bbr->rc_use_google = 0;
10160         bbr->r_ctl.bbr_google_discount = 0;
10161         bbr->no_pacing_until = bbr_no_pacing_until;
10162         bbr->r_use_policer = 0;
10163         if (bbr->no_pacing_until)
10164                 bbr->rc_no_pacing = 1;
10165         else
10166                 bbr->rc_no_pacing = 0;
10167         if (bbr_use_rack_resend_cheat)
10168                 bbr->bbr_use_rack_cheat = 1;
10169         else
10170                 bbr->bbr_use_rack_cheat = 0;
10171         if (bbr_incr_timers)
10172                 bbr->r_ctl.rc_incr_tmrs = 1;
10173         else
10174                 bbr->r_ctl.rc_incr_tmrs = 0;
10175         if (bbr_include_tcp_oh)
10176                 bbr->r_ctl.rc_inc_tcp_oh = 1;
10177         else
10178                 bbr->r_ctl.rc_inc_tcp_oh = 0;
10179         if (bbr_include_ip_oh)
10180                 bbr->r_ctl.rc_inc_ip_oh = 1;
10181         else
10182                 bbr->r_ctl.rc_inc_ip_oh = 0;
10183         if (bbr_include_enet_oh)
10184                 bbr->r_ctl.rc_inc_enet_oh = 1;
10185         else
10186                 bbr->r_ctl.rc_inc_enet_oh = 0;
10187         bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit;
10188         reset_time(&bbr->r_ctl.rc_delrate,
10189                    bbr_num_pktepo_for_del_limit);
10190         reset_time_small(&bbr->r_ctl.rc_rttprop,
10191                          (bbr_filter_len_sec * USECS_IN_SECOND));
10192         tcp_bbr_tso_size_check(bbr, tcp_get_usecs(&bbr->rc_tv));
10193 }
10194 /*
10195  * Return 0 on success, non-zero on failure
10196  * which indicates the error (usually no memory).
10197  */
10198 static int
10199 bbr_init(struct tcpcb *tp)
10200 {
10201         struct tcp_bbr *bbr = NULL;
10202         struct inpcb *inp;
10203         uint32_t cts;
10204
10205         tp->t_fb_ptr = uma_zalloc(bbr_pcb_zone, (M_NOWAIT | M_ZERO));
10206         if (tp->t_fb_ptr == NULL) {
10207                 /*
10208                  * We need to allocate memory but can't. We hold the INP and
10209                  * INP_INFO locks, and they are recursive (this happens during
10210                  * setup), so a scheme to drop the locks fails. :(
10211                  *
10212                  */
10213                 return (ENOMEM);
10214         }
10215         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
10216         bbr->rtt_valid = 0;
10217         inp = tp->t_inpcb;
10218         inp->inp_flags2 |= INP_CANNOT_DO_ECN;
10219         inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
10220         TAILQ_INIT(&bbr->r_ctl.rc_map);
10221         TAILQ_INIT(&bbr->r_ctl.rc_free);
10222         TAILQ_INIT(&bbr->r_ctl.rc_tmap);
10223         bbr->rc_tp = tp;
10224         if (tp->t_inpcb) {
10225                 bbr->rc_inp = tp->t_inpcb;
10226         }
10227         cts = tcp_get_usecs(&bbr->rc_tv);
10228         tp->t_acktime = 0;
10229         bbr->rc_allow_data_af_clo = bbr_ignore_data_after_close;
10230         bbr->r_ctl.rc_reorder_fade = bbr_reorder_fade;
10231         bbr->rc_tlp_threshold = bbr_tlp_thresh;
10232         bbr->r_ctl.rc_reorder_shift = bbr_reorder_thresh;
10233         bbr->r_ctl.rc_pkt_delay = bbr_pkt_delay;
10234         bbr->r_ctl.rc_min_to = bbr_min_to;
10235         bbr->rc_bbr_state = BBR_STATE_STARTUP;
10236         bbr->r_ctl.bbr_lost_at_state = 0;
10237         bbr->r_ctl.rc_lost_at_startup = 0;
10238         bbr->rc_all_timers_stopped = 0;
10239         bbr->r_ctl.rc_bbr_lastbtlbw = 0;
10240         bbr->r_ctl.rc_pkt_epoch_del = 0;
10241         bbr->r_ctl.rc_pkt_epoch = 0;
10242         bbr->r_ctl.rc_lowest_rtt = 0xffffffff;
10243         bbr->r_ctl.rc_bbr_hptsi_gain = bbr_high_gain;
10244         bbr->r_ctl.rc_bbr_cwnd_gain = bbr_high_gain;
10245         bbr->r_ctl.rc_went_idle_time = cts;
10246         bbr->rc_pacer_started = cts;
10247         bbr->r_ctl.rc_pkt_epoch_time = cts;
10248         bbr->r_ctl.rc_rcvtime = cts;
10249         bbr->r_ctl.rc_bbr_state_time = cts;
10250         bbr->r_ctl.rc_del_time = cts;
10251         bbr->r_ctl.rc_tlp_rxt_last_time = cts;
10252         bbr->r_ctl.last_in_probertt = cts;
10253         bbr->skip_gain = 0;
10254         bbr->gain_is_limited = 0;
10255         bbr->no_pacing_until = bbr_no_pacing_until;
10256         if (bbr->no_pacing_until)
10257                 bbr->rc_no_pacing = 1;
10258         if (bbr_use_google_algo) {
10259                 bbr->rc_no_pacing = 0;
10260                 bbr->rc_use_google = 1;
10261                 bbr->r_ctl.bbr_google_discount = bbr_google_discount;
10262                 bbr->r_use_policer = bbr_policer_detection_enabled;
10263         } else {
10264                 bbr->rc_use_google = 0;
10265                 bbr->r_ctl.bbr_google_discount = 0;
10266                 bbr->r_use_policer = 0;
10267         }
10268         if (bbr_ts_limiting)
10269                 bbr->rc_use_ts_limit = 1;
10270         else
10271                 bbr->rc_use_ts_limit = 0;
10272         if (bbr_ts_can_raise)
10273                 bbr->ts_can_raise = 1;
10274         else
10275                 bbr->ts_can_raise = 0;
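        /*
         * A short note on the mapping below (descriptive only): a
         * delayed-ACK setting of 1 selects the stock every-other-segment
         * behavior (2), 0 disables delayed ACKs, values below 100 are
         * taken literally as the threshold, and anything else falls
         * back to 2.
         */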
10276         if (V_tcp_delack_enabled == 1)
10277                 tp->t_delayed_ack = 2;
10278         else if (V_tcp_delack_enabled == 0)
10279                 tp->t_delayed_ack = 0;
10280         else if (V_tcp_delack_enabled < 100)
10281                 tp->t_delayed_ack = V_tcp_delack_enabled;
10282         else
10283                 tp->t_delayed_ack = 2;
10284         if (bbr->rc_use_google == 0)
10285                 bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit;
10286         else
10287                 bbr->r_ctl.rc_probertt_int = (USECS_IN_SECOND * 10);
10288         bbr->r_ctl.rc_min_rto_ms = bbr_rto_min_ms;
10289         bbr->rc_max_rto_sec = bbr_rto_max_sec;
10290         bbr->rc_init_win = bbr_def_init_win;
10291         if (tp->t_flags & TF_REQ_TSTMP)
10292                 bbr->rc_last_options = TCP_TS_OVERHEAD;
10293         bbr->r_ctl.rc_pace_max_segs = tp->t_maxseg - bbr->rc_last_options;
10294         bbr->r_ctl.rc_high_rwnd = tp->snd_wnd;
10295         bbr->r_init_rtt = 1;
10296
10297         counter_u64_add(bbr_flows_nohdwr_pacing, 1);
10298         if (bbr_allow_hdwr_pacing)
10299                 bbr->bbr_hdw_pace_ena = 1;
10300         else
10301                 bbr->bbr_hdw_pace_ena = 0;
10302         if (bbr_sends_full_iwnd)
10303                 bbr->bbr_init_win_cheat = 1;
10304         else
10305                 bbr->bbr_init_win_cheat = 0;
10306         bbr->r_ctl.bbr_utter_max = bbr_hptsi_utter_max;
10307         bbr->r_ctl.rc_drain_pg = bbr_drain_gain;
10308         bbr->r_ctl.rc_startup_pg = bbr_high_gain;
10309         bbr->rc_loss_exit = bbr_exit_startup_at_loss;
10310         bbr->r_ctl.bbr_rttprobe_gain_val = bbr_rttprobe_gain;
10311         bbr->r_ctl.bbr_hptsi_per_second = bbr_hptsi_per_second;
10312         bbr->r_ctl.bbr_hptsi_segments_delay_tar = bbr_hptsi_segments_delay_tar;
10313         bbr->r_ctl.bbr_hptsi_segments_max = bbr_hptsi_segments_max;
10314         bbr->r_ctl.bbr_hptsi_segments_floor = bbr_hptsi_segments_floor;
10315         bbr->r_ctl.bbr_hptsi_bytes_min = bbr_hptsi_bytes_min;
10316         bbr->r_ctl.bbr_cross_over = bbr_cross_over;
10317         bbr->r_ctl.rc_rtt_shrinks = cts;
10318         if (bbr->rc_use_google) {
10319                 setup_time_filter(&bbr->r_ctl.rc_delrate,
10320                                   FILTER_TYPE_MAX,
10321                                   BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT);
10322                 setup_time_filter_small(&bbr->r_ctl.rc_rttprop,
10323                                         FILTER_TYPE_MIN, (11 * USECS_IN_SECOND));
10324         } else {
10325                 setup_time_filter(&bbr->r_ctl.rc_delrate,
10326                                   FILTER_TYPE_MAX,
10327                                   bbr_num_pktepo_for_del_limit);
10328                 setup_time_filter_small(&bbr->r_ctl.rc_rttprop,
10329                                         FILTER_TYPE_MIN, (bbr_filter_len_sec * USECS_IN_SECOND));
10330         }
10331         bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_INIT, 0);
10332         if (bbr_uses_idle_restart)
10333                 bbr->rc_use_idle_restart = 1;
10334         else
10335                 bbr->rc_use_idle_restart = 0;
10336         bbr->r_ctl.rc_bbr_cur_del_rate = 0;
10337         bbr->r_ctl.rc_initial_hptsi_bw = bbr_initial_bw_bps;
10338         if (bbr_resends_use_tso)
10339                 bbr->rc_resends_use_tso = 1;
10340 #ifdef NETFLIX_PEAKRATE
10341         tp->t_peakrate_thr = tp->t_maxpeakrate;
10342 #endif
10343         if (tp->snd_una != tp->snd_max) {
10344                 /* Create a send map for the current outstanding data */
10345                 struct bbr_sendmap *rsm;
10346
10347                 rsm = bbr_alloc(bbr);
10348                 if (rsm == NULL) {
10349                         uma_zfree(bbr_pcb_zone, tp->t_fb_ptr);
10350                         tp->t_fb_ptr = NULL;
10351                         return (ENOMEM);
10352                 }
10353                 rsm->r_flags = BBR_OVERMAX;
10354                 rsm->r_tim_lastsent[0] = cts;
10355                 rsm->r_rtr_cnt = 1;
10356                 rsm->r_rtr_bytes = 0;
10357                 rsm->r_start = tp->snd_una;
10358                 rsm->r_end = tp->snd_max;
10359                 rsm->r_dupack = 0;
10360                 rsm->r_delivered = bbr->r_ctl.rc_delivered;
10361                 rsm->r_ts_valid = 0;
10362                 rsm->r_del_ack_ts = tp->ts_recent;
10363                 rsm->r_del_time = cts;
10364                 if (bbr->r_ctl.r_app_limited_until)
10365                         rsm->r_app_limited = 1;
10366                 else
10367                         rsm->r_app_limited = 0;
10368                 TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_map, rsm, r_next);
10369                 TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
10370                 rsm->r_in_tmap = 1;
10371                 if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW)
10372                         rsm->r_bbr_state = bbr_state_val(bbr);
10373                 else
10374                         rsm->r_bbr_state = 8;
10375         }
10376         if (bbr_use_rack_resend_cheat && (bbr->rc_use_google == 0))
10377                 bbr->bbr_use_rack_cheat = 1;
10378         if (bbr_incr_timers && (bbr->rc_use_google == 0))
10379                 bbr->r_ctl.rc_incr_tmrs = 1;
10380         if (bbr_include_tcp_oh && (bbr->rc_use_google == 0))
10381                 bbr->r_ctl.rc_inc_tcp_oh = 1;
10382         if (bbr_include_ip_oh && (bbr->rc_use_google == 0))
10383                 bbr->r_ctl.rc_inc_ip_oh = 1;
10384         if (bbr_include_enet_oh && (bbr->rc_use_google == 0))
10385                 bbr->r_ctl.rc_inc_enet_oh = 1;
10386
10387         bbr_log_type_statechange(bbr, cts, __LINE__);
10388         if (TCPS_HAVEESTABLISHED(tp->t_state) &&
10389             (tp->t_srtt)) {
10390                 uint32_t rtt;
10391
10392                 rtt = (TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
10393                 apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
10394         }
10395         /* announce the settings and state */
10396         bbr_log_settings_change(bbr, BBR_RECOVERY_LOWRTT);
10397         tcp_bbr_tso_size_check(bbr, cts);
10398         /*
10399          * Now call the generic function to start a timer. This will place
10400          * the TCB on the hptsi wheel if a timer is needed with appropriate
10401          * flags.
10402          */
10403         bbr_stop_all_timers(tp);
10404         bbr_start_hpts_timer(bbr, tp, cts, 5, 0, 0);
10405         return (0);
10406 }
10407
10408 /*
10409  * Return 0 if we can accept the connection. Return
10410  * non-zero if we can't handle the connection. An EAGAIN
10411  * means you need to wait until the connection is up.
10412  * An EINVAL means we can never handle the connection
10413  * (no SACK).
10414  */
10415 static int
10416 bbr_handoff_ok(struct tcpcb *tp)
10417 {
10418         if ((tp->t_state == TCPS_CLOSED) ||
10419             (tp->t_state == TCPS_LISTEN)) {
10420                 /* Sure no problem though it may not stick */
10421                 return (0);
10422         }
10423         if ((tp->t_state == TCPS_SYN_SENT) ||
10424             (tp->t_state == TCPS_SYN_RECEIVED)) {
10425                 /*
10426                  * We really don't know; you have to get to ESTAB or beyond
10427                  * to tell.
10428                  */
10429                 return (EAGAIN);
10430         }
10431         if ((tp->t_flags & TF_SACK_PERMIT) || bbr_sack_not_required) {
10432                 return (0);
10433         }
10434         /*
10435          * If we reach here we don't do SACK on this connection, so we
10436          * can never do bbr.
10437          */
10438         return (EINVAL);
10439 }
10440
10441 static void
10442 bbr_fini(struct tcpcb *tp, int32_t tcb_is_purged)
10443 {
10444         if (tp->t_fb_ptr) {
10445                 uint32_t calc;
10446                 struct tcp_bbr *bbr;
10447                 struct bbr_sendmap *rsm;
10448
10449                 bbr = (struct tcp_bbr *)tp->t_fb_ptr;
10450                 if (bbr->r_ctl.crte)
10451                         tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp);
10452                 bbr_log_flowend(bbr);
10453                 bbr->rc_tp = NULL;
10454                 if (tp->t_inpcb) {
10455                         /* Back out any flags2 we applied */
10456                         tp->t_inpcb->inp_flags2 &= ~INP_CANNOT_DO_ECN;
10457                         tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
10458                         tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
10459                 }
10460                 if (bbr->bbr_hdrw_pacing)
10461                         counter_u64_add(bbr_flows_whdwr_pacing, -1);
10462                 else
10463                         counter_u64_add(bbr_flows_nohdwr_pacing, -1);
10464                 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
10465                 while (rsm) {
10466                         TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next);
10467                         uma_zfree(bbr_zone, rsm);
10468                         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
10469                 }
10470                 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free);
10471                 while (rsm) {
10472                         TAILQ_REMOVE(&bbr->r_ctl.rc_free, rsm, r_next);
10473                         uma_zfree(bbr_zone, rsm);
10474                         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free);
10475                 }
10476                 calc = bbr->r_ctl.rc_high_rwnd - bbr->r_ctl.rc_init_rwnd;
10477                 if (calc > (bbr->r_ctl.rc_init_rwnd / 10))
10478                         BBR_STAT_INC(bbr_dynamic_rwnd);
10479                 else
10480                         BBR_STAT_INC(bbr_static_rwnd);
10481                 bbr->r_ctl.rc_free_cnt = 0;
10482                 uma_zfree(bbr_pcb_zone, tp->t_fb_ptr);
10483                 tp->t_fb_ptr = NULL;
10484         }
10485         /* Make sure snd_nxt is correctly set */
10486         tp->snd_nxt = tp->snd_max;
10487 }
10488
10489 static void
10490 bbr_set_state(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t win)
10491 {
10492         switch (tp->t_state) {
10493         case TCPS_SYN_SENT:
10494                 bbr->r_state = TCPS_SYN_SENT;
10495                 bbr->r_substate = bbr_do_syn_sent;
10496                 break;
10497         case TCPS_SYN_RECEIVED:
10498                 bbr->r_state = TCPS_SYN_RECEIVED;
10499                 bbr->r_substate = bbr_do_syn_recv;
10500                 break;
10501         case TCPS_ESTABLISHED:
10502                 bbr->r_ctl.rc_init_rwnd = max(win, bbr->rc_tp->snd_wnd);
10503                 bbr->r_state = TCPS_ESTABLISHED;
10504                 bbr->r_substate = bbr_do_established;
10505                 break;
10506         case TCPS_CLOSE_WAIT:
10507                 bbr->r_state = TCPS_CLOSE_WAIT;
10508                 bbr->r_substate = bbr_do_close_wait;
10509                 break;
10510         case TCPS_FIN_WAIT_1:
10511                 bbr->r_state = TCPS_FIN_WAIT_1;
10512                 bbr->r_substate = bbr_do_fin_wait_1;
10513                 break;
10514         case TCPS_CLOSING:
10515                 bbr->r_state = TCPS_CLOSING;
10516                 bbr->r_substate = bbr_do_closing;
10517                 break;
10518         case TCPS_LAST_ACK:
10519                 bbr->r_state = TCPS_LAST_ACK;
10520                 bbr->r_substate = bbr_do_lastack;
10521                 break;
10522         case TCPS_FIN_WAIT_2:
10523                 bbr->r_state = TCPS_FIN_WAIT_2;
10524                 bbr->r_substate = bbr_do_fin_wait_2;
10525                 break;
10526         case TCPS_LISTEN:
10527         case TCPS_CLOSED:
10528         case TCPS_TIME_WAIT:
10529         default:
10530                 break;
10531         };
10532 }
10533
10534 static void
10535 bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int32_t line, int dolog)
10536 {
10537         /*
10538          * Now what state are we going into? Are there adjustments
10539          * needed?
10540          */
10541         int32_t old_state, old_gain;
10542
10543
10544         old_state = bbr_state_val(bbr);
10545         old_gain = bbr->r_ctl.rc_bbr_hptsi_gain;
10546         if (bbr_state_val(bbr) == BBR_SUB_LEVEL1) {
10547                 /* Save the lowest srtt we saw in our end of the sub-state */
10548                 bbr->rc_hit_state_1 = 0;
10549                 if (bbr->r_ctl.bbr_smallest_srtt_this_state != 0xffffffff)
10550                         bbr->r_ctl.bbr_smallest_srtt_state2 = bbr->r_ctl.bbr_smallest_srtt_this_state;
10551         }
10552         bbr->rc_bbr_substate++;
10553         if (bbr->rc_bbr_substate >= BBR_SUBSTATE_COUNT) {
10554                 /* Cycle back to the first state (gain) */
10555                 bbr->rc_bbr_substate = 0;
10556         }
10557         if (bbr_state_val(bbr) == BBR_SUB_GAIN) {
10558                 /*
10559                  * We enter the gain (5/4) cycle (possibly less if
10560                  * shallow-buffer detection is enabled).
10561                  */
10562                 if (bbr->skip_gain) {
10563                         /*
10564                          * Hardware pacing has set our rate to
10565                          * the max and limited our b/w; just
10566                          * do level, i.e. no gain.
10567                          */
10568                         bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_LEVEL1];
10569                 } else if (bbr->gain_is_limited &&
10570                            bbr->bbr_hdrw_pacing &&
10571                            bbr->r_ctl.crte) {
10572                          * We can't gain above the hardware pacing
10573                          * rate, which is less than our rate + the gain.
10574                          * Calculate the gain needed to reach the hardware
10575                          * pacing rate.
10576                          * pacing rate..
10577                          */
10578                         uint64_t bw, rate, gain_calc;
10579
10580                         bw = bbr_get_bw(bbr);
10581                         rate = bbr->r_ctl.crte->rate;
10582                         if ((rate > bw) &&
10583                             (((bw *  (uint64_t)bbr_hptsi_gain[BBR_SUB_GAIN]) / (uint64_t)BBR_UNIT) > rate)) {
10584                                 gain_calc = (rate * BBR_UNIT) / bw;
10585                                 if (gain_calc < BBR_UNIT)
10586                                         gain_calc = BBR_UNIT;
10587                                 bbr->r_ctl.rc_bbr_hptsi_gain = (uint16_t)gain_calc;
10588                         } else {
10589                                 bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_GAIN];
10590                         }
10591                 } else
10592                         bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_GAIN];
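                /*
                 * Illustrative arithmetic for the capped-gain path above,
                 * assuming BBR_UNIT is 256 and a stock gain of 320 (5/4):
                 * with an estimated b/w of 80 Mbps and a hardware rate of
                 * 90 Mbps, 80 * 320 / 256 = 100 Mbps would overshoot the
                 * hardware rate, so gain_calc = (90 * 256) / 80 = 288,
                 * i.e. a 1.125x gain that lands exactly on the rate.
                 */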
10593                 if ((bbr->rc_use_google == 0) && (bbr_gain_to_target == 0)) {
10594                         bbr->r_ctl.rc_bbr_state_atflight = cts;
10595                 } else
10596                         bbr->r_ctl.rc_bbr_state_atflight = 0;
10597         } else if (bbr_state_val(bbr) == BBR_SUB_DRAIN) {
10598                 bbr->rc_hit_state_1 = 1;
10599                 bbr->r_ctl.rc_exta_time_gd = 0;
10600                 bbr->r_ctl.flightsize_at_drain = ctf_flight_size(bbr->rc_tp,
10601                                                      (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
10602                 if (bbr_state_drain_2_tar) {
10603                         bbr->r_ctl.rc_bbr_state_atflight = 0;
10604                 } else
10605                         bbr->r_ctl.rc_bbr_state_atflight = cts;
10606                 bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_DRAIN];
10607         } else {
10608                 /* All other cycles hit here 2-7 */
10609                 if ((old_state == BBR_SUB_DRAIN) && bbr->rc_hit_state_1) {
10610                         if (bbr_sub_drain_slam_cwnd &&
10611                             (bbr->rc_use_google == 0) &&
10612                             (bbr->rc_tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) {
10613                                 bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd;
10614                                 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
10615                         }
10616                         if ((cts - bbr->r_ctl.rc_bbr_state_time) > bbr_get_rtt(bbr, BBR_RTT_PROP))
10617                                 bbr->r_ctl.rc_exta_time_gd += ((cts - bbr->r_ctl.rc_bbr_state_time) -
10618                                                                bbr_get_rtt(bbr, BBR_RTT_PROP));
10619                         else
10620                                 bbr->r_ctl.rc_exta_time_gd = 0;
10621                         if (bbr->r_ctl.rc_exta_time_gd) {
10622                                 bbr->r_ctl.rc_level_state_extra = bbr->r_ctl.rc_exta_time_gd;
10623                                 /* Now chop up the time for each state (div by 7) */
10624                                 bbr->r_ctl.rc_level_state_extra /= 7;
10625                                 if (bbr_rand_ot && bbr->r_ctl.rc_level_state_extra) {
10626                                         /* Add a randomization */
10627                                         bbr_randomize_extra_state_time(bbr);
10628                                 }
10629                         }
10630                 }
10631                 bbr->r_ctl.rc_bbr_state_atflight = max(1, cts);
10632                 bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[bbr_state_val(bbr)];
10633         }
10634         if (bbr->rc_use_google) {
10635                 bbr->r_ctl.rc_bbr_state_atflight = max(1, cts);
10636         }
10637         bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
10638         bbr->r_ctl.rc_bbr_cwnd_gain = bbr_cwnd_gain;
10639         if (dolog)
10640                 bbr_log_type_statechange(bbr, cts, line);
10641
10642         if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
10643                 uint32_t time_in;
10644
10645                 time_in = cts - bbr->r_ctl.rc_bbr_state_time;
10646                 if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) {
10647                         counter_u64_add(bbr_state_time[(old_state + 5)], time_in);
10648                 } else {
10649                         counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
10650                 }
10651         }
10652         bbr->r_ctl.bbr_smallest_srtt_this_state = 0xffffffff;
10653         bbr_set_state_target(bbr, __LINE__);
10654         if (bbr_sub_drain_slam_cwnd &&
10655             (bbr->rc_use_google == 0) &&
10656             (bbr_state_val(bbr) == BBR_SUB_DRAIN)) {
10657                 /* Slam down the cwnd */
10658                 bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd;
10659                 bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
10660                 if (bbr_sub_drain_app_limit) {
10661                         /* Go app limited if we are on a long drain */
10662                         bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.rc_delivered +
10663                                                           ctf_flight_size(bbr->rc_tp,
10664                                                               (bbr->r_ctl.rc_sacked +
10665                                                                bbr->r_ctl.rc_lost_bytes)));
10666                 }
10667                 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
10668         }
10669         if (bbr->rc_lt_use_bw) {
10670                 /* In policed mode we clamp pacing_gain to BBR_UNIT */
10671                 bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
10672         }
10673         /* Google changes TSO size every cycle */
10674         if (bbr->rc_use_google)
10675                 tcp_bbr_tso_size_check(bbr, cts);
10676         bbr->r_ctl.gain_epoch = cts;
10677         bbr->r_ctl.rc_bbr_state_time = cts;
10678         bbr->r_ctl.substate_pe = bbr->r_ctl.rc_pkt_epoch;
10679 }
10680
10681 static void
10682 bbr_set_probebw_google_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses)
10683 {
10684         if ((bbr_state_val(bbr) == BBR_SUB_DRAIN) &&
10685             (google_allow_early_out == 1) &&
10686             (bbr->r_ctl.rc_flight_at_input <= bbr->r_ctl.rc_target_at_state)) {
10687                 /* We have reached our target flight size, possibly early */
10688                 goto change_state;
10689         }
10690         if (TSTMP_LT(cts, bbr->r_ctl.rc_bbr_state_time)) {
10691                 return;
10692         }
10693         if ((cts - bbr->r_ctl.rc_bbr_state_time) < bbr_get_rtt(bbr, BBR_RTT_PROP)) {
10694                 /*
10695                  * At least a rttProp of time must pass before
10696                  * we can change states.
10697                  */
10698                 return;
10699         }
10700         if (bbr_state_val(bbr) == BBR_SUB_GAIN) {
10701                 /*
10702                  * The needed time has passed, but for
10703                  * the gain cycle extra rules apply:
10704                  * 1) If we have seen loss, we exit
10705                  * 2) If we have not reached the target
10706                  *    we stay in GAIN (gain-to-target).
10707                  */
10708                 if (google_consider_lost && losses)
10709                         goto change_state;
10710                 if (bbr->r_ctl.rc_target_at_state > bbr->r_ctl.rc_flight_at_input) {
10711                         return;
10712                 }
10713         }
10714 change_state:
10715         /* For gain we must reach our target; all others last 1 rttProp */
10716         bbr_substate_change(bbr, cts, __LINE__, 1);
10717 }
10718
10719 static void
10720 bbr_set_probebw_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses)
10721 {
10722         uint32_t flight, bbr_cur_cycle_time;
10723
10724         if (bbr->rc_use_google) {
10725                 bbr_set_probebw_google_gains(bbr, cts, losses);
10726                 return;
10727         }
10728         if (cts == 0) {
10729                 /*
10730                  * Never allow cts to be 0; we
10731                  * do this so we can judge if
10732                  * we have set a timestamp.
10733                  */
10734                 cts = 1;
10735         }
10736         if (bbr_state_is_pkt_epoch)
10737                 bbr_cur_cycle_time = bbr_get_rtt(bbr, BBR_RTT_PKTRTT);
10738         else
10739                 bbr_cur_cycle_time = bbr_get_rtt(bbr, BBR_RTT_PROP);
10740
10741         if (bbr->r_ctl.rc_bbr_state_atflight == 0) {
10742                 if (bbr_state_val(bbr) == BBR_SUB_DRAIN) {
10743                         flight = ctf_flight_size(bbr->rc_tp,
10744                                      (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
10745                         if (bbr_sub_drain_slam_cwnd && bbr->rc_hit_state_1) {
10746                                 /* Keep it slammed down */
10747                                 if (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state) {
10748                                         bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
10749                                         bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
10750                                 }
10751                                 if (bbr_sub_drain_app_limit) {
10752                                         /* Go app limited if we are on a long drain */
10753                                         bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.rc_delivered + flight);
10754                                 }
10755                         }
10756                         if (TSTMP_GT(cts, bbr->r_ctl.gain_epoch) &&
10757                             (((cts - bbr->r_ctl.gain_epoch) > bbr_get_rtt(bbr, BBR_RTT_PROP)) ||
10758                              (flight >= bbr->r_ctl.flightsize_at_drain))) {
10759                                 /*
10760                                  * Still here after the same time as
10761                                  * the gain. We need to drain harder
10762                                  * for the next srtt. Reduce by a set amount;
10763                                  * the gain drop is capped at the drain floor
10764                                  * value (88).
10765                                  */
10766                                 bbr->r_ctl.flightsize_at_drain = flight;
10767                                 if (bbr_drain_drop_mul &&
10768                                     bbr_drain_drop_div &&
10769                                     (bbr_drain_drop_mul < bbr_drain_drop_div)) {
10770                                         /* Use your specific drop value (def 4/5 = 20%) */
10771                                         bbr->r_ctl.rc_bbr_hptsi_gain *= bbr_drain_drop_mul;
10772                                         bbr->r_ctl.rc_bbr_hptsi_gain /= bbr_drain_drop_div;
10773                                 } else {
10774                                         /* You get a drop of 20% */
10775                                         bbr->r_ctl.rc_bbr_hptsi_gain *= 4;
10776                                         bbr->r_ctl.rc_bbr_hptsi_gain /= 5;
10777                                 }
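                                /*
                                 * Illustrative example, assuming BBR_UNIT
                                 * is 256: a drain gain of 192 (0.75x) cut
                                 * by the default 4/5 becomes 192 * 4 / 5 =
                                 * 153 (about 0.6x); repeated cuts bottom
                                 * out at the drain floor checked below.
                                 */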
10778                                 if (bbr->r_ctl.rc_bbr_hptsi_gain <= bbr_drain_floor) {
10779                                         /* Reduce our gain again to the bottom  */
10780                                         bbr->r_ctl.rc_bbr_hptsi_gain = max(bbr_drain_floor, 1);
10781                                 }
10782                                 bbr_log_exit_gain(bbr, cts, 4);
10783                                 /*
10784                                  * Extend out so we wait another
10785                                  * epoch before dropping again.
10786                                  */
10787                                 bbr->r_ctl.gain_epoch = cts;
10788                         }
10789                         if (flight <= bbr->r_ctl.rc_target_at_state) {
10790                                 if (bbr_sub_drain_slam_cwnd &&
10791                                     (bbr->rc_use_google == 0) &&
10792                                     (bbr->rc_tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) {
10793                                         bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd;
10794                                         bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
10795                                 }
10796                                 bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1);
10797                                 bbr_log_exit_gain(bbr, cts, 3);
10798                         }
10799                 } else {
10800                         /* It's a gain */
10801                         if (bbr->r_ctl.rc_lost > bbr->r_ctl.bbr_lost_at_state) {
10802                                 bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1);
10803                                 goto change_state;
10804                         }
10805                         if ((ctf_outstanding(bbr->rc_tp) >= bbr->r_ctl.rc_target_at_state) ||
10806                             ((ctf_outstanding(bbr->rc_tp) +  bbr->rc_tp->t_maxseg - 1) >=
10807                              bbr->rc_tp->snd_wnd)) {
10808                                 bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1);
10809                                 bbr_log_exit_gain(bbr, cts, 2);
10810                         }
10811                 }
10812                 /**
10813                  * We fall through and return; always one of two things has
10814                  * occurred.
10815                  * 1) We are still not at target
10816                  *    <or>
10817                  * 2) We reached the target and set rc_bbr_state_atflight
10818                  *    which means we no longer hit this block
10819                  *    next time we are called.
10820                  */
10821                 return;
10822         }
10823 change_state:
10824         if (TSTMP_LT(cts, bbr->r_ctl.rc_bbr_state_time))
10825                 return;
10826         if ((cts - bbr->r_ctl.rc_bbr_state_time) < bbr_cur_cycle_time) {
10827                 /* Less than a full time-period has passed */
10828                 return;
10829         }
10830         if (bbr->r_ctl.rc_level_state_extra &&
10831             (bbr_state_val(bbr) > BBR_SUB_DRAIN) &&
10832             ((cts - bbr->r_ctl.rc_bbr_state_time) <
10833              (bbr_cur_cycle_time + bbr->r_ctl.rc_level_state_extra))) {
10834                 /* Less than a full time-period + extra has passed */
10835                 return;
10836         }
10837         if (bbr_gain_gets_extra_too &&
10838             bbr->r_ctl.rc_level_state_extra &&
10839             (bbr_state_val(bbr) == BBR_SUB_GAIN) &&
10840             ((cts - bbr->r_ctl.rc_bbr_state_time) <
10841              (bbr_cur_cycle_time + bbr->r_ctl.rc_level_state_extra))) {
10842                 /* Less than a full time-period + extra has passed */
10843                 return;
10844         }
10845         bbr_substate_change(bbr, cts, __LINE__, 1);
10846 }
10847
10848 static uint32_t
10849 bbr_get_a_state_target(struct tcp_bbr *bbr, uint32_t gain)
10850 {
10851         uint32_t mss, tar;
10852
10853         if (bbr->rc_use_google) {
10854                 /* Google just uses the cwnd target */
10855                 tar = bbr_get_target_cwnd(bbr, bbr_get_bw(bbr), gain);
10856         } else {
10857                 mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options),
10858                           bbr->r_ctl.rc_pace_max_segs);
10859                 /* Get the base cwnd with gain rounded to a mss */
10860                 tar = roundup(bbr_get_raw_target_cwnd(bbr, bbr_get_bw(bbr),
10861                                                       gain), mss);
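                /*
                 * Illustrative numbers: a raw gain-scaled target of
                 * 100,000 bytes with an effective mss of 1448 rounds
                 * up to 101,360 bytes (70 segments), keeping the
                 * target a whole number of packets.
                 */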
10862                 /* Make sure it is within our min */
10863                 if (tar < get_min_cwnd(bbr))
10864                         return (get_min_cwnd(bbr));
10865         }
10866         return (tar);
10867 }
10868
10869 static void
10870 bbr_set_state_target(struct tcp_bbr *bbr, int line)
10871 {
10872         uint32_t tar, meth;
10873
10874         if ((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) &&
10875             ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google)) {
10876                 /* Special case using old probe-rtt method */
10877                 tar = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options);
10878                 meth = 1;
10879         } else {
10880                 /* Non-probe-rtt case and reduced probe-rtt  */
10881                 if ((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) &&
10882                     (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT)) {
10883                         /* For gain cycle we use the hptsi gain */
10884                         tar = bbr_get_a_state_target(bbr, bbr->r_ctl.rc_bbr_hptsi_gain);
10885                         meth = 2;
10886                 } else if ((bbr_target_is_bbunit) || bbr->rc_use_google) {
10887                         /*
10888                          * If configured, or for google all other states
10889                          * get BBR_UNIT.
10890                          */
10891                         tar = bbr_get_a_state_target(bbr, BBR_UNIT);
10892                         meth = 3;
10893                 } else {
10894                         /*
10895                          * Or we set a target based on the pacing gain
10896                          * for non-google mode and default (non-configured).
10897                          * Note we don't set a target goal below drain (192).
10898                          */
10899                         if (bbr->r_ctl.rc_bbr_hptsi_gain < bbr_hptsi_gain[BBR_SUB_DRAIN])  {
10900                                 tar = bbr_get_a_state_target(bbr, bbr_hptsi_gain[BBR_SUB_DRAIN]);
10901                                 meth = 4;
10902                         } else {
10903                                 tar = bbr_get_a_state_target(bbr, bbr->r_ctl.rc_bbr_hptsi_gain);
10904                                 meth = 5;
10905                         }
10906                 }
10907         }
10908         bbr_log_set_of_state_target(bbr, tar, line, meth);
10909         bbr->r_ctl.rc_target_at_state = tar;
10910 }
10911
10912 static void
10913 bbr_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
10914 {
10915         /* Change to probe_rtt */
10916         uint32_t time_in;
10917
10918         bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
10919         bbr->r_ctl.flightsize_at_drain = ctf_flight_size(bbr->rc_tp,
10920                                              (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
10921         bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.flightsize_at_drain
10922                                           + bbr->r_ctl.rc_delivered);
10923         /* Set up so we force-feed the filter */
10924         if (bbr->rc_use_google || bbr_probertt_sets_rtt)
10925                 bbr->rc_prtt_set_ts = 1;
10926         if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
10927                 time_in = cts - bbr->r_ctl.rc_bbr_state_time;
10928                 counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
10929         }
10930         bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_ENTERPROBE, 0);
10931         bbr->r_ctl.rc_rtt_shrinks = cts;
10932         bbr->r_ctl.last_in_probertt = cts;
10933         bbr->r_ctl.rc_probertt_srttchktim = cts;
10934         bbr->r_ctl.rc_bbr_state_time = cts;
10935         bbr->rc_bbr_state = BBR_STATE_PROBE_RTT;
10936         /* We need to force the filter to update */
10937
10938         if ((bbr_sub_drain_slam_cwnd) &&
10939             bbr->rc_hit_state_1 &&
10940             (bbr->rc_use_google == 0) &&
10941             (bbr_state_val(bbr) == BBR_SUB_DRAIN)) {
10942                 if (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_saved_cwnd)
10943                         bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd;
10944         } else
10945                 bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd;
10946         /* Update the lost */
10947         bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
10948         if ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google){
10949                 /* Set to the non-configurable default of 4 (PROBE_RTT_MIN)  */
10950                 bbr->rc_tp->snd_cwnd = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options);
10951                 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
10952                 bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
10953                 bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT;
10954                 bbr_log_set_of_state_target(bbr, bbr->rc_tp->snd_cwnd, __LINE__, 6);
10955                 bbr->r_ctl.rc_target_at_state = bbr->rc_tp->snd_cwnd;
10956         } else {
10957                 /*
10958                  * We bring it down slowly by using a hptsi gain that is
10959                  * probably 75%. This will slowly float our outstanding data
10960                  * down without tampering with the cwnd.
10961                  */
10962                 bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.bbr_rttprobe_gain_val;
10963                 bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT;
10964                 bbr_set_state_target(bbr, __LINE__);
10965                 if (bbr_prtt_slam_cwnd &&
10966                     (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) {
10967                         bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
10968                         bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
10969                 }
10970         }
10971         if (ctf_flight_size(bbr->rc_tp,
10972                 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <=
10973             bbr->r_ctl.rc_target_at_state) {
10974                 /* We are at target */
10975                 bbr->r_ctl.rc_bbr_enters_probertt = cts;
10976         } else {
10977                 /* We need to come down to reach target before our time begins */
10978                 bbr->r_ctl.rc_bbr_enters_probertt = 0;
10979         }
10980         bbr->r_ctl.rc_pe_of_prtt = bbr->r_ctl.rc_pkt_epoch;
10981         BBR_STAT_INC(bbr_enter_probertt);
10982         bbr_log_exit_gain(bbr, cts, 0);
10983         bbr_log_type_statechange(bbr, cts, line);
10984 }
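
/*
 * Rough arithmetic for the default cwnd slam above (t_maxseg and
 * rc_last_options values are assumed for illustration): with the
 * default target of 4 segments, t_maxseg = 1460 and rc_last_options =
 * 12, snd_cwnd becomes 4 * (1460 - 12) = 5792 bytes, i.e. four
 * option-adjusted segments allowed in flight during probe-rtt.
 */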
10985
10986 static void
10987 bbr_check_probe_rtt_limits(struct tcp_bbr *bbr, uint32_t cts)
10988 {
10989         /*
10990          * Sanity check on probe-rtt intervals.
10991          * In crazy situations where we are competing
10992          * against new-reno flows with huge buffers,
10993          * our rtt-prop interval could come to dominate
10994          * things if we can't get through a full set
10995          * of cycles; when that happens we need to adjust it.
10996          */
10997         if (bbr_can_adjust_probertt &&
10998             (bbr->rc_use_google == 0)) {
10999                 uint16_t val = 0;
11000                 uint32_t cur_rttp, fval, newval, baseval;
11001
11002                 /* Are we too small and going into probe-rtt too often? */
11003                 baseval = (bbr_get_rtt(bbr, BBR_RTT_PROP) * (BBR_SUBSTATE_COUNT + 1));
11004                 cur_rttp = roundup(baseval, USECS_IN_SECOND);
11005                 fval = bbr_filter_len_sec * USECS_IN_SECOND;
11006                 if (bbr_is_ratio == 0) {
11007                         if (fval > bbr_rtt_probe_limit)
11008                                 newval = cur_rttp + (fval - bbr_rtt_probe_limit);
11009                         else
11010                                 newval = cur_rttp;
11011                 } else {
11012                         int mul;
11013
11014                         mul = fval / bbr_rtt_probe_limit;
11015                         newval = cur_rttp * mul;
11016                 }
11017                 if (cur_rttp >  bbr->r_ctl.rc_probertt_int) {
11018                         bbr->r_ctl.rc_probertt_int = cur_rttp;
11019                         reset_time_small(&bbr->r_ctl.rc_rttprop, newval);
11020                         val = 1;
11021                 } else {
11022                         /*
11023                          * No adjustments were made;
11024                          * do we need to shrink it?
11025                          */
11026                         if (bbr->r_ctl.rc_probertt_int > bbr_rtt_probe_limit) {
11027                                 if (cur_rttp <= bbr_rtt_probe_limit) {
11028                                         /*
11029                                          * Things have calmed down, let's
11030                                          * shrink all the way back to the default.
11031                                          */
11032                                         bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit;
11033                                         reset_time_small(&bbr->r_ctl.rc_rttprop,
11034                                                          (bbr_filter_len_sec * USECS_IN_SECOND));
11035                                         cur_rttp = bbr_rtt_probe_limit;
11036                                         newval = (bbr_filter_len_sec * USECS_IN_SECOND);
11037                                         val = 2;
11038                                 } else {
11039                                         /*
11040                                          * Well does some adjustment make sense?
11041                                          */
11042                                         if (cur_rttp < bbr->r_ctl.rc_probertt_int) {
11043                                                 /* We can reduce interval time some */
11044                                                 bbr->r_ctl.rc_probertt_int = cur_rttp;
11045                                                 reset_time_small(&bbr->r_ctl.rc_rttprop, newval);
11046                                                 val = 3;
11047                                         }
11048                                 }
11049                         }
11050                 }
11051                 if (val)
11052                         bbr_log_rtt_shrinks(bbr, cts, cur_rttp, newval, __LINE__, BBR_RTTS_RESETS_VALUES, val);
11053         }
11054 }
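
/*
 * A numeric sketch of the adjustment above, under assumed defaults
 * (BBR_SUBSTATE_COUNT == 8, bbr_filter_len_sec == 6 and
 * bbr_rtt_probe_limit == 4 seconds): with a prop-rtt of 500ms,
 * baseval = 500000 * 9 = 4500000us, which rounds up to cur_rttp = 5
 * seconds. That exceeds the current interval, so in the non-ratio case
 * rc_probertt_int becomes 5s and the rttprop filter is re-armed to
 * newval = 5s + (6s - 4s) = 7s; a flow too slow to finish all nine
 * sub-state cycles thus gets both a longer probe interval and a longer
 * filter window.
 */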
11055
11056 static void
11057 bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
11058 {
11059         /* Exit probe-rtt */
11060
11061         if (tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd) {
11062                 tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd;
11063                 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
11064         }
11065         bbr_log_exit_gain(bbr, cts, 1);
11066         bbr->rc_hit_state_1 = 0;
11067         bbr->r_ctl.rc_rtt_shrinks = cts;
11068         bbr->r_ctl.last_in_probertt = cts;
11069         bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_RTTPROBE, 0);
11070         bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
11071         bbr->r_ctl.r_app_limited_until = (ctf_flight_size(tp,
11072                                               (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) +
11073                                           bbr->r_ctl.rc_delivered);
11074         if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
11075                 uint32_t time_in;
11076
11077                 time_in = cts - bbr->r_ctl.rc_bbr_state_time;
11078                 counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
11079         }
11080         if (bbr->rc_filled_pipe) {
11081                 /* Switch to probe_bw */
11082                 bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
11083                 bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts);
11084                 bbr->r_ctl.rc_bbr_cwnd_gain = bbr_cwnd_gain;
11085                 bbr_substate_change(bbr, cts, __LINE__, 0);
11086                 bbr_log_type_statechange(bbr, cts, __LINE__);
11087         } else {
11088                 /* Back to startup */
11089                 bbr->rc_bbr_state = BBR_STATE_STARTUP;
11090                 bbr->r_ctl.rc_bbr_state_time = cts;
11091                 /*
11092                  * We don't want to grant a completely free 3
11093                  * measurements until we exit, so we add the
11094                  * number of packet epochs we spent in probe-rtt
11095                  * to the startup epoch. That way
11096                  * we will still retain the old state.
11097                  */
11098                 bbr->r_ctl.rc_bbr_last_startup_epoch += (bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_pe_of_prtt);
11099                 bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
11100                 /* Make sure to use the lower pg when shifting back in */
11101                 if (bbr->r_ctl.rc_lost &&
11102                     bbr_use_lower_gain_in_startup &&
11103                     (bbr->rc_use_google == 0))
11104                         bbr->r_ctl.rc_bbr_hptsi_gain = bbr_startup_lower;
11105                 else
11106                         bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg;
11107                 bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg;
11108                 /* Probably not needed but set it anyway */
11109                 bbr_set_state_target(bbr, __LINE__);
11110                 bbr_log_type_statechange(bbr, cts, __LINE__);
11111                 bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
11112                     bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 0);
11113         }
11114         bbr_check_probe_rtt_limits(bbr, cts);
11115 }
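
/*
 * Example of the epoch credit above (epoch numbers invented for
 * illustration): entering probe-rtt at packet epoch 40 (rc_pe_of_prtt)
 * and exiting at epoch 43 advances rc_bbr_last_startup_epoch by 3, so
 * the startup exit test (rc_pkt_epoch - rc_bbr_last_startup_epoch >=
 * BBR_STARTUP_EPOCHS) sees the same distance it saw before probe-rtt
 * instead of a free restart.
 */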
11116
11117 static int32_t inline
11118 bbr_should_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts)
11119 {
11120         if ((bbr->rc_past_init_win == 1) &&
11121             (bbr->rc_in_persist == 0) &&
11122             (bbr_calc_time(cts, bbr->r_ctl.rc_rtt_shrinks) >= bbr->r_ctl.rc_probertt_int)) {
11123                 return (1);
11124         }
11125         if (bbr_can_force_probertt &&
11126             (bbr->rc_in_persist == 0) &&
11127             (TSTMP_GT(cts, bbr->r_ctl.last_in_probertt)) &&
11128             ((cts - bbr->r_ctl.last_in_probertt) > bbr->r_ctl.rc_probertt_int)) {
11129                 return (1);
11130         }
11131         return (0);
11132 }
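
/*
 * For instance (interval value assumed): with rc_probertt_int at 4
 * seconds, a flow past its initial window, not in persist, whose last
 * rtt shrink was recorded 4.5 seconds ago satisfies the first test and
 * enters probe-rtt.
 */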
11133
11134
11135 static int32_t
11136 bbr_google_startup(struct tcp_bbr *bbr, uint32_t cts, int32_t  pkt_epoch)
11137 {
11138         uint64_t btlbw, gain;
11139         if (pkt_epoch == 0) {
11140                 /*
11141                  * Need to be on a pkt-epoch to continue.
11142                  */
11143                 return (0);
11144         }
11145         btlbw = bbr_get_full_bw(bbr);
11146         gain = ((bbr->r_ctl.rc_bbr_lastbtlbw *
11147                  (uint64_t)bbr_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw;
11148         if (btlbw >= gain) {
11149                 bbr->r_ctl.rc_bbr_last_startup_epoch = bbr->r_ctl.rc_pkt_epoch;
11150                 bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
11151                                       bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 3);
11152                 bbr->r_ctl.rc_bbr_lastbtlbw = btlbw;
11153         }
11154         if ((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS)
11155                 return (1);
11156         bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
11157                               bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 8);
11158         return (0);
11159 }
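
/*
 * Numeric sketch of the exit test above, assuming bbr_start_exit == 25
 * (i.e. a 25% growth requirement): with rc_bbr_lastbtlbw at 8 units of
 * bandwidth the threshold is 8 * 25 / 100 + 8 = 10. Any pkt-epoch
 * measurement of 10 or more moves the startup epoch mark forward; once
 * BBR_STARTUP_EPOCHS epochs pass without such growth we declare the
 * pipe full and return 1.
 */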
11160
11161 static int32_t inline
11162 bbr_state_startup(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, int32_t pkt_epoch)
11163 {
11164         /* Have we gained 25% in the last 3 packet-based epochs? */
11165         uint64_t btlbw, gain;
11166         int do_exit;
11167         int delta, rtt_gain;
11168
11169         if ((bbr->rc_tp->snd_una == bbr->rc_tp->snd_max) &&
11170             (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) {
11171                 /*
11172                  * This qualifies as an RTT_PROBE session since we dropped the
11173                  * data outstanding to nothing and waited more than
11174                  * bbr_rtt_probe_time.
11175                  */
11176                 bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0);
11177                 bbr_set_reduced_rtt(bbr, cts, __LINE__);
11178         }
11179         if (bbr_should_enter_probe_rtt(bbr, cts)) {
11180                 bbr_enter_probe_rtt(bbr, cts, __LINE__);
11181                 return (0);
11182         }
11183         if (bbr->rc_use_google)
11184                 return (bbr_google_startup(bbr, cts,  pkt_epoch));
11185
11186         if ((bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_startup) &&
11187             (bbr_use_lower_gain_in_startup)) {
11188                 /* Drop to a lower gain, 1.5x, since we saw loss */
11189                 bbr->r_ctl.rc_bbr_hptsi_gain = bbr_startup_lower;
11190         }
11191         if (pkt_epoch == 0) {
11192                 /*
11193                  * Need to be on a pkt-epoch to continue.
11194                  */
11195                 return (0);
11196         }
11197         if (bbr_rtt_gain_thresh) {
11198                 /*
11199                  * Do we allow a flow to stay
11200                  * in startup with no loss and no
11201                  * gain in rtt over a set threshold?
11202                  */
11203                 if (bbr->r_ctl.rc_pkt_epoch_rtt &&
11204                     bbr->r_ctl.startup_last_srtt &&
11205                     (bbr->r_ctl.rc_pkt_epoch_rtt > bbr->r_ctl.startup_last_srtt)) {
11206                         delta = bbr->r_ctl.rc_pkt_epoch_rtt - bbr->r_ctl.startup_last_srtt;
11207                         rtt_gain = (delta * 100) / bbr->r_ctl.startup_last_srtt;
11208                 } else
11209                         rtt_gain = 0;
11210                 if ((bbr->r_ctl.startup_last_srtt == 0)  ||
11211                     (bbr->r_ctl.rc_pkt_epoch_rtt < bbr->r_ctl.startup_last_srtt))
11212                         /* First time or new lower value */
11213                         bbr->r_ctl.startup_last_srtt = bbr->r_ctl.rc_pkt_epoch_rtt;
11214
11215                 if ((bbr->r_ctl.rc_lost == 0) &&
11216                     (rtt_gain < bbr_rtt_gain_thresh)) {
11217                         /*
11218                          * No loss, and we are under
11219                          * our gain threshold for
11220                          * increasing RTT.
11221                          */
11222                         if (bbr->r_ctl.rc_bbr_last_startup_epoch < bbr->r_ctl.rc_pkt_epoch)
11223                                 bbr->r_ctl.rc_bbr_last_startup_epoch++;
11224                         bbr_log_startup_event(bbr, cts, rtt_gain,
11225                                               delta, bbr->r_ctl.startup_last_srtt, 10);
11226                         return (0);
11227                 }
11228         }
11229         if ((bbr->r_ctl.r_measurement_count == bbr->r_ctl.last_startup_measure) &&
11230             (bbr->r_ctl.rc_lost_at_startup == bbr->r_ctl.rc_lost) &&
11231             (!IN_RECOVERY(bbr->rc_tp->t_flags))) {
11232                 /*
11233                  * We only assess a new measurement when
11234                  * we have no loss and are not in recovery.
11235                  * Drag our last_startup epoch up by one so we will hold
11236                  * the number of non-gain epochs we have already accumulated.
11237                  */
11238                 if (bbr->r_ctl.rc_bbr_last_startup_epoch < bbr->r_ctl.rc_pkt_epoch)
11239                         bbr->r_ctl.rc_bbr_last_startup_epoch++;
11240                 bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
11241                                       bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 9);
11242                 return (0);
11243         }
11244         /* Case where we reduced the lost (bad retransmit) */
11245         if (bbr->r_ctl.rc_lost_at_startup > bbr->r_ctl.rc_lost)
11246                 bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
11247         bbr->r_ctl.last_startup_measure = bbr->r_ctl.r_measurement_count;
11248         btlbw = bbr_get_full_bw(bbr);
11249         if (bbr->r_ctl.rc_bbr_hptsi_gain == bbr_startup_lower)
11250                 gain = ((bbr->r_ctl.rc_bbr_lastbtlbw *
11251                          (uint64_t)bbr_low_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw;
11252         else
11253                 gain = ((bbr->r_ctl.rc_bbr_lastbtlbw *
11254                          (uint64_t)bbr_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw;
11255         do_exit = 0;
11256         if (btlbw > bbr->r_ctl.rc_bbr_lastbtlbw)
11257                 bbr->r_ctl.rc_bbr_lastbtlbw = btlbw;
11258         if (btlbw >= gain) {
11259                 bbr->r_ctl.rc_bbr_last_startup_epoch = bbr->r_ctl.rc_pkt_epoch;
11260                 /* Update the lost so we won't exit in the next set of tests */
11261                 bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
11262                 bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
11263                                       bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 3);
11264         }
11265         if ((bbr->rc_loss_exit &&
11266              (bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_startup) &&
11267              (bbr->r_ctl.rc_pkt_epoch_loss_rate > bbr_startup_loss_thresh)) &&
11268             ((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS)) {
11269                 /*
11270                  * If we had no gain, we had loss, that loss was above
11271                  * our threshold, the rwnd is not constrained, and we have
11272                  * been through at least 3 packet epochs, then we exit. Note
11273                  * that this is switched off by sysctl. Google does not do
11274                  * this, by the way.
11275                  */
11276                 if ((ctf_flight_size(bbr->rc_tp,
11277                          (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) +
11278                      (2 * max(bbr->r_ctl.rc_pace_max_segs, bbr->rc_tp->t_maxseg))) <= bbr->rc_tp->snd_wnd) {
11279                         do_exit = 1;
11280                         bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
11281                                               bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 4);
11282                 } else {
11283                         /* Just record an updated loss value */
11284                         bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
11285                         bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
11286                                               bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 5);
11287                 }
11288         } else
11289                 bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
11290         if (((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS) ||
11291             do_exit) {
11292                 /* Return 1 to exit the startup state. */
11293                 return (1);
11294         }
11295         /* Stay in startup */
11296         bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
11297                               bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 8);
11298         return (0);
11299 }
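
/*
 * Worked rtt_gain example for the threshold check above (all values
 * assumed): with startup_last_srtt = 50000us and a pkt-epoch rtt of
 * 56000us, delta = 6000 and rtt_gain = 6000 * 100 / 50000 = 12. With a
 * bbr_rtt_gain_thresh of 20 and no loss the flow is held in startup;
 * with a threshold of 5 it falls through to the bandwidth-growth tests.
 */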
11300
11301 static void
11302 bbr_state_change(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, int32_t pkt_epoch, uint32_t losses)
11303 {
11304         /*
11305          * A tick occurred in the rtt epoch; do we need to do anything?
11306          */
11307 #ifdef BBR_INVARIANTS
11308         if ((bbr->rc_bbr_state != BBR_STATE_STARTUP) &&
11309             (bbr->rc_bbr_state != BBR_STATE_DRAIN) &&
11310             (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) &&
11311             (bbr->rc_bbr_state != BBR_STATE_IDLE_EXIT) &&
11312             (bbr->rc_bbr_state != BBR_STATE_PROBE_BW)) {
11313                 /* Debug code? */
11314                 panic("Unknown BBR state %d?\n", bbr->rc_bbr_state);
11315         }
11316 #endif
11317         if (bbr->rc_bbr_state == BBR_STATE_STARTUP) {
11318                 /* Do we exit the startup state? */
11319                 if (bbr_state_startup(bbr, cts, epoch, pkt_epoch)) {
11320                         uint32_t time_in;
11321
11322                         bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
11323                                               bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 6);
11324                         bbr->rc_filled_pipe = 1;
11325                         bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
11326                         if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
11327
11328                                 time_in = cts - bbr->r_ctl.rc_bbr_state_time;
11329                                 counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
11330                         } else
11331                                 time_in = 0;
11332                         if (bbr->rc_no_pacing)
11333                                 bbr->rc_no_pacing = 0;
11334                         bbr->r_ctl.rc_bbr_state_time = cts;
11335                         bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_drain_pg;
11336                         bbr->rc_bbr_state = BBR_STATE_DRAIN;
11337                         bbr_set_state_target(bbr, __LINE__);
11338                         if ((bbr->rc_use_google == 0) &&
11339                             bbr_slam_cwnd_in_main_drain) {
11340                                 /* Here we don't have to worry about probe-rtt */
11341                                 bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd;
11342                                 bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
11343                                 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
11344                         }
11345                         bbr->r_ctl.rc_bbr_cwnd_gain = bbr_high_gain;
11346                         bbr_log_type_statechange(bbr, cts, __LINE__);
11347                         if (ctf_flight_size(bbr->rc_tp,
11348                                 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <=
11349                             bbr->r_ctl.rc_target_at_state) {
11350                                 /*
11351                                  * Switch to probe_bw right away if in-flight
11352                                  * is already at or below the drain target.
11353                                  */
11354                                 bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts);
11355                                 bbr_substate_change(bbr, cts, __LINE__, 0);
11356                                 bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
11357                                 bbr_log_type_statechange(bbr, cts, __LINE__);
11358                         }
11359                 }
11360         } else if (bbr->rc_bbr_state == BBR_STATE_IDLE_EXIT) {
11361                 uint32_t inflight;
11362                 struct tcpcb *tp;
11363
11364                 tp = bbr->rc_tp;
11365                 inflight = ctf_flight_size(tp,
11366                               (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
11367                 if (inflight >= bbr->r_ctl.rc_target_at_state) {
11368                         /* We have reached a flight of the cwnd target */
11369                         bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
11370                         bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
11371                         bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT;
11372                         bbr_set_state_target(bbr, __LINE__);
11373                         /*
11374                          * Rig it so we don't do anything crazy and
11375                          * start fresh with a new randomization.
11376                          */
11377                         bbr->r_ctl.bbr_smallest_srtt_this_state = 0xffffffff;
11378                         bbr->rc_bbr_substate = BBR_SUB_LEVEL6;
11379                         bbr_substate_change(bbr, cts, __LINE__, 1);
11380                 }
11381         } else if (bbr->rc_bbr_state == BBR_STATE_DRAIN) {
11382                 /* Has in-flight dropped to the bdp (or below)? */
11383                 uint32_t inflight;
11384                 struct tcpcb *tp;
11385
11386                 tp = bbr->rc_tp;
11387                 inflight = ctf_flight_size(tp,
11388                               (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
11389                 if ((bbr->rc_use_google == 0) &&
11390                     bbr_slam_cwnd_in_main_drain &&
11391                     (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) {
11392                         /*
11393                          * Here we don't have to worry about probe-rtt;
11394                          * re-slam it and keep it slammed down.
11395                          */
11396                         bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
11397                         bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
11398                 }
11399                 if (inflight <= bbr->r_ctl.rc_target_at_state) {
11400                         /* We have drained */
11401                         bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
11402                         bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
11403                         if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
11404                                 uint32_t time_in;
11405
11406                                 time_in = cts - bbr->r_ctl.rc_bbr_state_time;
11407                                 counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
11408                         }
11409                         if ((bbr->rc_use_google == 0) &&
11410                             bbr_slam_cwnd_in_main_drain &&
11411                             (tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) {
11412                                 /* Restore the cwnd */
11413                                 tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd;
11414                                 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
11415                         }
11416                         /* Set up as if probe-rtt has been done now RRS-HERE */
11417                         bbr->r_ctl.rc_rtt_shrinks = cts;
11418                         bbr->r_ctl.last_in_probertt = cts;
11419                         bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_LEAVE_DRAIN, 0);
11420                         /* Randomly pick a sub-state */
11421                         bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts);
11422                         bbr_substate_change(bbr, cts, __LINE__, 0);
11423                         bbr_log_type_statechange(bbr, cts, __LINE__);
11424                 }
11425         } else if (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) {
11426                 uint32_t flight;
11427
11428                 flight = ctf_flight_size(bbr->rc_tp,
11429                              (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
11430                 bbr->r_ctl.r_app_limited_until = (flight + bbr->r_ctl.rc_delivered);
11431                 if (((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google) &&
11432                     (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) {
11433                         /*
11434                          * We must keep cwnd at the desired multiple of MSS.
11435                          */
11436                         bbr->rc_tp->snd_cwnd = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options);
11437                         bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
11438                 } else if ((bbr_prtt_slam_cwnd) &&
11439                            (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) {
11440                         /* Re-slam it */
11441                         bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
11442                         bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
11443                 }
11444                 if (bbr->r_ctl.rc_bbr_enters_probertt == 0) {
11445                         /* Has outstanding reached our target? */
11446                         if (flight <= bbr->r_ctl.rc_target_at_state) {
11447                                 bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_REACHTAR, 0);
11448                                 bbr->r_ctl.rc_bbr_enters_probertt = cts;
11449                                 /* If time is exactly 0, be 1usec off */
11450                                 if (bbr->r_ctl.rc_bbr_enters_probertt == 0)
11451                                         bbr->r_ctl.rc_bbr_enters_probertt = 1;
11452                                 if (bbr->rc_use_google == 0) {
11453                                         /*
11454                                          * Restore any lowering that has occurred to
11455                                          * reach here
11456                                          */
11457                                         if (bbr->r_ctl.bbr_rttprobe_gain_val)
11458                                                 bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.bbr_rttprobe_gain_val;
11459                                         else
11460                                                 bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
11461                                 }
11462                         }
11463                         if ((bbr->r_ctl.rc_bbr_enters_probertt == 0) &&
11464                             (bbr->rc_use_google == 0) &&
11465                             bbr->r_ctl.bbr_rttprobe_gain_val &&
11466                             (((cts - bbr->r_ctl.rc_probertt_srttchktim) > bbr_get_rtt(bbr, bbr_drain_rtt)) ||
11467                              (flight >= bbr->r_ctl.flightsize_at_drain))) {
11468                                 /*
11469                                  * We have dawdled with our current hptsi
11470                                  * gain for an srtt and have still not made it
11471                                  * to target, or we have increased our flight.
11472                                  * Let's reduce the gain by xx%,
11473                                  * flooring the reduction at DRAIN (based on
11474                                  * mul/div)
11475                                  */
11476                                 int red;
11477
11478                                 bbr->r_ctl.flightsize_at_drain = flight;
11479                                 bbr->r_ctl.rc_probertt_srttchktim = cts;
11480                                 red = max((bbr->r_ctl.bbr_rttprobe_gain_val / 10), 1);
11481                                 if ((bbr->r_ctl.rc_bbr_hptsi_gain - red) > max(bbr_drain_floor, 1)) {
11482                                         /* Reduce our gain again */
11483                                         bbr->r_ctl.rc_bbr_hptsi_gain -= red;
11484                                         bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_SHRINK_PG, 0);
11485                                 } else if (bbr->r_ctl.rc_bbr_hptsi_gain > max(bbr_drain_floor, 1)) {
11486                                         /* one more chance before we give up */
11487                                         bbr->r_ctl.rc_bbr_hptsi_gain = max(bbr_drain_floor, 1);
11488                                         bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_SHRINK_PG_FINAL, 0);
11489                                 } else {
11490                                         /* At the very bottom */
11491                                         bbr->r_ctl.rc_bbr_hptsi_gain = max((bbr_drain_floor-1), 1);
11492                                 }
11493                         }
11494                 }
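                /*
                 * Numeric sketch of the gain reduction above (gain value
                 * assumed): with bbr_rttprobe_gain_val == 192, red =
                 * max(192 / 10, 1) = 19, so the hptsi gain steps
                 * 192 -> 173 -> 154 -> ... ; when one more step would
                 * cross max(bbr_drain_floor, 1) we take a final step to
                 * the floor, and after that we park just below it.
                 */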
11495                 if (bbr->r_ctl.rc_bbr_enters_probertt &&
11496                     (TSTMP_GT(cts, bbr->r_ctl.rc_bbr_enters_probertt)) &&
11497                     ((cts - bbr->r_ctl.rc_bbr_enters_probertt) >= bbr_rtt_probe_time)) {
11498                         /* Time to exit probe RTT normally */
11499                         bbr_exit_probe_rtt(bbr->rc_tp, bbr, cts);
11500                 }
11501         } else if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) {
11502                 if ((bbr->rc_tp->snd_una == bbr->rc_tp->snd_max) &&
11503                     (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) {
11504                         /*
11505                          * This qualifies as an RTT_PROBE session since we
11506                          * dropped the data outstanding to nothing and waited
11507                          * more than bbr_rtt_probe_time.
11508                          */
11509                         bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0);
11510                         bbr_set_reduced_rtt(bbr, cts, __LINE__);
11511                 }
11512                 if (bbr_should_enter_probe_rtt(bbr, cts)) {
11513                         bbr_enter_probe_rtt(bbr, cts, __LINE__);
11514                 } else {
11515                         bbr_set_probebw_gains(bbr, cts, losses);
11516                 }
11517         }
11518 }
11519
11520 static void
11521 bbr_check_bbr_for_state(struct tcp_bbr *bbr, uint32_t cts, int32_t line, uint32_t losses)
11522 {
11523         int32_t epoch = 0;
11524
11525         if ((cts - bbr->r_ctl.rc_rcv_epoch_start) >= bbr_get_rtt(bbr, BBR_RTT_PROP)) {
11526                 bbr_set_epoch(bbr, cts, line);
11527                 /* At each epoch do lt (long-term) bw sampling */
11528                 epoch = 1;
11529         }
11530         bbr_state_change(bbr, cts, epoch, bbr->rc_is_pkt_epoch_now, losses);
11531 }
11532
11533 static int
11534 bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
11535     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
11536     int32_t nxt_pkt, struct timeval *tv)
11537 {
11538         int32_t thflags, retval;
11539         uint32_t cts, lcts;
11540         uint32_t tiwin;
11541         struct tcpopt to;
11542         struct tcp_bbr *bbr;
11543         struct bbr_sendmap *rsm;
11544         struct timeval ltv;
11545         int32_t did_out = 0;
11546         int32_t in_recovery;
11547         uint16_t nsegs;
11548         int32_t prev_state;
11549         uint32_t lost;
11550
11551         nsegs = max(1, m->m_pkthdr.lro_nsegs);
11552         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
11553         /* add in our stats */
11554         kern_prefetch(bbr, &prev_state);
11555         prev_state = 0;
11556         thflags = th->th_flags;
11557         /*
11558          * If this is either a state-changing packet or current state isn't
11559          * established, we require a write lock on tcbinfo.  Otherwise, we
11560          * allow the tcbinfo to be in either a locked or unlocked state, as the
11561          * caller may have unnecessarily acquired a write lock due to a
11562          * race.
11563          */
11564         INP_WLOCK_ASSERT(tp->t_inpcb);
11565         KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
11566             __func__));
11567         KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
11568             __func__));
11569
11570         tp->t_rcvtime = ticks;
11571         /*
11572          * Unscale the window into a 32-bit value. For the SYN_SENT state
11573          * the scale is zero.
11574          */
11575         tiwin = th->th_win << tp->snd_scale;
11576 #ifdef STATS
11577         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
11578 #endif
11579         /*
11580          * Parse options on any incoming segment.
11581          */
11582         tcp_dooptions(&to, (u_char *)(th + 1),
11583             (th->th_off << 2) - sizeof(struct tcphdr),
11584             (thflags & TH_SYN) ? TO_SYN : 0);
11585
11586         if (m->m_flags & M_TSTMP) {
11587                 /* Prefer the hardware timestamp if present */
11588                 struct timespec ts;
11589
11590                 mbuf_tstmp2timespec(m, &ts);
11591                 bbr->rc_tv.tv_sec = ts.tv_sec;
11592                 bbr->rc_tv.tv_usec = ts.tv_nsec / 1000;
11593                 bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv);
11594         } else if (m->m_flags & M_TSTMP_LRO) {
11595                 /* Next best: the LRO arrival timestamp */
11596                 struct timespec ts;
11597
11598                 mbuf_tstmp2timespec(m, &ts);
11599                 bbr->rc_tv.tv_sec = ts.tv_sec;
11600                 bbr->rc_tv.tv_usec = ts.tv_nsec / 1000;
11601                 bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv);
11602         } else {
11603                 /*
11604                  * Ok just get the current time.
11605                  */
11606                 bbr->r_ctl.rc_rcvtime = lcts = cts = tcp_get_usecs(&bbr->rc_tv);
11607         }
11608         /*
11609          * If echoed timestamp is later than the current time, fall back to
11610          * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
11611          * were used when this connection was established.
11612          */
11613         if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
11614                 to.to_tsecr -= tp->ts_offset;
11615                 if (TSTMP_GT(to.to_tsecr, tcp_tv_to_mssectick(&bbr->rc_tv)))
11616                         to.to_tsecr = 0;
11617         }
11618         /*
11619          * If it's the first time in, we need to take care of options and
11620          * verify we can do SACK for rack!
11621          */
11622         if (bbr->r_state == 0) {
11623                 /*
11624                  * Process options only when we get SYN/ACK back. The SYN
11625                  * case for incoming connections is handled in tcp_syncache.
11626                  * According to RFC1323 the window field in a SYN (i.e., a
11627                  * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
11628                  * this is traditional behavior, may need to be cleaned up.
11629                  */
11630                 if (bbr->rc_inp == NULL) {
11631                         bbr->rc_inp = tp->t_inpcb;
11632                 }
11633                 /*
11634                  * We need to init rc_inp here since it's not init'd when
11635                  * bbr_init is called
11636                  */
11637                 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
11638                         if ((to.to_flags & TOF_SCALE) &&
11639                             (tp->t_flags & TF_REQ_SCALE)) {
11640                                 tp->t_flags |= TF_RCVD_SCALE;
11641                                 tp->snd_scale = to.to_wscale;
11642                         }
11643                         /*
11644                          * Initial send window.  It will be updated with the
11645                          * next incoming segment to the scaled value.
11646                          */
11647                         tp->snd_wnd = th->th_win;
11648                         if (to.to_flags & TOF_TS) {
11649                                 tp->t_flags |= TF_RCVD_TSTMP;
11650                                 tp->ts_recent = to.to_tsval;
11651                                 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
11652                         }
11653                         if (to.to_flags & TOF_MSS)
11654                                 tcp_mss(tp, to.to_mss);
11655                         if ((tp->t_flags & TF_SACK_PERMIT) &&
11656                             (to.to_flags & TOF_SACKPERM) == 0)
11657                                 tp->t_flags &= ~TF_SACK_PERMIT;
11658                         if (IS_FASTOPEN(tp->t_flags)) {
11659                                 if (to.to_flags & TOF_FASTOPEN) {
11660                                         uint16_t mss;
11661
11662                                         if (to.to_flags & TOF_MSS)
11663                                                 mss = to.to_mss;
11664                                         else
11665                                                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
11666                                                         mss = TCP6_MSS;
11667                                                 else
11668                                                         mss = TCP_MSS;
11669                                         tcp_fastopen_update_cache(tp, mss,
11670                                             to.to_tfo_len, to.to_tfo_cookie);
11671                                 } else
11672                                         tcp_fastopen_disable_path(tp);
11673                         }
11674                 }
11675                 /*
11676                  * At this point we are at the initial call. Here we decide
11677                  * if we are doing RACK or not. We do this by seeing if
11678                  * TF_SACK_PERMIT is set; if not, rack is *not* possible and
11679                  * we switch to the default code.
11680                  */
11681                 if ((tp->t_flags & TF_SACK_PERMIT) == 0) {
11682                         /* Bail */
11683                         tcp_switch_back_to_default(tp);
11684                         (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
11685                             tlen, iptos);
11686                         return (1);
11687                 }
11688                 /* Set the flag */
11689                 bbr->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
11690                 tcp_set_hpts(tp->t_inpcb);
11691                 sack_filter_clear(&bbr->r_ctl.bbr_sf, th->th_ack);
11692         }
11693         if (thflags & TH_ACK) {
11694                 /* Track ack types */
11695                 if (to.to_flags & TOF_SACK)
11696                         BBR_STAT_INC(bbr_acks_with_sacks);
11697                 else
11698                         BBR_STAT_INC(bbr_plain_acks);
11699         }
11700         /*
11701          * This is the one exception case where we set the rack state
11702          * always. All other times (timers, etc.) we must have a rack-state
11703          * set (so we assure we have done the checks above for SACK).
11704          */
11705         if (bbr->r_state != tp->t_state)
11706                 bbr_set_state(tp, bbr, tiwin);
11707
11708         if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map)) != NULL)
11709                 kern_prefetch(rsm, &prev_state);
11710         prev_state = bbr->r_state;
11711         bbr->rc_ack_was_delayed = 0;
11712         lost = bbr->r_ctl.rc_lost;
11713         bbr->rc_is_pkt_epoch_now = 0;
11714         if (m->m_flags & (M_TSTMP|M_TSTMP_LRO)) {
11715                 /* Get the real time into lcts and figure the real delay */
11716                 lcts = tcp_get_usecs(&ltv);
11717                 if (TSTMP_GT(lcts, cts)) {
11718                         bbr->r_ctl.rc_ack_hdwr_delay = lcts - cts;
11719                         bbr->rc_ack_was_delayed = 1;
11720                         if (TSTMP_GT(bbr->r_ctl.rc_ack_hdwr_delay,
11721                                      bbr->r_ctl.highest_hdwr_delay))
11722                                 bbr->r_ctl.highest_hdwr_delay = bbr->r_ctl.rc_ack_hdwr_delay;
11723                 } else {
11724                         bbr->r_ctl.rc_ack_hdwr_delay = 0;
11725                         bbr->rc_ack_was_delayed = 0;
11726                 }
11727         } else {
11728                 bbr->r_ctl.rc_ack_hdwr_delay = 0;
11729                 bbr->rc_ack_was_delayed = 0;
11730         }
11731         bbr_log_ack_event(bbr, th, &to, tlen, nsegs, cts, nxt_pkt, m);
11732         if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
11733                 retval = 0;
11734                 m_freem(m);
11735                 goto done_with_input;
11736         }
11737         /*
11738          * If a segment with the ACK-bit set arrives in the SYN-SENT state
11739          * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
11740          */
11741         if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
11742             (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
11743                 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11744                 return (1);
11745         }
11746         in_recovery = IN_RECOVERY(tp->t_flags);
11747         if (tiwin > bbr->r_ctl.rc_high_rwnd)
11748                 bbr->r_ctl.rc_high_rwnd = tiwin;
11749 #ifdef BBR_INVARIANTS
11750         if ((tp->t_inpcb->inp_flags & INP_DROPPED) ||
11751             (tp->t_inpcb->inp_flags2 & INP_FREED)) {
11752                 panic("tp:%p bbr:%p given a dropped inp:%p",
11753                     tp, bbr, tp->t_inpcb);
11754         }
11755 #endif
11756         bbr->r_ctl.rc_flight_at_input = ctf_flight_size(tp,
11757                                             (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
11758         bbr->rtt_valid = 0;
11759         if (to.to_flags & TOF_TS) {
11760                 bbr->rc_ts_valid = 1;
11761                 bbr->r_ctl.last_inbound_ts = to.to_tsval;
11762         } else {
11763                 bbr->rc_ts_valid = 0;
11764                 bbr->r_ctl.last_inbound_ts = 0;
11765         }
11766         retval = (*bbr->r_substate) (m, th, so,
11767             tp, &to, drop_hdrlen,
11768             tlen, tiwin, thflags, nxt_pkt);
11769 #ifdef BBR_INVARIANTS
11770         if ((retval == 0) &&
11771             (tp->t_inpcb == NULL)) {
11772                 panic("retval:%d tp:%p t_inpcb:NULL state:%d",
11773                     retval, tp, prev_state);
11774         }
11775 #endif
11776         if (nxt_pkt == 0)
11777                 BBR_STAT_INC(bbr_rlock_left_ret0);
11778         else
11779                 BBR_STAT_INC(bbr_rlock_left_ret1);
11780         if (retval == 0) {
11781                 /*
11782                  * If retval is 1 the tcb is unlocked and most likely the tp
11783                  * is gone.
11784                  */
11785                 INP_WLOCK_ASSERT(tp->t_inpcb);
11786                 tcp_bbr_xmit_timer_commit(bbr, tp, cts);
11787                 if (bbr->rc_is_pkt_epoch_now)
11788                         bbr_set_pktepoch(bbr, cts, __LINE__);
11789                 bbr_check_bbr_for_state(bbr, cts, __LINE__, (bbr->r_ctl.rc_lost - lost));
11790                 if (nxt_pkt == 0) {
11791                         if (bbr->r_wanted_output != 0) {
11792                                 bbr->rc_output_starts_timer = 0;
11793                                 did_out = 1;
11794                                 (void)tp->t_fb->tfb_tcp_output(tp);
11795                         } else
11796                                 bbr_start_hpts_timer(bbr, tp, cts, 6, 0, 0);
11797                 }
11798                 if ((nxt_pkt == 0) &&
11799                     ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
11800                     (SEQ_GT(tp->snd_max, tp->snd_una) ||
11801                      (tp->t_flags & TF_DELACK) ||
11802                      ((V_tcp_always_keepalive || bbr->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
11803                       (tp->t_state <= TCPS_CLOSING)))) {
11804                         /*
11805                          * We could not send (we are probably in the hpts
11806                          * but the timer was stopped)?
11807                          */
11808                         if ((tp->snd_max == tp->snd_una) &&
11809                             ((tp->t_flags & TF_DELACK) == 0) &&
11810                             (bbr->rc_inp->inp_in_hpts) &&
11811                             (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
11812                                 /*
11813                                  * A keep-alive is not needed while we are
11814                                  * still awaiting hptsi output.
11815                                  */
11816                                 ;
11817                         } else {
11818                                 if (bbr->rc_inp->inp_in_hpts) {
11819                                         tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT);
11820                                         if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
11821                                             (TSTMP_GT(lcts, bbr->rc_pacer_started))) {
11822                                                 uint32_t del;
11823
11824                                                 del = lcts - bbr->rc_pacer_started;
11825                                                 if (bbr->r_ctl.rc_last_delay_val > del) {
11826                                                         BBR_STAT_INC(bbr_force_timer_start);
11827                                                         bbr->r_ctl.rc_last_delay_val -= del;
11828                                                         bbr->rc_pacer_started = lcts;
11829                                                 } else {
11830                                                         /* We are late */
11831                                                         bbr->r_ctl.rc_last_delay_val = 0;
11832                                                         BBR_STAT_INC(bbr_force_output);
11833                                                         (void)tp->t_fb->tfb_tcp_output(tp);
11834                                                 }
11835                                         }
11836                                 }
11837                                 bbr_start_hpts_timer(bbr, tp, cts, 8, bbr->r_ctl.rc_last_delay_val,
11838                                     0);
11839                         }
11840                 } else if ((bbr->rc_output_starts_timer == 0) && (nxt_pkt == 0)) {
11841                         /* Do we have the correct timer running? */
11842                         bbr_timer_audit(tp, bbr, lcts, &so->so_snd);
11843                 }
11844                 /* Do we have a new state */
11845                 if (bbr->r_state != tp->t_state)
11846                         bbr_set_state(tp, bbr, tiwin);
11847 done_with_input:
11848                 bbr_log_doseg_done(bbr, cts, nxt_pkt, did_out);
11849                 if (did_out)
11850                         bbr->r_wanted_output = 0;
11851 #ifdef BBR_INVARIANTS
11852                 if (tp->t_inpcb == NULL) {
11853                         panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
11854                             did_out,
11855                             retval, tp, prev_state);
11856                 }
11857 #endif
11858         }
11859         return (retval);
11860 }
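
/*
 * Example of the pacer delay trim in the hpts block above (numbers
 * assumed): if the pacer was started 4000us before lcts with
 * rc_last_delay_val = 10000us, then del = 4000, the remaining delay
 * becomes 6000us and the pacer start is moved up to lcts; had del
 * consumed the whole stored delay we would instead zero it and force
 * output immediately.
 */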
11861
11862 static void
11863 bbr_log_type_hrdwtso(struct tcpcb *tp, struct tcp_bbr *bbr, int len, int mod, int what_we_can_send)
11864 {
11865         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
11866                 union tcp_log_stackspecific log;
11867                 struct timeval tv;
11868                 uint32_t cts;
11869
11870                 cts = tcp_get_usecs(&tv);
11871                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
11872                 log.u_bbr.flex1 = bbr->r_ctl.rc_pace_min_segs;
11873                 log.u_bbr.flex2 = what_we_can_send;
11874                 log.u_bbr.flex3 = bbr->r_ctl.rc_pace_max_segs;
11875                 log.u_bbr.flex4 = len;
11876                 log.u_bbr.flex5 = 0;
11877                 log.u_bbr.flex7 = mod;
11878                 log.u_bbr.flex8 = 1;
11879                 TCP_LOG_EVENTP(tp, NULL,
11880                     &tp->t_inpcb->inp_socket->so_rcv,
11881                     &tp->t_inpcb->inp_socket->so_snd,
11882                     TCP_HDWR_TLS, 0,
11883                     0, &log, false, &tv);
11884         }
11885 }
11886
11887 static void
11888 bbr_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
11889     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
11890 {
11891         struct timeval tv;
11892         int retval;
11893
11894         /* First let's see if we have old packets */
11895         if (tp->t_in_pkt) {
11896                 if (ctf_do_queued_segments(so, tp, 1)) {
11897                         m_freem(m);
11898                         return;
11899                 }
11900         }
11901         if (m->m_flags & M_TSTMP_LRO) {
11902                 tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
11903                 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000;
11904         } else {
11905                 /* Should not happen; should we KASSERT instead? */
11906                 tcp_get_usecs(&tv);
11907         }
11908         retval = bbr_do_segment_nounlock(m, th, so, tp,
11909                                          drop_hdrlen, tlen, iptos, 0, &tv);
11910         if (retval == 0)
11911                 INP_WUNLOCK(tp->t_inpcb);
11912 }
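
/*
 * The M_TSTMP_LRO conversion above splits a nanosecond arrival stamp
 * into a timeval; for example, rcv_tstmp = 1700000000123456789 yields
 * tv_sec = 1700000000 and tv_usec = 123456789 / 1000 = 123456.
 */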
11913
11914 /*
11915  * Return how much data can be sent without violating the
11916  * cwnd or rwnd.
11917  */
11918
11919 static inline uint32_t
11920 bbr_what_can_we_send(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t sendwin,
11921     uint32_t avail, int32_t sb_offset, uint32_t cts)
11922 {
11923         uint32_t len;
11924
11925         if (ctf_outstanding(tp) >= tp->snd_wnd) {
11926                 /* We never want to go over our peer's rcv-window */
11927                 len = 0;
11928         } else {
11929                 uint32_t flight;
11930
11931                 flight = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
11932                 if (flight >= sendwin) {
11933                         /*
11934                          * We have in flight what we are allowed by cwnd (if
11935                          * it were rwnd blocking it would have hit the check
11936                          * above: >= tp->snd_wnd).
11937                          */
11938                         return (0);
11939                 }
11940                 len = sendwin - flight;
11941                 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) {
11942                         /* We would send too much (beyond the rwnd) */
11943                         len = tp->snd_wnd - ctf_outstanding(tp);
11944                 }
11945                 if ((len + sb_offset) > avail) {
11946                         /*
11947                          * We don't have that much in the SB; how much is
11948                          * there?
11949                          */
11950                         len = avail - sb_offset;
11951                 }
11952         }
11953         return (len);
11954 }
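
/*
 * Worked example (all numbers assumed): with sendwin (the cwnd) =
 * 20000, flight = 12000, outstanding = 15000, snd_wnd = 18000,
 * avail = 30000 and sb_offset = 15000: len starts at 20000 - 12000 =
 * 8000, the rwnd clamp cuts it to 18000 - 15000 = 3000, and the socket
 * buffer still holds 30000 - 15000 = 15000 unsent bytes, so 3000 bytes
 * may be sent.
 */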
11955
11956 static inline void
11957 bbr_do_error_accounting(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, int32_t len, int32_t error)
11958 {
11959 #ifdef NETFLIX_STATS
11960         KMOD_TCPSTAT_INC(tcps_sndpack_error);
11961         KMOD_TCPSTAT_ADD(tcps_sndbyte_error, len);
11962 #endif
11963 }
11964
11965 static inline void
11966 bbr_do_send_accounting(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, int32_t len, int32_t error)
11967 {
11968         if (error) {
11969                 bbr_do_error_accounting(tp, bbr, rsm, len, error);
11970                 return;
11971         }
11972         if ((tp->t_flags & TF_FORCEDATA) && len == 1) {
11973                 /* Window probe */
11974                 KMOD_TCPSTAT_INC(tcps_sndprobe);
11975 #ifdef STATS
11976                 stats_voi_update_abs_u32(tp->t_stats,
11977                     VOI_TCP_RETXPB, len);
11978 #endif
11979         } else if (rsm) {
11980                 if (rsm->r_flags & BBR_TLP) {
11981                         /*
11982                          * A TLP should not count in the retransmit count, but in its
11983                          * own bin
11984                          */
11985 #ifdef NETFLIX_STATS
11986                         tp->t_sndtlppack++;
11987                         tp->t_sndtlpbyte += len;
11988                         KMOD_TCPSTAT_INC(tcps_tlpresends);
11989                         KMOD_TCPSTAT_ADD(tcps_tlpresend_bytes, len);
11990 #endif
11991                 } else {
11992                         /* Retransmit */
11993                         tp->t_sndrexmitpack++;
11994                         KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
11995                         KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
11996 #ifdef STATS
11997                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
11998                             len);
11999 #endif
12000                 }
12001                 /*
12002                  * Logs in 0 - 8; 8 is all non-probe_bw states, 0-7 is
12003                  * the sub-state.
12004                  */
12005                 counter_u64_add(bbr_state_lost[rsm->r_bbr_state], len);
12006                 if (bbr->rc_bbr_state != BBR_STATE_PROBE_BW) {
12007                         /* Non probe_bw log in 1, 2, or 4. */
12008                         counter_u64_add(bbr_state_resend[bbr->rc_bbr_state], len);
12009                 } else {
12010                         /*
12011                          * Log our probe state 3, and log also 5-13 to show
12012                          * us the recovery sub-state for the send. This
12013                          * means that 3 == (5+6+7+8+9+10+11+12+13)
12014                          */
12015                         counter_u64_add(bbr_state_resend[BBR_STATE_PROBE_BW], len);
12016                         counter_u64_add(bbr_state_resend[(bbr_state_val(bbr) + 5)], len);
12017                 }
12018                 /* Place in both 16's the totals of retransmitted */
12019                 counter_u64_add(bbr_state_lost[16], len);
12020                 counter_u64_add(bbr_state_resend[16], len);
12021                 /* Place in 17's the total sent */
12022                 counter_u64_add(bbr_state_resend[17], len);
12023                 counter_u64_add(bbr_state_lost[17], len);
12024
12025         } else {
12026                 /* New sends */
12027                 KMOD_TCPSTAT_INC(tcps_sndpack);
12028                 KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
12029                 /* Place in 17's the total sent */
12030                 counter_u64_add(bbr_state_resend[17], len);
12031                 counter_u64_add(bbr_state_lost[17], len);
12032 #ifdef STATS
12033                 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
12034                     len);
12035 #endif
12036         }
12037 }
12038
12039 static void
12040 bbr_cwnd_limiting(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t in_level)
12041 {
12042         if (bbr->rc_filled_pipe && bbr_target_cwnd_mult_limit && (bbr->rc_use_google == 0)) {
12043                 /*
12044                  * Limit the cwnd so it is not above N x the target plus what
12045                  * is outstanding. The target is based on the current b/w
12046                  * estimate.
12047                  */
12048                 uint32_t target;
12049
12050                 target = bbr_get_target_cwnd(bbr, bbr_get_bw(bbr), BBR_UNIT);
12051                 target += ctf_outstanding(tp);
12052                 target *= bbr_target_cwnd_mult_limit;
12053                 if (tp->snd_cwnd > target)
12054                         tp->snd_cwnd = target;
12055                 bbr_log_type_cwndupd(bbr, 0, 0, 0, 10, 0, 0, __LINE__);
12056         }
12057 }
12058
12059 static int
12060 bbr_window_update_needed(struct tcpcb *tp, struct socket *so, uint32_t recwin, int32_t maxseg)
12061 {
12062         /*
12063          * "adv" is the amount we could increase the window, taking into
12064          * account that we are limited by TCP_MAXWIN << tp->rcv_scale.
12065          */
12066         uint32_t adv;
12067         int32_t oldwin;
12068
12069         adv = min(recwin, TCP_MAXWIN << tp->rcv_scale);
12070         if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
12071                 oldwin = (tp->rcv_adv - tp->rcv_nxt);
12072                 adv -= oldwin;
12073         } else
12074                 oldwin = 0;
12075
12076         /*
12077          * If the new window size ends up being the same as the old size
12078          * when it is scaled, then don't force a window update.
12079          */
12080         if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
12081                 return (0);
12082
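              /*
               * Force a window update when we can move the edge by at least
               * two full segments and either the advance is 1/4 or more of
               * the receive buffer, the peer is down to 1/8th of the buffer,
               * or the buffer itself is small (8 segments or less).
               */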
12083         if (adv >= (2 * maxseg) &&
12084             (adv >= (so->so_rcv.sb_hiwat / 4) ||
12085             recwin <= (so->so_rcv.sb_hiwat / 8) ||
12086             so->so_rcv.sb_hiwat <= 8 * maxseg)) {
12087                 return (1);
12088         }
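              /* Also update when the advance is at least half the receive buffer. */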
12089         if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat)
12090                 return (1);
12091         return (0);
12092 }
12093
12094 /*
12095  * Return 0 on success and an errno on failure to send.
12096  * Note that a 0 return may not mean we sent anything
12097  * if the TCB was on the hpts. A non-zero return
12098  * does indicate the error we got from ip[6]_output.
12099  */
12100 static int
12101 bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
12102 {
12103         struct socket *so;
12104         int32_t len;
12105         uint32_t cts;
12106         uint32_t recwin, sendwin;
12107         int32_t sb_offset;
12108         int32_t flags, abandon, error = 0;
12109         struct tcp_log_buffer *lgb = NULL;
12110         struct mbuf *m;
12111         struct mbuf *mb;
12112         uint32_t if_hw_tsomaxsegcount = 0;
12113         uint32_t if_hw_tsomaxsegsize = 0;
12114         uint32_t if_hw_tsomax = 0;
12115         struct ip *ip = NULL;
12116 #ifdef TCPDEBUG
12117         struct ipovly *ipov = NULL;
12118 #endif
12119         struct tcp_bbr *bbr;
12120         struct tcphdr *th;
12121 #ifdef NETFLIX_TCPOUDP
12122         struct udphdr *udp = NULL;
12123 #endif
12124         u_char opt[TCP_MAXOLEN];
12125         unsigned ipoptlen, optlen, hdrlen;
12126 #ifdef NETFLIX_TCPOUDP
12127         unsigned ulen;
12128 #endif
12129         uint32_t bbr_seq;
12130         uint32_t delay_calc=0;
12131         uint8_t doing_tlp = 0;
12132         uint8_t local_options;
12133 #ifdef BBR_INVARIANTS
12134         uint8_t doing_retran_from = 0;
12135         uint8_t picked_up_retran = 0;
12136 #endif
12137         uint8_t wanted_cookie = 0;
12138         uint8_t more_to_rxt=0;
12139         int32_t prefetch_so_done = 0;
12140         int32_t prefetch_rsm = 0;
12141         uint32_t what_we_can = 0;
12142         uint32_t tot_len = 0;
12143         uint32_t rtr_cnt = 0;
12144         uint32_t maxseg, pace_max_segs, p_maxseg;
12145         int32_t csum_flags;
12146         int32_t hw_tls;
12147 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
12148         unsigned ipsec_optlen = 0;
12149
12150 #endif
12151         volatile int32_t sack_rxmit;
12152         struct bbr_sendmap *rsm = NULL;
12153         int32_t tso, mtu;
12154         int force_tso = 0;
12155         struct tcpopt to;
12156         int32_t slot = 0;
12157         struct inpcb *inp;
12158         struct sockbuf *sb;
12159         uint32_t hpts_calling;
12160 #ifdef INET6
12161         struct ip6_hdr *ip6 = NULL;
12162         int32_t isipv6;
12163 #endif
12164         uint8_t app_limited = BBR_JR_SENT_DATA;
12165         uint8_t filled_all = 0;
12166         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
12167         /* We take a cache hit here */
12168         memcpy(&bbr->rc_tv, tv, sizeof(struct timeval));
12169         cts = tcp_tv_to_usectick(&bbr->rc_tv);
12170         inp = bbr->rc_inp;
12171         so = inp->inp_socket;
12172         sb = &so->so_snd;
12173 #ifdef KERN_TLS
12174         if (sb->sb_flags & SB_TLS_IFNET)
12175                 hw_tls = 1;
12176         else
12177 #endif
12178                 hw_tls = 0;
12179         kern_prefetch(sb, &maxseg);
12180         maxseg = tp->t_maxseg - bbr->rc_last_options;
12181         if (bbr_minseg(bbr) < maxseg) {
12182                 tcp_bbr_tso_size_check(bbr, cts);
12183         }
12184         /* Establish our pacing segment limits */
12185         pace_max_segs = bbr->r_ctl.rc_pace_max_segs;
12186         p_maxseg = min(maxseg, pace_max_segs);
12187         INP_WLOCK_ASSERT(inp);
12188 #ifdef TCP_OFFLOAD
12189         if (tp->t_flags & TF_TOE)
12190                 return (tcp_offload_output(tp));
12191 #endif
12192
12193 #ifdef INET6
12194         if (bbr->r_state) {
12195                 /* Use the cache line loaded if possible */
12196                 isipv6 = bbr->r_is_v6;
12197         } else {
12198                 isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
12199         }
12200 #endif
12201         if (((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
12202             inp->inp_in_hpts) {
12203                 /*
12204                  * We are on the hpts for some timer but not hptsi output.
12205                  * Possibly remove from the hpts so we can send/recv etc.
12206                  */
12207                 if ((tp->t_flags & TF_ACKNOW) == 0) {
12208                         /*
12209                          * No immediate demand right now to send an ack, but
12210                          * the user may have read, making room for new data
12211                          * (a window update). If so we may want to cancel
12212                          * whatever timer is running (KEEP/DEL-ACK?) and
12213                          * continue to send out a window update. Or we may
12214                          * have gotten more data into the socket buffer to
12215                          * send.
12216                          */
12217                         recwin = min(max(sbspace(&so->so_rcv), 0),
12218                             TCP_MAXWIN << tp->rcv_scale);
12219                         if ((bbr_window_update_needed(tp, so, recwin, maxseg) == 0) &&
12220                             ((sbavail(sb) + ((tcp_outflags[tp->t_state] & TH_FIN) ? 1 : 0)) <=
12221                             (tp->snd_max - tp->snd_una))) {
12222                                 /*
12223                                  * Nothing new to send and no window update
12224                                  * is needed to send. Lets just return and
12225                                  * let the timer-run off.
12226                                  */
12227                                 return (0);
12228                         }
12229                 }
12230                 tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
12231                 bbr_timer_cancel(bbr, __LINE__, cts);
12232         }
12233         if (bbr->r_ctl.rc_last_delay_val) {
12234                 /* Calculate a rough delay for early escape to sending  */
12235                 if (SEQ_GT(cts, bbr->rc_pacer_started))
12236                         delay_calc = cts - bbr->rc_pacer_started;
12237                 if (delay_calc >= bbr->r_ctl.rc_last_delay_val)
12238                         delay_calc -= bbr->r_ctl.rc_last_delay_val;
12239                 else
12240                         delay_calc = 0;
12241         }
12242         /* Mark that we have called bbr_output(). */
12243         if ((bbr->r_timer_override) ||
12244             (tp->t_flags & TF_FORCEDATA) ||
12245             (tp->t_state < TCPS_ESTABLISHED)) {
12246                 /* Timeouts or early states are exempt */
12247                 if (inp->inp_in_hpts)
12248                         tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
12249         } else if (inp->inp_in_hpts) {
12250                 if ((bbr->r_ctl.rc_last_delay_val) &&
12251                     (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
12252                     delay_calc) {
12253                         /*
12254                          * We were being paced for output and the delay has
12255                          * already exceeded when we were supposed to be
12256                          * called, lets go ahead and pull out of the hpts
12257                          * and call output.
12258                          */
12259                         counter_u64_add(bbr_out_size[TCP_MSS_ACCT_LATE], 1);
12260                         bbr->r_ctl.rc_last_delay_val = 0;
12261                         tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
12262                 } else if (tp->t_state == TCPS_CLOSED) {
12263                         bbr->r_ctl.rc_last_delay_val = 0;
12264                         tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT);
12265                 } else {
12266                         /*
12267                          * On the hpts, you shall not pass! Even if ACKNOW
12268                          * is on, we will send when the hpts fires, unless
12269                          * of course we are overdue.
12270                          */
12271                         counter_u64_add(bbr_out_size[TCP_MSS_ACCT_INPACE], 1);
12272                         return (0);
12273                 }
12274         }
12275         bbr->rc_cwnd_limited = 0;
12276         if (bbr->r_ctl.rc_last_delay_val) {
12277                 /* recalculate the real delay and deal with over/under  */
12278                 if (SEQ_GT(cts, bbr->rc_pacer_started))
12279                         delay_calc = cts - bbr->rc_pacer_started;
12280                 else
12281                         delay_calc = 0;
12282                 if (delay_calc >= bbr->r_ctl.rc_last_delay_val)
12283                         /* Setup the delay which will be added in */
12284                         delay_calc -= bbr->r_ctl.rc_last_delay_val;
12285                 else {
12286                         /*
12287                          * We are early; set up to adjust
12288                          * our slot time.
12289                          */
12290                         uint64_t merged_val;
12291
12292                         bbr->r_ctl.rc_agg_early += (bbr->r_ctl.rc_last_delay_val - delay_calc);
12293                         bbr->r_agg_early_set = 1;
12294                         if (bbr->r_ctl.rc_hptsi_agg_delay) {
12295                                 if (bbr->r_ctl.rc_hptsi_agg_delay >= bbr->r_ctl.rc_agg_early) {
12296                                         /* Nope our previous late cancels out the early */
12297                                         bbr->r_ctl.rc_hptsi_agg_delay -= bbr->r_ctl.rc_agg_early;
12298                                         bbr->r_agg_early_set = 0;
12299                                         bbr->r_ctl.rc_agg_early = 0;
12300                                 } else {
12301                                         bbr->r_ctl.rc_agg_early -= bbr->r_ctl.rc_hptsi_agg_delay;
12302                                         bbr->r_ctl.rc_hptsi_agg_delay = 0;
12303                                 }
12304                         }
12305                         merged_val = bbr->rc_pacer_started;
12306                         merged_val <<= 32;
12307                         merged_val |= bbr->r_ctl.rc_last_delay_val;
12308                         bbr_log_pacing_delay_calc(bbr, inp->inp_hpts_calls,
12309                                                  bbr->r_ctl.rc_agg_early, cts, delay_calc, merged_val,
12310                                                  bbr->r_agg_early_set, 3);
12311                         bbr->r_ctl.rc_last_delay_val = 0;
12312                         BBR_STAT_INC(bbr_early);
12313                         delay_calc = 0;
12314                 }
12315         } else {
12316                 /* We were not delayed due to hptsi */
12317                 if (bbr->r_agg_early_set)
12318                         bbr->r_ctl.rc_agg_early = 0;
12319                 bbr->r_agg_early_set = 0;
12320                 delay_calc = 0;
12321         }
12322         if (delay_calc) {
12323                 /*
12324                  * We had a hptsi delay which means we are falling behind on
12325                  * sending at the expected rate. Calculate an extra amount
12326                  * of data we can send, if any, to put us back on track.
12327                  */
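                      /* Saturate rather than wrap the 32-bit aggregate delay. */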
12328                 if ((bbr->r_ctl.rc_hptsi_agg_delay + delay_calc) < bbr->r_ctl.rc_hptsi_agg_delay)
12329                         bbr->r_ctl.rc_hptsi_agg_delay = 0xffffffff;
12330                 else
12331                         bbr->r_ctl.rc_hptsi_agg_delay += delay_calc;
12332         }
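              /* The effective send window is the lesser of the peer's window and our cwnd. */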
12333         sendwin = min(tp->snd_wnd, tp->snd_cwnd);
12334         if ((tp->snd_una == tp->snd_max) &&
12335             (bbr->rc_bbr_state != BBR_STATE_IDLE_EXIT) &&
12336             (sbavail(sb))) {
12337                 /*
12338                  * Ok, we have been idle with nothing outstanding;
12339                  * we possibly need to start fresh with either a new
12340                  * suite of states or a fast ramp-up.
12341                  */
12342                 bbr_restart_after_idle(bbr,
12343                                        cts, bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time));
12344         }
12345         /*
12346          * Now, was there a hptsi delay where we are behind? We only count
12347          * being behind if: a) we are not in recovery, b) there was a delay,
12348          * and c) we had room to send something.
12349          */
12351         hpts_calling = inp->inp_hpts_calls;
12352         inp->inp_hpts_calls = 0;
12353         if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
12354                 if (bbr_process_timers(tp, bbr, cts, hpts_calling)) {
12355                         counter_u64_add(bbr_out_size[TCP_MSS_ACCT_ATIMER], 1);
12356                         return (0);
12357                 }
12358         }
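              /*
               * Drop the hint that inbound mbufs may simply be queued to the
               * hpts; while we are sending we want input processed promptly.
               */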
12359         bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
12360         if (hpts_calling &&
12361             (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
12362                 bbr->r_ctl.rc_last_delay_val = 0;
12363         }
12364         bbr->r_timer_override = 0;
12365         bbr->r_wanted_output = 0;
12366         /*
12367          * For TFO connections in SYN_RECEIVED, only allow the initial
12368          * SYN|ACK and those sent by the retransmit timer.
12369          */
12370         if (IS_FASTOPEN(tp->t_flags) &&
12371             ((tp->t_state == TCPS_SYN_RECEIVED) ||
12372              (tp->t_state == TCPS_SYN_SENT)) &&
12373             SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
12374             (tp->t_rxtshift == 0)) {    /* not a retransmit */
12375                 return (0);
12376         }
12377         /*
12378          * Before sending anything check for a state update. For hpts
12379          * calling without input this is important. If it's input calling,
12380          * then this was already done.
12381          */
12382         if (bbr->rc_use_google == 0)
12383                 bbr_check_bbr_for_state(bbr, cts, __LINE__, 0);
12384 again:
12385         /*
12386          * If we've recently taken a timeout, snd_max will be greater than
12387          * snd_nxt. BBR in general does not pay much attention to snd_nxt;
12388          * for historic reasons the persist timer still uses it, so we
12389          * have to look at it. All retransmissions that are not persists
12390          * use the rsm that needs to be sent, so snd_nxt is ignored. At the
12391          * end of this routine we always pull snd_nxt up to snd_max.
12392          */
12393         doing_tlp = 0;
12394 #ifdef BBR_INVARIANTS
12395         doing_retran_from = picked_up_retran = 0;
12396 #endif
12397         error = 0;
12398         tso = 0;
12399         slot = 0;
12400         mtu = 0;
12401         sendwin = min(tp->snd_wnd, tp->snd_cwnd);
12402         sb_offset = tp->snd_max - tp->snd_una;
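              /* Seed the outgoing TCP flags from the per-state table. */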
12403         flags = tcp_outflags[tp->t_state];
12404         sack_rxmit = 0;
12405         len = 0;
12406         rsm = NULL;
12407         if (flags & TH_RST) {
12408                 SOCKBUF_LOCK(sb);
12409                 goto send;
12410         }
12411 recheck_resend:
12412         while (bbr->r_ctl.rc_free_cnt < bbr_min_req_free) {
12413                 /* We need to always have one in reserve */
12414                 rsm = bbr_alloc(bbr);
12415                 if (rsm == NULL) {
12416                         error = ENOMEM;
12417                         /* Lie to get on the hpts */
12418                         tot_len = tp->t_maxseg;
12419                         if (hpts_calling)
12420                                 /* Retry in a ms */
12421                                 slot = 1001;
12422                         goto just_return_nolock;
12423                 }
12424                 TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next);
12425                 bbr->r_ctl.rc_free_cnt++;
12426                 rsm = NULL;
12427         }
12428         /* What do we send, a resend? */
12429         if (bbr->r_ctl.rc_resend == NULL) {
12430                 /* Check for rack timeout */
12431                 bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts);
12432                 if (bbr->r_ctl.rc_resend) {
12433 #ifdef BBR_INVARIANTS
12434                         picked_up_retran = 1;
12435 #endif
12436                         bbr_cong_signal(tp, NULL, CC_NDUPACK, bbr->r_ctl.rc_resend);
12437                 }
12438         }
12439         if (bbr->r_ctl.rc_resend) {
12440                 rsm = bbr->r_ctl.rc_resend;
12441 #ifdef BBR_INVARIANTS
12442                 doing_retran_from = 1;
12443 #endif
12444                 /* Remove any TLP flags; it's a RACK or T-O retransmit */
12445                 rsm->r_flags &= ~BBR_TLP;
12446                 bbr->r_ctl.rc_resend = NULL;
12447                 if (SEQ_LT(rsm->r_start, tp->snd_una)) {
12448 #ifdef BBR_INVARIANTS
12449                         panic("Huh, tp:%p bbr:%p rsm:%p start:%u < snd_una:%u\n",
12450                             tp, bbr, rsm, rsm->r_start, tp->snd_una);
12451                         goto recheck_resend;
12452 #else
12453                         /* TSNH */
12454                         rsm = NULL;
12455                         goto recheck_resend;
12456 #endif
12457                 }
12458                 rtr_cnt++;
12459                 if (rsm->r_flags & BBR_HAS_SYN) {
12460                         /* Only retransmit a SYN by itself */
12461                         len = 0;
12462                         if ((flags & TH_SYN) == 0) {
12463                                 /* Huh something is wrong */
12464                                 rsm->r_start++;
12465                                 if (rsm->r_start == rsm->r_end) {
12466                                         /* Clean it up, somehow we missed the ack? */
12467                                         bbr_log_syn(tp, NULL);
12468                                 } else {
12469                                         /* TFO with data? */
12470                                         rsm->r_flags &= ~BBR_HAS_SYN;
12471                                         len = rsm->r_end - rsm->r_start;
12472                                 }
12473                         } else {
12474                                 /* Retransmitting SYN */
12475                                 rsm = NULL;
12476                                 SOCKBUF_LOCK(sb);
12477                                 goto send;
12478                         }
12479                 } else
12480                         len = rsm->r_end - rsm->r_start;
12481                 if ((bbr->rc_resends_use_tso == 0) &&
12482 #ifdef KERN_TLS
12483                     ((sb->sb_flags & SB_TLS_IFNET) == 0) &&
12484 #endif
12485                     (len > maxseg)) {
12486                         len = maxseg;
12487                         more_to_rxt = 1;
12488                 }
12489                 sb_offset = rsm->r_start - tp->snd_una;
12490                 if (len > 0) {
12491                         sack_rxmit = 1;
12492                         KMOD_TCPSTAT_INC(tcps_sack_rexmits);
12493                         KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes,
12494                             min(len, maxseg));
12495                 } else {
12496                         /* I don't think this can happen */
12497                         rsm = NULL;
12498                         goto recheck_resend;
12499                 }
12500                 BBR_STAT_INC(bbr_resends_set);
12501         } else if (bbr->r_ctl.rc_tlp_send) {
12502                 /*
12503                  * Tail loss probe
12504                  */
12505                 doing_tlp = 1;
12506                 rsm = bbr->r_ctl.rc_tlp_send;
12507                 bbr->r_ctl.rc_tlp_send = NULL;
12508                 sack_rxmit = 1;
12509                 len = rsm->r_end - rsm->r_start;
12510                 rtr_cnt++;
12511                 if ((bbr->rc_resends_use_tso == 0) && (len > maxseg))
12512                         len = maxseg;
12513
12514                 if (SEQ_GT(tp->snd_una, rsm->r_start)) {
12515 #ifdef BBR_INVARIANTS
12516                         panic("tp:%p bbc:%p snd_una:%u rsm:%p r_start:%u",
12517                             tp, bbr, tp->snd_una, rsm, rsm->r_start);
12518 #else
12519                         /* TSNH */
12520                         rsm = NULL;
12521                         goto recheck_resend;
12522 #endif
12523                 }
12524                 sb_offset = rsm->r_start - tp->snd_una;
12525                 BBR_STAT_INC(bbr_tlp_set);
12526         }
12527         /*
12528          * Enforce a connection sendmap count limit if set
12529          * as long as we are not retransmitting.
12530          */
12531         if ((rsm == NULL) &&
12532             (V_tcp_map_entries_limit > 0) &&
12533             (bbr->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
12534                 BBR_STAT_INC(bbr_alloc_limited);
12535                 if (!bbr->alloc_limit_reported) {
12536                         bbr->alloc_limit_reported = 1;
12537                         BBR_STAT_INC(bbr_alloc_limited_conns);
12538                 }
12539                 goto just_return_nolock;
12540         }
12541 #ifdef BBR_INVARIANTS
12542         if (rsm && SEQ_LT(rsm->r_start, tp->snd_una)) {
12543                 panic("tp:%p bbr:%p rsm:%p sb_offset:%u len:%u",
12544                     tp, bbr, rsm, sb_offset, len);
12545         }
12546 #endif
12547         /*
12548          * Get standard flags, and add SYN or FIN if requested by 'hidden'
12549          * state flags.
12550          */
12551         if (tp->t_flags & TF_NEEDFIN && (rsm == NULL))
12552                 flags |= TH_FIN;
12553         if (tp->t_flags & TF_NEEDSYN)
12554                 flags |= TH_SYN;
12555
12556         if (rsm && (rsm->r_flags & BBR_HAS_FIN)) {
12557                 /* we are retransmitting the fin */
12558                 len--;
12559                 if (len) {
12560                         /*
12561                          * When retransmitting data do *not* include the
12562                          * FIN. This could happen from a TLP probe if we
12563                          * allowed data with a FIN.
12564                          */
12565                         flags &= ~TH_FIN;
12566                 }
12567         } else if (rsm) {
12568                 if (flags & TH_FIN)
12569                         flags &= ~TH_FIN;
12570         }
12571         if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
12572                 void *end_rsm;
12573
12574                 end_rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_tmap, bbr_sendmap, r_tnext);
12575                 if (end_rsm)
12576                         kern_prefetch(end_rsm, &prefetch_rsm);
12577                 prefetch_rsm = 1;
12578         }
12579         SOCKBUF_LOCK(sb);
12580         /*
12581          * If in persist timeout with window of 0, send 1 byte. Otherwise,
12582          * if the window is small but nonzero and the timer has expired, we
12583          * will send what we can and go to transmit state.
12584          */
12585         if (tp->t_flags & TF_FORCEDATA) {
12586                 if ((sendwin == 0) || (sendwin <= (tp->snd_max - tp->snd_una))) {
12587                         /*
12588                          * If we still have some data to send, then clear
12589                          * the FIN bit.  Usually this would happen below
12590                          * when it realizes that we aren't sending all the
12591                          * data.  However, if we have exactly 1 byte of
12592                          * unsent data, then it won't clear the FIN bit
12593                          * below, and if we are in persist state, we wind up
12594                          * sending the packet without recording that we sent
12595                          * the FIN bit.
12596                          *
12597                          * We can't just blindly clear the FIN bit, because
12598                          * if we don't have any more data to send then the
12599                          * probe will be the FIN itself.
12600                          */
12601                         if (sb_offset < sbused(sb))
12602                                 flags &= ~TH_FIN;
12603                         sendwin = 1;
12604                 } else {
12605                         if ((bbr->rc_in_persist != 0) &&
12606                             (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2),
12607                                                bbr_minseg(bbr)))) {
12608                                 /* Exit persists if there is space */
12609                                 bbr_exit_persist(tp, bbr, cts, __LINE__);
12610                         }
12611                         if (rsm == NULL) {
12612                                 /*
12613                                  * If we are dropping persist mode then we
12614                                  * need to correct sb_offset if not a
12615                                  * retransmit.
12616                                  */
12617                                 sb_offset = tp->snd_max - tp->snd_una;
12618                         }
12619                 }
12620         }
12621         /*
12622          * If snd_nxt == snd_max and we have transmitted a FIN, the
12623          * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
12624          * negative length.  This can also occur when TCP opens up its
12625          * congestion window while receiving additional duplicate acks after
12626          * fast-retransmit because TCP will reset snd_nxt to snd_max after
12627          * the fast-retransmit.
12628          *
12629          * In the normal retransmit-FIN-only case, however, snd_nxt will be
12630          * set to snd_una, the sb_offset will be 0, and the length may wind
12631          * up 0.
12632          *
12633          * If sack_rxmit is true we are retransmitting from the scoreboard
12634          * in which case len is already set.
12635          */
12636         if (sack_rxmit == 0) {
12637                 uint32_t avail;
12638
12639                 avail = sbavail(sb);
12640                 if (SEQ_GT(tp->snd_max, tp->snd_una))
12641                         sb_offset = tp->snd_max - tp->snd_una;
12642                 else
12643                         sb_offset = 0;
12644                 if (bbr->rc_tlp_new_data) {
12645                         /* TLP is forcing out new data */
12646                         uint32_t tlplen;
12647
12648                         doing_tlp = 1;
12649                         tlplen = maxseg;
12650
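                              /*
                               * Clamp the probe to the data remaining in the
                               * sb and to the peer's advertised window.
                               */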
12651                         if (tlplen > (uint32_t)(avail - sb_offset)) {
12652                                 tlplen = (uint32_t)(avail - sb_offset);
12653                         }
12654                         if (tlplen > tp->snd_wnd) {
12655                                 len = tp->snd_wnd;
12656                         } else {
12657                                 len = tlplen;
12658                         }
12659                         bbr->rc_tlp_new_data = 0;
12660                 } else {
12661                         what_we_can = len = bbr_what_can_we_send(tp, bbr, sendwin, avail, sb_offset, cts);
12662                         if ((len < p_maxseg) &&
12663                             (bbr->rc_in_persist == 0) &&
12664                             (ctf_outstanding(tp) >= (2 * p_maxseg)) &&
12665                             ((avail - sb_offset) >= p_maxseg)) {
12666                                 /*
12667                                  * We are not completing what's in the socket
12668                                  * buffer (i.e. there is at least a segment
12669                                  * waiting to send) and we have 2 or more
12670                                  * segments outstanding. There is no sense
12671                                  * in sending a little piece. Let's defer
12672                                  * and wait until we can send a whole
12673                                  * segment.
12674                                  */
12675                                 len = 0;
12676                         }
12677                         if ((tp->t_flags & TF_FORCEDATA) && (bbr->rc_in_persist)) {
12678                                 /*
12679                                  * We are in persists, figure out if
12680                                  * a retransmit is available (maybe the previous
12681                                  * persists we sent) or if we have to send new
12682                                  * data.
12683                                  */
12684                                 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
12685                                 if (rsm) {
12686                                         len = rsm->r_end - rsm->r_start;
12687                                         if (rsm->r_flags & BBR_HAS_FIN)
12688                                                 len--;
12689                                         if ((bbr->rc_resends_use_tso == 0) && (len > maxseg))
12690                                                 len = maxseg;
12691                                         if (len > 1)
12692                                                 BBR_STAT_INC(bbr_persist_reneg);
12693                                         /*
12694                                          * XXXrrs we could force the len to
12695                                          * 1 byte here to cause the chunk to
12696                                          * split apart.. but that would then
12697                                          * mean we always retransmit it as
12698                                          * one byte even after the window
12699                                          * opens.
12700                                          */
12701                                         sack_rxmit = 1;
12702                                         sb_offset = rsm->r_start - tp->snd_una;
12703                                 } else {
12704                                         /*
12705                                          * First time through in persists or peer
12706                                          * acked our one byte. Though we do have
12707                                          * to have something in the sb.
12708                                          */
12709                                         len = 1;
12710                                         sb_offset = 0;
12711                                         if (avail == 0)
12712                                             len = 0;
12713                                 }
12714                         }
12715                 }
12716         }
12717         if (prefetch_so_done == 0) {
12718                 kern_prefetch(so, &prefetch_so_done);
12719                 prefetch_so_done = 1;
12720         }
12721         /*
12722          * Lop off SYN bit if it has already been sent.  However, if this is
12723          * SYN-SENT state and if segment contains data and if we don't know
12724          * that foreign host supports TAO, suppress sending segment.
12725          */
12726         if ((flags & TH_SYN) && (rsm == NULL) &&
12727             SEQ_GT(tp->snd_max, tp->snd_una)) {
12728                 if (tp->t_state != TCPS_SYN_RECEIVED)
12729                         flags &= ~TH_SYN;
12730                 /*
12731                  * When sending additional segments following a TFO SYN|ACK,
12732                  * do not include the SYN bit.
12733                  */
12734                 if (IS_FASTOPEN(tp->t_flags) &&
12735                     (tp->t_state == TCPS_SYN_RECEIVED))
12736                         flags &= ~TH_SYN;
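                      /* The SYN occupies sequence space but no sb data; adjust. */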
12737                 sb_offset--, len++;
12738                 if (sbavail(sb) == 0)
12739                         len = 0;
12740         } else if ((flags & TH_SYN) && rsm) {
12741                 /*
12742                  * Subtract one from the len for the SYN being
12743                  * retransmitted.
12744                  */
12745                 len--;
12746         }
12747         /*
12748          * Be careful not to send data and/or FIN on SYN segments. This
12749          * measure is needed to prevent interoperability problems with not
12750          * fully conformant TCP implementations.
12751          */
12752         if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
12753                 len = 0;
12754                 flags &= ~TH_FIN;
12755         }
12756         /*
12757          * On TFO sockets, ensure no data is sent in the following cases:
12758          *
12759          *  - When retransmitting SYN|ACK on a passively-created socket
12760          *  - When retransmitting SYN on an actively created socket
12761          *  - When sending a zero-length cookie (cookie request) on an
12762          *    actively created socket
12763          *  - When the socket is in the CLOSED state (RST is being sent)
12764          */
12765         if (IS_FASTOPEN(tp->t_flags) &&
12766             (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
12767              ((tp->t_state == TCPS_SYN_SENT) &&
12768               (tp->t_tfo_client_cookie_len == 0)) ||
12769              (flags & TH_RST))) {
12770                 len = 0;
12771                 sack_rxmit = 0;
12772                 rsm = NULL;
12773         }
12774         /* Without fast-open there should never be data sent on a SYN */
12775         if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags)))
12776                 len = 0;
12777         if (len <= 0) {
12778                 /*
12779                  * If FIN has been sent but not acked, but we haven't been
12780                  * called to retransmit, len will be < 0.  Otherwise, window
12781                  * shrank after we sent into it.  If window shrank to 0,
12782                  * cancel pending retransmit, pull snd_nxt back to (closed)
12783                  * window, and set the persist timer if it isn't already
12784                  * going.  If the window didn't close completely, just wait
12785                  * for an ACK.
12786                  *
12787                  * We also do a general check here to ensure that we will
12788                  * set the persist timer when we have data to send, but a
12789                  * 0-byte window. This makes sure the persist timer is set
12790                  * even if the packet hits one of the "goto send" lines
12791                  * below.
12792                  */
12793                 len = 0;
12794                 if ((tp->snd_wnd == 0) &&
12795                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
12796                     (tp->snd_una == tp->snd_max) &&
12797                     (sb_offset < (int)sbavail(sb))) {
12798                         /*
12799                          * Not enough room in the rwnd to send
12800                          * a paced segment out.
12801                          */
12802                         bbr_enter_persist(tp, bbr, cts, __LINE__);
12803                 }
12804         } else if ((rsm == NULL) &&
12805                    (doing_tlp == 0) &&
12806                    (len < bbr->r_ctl.rc_pace_max_segs)) {
12807                 /*
12808                  * We are not sending a full segment for
12809                  * some reason. Should we not send anything (think
12810                  * sws or persists)?
12811                  */
12812                 if ((tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) &&
12813                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
12814                     (len < (int)(sbavail(sb) - sb_offset))) {
12815                         /*
12816                          * Here the rwnd is less than
12817                          * the pacing size, this is not a retransmit,
12818                          * we are established, and
12819                          * the send is not the last in the socket buffer.
12820                          * Let's not send, and possibly enter persists.
12821                          */
12822                         len = 0;
12823                         if (tp->snd_max == tp->snd_una)
12824                                 bbr_enter_persist(tp, bbr, cts, __LINE__);
12825                 } else if ((tp->snd_cwnd >= bbr->r_ctl.rc_pace_max_segs) &&
12826                            (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
12827                                                  bbr->r_ctl.rc_lost_bytes)) > (2 * maxseg)) &&
12828                            (len < (int)(sbavail(sb) - sb_offset)) &&
12829                            (len < bbr_minseg(bbr))) {
12830                         /*
12831                          * Here we are not retransmitting, and
12832                          * the cwnd is not so small that we could
12833                          * not send at least a min size (rxt timer
12834                          * not having gone off). We have 2 segments or
12835                          * more already in flight, it's not the tail end
12836                          * of the socket buffer and the cwnd is blocking
12837                          * us from sending out a minimum pacing segment size.
12838                          * Let's not send anything.
12839                          */
12840                         bbr->rc_cwnd_limited = 1;
12841                         len = 0;
12842                 } else if (((tp->snd_wnd - ctf_outstanding(tp)) <
12843                             min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) &&
12844                            (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
12845                                                  bbr->r_ctl.rc_lost_bytes)) > (2 * maxseg)) &&
12846                            (len < (int)(sbavail(sb) - sb_offset)) &&
12847                            (TCPS_HAVEESTABLISHED(tp->t_state))) {
12848                         /*
12849                          * Here we have a send window but we have
12850                          * filled it up and we can't send another pacing segment.
12851                          * We also have in flight more than 2 segments
12852                          * and we are not completing the sb, i.e. we allow
12853                          * the last bytes of the sb to go out even if
12854                          * it's not a full pacing segment.
12855                          */
12856                         len = 0;
12857                 }
12858         }
12859         /* len will be >= 0 after this point. */
12860         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
12861         tcp_sndbuf_autoscale(tp, so, sendwin);
12865         if (bbr->rc_in_persist &&
12866             len &&
12867             (rsm == NULL) &&
12868             (len < min((bbr->r_ctl.rc_high_rwnd/2), bbr->r_ctl.rc_pace_max_segs))) {
12869                 /*
12870                  * We are in persist, not doing a retransmit, and don't have enough
12871                  * space yet to send a full TSO. So: is this the end of the sb?
12872                  * If so we need to send; else nuke len to 0 and don't send.
12873                  */
12874                 int sbleft;
12875                 if (sbavail(sb) > sb_offset)
12876                         sbleft = sbavail(sb) - sb_offset;
12877                 else
12878                         sbleft = 0;
12879                 if (sbleft >= min((bbr->r_ctl.rc_high_rwnd/2), bbr->r_ctl.rc_pace_max_segs)) {
12880                         /* Not at the end of the sb; let's not send */
12881                         len = 0;
12882                 }
12883         }
12884         /*
12885          * Decide if we can use TCP Segmentation Offloading (if supported by
12886          * hardware).
12887          *
12888          * TSO may only be used if we are in a pure bulk sending state.  The
12889          * presence of TCP-MD5, SACK retransmits, SACK advertisements and IP
12890          * options prevent using TSO.  With TSO the TCP header is the same
12891          * (except for the sequence number) for all generated packets.  This
12892          * makes it impossible to transmit any options which vary per
12893          * generated segment or packet.
12894          *
12895          * IPv4 handling has a clear separation of ip options and ip header
12896          * flags while IPv6 combines both in in6p_outputopts. ip6_optlen()
12897          * does the right thing below to provide length of just ip options
12898          * and thus checking for ipoptlen is enough to decide if ip options
12899          * are present.
12900          */
12901 #ifdef INET6
12902         if (isipv6)
12903                 ipoptlen = ip6_optlen(inp);
12904         else
12905 #endif
12906         if (inp->inp_options)
12907                 ipoptlen = inp->inp_options->m_len -
12908                     offsetof(struct ipoption, ipopt_list);
12909         else
12910                 ipoptlen = 0;
12911 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
12912         /*
12913          * Pre-calculate here as we save another lookup into the darknesses
12914          * of IPsec that way and can actually decide if TSO is ok.
12915          */
12916 #ifdef INET6
12917         if (isipv6 && IPSEC_ENABLED(ipv6))
12918                 ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp);
12919 #ifdef INET
12920         else
12921 #endif
12922 #endif                          /* INET6 */
12923 #ifdef INET
12924         if (IPSEC_ENABLED(ipv4))
12925                 ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp);
12926 #endif                          /* INET */
12927 #endif                          /* IPSEC */
12928 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
12929         ipoptlen += ipsec_optlen;
12930 #endif
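              /*
               * Distilled from the block comment above: TSO is usable only for
               * a multi-segment send with no UDP tunneling port, no MD5
               * signature, no SACK blocks to advertise and no IP options.
               */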
12931         if ((tp->t_flags & TF_TSO) && V_tcp_do_tso &&
12932             (len > maxseg) &&
12933             (tp->t_port == 0) &&
12934             ((tp->t_flags & TF_SIGNATURE) == 0) &&
12935             tp->rcv_numsacks == 0 &&
12936             ipoptlen == 0)
12937                 tso = 1;
12938
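              /* Compute the receive window to advertise, capped at the maximum scaled window. */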
12939         recwin = min(max(sbspace(&so->so_rcv), 0),
12940             TCP_MAXWIN << tp->rcv_scale);
12941         /*
12942          * Sender silly window avoidance.  We transmit under the following
12943          * conditions when len is non-zero:
12944          *
12945          * - We have a full segment (or more with TSO)
12946          * - This is the last buffer in a write()/send() and we are either idle or running NODELAY
12947          * - We've timed out (e.g. persist timer)
12948          * - We have more than 1/2 the maximum send window's worth of data (the receiver may be limiting the window size)
12949          * - We need to retransmit
12950          */
12951         if (rsm)
12952                 goto send;
12953         if (len) {
12954                 if (sack_rxmit)
12955                         goto send;
12956                 if (len >= p_maxseg)
12957                         goto send;
12958                 /*
12959                  * NOTE! on localhost connections an 'ack' from the remote
12960                  * end may occur synchronously with the output and cause us
12961                  * to flush a buffer queued with moretocome.  XXX
12962                  *
12963                  */
12964                 if (((tp->t_flags & TF_MORETOCOME) == 0) &&     /* normal case */
12965                     ((tp->t_flags & TF_NODELAY) ||
12966                     ((uint32_t)len + (uint32_t)sb_offset) >= sbavail(&so->so_snd)) &&
12967                     (tp->t_flags & TF_NOPUSH) == 0) {
12968                         goto send;
12969                 }
12970                 if ((tp->snd_una == tp->snd_max) && len) {      /* Nothing outstanding */
12971                         goto send;
12972                 }
12973                 if (tp->t_flags & TF_FORCEDATA) {       /* typ. timeout case */
12974                         goto send;
12975                 }
12976                 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
12977                         goto send;
12978                 }
12979         }
12980         /*
12981          * Sending of standalone window updates.
12982          *
12983          * Window updates are important when we close our window due to a
12984          * full socket buffer and are opening it again after the application
12985          * reads data from it.  Once the window has opened again and the
12986          * remote end starts to send again the ACK clock takes over and
12987          * provides the most current window information.
12988          *
12989          * We must avoid the silly window syndrome, where every read from
12990          * the receive buffer, no matter how small, causes a window update
12991          * to be sent.  We should also avoid sending a flurry of window
12992          * updates when the socket buffer has queued a lot of data and the
12993          * application is doing small reads.
12994          *
12995          * Prevent a flurry of pointless window updates by only sending an
12996          * update when we can increase the advertised window by more than
12997          * 1/4th of the socket buffer capacity.  When the buffer is getting
12998          * full or is very small, be more aggressive and send an update
12999          * whenever we can increase by two MSS sized segments. In all other
13000          * situations the ACKs to new incoming data will carry further
13001          * window increases.
13002          *
13003          * Don't send an independent window update if a delayed ACK is
13004          * pending (it will get piggy-backed on it) or the remote side
13005          * already has done a half-close and won't send more data.  Skip
13006          * this if the connection is in T/TCP half-open state.
13007          */
13008         if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
13009             !(tp->t_flags & TF_DELACK) &&
13010             !TCPS_HAVERCVDFIN(tp->t_state)) {
13011                 /* Check to see if we should do a window update */
13012                 if (bbr_window_update_needed(tp, so, recwin, maxseg))
13013                         goto send;
13014         }
13015         /*
13016          * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
13017          * is also a catch-all for the retransmit timer timeout case.
13018          */
13019         if (tp->t_flags & TF_ACKNOW) {
13020                 goto send;
13021         }
13022         if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
13023                 goto send;
13024         }
13025         if (SEQ_GT(tp->snd_up, tp->snd_una)) {
13026                 goto send;
13027         }
13028         /*
13029          * If our state indicates that FIN should be sent and we have not
13030          * yet done so, then we need to send.
13031          */
13032         if (flags & TH_FIN &&
13033             ((tp->t_flags & TF_SENTFIN) == 0)) {
13034                 goto send;
13035         }
13036         /*
13037          * No reason to send a segment, just return.
13038          */
13039 just_return:
13040         SOCKBUF_UNLOCK(sb);
13041 just_return_nolock:
13042         if (tot_len)
13043                 slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0);
13044         if (bbr->rc_no_pacing)
13045                 slot = 0;
13046         if (tot_len == 0) {
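                      /*
                       * Nothing was sent; classify why for stats and pacing:
                       * rwnd-limited, app-limited or cwnd-limited.
                       */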
13047                 if ((ctf_outstanding(tp) + min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) >=
13048                     tp->snd_wnd) {
13049                         BBR_STAT_INC(bbr_rwnd_limited);
13050                         app_limited = BBR_JR_RWND_LIMITED;
13051                         bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp));
13052                         if ((bbr->rc_in_persist == 0) &&
13053                             TCPS_HAVEESTABLISHED(tp->t_state) &&
13054                             (tp->snd_max == tp->snd_una) &&
13055                             sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
13056                                 /* No send window.. we must enter persist */
13057                                 bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
13058                         }
13059                 } else if (ctf_outstanding(tp) >= sbavail(sb)) {
13060                         BBR_STAT_INC(bbr_app_limited);
13061                         app_limited = BBR_JR_APP_LIMITED;
13062                         bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp));
13063                 } else if ((ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
13064                                                  bbr->r_ctl.rc_lost_bytes)) + p_maxseg) >= tp->snd_cwnd) {
13065                         BBR_STAT_INC(bbr_cwnd_limited);
13066                         app_limited = BBR_JR_CWND_LIMITED;
13067                         bbr_cwnd_limiting(tp, bbr, ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
13068                                                                         bbr->r_ctl.rc_lost_bytes)));
13069                         bbr->rc_cwnd_limited = 1;
13070                 } else {
13071                         BBR_STAT_INC(bbr_app_limited);
13072                         app_limited = BBR_JR_APP_LIMITED;
13073                         bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp));
13074                 }
13075                 bbr->r_ctl.rc_hptsi_agg_delay = 0;
13076                 bbr->r_agg_early_set = 0;
13077                 bbr->r_ctl.rc_agg_early = 0;
13078                 bbr->r_ctl.rc_last_delay_val = 0;
13079         } else if (bbr->rc_use_google == 0)
13080                 bbr_check_bbr_for_state(bbr, cts, __LINE__, 0);
13081         /* Are we app limited? */
13082         if ((app_limited == BBR_JR_APP_LIMITED) ||
13083             (app_limited == BBR_JR_RWND_LIMITED)) {
13084                 /**
13085                  * We are application limited.
13086                  */
13087                 bbr->r_ctl.r_app_limited_until = (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
13088                                                                        bbr->r_ctl.rc_lost_bytes)) + bbr->r_ctl.rc_delivered);
13089         }
13090         if (tot_len == 0)
13091                 counter_u64_add(bbr_out_size[TCP_MSS_ACCT_JUSTRET], 1);
13092         tp->t_flags &= ~TF_FORCEDATA;
13093         /* Dont update the time if we did not send */
13094         bbr->r_ctl.rc_last_delay_val = 0;
13095         bbr->rc_output_starts_timer = 1;
13096         bbr_start_hpts_timer(bbr, tp, cts, 9, slot, tot_len);
13097         bbr_log_type_just_return(bbr, cts, tot_len, hpts_calling, app_limited, p_maxseg, len);
13098         if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
13099                 /* Make sure snd_nxt is dragged up */
13100                 tp->snd_nxt = tp->snd_max;
13101         }
13102         return (error);
13103
13104 send:
13105         if (doing_tlp == 0) {
13106                 /*
13107                  * Data not a TLP, and it's not the rxt firing. If it is the
13108                  * rxt firing, we want to leave the tlp_in_progress flag on
13109                  * so we don't send another TLP. It has to be a rack timer
13110                  * or normal send (response to acked data) to clear the tlp
13111                  * in progress flag.
13112                  */
13113                 bbr->rc_tlp_in_progress = 0;
13114                 bbr->rc_tlp_rtx_out = 0;
13115         } else {
13116                 /*
13117                  * It's a TLP.
13118                  */
13119                 bbr->rc_tlp_in_progress = 1;
13120         }
13121         bbr_timer_cancel(bbr, __LINE__, cts);
13122         if (rsm == NULL) {
13123                 if (sbused(sb) > 0) {
13124                         /*
13125                          * This is sub-optimal. We only send a standalone
13126                          * FIN on its own segment.
13127                          */
13128                         if (flags & TH_FIN) {
13129                                 flags &= ~TH_FIN;
13130                                 if ((len == 0) && ((tp->t_flags & TF_ACKNOW) == 0)) {
13131                                         /* Lets not send this */
13132                                         slot = 0;
13133                                         goto just_return;
13134                                 }
13135                         }
13136                 }
13137         } else {
13138                 /*
13139                  * We do *not* send a FIN on a retransmit if it has data.
13140                  * The if clause here where len > 1 should never come true.
13141                  */
13142                 if ((len > 0) &&
13143                     (((rsm->r_flags & BBR_HAS_FIN) == 0) &&
13144                     (flags & TH_FIN))) {
13145                         flags &= ~TH_FIN;
13146                         len--;
13147                 }
13148         }
13149         SOCKBUF_LOCK_ASSERT(sb);
13150         if (len > 0) {
13151                 if ((tp->snd_una == tp->snd_max) &&
13152                     (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) {
13153                         /*
13154                          * This qualifies as an RTT_PROBE session since we
13155                          * dropped the outstanding data to nothing and waited
13156                          * more than bbr_rtt_probe_time.
13157                          */
13158                         bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0);
13159                         bbr_set_reduced_rtt(bbr, cts, __LINE__);
13160                 }
13161                 if (len >= maxseg)
13162                         tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
13163                 else
13164                         tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
13165         }
13166         /*
13167          * Before ESTABLISHED, force sending of initial options unless TCP
13168          * is set not to do any options. NOTE: we assume that the IP/TCP header
13169          * plus TCP options always fit in a single mbuf, leaving room for a
13170          * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
13171          * + optlen <= MCLBYTES
13172          */
13173         optlen = 0;
13174 #ifdef INET6
13175         if (isipv6)
13176                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
13177         else
13178 #endif
13179                 hdrlen = sizeof(struct tcpiphdr);
13180
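        /*
         * A worked example of the single-mbuf assumption above, using
         * typical (not guaranteed) values: an IPv6 header (40 bytes) plus a
         * TCP header (20 bytes) gives hdrlen = 60; even with the maximum 40
         * bytes of TCP options and a generous max_linkhdr, the total stays
         * well under the 2k MCLBYTES cluster size.
         */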
13181         /*
13182          * Compute options for segment. We only have to care about SYN and
13183          * established connection segments.  Options for SYN-ACK segments
13184          * are handled in TCP syncache.
13185          */
13186         to.to_flags = 0;
13187         local_options = 0;
13188         if ((tp->t_flags & TF_NOOPT) == 0) {
13189                 /* Maximum segment size. */
13190                 if (flags & TH_SYN) {
13191                         to.to_mss = tcp_mssopt(&inp->inp_inc);
13192 #ifdef NETFLIX_TCPOUDP
13193                         if (tp->t_port)
13194                                 to.to_mss -= V_tcp_udp_tunneling_overhead;
13195 #endif
13196                         to.to_flags |= TOF_MSS;
13197                         /*
13198                          * On SYN or SYN|ACK transmits on TFO connections,
13199                          * only include the TFO option if it is not a
13200                          * retransmit, as the presence of the TFO option may
13201                          * have caused the original SYN or SYN|ACK to have
13202                          * been dropped by a middlebox.
13203                          */
13204                         if (IS_FASTOPEN(tp->t_flags) &&
13205                             (tp->t_rxtshift == 0)) {
13206                                 if (tp->t_state == TCPS_SYN_RECEIVED) {
13207                                         to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
13208                                         to.to_tfo_cookie =
13209                                             (u_int8_t *)&tp->t_tfo_cookie.server;
13210                                         to.to_flags |= TOF_FASTOPEN;
13211                                         wanted_cookie = 1;
13212                                 } else if (tp->t_state == TCPS_SYN_SENT) {
13213                                         to.to_tfo_len =
13214                                             tp->t_tfo_client_cookie_len;
13215                                         to.to_tfo_cookie =
13216                                             tp->t_tfo_cookie.client;
13217                                         to.to_flags |= TOF_FASTOPEN;
13218                                         wanted_cookie = 1;
13219                                 }
13220                         }
13221                 }
13222                 /* Window scaling. */
13223                 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
13224                         to.to_wscale = tp->request_r_scale;
13225                         to.to_flags |= TOF_SCALE;
13226                 }
13227                 /* Timestamps. */
13228                 if ((tp->t_flags & TF_RCVD_TSTMP) ||
13229                     ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
13230                         to.to_tsval = tcp_tv_to_mssectick(&bbr->rc_tv) + tp->ts_offset;
13231                         to.to_tsecr = tp->ts_recent;
13232                         to.to_flags |= TOF_TS;
13233                         local_options += TCPOLEN_TIMESTAMP + 2;
13234                 }
13235                 /* Set receive buffer autosizing timestamp. */
13236                 if (tp->rfbuf_ts == 0 &&
13237                     (so->so_rcv.sb_flags & SB_AUTOSIZE))
13238                         tp->rfbuf_ts = tcp_tv_to_mssectick(&bbr->rc_tv);
13239                 /* Selective ACK's. */
13240                 if (flags & TH_SYN)
13241                         to.to_flags |= TOF_SACKPERM;
13242                 else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
13243                     tp->rcv_numsacks > 0) {
13244                         to.to_flags |= TOF_SACK;
13245                         to.to_nsacks = tp->rcv_numsacks;
13246                         to.to_sacks = (u_char *)tp->sackblks;
13247                 }
13248 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
13249                 /* TCP-MD5 (RFC2385). */
13250                 if (tp->t_flags & TF_SIGNATURE)
13251                         to.to_flags |= TOF_SIGNATURE;
13252 #endif                          /* TCP_SIGNATURE */
13253
13254                 /* Process the options. */
13255                 hdrlen += (optlen = tcp_addoptions(&to, opt));
13256                 /*
13257                  * If we wanted a TFO option to be added, but it was unable
13258                  * to fit, ensure no data is sent.
13259                  */
13260                 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
13261                     !(to.to_flags & TOF_FASTOPEN))
13262                         len = 0;
13263         }
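        /*
         * A worked example of the option sizing above (the exact layout is
         * up to tcp_addoptions()): a SYN carrying MSS (4 bytes), window
         * scale (3), timestamps (10) and SACK-permitted (2) needs 19 bytes,
         * which tcp_addoptions() pads to a 4-byte boundary, so optlen comes
         * back as 20 and hdrlen grows by the same amount.
         */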
13264 #ifdef NETFLIX_TCPOUDP
13265         if (tp->t_port) {
13266                 if (V_tcp_udp_tunneling_port == 0) {
13267                         /* The port was removed?? */
13268                         SOCKBUF_UNLOCK(&so->so_snd);
13269                         return (EHOSTUNREACH);
13270                 }
13271                 hdrlen += sizeof(struct udphdr);
13272         }
13273 #endif
13274 #ifdef INET6
13275         if (isipv6)
13276                 ipoptlen = ip6_optlen(tp->t_inpcb);
13277         else
13278 #endif
13279         if (tp->t_inpcb->inp_options)
13280                 ipoptlen = tp->t_inpcb->inp_options->m_len -
13281                     offsetof(struct ipoption, ipopt_list);
13282         else
13283                 ipoptlen = 0;
13284         ipoptlen = 0;
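        /*
         * Note: the assignment above unconditionally discards the IPv4
         * option length computed just before it, so the send path below
         * always proceeds as if no IP options were present (consistent with
         * the TSO KASSERT further down, which requires ipoptlen == 0).
         */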
13285 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
13286         ipoptlen += ipsec_optlen;
13287 #endif
13288         if (bbr->rc_last_options != local_options) {
13289                 /*
13290                  * Cache the options length; this generally does not change
13291                  * on a connection. We use this to calculate TSO.
13292                  */
13293                 bbr->rc_last_options = local_options;
13294         }
13295         maxseg = tp->t_maxseg - (ipoptlen + optlen);
13296         p_maxseg = min(maxseg, pace_max_segs);
13297         /*
13298          * Adjust data length if insertion of options will bump the packet
13299          * length beyond the t_maxseg length. Clear the FIN bit because we
13300          * cut off the tail of the segment.
13301          */
13302 #ifdef KERN_TLS
13303         /* Force TSO so that TLS offload can get the MSS. */
13304         if (sb->sb_flags & SB_TLS_IFNET) {
13305                 force_tso = 1;
13306         }
13307 #endif
13308
13309         if (len > maxseg) {
13310                 if (len != 0 && (flags & TH_FIN)) {
13311                         flags &= ~TH_FIN;
13312                 }
13313                 if (tso) {
13314                         uint32_t moff;
13315                         int32_t max_len;
13316
13317                         /* extract TSO information */
13318                         if_hw_tsomax = tp->t_tsomax;
13319                         if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
13320                         if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
13321                         KASSERT(ipoptlen == 0,
13322                             ("%s: TSO can't do IP options", __func__));
13323
13324                         /*
13325                          * Check if we should limit by maximum payload
13326                          * length:
13327                          */
13328                         if (if_hw_tsomax != 0) {
13329                                 /* compute maximum TSO length */
13330                                 max_len = (if_hw_tsomax - hdrlen -
13331                                     max_linkhdr);
13332                                 if (max_len <= 0) {
13333                                         len = 0;
13334                                 } else if (len > max_len) {
13335                                         len = max_len;
13336                                 }
13337                         }
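                        /*
                         * A worked example of the clamp above, with
                         * illustrative numbers: if_hw_tsomax = 65535,
                         * hdrlen = 60 and max_linkhdr = 16 give
                         * max_len = 65459, so a 100000-byte len is cut
                         * down to 65459 before the per-segment rounding
                         * below.
                         */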
13338                         /*
13339                          * Prevent the last segment from being fractional
13340                          * unless the send sockbuf can be emptied:
13341                          */
13342                         if (((sb_offset + len) < sbavail(sb)) &&
13343                             (hw_tls == 0)) {
13344                                 moff = len % (uint32_t)maxseg;
13345                                 if (moff != 0) {
13346                                         len -= moff;
13347                                 }
13348                         }
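                        /*
                         * A worked example of the rounding above, with
                         * illustrative numbers: maxseg = 1448 and
                         * len = 10000 give moff = 10000 % 1448 = 1312, so
                         * len drops to 8688, i.e. six full-sized segments,
                         * leaving the fractional tail for a later send.
                         */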
13349                         /*
13350                          * In case there are too many small fragments don't
13351                          * use TSO:
13352                          */
13353                         if (len <= maxseg) {
13354                                 len = maxseg;
13355                                 tso = 0;
13356                         }
13357                 } else {
13358                         /* Not doing TSO */
13359                         if (optlen + ipoptlen >= tp->t_maxseg) {
13360                                 /*
13361                                  * Since we don't have enough space to put
13362                                  * the IP header chain and the TCP header in
13363                                  * one packet as required by RFC 7112, don't
13364                                  * send it. Also ensure that at least one
13365                                  * byte of the payload can be put into the
13366                                  * TCP segment.
13367                                  */
13368                                 SOCKBUF_UNLOCK(&so->so_snd);
13369                                 error = EMSGSIZE;
13370                                 sack_rxmit = 0;
13371                                 goto out;
13372                         }
13373                         len = maxseg;
13374                 }
13375         } else {
13376                 /* Not doing TSO */
13377                 if_hw_tsomaxsegcount = 0;
13378                 tso = 0;
13379         }
13380         KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
13381             ("%s: len > IP_MAXPACKET", __func__));
13382 #ifdef DIAGNOSTIC
13383 #ifdef INET6
13384         if (max_linkhdr + hdrlen > MCLBYTES)
13385 #else
13386         if (max_linkhdr + hdrlen > MHLEN)
13387 #endif
13388                 panic("tcphdr too big");
13389 #endif
13390         /*
13391          * This KASSERT is here to catch edge cases at a well-defined place.
13392          * Before, those had triggered (random) panic conditions further
13393          * down.
13394          */
13395 #ifdef BBR_INVARIANTS
13396         if (sack_rxmit) {
13397                 if (SEQ_LT(rsm->r_start, tp->snd_una)) {
13398                         panic("RSM:%p TP:%p bbr:%p start:%u is < snd_una:%u",
13399                             rsm, tp, bbr, rsm->r_start, tp->snd_una);
13400                 }
13401         }
13402 #endif
13403         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
13404         if ((len == 0) &&
13405             (flags & TH_FIN) &&
13406             (sbused(sb))) {
13407                 /*
13408                  * We have outstanding data; don't send a FIN by itself!
13409                  */
13410                 slot = 0;
13411                 goto just_return;
13412         }
13413         /*
13414          * Grab a header mbuf, attaching a copy of data to be transmitted,
13415          * and initialize the header from the template for sends on this
13416          * connection.
13417          */
13418         if (len) {
13419                 uint32_t moff;
13420                 uint32_t orig_len;
13421
13422                 /*
13423                  * We place a limit on sending with hptsi.
13424                  */
13425                 if ((rsm == NULL) && len > pace_max_segs)
13426                         len = pace_max_segs;
13427                 if (len <= maxseg)
13428                         tso = 0;
13429 #ifdef INET6
13430                 if (MHLEN < hdrlen + max_linkhdr)
13431                         m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
13432                 else
13433 #endif
13434                         m = m_gethdr(M_NOWAIT, MT_DATA);
13435
13436                 if (m == NULL) {
13437                         BBR_STAT_INC(bbr_failed_mbuf_aloc);
13438                         bbr_log_enobuf_jmp(bbr, len, cts, __LINE__, len, 0, 0);
13439                         SOCKBUF_UNLOCK(sb);
13440                         error = ENOBUFS;
13441                         sack_rxmit = 0;
13442                         goto out;
13443                 }
13444                 m->m_data += max_linkhdr;
13445                 m->m_len = hdrlen;
13446                 /*
13447                  * Start the m_copy functions from the closest mbuf to the
13448                  * sb_offset in the socket buffer chain.
13449                  */
13450                 if ((sb_offset > sbavail(sb)) || ((len + sb_offset) > sbavail(sb))) {
13451 #ifdef BBR_INVARIANTS
13452                         if ((len + sb_offset) > (sbavail(sb) + ((flags & (TH_FIN | TH_SYN)) ? 1 : 0)))
13453                                 panic("tp:%p bbr:%p len:%u sb_offset:%u sbavail:%u rsm:%p %u:%u:%u",
13454                                     tp, bbr, len, sb_offset, sbavail(sb), rsm,
13455                                     doing_retran_from,
13456                                     picked_up_retran,
13457                                     doing_tlp);
13458
13459 #endif
13460                         /*
13461                          * In this messed-up situation we have three choices:
13462                          * a) pretend the send worked, and just start timers
13463                          * and what not (not good since that may lead us
13464                          * back here a lot); <or> b) send the lowest segment
13465                          * in the map; <or> c) drop the connection. Let's do
13466                          * <b>, which, if it continues to happen, will lead to
13467                          * <c> via timeouts.
13468                          */
13469                         BBR_STAT_INC(bbr_offset_recovery);
13470                         rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
13471                         sb_offset = 0;
13472                         if (rsm == NULL) {
13473                                 sack_rxmit = 0;
13474                                 len = sbavail(sb);
13475                         } else {
13476                                 sack_rxmit = 1;
13477                                 if (rsm->r_start != tp->snd_una) {
13478                                         /*
13479                                          * Things are really messed up, <c>
13480                                          * is the only thing to do.
13481                                          */
13482                                         BBR_STAT_INC(bbr_offset_drop);
13483                                         tcp_set_inp_to_drop(inp, EFAULT);
13484                                         return (0);
13485                                 }
13486                                 len = rsm->r_end - rsm->r_start;
13487                         }
13488                         if (len > sbavail(sb))
13489                                 len = sbavail(sb);
13490                         if (len > maxseg)
13491                                 len = maxseg;
13492                 }
13493                 mb = sbsndptr_noadv(sb, sb_offset, &moff);
13494                 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
13495                         m_copydata(mb, moff, (int)len,
13496                             mtod(m, caddr_t)+hdrlen);
13497                         if (rsm == NULL)
13498                                 sbsndptr_adv(sb, mb, len);
13499                         m->m_len += len;
13500                 } else {
13501                         struct sockbuf *msb;
13502
13503                         if (rsm)
13504                                 msb = NULL;
13505                         else
13506                                 msb = sb;
13507 #ifdef BBR_INVARIANTS
13508                         if ((len + moff) > (sbavail(sb) + ((flags & (TH_FIN | TH_SYN)) ? 1 : 0))) {
13509                                 if (rsm) {
13510                                         panic("tp:%p bbr:%p len:%u moff:%u sbavail:%u rsm:%p snd_una:%u rsm_start:%u flg:%x %u:%u:%u sr:%d ",
13511                                             tp, bbr, len, moff,
13512                                             sbavail(sb), rsm,
13513                                             tp->snd_una, rsm->r_flags, rsm->r_start,
13514                                             doing_retran_from,
13515                                             picked_up_retran,
13516                                             doing_tlp, sack_rxmit);
13517                                 } else {
13518                                         panic("tp:%p bbr:%p len:%u moff:%u sbavail:%u sb_offset:%u snd_una:%u",
13519                                             tp, bbr, len, moff, sbavail(sb), sb_offset, tp->snd_una);
13520                                 }
13521                         }
13522 #endif
13523                         orig_len = len;
13524                         m->m_next = tcp_m_copym(
13525 #ifdef NETFLIX_COPY_ARGS
13526                                 tp,
13527 #endif
13528                                 mb, moff, &len,
13529                                 if_hw_tsomaxsegcount,
13530                                 if_hw_tsomaxsegsize, msb,
13531                                 ((rsm == NULL) ? hw_tls : 0)
13532 #ifdef NETFLIX_COPY_ARGS
13533                                 , &filled_all
13534 #endif
13535                                 );
13536                         if (len <= maxseg && !force_tso) {
13537                                 /*
13538                                  * We must have run out of mbufs for the copy;
13539                                  * shorten it so we no longer need TSO. Let's
13540                                  * not set sendalot since we are low on
13541                                  * mbufs.
13542                                  */
13543                                 tso = 0;
13544                         }
13545                         if (m->m_next == NULL) {
13546                                 SOCKBUF_UNLOCK(sb);
13547                                 (void)m_free(m);
13548                                 error = ENOBUFS;
13549                                 sack_rxmit = 0;
13550                                 goto out;
13551                         }
13552                 }
13553 #ifdef BBR_INVARIANTS
13554                 if (tso && len < maxseg) {
13555                         panic("tp:%p tso on, but len:%d < maxseg:%d",
13556                             tp, len, maxseg);
13557                 }
13558                 if (tso && if_hw_tsomaxsegcount) {
13559                         int32_t seg_cnt = 0;
13560                         struct mbuf *foo;
13561
13562                         foo = m;
13563                         while (foo) {
13564                                 seg_cnt++;
13565                                 foo = foo->m_next;
13566                         }
13567                         if (seg_cnt > if_hw_tsomaxsegcount) {
13568                                 panic("seg_cnt:%d > max:%d", seg_cnt, if_hw_tsomaxsegcount);
13569                         }
13570                 }
13571 #endif
13572                 /*
13573                  * If we're sending everything we've got, set PUSH. (This
13574                  * will keep happy those implementations which only give
13575                  * data to the user when a buffer fills or a PUSH comes in.)
13576                  */
13577                 if (sb_offset + len == sbused(sb) &&
13578                     sbused(sb) &&
13579                     !(flags & TH_SYN)) {
13580                         flags |= TH_PUSH;
13581                 }
13582                 SOCKBUF_UNLOCK(sb);
13583         } else {
13584                 SOCKBUF_UNLOCK(sb);
13585                 if (tp->t_flags & TF_ACKNOW)
13586                         KMOD_TCPSTAT_INC(tcps_sndacks);
13587                 else if (flags & (TH_SYN | TH_FIN | TH_RST))
13588                         KMOD_TCPSTAT_INC(tcps_sndctrl);
13589                 else if (SEQ_GT(tp->snd_up, tp->snd_una))
13590                         KMOD_TCPSTAT_INC(tcps_sndurg);
13591                 else
13592                         KMOD_TCPSTAT_INC(tcps_sndwinup);
13593
13594                 m = m_gethdr(M_NOWAIT, MT_DATA);
13595                 if (m == NULL) {
13596                         BBR_STAT_INC(bbr_failed_mbuf_aloc);
13597                         bbr_log_enobuf_jmp(bbr, len, cts, __LINE__, len, 0, 0);
13598                         error = ENOBUFS;
13599                         /* Fudge the send time since we could not send */
13600                         sack_rxmit = 0;
13601                         goto out;
13602                 }
13603 #ifdef INET6
13604                 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
13605                     MHLEN >= hdrlen) {
13606                         M_ALIGN(m, hdrlen);
13607                 } else
13608 #endif
13609                         m->m_data += max_linkhdr;
13610                 m->m_len = hdrlen;
13611         }
13612         SOCKBUF_UNLOCK_ASSERT(sb);
13613         m->m_pkthdr.rcvif = (struct ifnet *)0;
13614 #ifdef MAC
13615         mac_inpcb_create_mbuf(inp, m);
13616 #endif
13617 #ifdef INET6
13618         if (isipv6) {
13619                 ip6 = mtod(m, struct ip6_hdr *);
13620 #ifdef NETFLIX_TCPOUDP
13621                 if (tp->t_port) {
13622                         udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
13623                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
13624                         udp->uh_dport = tp->t_port;
13625                         ulen = hdrlen + len - sizeof(struct ip6_hdr);
13626                         udp->uh_ulen = htons(ulen);
13627                         th = (struct tcphdr *)(udp + 1);
13628                 } else {
13629 #endif
13630                         th = (struct tcphdr *)(ip6 + 1);
13631
13632 #ifdef NETFLIX_TCPOUDP
13633                 }
13634 #endif
13635                 tcpip_fillheaders(inp,
13636 #ifdef NETFLIX_TCPOUDP
13637                                   tp->t_port,
13638 #endif
13639                                   ip6, th);
13640         } else
13641 #endif                          /* INET6 */
13642         {
13643                 ip = mtod(m, struct ip *);
13644 #ifdef TCPDEBUG
13645                 ipov = (struct ipovly *)ip;
13646 #endif
13647 #ifdef NETFLIX_TCPOUDP
13648                 if (tp->t_port) {
13649                         udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
13650                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
13651                         udp->uh_dport = tp->t_port;
13652                         ulen = hdrlen + len - sizeof(struct ip);
13653                         udp->uh_ulen = htons(ulen);
13654                         th = (struct tcphdr *)(udp + 1);
13655                 } else
13656 #endif
13657                         th = (struct tcphdr *)(ip + 1);
13658                 tcpip_fillheaders(inp,
13659 #ifdef NETFLIX_TCPOUDP
13660                                   tp->t_port,
13661 #endif
13662                                   ip, th);
13663         }
13664         /*
13665          * If we are doing retransmissions, then snd_nxt will not reflect
13666          * the first unsent octet.  For ACK only packets, we do not want the
13667          * sequence number of the retransmitted packet, we want the sequence
13668          * number of the next unsent octet.  So, if there is no data (and no
13669          * SYN or FIN), use snd_max instead of snd_nxt when filling in
13670          * ti_seq.  But if we are in persist state, snd_max might reflect
13671          * one byte beyond the right edge of the window, so use snd_nxt in
13672          * that case, since we know we aren't doing a retransmission.
13673          * (retransmit and persist are mutually exclusive...)
13674          */
13675         if (sack_rxmit == 0) {
13676                 if (len && ((flags & (TH_FIN | TH_SYN | TH_RST)) == 0)) {
13677                         /* New data (including new persists) */
13678                         th->th_seq = htonl(tp->snd_max);
13679                         bbr_seq = tp->snd_max;
13680                 } else if (flags & TH_SYN) {
13681                         /* SYNs always send from iss */
13682                         th->th_seq = htonl(tp->iss);
13683                         bbr_seq = tp->iss;
13684                 } else if (flags & TH_FIN) {
13685                         if (flags & TH_FIN && tp->t_flags & TF_SENTFIN) {
13686                                 /*
13687                                  * If we already sent the FIN, it's
13688                                  * snd_max - 1.
13689                                  */
13690                                 th->th_seq = (htonl(tp->snd_max - 1));
13691                                 bbr_seq = (tp->snd_max - 1);
13692                         } else {
13693                                 /* First-time FIN, use snd_max */
13694                                 th->th_seq = htonl(tp->snd_max);
13695                                 bbr_seq = tp->snd_max;
13696                         }
13697                 } else if (flags & TH_RST) {
13698                         /*
13699                          * For a Reset send the last cum ack in sequence
13700                          * (this, like any other choice, may still generate a
13701                          * challenge ack, if an ack-update packet is in
13702                          * flight).
13703                          */
13704                         th->th_seq = htonl(tp->snd_una);
13705                         bbr_seq = tp->snd_una;
13706                 } else {
13707                         /*
13708                          * len == 0 and not in persist: we use snd_max,
13709                          * sending an ack, unless we have sent the FIN, in
13710                          * which case it's snd_max - 1.
13711                          */
13712                         /*
13713                          * XXXRRS: Question: if we are in persist and we have
13714                          * nothing outstanding to send and we have not sent
13715                          * a FIN, we will send an ACK. In such a case it
13716                          * might be better to send (tp->snd_una - 1) which
13717                          * would force the peer to ack.
13718                          */
13719                         if (tp->t_flags & TF_SENTFIN) {
13720                                 th->th_seq = htonl(tp->snd_max - 1);
13721                                 bbr_seq = (tp->snd_max - 1);
13722                         } else {
13723                                 th->th_seq = htonl(tp->snd_max);
13724                                 bbr_seq = tp->snd_max;
13725                         }
13726                 }
13727         } else {
13728                 /* All retransmits use the rsm to guide the send */
13729                 th->th_seq = htonl(rsm->r_start);
13730                 bbr_seq = rsm->r_start;
13731         }
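        /*
         * The sequence selection above can be summarized by the following
         * sketch (a hypothetical helper, not part of this file; the real
         * code also records the chosen value in bbr_seq):
         */
#if 0
static tcp_seq
example_choose_seq(struct tcpcb *tp, struct bbr_sendmap *rsm,
    int32_t len, uint16_t flags)
{
        if (rsm != NULL)                        /* Retransmit: the map rules. */
                return (rsm->r_start);
        if (len && !(flags & (TH_FIN | TH_SYN | TH_RST)))
                return (tp->snd_max);           /* New data. */
        if (flags & TH_SYN)
                return (tp->iss);               /* SYNs send from iss. */
        if (flags & TH_RST)
                return (tp->snd_una);           /* RST: last cum-ack point. */
        if (tp->t_flags & TF_SENTFIN)
                return (tp->snd_max - 1);       /* FIN already took a byte. */
        return (tp->snd_max);
}
#endif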
13732         th->th_ack = htonl(tp->rcv_nxt);
13733         if (optlen) {
13734                 bcopy(opt, th + 1, optlen);
13735                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
13736         }
13737         th->th_flags = flags;
13738         /*
13739          * Calculate receive window.  Don't shrink window, but avoid silly
13740          * window syndrome.
13741          */
13742         if ((flags & TH_RST) || ((recwin < (so->so_rcv.sb_hiwat / 4) &&
13743                                   recwin < maxseg)))
13744                 recwin = 0;
13745         if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
13746             recwin < (tp->rcv_adv - tp->rcv_nxt))
13747                 recwin = (tp->rcv_adv - tp->rcv_nxt);
13748         if (recwin > TCP_MAXWIN << tp->rcv_scale)
13749                 recwin = TCP_MAXWIN << tp->rcv_scale;
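        /*
         * A worked example of the clamps above, with illustrative numbers:
         * with so_rcv.sb_hiwat = 65536 and maxseg = 1448, any recwin below
         * 1448 (necessarily also below 65536 / 4 = 16384) is zeroed to
         * avoid silly window syndrome, while a recwin smaller than the
         * already-advertised window (rcv_adv - rcv_nxt) is pulled back up
         * so the window never shrinks.
         */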
13750
13751         /*
13752          * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
13753          * <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK> case is
13754          * handled in syncache.
13755          */
13756         if (flags & TH_SYN)
13757                 th->th_win = htons((u_short)
13758                     (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
13759         else {
13760                 /* Avoid shrinking window with window scaling. */
13761                 recwin = roundup2(recwin, 1 << tp->rcv_scale);
13762                 th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
13763         }
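        /*
         * A worked example of the roundup above, with illustrative numbers:
         * with rcv_scale = 7 the advertised window moves in units of
         * 1 << 7 = 128 bytes, so recwin = 5000 is rounded up to 5120 and
         * th_win becomes 5120 >> 7 = 40. Rounding up rather than truncating
         * keeps the scaled window from shrinking below what was previously
         * advertised.
         */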
13764         /*
13765          * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
13766          * window.  This may cause the remote transmitter to stall.  This
13767          * flag tells soreceive() to disable delayed acknowledgements when
13768          * draining the buffer.  This can occur if the receiver is
13769          * attempting to read more data than can be buffered prior to
13770          * transmitting on the connection.
13771          */
13772         if (th->th_win == 0) {
13773                 tp->t_sndzerowin++;
13774                 tp->t_flags |= TF_RXWIN0SENT;
13775         } else
13776                 tp->t_flags &= ~TF_RXWIN0SENT;
13777         if (SEQ_GT(tp->snd_up, tp->snd_max)) {
13778                 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_max));
13779                 th->th_flags |= TH_URG;
13780         } else
13781                 /*
13782                  * If no urgent pointer to send, then we pull the urgent
13783                  * pointer to the left edge of the send window so that it
13784                  * doesn't drift into the send window on sequence number
13785                  * wraparound.
13786                  */
13787                 tp->snd_up = tp->snd_una;       /* drag it along */
13788
13789 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
13790         if (to.to_flags & TOF_SIGNATURE) {
13791                 /*
13792                  * Calculate MD5 signature and put it into the place
13793                  * determined before. NOTE: since TCP options buffer doesn't
13794                  * point into mbuf's data, calculate offset and use it.
13795                  */
13796                 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
13797                     (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
13798                         /*
13799                          * Do not send segment if the calculation of MD5
13800                          * digest has failed.
13801                          */
13802                         goto out;
13803                 }
13804         }
13805 #endif
13806
13807         /*
13808          * Put TCP length in extended header, and then checksum extended
13809          * header and data.
13810          */
13811         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
13812 #ifdef INET6
13813         if (isipv6) {
13814                 /*
13815                  * ip6_plen does not need to be filled in now; it will be
13816                  * filled in by ip6_output.
13817                  */
13818 #ifdef NETFLIX_TCPOUDP
13819                 if (tp->t_port) {
13820                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
13821                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
13822                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
13823                         th->th_sum = htons(0);
13824                         UDPSTAT_INC(udps_opackets);
13825                 } else {
13826 #endif
13827                         csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
13828                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
13829                         th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
13830                             optlen + len, IPPROTO_TCP, 0);
13831 #ifdef NETFLIX_TCPOUDP
13832                 }
13833 #endif
13834         }
13835 #endif
13836 #if defined(INET6) && defined(INET)
13837         else
13838 #endif
13839 #ifdef INET
13840         {
13841 #ifdef NETFLIX_TCPOUDP
13842                 if (tp->t_port) {
13843                         m->m_pkthdr.csum_flags = CSUM_UDP;
13844                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
13845                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
13846                             ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
13847                         th->th_sum = htons(0);
13848                         UDPSTAT_INC(udps_opackets);
13849                 } else {
13850 #endif
13851                         csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP;
13852                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
13853                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
13854                             ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
13855                             IPPROTO_TCP + len + optlen));
13856 #ifdef NETFLIX_TCPOUDP
13857                 }
13858 #endif
13859                 /* IP version must be set here for ipv4/ipv6 checking later */
13860                 KASSERT(ip->ip_v == IPVERSION,
13861                     ("%s: IP version incorrect: %d", __func__, ip->ip_v));
13862         }
13863 #endif
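        /*
         * A worked example of the checksum seeding above (IPv4/TCP case,
         * illustrative numbers): for len = 1448 and optlen = 12, th_sum is
         * seeded with the pseudo-header sum of the two addresses and
         * htons(20 + 12 + 1448 + IPPROTO_TCP); the NIC, or the stack's
         * delayed-checksum path, then completes the sum over the TCP header
         * and payload using csum_data as the offset of th_sum.
         */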
13864
13865         /*
13866          * Enable TSO and specify the size of the segments. The TCP pseudo
13867          * header checksum is always provided. XXX: Fixme: This is currently
13868          * not the case for IPv6.
13869          */
13870         if (tso || force_tso) {
13871                 KASSERT(force_tso || len > maxseg,
13872                     ("%s: len:%d <= tso_segsz:%d", __func__, len, maxseg));
13873                 m->m_pkthdr.csum_flags |= CSUM_TSO;
13874                 csum_flags |= CSUM_TSO;
13875                 m->m_pkthdr.tso_segsz = maxseg;
13876         }
13877         KASSERT(len + hdrlen == m_length(m, NULL),
13878             ("%s: mbuf chain different than expected: %d + %u != %u",
13879             __func__, len, hdrlen, m_length(m, NULL)));
13880
13881 #ifdef TCP_HHOOK
13882         /* Run HHOOK_TC_ESTABLISHED_OUT helper hooks. */
13883         hhook_run_tcp_est_out(tp, th, &to, len, tso);
13884 #endif
13885 #ifdef TCPDEBUG
13886         /*
13887          * Trace.
13888          */
13889         if (so->so_options & SO_DEBUG) {
13890                 u_short save = 0;
13891
13892 #ifdef INET6
13893                 if (!isipv6)
13894 #endif
13895                 {
13896                         save = ipov->ih_len;
13897                         ipov->ih_len = htons(m->m_pkthdr.len
13898                             /* - hdrlen + (th->th_off << 2) */ );
13899                 }
13900                 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
13901 #ifdef INET6
13902                 if (!isipv6)
13903 #endif
13904                         ipov->ih_len = save;
13905         }
13906 #endif                          /* TCPDEBUG */
13907
13908         /* Log to the black box */
13909         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
13910                 union tcp_log_stackspecific log;
13911
13912                 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
13913                 /* Record info on type of transmission */
13914                 log.u_bbr.flex1 = bbr->r_ctl.rc_hptsi_agg_delay;
13915                 log.u_bbr.flex2 = (bbr->r_recovery_bw << 3);
13916                 log.u_bbr.flex3 = maxseg;
13917                 log.u_bbr.flex4 = delay_calc;
13918                 /* Encode filled_all into the upper flex5 bit */
13919                 log.u_bbr.flex5 = bbr->rc_past_init_win;
13920                 log.u_bbr.flex5 <<= 1;
13921                 log.u_bbr.flex5 |= bbr->rc_no_pacing;
13922                 log.u_bbr.flex5 <<= 29;
13923                 if (filled_all)
13924                         log.u_bbr.flex5 |= 0x80000000;
13925                 log.u_bbr.flex5 |= tp->t_maxseg;
13926                 log.u_bbr.flex6 = bbr->r_ctl.rc_pace_max_segs;
13927                 log.u_bbr.flex7 = (bbr->rc_bbr_state << 8) | bbr_state_val(bbr);
13928                 /* let's poke in the low and the high here for debugging */
13929                 log.u_bbr.pkts_out = bbr->rc_tp->t_maxseg;
13930                 if (rsm || sack_rxmit) {
13931                         if (doing_tlp)
13932                                 log.u_bbr.flex8 = 2;
13933                         else
13934                                 log.u_bbr.flex8 = 1;
13935                 } else {
13936                         log.u_bbr.flex8 = 0;
13937                 }
13938                 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
13939                     len, &log, false, NULL, NULL, 0, tv);
13940         } else {
13941                 lgb = NULL;
13942         }
13943         /*
13944          * Fill in IP length and desired time to live and send to IP level.
13945          * There should be a better way to handle ttl and tos; we could keep
13946          * them in the template, but need a way to checksum without them.
13947          */
13948         /*
13949          * m->m_pkthdr.len should have been set before the checksum
13950          * calculation, because in6_cksum() needs it.
13951          */
13952 #ifdef INET6
13953         if (isipv6) {
13954                 /*
13955                  * We set the hop limit separately for every segment, since the
13956                  * user might want to change the value via setsockopt. Also,
13957                  * desired default hop limit might be changed via Neighbor
13958                  * Discovery.
13959                  */
13960                 ip6->ip6_hlim = in6_selecthlim(inp, NULL);
13961
13962                 /*
13963                  * Set the packet size here for the benefit of DTrace
13964                  * probes. ip6_output() will set it properly; it's supposed
13965                  * to include the option header lengths as well.
13966                  */
13967                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
13968
13969                 if (V_path_mtu_discovery && maxseg > V_tcp_minmss)
13970                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
13971                 else
13972                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
13973
13974                 if (tp->t_state == TCPS_SYN_SENT)
13975                         TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
13976
13977                 TCP_PROBE5(send, NULL, tp, ip6, tp, th);
13978                 /* TODO: IPv6 IP6TOS_ECT bit on */
13979                 error = ip6_output(m, inp->in6p_outputopts,
13980                     &inp->inp_route6,
13981                     ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0),
13982                     NULL, NULL, inp);
13983
13984                 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL)
13985                         mtu = inp->inp_route6.ro_nh->nh_mtu;
13986         }
13987 #endif                          /* INET6 */
13988 #if defined(INET) && defined(INET6)
13989         else
13990 #endif
13991 #ifdef INET
13992         {
13993                 ip->ip_len = htons(m->m_pkthdr.len);
13994 #ifdef INET6
13995                 if (isipv6)
13996                         ip->ip_ttl = in6_selecthlim(inp, NULL);
13997 #endif                          /* INET6 */
13998                 /*
13999                  * If we do path MTU discovery, then we set DF on every
14000                  * packet. This might not be the best thing to do according
14001                  * to RFC3390 Section 2. However, the tcp hostcache mitigates
14002                  * the problem so it affects only the first tcp connection
14003                  * with a host.
14004                  *
14005                  * NB: Don't set DF on small MTU/MSS to have a safe
14006                  * fallback.
14007                  */
14008                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
14009                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
14010                         if (tp->t_port == 0 || len < V_tcp_minmss) {
14011                                 ip->ip_off |= htons(IP_DF);
14012                         }
14013                 } else {
14014                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
14015                 }
14016
14017                 if (tp->t_state == TCPS_SYN_SENT)
14018                         TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
14019
14020                 TCP_PROBE5(send, NULL, tp, ip, tp, th);
14021
14022                 error = ip_output(m, inp->inp_options, &inp->inp_route,
14023                     ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0,
14024                     inp);
14025                 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL)
14026                         mtu = inp->inp_route.ro_nh->nh_mtu;
14027         }
14028 #endif                          /* INET */
14029 out:
14030
14031         if (lgb) {
14032                 lgb->tlb_errno = error;
14033                 lgb = NULL;
14034         }
14035         /*
14036          * In transmit state, time the transmission and arrange for the
14037          * retransmit.  In persist state, just set snd_max.
14038          */
14039         if (error == 0) {
14040                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
14041                     (tp->t_flags & TF_SACK_PERMIT) &&
14042                     tp->rcv_numsacks > 0)
14043                         tcp_clean_dsack_blocks(tp);
14044                 /* We sent an ack; clear the bbr_segs_rcvd count */
14045                 bbr->output_error_seen = 0;
14046                 bbr->oerror_cnt = 0;
14047                 bbr->bbr_segs_rcvd = 0;
14048                 if (len == 0)
14049                         counter_u64_add(bbr_out_size[TCP_MSS_ACCT_SNDACK], 1);
14050                 else if (hw_tls) {
14051                         if (filled_all ||
14052                             (len >= bbr->r_ctl.rc_pace_max_segs))
14053                                 BBR_STAT_INC(bbr_meets_tso_thresh);
14054                         else {
14055                                 if (doing_tlp) {
14056                                         BBR_STAT_INC(bbr_miss_tlp);
14057                                         bbr_log_type_hrdwtso(tp, bbr, len, 1, what_we_can);
14060                                 } else if (rsm) {
14061                                         BBR_STAT_INC(bbr_miss_retran);
14062                                         bbr_log_type_hrdwtso(tp, bbr, len, 2, what_we_can);
14063                                 } else if ((ctf_outstanding(tp) + bbr->r_ctl.rc_pace_max_segs) > sbavail(sb)) {
14064                                         BBR_STAT_INC(bbr_miss_tso_app);
14065                                         bbr_log_type_hrdwtso(tp, bbr, len, 3, what_we_can);
14066                                 } else if ((ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
14067                                                                  bbr->r_ctl.rc_lost_bytes)) + bbr->r_ctl.rc_pace_max_segs) > tp->snd_cwnd) {
14068                                         BBR_STAT_INC(bbr_miss_tso_cwnd);
14069                                         bbr_log_type_hrdwtso(tp, bbr, len, 4, what_we_can);
14070                                 } else if ((ctf_outstanding(tp) + bbr->r_ctl.rc_pace_max_segs) > tp->snd_wnd) {
14071                                         BBR_STAT_INC(bbr_miss_tso_rwnd);
14072                                         bbr_log_type_hrdwtso(tp, bbr, len, 5, what_we_can);
14073                                 } else {
14074                                         BBR_STAT_INC(bbr_miss_unknown);
14075                                         bbr_log_type_hrdwtso(tp, bbr, len, 6, what_we_can);
14076                                 }
14077                         }
14078                 }
14079                 /* Do accounting for new sends */
14080                 if ((len > 0) && (rsm == NULL)) {
14081                         int idx;
14082                         if (tp->snd_una == tp->snd_max) {
14083                                 /*
14084                                  * Special case to match Google: when
14085                                  * nothing is in flight, the delivered
14086                                  * time gets updated to the current
14087                                  * time (see tcp_rate_bsd.c).
14088                                  */
14089                                 bbr->r_ctl.rc_del_time = cts;
14090                         }
14091                         if (len >= maxseg) {
14092                                 idx = (len / maxseg) + 3;
14093                                 if (idx >= TCP_MSS_ACCT_ATIMER)
14094                                         counter_u64_add(bbr_out_size[(TCP_MSS_ACCT_ATIMER - 1)], 1);
14095                                 else
14096                                         counter_u64_add(bbr_out_size[idx], 1);
14097                         } else {
14098                                 /* smaller than a MSS */
14099                                 idx = len / (bbr_hptsi_bytes_min - bbr->rc_last_options);
14100                                 if (idx >= TCP_MSS_SMALL_MAX_SIZE_DIV)
14101                                         idx = (TCP_MSS_SMALL_MAX_SIZE_DIV - 1);
14102                                 counter_u64_add(bbr_out_size[(idx + TCP_MSS_SMALL_SIZE_OFF)], 1);
14103                         }
14104                 }
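                /*
                 * A worked example of the bucketing above, with illustrative
                 * numbers: a 4344-byte send with maxseg = 1448 is three full
                 * segments, so idx = (4344 / 1448) + 3 = 6 and
                 * bbr_out_size[6] is bumped; any idx at or past
                 * TCP_MSS_ACCT_ATIMER collapses into the last bucket.
                 */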
14105         }
14106         abandon = 0;
14107         /*
14108          * We must do the send accounting before we log the output,
14109          * otherwise the state of the rsm could change and we account to the
14110          * wrong bucket.
14111          */
14112         if (len > 0) {
14113                 bbr_do_send_accounting(tp, bbr, rsm, len, error);
14114                 if (error == 0) {
14115                         if (tp->snd_una == tp->snd_max)
14116                                 bbr->r_ctl.rc_tlp_rxt_last_time = cts;
14117                 }
14118         }
14119         bbr_log_output(bbr, tp, &to, len, bbr_seq, (uint8_t) flags, error,
14120             cts, mb, &abandon, rsm, 0, sb);
14121         if (abandon) {
14122                 /*
14123                  * If bbr_log_output destroys the TCB or sees a TH_RST being
14124                  * sent we should hit this condition.
14125                  */
14126                 return (0);
14127         }
14128         if (((tp->t_flags & TF_FORCEDATA) == 0) ||
14129             (bbr->rc_in_persist == 0)) {
14130                 /*
14131                  * Advance snd_nxt over sequence space of this segment.
14132                  */
14133                 if (error)
14134                         /* We don't log or do anything with errors */
14135                         goto skip_upd;
14136
14137                 if (tp->snd_una == tp->snd_max &&
14138                     (len || (flags & (TH_SYN | TH_FIN)))) {
14139                         /*
14140                          * Update the time we just added data since none was
14141                          * outstanding.
14142                          */
14143                         bbr_log_progress_event(bbr, tp, ticks, PROGRESS_START, __LINE__);
14144                         bbr->rc_tp->t_acktime = ticks;
14145                 }
14146                 if (flags & (TH_SYN | TH_FIN) && (rsm == NULL)) {
14147                         if (flags & TH_SYN) {
14148                                 tp->snd_max++;
14149                         }
14150                         if ((flags & TH_FIN) && ((tp->t_flags & TF_SENTFIN) == 0)) {
14151                                 tp->snd_max++;
14152                                 tp->t_flags |= TF_SENTFIN;
14153                         }
14154                 }
14155                 if (sack_rxmit == 0)
14156                         tp->snd_max += len;
14157 skip_upd:
14158                 if ((error == 0) && len)
14159                         tot_len += len;
14160         } else {
14161                 /* Persists case */
14162                 int32_t xlen = len;
14163
14164                 if (error)
14165                         goto nomore;
14166
14167                 if (flags & TH_SYN)
14168                         ++xlen;
14169                 if ((flags & TH_FIN) && ((tp->t_flags & TF_SENTFIN) == 0)) {
14170                         ++xlen;
14171                         tp->t_flags |= TF_SENTFIN;
14172                 }
14173                 if (xlen && (tp->snd_una == tp->snd_max)) {
14174                         /*
14175                          * Update the time we just added data since none was
14176                          * outstanding.
14177                          */
14178                         bbr_log_progress_event(bbr, tp, ticks, PROGRESS_START, __LINE__);
14179                         bbr->rc_tp->t_acktime = ticks;
14180                 }
14181                 if (sack_rxmit == 0)
14182                         tp->snd_max += xlen;
14183                 tot_len += (len + optlen + ipoptlen);
14184         }
14185 nomore:
14186         if (error) {
14187                 /*
14188                  * Failures do not advance the seq counter above. For the
14189                  * case of ENOBUFS we will fall out and become ack-clocked,
14190                  * capping the cwnd at the current flight.
14191                  * Everything else will just have to retransmit with the timer
14192                  * (no pacer).
14193                  */
14194                 SOCKBUF_UNLOCK_ASSERT(sb);
14195                 BBR_STAT_INC(bbr_saw_oerr);
14196                 /* Clear all delay/early tracks */
14197                 bbr->r_ctl.rc_hptsi_agg_delay = 0;
14198                 bbr->r_ctl.rc_agg_early = 0;
14199                 bbr->r_agg_early_set = 0;
14200                 bbr->output_error_seen = 1;
14201                 if (bbr->oerror_cnt < 0xf)
14202                         bbr->oerror_cnt++;
14203                 if (bbr_max_net_error_cnt && (bbr->oerror_cnt >= bbr_max_net_error_cnt)) {
14204                         /* drop the session */
14205                         tcp_set_inp_to_drop(inp, ENETDOWN);
14206                 }
14207                 switch (error) {
14208                 case ENOBUFS:
14209                         /*
14210                          * Make this guy have to get acks to send
14211                          * more, but let's make sure we don't
14212                          * slam him below a T-O (1 MSS).
14213                          */
14214                         if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) {
14215                                 tp->snd_cwnd = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
14216                                                                     bbr->r_ctl.rc_lost_bytes)) - maxseg;
14217                                 if (tp->snd_cwnd < maxseg)
14218                                         tp->snd_cwnd = maxseg;
14219                         }
14220                         slot = (bbr_error_base_paceout + 1) << bbr->oerror_cnt;
14221                         BBR_STAT_INC(bbr_saw_enobuf);
14222                         if (bbr->bbr_hdrw_pacing)
14223                                 counter_u64_add(bbr_hdwr_pacing_enobuf, 1);
14224                         else
14225                                 counter_u64_add(bbr_nohdwr_pacing_enobuf, 1);
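                        /*
                         * A worked example of the backoff above, with
                         * illustrative numbers: 14480 bytes in flight and
                         * maxseg = 1448 clamp snd_cwnd to
                         * 14480 - 1448 = 13032, and every additional error
                         * widens the oerror_cnt shift, doubling the pacing
                         * slot until sends start succeeding again.
                         */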
14226                         /*
14227                          * Even in the ENOBUFS case we want to do our
14228                          * state update. The reason being we may have been
14229                          * called by the input function, and if so, things
14230                          * have changed.
14231                          */
14232                         error = 0;
14233                         goto enobufs;
14234                 case EMSGSIZE:
14235                         /*
14236                          * For some reason the interface we used initially
14237                          * to send segments changed to another or lowered
14238                          * its MTU. If TSO was active we either got an
14239                          * interface without TSO capabilities or TSO was
14240                          * turned off. If we obtained mtu from ip_output()
14241                          * then update it and try again.
14242                          */
14243                         /* Turn on tracing (or try to) */
14244                         {
14245                                 int old_maxseg;
14246
14247                                 old_maxseg = tp->t_maxseg;
14248                                 BBR_STAT_INC(bbr_saw_emsgsiz);
14249                                 bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, csum_flags, tso, cts);
14250                                 if (mtu != 0)
14251                                         tcp_mss_update(tp, -1, mtu, NULL, NULL);
14252                                 if (old_maxseg <= tp->t_maxseg) {
14253                                         /* Huh it did not shrink? */
14254                                         tp->t_maxseg = old_maxseg - 40;
14255                                         bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, 0, tso, cts);
14256                                 }
14257                                 tp->t_flags &= ~TF_FORCEDATA;
14258                                 /*
14259                                  * Nuke all other things that can interfere
14260                                  * with slot
14261                                  */
14262                                 if ((tot_len + len) && (len >= tp->t_maxseg)) {
14263                                         slot = bbr_get_pacing_delay(bbr,
14264                                             bbr->r_ctl.rc_bbr_hptsi_gain,
14265                                             (tot_len + len), cts, 0);
14266                                         if (slot < bbr_error_base_paceout)
14267                                                 slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt;
14268                                 } else
14269                                         slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt;
14270                                 bbr->rc_output_starts_timer = 1;
14271                                 bbr_start_hpts_timer(bbr, tp, cts, 10, slot,
14272                                     tot_len);
14273                                 return (error);
14274                         }
14275                 case EPERM:
14276                         tp->t_softerror = error;
14277                         /* Fall through */
14278                 case EHOSTDOWN:
14279                 case EHOSTUNREACH:
14280                 case ENETDOWN:
14281                 case ENETUNREACH:
14282                         if (TCPS_HAVERCVDSYN(tp->t_state)) {
14283                                 tp->t_softerror = error;
14284                         }
14285                         /* FALLTHROUGH */
14286                 default:
14287                         tp->t_flags &= ~TF_FORCEDATA;
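                        /*
                         * The error paceout slot doubles with each
                         * consecutive output error (oerror_cnt).
                         */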
14288                         slot = (bbr_error_base_paceout + 3) << bbr->oerror_cnt;
14289                         bbr->rc_output_starts_timer = 1;
14290                         bbr_start_hpts_timer(bbr, tp, cts, 11, slot, 0);
14291                         return (error);
14292                 }
14293 #ifdef STATS
14294         } else if (((tp->t_flags & TF_GPUTINPROG) == 0) &&
14295                     len &&
14296                     (rsm == NULL) &&
14297                     (bbr->rc_in_persist == 0)) {
14298                 tp->gput_seq = bbr_seq;
14299                 tp->gput_ack = bbr_seq +
14300                     min(sbavail(&so->so_snd) - sb_offset, sendwin);
14301                 tp->gput_ts = cts;
14302                 tp->t_flags |= TF_GPUTINPROG;
14303 #endif
14304         }
14305         KMOD_TCPSTAT_INC(tcps_sndtotal);
14306         if ((bbr->bbr_hdw_pace_ena) &&
14307             (bbr->bbr_attempt_hdwr_pace == 0) &&
14308             (bbr->rc_past_init_win) &&
14309             (bbr->rc_bbr_state != BBR_STATE_STARTUP) &&
14310             (get_filter_value(&bbr->r_ctl.rc_delrate)) &&
14311             (inp->inp_route.ro_nh &&
14312              inp->inp_route.ro_nh->nh_ifp)) {
14313                 /*
14314                  * We are past the initial window and
14315                  * have at least one measurement, so we
14316                  * could use hardware pacing if it's available.
14317                  * We have an interface and we have not attempted
14318                  * to set up hardware pacing, so let's try now.
14319                  */
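                /*
                 * A sketch of the intent here (not normative): RS_PACING_GEQ
                 * requests a hardware rate at or above rate_wanted, and
                 * RS_PACING_SUB_OK accepts a substitute rate when no exact
                 * match exists. If the granted rate falls short,
                 * bbr_setup_less_of_rate() below adjusts the gains to
                 * compensate.
                 */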
14320                 uint64_t rate_wanted;
14321                 int err = 0;
14322
14323                 rate_wanted = bbr_get_hardware_rate(bbr);
14324                 bbr->bbr_attempt_hdwr_pace = 1;
14325                 bbr->r_ctl.crte = tcp_set_pacing_rate(bbr->rc_tp,
14326                                                       inp->inp_route.ro_nh->nh_ifp,
14327                                                       rate_wanted,
14328                                                       (RS_PACING_GEQ|RS_PACING_SUB_OK),
14329                                                       &err);
14330                 if (bbr->r_ctl.crte) {
14331                         bbr_type_log_hdwr_pacing(bbr,
14332                                                  bbr->r_ctl.crte->ptbl->rs_ifp,
14333                                                  rate_wanted,
14334                                                  bbr->r_ctl.crte->rate,
14335                                                  __LINE__, cts, err);
14336                         BBR_STAT_INC(bbr_hdwr_rl_add_ok);
14337                         counter_u64_add(bbr_flows_nohdwr_pacing, -1);
14338                         counter_u64_add(bbr_flows_whdwr_pacing, 1);
14339                         bbr->bbr_hdrw_pacing = 1;
14340                         /* Now what is our gain status? */
14341                         if (bbr->r_ctl.crte->rate < rate_wanted) {
14342                                 /* We have a problem */
14343                                 bbr_setup_less_of_rate(bbr, cts,
14344                                                        bbr->r_ctl.crte->rate, rate_wanted);
14345                         } else {
14346                                 /* We are good */
14347                                 bbr->gain_is_limited = 0;
14348                                 bbr->skip_gain = 0;
14349                         }
14350                         tcp_bbr_tso_size_check(bbr, cts);
14351                 } else {
14352                         bbr_type_log_hdwr_pacing(bbr,
14353                                                  inp->inp_route.ro_nh->nh_ifp,
14354                                                  rate_wanted,
14355                                                  0,
14356                                                  __LINE__, cts, err);
14357                         BBR_STAT_INC(bbr_hdwr_rl_add_fail);
14358                 }
14359         }
14360         if (bbr->bbr_hdrw_pacing) {
14361                 /*
14362                  * Worry about cases where the route changed,
14363                  * or something else happened that caused us to
14364                  * lose our hardware pacing, possibly during
14365                  * the last ip_output call.
14366                  */
14367                 if (inp->inp_snd_tag == NULL) {
14368                         /* A change during ip output disabled hw pacing? */
14369                         bbr->bbr_hdrw_pacing = 0;
14370                 } else if ((inp->inp_route.ro_nh == NULL) ||
14371                     (inp->inp_route.ro_nh->nh_ifp != inp->inp_snd_tag->ifp)) {
14372                         /*
14373                          * We had an interface or route change,
14374                          * detach from the current hdwr pacing
14375                          * and setup to re-attempt next go
14376                          * round.
14377                          */
14378                         bbr->bbr_hdrw_pacing = 0;
14379                         bbr->bbr_attempt_hdwr_pace = 0;
14380                         tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp);
14381                         tcp_bbr_tso_size_check(bbr, cts);
14382                 }
14383         }
14384         /*
14385          * Data sent (as far as we can tell). If this advertises a larger
14386          * window than any other segment, then remember the size of the
14387          * advertised window. Any pending ACK has now been sent.
14388          */
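        /*
         * For example (illustrative numbers): with rcv_nxt = 1000,
         * recwin = 65535 and rcv_adv = 60000, the new edge
         * 1000 + 65535 = 66535 is larger, so rcv_adv advances to 66535.
         */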
14389         if (SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
14390                 tp->rcv_adv = tp->rcv_nxt + recwin;
14391
14392         tp->last_ack_sent = tp->rcv_nxt;
14393         if ((error == 0) &&
14394             (bbr->r_ctl.rc_pace_max_segs > tp->t_maxseg) &&
14395             (doing_tlp == 0) &&
14396             (tso == 0) &&
14397             (hw_tls == 0) &&
14398             (len > 0) &&
14399             ((flags & TH_RST) == 0) &&
14400             (IN_RECOVERY(tp->t_flags) == 0) &&
14401             (bbr->rc_in_persist == 0) &&
14402             ((tp->t_flags & TF_FORCEDATA) == 0) &&
14403             (tot_len < bbr->r_ctl.rc_pace_max_segs)) {
14404                 /*
14405                  * For non-TSO we need to goto again until we have sent
14406                  * out enough data to match what we pace out every hptsi
14407                  * interval.
14408                  */
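                /*
                 * Illustration (hypothetical sizes): with t_maxseg = 1448
                 * and rc_pace_max_segs = 4 * 1448, a send of a single
                 * segment loops back to "again" until roughly four
                 * segments have gone out for this pacing interval.
                 */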
14409                 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
14410                         /* Make sure snd_nxt is dragged up */
14411                         tp->snd_nxt = tp->snd_max;
14412                 }
14413                 if (rsm != NULL) {
14414                         rsm = NULL;
14415                         goto skip_again;
14416                 }
14417                 rsm = NULL;
14418                 sack_rxmit = 0;
14419                 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK | TF_FORCEDATA);
14420                 goto again;
14421         }
14422 skip_again:
14423         if (((flags & (TH_RST | TH_SYN | TH_FIN)) == 0) && tot_len) {
14424                 /*
14425                  * Calculate/Re-Calculate the hptsi slot in usecs based on
14426                  * what we have sent so far
14427                  */
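                /*
                 * Worked example (illustrative numbers): pacing 3000 bytes
                 * at an effective rate of 12 Mb/s is 24000 bits / 12e6 b/s,
                 * i.e. a slot of roughly 2000 usecs before the next send.
                 */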
14428                 slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0);
14429                 if (bbr->rc_no_pacing)
14430                         slot = 0;
14431         }
14432         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK | TF_FORCEDATA);
14433 enobufs:
14434         if (bbr->rc_use_google == 0)
14435                 bbr_check_bbr_for_state(bbr, cts, __LINE__, 0);
14436         bbr_cwnd_limiting(tp, bbr, ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
14437                                                         bbr->r_ctl.rc_lost_bytes)));
14438         bbr->rc_output_starts_timer = 1;
14439         if (bbr->bbr_use_rack_cheat &&
14440             (more_to_rxt ||
14441              ((bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts)) != NULL))) {
14442                 /* Rack cheats and shotguns out all rxts 1ms apart */
14443                 if (slot > 1000)
14444                         slot = 1000;
14445         }
14446         if (bbr->bbr_hdrw_pacing && (bbr->hw_pacing_set == 0)) {
14447                 /*
14448                  * We don't change the TSO size until some number of
14449                  * sends have gone out, to give the hardware commands
14450                  * time to get down to the interface.
14451                  */
14452                 bbr->r_ctl.bbr_hdwr_cnt_noset_snt++;
14453                 if (bbr->r_ctl.bbr_hdwr_cnt_noset_snt >= bbr_hdwr_pacing_delay_cnt) {
14454                         bbr->hw_pacing_set = 1;
14455                         tcp_bbr_tso_size_check(bbr, cts);
14456                 }
14457         }
14458         bbr_start_hpts_timer(bbr, tp, cts, 12, slot, tot_len);
14459         if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
14460                 /* Make sure snd_nxt is dragged up */
14461                 tp->snd_nxt = tp->snd_max;
14462         }
14463         return (error);
14464
14465 }
14466
14467 /*
14468  * See bbr_output_wtime() for return values.
14469  */
14470 static int
14471 bbr_output(struct tcpcb *tp)
14472 {
14473         int32_t ret;
14474         struct timeval tv;
14475         struct tcp_bbr *bbr;
14476
14477         NET_EPOCH_ASSERT();
14478
14479         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
14480         INP_WLOCK_ASSERT(tp->t_inpcb);
14481         (void)tcp_get_usecs(&tv);
14482         ret = bbr_output_wtime(tp, &tv);
14483         return (ret);
14484 }
14485
14486 static void
14487 bbr_mtu_chg(struct tcpcb *tp)
14488 {
14489         struct tcp_bbr *bbr;
14490         struct bbr_sendmap *rsm, *frsm = NULL;
14491         uint32_t maxseg;
14492
14493         /*
14494          * The MTU has changed. a) Clear the sack filter. b) Mark everything
14495          * over the current size as SACK_PASS so a retransmit will occur.
14496          */
14497
14498         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
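        /* Payload space per segment, less the TCP options we last sent. */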
14499         maxseg = tp->t_maxseg - bbr->rc_last_options;
14500         sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una);
14501         TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
14502                 /* Don't mess with ones acked (by sack?) */
14503                 if (rsm->r_flags & BBR_ACKED)
14504                         continue;
14505                 if ((rsm->r_end - rsm->r_start) > maxseg) {
14506                         /*
14507                          * We mark sack-passed on all the previous large
14508                          * sends we did. This will force them to be retransmitted.
14509                          */
14510                         rsm->r_flags |= BBR_SACK_PASSED;
14511                         if (((rsm->r_flags & BBR_MARKED_LOST) == 0) &&
14512                             bbr_is_lost(bbr, rsm, bbr->r_ctl.rc_rcvtime)) {
14513                                 bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start;
14514                                 bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start;
14515                                 rsm->r_flags |= BBR_MARKED_LOST;
14516                         }
14517                         if (frsm == NULL)
14518                                 frsm = rsm;
14519                 }
14520         }
14521         if (frsm) {
14522                 bbr->r_ctl.rc_resend = frsm;
14523         }
14524 }
14525
14526 /*
14527  * bbr_ctloutput() must drop the inpcb lock before performing copyin on
14528  * socket option arguments.  When it re-acquires the lock after the copy, it
14529  * has to revalidate that the connection is still valid for the socket
14530  * option.
14531  */
14532 static int
14533 bbr_set_sockopt(struct socket *so, struct sockopt *sopt,
14534                 struct inpcb *inp, struct tcpcb *tp, struct tcp_bbr *bbr)
14535 {
14536         int32_t error = 0, optval;
14537
14538         switch (sopt->sopt_name) {
14539         case TCP_RACK_PACE_MAX_SEG:
14540         case TCP_RACK_MIN_TO:
14541         case TCP_RACK_REORD_THRESH:
14542         case TCP_RACK_REORD_FADE:
14543         case TCP_RACK_TLP_THRESH:
14544         case TCP_RACK_PKT_DELAY:
14545         case TCP_BBR_ALGORITHM:
14546         case TCP_BBR_TSLIMITS:
14547         case TCP_BBR_IWINTSO:
14548         case TCP_BBR_RECFORCE:
14549         case TCP_BBR_STARTUP_PG:
14550         case TCP_BBR_DRAIN_PG:
14551         case TCP_BBR_RWND_IS_APP:
14552         case TCP_BBR_PROBE_RTT_INT:
14553         case TCP_BBR_PROBE_RTT_GAIN:
14554         case TCP_BBR_PROBE_RTT_LEN:
14555         case TCP_BBR_STARTUP_LOSS_EXIT:
14556         case TCP_BBR_USEDEL_RATE:
14557         case TCP_BBR_MIN_RTO:
14558         case TCP_BBR_MAX_RTO:
14559         case TCP_BBR_PACE_PER_SEC:
14560         case TCP_DELACK:
14561         case TCP_BBR_PACE_DEL_TAR:
14562         case TCP_BBR_SEND_IWND_IN_TSO:
14563         case TCP_BBR_EXTRA_STATE:
14564         case TCP_BBR_UTTER_MAX_TSO:
14565         case TCP_BBR_MIN_TOPACEOUT:
14566         case TCP_BBR_FLOOR_MIN_TSO:
14567         case TCP_BBR_TSTMP_RAISES:
14568         case TCP_BBR_POLICER_DETECT:
14569         case TCP_BBR_USE_RACK_CHEAT:
14570         case TCP_DATA_AFTER_CLOSE:
14571         case TCP_BBR_HDWR_PACE:
14572         case TCP_BBR_PACE_SEG_MAX:
14573         case TCP_BBR_PACE_SEG_MIN:
14574         case TCP_BBR_PACE_CROSS:
14575         case TCP_BBR_PACE_OH:
14576 #ifdef NETFLIX_PEAKRATE
14577         case TCP_MAXPEAKRATE:
14578 #endif
14579         case TCP_BBR_TMR_PACE_OH:
14580         case TCP_BBR_RACK_RTT_USE:
14581         case TCP_BBR_RETRAN_WTSO:
14582                 break;
14583         default:
14584                 return (tcp_default_ctloutput(so, sopt, inp, tp));
14585                 break;
14586         }
14587         INP_WUNLOCK(inp);
14588         error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
14589         if (error)
14590                 return (error);
14591         INP_WLOCK(inp);
14592         if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
14593                 INP_WUNLOCK(inp);
14594                 return (ECONNRESET);
14595         }
14596         tp = intotcpcb(inp);
14597         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
14598         switch (sopt->sopt_name) {
14599         case TCP_BBR_PACE_PER_SEC:
14600                 BBR_OPTS_INC(tcp_bbr_pace_per_sec);
14601                 bbr->r_ctl.bbr_hptsi_per_second = optval;
14602                 break;
14603         case TCP_BBR_PACE_DEL_TAR:
14604                 BBR_OPTS_INC(tcp_bbr_pace_del_tar);
14605                 bbr->r_ctl.bbr_hptsi_segments_delay_tar = optval;
14606                 break;
14607         case TCP_BBR_PACE_SEG_MAX:
14608                 BBR_OPTS_INC(tcp_bbr_pace_seg_max);
14609                 bbr->r_ctl.bbr_hptsi_segments_max = optval;
14610                 break;
14611         case TCP_BBR_PACE_SEG_MIN:
14612                 BBR_OPTS_INC(tcp_bbr_pace_seg_min);
14613                 bbr->r_ctl.bbr_hptsi_bytes_min = optval;
14614                 break;
14615         case TCP_BBR_PACE_CROSS:
14616                 BBR_OPTS_INC(tcp_bbr_pace_cross);
14617                 bbr->r_ctl.bbr_cross_over = optval;
14618                 break;
14619         case TCP_BBR_ALGORITHM:
14620                 BBR_OPTS_INC(tcp_bbr_algorithm);
14621                 if (optval && (bbr->rc_use_google == 0)) {
14622                         /* Turn on the google mode */
14623                         bbr_google_mode_on(bbr);
14624                         if ((optval > 3) && (optval < 500)) {
14625                                 /*
14626                                  * Must be at least greater than .3%
14627                                  * and must be less than 50.0%.
14628                                  */
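                                /* e.g. optval = 50 would be a 5.0% discount (units of 0.1%). */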
14629                                 bbr->r_ctl.bbr_google_discount = optval;
14630                         }
14631                 } else if ((optval == 0) && (bbr->rc_use_google == 1)) {
14632                         /* Turn off the google mode */
14633                         bbr_google_mode_off(bbr);
14634                 }
14635                 break;
14636         case TCP_BBR_TSLIMITS:
14637                 BBR_OPTS_INC(tcp_bbr_tslimits);
14638                 if (optval == 1)
14639                         bbr->rc_use_ts_limit = 1;
14640                 else if (optval == 0)
14641                         bbr->rc_use_ts_limit = 0;
14642                 else
14643                         error = EINVAL;
14644                 break;
14645
14646         case TCP_BBR_IWINTSO:
14647                 BBR_OPTS_INC(tcp_bbr_iwintso);
14648                 if ((optval >= 0) && (optval < 128)) {
14649                         uint32_t twin;
14650
14651                         bbr->rc_init_win = optval;
14652                         twin = bbr_initial_cwnd(bbr, tp);
14653                         if ((bbr->rc_past_init_win == 0) && (twin > tp->snd_cwnd))
14654                                 tp->snd_cwnd = twin;
14655                         else
14656                                 error = EBUSY;
14657                 } else
14658                         error = EINVAL;
14659                 break;
14660         case TCP_BBR_STARTUP_PG:
14661                 BBR_OPTS_INC(tcp_bbr_startup_pg);
14662                 if ((optval > 0) && (optval < BBR_MAX_GAIN_VALUE)) {
14663                         bbr->r_ctl.rc_startup_pg = optval;
14664                         if (bbr->rc_bbr_state == BBR_STATE_STARTUP) {
14665                                 bbr->r_ctl.rc_bbr_hptsi_gain = optval;
14666                         }
14667                 } else
14668                         error = EINVAL;
14669                 break;
14670         case TCP_BBR_DRAIN_PG:
14671                 BBR_OPTS_INC(tcp_bbr_drain_pg);
14672                 if ((optval > 0) && (optval < BBR_MAX_GAIN_VALUE))
14673                         bbr->r_ctl.rc_drain_pg = optval;
14674                 else
14675                         error = EINVAL;
14676                 break;
14677         case TCP_BBR_PROBE_RTT_LEN:
14678                 BBR_OPTS_INC(tcp_bbr_probertt_len);
14679                 if (optval <= 1)
14680                         reset_time_small(&bbr->r_ctl.rc_rttprop, (optval * USECS_IN_SECOND));
14681                 else
14682                         error = EINVAL;
14683                 break;
14684         case TCP_BBR_PROBE_RTT_GAIN:
14685                 BBR_OPTS_INC(tcp_bbr_probertt_gain);
14686                 if (optval <= BBR_UNIT)
14687                         bbr->r_ctl.bbr_rttprobe_gain_val = optval;
14688                 else
14689                         error = EINVAL;
14690                 break;
14691         case TCP_BBR_PROBE_RTT_INT:
14692                 BBR_OPTS_INC(tcp_bbr_probe_rtt_int);
14693                 if (optval > 1000)
14694                         bbr->r_ctl.rc_probertt_int = optval;
14695                 else
14696                         error = EINVAL;
14697                 break;
14698         case TCP_BBR_MIN_TOPACEOUT:
14699                 BBR_OPTS_INC(tcp_bbr_topaceout);
14700                 if (optval == 0) {
14701                         bbr->no_pacing_until = 0;
14702                         bbr->rc_no_pacing = 0;
14703                 } else if (optval <= 0x00ff) {
14704                         bbr->no_pacing_until = optval;
14705                         if ((bbr->r_ctl.rc_pkt_epoch < bbr->no_pacing_until) &&
14706                             (bbr->rc_bbr_state == BBR_STATE_STARTUP)){
14707                                 /* Turn on no pacing */
14708                                 bbr->rc_no_pacing = 1;
14709                         }
14710                 } else
14711                         error = EINVAL;
14712                 break;
14713         case TCP_BBR_STARTUP_LOSS_EXIT:
14714                 BBR_OPTS_INC(tcp_bbr_startup_loss_exit);
14715                 bbr->rc_loss_exit = optval;
14716                 break;
14717         case TCP_BBR_USEDEL_RATE:
14718                 error = EINVAL;
14719                 break;
14720         case TCP_BBR_MIN_RTO:
14721                 BBR_OPTS_INC(tcp_bbr_min_rto);
14722                 bbr->r_ctl.rc_min_rto_ms = optval;
14723                 break;
14724         case TCP_BBR_MAX_RTO:
14725                 BBR_OPTS_INC(tcp_bbr_max_rto);
14726                 bbr->rc_max_rto_sec = optval;
14727                 break;
14728         case TCP_RACK_MIN_TO:
14729                 /* Minimum time between rack t-o's in ms */
14730                 BBR_OPTS_INC(tcp_rack_min_to);
14731                 bbr->r_ctl.rc_min_to = optval;
14732                 break;
14733         case TCP_RACK_REORD_THRESH:
14734                 /* RACK reorder threshold (shift amount) */
14735                 BBR_OPTS_INC(tcp_rack_reord_thresh);
14736                 if ((optval > 0) && (optval < 31))
14737                         bbr->r_ctl.rc_reorder_shift = optval;
14738                 else
14739                         error = EINVAL;
14740                 break;
14741         case TCP_RACK_REORD_FADE:
14742                 /* Does reordering fade after ms time */
14743                 BBR_OPTS_INC(tcp_rack_reord_fade);
14744                 bbr->r_ctl.rc_reorder_fade = optval;
14745                 break;
14746         case TCP_RACK_TLP_THRESH:
14747                 /* RACK TLP threshold i.e. srtt+(srtt/N) */
14748                 BBR_OPTS_INC(tcp_rack_tlp_thresh);
14749                 if (optval)
14750                         bbr->rc_tlp_threshold = optval;
14751                 else
14752                         error = EINVAL;
14753                 break;
14754         case TCP_BBR_USE_RACK_CHEAT:
14755                 BBR_OPTS_INC(tcp_use_rackcheat);
14756                 if (bbr->rc_use_google) {
14757                         error = EINVAL;
14758                         break;
14759                 }
14760                 BBR_OPTS_INC(tcp_rack_cheat);
14761                 if (optval)
14762                         bbr->bbr_use_rack_cheat = 1;
14763                 else
14764                         bbr->bbr_use_rack_cheat = 0;
14765                 break;
14766         case TCP_BBR_FLOOR_MIN_TSO:
14767                 BBR_OPTS_INC(tcp_utter_max_tso);
14768                 if ((optval >= 0) && (optval < 40))
14769                         bbr->r_ctl.bbr_hptsi_segments_floor = optval;
14770                 else
14771                         error = EINVAL;
14772                 break;
14773         case TCP_BBR_UTTER_MAX_TSO:
14774                 BBR_OPTS_INC(tcp_utter_max_tso);
14775                 if ((optval >= 0) && (optval < 0xffff))
14776                         bbr->r_ctl.bbr_utter_max = optval;
14777                 else
14778                         error = EINVAL;
14779                 break;
14780
14781         case TCP_BBR_EXTRA_STATE:
14782                 BBR_OPTS_INC(tcp_extra_state);
14783                 if (optval)
14784                         bbr->rc_use_idle_restart = 1;
14785                 else
14786                         bbr->rc_use_idle_restart = 0;
14787                 break;
14788         case TCP_BBR_SEND_IWND_IN_TSO:
14789                 BBR_OPTS_INC(tcp_iwnd_tso);
14790                 if (optval) {
14791                         bbr->bbr_init_win_cheat = 1;
14792                         if (bbr->rc_past_init_win == 0) {
14793                                 uint32_t cts;
14794                                 cts = tcp_get_usecs(&bbr->rc_tv);
14795                                 tcp_bbr_tso_size_check(bbr, cts);
14796                         }
14797                 } else
14798                         bbr->bbr_init_win_cheat = 0;
14799                 break;
14800         case TCP_BBR_HDWR_PACE:
14801                 BBR_OPTS_INC(tcp_hdwr_pacing);
14802                 if (optval) {
14803                         bbr->bbr_hdw_pace_ena = 1;
14804                         bbr->bbr_attempt_hdwr_pace = 0;
14805                 } else {
14806                         bbr->bbr_hdw_pace_ena = 0;
14807 #ifdef RATELIMIT
14808                         if (bbr->bbr_hdrw_pacing) {
14809                                 bbr->bbr_hdrw_pacing = 0;
14810                                 in_pcbdetach_txrtlmt(bbr->rc_inp);
14811                         }
14812 #endif
14813                 }
14814                 break;
14815
14816         case TCP_DELACK:
14817                 BBR_OPTS_INC(tcp_delack);
14818                 if (optval < 100) {
14819                         if (optval == 0) /* off */
14820                                 tp->t_delayed_ack = 0;
14821                         else if (optval == 1) /* on, which maps to 2 */
14822                                 tp->t_delayed_ack = 2;
14823                         else /* higher than 2 and less than 100 */
14824                                 tp->t_delayed_ack = optval;
14825                         if (tp->t_flags & TF_DELACK) {
14826                                 tp->t_flags &= ~TF_DELACK;
14827                                 tp->t_flags |= TF_ACKNOW;
14828                                 bbr_output(tp);
14829                         }
14830                 } else
14831                         error = EINVAL;
14832                 break;
14833         case TCP_RACK_PKT_DELAY:
14834                 /* RACK added ms i.e. rack-rtt + reord + N */
14835                 BBR_OPTS_INC(tcp_rack_pkt_delay);
14836                 bbr->r_ctl.rc_pkt_delay = optval;
14837                 break;
14838 #ifdef NETFLIX_PEAKRATE
14839         case TCP_MAXPEAKRATE:
14840                 BBR_OPTS_INC(tcp_maxpeak);
14841                 error = tcp_set_maxpeakrate(tp, optval);
14842                 if (!error)
14843                         tp->t_peakrate_thr = tp->t_maxpeakrate;
14844                 break;
14845 #endif
14846         case TCP_BBR_RETRAN_WTSO:
14847                 BBR_OPTS_INC(tcp_retran_wtso);
14848                 if (optval)
14849                         bbr->rc_resends_use_tso = 1;
14850                 else
14851                         bbr->rc_resends_use_tso = 0;
14852                 break;
14853         case TCP_DATA_AFTER_CLOSE:
14854                 BBR_OPTS_INC(tcp_data_ac);
14855                 if (optval)
14856                         bbr->rc_allow_data_af_clo = 1;
14857                 else
14858                         bbr->rc_allow_data_af_clo = 0;
14859                 break;
14860         case TCP_BBR_POLICER_DETECT:
14861                 BBR_OPTS_INC(tcp_policer_det);
14862                 if (bbr->rc_use_google == 0)
14863                         error = EINVAL;
14864                 else if (optval)
14865                         bbr->r_use_policer = 1;
14866                 else
14867                         bbr->r_use_policer = 0;
14868                 break;
14869
14870         case TCP_BBR_TSTMP_RAISES:
14871                 BBR_OPTS_INC(tcp_ts_raises);
14872                 if (optval)
14873                         bbr->ts_can_raise = 1;
14874                 else
14875                         bbr->ts_can_raise = 0;
14876                 break;
14877         case TCP_BBR_TMR_PACE_OH:
14878                 BBR_OPTS_INC(tcp_pacing_oh_tmr);
14879                 if (bbr->rc_use_google) {
14880                         error = EINVAL;
14881                 } else {
14882                         if (optval)
14883                                 bbr->r_ctl.rc_incr_tmrs = 1;
14884                         else
14885                                 bbr->r_ctl.rc_incr_tmrs = 0;
14886                 }
14887                 break;
14888         case TCP_BBR_PACE_OH:
14889                 BBR_OPTS_INC(tcp_pacing_oh);
14890                 if (bbr->rc_use_google) {
14891                         error = EINVAL;
14892                 } else {
14893                         if (optval > (BBR_INCL_TCP_OH|
14894                                       BBR_INCL_IP_OH|
14895                                       BBR_INCL_ENET_OH)) {
14896                                 error = EINVAL;
14897                                 break;
14898                         }
14899                         if (optval & BBR_INCL_TCP_OH)
14900                                 bbr->r_ctl.rc_inc_tcp_oh = 1;
14901                         else
14902                                 bbr->r_ctl.rc_inc_tcp_oh = 0;
14903                         if (optval & BBR_INCL_IP_OH)
14904                                 bbr->r_ctl.rc_inc_ip_oh = 1;
14905                         else
14906                                 bbr->r_ctl.rc_inc_ip_oh = 0;
14907                         if (optval & BBR_INCL_ENET_OH)
14908                                 bbr->r_ctl.rc_inc_enet_oh = 1;
14909                         else
14910                                 bbr->r_ctl.rc_inc_enet_oh = 0;
14911                 }
14912                 break;
14913         default:
14914                 return (tcp_default_ctloutput(so, sopt, inp, tp));
14915                 break;
14916         }
14917 #ifdef NETFLIX_STATS
14918         tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
14919 #endif
14920         INP_WUNLOCK(inp);
14921         return (error);
14922 }
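
/*
 * Userland usage sketch (not part of this file; assumes a socket already
 * attached to this stack): these options are set at the IPPROTO_TCP level,
 * e.g.:
 *
 *      int optval = 1;
 *      if (setsockopt(fd, IPPROTO_TCP, TCP_BBR_USE_RACK_CHEAT,
 *          &optval, sizeof(optval)) == -1)
 *              err(1, "setsockopt");
 *
 * Options not recognized in the first switch above fall through to
 * tcp_default_ctloutput().
 */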
14923
14924 /*
14925  * return 0 on success, error-num on failure
14926  */
14927 static int
14928 bbr_get_sockopt(struct socket *so, struct sockopt *sopt,
14929     struct inpcb *inp, struct tcpcb *tp, struct tcp_bbr *bbr)
14930 {
14931         int32_t error, optval;
14932
14933         /*
14934          * Because all our options are either boolean or an int, we can just
14935          * pull everything into optval and then unlock and copy. If we ever
14936          * add an option that is not an int, then this will have quite an
14937          * impact on this routine.
14938          */
14939         switch (sopt->sopt_name) {
14940         case TCP_BBR_PACE_PER_SEC:
14941                 optval = bbr->r_ctl.bbr_hptsi_per_second;
14942                 break;
14943         case TCP_BBR_PACE_DEL_TAR:
14944                 optval = bbr->r_ctl.bbr_hptsi_segments_delay_tar;
14945                 break;
14946         case TCP_BBR_PACE_SEG_MAX:
14947                 optval = bbr->r_ctl.bbr_hptsi_segments_max;
14948                 break;
14949         case TCP_BBR_MIN_TOPACEOUT:
14950                 optval = bbr->no_pacing_until;
14951                 break;
14952         case TCP_BBR_PACE_SEG_MIN:
14953                 optval = bbr->r_ctl.bbr_hptsi_bytes_min;
14954                 break;
14955         case TCP_BBR_PACE_CROSS:
14956                 optval = bbr->r_ctl.bbr_cross_over;
14957                 break;
14958         case TCP_BBR_ALGORITHM:
14959                 optval = bbr->rc_use_google;
14960                 break;
14961         case TCP_BBR_TSLIMITS:
14962                 optval = bbr->rc_use_ts_limit;
14963                 break;
14964         case TCP_BBR_IWINTSO:
14965                 optval = bbr->rc_init_win;
14966                 break;
14967         case TCP_BBR_STARTUP_PG:
14968                 optval = bbr->r_ctl.rc_startup_pg;
14969                 break;
14970         case TCP_BBR_DRAIN_PG:
14971                 optval = bbr->r_ctl.rc_drain_pg;
14972                 break;
14973         case TCP_BBR_PROBE_RTT_INT:
14974                 optval = bbr->r_ctl.rc_probertt_int;
14975                 break;
14976         case TCP_BBR_PROBE_RTT_LEN:
14977                 optval = (bbr->r_ctl.rc_rttprop.cur_time_limit / USECS_IN_SECOND);
14978                 break;
14979         case TCP_BBR_PROBE_RTT_GAIN:
14980                 optval = bbr->r_ctl.bbr_rttprobe_gain_val;
14981                 break;
14982         case TCP_BBR_STARTUP_LOSS_EXIT:
14983                 optval = bbr->rc_loss_exit;
14984                 break;
14985         case TCP_BBR_USEDEL_RATE:
14986                 INP_WUNLOCK(inp);
14987                 return (EINVAL);
14988         case TCP_BBR_MIN_RTO:
14989                 optval = bbr->r_ctl.rc_min_rto_ms;
14990                 break;
14991         case TCP_BBR_MAX_RTO:
14992                 optval = bbr->rc_max_rto_sec;
14993                 break;
14994         case TCP_RACK_PACE_MAX_SEG:
14995                 /* Max segments in a pace */
14996                 optval = bbr->r_ctl.rc_pace_max_segs;
14997                 break;
14998         case TCP_RACK_MIN_TO:
14999                 /* Minimum time between rack t-o's in ms */
15000                 optval = bbr->r_ctl.rc_min_to;
15001                 break;
15002         case TCP_RACK_REORD_THRESH:
15003                 /* RACK reorder threshold (shift amount) */
15004                 optval = bbr->r_ctl.rc_reorder_shift;
15005                 break;
15006         case TCP_RACK_REORD_FADE:
15007                 /* Does reordering fade after ms time */
15008                 optval = bbr->r_ctl.rc_reorder_fade;
15009                 break;
15010         case TCP_BBR_USE_RACK_CHEAT:
15011                 /* Do we use the rack cheat for rxt */
15012                 optval = bbr->bbr_use_rack_cheat;
15013                 break;
15014         case TCP_BBR_FLOOR_MIN_TSO:
15015                 optval = bbr->r_ctl.bbr_hptsi_segments_floor;
15016                 break;
15017         case TCP_BBR_UTTER_MAX_TSO:
15018                 optval = bbr->r_ctl.bbr_utter_max;
15019                 break;
15020         case TCP_BBR_SEND_IWND_IN_TSO:
15021                 /* Do we send TSO size segments initially */
15022                 optval = bbr->bbr_init_win_cheat;
15023                 break;
15024         case TCP_BBR_EXTRA_STATE:
15025                 optval = bbr->rc_use_idle_restart;
15026                 break;
15027         case TCP_RACK_TLP_THRESH:
15028                 /* RACK TLP threshold i.e. srtt+(srtt/N) */
15029                 optval = bbr->rc_tlp_threshold;
15030                 break;
15031         case TCP_RACK_PKT_DELAY:
15032                 /* RACK added ms i.e. rack-rtt + reord + N */
15033                 optval = bbr->r_ctl.rc_pkt_delay;
15034                 break;
15035         case TCP_BBR_RETRAN_WTSO:
15036                 optval = bbr->rc_resends_use_tso;
15037                 break;
15038         case TCP_DATA_AFTER_CLOSE:
15039                 optval = bbr->rc_allow_data_af_clo;
15040                 break;
15041         case TCP_DELACK:
15042                 optval = tp->t_delayed_ack;
15043                 break;
15044         case TCP_BBR_HDWR_PACE:
15045                 optval = bbr->bbr_hdw_pace_ena;
15046                 break;
15047         case TCP_BBR_POLICER_DETECT:
15048                 optval = bbr->r_use_policer;
15049                 break;
15050         case TCP_BBR_TSTMP_RAISES:
15051                 optval = bbr->ts_can_raise;
15052                 break;
15053         case TCP_BBR_TMR_PACE_OH:
15054                 optval = bbr->r_ctl.rc_incr_tmrs;
15055                 break;
15056         case TCP_BBR_PACE_OH:
15057                 optval = 0;
15058                 if (bbr->r_ctl.rc_inc_tcp_oh)
15059                         optval |= BBR_INCL_TCP_OH;
15060                 if (bbr->r_ctl.rc_inc_ip_oh)
15061                         optval |= BBR_INCL_IP_OH;
15062                 if (bbr->r_ctl.rc_inc_enet_oh)
15063                         optval |= BBR_INCL_ENET_OH;
15064                 break;
15065         default:
15066                 return (tcp_default_ctloutput(so, sopt, inp, tp));
15067                 break;
15068         }
15069         INP_WUNLOCK(inp);
15070         error = sooptcopyout(sopt, &optval, sizeof optval);
15071         return (error);
15072 }
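
/*
 * Reading an option back mirrors the set path (a sketch, under the same
 * assumptions as the setsockopt example above):
 *
 *      int optval;
 *      socklen_t optlen = sizeof(optval);
 *      if (getsockopt(fd, IPPROTO_TCP, TCP_BBR_PACE_SEG_MAX,
 *          &optval, &optlen) == -1)
 *              err(1, "getsockopt");
 */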
15073
15074 /*
15075  * return 0 on success, error-num on failure
15076  */
15077 static int
15078 bbr_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
15079 {
15080         int32_t error = EINVAL;
15081         struct tcp_bbr *bbr;
15082
15083         bbr = (struct tcp_bbr *)tp->t_fb_ptr;
15084         if (bbr == NULL) {
15085                 /* Huh? */
15086                 goto out;
15087         }
15088         if (sopt->sopt_dir == SOPT_SET) {
15089                 return (bbr_set_sockopt(so, sopt, inp, tp, bbr));
15090         } else if (sopt->sopt_dir == SOPT_GET) {
15091                 return (bbr_get_sockopt(so, sopt, inp, tp, bbr));
15092         }
15093 out:
15094         INP_WUNLOCK(inp);
15095         return (error);
15096 }
15097
15098
15099 struct tcp_function_block __tcp_bbr = {
15100         .tfb_tcp_block_name = __XSTRING(STACKNAME),
15101         .tfb_tcp_output = bbr_output,
15102         .tfb_do_queued_segments = ctf_do_queued_segments,
15103         .tfb_do_segment_nounlock = bbr_do_segment_nounlock,
15104         .tfb_tcp_do_segment = bbr_do_segment,
15105         .tfb_tcp_ctloutput = bbr_ctloutput,
15106         .tfb_tcp_fb_init = bbr_init,
15107         .tfb_tcp_fb_fini = bbr_fini,
15108         .tfb_tcp_timer_stop_all = bbr_stopall,
15109         .tfb_tcp_timer_activate = bbr_timer_activate,
15110         .tfb_tcp_timer_active = bbr_timer_active,
15111         .tfb_tcp_timer_stop = bbr_timer_stop,
15112         .tfb_tcp_rexmit_tmr = bbr_remxt_tmr,
15113         .tfb_tcp_handoff_ok = bbr_handoff_ok,
15114         .tfb_tcp_mtu_chg = bbr_mtu_chg
15115 };
15116
15117 static const char *bbr_stack_names[] = {
15118         __XSTRING(STACKNAME),
15119 #ifdef STACKALIAS
15120         __XSTRING(STACKALIAS),
15121 #endif
15122 };
15123
15124 static bool bbr_mod_inited = false;
15125
15126 static int
15127 tcp_addbbr(module_t mod, int32_t type, void *data)
15128 {
15129         int32_t err = 0;
15130         int num_stacks;
15131
15132         switch (type) {
15133         case MOD_LOAD:
15134                 printf("Attempting to load " __XSTRING(MODNAME) "\n");
15135                 bbr_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
15136                     sizeof(struct bbr_sendmap),
15137                     NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
15138                 bbr_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
15139                     sizeof(struct tcp_bbr),
15140                     NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
15141                 sysctl_ctx_init(&bbr_sysctl_ctx);
15142                 bbr_sysctl_root = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
15143                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
15144                     OID_AUTO,
15145 #ifdef STACKALIAS
15146                     __XSTRING(STACKALIAS),
15147 #else
15148                     __XSTRING(STACKNAME),
15149 #endif
15150                     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
15151                     "");
15152                 if (bbr_sysctl_root == NULL) {
15153                         printf("Failed to add sysctl node\n");
15154                         err = EFAULT;
15155                         goto free_uma;
15156                 }
15157                 bbr_init_sysctls();
15158                 num_stacks = nitems(bbr_stack_names);
15159                 err = register_tcp_functions_as_names(&__tcp_bbr, M_WAITOK,
15160                     bbr_stack_names, &num_stacks);
15161                 if (err) {
15162                         printf("Failed to register %s stack name for "
15163                             "%s module\n", bbr_stack_names[num_stacks],
15164                             __XSTRING(MODNAME));
15165                         sysctl_ctx_free(&bbr_sysctl_ctx);
15166         free_uma:
15167                         uma_zdestroy(bbr_zone);
15168                         uma_zdestroy(bbr_pcb_zone);
15169                         bbr_counter_destroy();
15170                         printf("Failed to register " __XSTRING(MODNAME)
15171                             " module err:%d\n", err);
15172                         return (err);
15173                 }
15174                 tcp_lro_reg_mbufq();
15175                 bbr_mod_inited = true;
15176                 printf(__XSTRING(MODNAME) " is now available\n");
15177                 break;
15178         case MOD_QUIESCE:
15179                 err = deregister_tcp_functions(&__tcp_bbr, true, false);
15180                 break;
15181         case MOD_UNLOAD:
15182                 err = deregister_tcp_functions(&__tcp_bbr, false, true);
15183                 if (err == EBUSY)
15184                         break;
15185                 if (bbr_mod_inited) {
15186                         uma_zdestroy(bbr_zone);
15187                         uma_zdestroy(bbr_pcb_zone);
15188                         sysctl_ctx_free(&bbr_sysctl_ctx);
15189                         bbr_counter_destroy();
15190                         printf(__XSTRING(MODNAME)
15191                             " is now no longer available\n");
15192                         bbr_mod_inited = false;
15193                 }
15194                 tcp_lro_dereg_mbufq();
15195                 err = 0;
15196                 break;
15197         default:
15198                 return (EOPNOTSUPP);
15199         }
15200         return (err);
15201 }
15202
15203 static moduledata_t tcp_bbr = {
15204         .name = __XSTRING(MODNAME),
15205         .evhand = tcp_addbbr,
15206         .priv = 0
15207 };
15208
15209 MODULE_VERSION(MODNAME, 1);
15210 DECLARE_MODULE(MODNAME, tcp_bbr, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
15211 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
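
/*
 * Usage sketch (assumes a standard FreeBSD setup): once this module is
 * loaded, e.g. with "kldload tcp_bbr", the stack can be selected
 * system-wide via "sysctl net.inet.tcp.functions_default=bbr" or
 * per-socket with the TCP_FUNCTION_BLK socket option.
 */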