2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
3 * The Regents of the University of California. All rights reserved.
4 * Copyright (c) 2007-2008,2010
5 * Swinburne University of Technology, Melbourne, Australia.
6 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
7 * Copyright (c) 2010 The FreeBSD Foundation
8 * Copyright (c) 2010-2011 Juniper Networks, Inc.
9 * Copyright (c) 2015 Netflix Inc.
10 * All rights reserved.
12 * Portions of this software were developed at the Centre for Advanced Internet
13 * Architectures, Swinburne University of Technology, by Lawrence Stewart,
14 * James Healy and David Hayes, made possible in part by a grant from the Cisco
15 * University Research Program Fund at Community Foundation Silicon Valley.
17 * Portions of this software were developed at the Centre for Advanced
18 * Internet Architectures, Swinburne University of Technology, Melbourne,
19 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
21 * Portions of this software were developed by Robert N. M. Watson under
22 * contract to Juniper Networks, Inc.
24 * Portions of this software were developed by Randall R. Stewart while
25 * working for Netflix Inc.
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 * 3. Neither the name of the University nor the names of its contributors
36 * may be used to endorse or promote products derived from this software
37 * without specific prior written permission.
39 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
40 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
42 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
43 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
44 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
45 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
46 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
47 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
48 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
54 #include <sys/cdefs.h>
55 __FBSDID("$FreeBSD$");
58 #include "opt_inet6.h"
59 #include "opt_tcpdebug.h"
61 #include <sys/param.h>
63 #include <sys/module.h>
64 #include <sys/mutex.h>
65 #include <sys/kernel.h>
67 #include <sys/hhook.h>
69 #include <sys/malloc.h>
71 #include <sys/proc.h> /* for proc0 declaration */
72 #include <sys/protosw.h>
74 #include <sys/signalvar.h>
75 #include <sys/socket.h>
76 #include <sys/socketvar.h>
77 #include <sys/sysctl.h>
78 #include <sys/syslog.h>
79 #include <sys/systm.h>
81 #include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
85 #include <net/route.h>
88 #define TCPSTATES /* for logging */
90 #include <netinet/in.h>
91 #include <netinet/in_kdtrace.h>
92 #include <netinet/in_pcb.h>
93 #include <netinet/in_systm.h>
94 #include <netinet/ip.h>
95 #include <netinet/ip_icmp.h> /* required for icmp_var.h */
96 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
97 #include <netinet/ip_var.h>
98 #include <netinet/ip_options.h>
99 #include <netinet/ip6.h>
100 #include <netinet/icmp6.h>
101 #include <netinet6/in6_pcb.h>
102 #include <netinet6/ip6_var.h>
103 #include <netinet/tcp.h>
104 #include <netinet/tcp_fsm.h>
105 #include <netinet/tcp_seq.h>
106 #include <netinet/tcp_timer.h>
107 #include <netinet/tcp_var.h>
108 #include <netinet6/tcp6_var.h>
109 #include <netinet/tcpip.h>
110 #include <netinet/tcp_syncache.h>
111 #include <netinet/cc/cc.h>
113 #include <netinet/tcp_debug.h>
114 #endif /* TCPDEBUG */
116 #include <netinet/tcp_offload.h>
119 #include <machine/in_cksum.h>
121 #include <security/mac/mac_framework.h>
123 VNET_DECLARE(int, tcp_autorcvbuf_inc);
124 #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
125 VNET_DECLARE(int, tcp_autorcvbuf_max);
126 #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
127 VNET_DECLARE(int, tcp_do_rfc3042);
128 #define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042)
129 VNET_DECLARE(int, tcp_do_autorcvbuf);
130 #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
131 VNET_DECLARE(int, tcp_insecure_rst);
132 #define V_tcp_insecure_rst VNET(tcp_insecure_rst)
133 VNET_DECLARE(int, tcp_insecure_syn);
134 #define V_tcp_insecure_syn VNET(tcp_insecure_syn)
135 VNET_DECLARE(int, drop_synfin);
136 #define V_drop_synfin VNET(drop_synfin)
138 static void tcp_do_segment_fastslow(struct mbuf *, struct tcphdr *,
139 struct socket *, struct tcpcb *, int, int, uint8_t,
142 static void tcp_do_segment_fastack(struct mbuf *, struct tcphdr *,
143 struct socket *, struct tcpcb *, int, int, uint8_t,
/*
 * Indicate whether this ack should be delayed.  Evaluates to non-zero
 * when all of the following conditions are met:
 *	- There is no delayed ack timer in progress.
 *	- Our last ack wasn't a 0-sized window.  We never want to delay
 *	  the ack that opens up a 0-sized window.
 *	- LRO wasn't used for this segment.  We make sure by checking that
 *	  the segment size is not larger than the MSS.
 *	- Delayed acks are enabled (V_tcp_delack_enabled), or the
 *	  connection has TF_NEEDSYN set.
 *
 * tp   - pointer to the tcpcb of the connection.
 * tlen - length of the received segment, compared against tp->t_maxseg.
 *
 * Every use of a parameter is parenthesized so that an expression
 * argument (e.g. "&cb" or "a ? b : c") expands with the intended
 * precedence.  Both parameters may still be evaluated more than once,
 * so arguments must not have side effects.
 */
#define DELAY_ACK(tp, tlen)						\
	((!tcp_timer_active((tp), TT_DELACK) &&				\
	    ((tp)->t_flags & TF_RXWIN0SENT) == 0) &&			\
	    ((tlen) <= (tp)->t_maxseg) &&				\
	    (V_tcp_delack_enabled || ((tp)->t_flags & TF_NEEDSYN)))
162 * So how is this faster than the normal fast ack?
163 * It basically allows us to also stay in the fastpath
164 * when a window-update ack also arrives. In testing
165 * we saw only 25-30% of connections doing fastpath
166 * due to the fact that along with moving forward
167 * in sequence the window was also updated.
170 tcp_do_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
171 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
172 int ti_locked, uint32_t tiwin)
178 nsegs = max(1, m->m_pkthdr.lro_nsegs);
181 * The size of tcp_saveipgen must be the size of the max ip header,
184 u_char tcp_saveipgen[IP6_HDR_LEN];
185 struct tcphdr tcp_savetcp;
189 * The following if statement will be true if
190 * we are doing the win_up_in_fp <and>
191 * - We have more new data (SEQ_LT(tp->snd_wl1, th->th_seq)) <or>
192 * - No more new data, but we have an ack for new data
193 * (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack))
194 * - No more new data, the same ack point but the window grew
195 * (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
197 if ((SEQ_LT(tp->snd_wl1, th->th_seq) ||
198 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
199 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
200 /* keep track of pure window updates */
201 if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
203 TCPSTAT_INC(tcps_rcvwinupd);
206 tp->snd_wl1 = th->th_seq;
207 tp->snd_wl2 = th->th_ack;
208 if (tp->snd_wnd > tp->max_sndwnd)
209 tp->max_sndwnd = tp->snd_wnd;
212 * If last ACK falls within this segment's sequence numbers,
213 * record the timestamp.
214 * NOTE that the test is modified according to the latest
215 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
217 if ((to->to_flags & TOF_TS) != 0 &&
218 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
219 tp->ts_recent_age = tcp_ts_getticks();
220 tp->ts_recent = to->to_tsval;
223 * This is a pure ack for outstanding data.
225 if (ti_locked == TI_RLOCKED) {
226 INP_INFO_RUNLOCK(&V_tcbinfo);
228 ti_locked = TI_UNLOCKED;
230 TCPSTAT_INC(tcps_predack);
233 * "bad retransmit" recovery.
235 if (tp->t_rxtshift == 1 &&
236 tp->t_flags & TF_PREVVALID &&
237 (int)(ticks - tp->t_badrxtwin) < 0) {
238 cc_cong_signal(tp, th, CC_RTO_ERR);
242 * Recalculate the transmit timer / rtt.
244 * Some boxes send broken timestamp replies
245 * during the SYN+ACK phase, ignore
246 * timestamps of 0 or we could calculate a
247 * huge RTT and blow up the retransmit timer.
249 if ((to->to_flags & TOF_TS) != 0 &&
253 t = tcp_ts_getticks() - to->to_tsecr;
254 if (!tp->t_rttlow || tp->t_rttlow > t)
257 TCP_TS_TO_TICKS(t) + 1);
258 } else if (tp->t_rtttime &&
259 SEQ_GT(th->th_ack, tp->t_rtseq)) {
261 tp->t_rttlow > ticks - tp->t_rtttime)
262 tp->t_rttlow = ticks - tp->t_rtttime;
264 ticks - tp->t_rtttime);
266 if (winup_only == 0) {
267 acked = BYTES_THIS_ACK(tp, th);
270 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
271 hhook_run_tcp_est_in(tp, th, to);
274 TCPSTAT_ADD(tcps_rcvackbyte, acked);
275 sbdrop(&so->so_snd, acked);
276 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
277 SEQ_LEQ(th->th_ack, tp->snd_recover))
278 tp->snd_recover = th->th_ack - 1;
281 * Let the congestion control algorithm update
282 * congestion control related information. This
283 * typically means increasing the congestion
286 cc_ack_received(tp, th, nsegs, CC_ACK);
288 tp->snd_una = th->th_ack;
290 * Pull snd_wl2 up to prevent seq wrap relative
293 tp->snd_wl2 = th->th_ack;
297 * If all outstanding data are acked, stop
298 * retransmit timer, otherwise restart timer
299 * using current (possibly backed-off) value.
300 * If process is waiting for space,
301 * wakeup/selwakeup/signal. If data
302 * are ready to send, let tcp_output
303 * decide between more output or persist.
306 if (so->so_options & SO_DEBUG)
307 tcp_trace(TA_INPUT, ostate, tp,
308 (void *)tcp_saveipgen,
311 TCP_PROBE3(debug__input, tp, th, m);
313 if (tp->snd_una == tp->snd_max)
314 tcp_timer_activate(tp, TT_REXMT, 0);
315 else if (!tcp_timer_active(tp, TT_PERSIST))
316 tcp_timer_activate(tp, TT_REXMT,
320 * Window update only, just free the mbufs and
321 * send out whatever we can.
326 if (sbavail(&so->so_snd))
327 (void) tcp_output(tp);
328 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
329 __func__, ti_locked));
330 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
331 INP_WLOCK_ASSERT(tp->t_inpcb);
333 if (tp->t_flags & TF_DELACK) {
334 tp->t_flags &= ~TF_DELACK;
335 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
337 INP_WUNLOCK(tp->t_inpcb);
341 * Here nothing is really faster, it's just that we
342 * have broken out the fast-data path also just like
346 tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
347 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
348 int ti_locked, uint32_t tiwin)
350 int newsize = 0; /* automatic sockbuf scaling */
353 * The size of tcp_saveipgen must be the size of the max ip header,
356 u_char tcp_saveipgen[IP6_HDR_LEN];
357 struct tcphdr tcp_savetcp;
361 * If last ACK falls within this segment's sequence numbers,
362 * record the timestamp.
363 * NOTE that the test is modified according to the latest
364 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
366 if ((to->to_flags & TOF_TS) != 0 &&
367 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
368 tp->ts_recent_age = tcp_ts_getticks();
369 tp->ts_recent = to->to_tsval;
373 * This is a pure, in-sequence data packet with
374 * nothing on the reassembly queue and we have enough
375 * buffer space to take it.
377 if (ti_locked == TI_RLOCKED) {
378 INP_INFO_RUNLOCK(&V_tcbinfo);
380 ti_locked = TI_UNLOCKED;
382 /* Clean receiver SACK report if present */
383 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
384 tcp_clean_sackreport(tp);
385 TCPSTAT_INC(tcps_preddat);
388 * Pull snd_wl1 up to prevent seq wrap relative to
391 tp->snd_wl1 = th->th_seq;
393 * Pull rcv_up up to prevent seq wrap relative to
396 tp->rcv_up = tp->rcv_nxt;
397 TCPSTAT_ADD(tcps_rcvbyte, tlen);
399 if (so->so_options & SO_DEBUG)
400 tcp_trace(TA_INPUT, ostate, tp,
401 (void *)tcp_saveipgen, &tcp_savetcp, 0);
403 TCP_PROBE3(debug__input, tp, th, m);
405 newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
407 /* Add data to socket buffer. */
408 SOCKBUF_LOCK(&so->so_rcv);
409 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
413 * Set new socket buffer size.
414 * Give up when limit is reached.
417 if (!sbreserve_locked(&so->so_rcv,
419 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
420 m_adj(m, drop_hdrlen); /* delayed header drop */
421 sbappendstream_locked(&so->so_rcv, m, 0);
423 /* NB: sorwakeup_locked() does an implicit unlock. */
424 sorwakeup_locked(so);
425 if (DELAY_ACK(tp, tlen)) {
426 tp->t_flags |= TF_DELACK;
428 tp->t_flags |= TF_ACKNOW;
431 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
432 __func__, ti_locked));
433 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
434 INP_WLOCK_ASSERT(tp->t_inpcb);
436 if (tp->t_flags & TF_DELACK) {
437 tp->t_flags &= ~TF_DELACK;
438 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
440 INP_WUNLOCK(tp->t_inpcb);
444 * The slow-path is the clone of the long long part
445 * of tcp_do_segment past all the fast-path stuff. We
446 * use it here by two different callers, the fast/slow and
450 tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so,
451 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
452 int ti_locked, uint32_t tiwin, int thflags)
454 int acked, ourfinisacked, needoutput = 0;
455 int rstreason, todrop, win;
458 struct in_conninfo *inc;
459 struct mbuf *mfree = NULL;
461 nsegs = max(1, m->m_pkthdr.lro_nsegs);
464 * The size of tcp_saveipgen must be the size of the max ip header,
467 u_char tcp_saveipgen[IP6_HDR_LEN];
468 struct tcphdr tcp_savetcp;
472 * Calculate amount of space in receive window,
473 * and then do TCP input processing.
474 * Receive window is amount of space in rcv queue,
475 * but not less than advertised window.
477 inc = &tp->t_inpcb->inp_inc;
478 win = sbspace(&so->so_rcv);
481 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
483 switch (tp->t_state) {
486 * If the state is SYN_RECEIVED:
487 * if seg contains an ACK, but not for our SYN/ACK, send a RST.
489 case TCPS_SYN_RECEIVED:
490 if ((thflags & TH_ACK) &&
491 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
492 SEQ_GT(th->th_ack, tp->snd_max))) {
493 rstreason = BANDLIM_RST_OPENPORT;
499 * If the state is SYN_SENT:
500 * if seg contains a RST with valid ACK (SEQ.ACK has already
501 * been verified), then drop the connection.
502 * if seg contains a RST without an ACK, drop the seg.
503 * if seg does not contain SYN, then drop the seg.
504 * Otherwise this is an acceptable SYN segment
505 * initialize tp->rcv_nxt and tp->irs
506 * if seg contains ack then advance tp->snd_una
507 * if seg contains an ECE and ECN support is enabled, the stream
509 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
510 * arrange for segment to be acked (eventually)
511 * continue processing rest of data/controls, beginning with URG
514 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
515 TCP_PROBE5(connect__refused, NULL, tp, m, tp, th);
516 tp = tcp_drop(tp, ECONNREFUSED);
518 if (thflags & TH_RST)
520 if (!(thflags & TH_SYN))
523 tp->irs = th->th_seq;
525 if (thflags & TH_ACK) {
526 TCPSTAT_INC(tcps_connects);
529 mac_socketpeer_set_from_mbuf(m, so);
531 /* Do window scaling on this connection? */
532 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
533 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
534 tp->rcv_scale = tp->request_r_scale;
536 tp->rcv_adv += min(tp->rcv_wnd,
537 TCP_MAXWIN << tp->rcv_scale);
538 tp->snd_una++; /* SYN is acked */
540 * If there's data, delay ACK; if there's also a FIN
541 * ACKNOW will be turned on later.
543 if (DELAY_ACK(tp, tlen) && tlen != 0)
544 tcp_timer_activate(tp, TT_DELACK,
547 tp->t_flags |= TF_ACKNOW;
549 if ((thflags & TH_ECE) && V_tcp_do_ecn) {
550 tp->t_flags |= TF_ECN_PERMIT;
551 TCPSTAT_INC(tcps_ecn_shs);
555 * Received <SYN,ACK> in SYN_SENT[*] state.
557 * SYN_SENT --> ESTABLISHED
558 * SYN_SENT* --> FIN_WAIT_1
560 tp->t_starttime = ticks;
561 if (tp->t_flags & TF_NEEDFIN) {
562 tcp_state_change(tp, TCPS_FIN_WAIT_1);
563 tp->t_flags &= ~TF_NEEDFIN;
566 tcp_state_change(tp, TCPS_ESTABLISHED);
567 TCP_PROBE5(connect__established, NULL, tp,
570 tcp_timer_activate(tp, TT_KEEP,
575 * Received initial SYN in SYN-SENT[*] state =>
577 * If it succeeds, connection is half-synchronized.
578 * Otherwise, do 3-way handshake:
579 * SYN-SENT -> SYN-RECEIVED
580 * SYN-SENT* -> SYN-RECEIVED*
582 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
583 tcp_timer_activate(tp, TT_REXMT, 0);
584 tcp_state_change(tp, TCPS_SYN_RECEIVED);
587 KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: "
588 "ti_locked %d", __func__, ti_locked));
589 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
590 INP_WLOCK_ASSERT(tp->t_inpcb);
593 * Advance th->th_seq to correspond to first data byte.
594 * If data, trim to stay within window,
595 * dropping FIN if necessary.
598 if (tlen > tp->rcv_wnd) {
599 todrop = tlen - tp->rcv_wnd;
603 TCPSTAT_INC(tcps_rcvpackafterwin);
604 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
606 tp->snd_wl1 = th->th_seq - 1;
607 tp->rcv_up = th->th_seq;
609 * Client side of transaction: already sent SYN and data.
610 * If the remote host used T/TCP to validate the SYN,
611 * our data will be ACK'd; if so, enter normal data segment
612 * processing in the middle of step 5, ack processing.
613 * Otherwise, goto step 6.
615 if (thflags & TH_ACK)
621 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
622 * do normal processing.
624 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
628 break; /* continue normal processing */
632 * States other than LISTEN or SYN_SENT.
633 * First check the RST flag and sequence number since reset segments
634 * are exempt from the timestamp and connection count tests. This
635 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
636 * below which allowed reset segments in half the sequence space
637 * to fall through and be processed (which gives forged reset
638 * segments with a random sequence number a 50 percent chance of
639 * killing a connection).
640 * Then check timestamp, if present.
641 * Then check the connection count, if present.
642 * Then check that at least some bytes of segment are within
643 * receive window. If segment begins before rcv_nxt,
644 * drop leading data (and SYN); if nothing left, just ack.
646 if (thflags & TH_RST) {
648 * RFC5961 Section 3.2
650 * - RST drops connection only if SEG.SEQ == RCV.NXT.
651 * - If RST is in window, we send challenge ACK.
653 * Note: to take into account delayed ACKs, we should
654 * test against last_ack_sent instead of rcv_nxt.
655 * Note 2: we handle special case of closed window, not
656 * covered by the RFC.
658 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
659 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
660 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
661 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
662 KASSERT(ti_locked == TI_RLOCKED,
663 ("%s: TH_RST ti_locked %d, th %p tp %p",
664 __func__, ti_locked, th, tp));
665 KASSERT(tp->t_state != TCPS_SYN_SENT,
666 ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
669 if (V_tcp_insecure_rst ||
670 tp->last_ack_sent == th->th_seq) {
671 TCPSTAT_INC(tcps_drops);
672 /* Drop the connection. */
673 switch (tp->t_state) {
674 case TCPS_SYN_RECEIVED:
675 so->so_error = ECONNREFUSED;
677 case TCPS_ESTABLISHED:
678 case TCPS_FIN_WAIT_1:
679 case TCPS_FIN_WAIT_2:
680 case TCPS_CLOSE_WAIT:
683 so->so_error = ECONNRESET;
690 TCPSTAT_INC(tcps_badrst);
691 /* Send challenge ACK. */
692 tcp_respond(tp, mtod(m, void *), th, m,
693 tp->rcv_nxt, tp->snd_nxt, TH_ACK);
694 tp->last_ack_sent = tp->rcv_nxt;
702 * RFC5961 Section 4.2
703 * Send challenge ACK for any SYN in synchronized state.
705 if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) {
706 KASSERT(ti_locked == TI_RLOCKED,
707 ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
708 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
710 TCPSTAT_INC(tcps_badsyn);
711 if (V_tcp_insecure_syn &&
712 SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
713 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
714 tp = tcp_drop(tp, ECONNRESET);
715 rstreason = BANDLIM_UNLIMITED;
717 /* Send challenge ACK. */
718 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
719 tp->snd_nxt, TH_ACK);
720 tp->last_ack_sent = tp->rcv_nxt;
727 * RFC 1323 PAWS: If we have a timestamp reply on this segment
728 * and it's less than ts_recent, drop it.
730 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
731 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
733 /* Check to see if ts_recent is over 24 days old. */
734 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
736 * Invalidate ts_recent. If this segment updates
737 * ts_recent, the age will be reset later and ts_recent
738 * will get a valid value. If it does not, setting
739 * ts_recent to zero will at least satisfy the
740 * requirement that zero be placed in the timestamp
741 * echo reply when ts_recent isn't valid. The
742 * age isn't reset until we get a valid ts_recent
743 * because we don't want out-of-order segments to be
744 * dropped when ts_recent is old.
748 TCPSTAT_INC(tcps_rcvduppack);
749 TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
750 TCPSTAT_INC(tcps_pawsdrop);
758 * In the SYN-RECEIVED state, validate that the packet belongs to
759 * this connection before trimming the data to fit the receive
760 * window. Check the sequence number versus IRS since we know
761 * the sequence numbers haven't wrapped. This is a partial fix
762 * for the "LAND" DoS attack.
764 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
765 rstreason = BANDLIM_RST_OPENPORT;
769 todrop = tp->rcv_nxt - th->th_seq;
771 if (thflags & TH_SYN) {
781 * Following if statement from Stevens, vol. 2, p. 960.
784 || (todrop == tlen && (thflags & TH_FIN) == 0)) {
786 * Any valid FIN must be to the left of the window.
787 * At this point the FIN must be a duplicate or out
788 * of sequence; drop it.
793 * Send an ACK to resynchronize and drop any data.
794 * But keep on processing for RST or ACK.
796 tp->t_flags |= TF_ACKNOW;
798 TCPSTAT_INC(tcps_rcvduppack);
799 TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
801 TCPSTAT_INC(tcps_rcvpartduppack);
802 TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
804 drop_hdrlen += todrop; /* drop from the top afterwards */
805 th->th_seq += todrop;
807 if (th->th_urp > todrop)
808 th->th_urp -= todrop;
816 * If new data are received on a connection after the
817 * user processes are gone, then RST the other end.
819 if ((so->so_state & SS_NOFDREF) &&
820 tp->t_state > TCPS_CLOSE_WAIT && tlen) {
821 KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && "
822 "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
823 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
825 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
826 log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
827 "after socket was closed, "
828 "sending RST and removing tcpcb\n",
829 s, __func__, tcpstates[tp->t_state], tlen);
833 TCPSTAT_INC(tcps_rcvafterclose);
834 rstreason = BANDLIM_UNLIMITED;
839 * If segment ends after window, drop trailing data
840 * (and PUSH and FIN); if nothing left, just ACK.
842 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
844 TCPSTAT_INC(tcps_rcvpackafterwin);
845 if (todrop >= tlen) {
846 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
848 * If window is closed can only take segments at
849 * window edge, and have to drop data and PUSH from
850 * incoming segments. Continue processing, but
851 * remember to ack. Otherwise, drop segment
854 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
855 tp->t_flags |= TF_ACKNOW;
856 TCPSTAT_INC(tcps_rcvwinprobe);
860 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
863 thflags &= ~(TH_PUSH|TH_FIN);
867 * If last ACK falls within this segment's sequence numbers,
868 * record its timestamp.
870 * 1) That the test incorporates suggestions from the latest
871 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
872 * 2) That updating only on newer timestamps interferes with
873 * our earlier PAWS tests, so this check should be solely
874 * predicated on the sequence space of this segment.
875 * 3) That we modify the segment boundary check to be
876 * Last.ACK.Sent <= SEG.SEQ + SEG.Len
877 * instead of RFC1323's
878 * Last.ACK.Sent < SEG.SEQ + SEG.Len,
879 * This modified check allows us to overcome RFC1323's
880 * limitations as described in Stevens TCP/IP Illustrated
881 * Vol. 2 p.869. In such cases, we can still calculate the
882 * RTT correctly when RCV.NXT == Last.ACK.Sent.
884 if ((to->to_flags & TOF_TS) != 0 &&
885 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
886 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
887 ((thflags & (TH_SYN|TH_FIN)) != 0))) {
888 tp->ts_recent_age = tcp_ts_getticks();
889 tp->ts_recent = to->to_tsval;
893 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
894 * flag is on (half-synchronized state), then queue data for
895 * later processing; else drop segment and return.
897 if ((thflags & TH_ACK) == 0) {
898 if (tp->t_state == TCPS_SYN_RECEIVED ||
899 (tp->t_flags & TF_NEEDSYN))
901 else if (tp->t_flags & TF_ACKNOW)
910 switch (tp->t_state) {
913 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
914 * ESTABLISHED state and continue processing.
915 * The ACK was checked above.
917 case TCPS_SYN_RECEIVED:
919 TCPSTAT_INC(tcps_connects);
921 /* Do window scaling? */
922 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
923 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
924 tp->rcv_scale = tp->request_r_scale;
929 * SYN-RECEIVED -> ESTABLISHED
930 * SYN-RECEIVED* -> FIN-WAIT-1
932 tp->t_starttime = ticks;
933 if (tp->t_flags & TF_NEEDFIN) {
934 tcp_state_change(tp, TCPS_FIN_WAIT_1);
935 tp->t_flags &= ~TF_NEEDFIN;
937 tcp_state_change(tp, TCPS_ESTABLISHED);
938 TCP_PROBE5(accept__established, NULL, tp,
941 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
944 * If segment contains data or ACK, will call tcp_reass()
945 * later; if not, do so now to pass queued data to user.
947 if (tlen == 0 && (thflags & TH_FIN) == 0)
948 (void) tcp_reass(tp, (struct tcphdr *)0, 0,
950 tp->snd_wl1 = th->th_seq - 1;
954 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
955 * ACKs. If the ack is in the range
956 * tp->snd_una < th->th_ack <= tp->snd_max
957 * then advance tp->snd_una to th->th_ack and drop
958 * data from the retransmission queue. If this ACK reflects
959 * more up to date window information we update our window information.
961 case TCPS_ESTABLISHED:
962 case TCPS_FIN_WAIT_1:
963 case TCPS_FIN_WAIT_2:
964 case TCPS_CLOSE_WAIT:
967 if (SEQ_GT(th->th_ack, tp->snd_max)) {
968 TCPSTAT_INC(tcps_rcvacktoomuch);
971 if ((tp->t_flags & TF_SACK_PERMIT) &&
972 ((to->to_flags & TOF_SACK) ||
973 !TAILQ_EMPTY(&tp->snd_holes)))
974 tcp_sack_doack(tp, to, th->th_ack);
977 * Reset the value so that previous (valid) value
978 * from the last ack with SACK doesn't get used.
980 tp->sackhint.sacked_bytes = 0;
983 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
984 hhook_run_tcp_est_in(tp, th, to);
987 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
988 if (tlen == 0 && tiwin == tp->snd_wnd) {
990 * If this is the first time we've seen a
991 * FIN from the remote, this is not a
992 * duplicate and it needs to be processed
993 * normally. This happens during a
994 * simultaneous close.
996 if ((thflags & TH_FIN) &&
997 (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
1001 TCPSTAT_INC(tcps_rcvdupack);
1003 * If we have outstanding data (other than
1004 * a window probe), this is a completely
1005 * duplicate ack (ie, window info didn't
1006 * change and FIN isn't set),
1007 * the ack is the biggest we've
1008 * seen and we've seen exactly our rexmt
1009 * threshold of them, assume a packet
1010 * has been dropped and retransmit it.
1011 * Kludge snd_nxt & the congestion
1012 * window so we send only this one
1015 * We know we're losing at the current
1016 * window size so do congestion avoidance
1017 * (set ssthresh to half the current window
1018 * and pull our congestion window back to
1019 * the new ssthresh).
1021 * Dup acks mean that packets have left the
1022 * network (they're now cached at the receiver)
1023 * so bump cwnd by the amount in the receiver
1024 * to keep a constant cwnd packets in the
1027 * When using TCP ECN, notify the peer that
1028 * we reduced the cwnd.
1030 if (!tcp_timer_active(tp, TT_REXMT) ||
1031 th->th_ack != tp->snd_una)
1033 else if (++tp->t_dupacks > tcprexmtthresh ||
1034 IN_FASTRECOVERY(tp->t_flags)) {
1035 cc_ack_received(tp, th, nsegs,
1037 if ((tp->t_flags & TF_SACK_PERMIT) &&
1038 IN_FASTRECOVERY(tp->t_flags)) {
1042 * Compute the amount of data in flight first.
1043 * We can inject new data into the pipe iff
1044 * we have less than 1/2 the original window's
1045 * worth of data in flight.
1047 if (V_tcp_do_rfc6675_pipe)
1048 awnd = tcp_compute_pipe(tp);
1050 awnd = (tp->snd_nxt - tp->snd_fack) +
1051 tp->sackhint.sack_bytes_rexmit;
1053 if (awnd < tp->snd_ssthresh) {
1054 tp->snd_cwnd += tp->t_maxseg;
1055 if (tp->snd_cwnd > tp->snd_ssthresh)
1056 tp->snd_cwnd = tp->snd_ssthresh;
1059 tp->snd_cwnd += tp->t_maxseg;
1060 (void) tp->t_fb->tfb_tcp_output(tp);
1062 } else if (tp->t_dupacks == tcprexmtthresh) {
1063 tcp_seq onxt = tp->snd_nxt;
1066 * If we're doing sack, check to
1067 * see if we're already in sack
1068 * recovery. If we're not doing sack,
1069 * check to see if we're in newreno
1072 if (tp->t_flags & TF_SACK_PERMIT) {
1073 if (IN_FASTRECOVERY(tp->t_flags)) {
1078 if (SEQ_LEQ(th->th_ack,
1084 /* Congestion signal before ack. */
1085 cc_cong_signal(tp, th, CC_NDUPACK);
1086 cc_ack_received(tp, th, nsegs,
1088 tcp_timer_activate(tp, TT_REXMT, 0);
1090 if (tp->t_flags & TF_SACK_PERMIT) {
1092 tcps_sack_recovery_episode);
1093 tp->sack_newdata = tp->snd_nxt;
1094 tp->snd_cwnd = tp->t_maxseg;
1095 (void) tp->t_fb->tfb_tcp_output(tp);
1098 tp->snd_nxt = th->th_ack;
1099 tp->snd_cwnd = tp->t_maxseg;
1100 (void) tp->t_fb->tfb_tcp_output(tp);
1101 KASSERT(tp->snd_limited <= 2,
1102 ("%s: tp->snd_limited too big",
1104 tp->snd_cwnd = tp->snd_ssthresh +
1106 (tp->t_dupacks - tp->snd_limited);
1107 if (SEQ_GT(onxt, tp->snd_nxt))
1110 } else if (V_tcp_do_rfc3042) {
1112 * Process first and second duplicate
1113 * ACKs. Each indicates a segment
1114 * leaving the network, creating room
1115 * for more. Make sure we can send a
1116 * packet on reception of each duplicate
1117 * ACK by increasing snd_cwnd by one
1118 * segment. Restore the original
1119 * snd_cwnd after packet transmission.
1121 cc_ack_received(tp, th, nsegs,
1123 uint32_t oldcwnd = tp->snd_cwnd;
1124 tcp_seq oldsndmax = tp->snd_max;
1128 KASSERT(tp->t_dupacks == 1 ||
1130 ("%s: dupacks not 1 or 2",
1132 if (tp->t_dupacks == 1)
1133 tp->snd_limited = 0;
1135 (tp->snd_nxt - tp->snd_una) +
1136 (tp->t_dupacks - tp->snd_limited) *
1139 * Only call tcp_output when there
1140 * is new data available to be sent.
1141 * Otherwise we would send pure ACKs.
1143 SOCKBUF_LOCK(&so->so_snd);
1144 avail = sbavail(&so->so_snd) -
1145 (tp->snd_nxt - tp->snd_una);
1146 SOCKBUF_UNLOCK(&so->so_snd);
1148 (void) tp->t_fb->tfb_tcp_output(tp);
1149 sent = tp->snd_max - oldsndmax;
1150 if (sent > tp->t_maxseg) {
1151 KASSERT((tp->t_dupacks == 2 &&
1152 tp->snd_limited == 0) ||
1153 (sent == tp->t_maxseg + 1 &&
1154 tp->t_flags & TF_SENTFIN),
1155 ("%s: sent too much",
1157 tp->snd_limited = 2;
1158 } else if (sent > 0)
1160 tp->snd_cwnd = oldcwnd;
1168 KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
1169 ("%s: th_ack <= snd_una", __func__));
1172 * If the congestion window was inflated to account
1173 * for the other side's cached packets, retract it.
1175 if (IN_FASTRECOVERY(tp->t_flags)) {
1176 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
1177 if (tp->t_flags & TF_SACK_PERMIT)
1178 tcp_sack_partialack(tp, th);
1180 tcp_newreno_partial_ack(tp, th);
1182 cc_post_recovery(tp, th);
1186 * If we reach this point, ACK is not a duplicate,
1187 * i.e., it ACKs something we sent.
1189 if (tp->t_flags & TF_NEEDSYN) {
1191 * T/TCP: Connection was half-synchronized, and our
1192 * SYN has been ACK'd (so connection is now fully
1193 * synchronized). Go to non-starred state,
1194 * increment snd_una for ACK of SYN, and check if
1195 * we can do window scaling.
1197 tp->t_flags &= ~TF_NEEDSYN;
1199 /* Do window scaling? */
1200 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1201 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1202 tp->rcv_scale = tp->request_r_scale;
1203 /* Send window already scaled. */
1208 INP_WLOCK_ASSERT(tp->t_inpcb);
1210 acked = BYTES_THIS_ACK(tp, th);
1211 TCPSTAT_INC(tcps_rcvackpack);
1212 TCPSTAT_ADD(tcps_rcvackbyte, acked);
1215 * If we just performed our first retransmit, and the ACK
1216 * arrives within our recovery window, then it was a mistake
1217 * to do the retransmit in the first place. Recover our
1218 * original cwnd and ssthresh, and proceed to transmit where
1221 if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID &&
1222 (int)(ticks - tp->t_badrxtwin) < 0)
1223 cc_cong_signal(tp, th, CC_RTO_ERR);
1226 * If we have a timestamp reply, update smoothed
1227 * round trip time. If no timestamp is present but
1228 * transmit timer is running and timed sequence
1229 * number was acked, update smoothed round trip time.
1230 * Since we now have an rtt measurement, cancel the
1231 * timer backoff (cf., Phil Karn's retransmit alg.).
1232 * Recompute the initial retransmit timer.
1234 * Some boxes send broken timestamp replies
1235 * during the SYN+ACK phase, ignore
1236 * timestamps of 0 or we could calculate a
1237 * huge RTT and blow up the retransmit timer.
1239 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
1242 t = tcp_ts_getticks() - to->to_tsecr;
1243 if (!tp->t_rttlow || tp->t_rttlow > t)
1245 tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
1246 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
1247 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
1248 tp->t_rttlow = ticks - tp->t_rtttime;
1249 tcp_xmit_timer(tp, ticks - tp->t_rtttime);
1253 * If all outstanding data is acked, stop retransmit
1254 * timer and remember to restart (more output or persist).
1255 * If there is more data to be acked, restart retransmit
1256 * timer, using current (possibly backed-off) value.
1258 if (th->th_ack == tp->snd_max) {
1259 tcp_timer_activate(tp, TT_REXMT, 0);
1261 } else if (!tcp_timer_active(tp, TT_PERSIST))
1262 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
1265 * If no data (only SYN) was ACK'd,
1266 * skip rest of ACK processing.
1272 * Let the congestion control algorithm update congestion
1273 * control related information. This typically means increasing
1274 * the congestion window.
1276 cc_ack_received(tp, th, nsegs, CC_ACK);
1278 SOCKBUF_LOCK(&so->so_snd);
1279 if (acked > sbavail(&so->so_snd)) {
1280 tp->snd_wnd -= sbavail(&so->so_snd);
1281 mfree = sbcut_locked(&so->so_snd,
1282 (int)sbavail(&so->so_snd));
1285 mfree = sbcut_locked(&so->so_snd, acked);
1286 tp->snd_wnd -= acked;
1289 /* NB: sowwakeup_locked() does an implicit unlock. */
1290 sowwakeup_locked(so);
1292 /* Detect una wraparound. */
1293 if (!IN_RECOVERY(tp->t_flags) &&
1294 SEQ_GT(tp->snd_una, tp->snd_recover) &&
1295 SEQ_LEQ(th->th_ack, tp->snd_recover))
1296 tp->snd_recover = th->th_ack - 1;
1297 /* XXXLAS: Can this be moved up into cc_post_recovery? */
1298 if (IN_RECOVERY(tp->t_flags) &&
1299 SEQ_GEQ(th->th_ack, tp->snd_recover)) {
1300 EXIT_RECOVERY(tp->t_flags);
1302 tp->snd_una = th->th_ack;
1303 if (tp->t_flags & TF_SACK_PERMIT) {
1304 if (SEQ_GT(tp->snd_una, tp->snd_recover))
1305 tp->snd_recover = tp->snd_una;
1307 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1308 tp->snd_nxt = tp->snd_una;
1310 switch (tp->t_state) {
1313 * In FIN_WAIT_1 STATE in addition to the processing
1314 * for the ESTABLISHED state if our FIN is now acknowledged
1315 * then enter FIN_WAIT_2.
1317 case TCPS_FIN_WAIT_1:
1318 if (ourfinisacked) {
1320 * If we can't receive any more
1321 * data, then closing user can proceed.
1322 * Starting the timer is contrary to the
1323 * specification, but if we don't get a FIN
1324 * we'll hang forever.
1327 * we should release the tp also, and use a
1330 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1331 soisdisconnected(so);
1332 tcp_timer_activate(tp, TT_2MSL,
1333 (tcp_fast_finwait2_recycle ?
1334 tcp_finwait2_timeout :
1337 tcp_state_change(tp, TCPS_FIN_WAIT_2);
1342 * In CLOSING STATE in addition to the processing for
1343 * the ESTABLISHED state if the ACK acknowledges our FIN
1344 * then enter the TIME-WAIT state, otherwise ignore
1348 if (ourfinisacked) {
1349 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1351 INP_INFO_RUNLOCK(&V_tcbinfo);
1358 * In LAST_ACK, we may still be waiting for data to drain
1359 * and/or to be acked, as well as for the ack of our FIN.
1360 * If our FIN is now acknowledged, delete the TCB,
1361 * enter the closed state and return.
1364 if (ourfinisacked) {
1365 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1374 INP_WLOCK_ASSERT(tp->t_inpcb);
1377 * Update window information.
1378 * Don't look at window if no ACK: TAC's send garbage on first SYN.
1380 if ((thflags & TH_ACK) &&
1381 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
1382 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
1383 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
1384 /* keep track of pure window updates */
1386 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
1387 TCPSTAT_INC(tcps_rcvwinupd);
1388 tp->snd_wnd = tiwin;
1389 tp->snd_wl1 = th->th_seq;
1390 tp->snd_wl2 = th->th_ack;
1391 if (tp->snd_wnd > tp->max_sndwnd)
1392 tp->max_sndwnd = tp->snd_wnd;
1397 * Process segments with URG.
1399 if ((thflags & TH_URG) && th->th_urp &&
1400 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1402 * This is a kludge, but if we receive and accept
1403 * random urgent pointers, we'll crash in
1404 * soreceive. It's hard to imagine someone
1405 * actually wanting to send this much urgent data.
1407 SOCKBUF_LOCK(&so->so_rcv);
1408 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
1409 th->th_urp = 0; /* XXX */
1410 thflags &= ~TH_URG; /* XXX */
1411 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */
1412 goto dodata; /* XXX */
1415 * If this segment advances the known urgent pointer,
1416 * then mark the data stream. This should not happen
1417 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1418 * a FIN has been received from the remote side.
1419 * In these states we ignore the URG.
1421 * According to RFC961 (Assigned Protocols),
1422 * the urgent pointer points to the last octet
1423 * of urgent data. We continue, however,
1424 * to consider it to indicate the first octet
1425 * of data past the urgent section as the original
1426 * spec states (in one of two places).
1428 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
1429 tp->rcv_up = th->th_seq + th->th_urp;
1430 so->so_oobmark = sbavail(&so->so_rcv) +
1431 (tp->rcv_up - tp->rcv_nxt) - 1;
1432 if (so->so_oobmark == 0)
1433 so->so_rcv.sb_state |= SBS_RCVATMARK;
1435 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1437 SOCKBUF_UNLOCK(&so->so_rcv);
1439 * Remove out of band data so doesn't get presented to user.
1440 * This can happen independent of advancing the URG pointer,
1441 * but if two URG's are pending at once, some out-of-band
1442 * data may creep in... ick.
1444 if (th->th_urp <= (uint32_t)tlen &&
1445 !(so->so_options & SO_OOBINLINE)) {
1446 /* hdr drop is delayed */
1447 tcp_pulloutofband(so, th, m, drop_hdrlen);
1451 * If no out of band data is expected,
1452 * pull receive urgent pointer along
1453 * with the receive window.
1455 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
1456 tp->rcv_up = tp->rcv_nxt;
1459 INP_WLOCK_ASSERT(tp->t_inpcb);
1462 * Process the segment text, merging it into the TCP sequencing queue,
1463 * and arranging for acknowledgment of receipt if necessary.
1464 * This process logically involves adjusting tp->rcv_wnd as data
1465 * is presented to the user (this happens in tcp_usrreq.c,
1466 * case PRU_RCVD). If a FIN has already been received on this
1467 * connection then we just ignore the text.
1469 if ((tlen || (thflags & TH_FIN)) &&
1470 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1471 tcp_seq save_start = th->th_seq;
1472 m_adj(m, drop_hdrlen); /* delayed header drop */
1474 * Insert segment which includes th into TCP reassembly queue
1475 * with control block tp. Set thflags to whether reassembly now
1476 * includes a segment with FIN. This handles the common case
1477 * inline (segment is the next to be received on an established
1478 * connection, and the queue is empty), avoiding linkage into
1479 * and removal from the queue and repetition of various
1481 * Set DELACK for segments received in order, but ack
1482 * immediately when segments are out of order (so
1483 * fast retransmit can work).
1485 if (th->th_seq == tp->rcv_nxt &&
1486 LIST_EMPTY(&tp->t_segq) &&
1487 TCPS_HAVEESTABLISHED(tp->t_state)) {
1488 if (DELAY_ACK(tp, tlen))
1489 tp->t_flags |= TF_DELACK;
1491 tp->t_flags |= TF_ACKNOW;
1492 tp->rcv_nxt += tlen;
1493 thflags = th->th_flags & TH_FIN;
1494 TCPSTAT_INC(tcps_rcvpack);
1495 TCPSTAT_ADD(tcps_rcvbyte, tlen);
1496 SOCKBUF_LOCK(&so->so_rcv);
1497 if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
1500 sbappendstream_locked(&so->so_rcv, m, 0);
1501 /* NB: sorwakeup_locked() does an implicit unlock. */
1502 sorwakeup_locked(so);
1505 * XXX: Due to the header drop above "th" is
1506 * theoretically invalid by now. Fortunately
1507 * m_adj() doesn't actually free any mbufs
1508 * when trimming from the head.
1510 thflags = tcp_reass(tp, th, &tlen, m);
1511 tp->t_flags |= TF_ACKNOW;
1513 if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
1514 tcp_update_sack_list(tp, save_start, save_start + tlen);
1517 * Note the amount of data that peer has sent into
1518 * our window, in order to estimate the sender's
1522 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
1523 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
1525 len = so->so_rcv.sb_hiwat;
1533 * If FIN is received ACK the FIN and let the user know
1534 * that the connection is closing.
1536 if (thflags & TH_FIN) {
1537 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1540 * If connection is half-synchronized
1541 * (ie NEEDSYN flag on) then delay ACK,
1542 * so it may be piggybacked when SYN is sent.
1543 * Otherwise, since we received a FIN then no
1544 * more input can be expected, send ACK now.
1546 if (tp->t_flags & TF_NEEDSYN)
1547 tp->t_flags |= TF_DELACK;
1549 tp->t_flags |= TF_ACKNOW;
1552 switch (tp->t_state) {
1555 * In SYN_RECEIVED and ESTABLISHED STATES
1556 * enter the CLOSE_WAIT state.
1558 case TCPS_SYN_RECEIVED:
1559 tp->t_starttime = ticks;
1561 case TCPS_ESTABLISHED:
1562 tcp_state_change(tp, TCPS_CLOSE_WAIT);
1566 * If still in FIN_WAIT_1 STATE FIN has not been acked so
1567 * enter the CLOSING state.
1569 case TCPS_FIN_WAIT_1:
1570 tcp_state_change(tp, TCPS_CLOSING);
1574 * In FIN_WAIT_2 state enter the TIME_WAIT state,
1575 * starting the time-wait timer, turning off the other
1578 case TCPS_FIN_WAIT_2:
1579 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1580 KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata "
1581 "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
1585 INP_INFO_RUNLOCK(&V_tcbinfo);
1589 if (ti_locked == TI_RLOCKED) {
1590 INP_INFO_RUNLOCK(&V_tcbinfo);
1592 ti_locked = TI_UNLOCKED;
1595 if (so->so_options & SO_DEBUG)
1596 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
1599 TCP_PROBE3(debug__input, tp, th, m);
1602 * Return any desired output.
1604 if (needoutput || (tp->t_flags & TF_ACKNOW))
1605 (void) tp->t_fb->tfb_tcp_output(tp);
1607 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
1608 __func__, ti_locked));
1609 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1610 INP_WLOCK_ASSERT(tp->t_inpcb);
1612 if (tp->t_flags & TF_DELACK) {
1613 tp->t_flags &= ~TF_DELACK;
1614 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
1616 INP_WUNLOCK(tp->t_inpcb);
1621 * Generate an ACK dropping incoming segment if it occupies
1622 * sequence space, where the ACK reflects our state.
1624 * We can now skip the test for the RST flag since all
1625 * paths to this code happen after packets containing
1626 * RST have been dropped.
1628 * In the SYN-RECEIVED state, don't send an ACK unless the
1629 * segment we received passes the SYN-RECEIVED ACK test.
1630 * If it fails send a RST. This breaks the loop in the
1631 * "LAND" DoS attack, and also prevents an ACK storm
1632 * between two listening ports that have been sent forged
1633 * SYN segments, each with the source address of the other.
1635 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
1636 (SEQ_GT(tp->snd_una, th->th_ack) ||
1637 SEQ_GT(th->th_ack, tp->snd_max)) ) {
1638 rstreason = BANDLIM_RST_OPENPORT;
1642 if (so->so_options & SO_DEBUG)
1643 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
1646 TCP_PROBE3(debug__drop, tp, th, m);
1647 if (ti_locked == TI_RLOCKED) {
1648 INP_INFO_RUNLOCK(&V_tcbinfo);
1650 ti_locked = TI_UNLOCKED;
1652 tp->t_flags |= TF_ACKNOW;
1653 (void) tp->t_fb->tfb_tcp_output(tp);
1654 INP_WUNLOCK(tp->t_inpcb);
1659 if (ti_locked == TI_RLOCKED) {
1660 INP_INFO_RUNLOCK(&V_tcbinfo);
1662 ti_locked = TI_UNLOCKED;
1665 tcp_dropwithreset(m, th, tp, tlen, rstreason);
1666 INP_WUNLOCK(tp->t_inpcb);
1668 tcp_dropwithreset(m, th, NULL, tlen, rstreason);
1672 if (ti_locked == TI_RLOCKED) {
1673 INP_INFO_RUNLOCK(&V_tcbinfo);
1674 ti_locked = TI_UNLOCKED;
1678 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1682 * Drop space held by incoming segment and return.
1685 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
1686 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
1689 TCP_PROBE3(debug__drop, tp, th, m);
1691 INP_WUNLOCK(tp->t_inpcb);
1697 * tcp_do_segment_fastslow is a combination of the original
1698 * tcp_do_segment and a split fastpath: one function
1699 * for the fast-ack, which also allows the fastpath
1700 * for in-sequence acks that advance the window, and a
1701 * sub-function that handles the in-sequence data.
/*
 * Segment input routine used by the "fastslow" function block.
 *
 * Performs the checks common to every segment (tcbinfo/inpcb lock
 * assertions, optional SYN|FIN drop, SYN-SENT ACK validation, ECN
 * accounting, option parsing and timestamp sanity checks) and then
 * hands the segment to one of three handlers:
 *   - tcp_do_fastack()     zero-length segment that passes the header
 *                          prediction test, is not in recovery and
 *                          carries/has no SACK state,
 *   - tcp_do_fastnewdata() segment with data, th_ack == snd_una, and
 *                          the data fits in the receive socket buffer,
 *   - tcp_do_slowpath()    everything else.
 *
 * m           - mbuf holding the segment; m_pkthdr.lro_nsegs supplies
 *               the segment count (forced to at least 1).
 * th          - TCP header of the segment.
 * so          - socket of the connection.
 * tp          - TCP control block; its inpcb is asserted write-locked.
 * drop_hdrlen - handed through unchanged to the handlers.
 * tlen        - segment data length; zero selects the ack test,
 *               non-zero the data test.
 * iptos       - IP TOS byte; only the ECN bits are examined here.
 *
 * NOTE(review): the rest of the parameter list (ti_locked is used in
 * the body), the local declarations and several short statements are
 * not visible in this excerpt -- confirm details against the full
 * file.
 */
1704 tcp_do_segment_fastslow(struct mbuf *m, struct tcphdr *th, struct socket *so,
1705 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
1713 struct in_conninfo *inc;
1716 thflags = th->th_flags;
1717 inc = &tp->t_inpcb->inp_inc;
/* Count at least one segment even if lro_nsegs is 0. */
1718 nsegs = max(1, m->m_pkthdr.lro_nsegs);
1720 * If this is either a state-changing packet or current state isn't
1721 * established, we require a write lock on tcbinfo. Otherwise, we
1722 * allow the tcbinfo to be in either alocked or unlocked, as the
1723 * caller may have unnecessarily acquired a write lock due to a race.
1725 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
1726 tp->t_state != TCPS_ESTABLISHED) {
1727 KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
1728 "SYN/FIN/RST/!EST", __func__, ti_locked));
1729 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1732 if (ti_locked == TI_RLOCKED) {
1733 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1735 KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
1736 "ti_locked: %d", __func__, ti_locked));
1737 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1741 INP_WLOCK_ASSERT(tp->t_inpcb);
1742 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
1744 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
/*
 * Segments with both SYN and FIN set are ignored when V_drop_synfin
 * is enabled: log, release the tcbinfo read lock if held, and drop
 * the inpcb write lock.
 */
1747 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
1748 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
1749 log(LOG_DEBUG, "%s; %s: "
1750 "SYN|FIN segment ignored (based on "
1751 "sysctl setting)\n", s, __func__);
1754 if (ti_locked == TI_RLOCKED) {
1755 INP_INFO_RUNLOCK(&V_tcbinfo);
1757 INP_WUNLOCK(tp->t_inpcb);
1763 * If a segment with the ACK-bit set arrives in the SYN-SENT state
1764 * check SEQ.ACK first.
1766 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
1767 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
1768 tcp_dropwithreset(m, th, tp, tlen, BANDLIM_UNLIMITED);
1769 if (ti_locked == TI_RLOCKED) {
1770 INP_INFO_RUNLOCK(&V_tcbinfo);
1772 INP_WUNLOCK(tp->t_inpcb);
1776 tp->sackhint.last_sack_ack = 0;
1779 * Segment received on connection.
1780 * Reset idle time and keep-alive timer.
1781 * XXX: This should be done after segment
1782 * validation to ignore broken/spoofed segs.
1784 tp->t_rcvtime = ticks;
1787 * Unscale the window into a 32-bit value.
1788 * For the SYN_SENT state the scale is zero.
1790 tiwin = th->th_win << tp->snd_scale;
1793 * TCP ECN processing.
1795 if (tp->t_flags & TF_ECN_PERMIT) {
1796 if (thflags & TH_CWR)
1797 tp->t_flags &= ~TF_ECN_SND_ECE;
1798 switch (iptos & IPTOS_ECN_MASK) {
1800 tp->t_flags |= TF_ECN_SND_ECE;
1801 TCPSTAT_INC(tcps_ecn_ce);
1803 case IPTOS_ECN_ECT0:
1804 TCPSTAT_INC(tcps_ecn_ect0);
1806 case IPTOS_ECN_ECT1:
1807 TCPSTAT_INC(tcps_ecn_ect1);
1810 /* Congestion experienced. */
1811 if (thflags & TH_ECE) {
1812 cc_cong_signal(tp, th, CC_ECN);
1817 * Parse options on any incoming segment.
1819 tcp_dooptions(&to, (u_char *)(th + 1),
1820 (th->th_off << 2) - sizeof(struct tcphdr),
1821 (thflags & TH_SYN) ? TO_SYN : 0);
1824 * If echoed timestamp is later than the current time,
1825 * fall back to non RFC1323 RTT calculation. Normalize
1826 * timestamp if syncookies were used when this connection
1829 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
1830 to.to_tsecr -= tp->ts_offset;
1831 if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
1836 * Process options only when we get SYN/ACK back. The SYN case
1837 * for incoming connections is handled in tcp_syncache.
1838 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
1839 * or <SYN,ACK>) segment itself is never scaled.
1840 * XXX this is traditional behavior, may need to be cleaned up.
1842 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
1843 if ((to.to_flags & TOF_SCALE) &&
1844 (tp->t_flags & TF_REQ_SCALE)) {
1845 tp->t_flags |= TF_RCVD_SCALE;
1846 tp->snd_scale = to.to_wscale;
1849 * Initial send window. It will be updated with
1850 * the next incoming segment to the scaled value.
1852 tp->snd_wnd = th->th_win;
1853 if (to.to_flags & TOF_TS) {
1854 tp->t_flags |= TF_RCVD_TSTMP;
1855 tp->ts_recent = to.to_tsval;
1856 tp->ts_recent_age = tcp_ts_getticks();
1858 if (to.to_flags & TOF_MSS)
1859 tcp_mss(tp, to.to_mss);
1860 if ((tp->t_flags & TF_SACK_PERMIT) &&
1861 (to.to_flags & TOF_SACKPERM) == 0)
1862 tp->t_flags &= ~TF_SACK_PERMIT;
1866 * If timestamps were negotiated during SYN/ACK they should
1867 * appear on every segment during this session and vice versa.
1869 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
1870 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
1871 log(LOG_DEBUG, "%s; %s: Timestamp missing, "
1872 "no action\n", s, __func__);
1876 if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
1877 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
1878 log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
1879 "no action\n", s, __func__);
/*
 * Fast-path entry test. Zero-length segments qualify when the ack
 * advanced with a non-zero window, or when the ack is unchanged but
 * the window grew; segments with data qualify only when the window
 * is non-zero and unchanged.
 * NOTE(review): the statements recording the outcome of these tests
 * are not visible in this excerpt.
 */
1885 if (__predict_true((tlen == 0))) {
1887 * The ack moved forward and we have a window (non-zero)
1889 * The ack did not move forward, but the window increased.
1891 if (__predict_true((SEQ_GT(th->th_ack, tp->snd_una) && tiwin) ||
1892 ((th->th_ack == tp->snd_una) && tiwin && (tiwin > tp->snd_wnd)))) {
1897 * Data incoming, use the old entry criteria
1898 * for fast-path with data.
1900 if ((tiwin && tiwin == tp->snd_wnd)) {
1905 * Header prediction: check for the two common cases
1906 * of a uni-directional data xfer. If the packet has
1907 * no control flags, is in-sequence, the window didn't
1908 * change and we're not retransmitting, it's a
1909 * candidate. If the length is zero and the ack moved
1910 * forward, we're the sender side of the xfer. Just
1911 * free the data acked & wake any higher level process
1912 * that was blocked waiting for space. If the length
1913 * is non-zero and the ack didn't move, we're the
1914 * receiver side. If we're getting packets in-order
1915 * (the reassembly queue is empty), add the data to
1916 * the socket buffer and note that we need a delayed ack.
1917 * Make sure that the hidden state-flags are also off.
1918 * Since we check for TCPS_ESTABLISHED first, it can only
1921 if (__predict_true(tp->t_state == TCPS_ESTABLISHED &&
1922 th->th_seq == tp->rcv_nxt &&
1923 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
1924 tp->snd_nxt == tp->snd_max &&
1926 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
1927 LIST_EMPTY(&tp->t_segq) &&
1928 ((to.to_flags & TOF_TS) == 0 ||
1929 TSTMP_GEQ(to.to_tsval, tp->ts_recent)))) {
/* Sender side: pure ack, no recovery in progress and no SACK state. */
1930 if (__predict_true((tlen == 0) &&
1931 (SEQ_LEQ(th->th_ack, tp->snd_max) &&
1932 !IN_RECOVERY(tp->t_flags) &&
1933 (to.to_flags & TOF_SACK) == 0 &&
1934 TAILQ_EMPTY(&tp->snd_holes)))) {
1936 tcp_do_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
/* Receiver side: data, ack unchanged, room in the receive buffer. */
1939 } else if ((tlen) &&
1940 (th->th_ack == tp->snd_una &&
1941 tlen <= sbspace(&so->so_rcv))) {
1942 tcp_do_fastnewdata(m, th, so, tp, &to, drop_hdrlen, tlen,
/* Anything that did not qualify above is handled by the slow path. */
1948 tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
1949 ti_locked, tiwin, thflags);
1954 * This subfunction is used to try to highly optimize the
1955 * fast path. We again allow window updates that are
1956 * in sequence to remain in the fast-path. We also add
1957 * in the __predict's to attempt to help the compiler.
1958 * Note that if we return a 0, then we can *not* process
1959 * it and the caller should push the packet into the
/*
 * Fast-path handling of a zero-length ACK or in-sequence window update.
 *
 * A series of guard tests first rejects segments this path does not
 * handle: an ack at or behind snd_una, an ack equal to snd_una whose
 * window is not larger than snd_wnd, a zero window, an ack beyond
 * snd_max, a retransmission in progress (snd_nxt != snd_max), pending
 * TF_NEEDSYN/TF_NEEDFIN, a timestamp older than ts_recent, recovery
 * in progress, SACK information in the segment, or holes on the SACK
 * scoreboard.
 * NOTE(review): the bail-out statement of each guard is not visible in
 * this excerpt; the comment preceding this function says a return of 0
 * means the caller must use the slow path -- confirm.
 *
 * For an accepted segment the function records the new send window
 * and snd_wl1/snd_wl2, refreshes ts_recent, releases the tcbinfo read
 * lock if it was held, updates the RTT estimate, and -- unless the
 * segment is a window update only -- drops the acked bytes from the
 * send buffer, informs congestion control, advances snd_una and
 * re-arms or stops the retransmit timer. It then calls tcp_output()
 * if the send buffer still holds data, arms the delayed-ack timer if
 * TF_DELACK is set, and releases the inpcb write lock.
 *
 * m, th, so, tp - segment mbuf, its TCP header, the socket and the
 *                 TCP control block.
 * to            - parsed TCP options of the segment.
 * drop_hdrlen   - not referenced in the visible lines of this function.
 * tlen          - not referenced in the visible lines of this function.
 * ti_locked     - tcbinfo lock state on entry (TI_RLOCKED or
 *                 TI_UNLOCKED); always TI_UNLOCKED by the time
 *                 tcp_output() may be called.
 * tiwin         - window advertised by the segment; the caller has
 *                 already shifted it by snd_scale.
 */
1963 tcp_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
1964 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
1965 int ti_locked, uint32_t tiwin)
/* Count at least one segment even if lro_nsegs is 0. */
1971 nsegs = max(1, m->m_pkthdr.lro_nsegs);
1974 * The size of tcp_saveipgen must be the size of the max ip header,
1977 u_char tcp_saveipgen[IP6_HDR_LEN];
1978 struct tcphdr tcp_savetcp;
1983 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
1984 /* Old ack, behind (or duplicate to) the last one rcv'd */
1987 if (__predict_false(th->th_ack == tp->snd_una) &&
1988 __predict_false(tiwin <= tp->snd_wnd)) {
1989 /* duplicate ack <or> a shrinking dup ack with shrinking window */
1992 if (__predict_false(tiwin == 0)) {
1996 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
1997 /* Above what we have sent? */
2000 if (__predict_false(tp->snd_nxt != tp->snd_max)) {
2001 /* We are retransmitting */
2004 if (__predict_false(tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN))) {
2005 /* We need a SYN or a FIN, unlikely.. */
2008 if((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
2009 /* Timestamp is behind .. old ack with seq wrap? */
2012 if (__predict_false(IN_RECOVERY(tp->t_flags))) {
2013 /* Still recovering */
2016 if (__predict_false(to->to_flags & TOF_SACK)) {
2017 /* Sack included in the ack.. */
2020 if (!TAILQ_EMPTY(&tp->snd_holes)) {
2021 /* We have sack holes on our scoreboard */
2024 /* Ok if we reach here, we can process a fast-ack */
2026 /* Did the window get updated? */
2027 if (tiwin != tp->snd_wnd) {
2028 /* keep track of pure window updates */
2029 if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
2031 TCPSTAT_INC(tcps_rcvwinupd);
2033 tp->snd_wnd = tiwin;
2034 tp->snd_wl1 = th->th_seq;
2035 if (tp->snd_wnd > tp->max_sndwnd)
2036 tp->max_sndwnd = tp->snd_wnd;
2039 * Pull snd_wl2 up to prevent seq wrap relative
2042 tp->snd_wl2 = th->th_ack;
2044 * If last ACK falls within this segment's sequence numbers,
2045 * record the timestamp.
2046 * NOTE that the test is modified according to the latest
2047 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2049 if ((to->to_flags & TOF_TS) != 0 &&
2050 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
2051 tp->ts_recent_age = tcp_ts_getticks();
2052 tp->ts_recent = to->to_tsval;
2055 * This is a pure ack for outstanding data.
/* The tcbinfo lock is not needed past this point; release it if held. */
2057 if (ti_locked == TI_RLOCKED) {
2058 INP_INFO_RUNLOCK(&V_tcbinfo);
2060 ti_locked = TI_UNLOCKED;
2062 TCPSTAT_INC(tcps_predack);
2065 * "bad retransmit" recovery.
2067 if (tp->t_rxtshift == 1 &&
2068 tp->t_flags & TF_PREVVALID &&
2069 (int)(ticks - tp->t_badrxtwin) < 0) {
2070 cc_cong_signal(tp, th, CC_RTO_ERR);
2074 * Recalculate the transmit timer / rtt.
2076 * Some boxes send broken timestamp replies
2077 * during the SYN+ACK phase, ignore
2078 * timestamps of 0 or we could calculate a
2079 * huge RTT and blow up the retransmit timer.
2081 if ((to->to_flags & TOF_TS) != 0 &&
2085 t = tcp_ts_getticks() - to->to_tsecr;
2086 if (!tp->t_rttlow || tp->t_rttlow > t)
2089 TCP_TS_TO_TICKS(t) + 1);
2090 } else if (tp->t_rtttime &&
2091 SEQ_GT(th->th_ack, tp->t_rtseq)) {
2092 if (!tp->t_rttlow ||
2093 tp->t_rttlow > ticks - tp->t_rtttime)
2094 tp->t_rttlow = ticks - tp->t_rtttime;
2096 ticks - tp->t_rtttime);
/* Ack processing proper is skipped for a window-update-only segment. */
2098 if (winup_only == 0) {
2099 acked = BYTES_THIS_ACK(tp, th);
2102 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
2103 hhook_run_tcp_est_in(tp, th, to);
2106 TCPSTAT_ADD(tcps_rcvackbyte, acked);
2107 sbdrop(&so->so_snd, acked);
2108 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
2109 SEQ_LEQ(th->th_ack, tp->snd_recover))
2110 tp->snd_recover = th->th_ack - 1;
2113 * Let the congestion control algorithm update
2114 * congestion control related information. This
2115 * typically means increasing the congestion
2118 cc_ack_received(tp, th, nsegs, CC_ACK);
2120 tp->snd_una = th->th_ack;
2124 * If all outstanding data are acked, stop
2125 * retransmit timer, otherwise restart timer
2126 * using current (possibly backed-off) value.
2127 * If process is waiting for space,
2128 * wakeup/selwakeup/signal. If data
2129 * are ready to send, let tcp_output
2130 * decide between more output or persist.
2133 if (so->so_options & SO_DEBUG)
2134 tcp_trace(TA_INPUT, ostate, tp,
2135 (void *)tcp_saveipgen,
2138 TCP_PROBE3(debug__input, tp, th, m);
2140 if (tp->snd_una == tp->snd_max)
2141 tcp_timer_activate(tp, TT_REXMT, 0);
2142 else if (!tcp_timer_active(tp, TT_PERSIST))
2143 tcp_timer_activate(tp, TT_REXMT,
2145 /* Wake up the socket if we have room to write more */
2149 * Window update only, just free the mbufs and
2150 * send out whatever we can.
/*
 * NOTE(review): this calls tcp_output() directly, whereas the slow
 * path goes through tp->t_fb->tfb_tcp_output. Both function blocks
 * in this file set tfb_tcp_output to tcp_output.
 */
2154 if (sbavail(&so->so_snd))
2155 (void) tcp_output(tp);
2156 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
2157 __func__, ti_locked));
2158 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
2159 INP_WLOCK_ASSERT(tp->t_inpcb);
/* Convert a pending TF_DELACK request into a running delayed-ack timer. */
2161 if (tp->t_flags & TF_DELACK) {
2162 tp->t_flags &= ~TF_DELACK;
2163 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
2165 INP_WUNLOCK(tp->t_inpcb);
2170 * This tcp-do-segment concentrates on making the fastest
2171 * ack processing path. It does not have a fast-path for
2172 * data (it possibly could which would then eliminate the
2173 * need for fast-slow above). For a content distributor having
2174 * large outgoing elephants and very very little coming in
2175 * having no fastpath for data does not really help (since you
2176 * don't get much data in). The most important thing is
2177 * processing ack's quickly and getting the rest of the data
2178 * output to the peer as quickly as possible. This routine
2179 * seems to be about an overall 3% faster than the old
2180 * tcp_do_segment and keeps us in the fast-path for packets
2181 * much more (by allowing window updates to also stay in the fastpath).
/*
 * Segment input routine used by the "fastack" function block.
 *
 * Performs the checks common to every segment (tcbinfo/inpcb lock
 * assertions, optional SYN|FIN drop, SYN-SENT ACK validation, ECN
 * accounting, option parsing and timestamp sanity checks). A segment
 * is then offered to tcp_fastack() when the connection is
 * ESTABLISHED, the segment carries no SACK option, has no data, has
 * only the ACK flag set among SYN/FIN/RST/URG/ACK, the reassembly
 * queue is empty and th_seq equals rcv_nxt. All other segments go to
 * tcp_do_slowpath().
 * NOTE(review): what happens on each tcp_fastack() result is not
 * visible in this excerpt; presumably a non-zero result means the
 * segment was consumed and zero falls through to the slow path --
 * confirm.
 *
 * m           - mbuf holding the segment.
 * th          - TCP header of the segment.
 * so          - socket of the connection.
 * tp          - TCP control block; its inpcb is asserted write-locked.
 * drop_hdrlen - handed through unchanged to the handlers.
 * tlen        - segment data length; must be zero for the fast path.
 * iptos       - IP TOS byte; only the ECN bits are examined here.
 *
 * NOTE(review): the rest of the parameter list (ti_locked is used in
 * the body) and the local declarations are not visible in this
 * excerpt.
 */
2184 tcp_do_segment_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
2185 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
2191 struct in_conninfo *inc;
2194 thflags = th->th_flags;
2195 inc = &tp->t_inpcb->inp_inc;
2197 * If this is either a state-changing packet or current state isn't
2198 * established, we require a write lock on tcbinfo. Otherwise, we
2199 * allow the tcbinfo to be in either alocked or unlocked, as the
2200 * caller may have unnecessarily acquired a write lock due to a race.
2202 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
2203 tp->t_state != TCPS_ESTABLISHED) {
2204 KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
2205 "SYN/FIN/RST/!EST", __func__, ti_locked));
2206 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
2209 if (ti_locked == TI_RLOCKED) {
2210 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
2212 KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
2213 "ti_locked: %d", __func__, ti_locked));
2214 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
2218 INP_WLOCK_ASSERT(tp->t_inpcb);
2219 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
2221 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
/*
 * Segments with both SYN and FIN set are ignored when V_drop_synfin
 * is enabled: log, release the tcbinfo read lock if held, and drop
 * the inpcb write lock.
 */
2224 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
2225 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
2226 log(LOG_DEBUG, "%s; %s: "
2227 "SYN|FIN segment ignored (based on "
2228 "sysctl setting)\n", s, __func__);
2231 if (ti_locked == TI_RLOCKED) {
2232 INP_INFO_RUNLOCK(&V_tcbinfo);
2234 INP_WUNLOCK(tp->t_inpcb);
2240 * If a segment with the ACK-bit set arrives in the SYN-SENT state
2241 * check SEQ.ACK first.
2243 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
2244 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
2245 tcp_dropwithreset(m, th, tp, tlen, BANDLIM_UNLIMITED);
2246 if (ti_locked == TI_RLOCKED) {
2247 INP_INFO_RUNLOCK(&V_tcbinfo);
2249 INP_WUNLOCK(tp->t_inpcb);
2253 tp->sackhint.last_sack_ack = 0;
2256 * Segment received on connection.
2257 * Reset idle time and keep-alive timer.
2258 * XXX: This should be done after segment
2259 * validation to ignore broken/spoofed segs.
2261 tp->t_rcvtime = ticks;
2264 * Unscale the window into a 32-bit value.
2265 * For the SYN_SENT state the scale is zero.
2267 tiwin = th->th_win << tp->snd_scale;
2270 * TCP ECN processing.
2272 if (tp->t_flags & TF_ECN_PERMIT) {
2273 if (thflags & TH_CWR)
2274 tp->t_flags &= ~TF_ECN_SND_ECE;
2275 switch (iptos & IPTOS_ECN_MASK) {
2277 tp->t_flags |= TF_ECN_SND_ECE;
2278 TCPSTAT_INC(tcps_ecn_ce);
2280 case IPTOS_ECN_ECT0:
2281 TCPSTAT_INC(tcps_ecn_ect0);
2283 case IPTOS_ECN_ECT1:
2284 TCPSTAT_INC(tcps_ecn_ect1);
2287 /* Congestion experienced. */
2288 if (thflags & TH_ECE) {
2289 cc_cong_signal(tp, th, CC_ECN);
2294 * Parse options on any incoming segment.
2296 tcp_dooptions(&to, (u_char *)(th + 1),
2297 (th->th_off << 2) - sizeof(struct tcphdr),
2298 (thflags & TH_SYN) ? TO_SYN : 0);
2301 * If echoed timestamp is later than the current time,
2302 * fall back to non RFC1323 RTT calculation. Normalize
2303 * timestamp if syncookies were used when this connection
2306 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
2307 to.to_tsecr -= tp->ts_offset;
2308 if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
2313 * Process options only when we get SYN/ACK back. The SYN case
2314 * for incoming connections is handled in tcp_syncache.
2315 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
2316 * or <SYN,ACK>) segment itself is never scaled.
2317 * XXX this is traditional behavior, may need to be cleaned up.
2319 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
2320 if ((to.to_flags & TOF_SCALE) &&
2321 (tp->t_flags & TF_REQ_SCALE)) {
2322 tp->t_flags |= TF_RCVD_SCALE;
2323 tp->snd_scale = to.to_wscale;
2326 * Initial send window. It will be updated with
2327 * the next incoming segment to the scaled value.
2329 tp->snd_wnd = th->th_win;
2330 if (to.to_flags & TOF_TS) {
2331 tp->t_flags |= TF_RCVD_TSTMP;
2332 tp->ts_recent = to.to_tsval;
2333 tp->ts_recent_age = tcp_ts_getticks();
2335 if (to.to_flags & TOF_MSS)
2336 tcp_mss(tp, to.to_mss);
2337 if ((tp->t_flags & TF_SACK_PERMIT) &&
2338 (to.to_flags & TOF_SACKPERM) == 0)
2339 tp->t_flags &= ~TF_SACK_PERMIT;
2343 * If timestamps were negotiated during SYN/ACK they should
2344 * appear on every segment during this session and vice versa.
2346 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
2347 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
2348 log(LOG_DEBUG, "%s; %s: Timestamp missing, "
2349 "no action\n", s, __func__);
2353 if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
2354 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
2355 log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
2356 "no action\n", s, __func__);
2362 * Header prediction: check for the two common cases
2363 * of a uni-directional data xfer. If the packet has
2364 * no control flags, is in-sequence, the window didn't
2365 * change and we're not retransmitting, it's a
2366 * candidate. If the length is zero and the ack moved
2367 * forward, we're the sender side of the xfer. Just
2368 * free the data acked & wake any higher level process
2369 * that was blocked waiting for space. If the length
2370 * is non-zero and the ack didn't move, we're the
2371 * receiver side. If we're getting packets in-order
2372 * (the reassembly queue is empty), add the data to
2373 * the socket buffer and note that we need a delayed ack.
2374 * Make sure that the hidden state-flags are also off.
2375 * Since we check for TCPS_ESTABLISHED first, it can only
/*
 * Only the sender-side (pure ack) case is tried on the fast path in
 * this routine; the remaining guard tests live in tcp_fastack().
 */
2378 if (__predict_true(tp->t_state == TCPS_ESTABLISHED) &&
2379 __predict_true(((to.to_flags & TOF_SACK) == 0)) &&
2380 __predict_true(tlen == 0) &&
2381 __predict_true((thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK) &&
2382 __predict_true(LIST_EMPTY(&tp->t_segq)) &&
2383 __predict_true(th->th_seq == tp->rcv_nxt)) {
2384 if (tcp_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
2385 ti_locked, tiwin)) {
/* Anything the fast path did not take is handled by the slow path. */
2389 tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
2390 ti_locked, tiwin, thflags);
2393 struct tcp_function_block __tcp_fastslow = {
2394 .tfb_tcp_block_name = "fastslow",
2395 .tfb_tcp_output = tcp_output,
2396 .tfb_tcp_do_segment = tcp_do_segment_fastslow,
2397 .tfb_tcp_ctloutput = tcp_default_ctloutput,
2400 struct tcp_function_block __tcp_fastack = {
2401 .tfb_tcp_block_name = "fastack",
2402 .tfb_tcp_output = tcp_output,
2403 .tfb_tcp_do_segment = tcp_do_segment_fastack,
2404 .tfb_tcp_ctloutput = tcp_default_ctloutput
2408 tcp_addfastpaths(module_t mod, int type, void *data)
2414 err = register_tcp_functions(&__tcp_fastack, M_WAITOK);
2416 printf("Failed to register fastack module -- err:%d\n", err);
2419 err = register_tcp_functions(&__tcp_fastslow, M_WAITOK);
2421 printf("Failed to register fastslow module -- err:%d\n", err);
2422 deregister_tcp_functions(&__tcp_fastack);
2427 if ((__tcp_fastslow.tfb_refcnt) ||( __tcp_fastack.tfb_refcnt)) {
2432 err = deregister_tcp_functions(&__tcp_fastack);
2435 err = deregister_tcp_functions(&__tcp_fastslow);
2441 return (EOPNOTSUPP);
2446 static moduledata_t new_tcp_fastpaths = {
2447 .name = "tcp_fastpaths",
2448 .evhand = tcp_addfastpaths,
2452 MODULE_VERSION(kern_tcpfastpaths, 1);
2453 DECLARE_MODULE(kern_tcpfastpaths, new_tcp_fastpaths, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);