2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
3 * The Regents of the University of California. All rights reserved.
4 * Copyright (c) 2007-2008,2010
5 * Swinburne University of Technology, Melbourne, Australia.
6 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
7 * Copyright (c) 2010 The FreeBSD Foundation
8 * Copyright (c) 2010-2011 Juniper Networks, Inc.
9 * Copyright (c) 2015 Netflix Inc.
10 * All rights reserved.
12 * Portions of this software were developed at the Centre for Advanced Internet
13 * Architectures, Swinburne University of Technology, by Lawrence Stewart,
14 * James Healy and David Hayes, made possible in part by a grant from the Cisco
15 * University Research Program Fund at Community Foundation Silicon Valley.
17 * Portions of this software were developed at the Centre for Advanced
18 * Internet Architectures, Swinburne University of Technology, Melbourne,
19 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
21 * Portions of this software were developed by Robert N. M. Watson under
22 * contract to Juniper Networks, Inc.
24 * Portions of this software were developed by Randall R. Stewart while
25 * working for Netflix Inc.
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 * 3. Neither the name of the University nor the names of its contributors
36 * may be used to endorse or promote products derived from this software
37 * without specific prior written permission.
39 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
40 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
42 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
43 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
44 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
45 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
46 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
47 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
48 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
51 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
54 #include <sys/cdefs.h>
55 __FBSDID("$FreeBSD$");
58 #include "opt_inet6.h"
59 #include "opt_tcpdebug.h"
61 #include <sys/param.h>
63 #include <sys/module.h>
64 #include <sys/mutex.h>
65 #include <sys/kernel.h>
67 #include <sys/hhook.h>
69 #include <sys/malloc.h>
71 #include <sys/proc.h> /* for proc0 declaration */
72 #include <sys/protosw.h>
74 #include <sys/signalvar.h>
75 #include <sys/socket.h>
76 #include <sys/socketvar.h>
77 #include <sys/sysctl.h>
78 #include <sys/syslog.h>
79 #include <sys/systm.h>
81 #include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
85 #include <net/route.h>
88 #define TCPSTATES /* for logging */
90 #include <netinet/in.h>
91 #include <netinet/in_kdtrace.h>
92 #include <netinet/in_pcb.h>
93 #include <netinet/in_systm.h>
94 #include <netinet/ip.h>
95 #include <netinet/ip_icmp.h> /* required for icmp_var.h */
96 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
97 #include <netinet/ip_var.h>
98 #include <netinet/ip_options.h>
99 #include <netinet/ip6.h>
100 #include <netinet/icmp6.h>
101 #include <netinet6/in6_pcb.h>
102 #include <netinet6/ip6_var.h>
103 #include <netinet/tcp.h>
104 #include <netinet/tcp_fsm.h>
105 #include <netinet/tcp_seq.h>
106 #include <netinet/tcp_timer.h>
107 #include <netinet/tcp_var.h>
108 #include <netinet6/tcp6_var.h>
109 #include <netinet/tcpip.h>
110 #include <netinet/tcp_syncache.h>
111 #include <netinet/cc/cc.h>
113 #include <netinet/tcp_debug.h>
114 #endif /* TCPDEBUG */
116 #include <netinet/tcp_offload.h>
119 #include <machine/in_cksum.h>
121 #include <security/mac/mac_framework.h>
123 static void tcp_do_segment_fastslow(struct mbuf *, struct tcphdr *,
124 struct socket *, struct tcpcb *, int, int, uint8_t,
127 static void tcp_do_segment_fastack(struct mbuf *, struct tcphdr *,
128 struct socket *, struct tcpcb *, int, int, uint8_t,
132 * Indicate whether this ack should be delayed. We can delay the ack if
133 * following conditions are met:
134 * - There is no delayed ack timer in progress.
135 * - Our last ack wasn't a 0-sized window. We never want to delay
136 * the ack that opens up a 0-sized window.
137 * - LRO wasn't used for this segment. We make sure by checking that the
138 * segment size is not larger than the MSS.
/*
 * NOTE(review): per the last clause, an ACK is also delayed when
 * TF_NEEDSYN is set even if the delayed-ACK sysctl
 * (V_tcp_delack_enabled) is off.  All four conditions must hold
 * for the caller to defer the ACK instead of sending it now.
 */
140 #define DELAY_ACK(tp, tlen) \
141 ((!tcp_timer_active(tp, TT_DELACK) && \
142 (tp->t_flags & TF_RXWIN0SENT) == 0) && \
143 (tlen <= tp->t_maxseg) && \
144 (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
147 * So how is this faster than the normal fast ack?
148 * It basically allows us to also stay in the fastpath
149 * when a window-update ack also arrives. In testing
150 * we saw only 25-30% of connections doing fastpath
151 * due to the fact that along with moving forward
152 * in sequence the window was also updated.
/*
 * Fast path for a pure ACK (optionally carrying a window update):
 * record the new window, advance snd_una, drop the acknowledged data
 * from the send buffer, feed the ACK to the congestion-control module,
 * update the RTT estimate, and restart/stop the retransmit timer.
 *
 * NOTE(review): this extract is gapped — local declarations, several
 * braces and statements from the original file are missing between
 * visible lines.  Comments below describe only what the visible code
 * shows; do not treat the block as compilable as-is.
 */
155 tcp_do_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
156 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
157 int ti_locked, uint32_t tiwin)
/* LRO may have coalesced several wire segments into this one mbuf. */
163 nsegs = max(1, m->m_pkthdr.lro_nsegs);
166 * The size of tcp_saveipgen must be the size of the max ip header,
169 u_char tcp_saveipgen[IP6_HDR_LEN];
170 struct tcphdr tcp_savetcp;
174 * The following if statement will be true if
175 * we are doing the win_up_in_fp <and>
176 * - We have more new data (SEQ_LT(tp->snd_wl1, th->th_seq)) <or>
177 * - No more new data, but we have an ack for new data
178 * (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack))
179 * - No more new data, the same ack point but the window grew
180 * (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack && twin > tp->snd_wnd)
182 if ((SEQ_LT(tp->snd_wl1, th->th_seq) ||
183 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
184 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
185 /* keep track of pure window updates */
186 if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
188 TCPSTAT_INC(tcps_rcvwinupd);
/* Remember where the window was last updated from. */
191 tp->snd_wl1 = th->th_seq;
192 tp->snd_wl2 = th->th_ack;
193 if (tp->snd_wnd > tp->max_sndwnd)
194 tp->max_sndwnd = tp->snd_wnd;
197 * If last ACK falls within this segment's sequence numbers,
198 * record the timestamp.
199 * NOTE that the test is modified according to the latest
200 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
202 if ((to->to_flags & TOF_TS) != 0 &&
203 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
204 tp->ts_recent_age = tcp_ts_getticks();
205 tp->ts_recent = to->to_tsval;
208 * This is a pure ack for outstanding data.
/*
 * A pure ACK no longer needs the global tcbinfo read lock;
 * only the per-connection inpcb write lock is held from here.
 */
210 if (ti_locked == TI_RLOCKED) {
211 INP_INFO_RUNLOCK(&V_tcbinfo);
213 ti_locked = TI_UNLOCKED;
215 TCPSTAT_INC(tcps_predack);
218 * "bad retransmit" recovery.
/*
 * First retransmit and the ACK arrived inside the "bad
 * retransmit" window: tell CC the RTO was spurious.
 */
220 if (tp->t_rxtshift == 1 &&
221 tp->t_flags & TF_PREVVALID &&
222 (int)(ticks - tp->t_badrxtwin) < 0) {
223 cc_cong_signal(tp, th, CC_RTO_ERR);
227 * Recalculate the transmit timer / rtt.
229 * Some boxes send broken timestamp replies
230 * during the SYN+ACK phase, ignore
231 * timestamps of 0 or we could calculate a
232 * huge RTT and blow up the retransmit timer.
234 if ((to->to_flags & TOF_TS) != 0 &&
238 t = tcp_ts_getticks() - to->to_tsecr;
239 if (!tp->t_rttlow || tp->t_rttlow > t)
242 TCP_TS_TO_TICKS(t) + 1);
243 } else if (tp->t_rtttime &&
244 SEQ_GT(th->th_ack, tp->t_rtseq)) {
246 tp->t_rttlow > ticks - tp->t_rtttime)
247 tp->t_rttlow = ticks - tp->t_rtttime;
249 ticks - tp->t_rtttime);
/*
 * winup_only == 0 means this ACK acknowledged new data, so do
 * the ack bookkeeping.  (winup_only's assignment is not visible
 * in this extract — presumably set in the window-update branch
 * above; confirm against the full file.)
 */
251 if (winup_only == 0) {
252 acked = BYTES_THIS_ACK(tp, th);
255 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
256 hhook_run_tcp_est_in(tp, th, to);
259 TCPSTAT_ADD(tcps_rcvackbyte, acked);
/* Release the acknowledged bytes from the send buffer. */
260 sbdrop(&so->so_snd, acked);
261 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
262 SEQ_LEQ(th->th_ack, tp->snd_recover))
263 tp->snd_recover = th->th_ack - 1;
266 * Let the congestion control algorithm update
267 * congestion control related information. This
268 * typically means increasing the congestion
271 cc_ack_received(tp, th, nsegs, CC_ACK);
273 tp->snd_una = th->th_ack;
275 * Pull snd_wl2 up to prevent seq wrap relative
278 tp->snd_wl2 = th->th_ack;
282 * If all outstanding data are acked, stop
283 * retransmit timer, otherwise restart timer
284 * using current (possibly backed-off) value.
285 * If process is waiting for space,
286 * wakeup/selwakeup/signal. If data
287 * are ready to send, let tcp_output
288 * decide between more output or persist.
291 if (so->so_options & SO_DEBUG)
292 tcp_trace(TA_INPUT, ostate, tp,
293 (void *)tcp_saveipgen,
296 TCP_PROBE3(debug__input, tp, th, m);
298 if (tp->snd_una == tp->snd_max)
299 tcp_timer_activate(tp, TT_REXMT, 0);
300 else if (!tcp_timer_active(tp, TT_PERSIST))
301 tcp_timer_activate(tp, TT_REXMT,
305 * Window update only, just free the mbufs and
306 * send out whatever we can.
311 if (sbavail(&so->so_snd))
312 (void) tcp_output(tp);
/* By here the tcbinfo lock must be dropped; assert the lock state. */
313 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
314 __func__, ti_locked));
315 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
316 INP_WLOCK_ASSERT(tp->t_inpcb);
/* Arm the delayed-ACK timer if one was requested above. */
318 if (tp->t_flags & TF_DELACK) {
319 tp->t_flags &= ~TF_DELACK;
320 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
322 INP_WUNLOCK(tp->t_inpcb);
326 * Here nothing is really faster, its just that we
327 * have broken out the fast-data path also just like
/*
 * Fast path for a pure, in-sequence data segment with an empty
 * reassembly queue: append the payload to the receive socket buffer,
 * wake the reader, and schedule an (optionally delayed) ACK.
 *
 * NOTE(review): this extract is gapped — local declarations, braces
 * and statements from the original file are missing between visible
 * lines.  Comments describe only what the visible code shows.
 */
331 tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
332 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
333 int ti_locked, uint32_t tiwin)
335 int newsize = 0; /* automatic sockbuf scaling */
338 * The size of tcp_saveipgen must be the size of the max ip header,
341 u_char tcp_saveipgen[IP6_HDR_LEN];
342 struct tcphdr tcp_savetcp;
346 * If last ACK falls within this segment's sequence numbers,
347 * record the timestamp.
348 * NOTE that the test is modified according to the latest
349 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
351 if ((to->to_flags & TOF_TS) != 0 &&
352 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
353 tp->ts_recent_age = tcp_ts_getticks();
354 tp->ts_recent = to->to_tsval;
358 * This is a pure, in-sequence data packet with
359 * nothing on the reassembly queue and we have enough
360 * buffer space to take it.
/* In-sequence data needs only the inpcb lock; drop tcbinfo early. */
362 if (ti_locked == TI_RLOCKED) {
363 INP_INFO_RUNLOCK(&V_tcbinfo);
365 ti_locked = TI_UNLOCKED;
367 /* Clean receiver SACK report if present */
368 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
369 tcp_clean_sackreport(tp);
370 TCPSTAT_INC(tcps_preddat);
373 * Pull snd_wl1 up to prevent seq wrap relative to
376 tp->snd_wl1 = th->th_seq;
378 * Pull rcv_up up to prevent seq wrap relative to
381 tp->rcv_up = tp->rcv_nxt;
382 TCPSTAT_ADD(tcps_rcvbyte, tlen);
384 if (so->so_options & SO_DEBUG)
385 tcp_trace(TA_INPUT, ostate, tp,
386 (void *)tcp_saveipgen, &tcp_savetcp, 0);
388 TCP_PROBE3(debug__input, tp, th, m);
/* Ask the auto-tuner whether the receive buffer should grow. */
390 newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
392 /* Add data to socket buffer. */
393 SOCKBUF_LOCK(&so->so_rcv);
/*
 * Receive side already shut down — the handling of this case is
 * not visible in this extract (presumably the mbuf is freed);
 * confirm against the full file.
 */
394 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
398 * Set new socket buffer size.
399 * Give up when limit is reached.
402 if (!sbreserve_locked(&so->so_rcv,
404 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
405 m_adj(m, drop_hdrlen); /* delayed header drop */
406 sbappendstream_locked(&so->so_rcv, m, 0);
408 /* NB: sorwakeup_locked() does an implicit unlock. */
409 sorwakeup_locked(so);
/* Either defer the ACK or mark it for immediate transmission. */
410 if (DELAY_ACK(tp, tlen)) {
411 tp->t_flags |= TF_DELACK;
413 tp->t_flags |= TF_ACKNOW;
416 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
417 __func__, ti_locked));
418 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
419 INP_WLOCK_ASSERT(tp->t_inpcb);
/* Arm the delayed-ACK timer if one was requested above. */
421 if (tp->t_flags & TF_DELACK) {
422 tp->t_flags &= ~TF_DELACK;
423 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
425 INP_WUNLOCK(tp->t_inpcb);
429 * The slow-path is the clone of the long long part
430 * of tcp_do_segment past all the fast-path stuff. We
431 * use it here by two different callers, the fast/slow and
435 tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so,
436 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
437 int ti_locked, uint32_t tiwin, int thflags)
439 int acked, ourfinisacked, needoutput = 0;
440 int rstreason, todrop, win;
443 struct in_conninfo *inc;
444 struct mbuf *mfree = NULL;
446 nsegs = max(1, m->m_pkthdr.lro_nsegs);
449 * The size of tcp_saveipgen must be the size of the max ip header,
452 u_char tcp_saveipgen[IP6_HDR_LEN];
453 struct tcphdr tcp_savetcp;
457 * Calculate amount of space in receive window,
458 * and then do TCP input processing.
459 * Receive window is amount of space in rcv queue,
460 * but not less than advertised window.
462 inc = &tp->t_inpcb->inp_inc;
463 win = sbspace(&so->so_rcv);
466 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
468 switch (tp->t_state) {
471 * If the state is SYN_RECEIVED:
472 * if seg contains an ACK, but not for our SYN/ACK, send a RST.
474 case TCPS_SYN_RECEIVED:
475 if ((thflags & TH_ACK) &&
476 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
477 SEQ_GT(th->th_ack, tp->snd_max))) {
478 rstreason = BANDLIM_RST_OPENPORT;
484 * If the state is SYN_SENT:
485 * if seg contains a RST with valid ACK (SEQ.ACK has already
486 * been verified), then drop the connection.
487 * if seg contains a RST without an ACK, drop the seg.
488 * if seg does not contain SYN, then drop the seg.
489 * Otherwise this is an acceptable SYN segment
490 * initialize tp->rcv_nxt and tp->irs
491 * if seg contains ack then advance tp->snd_una
492 * if seg contains an ECE and ECN support is enabled, the stream
494 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
495 * arrange for segment to be acked (eventually)
496 * continue processing rest of data/controls, beginning with URG
499 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
500 TCP_PROBE5(connect__refused, NULL, tp, m, tp, th);
501 tp = tcp_drop(tp, ECONNREFUSED);
503 if (thflags & TH_RST)
505 if (!(thflags & TH_SYN))
508 tp->irs = th->th_seq;
510 if (thflags & TH_ACK) {
511 TCPSTAT_INC(tcps_connects);
514 mac_socketpeer_set_from_mbuf(m, so);
516 /* Do window scaling on this connection? */
517 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
518 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
519 tp->rcv_scale = tp->request_r_scale;
521 tp->rcv_adv += min(tp->rcv_wnd,
522 TCP_MAXWIN << tp->rcv_scale);
523 tp->snd_una++; /* SYN is acked */
525 * If there's data, delay ACK; if there's also a FIN
526 * ACKNOW will be turned on later.
528 if (DELAY_ACK(tp, tlen) && tlen != 0)
529 tcp_timer_activate(tp, TT_DELACK,
532 tp->t_flags |= TF_ACKNOW;
534 if ((thflags & TH_ECE) && V_tcp_do_ecn) {
535 tp->t_flags |= TF_ECN_PERMIT;
536 TCPSTAT_INC(tcps_ecn_shs);
540 * Received <SYN,ACK> in SYN_SENT[*] state.
542 * SYN_SENT --> ESTABLISHED
543 * SYN_SENT* --> FIN_WAIT_1
545 tp->t_starttime = ticks;
546 if (tp->t_flags & TF_NEEDFIN) {
547 tcp_state_change(tp, TCPS_FIN_WAIT_1);
548 tp->t_flags &= ~TF_NEEDFIN;
551 tcp_state_change(tp, TCPS_ESTABLISHED);
552 TCP_PROBE5(connect__established, NULL, tp,
555 tcp_timer_activate(tp, TT_KEEP,
560 * Received initial SYN in SYN-SENT[*] state =>
562 * If it succeeds, connection is * half-synchronized.
563 * Otherwise, do 3-way handshake:
564 * SYN-SENT -> SYN-RECEIVED
565 * SYN-SENT* -> SYN-RECEIVED*
567 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
568 tcp_timer_activate(tp, TT_REXMT, 0);
569 tcp_state_change(tp, TCPS_SYN_RECEIVED);
572 KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: "
573 "ti_locked %d", __func__, ti_locked));
574 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
575 INP_WLOCK_ASSERT(tp->t_inpcb);
578 * Advance th->th_seq to correspond to first data byte.
579 * If data, trim to stay within window,
580 * dropping FIN if necessary.
583 if (tlen > tp->rcv_wnd) {
584 todrop = tlen - tp->rcv_wnd;
588 TCPSTAT_INC(tcps_rcvpackafterwin);
589 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
591 tp->snd_wl1 = th->th_seq - 1;
592 tp->rcv_up = th->th_seq;
594 * Client side of transaction: already sent SYN and data.
595 * If the remote host used T/TCP to validate the SYN,
596 * our data will be ACK'd; if so, enter normal data segment
597 * processing in the middle of step 5, ack processing.
598 * Otherwise, goto step 6.
600 if (thflags & TH_ACK)
606 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
607 * do normal processing.
609 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
613 break; /* continue normal processing */
617 * States other than LISTEN or SYN_SENT.
618 * First check the RST flag and sequence number since reset segments
619 * are exempt from the timestamp and connection count tests. This
620 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
621 * below which allowed reset segments in half the sequence space
622 * to fall though and be processed (which gives forged reset
623 * segments with a random sequence number a 50 percent chance of
624 * killing a connection).
625 * Then check timestamp, if present.
626 * Then check the connection count, if present.
627 * Then check that at least some bytes of segment are within
628 * receive window. If segment begins before rcv_nxt,
629 * drop leading data (and SYN); if nothing left, just ack.
631 if (thflags & TH_RST) {
633 * RFC5961 Section 3.2
635 * - RST drops connection only if SEG.SEQ == RCV.NXT.
636 * - If RST is in window, we send challenge ACK.
638 * Note: to take into account delayed ACKs, we should
639 * test against last_ack_sent instead of rcv_nxt.
640 * Note 2: we handle special case of closed window, not
641 * covered by the RFC.
643 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
644 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
645 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
646 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
647 KASSERT(ti_locked == TI_RLOCKED,
648 ("%s: TH_RST ti_locked %d, th %p tp %p",
649 __func__, ti_locked, th, tp));
650 KASSERT(tp->t_state != TCPS_SYN_SENT,
651 ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
654 if (V_tcp_insecure_rst ||
655 tp->last_ack_sent == th->th_seq) {
656 TCPSTAT_INC(tcps_drops);
657 /* Drop the connection. */
658 switch (tp->t_state) {
659 case TCPS_SYN_RECEIVED:
660 so->so_error = ECONNREFUSED;
662 case TCPS_ESTABLISHED:
663 case TCPS_FIN_WAIT_1:
664 case TCPS_FIN_WAIT_2:
665 case TCPS_CLOSE_WAIT:
668 so->so_error = ECONNRESET;
675 TCPSTAT_INC(tcps_badrst);
676 /* Send challenge ACK. */
677 tcp_respond(tp, mtod(m, void *), th, m,
678 tp->rcv_nxt, tp->snd_nxt, TH_ACK);
679 tp->last_ack_sent = tp->rcv_nxt;
687 * RFC5961 Section 4.2
688 * Send challenge ACK for any SYN in synchronized state.
690 if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) {
691 KASSERT(ti_locked == TI_RLOCKED,
692 ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
693 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
695 TCPSTAT_INC(tcps_badsyn);
696 if (V_tcp_insecure_syn &&
697 SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
698 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
699 tp = tcp_drop(tp, ECONNRESET);
700 rstreason = BANDLIM_UNLIMITED;
702 /* Send challenge ACK. */
703 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
704 tp->snd_nxt, TH_ACK);
705 tp->last_ack_sent = tp->rcv_nxt;
712 * RFC 1323 PAWS: If we have a timestamp reply on this segment
713 * and it's less than ts_recent, drop it.
715 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
716 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
718 /* Check to see if ts_recent is over 24 days old. */
719 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
721 * Invalidate ts_recent. If this segment updates
722 * ts_recent, the age will be reset later and ts_recent
723 * will get a valid value. If it does not, setting
724 * ts_recent to zero will at least satisfy the
725 * requirement that zero be placed in the timestamp
726 * echo reply when ts_recent isn't valid. The
727 * age isn't reset until we get a valid ts_recent
728 * because we don't want out-of-order segments to be
729 * dropped when ts_recent is old.
733 TCPSTAT_INC(tcps_rcvduppack);
734 TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
735 TCPSTAT_INC(tcps_pawsdrop);
743 * In the SYN-RECEIVED state, validate that the packet belongs to
744 * this connection before trimming the data to fit the receive
745 * window. Check the sequence number versus IRS since we know
746 * the sequence numbers haven't wrapped. This is a partial fix
747 * for the "LAND" DoS attack.
749 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
750 rstreason = BANDLIM_RST_OPENPORT;
754 todrop = tp->rcv_nxt - th->th_seq;
756 if (thflags & TH_SYN) {
766 * Following if statement from Stevens, vol. 2, p. 960.
769 || (todrop == tlen && (thflags & TH_FIN) == 0)) {
771 * Any valid FIN must be to the left of the window.
772 * At this point the FIN must be a duplicate or out
773 * of sequence; drop it.
778 * Send an ACK to resynchronize and drop any data.
779 * But keep on processing for RST or ACK.
781 tp->t_flags |= TF_ACKNOW;
783 TCPSTAT_INC(tcps_rcvduppack);
784 TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
786 TCPSTAT_INC(tcps_rcvpartduppack);
787 TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
789 drop_hdrlen += todrop; /* drop from the top afterwards */
790 th->th_seq += todrop;
792 if (th->th_urp > todrop)
793 th->th_urp -= todrop;
801 * If new data are received on a connection after the
802 * user processes are gone, then RST the other end.
804 if ((so->so_state & SS_NOFDREF) &&
805 tp->t_state > TCPS_CLOSE_WAIT && tlen) {
806 KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && "
807 "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
808 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
810 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
811 log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
812 "after socket was closed, "
813 "sending RST and removing tcpcb\n",
814 s, __func__, tcpstates[tp->t_state], tlen);
818 TCPSTAT_INC(tcps_rcvafterclose);
819 rstreason = BANDLIM_UNLIMITED;
824 * If segment ends after window, drop trailing data
825 * (and PUSH and FIN); if nothing left, just ACK.
827 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
829 TCPSTAT_INC(tcps_rcvpackafterwin);
830 if (todrop >= tlen) {
831 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
833 * If window is closed can only take segments at
834 * window edge, and have to drop data and PUSH from
835 * incoming segments. Continue processing, but
836 * remember to ack. Otherwise, drop segment
839 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
840 tp->t_flags |= TF_ACKNOW;
841 TCPSTAT_INC(tcps_rcvwinprobe);
845 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
848 thflags &= ~(TH_PUSH|TH_FIN);
852 * If last ACK falls within this segment's sequence numbers,
853 * record its timestamp.
855 * 1) That the test incorporates suggestions from the latest
856 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
857 * 2) That updating only on newer timestamps interferes with
858 * our earlier PAWS tests, so this check should be solely
859 * predicated on the sequence space of this segment.
860 * 3) That we modify the segment boundary check to be
861 * Last.ACK.Sent <= SEG.SEQ + SEG.Len
862 * instead of RFC1323's
863 * Last.ACK.Sent < SEG.SEQ + SEG.Len,
864 * This modified check allows us to overcome RFC1323's
865 * limitations as described in Stevens TCP/IP Illustrated
866 * Vol. 2 p.869. In such cases, we can still calculate the
867 * RTT correctly when RCV.NXT == Last.ACK.Sent.
869 if ((to->to_flags & TOF_TS) != 0 &&
870 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
871 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
872 ((thflags & (TH_SYN|TH_FIN)) != 0))) {
873 tp->ts_recent_age = tcp_ts_getticks();
874 tp->ts_recent = to->to_tsval;
878 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
879 * flag is on (half-synchronized state), then queue data for
880 * later processing; else drop segment and return.
882 if ((thflags & TH_ACK) == 0) {
883 if (tp->t_state == TCPS_SYN_RECEIVED ||
884 (tp->t_flags & TF_NEEDSYN))
886 else if (tp->t_flags & TF_ACKNOW)
895 switch (tp->t_state) {
898 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
899 * ESTABLISHED state and continue processing.
900 * The ACK was checked above.
902 case TCPS_SYN_RECEIVED:
904 TCPSTAT_INC(tcps_connects);
906 /* Do window scaling? */
907 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
908 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
909 tp->rcv_scale = tp->request_r_scale;
914 * SYN-RECEIVED -> ESTABLISHED
915 * SYN-RECEIVED* -> FIN-WAIT-1
917 tp->t_starttime = ticks;
918 if (tp->t_flags & TF_NEEDFIN) {
919 tcp_state_change(tp, TCPS_FIN_WAIT_1);
920 tp->t_flags &= ~TF_NEEDFIN;
922 tcp_state_change(tp, TCPS_ESTABLISHED);
923 TCP_PROBE5(accept__established, NULL, tp,
926 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
929 * If segment contains data or ACK, will call tcp_reass()
930 * later; if not, do so now to pass queued data to user.
932 if (tlen == 0 && (thflags & TH_FIN) == 0)
933 (void) tcp_reass(tp, (struct tcphdr *)0, 0,
935 tp->snd_wl1 = th->th_seq - 1;
939 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
940 * ACKs. If the ack is in the range
941 * tp->snd_una < th->th_ack <= tp->snd_max
942 * then advance tp->snd_una to th->th_ack and drop
943 * data from the retransmission queue. If this ACK reflects
944 * more up to date window information we update our window information.
946 case TCPS_ESTABLISHED:
947 case TCPS_FIN_WAIT_1:
948 case TCPS_FIN_WAIT_2:
949 case TCPS_CLOSE_WAIT:
952 if (SEQ_GT(th->th_ack, tp->snd_max)) {
953 TCPSTAT_INC(tcps_rcvacktoomuch);
956 if ((tp->t_flags & TF_SACK_PERMIT) &&
957 ((to->to_flags & TOF_SACK) ||
958 !TAILQ_EMPTY(&tp->snd_holes)))
959 tcp_sack_doack(tp, to, th->th_ack);
962 * Reset the value so that previous (valid) value
963 * from the last ack with SACK doesn't get used.
965 tp->sackhint.sacked_bytes = 0;
968 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
969 hhook_run_tcp_est_in(tp, th, to);
972 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
973 if (tlen == 0 && tiwin == tp->snd_wnd) {
975 * If this is the first time we've seen a
976 * FIN from the remote, this is not a
977 * duplicate and it needs to be processed
978 * normally. This happens during a
979 * simultaneous close.
981 if ((thflags & TH_FIN) &&
982 (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
986 TCPSTAT_INC(tcps_rcvdupack);
988 * If we have outstanding data (other than
989 * a window probe), this is a completely
990 * duplicate ack (ie, window info didn't
991 * change and FIN isn't set),
992 * the ack is the biggest we've
993 * seen and we've seen exactly our rexmt
994 * threshold of them, assume a packet
995 * has been dropped and retransmit it.
996 * Kludge snd_nxt & the congestion
997 * window so we send only this one
1000 * We know we're losing at the current
1001 * window size so do congestion avoidance
1002 * (set ssthresh to half the current window
1003 * and pull our congestion window back to
1004 * the new ssthresh).
1006 * Dup acks mean that packets have left the
1007 * network (they're now cached at the receiver)
1008 * so bump cwnd by the amount in the receiver
1009 * to keep a constant cwnd packets in the
1012 * When using TCP ECN, notify the peer that
1013 * we reduced the cwnd.
1015 if (!tcp_timer_active(tp, TT_REXMT) ||
1016 th->th_ack != tp->snd_una)
1018 else if (++tp->t_dupacks > tcprexmtthresh ||
1019 IN_FASTRECOVERY(tp->t_flags)) {
1020 cc_ack_received(tp, th, nsegs,
1022 if ((tp->t_flags & TF_SACK_PERMIT) &&
1023 IN_FASTRECOVERY(tp->t_flags)) {
1027 * Compute the amount of data in flight first.
1028 * We can inject new data into the pipe iff
1029 * we have less than 1/2 the original window's
1030 * worth of data in flight.
1032 if (V_tcp_do_rfc6675_pipe)
1033 awnd = tcp_compute_pipe(tp);
1035 awnd = (tp->snd_nxt - tp->snd_fack) +
1036 tp->sackhint.sack_bytes_rexmit;
1038 if (awnd < tp->snd_ssthresh) {
1039 tp->snd_cwnd += tp->t_maxseg;
1040 if (tp->snd_cwnd > tp->snd_ssthresh)
1041 tp->snd_cwnd = tp->snd_ssthresh;
1044 tp->snd_cwnd += tp->t_maxseg;
1045 (void) tp->t_fb->tfb_tcp_output(tp);
1047 } else if (tp->t_dupacks == tcprexmtthresh) {
1048 tcp_seq onxt = tp->snd_nxt;
1051 * If we're doing sack, check to
1052 * see if we're already in sack
1053 * recovery. If we're not doing sack,
1054 * check to see if we're in newreno
1057 if (tp->t_flags & TF_SACK_PERMIT) {
1058 if (IN_FASTRECOVERY(tp->t_flags)) {
1063 if (SEQ_LEQ(th->th_ack,
1069 /* Congestion signal before ack. */
1070 cc_cong_signal(tp, th, CC_NDUPACK);
1071 cc_ack_received(tp, th, nsegs,
1073 tcp_timer_activate(tp, TT_REXMT, 0);
1075 if (tp->t_flags & TF_SACK_PERMIT) {
1077 tcps_sack_recovery_episode);
1078 tp->sack_newdata = tp->snd_nxt;
1079 tp->snd_cwnd = tp->t_maxseg;
1080 (void) tp->t_fb->tfb_tcp_output(tp);
1083 tp->snd_nxt = th->th_ack;
1084 tp->snd_cwnd = tp->t_maxseg;
1085 (void) tp->t_fb->tfb_tcp_output(tp);
1086 KASSERT(tp->snd_limited <= 2,
1087 ("%s: tp->snd_limited too big",
1089 tp->snd_cwnd = tp->snd_ssthresh +
1091 (tp->t_dupacks - tp->snd_limited);
1092 if (SEQ_GT(onxt, tp->snd_nxt))
1095 } else if (V_tcp_do_rfc3042) {
1097 * Process first and second duplicate
1098 * ACKs. Each indicates a segment
1099 * leaving the network, creating room
1100 * for more. Make sure we can send a
1101 * packet on reception of each duplicate
1102 * ACK by increasing snd_cwnd by one
1103 * segment. Restore the original
1104 * snd_cwnd after packet transmission.
1106 cc_ack_received(tp, th, nsegs,
1108 uint32_t oldcwnd = tp->snd_cwnd;
1109 tcp_seq oldsndmax = tp->snd_max;
1113 KASSERT(tp->t_dupacks == 1 ||
1115 ("%s: dupacks not 1 or 2",
1117 if (tp->t_dupacks == 1)
1118 tp->snd_limited = 0;
1120 (tp->snd_nxt - tp->snd_una) +
1121 (tp->t_dupacks - tp->snd_limited) *
1124 * Only call tcp_output when there
1125 * is new data available to be sent.
1126 * Otherwise we would send pure ACKs.
1128 SOCKBUF_LOCK(&so->so_snd);
1129 avail = sbavail(&so->so_snd) -
1130 (tp->snd_nxt - tp->snd_una);
1131 SOCKBUF_UNLOCK(&so->so_snd);
1133 (void) tp->t_fb->tfb_tcp_output(tp);
1134 sent = tp->snd_max - oldsndmax;
1135 if (sent > tp->t_maxseg) {
1136 KASSERT((tp->t_dupacks == 2 &&
1137 tp->snd_limited == 0) ||
1138 (sent == tp->t_maxseg + 1 &&
1139 tp->t_flags & TF_SENTFIN),
1140 ("%s: sent too much",
1142 tp->snd_limited = 2;
1143 } else if (sent > 0)
1145 tp->snd_cwnd = oldcwnd;
1153 KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
1154 ("%s: th_ack <= snd_una", __func__));
1157 * If the congestion window was inflated to account
1158 * for the other side's cached packets, retract it.
1160 if (IN_FASTRECOVERY(tp->t_flags)) {
1161 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
1162 if (tp->t_flags & TF_SACK_PERMIT)
1163 tcp_sack_partialack(tp, th);
1165 tcp_newreno_partial_ack(tp, th);
1167 cc_post_recovery(tp, th);
1171 * If we reach this point, ACK is not a duplicate,
1172 * i.e., it ACKs something we sent.
1174 if (tp->t_flags & TF_NEEDSYN) {
1176 * T/TCP: Connection was half-synchronized, and our
1177 * SYN has been ACK'd (so connection is now fully
1178 * synchronized). Go to non-starred state,
1179 * increment snd_una for ACK of SYN, and check if
1180 * we can do window scaling.
1182 tp->t_flags &= ~TF_NEEDSYN;
1184 /* Do window scaling? */
1185 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1186 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1187 tp->rcv_scale = tp->request_r_scale;
1188 /* Send window already scaled. */
1193 INP_WLOCK_ASSERT(tp->t_inpcb);
1195 acked = BYTES_THIS_ACK(tp, th);
1196 TCPSTAT_INC(tcps_rcvackpack);
1197 TCPSTAT_ADD(tcps_rcvackbyte, acked);
1200 * If we just performed our first retransmit, and the ACK
1201 * arrives within our recovery window, then it was a mistake
1202 * to do the retransmit in the first place. Recover our
1203 * original cwnd and ssthresh, and proceed to transmit where
1206 if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID &&
1207 (int)(ticks - tp->t_badrxtwin) < 0)
1208 cc_cong_signal(tp, th, CC_RTO_ERR);
1211 * If we have a timestamp reply, update smoothed
1212 * round trip time. If no timestamp is present but
1213 * transmit timer is running and timed sequence
1214 * number was acked, update smoothed round trip time.
1215 * Since we now have an rtt measurement, cancel the
1216 * timer backoff (cf., Phil Karn's retransmit alg.).
1217 * Recompute the initial retransmit timer.
1219 * Some boxes send broken timestamp replies
1220 * during the SYN+ACK phase, ignore
1221 * timestamps of 0 or we could calculate a
1222 * huge RTT and blow up the retransmit timer.
1224 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
1227 t = tcp_ts_getticks() - to->to_tsecr;
1228 if (!tp->t_rttlow || tp->t_rttlow > t)
1230 tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
1231 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
1232 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
1233 tp->t_rttlow = ticks - tp->t_rtttime;
1234 tcp_xmit_timer(tp, ticks - tp->t_rtttime);
1238 * If all outstanding data is acked, stop retransmit
1239 * timer and remember to restart (more output or persist).
1240 * If there is more data to be acked, restart retransmit
1241 * timer, using current (possibly backed-off) value.
1243 if (th->th_ack == tp->snd_max) {
1244 tcp_timer_activate(tp, TT_REXMT, 0);
1246 } else if (!tcp_timer_active(tp, TT_PERSIST))
1247 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
1250 * If no data (only SYN) was ACK'd,
1251 * skip rest of ACK processing.
1257 * Let the congestion control algorithm update congestion
1258 * control related information. This typically means increasing
1259 * the congestion window.
1261 cc_ack_received(tp, th, nsegs, CC_ACK);
1263 SOCKBUF_LOCK(&so->so_snd);
1264 if (acked > sbavail(&so->so_snd)) {
1265 tp->snd_wnd -= sbavail(&so->so_snd);
1266 mfree = sbcut_locked(&so->so_snd,
1267 (int)sbavail(&so->so_snd));
1270 mfree = sbcut_locked(&so->so_snd, acked);
1271 tp->snd_wnd -= acked;
1274 /* NB: sowwakeup_locked() does an implicit unlock. */
1275 sowwakeup_locked(so);
1277 /* Detect una wraparound. */
1278 if (!IN_RECOVERY(tp->t_flags) &&
1279 SEQ_GT(tp->snd_una, tp->snd_recover) &&
1280 SEQ_LEQ(th->th_ack, tp->snd_recover))
1281 tp->snd_recover = th->th_ack - 1;
1282 /* XXXLAS: Can this be moved up into cc_post_recovery? */
1283 if (IN_RECOVERY(tp->t_flags) &&
1284 SEQ_GEQ(th->th_ack, tp->snd_recover)) {
1285 EXIT_RECOVERY(tp->t_flags);
1287 tp->snd_una = th->th_ack;
1288 if (tp->t_flags & TF_SACK_PERMIT) {
1289 if (SEQ_GT(tp->snd_una, tp->snd_recover))
1290 tp->snd_recover = tp->snd_una;
1292 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1293 tp->snd_nxt = tp->snd_una;
1295 switch (tp->t_state) {
1298 * In FIN_WAIT_1 STATE in addition to the processing
1299 * for the ESTABLISHED state if our FIN is now acknowledged
1300 * then enter FIN_WAIT_2.
1302 case TCPS_FIN_WAIT_1:
1303 if (ourfinisacked) {
1305 * If we can't receive any more
1306 * data, then closing user can proceed.
1307 * Starting the timer is contrary to the
1308 * specification, but if we don't get a FIN
1309 * we'll hang forever.
1312 * we should release the tp also, and use a
1315 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1316 soisdisconnected(so);
1317 tcp_timer_activate(tp, TT_2MSL,
1318 (tcp_fast_finwait2_recycle ?
1319 tcp_finwait2_timeout :
1322 tcp_state_change(tp, TCPS_FIN_WAIT_2);
1327 * In CLOSING STATE in addition to the processing for
1328 * the ESTABLISHED state if the ACK acknowledges our FIN
1329 * then enter the TIME-WAIT state, otherwise ignore
1333 if (ourfinisacked) {
1334 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1336 INP_INFO_RUNLOCK(&V_tcbinfo);
1343 * In LAST_ACK, we may still be waiting for data to drain
1344 * and/or to be acked, as well as for the ack of our FIN.
1345 * If our FIN is now acknowledged, delete the TCB,
1346 * enter the closed state and return.
1349 if (ourfinisacked) {
1350 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1359 INP_WLOCK_ASSERT(tp->t_inpcb);
1362 * Update window information.
1363 * Don't look at window if no ACK: TAC's send garbage on first SYN.
1365 if ((thflags & TH_ACK) &&
1366 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
1367 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
1368 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
1369 /* keep track of pure window updates */
1371 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
1372 TCPSTAT_INC(tcps_rcvwinupd);
1373 tp->snd_wnd = tiwin;
1374 tp->snd_wl1 = th->th_seq;
1375 tp->snd_wl2 = th->th_ack;
1376 if (tp->snd_wnd > tp->max_sndwnd)
1377 tp->max_sndwnd = tp->snd_wnd;
1382 * Process segments with URG.
1384 if ((thflags & TH_URG) && th->th_urp &&
1385 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1387 * This is a kludge, but if we receive and accept
1388 * random urgent pointers, we'll crash in
1389 * soreceive. It's hard to imagine someone
1390 * actually wanting to send this much urgent data.
1392 SOCKBUF_LOCK(&so->so_rcv);
1393 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
1394 th->th_urp = 0; /* XXX */
1395 thflags &= ~TH_URG; /* XXX */
1396 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */
1397 goto dodata; /* XXX */
1400 * If this segment advances the known urgent pointer,
1401 * then mark the data stream. This should not happen
1402 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1403 * a FIN has been received from the remote side.
1404 * In these states we ignore the URG.
1406 * According to RFC961 (Assigned Protocols),
1407 * the urgent pointer points to the last octet
1408 * of urgent data. We continue, however,
1409 * to consider it to indicate the first octet
1410 * of data past the urgent section as the original
1411 * spec states (in one of two places).
1413 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
1414 tp->rcv_up = th->th_seq + th->th_urp;
1415 so->so_oobmark = sbavail(&so->so_rcv) +
1416 (tp->rcv_up - tp->rcv_nxt) - 1;
1417 if (so->so_oobmark == 0)
1418 so->so_rcv.sb_state |= SBS_RCVATMARK;
1420 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1422 SOCKBUF_UNLOCK(&so->so_rcv);
1424 * Remove out of band data so doesn't get presented to user.
1425 * This can happen independent of advancing the URG pointer,
1426 * but if two URG's are pending at once, some out-of-band
1427 * data may creep in... ick.
1429 if (th->th_urp <= (uint32_t)tlen &&
1430 !(so->so_options & SO_OOBINLINE)) {
1431 /* hdr drop is delayed */
1432 tcp_pulloutofband(so, th, m, drop_hdrlen);
1436 * If no out of band data is expected,
1437 * pull receive urgent pointer along
1438 * with the receive window.
1440 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
1441 tp->rcv_up = tp->rcv_nxt;
1444 INP_WLOCK_ASSERT(tp->t_inpcb);
1447 * Process the segment text, merging it into the TCP sequencing queue,
1448 * and arranging for acknowledgment of receipt if necessary.
1449 * This process logically involves adjusting tp->rcv_wnd as data
1450 * is presented to the user (this happens in tcp_usrreq.c,
1451 * case PRU_RCVD). If a FIN has already been received on this
1452 * connection then we just ignore the text.
1454 if ((tlen || (thflags & TH_FIN)) &&
1455 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1456 tcp_seq save_start = th->th_seq;
1457 m_adj(m, drop_hdrlen); /* delayed header drop */
1459 * Insert segment which includes th into TCP reassembly queue
1460 * with control block tp. Set thflags to whether reassembly now
1461 * includes a segment with FIN. This handles the common case
1462 * inline (segment is the next to be received on an established
1463 * connection, and the queue is empty), avoiding linkage into
1464 * and removal from the queue and repetition of various
1466 * Set DELACK for segments received in order, but ack
1467 * immediately when segments are out of order (so
1468 * fast retransmit can work).
1470 if (th->th_seq == tp->rcv_nxt &&
1471 LIST_EMPTY(&tp->t_segq) &&
1472 TCPS_HAVEESTABLISHED(tp->t_state)) {
1473 if (DELAY_ACK(tp, tlen))
1474 tp->t_flags |= TF_DELACK;
1476 tp->t_flags |= TF_ACKNOW;
1477 tp->rcv_nxt += tlen;
1478 thflags = th->th_flags & TH_FIN;
1479 TCPSTAT_INC(tcps_rcvpack);
1480 TCPSTAT_ADD(tcps_rcvbyte, tlen);
1481 SOCKBUF_LOCK(&so->so_rcv);
1482 if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
1485 sbappendstream_locked(&so->so_rcv, m, 0);
1486 /* NB: sorwakeup_locked() does an implicit unlock. */
1487 sorwakeup_locked(so);
1490 * XXX: Due to the header drop above "th" is
1491 * theoretically invalid by now. Fortunately
1492 * m_adj() doesn't actually frees any mbufs
1493 * when trimming from the head.
1495 thflags = tcp_reass(tp, th, &tlen, m);
1496 tp->t_flags |= TF_ACKNOW;
1498 if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
1499 tcp_update_sack_list(tp, save_start, save_start + tlen);
1502 * Note the amount of data that peer has sent into
1503 * our window, in order to estimate the sender's
1507 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
1508 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
1510 len = so->so_rcv.sb_hiwat;
1518 * If FIN is received ACK the FIN and let the user know
1519 * that the connection is closing.
1521 if (thflags & TH_FIN) {
1522 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1525 * If connection is half-synchronized
1526 * (ie NEEDSYN flag on) then delay ACK,
1527 * so it may be piggybacked when SYN is sent.
1528 * Otherwise, since we received a FIN then no
1529 * more input can be expected, send ACK now.
1531 if (tp->t_flags & TF_NEEDSYN)
1532 tp->t_flags |= TF_DELACK;
1534 tp->t_flags |= TF_ACKNOW;
1537 switch (tp->t_state) {
1540 * In SYN_RECEIVED and ESTABLISHED STATES
1541 * enter the CLOSE_WAIT state.
1543 case TCPS_SYN_RECEIVED:
1544 tp->t_starttime = ticks;
1546 case TCPS_ESTABLISHED:
1547 tcp_state_change(tp, TCPS_CLOSE_WAIT);
1551 * If still in FIN_WAIT_1 STATE FIN has not been acked so
1552 * enter the CLOSING state.
1554 case TCPS_FIN_WAIT_1:
1555 tcp_state_change(tp, TCPS_CLOSING);
1559 * In FIN_WAIT_2 state enter the TIME_WAIT state,
1560 * starting the time-wait timer, turning off the other
1563 case TCPS_FIN_WAIT_2:
1564 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1565 KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata "
1566 "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
1570 INP_INFO_RUNLOCK(&V_tcbinfo);
1574 if (ti_locked == TI_RLOCKED) {
1575 INP_INFO_RUNLOCK(&V_tcbinfo);
1577 ti_locked = TI_UNLOCKED;
1580 if (so->so_options & SO_DEBUG)
1581 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
1584 TCP_PROBE3(debug__input, tp, th, m);
1587 * Return any desired output.
1589 if (needoutput || (tp->t_flags & TF_ACKNOW))
1590 (void) tp->t_fb->tfb_tcp_output(tp);
1592 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
1593 __func__, ti_locked));
1594 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1595 INP_WLOCK_ASSERT(tp->t_inpcb);
1597 if (tp->t_flags & TF_DELACK) {
1598 tp->t_flags &= ~TF_DELACK;
1599 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
1601 INP_WUNLOCK(tp->t_inpcb);
1606 * Generate an ACK dropping incoming segment if it occupies
1607 * sequence space, where the ACK reflects our state.
1609 * We can now skip the test for the RST flag since all
1610 * paths to this code happen after packets containing
1611 * RST have been dropped.
1613 * In the SYN-RECEIVED state, don't send an ACK unless the
1614 * segment we received passes the SYN-RECEIVED ACK test.
1615 * If it fails send a RST. This breaks the loop in the
1616 * "LAND" DoS attack, and also prevents an ACK storm
1617 * between two listening ports that have been sent forged
1618 * SYN segments, each with the source address of the other.
1620 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
1621 (SEQ_GT(tp->snd_una, th->th_ack) ||
1622 SEQ_GT(th->th_ack, tp->snd_max)) ) {
1623 rstreason = BANDLIM_RST_OPENPORT;
1627 if (so->so_options & SO_DEBUG)
1628 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
1631 TCP_PROBE3(debug__drop, tp, th, m);
1632 if (ti_locked == TI_RLOCKED) {
1633 INP_INFO_RUNLOCK(&V_tcbinfo);
1635 ti_locked = TI_UNLOCKED;
1637 tp->t_flags |= TF_ACKNOW;
1638 (void) tp->t_fb->tfb_tcp_output(tp);
1639 INP_WUNLOCK(tp->t_inpcb);
1644 if (ti_locked == TI_RLOCKED) {
1645 INP_INFO_RUNLOCK(&V_tcbinfo);
1647 ti_locked = TI_UNLOCKED;
1650 tcp_dropwithreset(m, th, tp, tlen, rstreason);
1651 INP_WUNLOCK(tp->t_inpcb);
1653 tcp_dropwithreset(m, th, NULL, tlen, rstreason);
1657 if (ti_locked == TI_RLOCKED) {
1658 INP_INFO_RUNLOCK(&V_tcbinfo);
1659 ti_locked = TI_UNLOCKED;
1663 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1667 * Drop space held by incoming segment and return.
1670 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
1671 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
1674 TCP_PROBE3(debug__drop, tp, th, m);
1676 INP_WUNLOCK(tp->t_inpcb);
1682 * Do fast slow is a combination of the original
1683 * tcp_dosegment and a split fastpath, one function
1684 * for the fast-ack which also includes allowing fastpath
1685 * for window advanced in sequence acks. And also a
1686 * sub-function that handles the insequence data.
/*
 * tcp_do_segment_fastslow() -- tfb_tcp_do_segment handler for the
 * "fastslow" function block.  Performs the common per-segment
 * validation (locking asserts, SYN|FIN drop policy, SYN-SENT ACK
 * sanity check, ECN handling, option parsing, timestamp sanity),
 * then uses header prediction to dispatch pure ACKs to
 * tcp_do_fastack(), in-sequence data to tcp_do_fastnewdata(), and
 * everything else to tcp_do_slowpath().
 *
 * NOTE(review): several source lines are elided between the numbered
 * lines in this extract; local declarations (tiwin, to, s, nsegs) and
 * some braces/labels are among the missing text.
 */
1689 tcp_do_segment_fastslow(struct mbuf *m, struct tcphdr *th, struct socket *so,
1690 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
1698 struct in_conninfo *inc;
1701 thflags = th->th_flags;
1702 inc = &tp->t_inpcb->inp_inc;
1703 nsegs = max(1, m->m_pkthdr.lro_nsegs);
1705 * If this is either a state-changing packet or current state isn't
1706 * established, we require a write lock on tcbinfo. Otherwise, we
1707 * allow the tcbinfo to be in either alocked or unlocked, as the
1708 * caller may have unnecessarily acquired a write lock due to a race.
1710 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
1711 tp->t_state != TCPS_ESTABLISHED) {
1712 KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
1713 "SYN/FIN/RST/!EST", __func__, ti_locked));
1714 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1717 if (ti_locked == TI_RLOCKED) {
1718 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1720 KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
1721 "ti_locked: %d", __func__, ti_locked));
1722 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1726 INP_WLOCK_ASSERT(tp->t_inpcb);
1727 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
1729 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
/* Optionally drop forged SYN|FIN segments, controlled by sysctl. */
1732 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
1733 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
1734 log(LOG_DEBUG, "%s; %s: "
1735 "SYN|FIN segment ignored (based on "
1736 "sysctl setting)\n", s, __func__);
1739 if (ti_locked == TI_RLOCKED) {
1740 INP_INFO_RUNLOCK(&V_tcbinfo);
1742 INP_WUNLOCK(tp->t_inpcb);
1748 * If a segment with the ACK-bit set arrives in the SYN-SENT state
1749 * check SEQ.ACK first.
1751 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
1752 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
1753 tcp_dropwithreset(m, th, tp, tlen, BANDLIM_UNLIMITED);
1754 if (ti_locked == TI_RLOCKED) {
1755 INP_INFO_RUNLOCK(&V_tcbinfo);
1757 INP_WUNLOCK(tp->t_inpcb);
1761 tp->sackhint.last_sack_ack = 0;
1764 * Segment received on connection.
1765 * Reset idle time and keep-alive timer.
1766 * XXX: This should be done after segment
1767 * validation to ignore broken/spoofed segs.
1769 tp->t_rcvtime = ticks;
1772 * Unscale the window into a 32-bit value.
1773 * For the SYN_SENT state the scale is zero.
1775 tiwin = th->th_win << tp->snd_scale;
1778 * TCP ECN processing.
1780 if (tp->t_flags & TF_ECN_PERMIT) {
1781 if (thflags & TH_CWR)
1782 tp->t_flags &= ~TF_ECN_SND_ECE;
1783 switch (iptos & IPTOS_ECN_MASK) {
1785 tp->t_flags |= TF_ECN_SND_ECE;
1786 TCPSTAT_INC(tcps_ecn_ce);
1788 case IPTOS_ECN_ECT0:
1789 TCPSTAT_INC(tcps_ecn_ect0);
1791 case IPTOS_ECN_ECT1:
1792 TCPSTAT_INC(tcps_ecn_ect1);
1795 /* Congestion experienced. */
1796 if (thflags & TH_ECE) {
1797 cc_cong_signal(tp, th, CC_ECN);
1802 * Parse options on any incoming segment.
1804 tcp_dooptions(&to, (u_char *)(th + 1),
1805 (th->th_off << 2) - sizeof(struct tcphdr),
1806 (thflags & TH_SYN) ? TO_SYN : 0);
1809 * If echoed timestamp is later than the current time,
1810 * fall back to non RFC1323 RTT calculation. Normalize
1811 * timestamp if syncookies were used when this connection
1814 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
1815 to.to_tsecr -= tp->ts_offset;
1816 if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
1821 * Process options only when we get SYN/ACK back. The SYN case
1822 * for incoming connections is handled in tcp_syncache.
1823 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
1824 * or <SYN,ACK>) segment itself is never scaled.
1825 * XXX this is traditional behavior, may need to be cleaned up.
1827 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
1828 if ((to.to_flags & TOF_SCALE) &&
1829 (tp->t_flags & TF_REQ_SCALE)) {
1830 tp->t_flags |= TF_RCVD_SCALE;
1831 tp->snd_scale = to.to_wscale;
1834 * Initial send window. It will be updated with
1835 * the next incoming segment to the scaled value.
1837 tp->snd_wnd = th->th_win;
1838 if (to.to_flags & TOF_TS) {
1839 tp->t_flags |= TF_RCVD_TSTMP;
1840 tp->ts_recent = to.to_tsval;
1841 tp->ts_recent_age = tcp_ts_getticks();
1843 if (to.to_flags & TOF_MSS)
1844 tcp_mss(tp, to.to_mss);
1845 if ((tp->t_flags & TF_SACK_PERMIT) &&
1846 (to.to_flags & TOF_SACKPERM) == 0)
1847 tp->t_flags &= ~TF_SACK_PERMIT;
1851 * If timestamps were negotiated during SYN/ACK they should
1852 * appear on every segment during this session and vice versa.
1854 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
1855 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
1856 log(LOG_DEBUG, "%s; %s: Timestamp missing, "
1857 "no action\n", s, __func__);
1861 if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
1862 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
1863 log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
1864 "no action\n", s, __func__);
/* Pure ACK (no payload): candidate for the fast-ack entry criteria. */
1870 if (__predict_true((tlen == 0))) {
1872 * The ack moved forward and we have a window (non-zero)
1874 * The ack did not move forward, but the window increased.
1876 if (__predict_true((SEQ_GT(th->th_ack, tp->snd_una) && tiwin) ||
1877 ((th->th_ack == tp->snd_una) && tiwin && (tiwin > tp->snd_wnd)))) {
1882 * Data incoming, use the old entry criteria
1883 * for fast-path with data.
1885 if ((tiwin && tiwin == tp->snd_wnd)) {
1890 * Header prediction: check for the two common cases
1891 * of a uni-directional data xfer. If the packet has
1892 * no control flags, is in-sequence, the window didn't
1893 * change and we're not retransmitting, it's a
1894 * candidate. If the length is zero and the ack moved
1895 * forward, we're the sender side of the xfer. Just
1896 * free the data acked & wake any higher level process
1897 * that was blocked waiting for space. If the length
1898 * is non-zero and the ack didn't move, we're the
1899 * receiver side. If we're getting packets in-order
1900 * (the reassembly queue is empty), add the data to
1901 * the socket buffer and note that we need a delayed ack.
1902 * Make sure that the hidden state-flags are also off.
1903 * Since we check for TCPS_ESTABLISHED first, it can only
1906 if (__predict_true(tp->t_state == TCPS_ESTABLISHED &&
1907 th->th_seq == tp->rcv_nxt &&
1908 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
1909 tp->snd_nxt == tp->snd_max &&
1911 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
1912 LIST_EMPTY(&tp->t_segq) &&
1913 ((to.to_flags & TOF_TS) == 0 ||
1914 TSTMP_GEQ(to.to_tsval, tp->ts_recent)))) {
1915 if (__predict_true((tlen == 0) &&
1916 (SEQ_LEQ(th->th_ack, tp->snd_max) &&
1917 !IN_RECOVERY(tp->t_flags) &&
1918 (to.to_flags & TOF_SACK) == 0 &&
1919 TAILQ_EMPTY(&tp->snd_holes)))) {
1921 tcp_do_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
1924 } else if ((tlen) &&
1925 (th->th_ack == tp->snd_una &&
1926 tlen <= sbspace(&so->so_rcv))) {
1927 tcp_do_fastnewdata(m, th, so, tp, &to, drop_hdrlen, tlen,
/* Failed the fast-path entry criteria: take the full slow path. */
1933 tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
1934 ti_locked, tiwin, thflags);
1939 * This subfunction is used to try to highly optimize the
1940 * fast path. We again allow window updates that are
1941 * in sequence to remain in the fast-path. We also add
1942 * in the __predict's to attempt to help the compiler.
1943 * Note that if we return a 0, then we can *not* process
1944 * it and the caller should push the packet into the
/*
 * tcp_fastack() -- attempt pure-ACK fast-path processing.
 *
 * Runs a chain of __predict_false guard checks (old/duplicate ACK,
 * zero window, ACK above snd_max, retransmission in progress, pending
 * SYN/FIN, stale timestamp, in-recovery, SACK option, SACK holes) and
 * bails out on any of them; per the comment above, a 0 return tells
 * the caller to push the segment through the slow path.  On success it
 * applies the window update, records the peer timestamp, handles
 * bad-retransmit (RTO error) recovery, updates the RTT estimators,
 * drops acked bytes from the send buffer, notifies congestion control
 * and (re)arms the retransmit timer, then handles delayed ACK state.
 *
 * NOTE(review): locals (winup_only, acked, t, ostate) and several
 * braces/return statements are in lines elided from this extract.
 */
1948 tcp_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
1949 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
1950 int ti_locked, uint32_t tiwin)
1956 nsegs = max(1, m->m_pkthdr.lro_nsegs);
1959 * The size of tcp_saveipgen must be the size of the max ip header,
1962 u_char tcp_saveipgen[IP6_HDR_LEN];
1963 struct tcphdr tcp_savetcp;
1968 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
1969 /* Old ack, behind (or duplicate to) the last one rcv'd */
1972 if (__predict_false(th->th_ack == tp->snd_una) &&
1973 __predict_false(tiwin <= tp->snd_wnd)) {
1974 /* duplicate ack <or> a shrinking dup ack with shrinking window */
1977 if (__predict_false(tiwin == 0)) {
1981 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
1982 /* Above what we have sent? */
1985 if (__predict_false(tp->snd_nxt != tp->snd_max)) {
1986 /* We are retransmitting */
1989 if (__predict_false(tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN))) {
1990 /* We need a SYN or a FIN, unlikely.. */
1993 if((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
1994 /* Timestamp is behind .. old ack with seq wrap? */
1997 if (__predict_false(IN_RECOVERY(tp->t_flags))) {
1998 /* Still recovering */
2001 if (__predict_false(to->to_flags & TOF_SACK)) {
2002 /* Sack included in the ack.. */
2005 if (!TAILQ_EMPTY(&tp->snd_holes)) {
2006 /* We have sack holes on our scoreboard */
2009 /* Ok if we reach here, we can process a fast-ack */
2011 /* Did the window get updated? */
2012 if (tiwin != tp->snd_wnd) {
2013 /* keep track of pure window updates */
2014 if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
2016 TCPSTAT_INC(tcps_rcvwinupd);
2018 tp->snd_wnd = tiwin;
2019 tp->snd_wl1 = th->th_seq;
2020 if (tp->snd_wnd > tp->max_sndwnd)
2021 tp->max_sndwnd = tp->snd_wnd;
2024 * Pull snd_wl2 up to prevent seq wrap relative
2027 tp->snd_wl2 = th->th_ack;
2029 * If last ACK falls within this segment's sequence numbers,
2030 * record the timestamp.
2031 * NOTE that the test is modified according to the latest
2032 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2034 if ((to->to_flags & TOF_TS) != 0 &&
2035 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
2036 tp->ts_recent_age = tcp_ts_getticks();
2037 tp->ts_recent = to->to_tsval;
2040 * This is a pure ack for outstanding data.
/* The fast path never needs the global pcbinfo lock past this point. */
2042 if (ti_locked == TI_RLOCKED) {
2043 INP_INFO_RUNLOCK(&V_tcbinfo);
2045 ti_locked = TI_UNLOCKED;
2047 TCPSTAT_INC(tcps_predack);
2050 * "bad retransmit" recovery.
2052 if (tp->t_rxtshift == 1 &&
2053 tp->t_flags & TF_PREVVALID &&
2054 (int)(ticks - tp->t_badrxtwin) < 0) {
2055 cc_cong_signal(tp, th, CC_RTO_ERR);
2059 * Recalculate the transmit timer / rtt.
2061 * Some boxes send broken timestamp replies
2062 * during the SYN+ACK phase, ignore
2063 * timestamps of 0 or we could calculate a
2064 * huge RTT and blow up the retransmit timer.
2066 if ((to->to_flags & TOF_TS) != 0 &&
2070 t = tcp_ts_getticks() - to->to_tsecr;
2071 if (!tp->t_rttlow || tp->t_rttlow > t)
2074 TCP_TS_TO_TICKS(t) + 1);
2075 } else if (tp->t_rtttime &&
2076 SEQ_GT(th->th_ack, tp->t_rtseq)) {
2077 if (!tp->t_rttlow ||
2078 tp->t_rttlow > ticks - tp->t_rtttime)
2079 tp->t_rttlow = ticks - tp->t_rtttime;
2081 ticks - tp->t_rtttime);
/*
 * NOTE(review): winup_only is set in a line elided from this extract;
 * nonzero appears to mean "window update only, no data acked" --
 * confirm against the full source.
 */
2083 if (winup_only == 0) {
2084 acked = BYTES_THIS_ACK(tp, th);
2087 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
2088 hhook_run_tcp_est_in(tp, th, to);
2091 TCPSTAT_ADD(tcps_rcvackbyte, acked);
2092 sbdrop(&so->so_snd, acked);
2093 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
2094 SEQ_LEQ(th->th_ack, tp->snd_recover))
2095 tp->snd_recover = th->th_ack - 1;
2098 * Let the congestion control algorithm update
2099 * congestion control related information. This
2100 * typically means increasing the congestion
2103 cc_ack_received(tp, th, nsegs, CC_ACK);
2105 tp->snd_una = th->th_ack;
2109 * If all outstanding data are acked, stop
2110 * retransmit timer, otherwise restart timer
2111 * using current (possibly backed-off) value.
2112 * If process is waiting for space,
2113 * wakeup/selwakeup/signal. If data
2114 * are ready to send, let tcp_output
2115 * decide between more output or persist.
2118 if (so->so_options & SO_DEBUG)
2119 tcp_trace(TA_INPUT, ostate, tp,
2120 (void *)tcp_saveipgen,
2123 TCP_PROBE3(debug__input, tp, th, m);
2125 if (tp->snd_una == tp->snd_max)
2126 tcp_timer_activate(tp, TT_REXMT, 0);
2127 else if (!tcp_timer_active(tp, TT_PERSIST))
2128 tcp_timer_activate(tp, TT_REXMT,
2130 /* Wake up the socket if we have room to write more */
2134 * Window update only, just free the mbufs and
2135 * send out whatever we can.
2139 if (sbavail(&so->so_snd))
2140 (void) tcp_output(tp);
2141 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
2142 __func__, ti_locked));
2143 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
2144 INP_WLOCK_ASSERT(tp->t_inpcb);
2146 if (tp->t_flags & TF_DELACK) {
2147 tp->t_flags &= ~TF_DELACK;
2148 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
2150 INP_WUNLOCK(tp->t_inpcb);
2155 * This tcp-do-segment concentrates on making the fastest
2156 * ack processing path. It does not have a fast-path for
2157 * data (it possibly could which would then eliminate the
2158 * need for fast-slow above). For a content distributor having
2159 * large outgoing elephants and very very little coming in
2160 * having no fastpath for data does not really help (since you
2161 * don't get much data in). The most important thing is
2162 * processing ack's quickly and getting the rest of the data
2163 * output to the peer as quickly as possible. This routine
2164 * seems to be about an overall 3% faster then the old
2165 * tcp_do_segment and keeps us in the fast-path for packets
2166 * much more (by allowing window updates to also stay in the fastpath).
/*
 * tcp_do_segment_fastack() -- tfb_tcp_do_segment handler for the
 * "fastack" function block.  Front-end validation is identical to
 * tcp_do_segment_fastslow() (locking asserts, SYN|FIN drop policy,
 * SYN-SENT ACK sanity, ECN, option parsing, timestamp sanity), but its
 * only fast path is the pure-ACK one: when header prediction succeeds
 * it calls tcp_fastack(), and on prediction failure (or when
 * tcp_fastack() returns 0) it falls back to tcp_do_slowpath().
 */
2169 tcp_do_segment_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
2170 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
2176 struct in_conninfo *inc;
2179 thflags = th->th_flags;
2180 inc = &tp->t_inpcb->inp_inc;
2182 * If this is either a state-changing packet or current state isn't
2183 * established, we require a write lock on tcbinfo. Otherwise, we
2184 * allow the tcbinfo to be in either alocked or unlocked, as the
2185 * caller may have unnecessarily acquired a write lock due to a race.
2187 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
2188 tp->t_state != TCPS_ESTABLISHED) {
2189 KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
2190 "SYN/FIN/RST/!EST", __func__, ti_locked));
2191 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
2194 if (ti_locked == TI_RLOCKED) {
2195 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
2197 KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
2198 "ti_locked: %d", __func__, ti_locked));
2199 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
2203 INP_WLOCK_ASSERT(tp->t_inpcb);
2204 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
2206 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
/* Optionally drop forged SYN|FIN segments, controlled by sysctl. */
2209 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
2210 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
2211 log(LOG_DEBUG, "%s; %s: "
2212 "SYN|FIN segment ignored (based on "
2213 "sysctl setting)\n", s, __func__);
2216 if (ti_locked == TI_RLOCKED) {
2217 INP_INFO_RUNLOCK(&V_tcbinfo);
2219 INP_WUNLOCK(tp->t_inpcb);
2225 * If a segment with the ACK-bit set arrives in the SYN-SENT state
2226 * check SEQ.ACK first.
2228 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
2229 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
2230 tcp_dropwithreset(m, th, tp, tlen, BANDLIM_UNLIMITED);
2231 if (ti_locked == TI_RLOCKED) {
2232 INP_INFO_RUNLOCK(&V_tcbinfo);
2234 INP_WUNLOCK(tp->t_inpcb);
2238 tp->sackhint.last_sack_ack = 0;
2241 * Segment received on connection.
2242 * Reset idle time and keep-alive timer.
2243 * XXX: This should be done after segment
2244 * validation to ignore broken/spoofed segs.
2246 tp->t_rcvtime = ticks;
2249 * Unscale the window into a 32-bit value.
2250 * For the SYN_SENT state the scale is zero.
2252 tiwin = th->th_win << tp->snd_scale;
2255 * TCP ECN processing.
2257 if (tp->t_flags & TF_ECN_PERMIT) {
2258 if (thflags & TH_CWR)
2259 tp->t_flags &= ~TF_ECN_SND_ECE;
2260 switch (iptos & IPTOS_ECN_MASK) {
2262 tp->t_flags |= TF_ECN_SND_ECE;
2263 TCPSTAT_INC(tcps_ecn_ce);
2265 case IPTOS_ECN_ECT0:
2266 TCPSTAT_INC(tcps_ecn_ect0);
2268 case IPTOS_ECN_ECT1:
2269 TCPSTAT_INC(tcps_ecn_ect1);
2272 /* Congestion experienced. */
2273 if (thflags & TH_ECE) {
2274 cc_cong_signal(tp, th, CC_ECN);
2279 * Parse options on any incoming segment.
2281 tcp_dooptions(&to, (u_char *)(th + 1),
2282 (th->th_off << 2) - sizeof(struct tcphdr),
2283 (thflags & TH_SYN) ? TO_SYN : 0);
2286 * If echoed timestamp is later than the current time,
2287 * fall back to non RFC1323 RTT calculation. Normalize
2288 * timestamp if syncookies were used when this connection
2291 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
2292 to.to_tsecr -= tp->ts_offset;
2293 if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
2298 * Process options only when we get SYN/ACK back. The SYN case
2299 * for incoming connections is handled in tcp_syncache.
2300 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
2301 * or <SYN,ACK>) segment itself is never scaled.
2302 * XXX this is traditional behavior, may need to be cleaned up.
2304 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
2305 if ((to.to_flags & TOF_SCALE) &&
2306 (tp->t_flags & TF_REQ_SCALE)) {
2307 tp->t_flags |= TF_RCVD_SCALE;
2308 tp->snd_scale = to.to_wscale;
2311 * Initial send window. It will be updated with
2312 * the next incoming segment to the scaled value.
2314 tp->snd_wnd = th->th_win;
2315 if (to.to_flags & TOF_TS) {
2316 tp->t_flags |= TF_RCVD_TSTMP;
2317 tp->ts_recent = to.to_tsval;
2318 tp->ts_recent_age = tcp_ts_getticks();
2320 if (to.to_flags & TOF_MSS)
2321 tcp_mss(tp, to.to_mss);
2322 if ((tp->t_flags & TF_SACK_PERMIT) &&
2323 (to.to_flags & TOF_SACKPERM) == 0)
2324 tp->t_flags &= ~TF_SACK_PERMIT;
2328 * If timestamps were negotiated during SYN/ACK they should
2329 * appear on every segment during this session and vice versa.
2331 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
2332 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
2333 log(LOG_DEBUG, "%s; %s: Timestamp missing, "
2334 "no action\n", s, __func__);
2338 if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
2339 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
2340 log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
2341 "no action\n", s, __func__);
2347 * Header prediction: check for the two common cases
2348 * of a uni-directional data xfer. If the packet has
2349 * no control flags, is in-sequence, the window didn't
2350 * change and we're not retransmitting, it's a
2351 * candidate. If the length is zero and the ack moved
2352 * forward, we're the sender side of the xfer. Just
2353 * free the data acked & wake any higher level process
2354 * that was blocked waiting for space. If the length
2355 * is non-zero and the ack didn't move, we're the
2356 * receiver side. If we're getting packets in-order
2357 * (the reassembly queue is empty), add the data to
2358 * the socket buffer and note that we need a delayed ack.
2359 * Make sure that the hidden state-flags are also off.
2360 * Since we check for TCPS_ESTABLISHED first, it can only
2363 if (__predict_true(tp->t_state == TCPS_ESTABLISHED) &&
2364 __predict_true(((to.to_flags & TOF_SACK) == 0)) &&
2365 __predict_true(tlen == 0) &&
2366 __predict_true((thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK) &&
2367 __predict_true(LIST_EMPTY(&tp->t_segq)) &&
2368 __predict_true(th->th_seq == tp->rcv_nxt)) {
2369 if (tcp_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
2370 ti_locked, tiwin)) {
/* tcp_fastack() could not consume the segment: use the slow path. */
2374 tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
2375 ti_locked, tiwin, thflags);
/*
 * Function-block descriptor for the "fastslow" variant; registered
 * with the TCP function-block framework by tcp_addfastpaths().
 */
2378 struct tcp_function_block __tcp_fastslow = {
2379 .tfb_tcp_block_name = "fastslow",
2380 .tfb_tcp_output = tcp_output,
2381 .tfb_tcp_do_segment = tcp_do_segment_fastslow,
2382 .tfb_tcp_ctloutput = tcp_default_ctloutput,
/*
 * Function-block descriptor for the "fastack" variant; registered
 * with the TCP function-block framework by tcp_addfastpaths().
 */
2385 struct tcp_function_block __tcp_fastack = {
2386 .tfb_tcp_block_name = "fastack",
2387 .tfb_tcp_output = tcp_output,
2388 .tfb_tcp_do_segment = tcp_do_segment_fastack,
2389 .tfb_tcp_ctloutput = tcp_default_ctloutput
/*
 * tcp_addfastpaths() -- module event handler for the TCP fastpath
 * function blocks.  On load it registers the "fastack" and then the
 * "fastslow" block, backing out the first registration if the second
 * fails.  On unload/quiesce it checks tfb_refcnt on both blocks and
 * refuses while either is still referenced (the exact return in that
 * branch is in a line elided from this extract -- presumably EBUSY;
 * confirm against the full source), otherwise it deregisters both.
 * Unhandled event types return EOPNOTSUPP.
 */
2393 tcp_addfastpaths(module_t mod, int type, void *data)
2399 err = register_tcp_functions(&__tcp_fastack, M_WAITOK);
2401 printf("Failed to register fastack module -- err:%d\n", err);
2404 err = register_tcp_functions(&__tcp_fastslow, M_WAITOK);
2406 printf("Failed to register fastslow module -- err:%d\n", err);
/* Keep load atomic: back out the fastack registration on failure. */
2407 deregister_tcp_functions(&__tcp_fastack);
/* Either block still in use by a connection? Refuse to unload. */
2412 if ((__tcp_fastslow.tfb_refcnt) ||( __tcp_fastack.tfb_refcnt)) {
2417 err = deregister_tcp_functions(&__tcp_fastack);
2420 err = deregister_tcp_functions(&__tcp_fastslow);
2426 return (EOPNOTSUPP);
/*
 * Kernel module glue: declare the "tcp_fastpaths" module with
 * tcp_addfastpaths() as its event handler, initialized at
 * SI_SUB_PROTO_DOMAIN / SI_ORDER_ANY.
 */
2431 static moduledata_t new_tcp_fastpaths = {
2432 .name = "tcp_fastpaths",
2433 .evhand = tcp_addfastpaths,
2437 MODULE_VERSION(kern_tcpfastpaths, 1);
2438 DECLARE_MODULE(kern_tcpfastpaths, new_tcp_fastpaths, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);