2 * Copyright (c) 2001 McAfee, Inc.
5 * This software was developed for the FreeBSD Project by Jonathan Lemon
6 * and McAfee Research, the Security Research Division of McAfee, Inc. under
7 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
8 * DARPA CHATS research program.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 #include "opt_inet6.h"
36 #include "opt_ipsec.h"
38 #include "opt_tcpdebug.h"
39 #include "opt_tcp_sack.h"
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/sysctl.h>
45 #include <sys/malloc.h>
49 #include <sys/proc.h> /* for proc0 declaration */
50 #include <sys/random.h>
51 #include <sys/socket.h>
52 #include <sys/socketvar.h>
55 #include <net/route.h>
57 #include <netinet/in.h>
58 #include <netinet/in_systm.h>
59 #include <netinet/ip.h>
60 #include <netinet/in_var.h>
61 #include <netinet/in_pcb.h>
62 #include <netinet/ip_var.h>
64 #include <netinet/ip6.h>
65 #include <netinet/icmp6.h>
66 #include <netinet6/nd6.h>
67 #include <netinet6/ip6_var.h>
68 #include <netinet6/in6_pcb.h>
70 #include <netinet/tcp.h>
72 #include <netinet/tcpip.h>
74 #include <netinet/tcp_fsm.h>
75 #include <netinet/tcp_seq.h>
76 #include <netinet/tcp_timer.h>
77 #include <netinet/tcp_var.h>
79 #include <netinet/tcp_debug.h>
82 #include <netinet6/tcp6_var.h>
86 #include <netinet6/ipsec.h>
88 #include <netinet6/ipsec6.h>
93 #include <netipsec/ipsec.h>
95 #include <netipsec/ipsec6.h>
97 #include <netipsec/key.h>
100 #include <machine/in_cksum.h>
/*
 * NOTE(review): this file is a gappy numbered listing of FreeBSD's
 * netinet/tcp_syncache.c; many original lines are absent between the
 * embedded line numbers (e.g. line 105, the variable argument of this
 * SYSCTL_INT call, is missing here).  Annotations below describe only
 * what the visible fragments show.
 */
/* Tunable: when non-zero, fall back to SYN cookies on syncache overflow. */
103 static int tcp_syncookies = 1;
104 SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW,
106 "Use TCP SYN cookies if the syncache overflows");
/* Forward declarations for the syncache implementation below. */
108 static void syncache_drop(struct syncache *, struct syncache_head *);
109 static void syncache_free(struct syncache *);
110 static void syncache_insert(struct syncache *, struct syncache_head *);
/* NOTE(review): syncache_lookup is declared without `static` here while the
 * other helpers are static — possibly a transcription gap; confirm against
 * the full source. */
111 struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **);
/* Two alternative prototypes for syncache_respond appear (with and without a
 * struct socket * argument).  The conditional-compilation lines selecting
 * between them (presumably #ifdef TCPDEBUG) are among the missing lines —
 * TODO confirm. */
113 static int syncache_respond(struct syncache *, struct mbuf *, struct socket *);
115 static int syncache_respond(struct syncache *, struct mbuf *);
117 static struct socket *syncache_socket(struct syncache *, struct socket *,
119 static void syncache_timer(void *);
120 static u_int32_t syncookie_generate(struct syncache *, u_int32_t *);
121 static struct syncache *syncookie_lookup(struct in_conninfo *,
122 struct tcphdr *, struct socket *);
125 * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies.
126 * 3 retransmits corresponds to a timeout of (1 + 2 + 4 + 8 == 15) seconds,
127 * the odds are that the user has given up attempting to connect by then.
129 #define SYNCACHE_MAXREXMTS 3
131 /* Arbitrary values */
132 #define TCP_SYNCACHE_HASHSIZE 512
133 #define TCP_SYNCACHE_BUCKETLIMIT 30
/*
 * Global syncache state: a hash table of buckets plus one timer queue (and
 * one callout) per retransmit slot.  NOTE(review): several struct fields
 * referenced by the sysctls below (bucket_limit, cache_limit, cache_count,
 * hashsize, hashmask, rexmt_limit, hash_secret, zone) fall in the missing
 * lines 137-144 of this listing.
 */
135 struct tcp_syncache {
136 struct syncache_head *hashbase;
145 TAILQ_HEAD(, syncache) timerq[SYNCACHE_MAXREXMTS + 1];
146 struct callout tt_timerq[SYNCACHE_MAXREXMTS + 1];
148 static struct tcp_syncache tcp_syncache;
/* sysctl tree: net.inet.tcp.syncache.* (RDTUN entries are boot-time tunables). */
150 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache");
152 SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN,
153 &tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache");
155 SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
156 &tcp_syncache.cache_limit, 0, "Overall entry limit for syncache");
158 SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD,
159 &tcp_syncache.cache_count, 0, "Current number of entries in syncache");
161 SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
162 &tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable");
164 SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW,
165 &tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions");
167 static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");
/*
 * Bucket selection: mix a boot-time random secret with the foreign address
 * and both ports, then mask down to the (power-of-2) table size.
 */
169 #define SYNCACHE_HASH(inc, mask) \
170 ((tcp_syncache.hash_secret ^ \
171 (inc)->inc_faddr.s_addr ^ \
172 ((inc)->inc_faddr.s_addr >> 16) ^ \
173 (inc)->inc_fport ^ (inc)->inc_lport) & mask)
/* IPv6 variant: hash the first and last 32-bit words of the foreign address. */
175 #define SYNCACHE_HASH6(inc, mask) \
176 ((tcp_syncache.hash_secret ^ \
177 (inc)->inc6_faddr.s6_addr32[0] ^ \
178 (inc)->inc6_faddr.s6_addr32[3] ^ \
179 (inc)->inc_fport ^ (inc)->inc_lport) & mask)
/* 4-tuple equality tests for IPv4 (field-wise) and IPv6 (flat memcmp). */
181 #define ENDPTS_EQ(a, b) ( \
182 (a)->ie_fport == (b)->ie_fport && \
183 (a)->ie_lport == (b)->ie_lport && \
184 (a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr && \
185 (a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr \
188 #define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0)
/*
 * Move entry sc to retransmit slot `slot`: record the slot and its absolute
 * deadline (exponential backoff via tcp_backoff[]), append to that slot's
 * timer queue, and arm the per-slot callout if it is not already running.
 * NOTE(review): the closing `} while (0)` of this do-block is in the
 * missing lines.
 */
190 #define SYNCACHE_TIMEOUT(sc, slot) do { \
191 sc->sc_rxtslot = (slot); \
192 sc->sc_rxttime = ticks + TCPTV_RTOBASE * tcp_backoff[(slot)]; \
193 TAILQ_INSERT_TAIL(&tcp_syncache.timerq[(slot)], sc, sc_timerq); \
194 if (!callout_active(&tcp_syncache.tt_timerq[(slot)])) \
195 callout_reset(&tcp_syncache.tt_timerq[(slot)], \
196 TCPTV_RTOBASE * tcp_backoff[(slot)], \
197 syncache_timer, (void *)((intptr_t)(slot))); \
/*
 * Release a syncache entry: free any saved IP source-route options, then
 * return the entry to its UMA zone.  (Lines guarding the m_free call and the
 * function's braces are in the missing lines of this listing.)
 */
201 syncache_free(struct syncache *sc)
204 (void) m_free(sc->sc_ipopts);
206 uma_zfree(tcp_syncache.zone, sc);
/*
 * Body of syncache_init (the signature is in the missing lines): set
 * defaults, apply boot-time tunables, allocate and initialize the hash
 * table and the per-slot timer queues, and create the UMA zone.
 */
214 tcp_syncache.cache_count = 0;
215 tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
216 tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT;
217 tcp_syncache.cache_limit =
218 tcp_syncache.hashsize * tcp_syncache.bucket_limit;
219 tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS;
/* Random secret keys the SYNCACHE_HASH macros against bucket-targeting. */
220 tcp_syncache.hash_secret = arc4random();
222 TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize",
223 &tcp_syncache.hashsize);
224 TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit",
225 &tcp_syncache.cache_limit);
226 TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit",
227 &tcp_syncache.bucket_limit);
/* The hashmask trick below requires a power-of-2 table size; reset if not. */
228 if (!powerof2(tcp_syncache.hashsize) || tcp_syncache.hashsize == 0) {
229 printf("WARNING: syncache hash size is not a power of 2.\n");
230 tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
232 tcp_syncache.hashmask = tcp_syncache.hashsize - 1;
234 /* Allocate the hash table. */
235 MALLOC(tcp_syncache.hashbase, struct syncache_head *,
236 tcp_syncache.hashsize * sizeof(struct syncache_head),
237 M_SYNCACHE, M_WAITOK);
239 /* Initialize the hash buckets. */
240 for (i = 0; i < tcp_syncache.hashsize; i++) {
241 TAILQ_INIT(&tcp_syncache.hashbase[i].sch_bucket);
242 tcp_syncache.hashbase[i].sch_length = 0;
245 /* Initialize the timer queues. */
246 for (i = 0; i <= SYNCACHE_MAXREXMTS; i++) {
247 TAILQ_INIT(&tcp_syncache.timerq[i]);
248 callout_init(&tcp_syncache.tt_timerq[i], NET_CALLOUT_MPSAFE);
252 * Allocate the syncache entries. Allow the zone to allocate one
253 * more entry than cache limit, so a new entry can bump out an
/* ...older one when the cache is full (continuation in missing lines). */
256 tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache),
257 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
258 uma_zone_set_max(tcp_syncache.zone, tcp_syncache.cache_limit);
/* Reserve the one extra zone slot so overflow eviction can always proceed. */
259 tcp_syncache.cache_limit -= 1;
/*
 * Insert entry sc into bucket sch, evicting an old entry first if either
 * the bucket or the whole cache is at its limit.  Caller must hold the
 * tcbinfo write lock (asserted below).  K&R-style parameter declarations;
 * the first parameter's declaration line is missing from this listing.
 */
263 syncache_insert(sc, sch)
265 struct syncache_head *sch;
267 struct syncache *sc2;
270 INP_INFO_WLOCK_ASSERT(&tcbinfo);
273 * Make sure that we don't overflow the per-bucket
274 * limit or the total cache size limit.
276 if (sch->sch_length >= tcp_syncache.bucket_limit) {
278 * The bucket is full, toss the oldest element.
280 sc2 = TAILQ_FIRST(&sch->sch_bucket);
/* NOTE(review): ts_recent is stamped with `ticks` on eviction — presumably
 * so a later RST/response path sees a fresh timestamp; confirm intent. */
281 sc2->sc_tp->ts_recent = ticks;
282 syncache_drop(sc2, sch);
283 tcpstat.tcps_sc_bucketoverflow++;
284 } else if (tcp_syncache.cache_count >= tcp_syncache.cache_limit) {
286 * The cache is full. Toss the oldest entry in the
287 * entire cache. This is the front entry in the
288 * first non-empty timer queue with the largest
/* ...retransmit slot (search runs from the highest slot downward). */
291 for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) {
292 sc2 = TAILQ_FIRST(&tcp_syncache.timerq[i]);
/* (The non-empty check / break inside this loop is in the missing lines.) */
296 sc2->sc_tp->ts_recent = ticks;
297 syncache_drop(sc2, NULL);
298 tcpstat.tcps_sc_cacheoverflow++;
301 /* Initialize the entry's timer. */
302 SYNCACHE_TIMEOUT(sc, 0);
304 /* Put it into the bucket. */
305 TAILQ_INSERT_TAIL(&sch->sch_bucket, sc, sc_hash);
307 tcp_syncache.cache_count++;
308 tcpstat.tcps_sc_added++;
/*
 * Remove entry sc from its bucket (recomputing the bucket from the
 * connection info when sch is NULL) and from its timer queue, stopping the
 * per-slot callout when the queue becomes empty.  Caller holds the tcbinfo
 * write lock.  The final syncache_free(sc) call is in the missing lines.
 */
312 syncache_drop(sc, sch)
314 struct syncache_head *sch;
316 INP_INFO_WLOCK_ASSERT(&tcbinfo);
/* sch == NULL: derive the bucket via the same hash used at insert time. */
320 if (sc->sc_inc.inc_isipv6) {
321 sch = &tcp_syncache.hashbase[
322 SYNCACHE_HASH6(&sc->sc_inc, tcp_syncache.hashmask)];
326 sch = &tcp_syncache.hashbase[
327 SYNCACHE_HASH(&sc->sc_inc, tcp_syncache.hashmask)];
331 TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
333 tcp_syncache.cache_count--;
335 TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot], sc, sc_timerq);
336 if (TAILQ_EMPTY(&tcp_syncache.timerq[sc->sc_rxtslot]))
337 callout_stop(&tcp_syncache.tt_timerq[sc->sc_rxtslot]);
343 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
344 * If we have retransmitted an entry the maximum number of times, expire it.
/* Callout handler; xslot encodes the retransmit slot as an intptr_t. */
347 syncache_timer(xslot)
350 intptr_t slot = (intptr_t)xslot;
351 struct syncache *sc, *nsc;
354 INP_INFO_WLOCK(&tcbinfo);
/* Bail if the callout was rescheduled or stopped while we waited for the lock. */
355 if (callout_pending(&tcp_syncache.tt_timerq[slot]) ||
356 !callout_active(&tcp_syncache.tt_timerq[slot])) {
357 /* XXX can this happen? */
358 INP_INFO_WUNLOCK(&tcbinfo);
361 callout_deactivate(&tcp_syncache.tt_timerq[slot]);
363 nsc = TAILQ_FIRST(&tcp_syncache.timerq[slot]);
364 while (nsc != NULL) {
/* Entries are queued in deadline order; stop at the first one not yet due. */
365 if (ticks < nsc->sc_rxttime)
/* (The break and `sc = nsc` assignment are in the missing lines.) */
368 inp = sc->sc_tp->t_inpcb;
/* Expire: out of retransmits, or the owning PCB has gone away/been reused. */
369 if (slot == SYNCACHE_MAXREXMTS ||
370 slot >= tcp_syncache.rexmt_limit ||
371 inp == NULL || inp->inp_gencnt != sc->sc_inp_gencnt) {
372 nsc = TAILQ_NEXT(sc, sc_timerq);
373 syncache_drop(sc, NULL);
374 tcpstat.tcps_sc_stale++;
378 * syncache_respond() may call back into the syncache to
379 * to modify another entry, so do not obtain the next
380 * entry on the timer chain until it has completed.
/* Two respond variants; selection is presumably #ifdef TCPDEBUG (missing). */
383 (void) syncache_respond(sc, NULL, NULL);
385 (void) syncache_respond(sc, NULL);
387 nsc = TAILQ_NEXT(sc, sc_timerq);
388 tcpstat.tcps_sc_retransmitted++;
/* Promote the entry to the next backoff slot. */
389 TAILQ_REMOVE(&tcp_syncache.timerq[slot], sc, sc_timerq);
390 SYNCACHE_TIMEOUT(sc, slot + 1);
/* Re-arm this slot's callout for the next pending entry, if any remain. */
393 callout_reset(&tcp_syncache.tt_timerq[slot],
394 nsc->sc_rxttime - ticks, syncache_timer, (void *)(slot));
395 INP_INFO_WUNLOCK(&tcbinfo);
399 * Find an entry in the syncache.
/*
 * Returns the matching entry (return statements are in the missing lines)
 * and stores the bucket head through *schp so callers can later drop or
 * reinsert without rehashing.
 */
402 syncache_lookup(inc, schp)
403 struct in_conninfo *inc;
404 struct syncache_head **schp;
407 struct syncache_head *sch;
409 INP_INFO_WLOCK_ASSERT(&tcbinfo);
412 if (inc->inc_isipv6) {
413 sch = &tcp_syncache.hashbase[
414 SYNCACHE_HASH6(inc, tcp_syncache.hashmask)];
416 TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
417 if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
423 sch = &tcp_syncache.hashbase[
424 SYNCACHE_HASH(inc, tcp_syncache.hashmask)];
426 TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
/* Skip IPv6 entries that happen to share an IPv4 bucket. */
428 if (sc->sc_inc.inc_isipv6)
431 if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
439 * This function is called when we get a RST for a
440 * non-existent connection, so that we can see if the
441 * connection is in the syn cache. If it is, zap it.
444 syncache_chkrst(inc, th)
445 struct in_conninfo *inc;
449 struct syncache_head *sch;
451 INP_INFO_WLOCK_ASSERT(&tcbinfo);
453 sc = syncache_lookup(inc, &sch);
/* (NULL check / early return after lookup is in the missing lines.) */
457 * If the RST bit is set, check the sequence number to see
458 * if this is a valid reset segment.
460 * In all states except SYN-SENT, all reset (RST) segments
461 * are validated by checking their SEQ-fields. A reset is
462 * valid if its sequence number is in the window.
464 * The sequence number in the reset segment is normally an
465 * echo of our outgoing acknowledgement numbers, but some hosts
466 * send a reset with the sequence number at the rightmost edge
467 * of our receive window, and we have to handle this case.
/* Accept the RST only if its SEQ lies within [irs, irs + wnd]. */
469 if (SEQ_GEQ(th->th_seq, sc->sc_irs) &&
470 SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
471 syncache_drop(sc, sch);
472 tcpstat.tcps_sc_reset++;
/*
 * NOTE(review): fragment of a function whose signature is in the missing
 * lines — from the tcps_sc_badack counter this is presumably
 * syncache_badack(): drop the cached entry on a bad ACK.  Confirm against
 * the full source.
 */
478 struct in_conninfo *inc;
481 struct syncache_head *sch;
483 INP_INFO_WLOCK_ASSERT(&tcbinfo);
485 sc = syncache_lookup(inc, &sch);
487 syncache_drop(sc, sch);
488 tcpstat.tcps_sc_badack++;
/*
 * Handle an ICMP unreachable error for a cached (embryonic) connection:
 * validate it against the entry, tolerate the first error, and drop the
 * entry on a repeat once it has been retransmitted several times.
 */
493 syncache_unreach(inc, th)
494 struct in_conninfo *inc;
498 struct syncache_head *sch;
500 INP_INFO_WLOCK_ASSERT(&tcbinfo);
502 sc = syncache_lookup(inc, &sch);
506 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
507 if (ntohl(th->th_seq) != sc->sc_iss)
511 * If we've retransmitted 3 times and this is our second error,
512 * we remove the entry. Otherwise, we allow it to continue on.
513 * This prevents us from incorrectly nuking an entry during a
514 * spurious network outage.
/* First error, or fewer than 3 retransmits so far: just flag and keep it. */
518 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtslot < 3) {
519 sc->sc_flags |= SCF_UNREACH;
522 syncache_drop(sc, sch);
523 tcpstat.tcps_sc_unreach++;
527 * Build a new TCP socket structure from a syncache entry.
529 static struct socket *
530 syncache_socket(sc, lso, m)
535 struct inpcb *inp = NULL;
540 INP_INFO_WLOCK_ASSERT(&tcbinfo);
543 * Ok, create the full blown connection, and set things up
544 * as they would have been set up if we had created the
545 * connection when the SYN arrived. If we can't create
546 * the connection, abort it.
548 so = sonewconn(lso, SS_ISCONNECTED);
/* (so == NULL branch; failure path lines are missing here.) */
551 * Drop the connection; we will send a RST if the peer
552 * retransmits the ACK,
554 tcpstat.tcps_listendrop++;
/* MAC-framework label inheritance (presumably under #ifdef MAC — missing). */
559 mac_set_socket_peer_from_mbuf(m, so);
567 * Insert new socket into hash list.
569 inp->inp_inc.inc_isipv6 = sc->sc_inc.inc_isipv6;
571 if (sc->sc_inc.inc_isipv6) {
572 inp->in6p_laddr = sc->sc_inc.inc6_laddr;
574 inp->inp_vflag &= ~INP_IPV6;
575 inp->inp_vflag |= INP_IPV4;
577 inp->inp_laddr = sc->sc_inc.inc_laddr;
581 inp->inp_lport = sc->sc_inc.inc_lport;
582 if (in_pcbinshash(inp) != 0) {
584 * Undo the assignments above if we failed to
585 * put the PCB on the hash lists.
588 if (sc->sc_inc.inc_isipv6)
589 inp->in6p_laddr = in6addr_any;
592 inp->inp_laddr.s_addr = INADDR_ANY;
/* IPsec policy inheritance; the two variants below are presumably selected
 * by #ifdef IPSEC / FAST_IPSEC lines that are missing from this listing. */
597 /* copy old policy into new socket's */
598 if (ipsec_copy_pcbpolicy(sotoinpcb(lso)->inp_sp, inp->inp_sp))
599 printf("syncache_expand: could not copy policy\n");
602 /* copy old policy into new socket's */
603 if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp))
604 printf("syncache_expand: could not copy policy\n");
607 if (sc->sc_inc.inc_isipv6) {
608 struct inpcb *oinp = sotoinpcb(lso);
609 struct in6_addr laddr6;
610 struct sockaddr_in6 sin6;
612 * Inherit socket options from the listening socket.
613 * Note that in6p_inputopts are not (and should not be)
614 * copied, since it stores previously received options and is
615 * used to detect if each new option is different than the
616 * previous one and hence should be passed to a user.
617 * If we copied in6p_inputopts, a user would not be able to
618 * receive options just after calling the accept system call.
620 inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS;
621 if (oinp->in6p_outputopts)
622 inp->in6p_outputopts =
623 ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT);
625 sin6.sin6_family = AF_INET6;
626 sin6.sin6_len = sizeof(sin6);
627 sin6.sin6_addr = sc->sc_inc.inc6_faddr;
628 sin6.sin6_port = sc->sc_inc.inc_fport;
629 sin6.sin6_flowinfo = sin6.sin6_scope_id = 0;
/* Save laddr so it can be restored if in6_pcbconnect fails. */
630 laddr6 = inp->in6p_laddr;
631 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
632 inp->in6p_laddr = sc->sc_inc.inc6_laddr;
633 if (in6_pcbconnect(inp, (struct sockaddr *)&sin6,
635 inp->in6p_laddr = laddr6;
638 /* Override flowlabel from in6_pcbconnect. */
639 inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK;
640 inp->in6p_flowinfo |= sc->sc_flowlabel;
644 struct in_addr laddr;
645 struct sockaddr_in sin;
/* Prefer source-route options from this segment; fall back to the ones
 * remembered in the syncache entry (ownership transfers, hence NULLing). */
647 inp->inp_options = ip_srcroute(m);
648 if (inp->inp_options == NULL) {
649 inp->inp_options = sc->sc_ipopts;
650 sc->sc_ipopts = NULL;
653 sin.sin_family = AF_INET;
654 sin.sin_len = sizeof(sin);
655 sin.sin_addr = sc->sc_inc.inc_faddr;
656 sin.sin_port = sc->sc_inc.inc_fport;
657 bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero));
658 laddr = inp->inp_laddr;
659 if (inp->inp_laddr.s_addr == INADDR_ANY)
660 inp->inp_laddr = sc->sc_inc.inc_laddr;
661 if (in_pcbconnect(inp, (struct sockaddr *)&sin,
663 inp->inp_laddr = laddr;
/* Initialize the tcpcb as if the 3-way handshake state machine had run. */
669 tp->t_state = TCPS_SYN_RECEIVED;
670 tp->iss = sc->sc_iss;
671 tp->irs = sc->sc_irs;
674 tp->snd_wl1 = sc->sc_irs;
675 tp->rcv_up = sc->sc_irs + 1;
676 tp->rcv_wnd = sc->sc_wnd;
677 tp->rcv_adv += tp->rcv_wnd;
679 tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY);
680 if (sc->sc_flags & SCF_NOOPT)
681 tp->t_flags |= TF_NOOPT;
682 if (sc->sc_flags & SCF_WINSCALE) {
683 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
684 tp->requested_s_scale = sc->sc_requested_s_scale;
685 tp->request_r_scale = sc->sc_request_r_scale;
687 if (sc->sc_flags & SCF_TIMESTAMP) {
688 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
689 tp->ts_recent = sc->sc_tsrecent;
690 tp->ts_recent_age = ticks;
693 if (sc->sc_flags & SCF_SIGNATURE)
694 tp->t_flags |= TF_SIGNATURE;
696 if (sc->sc_flags & SCF_SACK) {
698 tp->t_flags |= TF_SACK_PERMIT;
701 * Set up MSS and get cached values from tcp_hostcache.
702 * This might overwrite some of the defaults we just set.
704 tcp_mss(tp, sc->sc_peer_mss);
707 * If the SYN,ACK was retransmitted, reset cwnd to 1 segment.
709 if (sc->sc_rxtslot != 0)
710 tp->snd_cwnd = tp->t_maxseg;
/* Arm the keepalive (connection-establishment) timer for the new tcpcb. */
711 callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
715 tcpstat.tcps_accepts++;
727 * This function gets called when we receive an ACK for a
728 * socket in the LISTEN state. We look up the connection
729 * in the syncache, and if it's there, we pull it out of
730 * the cache and turn it into a full-blown connection in
731 * the SYN-RECEIVED state.
734 syncache_expand(inc, th, sop, m)
735 struct in_conninfo *inc;
741 struct syncache_head *sch;
744 INP_INFO_WLOCK_ASSERT(&tcbinfo);
746 sc = syncache_lookup(inc, &sch);
/* (sc == NULL branch begins here; its `if` line is among the missing lines.) */
749 * There is no syncache entry, so see if this ACK is
750 * a returning syncookie. To do this, first:
751 * A. See if this socket has had a syncache entry dropped in
752 * the past. We don't want to accept a bogus syncookie
753 * if we've never received a SYN.
754 * B. check that the syncookie is valid. If it is, then
755 * cobble up a fake syncache entry, and return.
759 sc = syncookie_lookup(inc, th, *sop);
763 tcpstat.tcps_sc_recvcookie++;
767 * If seg contains an ACK, but not for our SYN/ACK, send a RST.
769 if (th->th_ack != sc->sc_iss + 1) {
775 so = syncache_socket(sc, *sop, m);
/* Socket creation failed: answer with RST|ACK and count the abort. */
779 /* XXXjlemon check this - is this correct? */
780 (void) tcp_respond(NULL, m, m, th,
781 th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK);
783 m_freem(m); /* XXX only needed for above */
784 tcpstat.tcps_sc_aborted++;
786 tcpstat.tcps_sc_completed++;
/* Entry has served its purpose — remove it from the cache. */
791 syncache_drop(sc, sch);
797 * Given a LISTEN socket and an inbound SYN request, add
798 * this to the syn cache, and send back a segment:
799 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
802 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
803 * Doing so would require that we hold onto the data and deliver it
804 * to the application. However, if we are the target of a SYN-flood
805 * DoS attack, an attacker could send data which would eventually
806 * consume all available buffer space if it were ACKed. By not ACKing
807 * the data, we avoid this DoS scenario.
810 syncache_add(inc, to, th, sop, m)
811 struct in_conninfo *inc;
819 struct syncache *sc = NULL;
820 struct syncache_head *sch;
821 struct mbuf *ipopts = NULL;
825 INP_INFO_WLOCK_ASSERT(&tcbinfo);
831 * Remember the IP options, if any.
/* (An #ifdef INET6 presumably guards this isipv6 test — missing lines.) */
834 if (!inc->inc_isipv6)
836 ipopts = ip_srcroute(m);
839 * See if we already have an entry for this connection.
840 * If we do, resend the SYN,ACK, and reset the retransmit timer.
843 * should the syncache be re-initialized with the contents
844 * of the new SYN here (which may have different options?)
846 sc = syncache_lookup(inc, &sch);
/* Duplicate SYN: refresh the existing entry instead of allocating. */
848 tcpstat.tcps_sc_dupsyn++;
851 * If we were remembering a previous source route,
852 * forget it and use the new one we've been given.
855 (void) m_free(sc->sc_ipopts);
856 sc->sc_ipopts = ipopts;
859 * Update timestamp if present.
861 if (sc->sc_flags & SCF_TIMESTAMP)
862 sc->sc_tsrecent = to->to_tsval;
864 * PCB may have changed, pick up new values.
867 sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt;
/* Two respond variants; selection presumably #ifdef TCPDEBUG (missing). */
869 if (syncache_respond(sc, m, so) == 0) {
871 if (syncache_respond(sc, m) == 0) {
873 /* NB: guarded by INP_INFO_WLOCK(&tcbinfo) */
874 TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot],
/* Restart the retransmit timer in the same backoff slot. */
876 SYNCACHE_TIMEOUT(sc, sc->sc_rxtslot);
877 tcpstat.tcps_sndacks++;
878 tcpstat.tcps_sndtotal++;
884 sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO);
887 * The zone allocator couldn't provide more entries.
888 * Treat this as if the cache was full; drop the oldest
889 * entry and insert the new one.
891 /* NB: guarded by INP_INFO_WLOCK(&tcbinfo) */
892 for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) {
893 sc = TAILQ_FIRST(&tcp_syncache.timerq[i]);
897 sc->sc_tp->ts_recent = ticks;
898 syncache_drop(sc, NULL);
899 tcpstat.tcps_sc_zonefail++;
/* Retry the allocation; the zone keeps one spare slot for this case. */
900 sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO);
903 (void) m_free(ipopts);
909 * Fill in the syncache values.
912 sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt;
913 sc->sc_ipopts = ipopts;
914 sc->sc_inc.inc_fport = inc->inc_fport;
915 sc->sc_inc.inc_lport = inc->inc_lport;
917 sc->sc_inc.inc_isipv6 = inc->inc_isipv6;
918 if (inc->inc_isipv6) {
919 sc->sc_inc.inc6_faddr = inc->inc6_faddr;
920 sc->sc_inc.inc6_laddr = inc->inc6_laddr;
924 sc->sc_inc.inc_faddr = inc->inc_faddr;
925 sc->sc_inc.inc_laddr = inc->inc_laddr;
927 sc->sc_irs = th->th_seq;
929 sc->sc_peer_mss = to->to_flags & TOF_MSS ? to->to_mss : 0;
930 sc->sc_flowlabel = 0;
/* ISS: either a syncookie (encodes state) or purely random. */
931 if (tcp_syncookies) {
932 sc->sc_iss = syncookie_generate(sc, &flowtmp);
934 if (inc->inc_isipv6 &&
935 (sc->sc_tp->t_inpcb->in6p_flags & IN6P_AUTOFLOWLABEL)) {
936 sc->sc_flowlabel = flowtmp & IPV6_FLOWLABEL_MASK;
940 sc->sc_iss = arc4random();
942 if (inc->inc_isipv6 &&
943 (sc->sc_tp->t_inpcb->in6p_flags & IN6P_AUTOFLOWLABEL)) {
945 (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
950 /* Initial receive window: clip sbspace to [0 .. TCP_MAXWIN] */
951 win = sbspace(&so->so_rcv);
953 win = imin(win, TCP_MAXWIN);
956 if (tcp_do_rfc1323) {
958 * A timestamp received in a SYN makes
959 * it ok to send timestamp requests and replies.
961 if (to->to_flags & TOF_TS) {
962 sc->sc_tsrecent = to->to_tsval;
963 sc->sc_flags |= SCF_TIMESTAMP;
965 if (to->to_flags & TOF_SCALE) {
968 /* Compute proper scaling value from buffer space */
969 while (wscale < TCP_MAX_WINSHIFT &&
970 (TCP_MAXWIN << wscale) < so->so_rcv.sb_hiwat)
972 sc->sc_request_r_scale = wscale;
973 sc->sc_requested_s_scale = to->to_requested_s_scale;
974 sc->sc_flags |= SCF_WINSCALE;
/* NOTE(review): plain assignment (`=`, not `|=`) — TF_NOOPT wipes any flags
 * set above.  May be intentional (no options at all) or a latent bug; the
 * surrounding missing lines prevent a confident call.  Verify upstream. */
977 if (tp->t_flags & TF_NOOPT)
978 sc->sc_flags = SCF_NOOPT;
981 * If listening socket requested TCP digests, and received SYN
982 * contains the option, flag this in the syncache so that
983 * syncache_respond() will do the right thing with the SYN+ACK.
984 * XXX Currently we always record the option by default and will
985 * attempt to use it in syncache_respond().
987 if (to->to_flags & TOF_SIGNATURE)
988 sc->sc_flags |= SCF_SIGNATURE;
991 if (to->to_flags & TOF_SACK)
992 sc->sc_flags |= SCF_SACK;
995 * Do a standard 3-way handshake.
998 if (syncache_respond(sc, m, so) == 0) {
1000 if (syncache_respond(sc, m) == 0) {
1002 syncache_insert(sc, sch);
1003 tcpstat.tcps_sndacks++;
1004 tcpstat.tcps_sndtotal++;
1007 tcpstat.tcps_sc_dropped++;
/*
 * Build and transmit the SYN,ACK for entry sc.  Two K&R signatures appear
 * (with and without the socket argument); the #ifdef selecting between them
 * is presumably TCPDEBUG and sits in the missing lines.
 */
1015 syncache_respond(sc, m, so)
1016 struct syncache *sc;
1021 syncache_respond(sc, m)
1022 struct syncache *sc;
1028 u_int16_t tlen, hlen, mssopt;
1029 struct ip *ip = NULL;
1033 struct ip6_hdr *ip6 = NULL;
/* hlen: network-header length, chosen by address family. */
1038 (sc->sc_inc.inc_isipv6) ? sizeof(struct ip6_hdr) :
/* NOTE(review): `&sc->sc_inc` can never be NULL; this KASSERT is vacuous
 * as written — likely meant to check sc itself. */
1042 KASSERT((&sc->sc_inc) != NULL, ("syncache_respond with NULL in_conninfo pointer"));
1044 /* Determine MSS we advertize to other end of connection */
1045 mssopt = tcp_mssopt(&sc->sc_inc);
1047 /* Compute the size of the TCP options. */
1048 if (sc->sc_flags & SCF_NOOPT) {
1051 optlen = TCPOLEN_MAXSEG +
1052 ((sc->sc_flags & SCF_WINSCALE) ? 4 : 0) +
1053 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
1054 #ifdef TCP_SIGNATURE
1055 if (sc->sc_flags & SCF_SIGNATURE)
1056 optlen += TCPOLEN_SIGNATURE;
1058 if (sc->sc_flags & SCF_SACK)
1059 optlen += TCPOLEN_SACK_PERMITTED;
/* TCP options must be padded to a 32-bit boundary. */
1060 optlen = roundup2(optlen, 4);
1062 tlen = hlen + sizeof(struct tcphdr) + optlen;
1066 * assume that the entire packet will fit in a header mbuf
1068 KASSERT(max_linkhdr + tlen <= MHLEN, ("syncache: mbuf too small"));
1071 * XXX shouldn't this reuse the mbuf if possible ?
1072 * Create the IP+TCP header from scratch.
1077 m = m_gethdr(M_DONTWAIT, MT_HEADER);
/* Leave room for the link-layer header in front of the packet. */
1080 m->m_data += max_linkhdr;
1082 m->m_pkthdr.len = tlen;
1083 m->m_pkthdr.rcvif = NULL;
1084 inp = sc->sc_tp->t_inpcb;
1087 mac_create_mbuf_from_inpcb(inp, m);
1091 if (sc->sc_inc.inc_isipv6) {
1092 ip6 = mtod(m, struct ip6_hdr *);
1093 ip6->ip6_vfc = IPV6_VERSION;
1094 ip6->ip6_nxt = IPPROTO_TCP;
1095 ip6->ip6_src = sc->sc_inc.inc6_laddr;
1096 ip6->ip6_dst = sc->sc_inc.inc6_faddr;
1097 ip6->ip6_plen = htons(tlen - hlen);
1098 /* ip6_hlim is set after checksum */
1099 ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK;
1100 ip6->ip6_flow |= sc->sc_flowlabel;
1102 th = (struct tcphdr *)(ip6 + 1);
1106 ip = mtod(m, struct ip *);
1107 ip->ip_v = IPVERSION;
1108 ip->ip_hl = sizeof(struct ip) >> 2;
1113 ip->ip_p = IPPROTO_TCP;
1114 ip->ip_src = sc->sc_inc.inc_laddr;
1115 ip->ip_dst = sc->sc_inc.inc_faddr;
1116 ip->ip_ttl = inp->inp_ip_ttl; /* XXX */
1117 ip->ip_tos = inp->inp_ip_tos; /* XXX */
1120 * See if we should do MTU discovery. Route lookups are
1121 * expensive, so we will only unset the DF bit if:
1123 * 1) path_mtu_discovery is disabled
1124 * 2) the SCF_UNREACH flag has been set
1126 if (path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0))
1127 ip->ip_off |= IP_DF;
1129 th = (struct tcphdr *)(ip + 1);
1131 th->th_sport = sc->sc_inc.inc_lport;
1132 th->th_dport = sc->sc_inc.inc_fport;
/* SEQ = our ISS, ACK = peer's ISS + 1: the standard SYN,ACK numbers. */
1134 th->th_seq = htonl(sc->sc_iss);
1135 th->th_ack = htonl(sc->sc_irs + 1);
1136 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
1138 th->th_flags = TH_SYN|TH_ACK;
1139 th->th_win = htons(sc->sc_wnd);
1142 /* Tack on the TCP options. */
1144 optp = (u_int8_t *)(th + 1);
1145 *optp++ = TCPOPT_MAXSEG;
1146 *optp++ = TCPOLEN_MAXSEG;
1147 *optp++ = (mssopt >> 8) & 0xff;
1148 *optp++ = mssopt & 0xff;
/* Window-scale option packed into one 32-bit word (NOP + kind + len + shift). */
1150 if (sc->sc_flags & SCF_WINSCALE) {
1151 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
1152 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
1153 sc->sc_request_r_scale);
1157 if (sc->sc_flags & SCF_TIMESTAMP) {
1158 u_int32_t *lp = (u_int32_t *)(optp);
1160 /* Form timestamp option per appendix A of RFC 1323. */
1161 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
1162 *lp++ = htonl(ticks);
1163 *lp = htonl(sc->sc_tsrecent);
1164 optp += TCPOLEN_TSTAMP_APPA;
1167 #ifdef TCP_SIGNATURE
1169 * Handle TCP-MD5 passive opener response.
1171 if (sc->sc_flags & SCF_SIGNATURE) {
1172 u_int8_t *bp = optp;
1175 *bp++ = TCPOPT_SIGNATURE;
1176 *bp++ = TCPOLEN_SIGNATURE;
/* Zero the digest area first; the loop body line is missing here. */
1177 for (i = 0; i < TCP_SIGLEN; i++)
1179 tcp_signature_compute(m, sizeof(struct ip), 0, optlen,
1180 optp + 2, IPSEC_DIR_OUTBOUND);
1181 optp += TCPOLEN_SIGNATURE;
1183 #endif /* TCP_SIGNATURE */
1185 if (sc->sc_flags & SCF_SACK) {
1186 *optp++ = TCPOPT_SACK_PERMITTED;
1187 *optp++ = TCPOLEN_SACK_PERMITTED;
1191 /* Pad TCP options to a 4 byte boundary */
1192 int padlen = optlen - (optp - (u_int8_t *)(th + 1));
1193 while (padlen-- > 0)
1194 *optp++ = TCPOPT_EOL;
1199 if (sc->sc_inc.inc_isipv6) {
1201 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
1202 ip6->ip6_hlim = in6_selecthlim(NULL, NULL);
1203 error = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
/* IPv4: pseudo-header sum now, real TCP checksum offloaded via csum_flags. */
1207 th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
1208 htons(tlen - hlen + IPPROTO_TCP));
1209 m->m_pkthdr.csum_flags = CSUM_TCP;
1210 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
/* Debug tracing only when the socket has SO_DEBUG set. */
1215 if (so != NULL && so->so_options & SO_DEBUG) {
1216 struct tcpcb *tp = sototcpcb(so);
1217 tcp_trace(TA_OUTPUT, tp->t_state, tp,
1218 mtod(m, void *), th, 0);
1221 error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, inp);
/*
 * SYN-cookie machinery.  The cookie packs a secret-keyed MD5 hash with a
 * few bits of recoverable state (time-window index and peer-MSS index).
 * The diagram below is partially lost to missing lines.
 */
1230 * |. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .|
1232 * | MD5(laddr,faddr,secret,lport,fport) |. . . . . . .|
1234 * (A): peer mss index
1238 * The values below are chosen to minimize the size of the tcp_secret
1239 * table, as well as providing roughly a 16 second lifetime for the cookie.
1242 #define SYNCOOKIE_WNDBITS 5 /* exposed bits for window indexing */
1243 #define SYNCOOKIE_TIMESHIFT 1 /* scale ticks to window time units */
1245 #define SYNCOOKIE_WNDMASK ((1 << SYNCOOKIE_WNDBITS) - 1)
1246 #define SYNCOOKIE_NSECRETS (1 << SYNCOOKIE_WNDBITS)
1247 #define SYNCOOKIE_TIMEOUT \
1248 (hz * (1 << SYNCOOKIE_WNDBITS) / (1 << SYNCOOKIE_TIMESHIFT))
1249 #define SYNCOOKIE_DATAMASK ((3 << SYNCOOKIE_WNDBITS) | SYNCOOKIE_WNDMASK)
/* Per-window secret table (struct wrapper lines are missing here). */
1252 u_int32_t ts_secbits[4];
1254 } tcp_secret[SYNCOOKIE_NSECRETS];
/* MSS values recoverable from the 2-bit MSS index in the cookie. */
1256 static int tcp_msstab[] = { 0, 536, 1460, 8960 };
/* NOTE(review): a file-scope MD5 context implies the cookie paths are
 * serialized externally (see the single-threaded NB comments below). */
1258 static MD5_CTX syn_ctx;
1260 #define MD5Add(v) MD5Update(&syn_ctx, (u_char *)&v, sizeof(v))
/* Fixed-layout input block hashed into the cookie (struct md5_add; its
 * opening lines are missing). */
1263 u_int32_t laddr, faddr;
1264 u_int32_t secbits[4];
1265 u_int16_t lport, fport;
/* Compile-time guard: the hash input layout must stay exactly 28 bytes. */
1269 CTASSERT(sizeof(struct md5_add) == 28);
1273 * Consider the problem of a recreated (and retransmitted) cookie. If the
1274 * original SYN was accepted, the connection is established. The second
1275 * SYN is inflight, and if it arrives with an ISN that falls within the
1276 * receive window, the connection is killed.
1278 * However, since cookies have other problems, this may not be worth
/* Encode a SYN cookie (returned as the ISS) for entry sc; also yields a
 * flow-label candidate through *flowid.  Return statement is missing. */
1283 syncookie_generate(struct syncache *sc, u_int32_t *flowid)
1285 u_int32_t md5_buffer[4];
1290 /* NB: single threaded; could add INP_INFO_WLOCK_ASSERT(&tcbinfo) */
/* Current time window selects (and lazily refreshes) the secret. */
1292 idx = ((ticks << SYNCOOKIE_TIMESHIFT) / hz) & SYNCOOKIE_WNDMASK;
1293 if (tcp_secret[idx].ts_expire < ticks) {
1294 for (i = 0; i < 4; i++)
1295 tcp_secret[idx].ts_secbits[i] = arc4random();
1296 tcp_secret[idx].ts_expire = ticks + SYNCOOKIE_TIMEOUT;
/* Pick the largest table MSS not exceeding the peer's advertised MSS. */
1298 for (data = sizeof(tcp_msstab) / sizeof(int) - 1; data > 0; data--)
1299 if (tcp_msstab[data] <= sc->sc_peer_mss)
1301 data = (data << SYNCOOKIE_WNDBITS) | idx;
1302 data ^= sc->sc_irs; /* peer's iss */
1305 if (sc->sc_inc.inc_isipv6) {
1306 MD5Add(sc->sc_inc.inc6_laddr);
1307 MD5Add(sc->sc_inc.inc6_faddr);
/* IPv4 path: assemble the fixed md5_add block (MD5Init/MD5Add calls for
 * `add` itself are among the missing lines). */
1313 add.laddr = sc->sc_inc.inc_laddr.s_addr;
1314 add.faddr = sc->sc_inc.inc_faddr.s_addr;
1316 add.lport = sc->sc_inc.inc_lport;
1317 add.fport = sc->sc_inc.inc_fport;
1318 add.secbits[0] = tcp_secret[idx].ts_secbits[0];
1319 add.secbits[1] = tcp_secret[idx].ts_secbits[1];
1320 add.secbits[2] = tcp_secret[idx].ts_secbits[2];
1321 add.secbits[3] = tcp_secret[idx].ts_secbits[3];
1323 MD5Final((u_char *)&md5_buffer, &syn_ctx);
/* Keep the low window bits as-is; hide the rest behind the hash. */
1324 data ^= (md5_buffer[0] & ~SYNCOOKIE_WNDMASK);
1325 *flowid = md5_buffer[1];
1326 tcpstat.tcps_sc_sendcookie++;
/*
 * Validate a returning SYN cookie and, if valid, reconstruct a syncache
 * entry from it.  NOTE(review): this function is truncated at the end of
 * the visible listing; annotations cover only what is shown.
 */
1330 static struct syncache *
1331 syncookie_lookup(inc, th, so)
1332 struct in_conninfo *inc;
1336 u_int32_t md5_buffer[4];
1337 struct syncache *sc;
1342 /* NB: single threaded; could add INP_INFO_WLOCK_ASSERT(&tcbinfo) */
/* Recover the encoded data word; ACK-1 is our ISS, SEQ-1 the peer's. */
1344 data = (th->th_ack - 1) ^ (th->th_seq - 1); /* remove ISS */
1345 idx = data & SYNCOOKIE_WNDMASK;
/* Reject if the window's secret has expired or the SYN is too old. */
1346 if (tcp_secret[idx].ts_expire < ticks ||
1347 sototcpcb(so)->ts_recent + SYNCOOKIE_TIMEOUT < ticks)
1351 if (inc->inc_isipv6) {
1352 MD5Add(inc->inc6_laddr);
1353 MD5Add(inc->inc6_faddr);
1359 add.laddr = inc->inc_laddr.s_addr;
1360 add.faddr = inc->inc_faddr.s_addr;
1362 add.lport = inc->inc_lport;
1363 add.fport = inc->inc_fport;
1364 add.secbits[0] = tcp_secret[idx].ts_secbits[0];
1365 add.secbits[1] = tcp_secret[idx].ts_secbits[1];
1366 add.secbits[2] = tcp_secret[idx].ts_secbits[2];
1367 add.secbits[3] = tcp_secret[idx].ts_secbits[3];
1369 MD5Final((u_char *)&md5_buffer, &syn_ctx);
/* All bits outside the data mask must match the recomputed hash. */
1370 data ^= md5_buffer[0];
1371 if ((data & ~SYNCOOKIE_DATAMASK) != 0)
1373 data = data >> SYNCOOKIE_WNDBITS;
1375 sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO);
1379 * Fill in the syncache values.
1380 * XXX duplicate code from syncache_add
1382 sc->sc_ipopts = NULL;
1383 sc->sc_inc.inc_fport = inc->inc_fport;
1384 sc->sc_inc.inc_lport = inc->inc_lport;
1385 sc->sc_tp = sototcpcb(so);
1387 sc->sc_inc.inc_isipv6 = inc->inc_isipv6;
1388 if (inc->inc_isipv6) {
1389 sc->sc_inc.inc6_faddr = inc->inc6_faddr;
1390 sc->sc_inc.inc6_laddr = inc->inc6_laddr;
1391 if (sc->sc_tp->t_inpcb->in6p_flags & IN6P_AUTOFLOWLABEL)
1392 sc->sc_flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK;
1396 sc->sc_inc.inc_faddr = inc->inc_faddr;
1397 sc->sc_inc.inc_laddr = inc->inc_laddr;
/* Reconstruct the sequence state the cookie encoded. */
1399 sc->sc_irs = th->th_seq - 1;
1400 sc->sc_iss = th->th_ack - 1;
1401 wnd = sbspace(&so->so_rcv);
1403 wnd = imin(wnd, TCP_MAXWIN);
/* data now holds the MSS-table index recovered from the cookie. */
1407 sc->sc_peer_mss = tcp_msstab[data];