CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/tcp_reass.c
be more cautious about tcp option length field. drop bogus ones earlier.
[FreeBSD/FreeBSD.git] / sys / netinet / tcp_reass.c
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *      This product includes software developed by the University of
16  *      California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *      @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
34  * $FreeBSD$
35  */
36
37 #include "opt_ipfw.h"           /* for ipfw_fwd         */
38 #include "opt_inet6.h"
39 #include "opt_ipsec.h"
40 #include "opt_tcpdebug.h"
41 #include "opt_tcp_input.h"
42
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/sysctl.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/proc.h>           /* for proc0 declaration */
50 #include <sys/protosw.h>
51 #include <sys/socket.h>
52 #include <sys/socketvar.h>
53 #include <sys/syslog.h>
54
55 #include <machine/cpu.h>        /* before tcp_seq.h, for tcp_random18() */
56
57 #include <net/if.h>
58 #include <net/route.h>
59
60 #include <netinet/in.h>
61 #include <netinet/in_systm.h>
62 #include <netinet/ip.h>
63 #include <netinet/ip_icmp.h>    /* for ICMP_BANDLIM             */
64 #include <netinet/in_var.h>
65 #include <netinet/icmp_var.h>   /* for ICMP_BANDLIM             */
66 #include <netinet/in_pcb.h>
67 #include <netinet/ip_var.h>
68 #ifdef INET6
69 #include <netinet/ip6.h>
70 #include <netinet/icmp6.h>
71 #include <netinet6/nd6.h>
72 #include <netinet6/ip6_var.h>
73 #include <netinet6/in6_pcb.h>
74 #endif
75 #include <netinet/tcp.h>
76 #include <netinet/tcp_fsm.h>
77 #include <netinet/tcp_seq.h>
78 #include <netinet/tcp_timer.h>
79 #include <netinet/tcp_var.h>
80 #ifdef INET6
81 #include <netinet6/tcp6_var.h>
82 #endif
83 #include <netinet/tcpip.h>
84 #ifdef TCPDEBUG
85 #include <netinet/tcp_debug.h>
86
87 u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */
88 struct tcphdr tcp_savetcp;
89 #endif /* TCPDEBUG */
90
91 #ifdef IPSEC
92 #include <netinet6/ipsec.h>
93 #ifdef INET6
94 #include <netinet6/ipsec6.h>
95 #endif
96 #include <netkey/key.h>
97 #endif /*IPSEC*/
98
99 #include <machine/in_cksum.h>
100
101 MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry");
102
103 static int      tcprexmtthresh = 3;
104 tcp_seq tcp_iss;
105 tcp_cc  tcp_ccgen;
106
107 struct  tcpstat tcpstat;
108 SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RD, 
109     &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
110
111 static int log_in_vain = 0;
112 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, 
113     &log_in_vain, 0, "Log all incoming TCP connections");
114
115 static int blackhole = 0;
116 SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
117         &blackhole, 0, "Do not send RST when dropping refused connections");
118
119 int tcp_delack_enabled = 1;
120 SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, 
121     &tcp_delack_enabled, 0, 
122     "Delay ACK to try and piggyback it onto a data packet");
123
124 #ifdef TCP_DROP_SYNFIN
125 static int drop_synfin = 0;
126 SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
127     &drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
128 #endif
129
130 #ifdef TCP_RESTRICT_RST
131 static int restrict_rst = 0;
132 SYSCTL_INT(_net_inet_tcp, OID_AUTO, restrict_rst, CTLFLAG_RW,
133     &restrict_rst, 0, "Restrict RST emission");
134 #endif
135
136 struct inpcbhead tcb;
137 #define tcb6    tcb  /* for KAME src sync over BSD*'s */
138 struct inpcbinfo tcbinfo;
139
140 static void      tcp_dooptions __P((struct tcpcb *,
141             u_char *, int, struct tcphdr *, struct tcpopt *));
142 static void      tcp_pulloutofband __P((struct socket *,
143             struct tcphdr *, struct mbuf *, int));
144 static int       tcp_reass __P((struct tcpcb *, struct tcphdr *, int *,
145                                 struct mbuf *));
146 static void      tcp_xmit_timer __P((struct tcpcb *, int));
147 static int       tcp_newreno __P((struct tcpcb *, struct tcphdr *));
148
/*
 * Neighbor Discovery / Neighbor Unreachability Detection upper-layer hint.
 *
 * With INET6, if tp is non-NULL, has an inpcb flagged INP_IPV6 and that
 * inpcb holds a cached IPv6 route, pass the route to nd6_nud_hint().
 * Without INET6 the macro expands to nothing.
 * Note that tp is evaluated several times.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if ((tp) != NULL && (tp)->t_inpcb != NULL && \
	    ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \
	    (tp)->t_inpcb->in6p_route.ro_rt != NULL) \
		nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
} while (0)
#else
#define ND6_HINT(tp)
#endif
161
/*
 * Insert segment which includes th into reassembly queue of tcp with
 * control block tp.  The flags argument is set to TH_FIN if reassembly
 * now includes a segment with FIN, and to 0 otherwise.  The macro form
 * does the common case inline (segment is the next to be received on an
 * established connection, and the queue is empty), avoiding linkage into
 * and removal from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 *
 * Arguments:
 *	tp	TCP control block of the connection
 *	th	TCP header of the segment
 *	tlenp	pointer to the segment's data length
 *	m	mbuf chain holding the segment data
 *	so	socket the data is delivered to
 *	flags	lvalue that receives the result
 *
 * Every argument is parenthesized at each use so that callers may pass
 * arbitrary expressions.  Arguments are evaluated more than once, so they
 * must not have side effects.
 */
#define TCP_REASS(tp, th, tlenp, m, so, flags) { \
	if ((th)->th_seq == (tp)->rcv_nxt && \
	    LIST_EMPTY(&(tp)->t_segq) && \
	    (tp)->t_state == TCPS_ESTABLISHED) { \
		if (tcp_delack_enabled) \
			callout_reset((tp)->tt_delack, tcp_delacktime, \
			    tcp_timer_delack, (tp)); \
		else \
			(tp)->t_flags |= TF_ACKNOW; \
		(tp)->rcv_nxt += *(tlenp); \
		(flags) = (th)->th_flags & TH_FIN; \
		tcpstat.tcps_rcvpack++; \
		tcpstat.tcps_rcvbyte += *(tlenp); \
		ND6_HINT(tp); \
		sbappend(&(so)->so_rcv, (m)); \
		sorwakeup(so); \
	} else { \
		(flags) = tcp_reass((tp), (th), (tlenp), (m)); \
		(tp)->t_flags |= TF_ACKNOW; \
	} \
}
193
194 static int
195 tcp_reass(tp, th, tlenp, m)
196         register struct tcpcb *tp;
197         register struct tcphdr *th;
198         int *tlenp;
199         struct mbuf *m;
200 {
201         struct tseg_qent *q;
202         struct tseg_qent *p = NULL;
203         struct tseg_qent *nq;
204         struct tseg_qent *te;
205         struct socket *so = tp->t_inpcb->inp_socket;
206         int flags;
207
208         /*
209          * Call with th==0 after become established to
210          * force pre-ESTABLISHED data up to user socket.
211          */
212         if (th == 0)
213                 goto present;
214
215         /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
216         MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ,
217                M_NOWAIT);
218         if (te == NULL) {
219                 tcpstat.tcps_rcvmemdrop++;
220                 m_freem(m);
221                 return (0);
222         }
223
224         /*
225          * Find a segment which begins after this one does.
226          */
227         LIST_FOREACH(q, &tp->t_segq, tqe_q) {
228                 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
229                         break;
230                 p = q;
231         }
232
233         /*
234          * If there is a preceding segment, it may provide some of
235          * our data already.  If so, drop the data from the incoming
236          * segment.  If it provides all of our data, drop us.
237          */
238         if (p != NULL) {
239                 register int i;
240                 /* conversion to int (in i) handles seq wraparound */
241                 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
242                 if (i > 0) {
243                         if (i >= *tlenp) {
244                                 tcpstat.tcps_rcvduppack++;
245                                 tcpstat.tcps_rcvdupbyte += *tlenp;
246                                 m_freem(m);
247                                 FREE(te, M_TSEGQ);
248                                 /*
249                                  * Try to present any queued data
250                                  * at the left window edge to the user.
251                                  * This is needed after the 3-WHS
252                                  * completes.
253                                  */
254                                 goto present;   /* ??? */
255                         }
256                         m_adj(m, i);
257                         *tlenp -= i;
258                         th->th_seq += i;
259                 }
260         }
261         tcpstat.tcps_rcvoopack++;
262         tcpstat.tcps_rcvoobyte += *tlenp;
263
264         /*
265          * While we overlap succeeding segments trim them or,
266          * if they are completely covered, dequeue them.
267          */
268         while (q) {
269                 register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
270                 if (i <= 0)
271                         break;
272                 if (i < q->tqe_len) {
273                         q->tqe_th->th_seq += i;
274                         q->tqe_len -= i;
275                         m_adj(q->tqe_m, i);
276                         break;
277                 }
278
279                 nq = LIST_NEXT(q, tqe_q);
280                 LIST_REMOVE(q, tqe_q);
281                 m_freem(q->tqe_m);
282                 FREE(q, M_TSEGQ);
283                 q = nq;
284         }
285
286         /* Insert the new segment queue entry into place. */
287         te->tqe_m = m;
288         te->tqe_th = th;
289         te->tqe_len = *tlenp;
290
291         if (p == NULL) {
292                 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
293         } else {
294                 LIST_INSERT_AFTER(p, te, tqe_q);
295         }
296
297 present:
298         /*
299          * Present data to user, advancing rcv_nxt through
300          * completed sequence space.
301          */
302         if (!TCPS_HAVEESTABLISHED(tp->t_state))
303                 return (0);
304         q = LIST_FIRST(&tp->t_segq);
305         if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
306                 return (0);
307         do {
308                 tp->rcv_nxt += q->tqe_len;
309                 flags = q->tqe_th->th_flags & TH_FIN;
310                 nq = LIST_NEXT(q, tqe_q);
311                 LIST_REMOVE(q, tqe_q);
312                 if (so->so_state & SS_CANTRCVMORE)
313                         m_freem(q->tqe_m);
314                 else
315                         sbappend(&so->so_rcv, q->tqe_m);
316                 FREE(q, M_TSEGQ);
317                 q = nq;
318         } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
319         ND6_HINT(tp);
320         sorwakeup(so);
321         return (flags);
322 }
323
324 /*
325  * TCP input routine, follows pages 65-76 of the
326  * protocol specification dated September, 1981 very closely.
327  */
#ifdef INET6
/*
 * IPv6 entry point for TCP input.
 *
 *	mp	pointer to the mbuf chain holding the packet
 *	offp	pointer to the offset of the TCP header within the packet
 *	proto	protocol number, passed through to tcp_input()
 *
 * Packets flagged M_ANYCAST6 are answered with an ICMPv6 "address
 * unreachable" error pointing at the destination address field; all others
 * are handed to tcp_input().  Always returns IPPROTO_DONE.
 */
int
tcp6_input(mp, offp, proto)
	struct mbuf **mp;
	int *offp, proto;
{
	struct mbuf *m = *mp;
	struct ip6_hdr *ip6;

	/*
	 * NOTE(review): presumably returns IPPROTO_DONE from this function
	 * when the TCP header is not available at *offp -- confirm against
	 * the IP6_EXTHDR_CHECK definition.
	 */
	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);

	if ((m->m_flags & M_ANYCAST6) == 0) {
		tcp_input(m, *offp, proto);
		return IPPROTO_DONE;
	}

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
	 */
	ip6 = mtod(m, struct ip6_hdr *);
	icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
	    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
	return IPPROTO_DONE;
}
#endif
355
356 void
357 tcp_input(m, off0, proto)
358         register struct mbuf *m;
359         int off0, proto;
360 {
361         register struct tcphdr *th;
362         register struct ip *ip = NULL;
363         register struct ipovly *ipov;
364         register struct inpcb *inp;
365         u_char *optp = NULL;
366         int optlen = 0;
367         int len, tlen, off;
368         int drop_hdrlen;
369         register struct tcpcb *tp = 0;
370         register int thflags;
371         struct socket *so = 0;
372         int todrop, acked, ourfinisacked, needoutput = 0;
373         struct in_addr laddr;
374 #ifdef INET6
375         struct in6_addr laddr6;
376 #endif
377         int dropsocket = 0;
378         int iss = 0;
379         u_long tiwin;
380         struct tcpopt to;               /* options in this segment */
381         struct rmxp_tao *taop;          /* pointer to our TAO cache entry */
382         struct rmxp_tao tao_noncached;  /* in case there's no cached entry */
383 #ifdef TCPDEBUG
384         short ostate = 0;
385 #endif
386 #ifdef INET6
387         struct ip6_hdr *ip6 = NULL;
388         int isipv6;
389 #endif /* INET6 */
390
391 #ifdef INET6
392         isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
393 #endif
394         bzero((char *)&to, sizeof(to));
395
396         tcpstat.tcps_rcvtotal++;
397
398 #ifdef INET6
399         if (isipv6) {
400                 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
401                 ip6 = mtod(m, struct ip6_hdr *);
402                 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
403                 if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
404                         tcpstat.tcps_rcvbadsum++;
405                         goto drop;
406                 }
407                 th = (struct tcphdr *)((caddr_t)ip6 + off0);
408         } else
409 #endif /* INET6 */
410       {
411         /*
412          * Get IP and TCP header together in first mbuf.
413          * Note: IP leaves IP header in first mbuf.
414          */
415         if (off0 > sizeof (struct ip)) {
416                 ip_stripoptions(m, (struct mbuf *)0);
417                 off0 = sizeof(struct ip);
418         }
419         if (m->m_len < sizeof (struct tcpiphdr)) {
420                 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
421                         tcpstat.tcps_rcvshort++;
422                         return;
423                 }
424         }
425         ip = mtod(m, struct ip *);
426         ipov = (struct ipovly *)ip;
427         th = (struct tcphdr *)((caddr_t)ip + off0);
428         tlen = ip->ip_len;
429
430         if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
431                 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
432                         th->th_sum = m->m_pkthdr.csum_data;
433                 else
434                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
435                             ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data +
436                             ip->ip_len + IPPROTO_TCP));
437                 th->th_sum ^= 0xffff;
438         } else {
439                 /*
440                  * Checksum extended TCP header and data.
441                  */
442                 len = sizeof (struct ip) + tlen;
443                 bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
444                 ipov->ih_len = (u_short)tlen;
445                 HTONS(ipov->ih_len);
446                 th->th_sum = in_cksum(m, len);
447         }
448         if (th->th_sum) {
449                 tcpstat.tcps_rcvbadsum++;
450                 goto drop;
451         }
452 #ifdef INET6
453         /* Re-initialization for later version check */
454         ip->ip_v = IPVERSION;
455 #endif
456       }
457
458         /*
459          * Check that TCP offset makes sense,
460          * pull out TCP options and adjust length.              XXX
461          */
462         off = th->th_off << 2;
463         if (off < sizeof (struct tcphdr) || off > tlen) {
464                 tcpstat.tcps_rcvbadoff++;
465                 goto drop;
466         }
467         tlen -= off;    /* tlen is used instead of ti->ti_len */
468         if (off > sizeof (struct tcphdr)) {
469 #ifdef INET6
470                 if (isipv6) {
471                         IP6_EXTHDR_CHECK(m, off0, off, );
472                         ip6 = mtod(m, struct ip6_hdr *);
473                         th = (struct tcphdr *)((caddr_t)ip6 + off0);
474                 } else
475 #endif /* INET6 */
476               {
477                 if (m->m_len < sizeof(struct ip) + off) {
478                         if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
479                                 tcpstat.tcps_rcvshort++;
480                                 return;
481                         }
482                         ip = mtod(m, struct ip *);
483                         ipov = (struct ipovly *)ip;
484                         th = (struct tcphdr *)((caddr_t)ip + off0);
485                 }
486               }
487                 optlen = off - sizeof (struct tcphdr);
488                 optp = (u_char *)(th + 1);
489         }
490         thflags = th->th_flags;
491
492 #ifdef TCP_DROP_SYNFIN
493         /*
494          * If the drop_synfin option is enabled, drop all packets with
495          * both the SYN and FIN bits set. This prevents e.g. nmap from
496          * identifying the TCP/IP stack.
497          *
498          * This is incompatible with RFC1644 extensions (T/TCP).
499          */
500         if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN))
501                 goto drop;
502 #endif
503
504         /*
505          * Convert TCP protocol specific fields to host format.
506          */
507         NTOHL(th->th_seq);
508         NTOHL(th->th_ack);
509         NTOHS(th->th_win);
510         NTOHS(th->th_urp);
511
512         /*
513          * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
514          * until after ip6_savecontrol() is called and before other functions
515          * which don't want those proto headers.
516          * Because ip6_savecontrol() is going to parse the mbuf to
517          * search for data to be passed up to user-land, it wants mbuf
518          * parameters to be unchanged.
519          */
520         drop_hdrlen = off0 + off;
521
522         /*
523          * Locate pcb for segment.
524          */
525 findpcb:
526 #ifdef IPFIREWALL_FORWARD
527         if (ip_fw_fwd_addr != NULL
528 #ifdef INET6
529             && isipv6 == NULL /* IPv6 support is not yet */
530 #endif /* INET6 */
531             ) {
532                 /*
533                  * Diverted. Pretend to be the destination.
534                  * already got one like this? 
535                  */
536                 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
537                         ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif);
538                 if (!inp) {
539                         /* 
540                          * No, then it's new. Try find the ambushing socket
541                          */
542                         if (!ip_fw_fwd_addr->sin_port) {
543                                 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src,
544                                     th->th_sport, ip_fw_fwd_addr->sin_addr,
545                                     th->th_dport, 1, m->m_pkthdr.rcvif);
546                         } else {
547                                 inp = in_pcblookup_hash(&tcbinfo,
548                                     ip->ip_src, th->th_sport,
549                                     ip_fw_fwd_addr->sin_addr,
550                                     ntohs(ip_fw_fwd_addr->sin_port), 1,
551                                     m->m_pkthdr.rcvif);
552                         }
553                 }
554                 ip_fw_fwd_addr = NULL;
555         } else
556 #endif  /* IPFIREWALL_FORWARD */
557       {
558 #ifdef INET6
559         if (isipv6)
560                 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
561                                          &ip6->ip6_dst, th->th_dport, 1,
562                                          m->m_pkthdr.rcvif);
563         else
564 #endif /* INET6 */
565         inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
566             ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
567       }
568
569 #ifdef IPSEC
570 #ifdef INET6
571         if (isipv6) {
572                 if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) {
573                         ipsec6stat.in_polvio++;
574                         goto drop;
575                 }
576         } else
577 #endif /* INET6 */
578         if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) {
579                 ipsecstat.in_polvio++;
580                 goto drop;
581         }
582 #endif /*IPSEC*/
583
584         /*
585          * If the state is CLOSED (i.e., TCB does not exist) then
586          * all data in the incoming segment is discarded.
587          * If the TCB exists but is in CLOSED state, it is embryonic,
588          * but should either do a listen or a connect soon.
589          */
590         if (inp == NULL) {
591                 if (log_in_vain) {
592 #ifdef INET6
593                         char dbuf[INET6_ADDRSTRLEN], sbuf[INET6_ADDRSTRLEN];
594 #else /* INET6 */
595                         char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"];
596 #endif /* INET6 */
597
598 #ifdef INET6
599                         if (isipv6) {
600                                 strcpy(dbuf, ip6_sprintf(&ip6->ip6_dst));
601                                 strcpy(sbuf, ip6_sprintf(&ip6->ip6_src));
602                         } else
603 #endif
604                       {
605                         strcpy(dbuf, inet_ntoa(ip->ip_dst));
606                         strcpy(sbuf, inet_ntoa(ip->ip_src));
607                       }
608                         switch (log_in_vain) {
609                         case 1:
610                                 if(thflags & TH_SYN)
611                                         log(LOG_INFO,
612                                         "Connection attempt to TCP %s:%d from %s:%d\n",
613                                         dbuf, ntohs(th->th_dport),
614                                         sbuf,
615                                         ntohs(th->th_sport));
616                                 break;
617                         case 2:
618                                 log(LOG_INFO,
619                                 "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
620                                 dbuf, ntohs(th->th_dport), sbuf,
621                                 ntohs(th->th_sport), thflags);
622                                 break;
623                         default:
624                                 break;
625                         }
626                 }
627                 if (blackhole) { 
628                         switch (blackhole) {
629                         case 1:
630                                 if (thflags & TH_SYN)
631                                         goto drop;
632                                 break;
633                         case 2:
634                                 goto drop;
635                         default:
636                                 goto drop;
637                         }
638                 }
639                 goto maybedropwithreset;
640         }
641         tp = intotcpcb(inp);
642         if (tp == 0)
643                 goto maybedropwithreset;
644         if (tp->t_state == TCPS_CLOSED)
645                 goto drop;
646
647         /* Unscale the window into a 32-bit value. */
648         if ((thflags & TH_SYN) == 0)
649                 tiwin = th->th_win << tp->snd_scale;
650         else
651                 tiwin = th->th_win;
652
653 #ifdef INET6
654         /* save packet options if user wanted */
655         if (isipv6 && inp->in6p_flags & INP_CONTROLOPTS) {
656                 if (inp->in6p_options) {
657                         m_freem(inp->in6p_options);
658                         inp->in6p_options = 0;
659                 }
660                 ip6_savecontrol(inp, &inp->in6p_options, ip6, m);
661         }
662         /* else, should also do ip_srcroute() here? */
663 #endif /* INET6 */
664
665         so = inp->inp_socket;
666         if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
667 #ifdef TCPDEBUG
668                 if (so->so_options & SO_DEBUG) {
669                         ostate = tp->t_state;
670 #ifdef INET6
671                         if (isipv6)
672                                 bcopy((char *)ip6, (char *)tcp_saveipgen,
673                                       sizeof(*ip6));
674                         else
675 #endif /* INET6 */
676                         bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
677                         tcp_savetcp = *th;
678                 }
679 #endif
680                 if (so->so_options & SO_ACCEPTCONN) {
681                         register struct tcpcb *tp0 = tp;
682                         struct socket *so2;
683 #ifdef IPSEC
684                         struct socket *oso;
685 #endif
686 #ifdef INET6
687                         struct inpcb *oinp = sotoinpcb(so);
688 #endif /* INET6 */
689
690 #ifndef IPSEC
691                         /*
692                          * Current IPsec implementation makes incorrect IPsec
693                          * cache if this check is done here.
694                          * So delay this until duplicated socket is created.
695                          */
696                         if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
697                                 /*
698                                  * Note: dropwithreset makes sure we don't
699                                  * send a RST in response to a RST.
700                                  */
701                                 if (thflags & TH_ACK) {
702                                         tcpstat.tcps_badsyn++;
703                                         goto maybedropwithreset;
704                                 }
705                                 goto drop;
706                         }
707 #endif
708                         so2 = sonewconn(so, 0);
709                         if (so2 == 0) {
710                                 tcpstat.tcps_listendrop++;
711                                 so2 = sodropablereq(so);
712                                 if (so2) {
713                                         tcp_drop(sototcpcb(so2), ETIMEDOUT);
714                                         so2 = sonewconn(so, 0);
715                                 }
716                                 if (!so2)
717                                         goto drop;
718                         }
719 #ifdef IPSEC
720                         oso = so;
721 #endif
722                         so = so2;
723                         /*
724                          * This is ugly, but ....
725                          *
726                          * Mark socket as temporary until we're
727                          * committed to keeping it.  The code at
728                          * ``drop'' and ``dropwithreset'' check the
729                          * flag dropsocket to see if the temporary
730                          * socket created here should be discarded.
731                          * We mark the socket as discardable until
732                          * we're committed to it below in TCPS_LISTEN.
733                          */
734                         dropsocket++;
735                         inp = (struct inpcb *)so->so_pcb;
736 #ifdef INET6
737                         if (isipv6)
738                                 inp->in6p_laddr = ip6->ip6_dst;
739                         else {
740                                 if ((inp->inp_flags & IN6P_BINDV6ONLY) == 0) {
741                                         inp->inp_vflag &= ~INP_IPV6;
742                                         inp->inp_vflag |= INP_IPV4;
743                                 }
744 #endif /* INET6 */
745                         inp->inp_laddr = ip->ip_dst;
746 #ifdef INET6
747                         }
748 #endif /* INET6 */
749                         inp->inp_lport = th->th_dport;
750                         if (in_pcbinshash(inp) != 0) {
751                                 /*
752                                  * Undo the assignments above if we failed to
753                                  * put the PCB on the hash lists.
754                                  */
755 #ifdef INET6
756                                 if (isipv6)
757                                         inp->in6p_laddr = in6addr_any;
758                                 else
759 #endif /* INET6 */
760                                 inp->inp_laddr.s_addr = INADDR_ANY;
761                                 inp->inp_lport = 0;
762                                 goto drop;
763                         }
764 #ifdef IPSEC
765                         /*
766                          * To avoid creating incorrectly cached IPsec
767                          * association, this is need to be done here.
768                          *
769                          * Subject: (KAME-snap 748)
770                          * From: Wayne Knowles <w.knowles@niwa.cri.nz>
771                          * ftp://ftp.kame.net/pub/mail-list/snap-users/748
772                          */
773                         if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
774                                 /*
775                                  * Note: dropwithreset makes sure we don't
776                                  * send a RST in response to a RST.
777                                  */
778                                 if (thflags & TH_ACK) {
779                                         tcpstat.tcps_badsyn++;
780                                         goto maybedropwithreset;
781                                 }
782                                 goto drop;
783                         }
784 #endif
785 #ifdef INET6
786                         if (isipv6) {
787                                 /*
788                                  * inherit socket options from the listening
789                                  * socket.
790                                  */
791                                 inp->inp_flags |=
792                                         oinp->inp_flags & INP_CONTROLOPTS;
793                                 if (inp->inp_flags & INP_CONTROLOPTS) {
794                                         if (inp->in6p_options) {
795                                                 m_freem(inp->in6p_options);
796                                                 inp->in6p_options = 0;
797                                         }
798                                         ip6_savecontrol(inp,
799                                                         &inp->in6p_options,
800                                                         ip6, m);
801                                 }
802                         } else
803 #endif /* INET6 */
804                         inp->inp_options = ip_srcroute();
805 #ifdef IPSEC
806                         /* copy old policy into new socket's */
807                         if (ipsec_copy_policy(sotoinpcb(oso)->inp_sp,
808                                               inp->inp_sp))
809                                 printf("tcp_input: could not copy policy\n");
810 #endif
811                         tp = intotcpcb(inp);
812                         tp->t_state = TCPS_LISTEN;
813                         tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT);
814
815                         /* Compute proper scaling value from buffer space */
816                         while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
817                            TCP_MAXWIN << tp->request_r_scale <
818                            so->so_rcv.sb_hiwat)
819                                 tp->request_r_scale++;
820                 }
821         }
822
823         /*
824          * Segment received on connection.
825          * Reset idle time and keep-alive timer.
826          */
827         tp->t_rcvtime = ticks;
828         if (TCPS_HAVEESTABLISHED(tp->t_state))
829                 callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
830
831         /*
832          * Process options if not in LISTEN state,
833          * else do it below (after getting remote address).
834          */
835         if (tp->t_state != TCPS_LISTEN)
836                 tcp_dooptions(tp, optp, optlen, th, &to);
837
838         /*
839          * Header prediction: check for the two common cases
840          * of a uni-directional data xfer.  If the packet has
841          * no control flags, is in-sequence, the window didn't
842          * change and we're not retransmitting, it's a
843          * candidate.  If the length is zero and the ack moved
844          * forward, we're the sender side of the xfer.  Just
845          * free the data acked & wake any higher level process
846          * that was blocked waiting for space.  If the length
847          * is non-zero and the ack didn't move, we're the
848          * receiver side.  If we're getting packets in-order
849          * (the reassembly queue is empty), add the data to
850          * the socket buffer and note that we need a delayed ack.
851          * Make sure that the hidden state-flags are also off.
852          * Since we check for TCPS_ESTABLISHED above, it can only
853          * be TH_NEEDSYN.
854          */
855         if (tp->t_state == TCPS_ESTABLISHED &&
856             (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
857             ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
858             ((to.to_flag & TOF_TS) == 0 ||
859              TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
860             /*
861              * Using the CC option is compulsory if once started:
862              *   the segment is OK if no T/TCP was negotiated or
863              *   if the segment has a CC option equal to CCrecv
864              */
865             ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) ||
866              ((to.to_flag & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) &&
867             th->th_seq == tp->rcv_nxt &&
868             tiwin && tiwin == tp->snd_wnd &&
869             tp->snd_nxt == tp->snd_max) {
870
871                 /*
872                  * If last ACK falls within this segment's sequence numbers,
873                  * record the timestamp.
874                  * NOTE that the test is modified according to the latest
875                  * proposal of the tcplw@cray.com list (Braden 1993/04/26).
876                  */
877                 if ((to.to_flag & TOF_TS) != 0 &&
878                    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
879                         tp->ts_recent_age = ticks;
880                         tp->ts_recent = to.to_tsval;
881                 }
882
883                 if (tlen == 0) {
884                         if (SEQ_GT(th->th_ack, tp->snd_una) &&
885                             SEQ_LEQ(th->th_ack, tp->snd_max) &&
886                             tp->snd_cwnd >= tp->snd_wnd &&
887                             tp->t_dupacks < tcprexmtthresh) {
888                                 /*
889                                  * this is a pure ack for outstanding data.
890                                  */
891                                 ++tcpstat.tcps_predack;
892                                 /*
893                                  * "bad retransmit" recovery
894                                  */
895                                 if (tp->t_rxtshift == 1 &&
896                                     ticks < tp->t_badrxtwin) {
897                                         tp->snd_cwnd = tp->snd_cwnd_prev;
898                                         tp->snd_ssthresh =
899                                             tp->snd_ssthresh_prev;
900                                         tp->snd_nxt = tp->snd_max;
901                                         tp->t_badrxtwin = 0;
902                                 }
903                                 if ((to.to_flag & TOF_TS) != 0)
904                                         tcp_xmit_timer(tp,
905                                             ticks - to.to_tsecr + 1);
906                                 else if (tp->t_rtttime &&
907                                             SEQ_GT(th->th_ack, tp->t_rtseq))
908                                         tcp_xmit_timer(tp, ticks - tp->t_rtttime);
909                                 acked = th->th_ack - tp->snd_una;
910                                 tcpstat.tcps_rcvackpack++;
911                                 tcpstat.tcps_rcvackbyte += acked;
912                                 sbdrop(&so->so_snd, acked);
913                                 tp->snd_una = th->th_ack;
914                                 m_freem(m);
915                                 ND6_HINT(tp); /* some progress has been done */
916
917                                 /*
918                                  * If all outstanding data are acked, stop
919                                  * retransmit timer, otherwise restart timer
920                                  * using current (possibly backed-off) value.
921                                  * If process is waiting for space,
922                                  * wakeup/selwakeup/signal.  If data
923                                  * are ready to send, let tcp_output
924                                  * decide between more output or persist.
925                                  */
926                                 if (tp->snd_una == tp->snd_max)
927                                         callout_stop(tp->tt_rexmt);
928                                 else if (!callout_active(tp->tt_persist))
929                                         callout_reset(tp->tt_rexmt, 
930                                                       tp->t_rxtcur,
931                                                       tcp_timer_rexmt, tp);
932
933                                 sowwakeup(so);
934                                 if (so->so_snd.sb_cc)
935                                         (void) tcp_output(tp);
936                                 return;
937                         }
938                 } else if (th->th_ack == tp->snd_una &&
939                     LIST_EMPTY(&tp->t_segq) &&
940                     tlen <= sbspace(&so->so_rcv)) {
941                         /*
942                          * this is a pure, in-sequence data packet
943                          * with nothing on the reassembly queue and
944                          * we have enough buffer space to take it.
945                          */
946                         ++tcpstat.tcps_preddat;
947                         tp->rcv_nxt += tlen;
948                         tcpstat.tcps_rcvpack++;
949                         tcpstat.tcps_rcvbyte += tlen;
950                         ND6_HINT(tp);   /* some progress has been done */
951                         /*
952                          * Add data to socket buffer.
953                          */
954                         m_adj(m, drop_hdrlen);  /* delayed header drop */
955                         sbappend(&so->so_rcv, m);
956                         sorwakeup(so);
957                         if (tcp_delack_enabled) {
958                                 callout_reset(tp->tt_delack, tcp_delacktime,
959                                     tcp_timer_delack, tp);
960                         } else {
961                                 tp->t_flags |= TF_ACKNOW;
962                                 tcp_output(tp);
963                         }
964                         return;
965                 }
966         }
967
968         /*
969          * Calculate amount of space in receive window,
970          * and then do TCP input processing.
971          * Receive window is amount of space in rcv queue,
972          * but not less than advertised window.
973          */
974         { int win;
975
976         win = sbspace(&so->so_rcv);
977         if (win < 0)
978                 win = 0;
979         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
980         }
981
982         switch (tp->t_state) {
983
984         /*
985          * If the state is LISTEN then ignore segment if it contains an RST.
986          * If the segment contains an ACK then it is bad and send a RST.
987          * If it does not contain a SYN then it is not interesting; drop it.
988          * If it is from this socket, drop it, it must be forged.
989          * Don't bother responding if the destination was a broadcast.
990          * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
991          * tp->iss, and send a segment:
992          *     <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
993          * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
994          * Fill in remote peer address fields if not previously specified.
995          * Enter SYN_RECEIVED state, and process any other fields of this
996          * segment in this state.
997          */
998         case TCPS_LISTEN: {
999                 register struct sockaddr_in *sin;
1000 #ifdef INET6
1001                 register struct sockaddr_in6 *sin6;
1002 #endif
1003
1004                 if (thflags & TH_RST)
1005                         goto drop;
1006                 if (thflags & TH_ACK)
1007                         goto maybedropwithreset;
1008                 if ((thflags & TH_SYN) == 0)
1009                         goto drop;
1010                 if (th->th_dport == th->th_sport) {
1011 #ifdef INET6
1012                         if (isipv6) {
1013                                 if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
1014                                                        &ip6->ip6_src))
1015                                         goto drop;
1016                         } else
1017 #endif /* INET6 */
1018                         if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
1019                                 goto drop;
1020                 }
1021                 /*
1022                  * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
1023                  * in_broadcast() should never return true on a received
1024                  * packet with M_BCAST not set.
1025                  *
1026                  * Packets with a multicast source address should also
1027                  * be discarded.
1028                  */
1029                 if (m->m_flags & (M_BCAST|M_MCAST))
1030                         goto drop;
1031 #ifdef INET6
1032                 if (isipv6) {
1033                         if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
1034                             IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
1035                                 goto drop;
1036                 } else
1037 #endif
1038                 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
1039                     IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
1040                     ip->ip_src.s_addr == htonl(INADDR_BROADCAST))
1041                         goto drop;
1042 #ifdef INET6
1043                 if (isipv6) {
1044                         MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
1045                                M_SONAME, M_NOWAIT);
1046                         if (sin6 == NULL)
1047                                 goto drop;
1048                         bzero(sin6, sizeof(*sin6));
1049                         sin6->sin6_family = AF_INET6;
1050                         sin6->sin6_len = sizeof(*sin6);
1051                         sin6->sin6_addr = ip6->ip6_src;
1052                         sin6->sin6_port = th->th_sport;
1053                         laddr6 = inp->in6p_laddr;
1054                         if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
1055                                 inp->in6p_laddr = ip6->ip6_dst;
1056                         if (in6_pcbconnect(inp, (struct sockaddr *)sin6,
1057                                            &proc0)) {
1058                                 inp->in6p_laddr = laddr6;
1059                                 FREE(sin6, M_SONAME);
1060                                 goto drop;
1061                         }
1062                         FREE(sin6, M_SONAME);
1063                 } else
1064 #endif
1065               {
1066                 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
1067                        M_NOWAIT);
1068                 if (sin == NULL)
1069                         goto drop;
1070                 sin->sin_family = AF_INET;
1071                 sin->sin_len = sizeof(*sin);
1072                 sin->sin_addr = ip->ip_src;
1073                 sin->sin_port = th->th_sport;
1074                 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
1075                 laddr = inp->inp_laddr;
1076                 if (inp->inp_laddr.s_addr == INADDR_ANY)
1077                         inp->inp_laddr = ip->ip_dst;
1078                 if (in_pcbconnect(inp, (struct sockaddr *)sin, &proc0)) {
1079                         inp->inp_laddr = laddr;
1080                         FREE(sin, M_SONAME);
1081                         goto drop;
1082                 }
1083                 FREE(sin, M_SONAME);
1084               }
1085                 tp->t_template = tcp_template(tp);
1086                 if (tp->t_template == 0) {
1087                         tp = tcp_drop(tp, ENOBUFS);
1088                         dropsocket = 0;         /* socket is already gone */
1089                         goto drop;
1090                 }
1091                 if ((taop = tcp_gettaocache(inp)) == NULL) {
1092                         taop = &tao_noncached;
1093                         bzero(taop, sizeof(*taop));
1094                 }
1095                 tcp_dooptions(tp, optp, optlen, th, &to);
1096                 if (iss)
1097                         tp->iss = iss;
1098                 else
1099                         tp->iss = tcp_iss;
1100                 tcp_iss += TCP_ISSINCR/4;
1101                 tp->irs = th->th_seq;
1102                 tcp_sendseqinit(tp);
1103                 tcp_rcvseqinit(tp);
1104                 tp->snd_recover = tp->snd_una;
1105                 /*
1106                  * Initialization of the tcpcb for transaction;
1107                  *   set SND.WND = SEG.WND,
1108                  *   initialize CCsend and CCrecv.
1109                  */
1110                 tp->snd_wnd = tiwin;    /* initial send-window */
1111                 tp->cc_send = CC_INC(tcp_ccgen);
1112                 tp->cc_recv = to.to_cc;
1113                 /*
1114                  * Perform TAO test on incoming CC (SEG.CC) option, if any.
1115                  * - compare SEG.CC against cached CC from the same host,
1116                  *      if any.
1117                  * - if SEG.CC > cached value, SYN must be new and is accepted
1118                  *      immediately: save new CC in the cache, mark the socket
1119                  *      connected, enter ESTABLISHED state, turn on flag to
1120                  *      send a SYN in the next segment.
1121                  *      A virtual advertised window is set in rcv_adv to
1122                  *      initialize SWS prevention.  Then enter normal segment
1123                  *      processing: drop SYN, process data and FIN.
1124                  * - otherwise do a normal 3-way handshake.
1125                  */
1126                 if ((to.to_flag & TOF_CC) != 0) {
1127                     if (((tp->t_flags & TF_NOPUSH) != 0) &&
1128                         taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) {
1129
1130                         taop->tao_cc = to.to_cc;
1131                         tp->t_starttime = ticks;
1132                         tp->t_state = TCPS_ESTABLISHED;
1133
1134                         /*
1135                          * If there is a FIN, or if there is data and the
1136                          * connection is local, then delay SYN,ACK(SYN) in
1137                          * the hope of piggy-backing it on a response
1138                          * segment.  Otherwise must send ACK now in case
1139                          * the other side is slow starting.
1140                          */
1141                         if (tcp_delack_enabled && ((thflags & TH_FIN) ||
1142                             (tlen != 0 &&
1143 #ifdef INET6
1144                               ((isipv6 && in6_localaddr(&inp->in6p_faddr))
1145                               ||
1146                               (!isipv6 &&
1147 #endif
1148                             in_localaddr(inp->inp_faddr)
1149 #ifdef INET6
1150                                ))
1151 #endif
1152                              ))) {
1153                                 callout_reset(tp->tt_delack, tcp_delacktime,  
1154                                     tcp_timer_delack, tp);  
1155                                 tp->t_flags |= TF_NEEDSYN;
1156                         } else 
1157                                 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
1158
1159                         /*
1160                          * Limit the `virtual advertised window' to TCP_MAXWIN
1161                          * here.  Even if we requested window scaling, it will
1162                          * become effective only later when our SYN is acked.
1163                          */
1164                         tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN);
1165                         tcpstat.tcps_connects++;
1166                         soisconnected(so);
1167                         callout_reset(tp->tt_keep, tcp_keepinit,
1168                                       tcp_timer_keep, tp);
1169                         dropsocket = 0;         /* committed to socket */
1170                         tcpstat.tcps_accepts++;
1171                         goto trimthenstep6;
1172                     }
1173                 /* else do standard 3-way handshake */
1174                 } else {
1175                     /*
1176                      * No CC option, but maybe CC.NEW:
1177                      *   invalidate cached value.
1178                      */
1179                      taop->tao_cc = 0;
1180                 }
1181                 /*
1182                  * TAO test failed or there was no CC option,
1183                  *    do a standard 3-way handshake.
1184                  */
1185                 tp->t_flags |= TF_ACKNOW;
1186                 tp->t_state = TCPS_SYN_RECEIVED;
1187                 callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
1188                 dropsocket = 0;         /* committed to socket */
1189                 tcpstat.tcps_accepts++;
1190                 goto trimthenstep6;
1191                 }
1192
1193         /*
1194          * If the state is SYN_RECEIVED:
1195          *      if seg contains an ACK, but not for our SYN/ACK, send a RST.
1196          */
1197         case TCPS_SYN_RECEIVED:
1198                 if ((thflags & TH_ACK) &&
1199                     (SEQ_LEQ(th->th_ack, tp->snd_una) ||
1200                      SEQ_GT(th->th_ack, tp->snd_max)))
1201                                 goto maybedropwithreset;
1202                 break;
1203
1204         /*
1205          * If the state is SYN_SENT:
1206          *      if seg contains an ACK, but not for our SYN, drop the input.
1207          *      if seg contains a RST, then drop the connection.
1208          *      if seg does not contain SYN, then drop it.
1209          * Otherwise this is an acceptable SYN segment
1210          *      initialize tp->rcv_nxt and tp->irs
1211          *      if seg contains ack then advance tp->snd_una
1212          *      if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1213          *      arrange for segment to be acked (eventually)
1214          *      continue processing rest of data/controls, beginning with URG
1215          */
1216         case TCPS_SYN_SENT:
1217                 if ((taop = tcp_gettaocache(inp)) == NULL) {
1218                         taop = &tao_noncached;
1219                         bzero(taop, sizeof(*taop));
1220                 }
1221
1222                 if ((thflags & TH_ACK) &&
1223                     (SEQ_LEQ(th->th_ack, tp->iss) ||
1224                      SEQ_GT(th->th_ack, tp->snd_max))) {
1225                         /*
1226                          * If we have a cached CCsent for the remote host,
1227                          * hence we haven't just crashed and restarted,
1228                          * do not send a RST.  This may be a retransmission
1229                          * from the other side after our earlier ACK was lost.
1230                          * Our new SYN, when it arrives, will serve as the
1231                          * needed ACK.
1232                          */
1233                         if (taop->tao_ccsent != 0)
1234                                 goto drop;
1235                         else
1236                                 goto dropwithreset;
1237                 }
1238                 if (thflags & TH_RST) {
1239                         if (thflags & TH_ACK)
1240                                 tp = tcp_drop(tp, ECONNREFUSED);
1241                         goto drop;
1242                 }
1243                 if ((thflags & TH_SYN) == 0)
1244                         goto drop;
1245                 tp->snd_wnd = th->th_win;       /* initial send window */
1246                 tp->cc_recv = to.to_cc;         /* foreign CC */
1247
1248                 tp->irs = th->th_seq;
1249                 tcp_rcvseqinit(tp);
1250                 if (thflags & TH_ACK) {
1251                         /*
1252                          * Our SYN was acked.  If segment contains CC.ECHO
1253                          * option, check it to make sure this segment really
1254                          * matches our SYN.  If not, just drop it as old
1255                          * duplicate, but send an RST if we're still playing
1256                          * by the old rules.  If no CC.ECHO option, make sure
1257                          * we don't get fooled into using T/TCP.
1258                          */
1259                         if (to.to_flag & TOF_CCECHO) {
1260                                 if (tp->cc_send != to.to_ccecho) {
1261                                         if (taop->tao_ccsent != 0)
1262                                                 goto drop;
1263                                         else
1264                                                 goto dropwithreset;
1265                                 }
1266                         } else
1267                                 tp->t_flags &= ~TF_RCVD_CC;
1268                         tcpstat.tcps_connects++;
1269                         soisconnected(so);
1270                         /* Do window scaling on this connection? */
1271                         if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1272                                 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1273                                 tp->snd_scale = tp->requested_s_scale;
1274                                 tp->rcv_scale = tp->request_r_scale;
1275                         }
1276                         /* Segment is acceptable, update cache if undefined. */
1277                         if (taop->tao_ccsent == 0)
1278                                 taop->tao_ccsent = to.to_ccecho;
1279
1280                         tp->rcv_adv += tp->rcv_wnd;
1281                         tp->snd_una++;          /* SYN is acked */
1282                         /*
1283                          * If there's data, delay ACK; if there's also a FIN
1284                          * ACKNOW will be turned on later.
1285                          */
1286                         if (tcp_delack_enabled && tlen != 0)
1287                                 callout_reset(tp->tt_delack, tcp_delacktime,  
1288                                     tcp_timer_delack, tp);  
1289                         else
1290                                 tp->t_flags |= TF_ACKNOW;
1291                         /*
1292                          * Received <SYN,ACK> in SYN_SENT[*] state.
1293                          * Transitions:
1294                          *      SYN_SENT  --> ESTABLISHED
1295                          *      SYN_SENT* --> FIN_WAIT_1
1296                          */
1297                         tp->t_starttime = ticks;
1298                         if (tp->t_flags & TF_NEEDFIN) {
1299                                 tp->t_state = TCPS_FIN_WAIT_1;
1300                                 tp->t_flags &= ~TF_NEEDFIN;
1301                                 thflags &= ~TH_SYN;
1302                         } else {
1303                                 tp->t_state = TCPS_ESTABLISHED;
1304                                 callout_reset(tp->tt_keep, tcp_keepidle,
1305                                               tcp_timer_keep, tp);
1306                         }
1307                 } else {
1308                 /*
1309                  *  Received initial SYN in SYN-SENT[*] state => simul-
1310                  *  taneous open.  If segment contains CC option and there is
1311                  *  a cached CC, apply TAO test; if it succeeds, connection is
1312                  *  half-synchronized.  Otherwise, do 3-way handshake:
1313                  *        SYN-SENT -> SYN-RECEIVED
1314                  *        SYN-SENT* -> SYN-RECEIVED*
1315                  *  If there was no CC option, clear cached CC value.
1316                  */
1317                         tp->t_flags |= TF_ACKNOW;
1318                         callout_stop(tp->tt_rexmt);
1319                         if (to.to_flag & TOF_CC) {
1320                                 if (taop->tao_cc != 0 &&
1321                                     CC_GT(to.to_cc, taop->tao_cc)) {
1322                                         /*
1323                                          * update cache and make transition:
1324                                          *        SYN-SENT -> ESTABLISHED*
1325                                          *        SYN-SENT* -> FIN-WAIT-1*
1326                                          */
1327                                         taop->tao_cc = to.to_cc;
1328                                         tp->t_starttime = ticks;
1329                                         if (tp->t_flags & TF_NEEDFIN) {
1330                                                 tp->t_state = TCPS_FIN_WAIT_1;
1331                                                 tp->t_flags &= ~TF_NEEDFIN;
1332                                         } else {
1333                                                 tp->t_state = TCPS_ESTABLISHED;
1334                                                 callout_reset(tp->tt_keep,
1335                                                               tcp_keepidle,
1336                                                               tcp_timer_keep,
1337                                                               tp);
1338                                         }
1339                                         tp->t_flags |= TF_NEEDSYN;
1340                                 } else
1341                                         tp->t_state = TCPS_SYN_RECEIVED;
1342                         } else {
1343                                 /* CC.NEW or no option => invalidate cache */
1344                                 taop->tao_cc = 0;
1345                                 tp->t_state = TCPS_SYN_RECEIVED;
1346                         }
1347                 }
1348
1349 trimthenstep6:
1350                 /*
1351                  * Advance th->th_seq to correspond to first data byte.
1352                  * If data, trim to stay within window,
1353                  * dropping FIN if necessary.
1354                  */
1355                 th->th_seq++;
1356                 if (tlen > tp->rcv_wnd) {
1357                         todrop = tlen - tp->rcv_wnd;
1358                         m_adj(m, -todrop);
1359                         tlen = tp->rcv_wnd;
1360                         thflags &= ~TH_FIN;
1361                         tcpstat.tcps_rcvpackafterwin++;
1362                         tcpstat.tcps_rcvbyteafterwin += todrop;
1363                 }
1364                 tp->snd_wl1 = th->th_seq - 1;
1365                 tp->rcv_up = th->th_seq;
1366                 /*
1367                  *  Client side of transaction: already sent SYN and data.
1368                  *  If the remote host used T/TCP to validate the SYN,
1369                  *  our data will be ACK'd; if so, enter normal data segment
1370                  *  processing in the middle of step 5, ack processing.
1371                  *  Otherwise, goto step 6.
1372                  */
1373                 if (thflags & TH_ACK)
1374                         goto process_ACK;
1375                 goto step6;
1376         /*
1377          * If the state is LAST_ACK or CLOSING or TIME_WAIT:
1378          *      if segment contains a SYN and CC [not CC.NEW] option:
1379          *              if state == TIME_WAIT and connection duration > MSL,
1380          *                  drop packet and send RST;
1381          *
1382          *              if SEG.CC > CCrecv then is new SYN, and can implicitly
1383          *                  ack the FIN (and data) in retransmission queue.
1384          *                  Complete close and delete TCPCB.  Then reprocess
1385          *                  segment, hoping to find new TCPCB in LISTEN state;
1386          *
1387          *              else must be old SYN; drop it.
1388          *      else do normal processing.
1389          */
1390         case TCPS_LAST_ACK:
1391         case TCPS_CLOSING:
1392         case TCPS_TIME_WAIT:
1393                 if ((thflags & TH_SYN) &&
1394                     (to.to_flag & TOF_CC) && tp->cc_recv != 0) {
1395                         if (tp->t_state == TCPS_TIME_WAIT &&
1396                                         (ticks - tp->t_starttime) > tcp_msl)
1397                                 goto dropwithreset;
1398                         if (CC_GT(to.to_cc, tp->cc_recv)) {
1399                                 tp = tcp_close(tp);
1400                                 goto findpcb;
1401                         }
1402                         else
1403                                 goto drop;
1404                 }
1405                 break;  /* continue normal processing */
1406         }
1407
1408         /*
1409          * States other than LISTEN or SYN_SENT.
1410          * First check the RST flag and sequence number since reset segments
1411          * are exempt from the timestamp and connection count tests.  This
1412          * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
1413          * below which allowed reset segments in half the sequence space
1414          * to fall through and be processed (which gives forged reset
1415          * segments with a random sequence number a 50 percent chance of
1416          * killing a connection).
1417          * Then check timestamp, if present.
1418          * Then check the connection count, if present.
1419          * Then check that at least some bytes of segment are within
1420          * receive window.  If segment begins before rcv_nxt,
1421          * drop leading data (and SYN); if nothing left, just ack.
1422          *
1423          *
1424          * If the RST bit is set, check the sequence number to see
1425          * if this is a valid reset segment.
1426          * RFC 793 page 37:
1427          *   In all states except SYN-SENT, all reset (RST) segments
1428          *   are validated by checking their SEQ-fields.  A reset is
1429          *   valid if its sequence number is in the window.
1430          * Note: this does not take into account delayed ACKs, so
1431          *   we should test against last_ack_sent instead of rcv_nxt.
1432          *   The sequence number in the reset segment is normally an
1433          *   echo of our outgoing acknowledgement numbers, but some hosts
1434          *   send a reset with the sequence number at the rightmost edge
1435          *   of our receive window, and we have to handle this case.
1436          * If we have multiple segments in flight, the initial reset
1437          * segment sequence numbers will be to the left of last_ack_sent,
1438          * but they will eventually catch up.
1439          * In any case, it never made sense to trim reset segments to
1440          * fit the receive window since RFC 1122 says:
1441          *   4.2.2.12  RST Segment: RFC-793 Section 3.4
1442          *
1443          *    A TCP SHOULD allow a received RST segment to include data.
1444          *
1445          *    DISCUSSION
1446          *         It has been suggested that a RST segment could contain
1447          *         ASCII text that encoded and explained the cause of the
1448          *         RST.  No standard has yet been established for such
1449          *         data.
1450          *
1451          * If the reset segment passes the sequence number test examine
1452          * the state:
1453          *    SYN_RECEIVED STATE:
1454          *      If passive open, return to LISTEN state.
1455          *      If active open, inform user that connection was refused.
1456          *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
1457          *      Inform user that connection was reset, and close tcb.
1458          *    CLOSING, LAST_ACK STATES:
1459          *      Close the tcb.
1460          *    TIME_WAIT STATE:
1461          *      Drop the segment - see Stevens, vol. 2, p. 964 and
1462          *      RFC 1337.
1463          */
1464         if (thflags & TH_RST) {
1465                 if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
1466                     SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
1467                         switch (tp->t_state) {
1468
1469                         case TCPS_SYN_RECEIVED:
1470                                 so->so_error = ECONNREFUSED;
1471                                 goto close;
1472
1473                         case TCPS_ESTABLISHED:
1474                         case TCPS_FIN_WAIT_1:
1475                         case TCPS_FIN_WAIT_2:
1476                         case TCPS_CLOSE_WAIT:
1477                                 so->so_error = ECONNRESET;
1478                         close:
1479                                 tp->t_state = TCPS_CLOSED;
1480                                 tcpstat.tcps_drops++;
1481                                 tp = tcp_close(tp);
1482                                 break;
1483
1484                         case TCPS_CLOSING:
1485                         case TCPS_LAST_ACK:
1486                                 tp = tcp_close(tp);
1487                                 break;
1488
1489                         case TCPS_TIME_WAIT:
1490                                 break;
1491                         }
1492                 }
1493                 goto drop;
1494         }
1495
1496         /*
1497          * RFC 1323 PAWS: If we have a timestamp reply on this segment
1498          * and it's less than ts_recent, drop it.
1499          */
1500         if ((to.to_flag & TOF_TS) != 0 && tp->ts_recent &&
1501             TSTMP_LT(to.to_tsval, tp->ts_recent)) {
1502
1503                 /* Check to see if ts_recent is over 24 days old.  */
1504                 if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) {
1505                         /*
1506                          * Invalidate ts_recent.  If this segment updates
1507                          * ts_recent, the age will be reset later and ts_recent
1508                          * will get a valid value.  If it does not, setting
1509                          * ts_recent to zero will at least satisfy the
1510                          * requirement that zero be placed in the timestamp
1511                          * echo reply when ts_recent isn't valid.  The
1512                          * age isn't reset until we get a valid ts_recent
1513                          * because we don't want out-of-order segments to be
1514                          * dropped when ts_recent is old.
1515                          */
1516                         tp->ts_recent = 0;
1517                 } else {
1518                         tcpstat.tcps_rcvduppack++;
1519                         tcpstat.tcps_rcvdupbyte += tlen;
1520                         tcpstat.tcps_pawsdrop++;
1521                         goto dropafterack;
1522                 }
1523         }
1524
1525         /*
1526          * T/TCP mechanism
1527          *   If T/TCP was negotiated and the segment doesn't have CC,
1528          *   or if its CC is wrong then drop the segment.
1529          *   RST segments do not have to comply with this.
1530          */
1531         if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) &&
1532             ((to.to_flag & TOF_CC) == 0 || tp->cc_recv != to.to_cc))
1533                 goto dropafterack;
1534
1535         /*
1536          * In the SYN-RECEIVED state, validate that the packet belongs to
1537          * this connection before trimming the data to fit the receive
1538          * window.  Check the sequence number versus IRS since we know
1539          * the sequence numbers haven't wrapped.  This is a partial fix
1540          * for the "LAND" DoS attack.
1541          */
1542         if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs))
1543                 goto maybedropwithreset;
1544
1545         todrop = tp->rcv_nxt - th->th_seq;
1546         if (todrop > 0) {
1547                 if (thflags & TH_SYN) {
1548                         thflags &= ~TH_SYN;
1549                         th->th_seq++;
1550                         if (th->th_urp > 1)
1551                                 th->th_urp--;
1552                         else
1553                                 thflags &= ~TH_URG;
1554                         todrop--;
1555                 }
1556                 /*
1557                  * Following if statement from Stevens, vol. 2, p. 960.
1558                  */
1559                 if (todrop > tlen
1560                     || (todrop == tlen && (thflags & TH_FIN) == 0)) {
1561                         /*
1562                          * Any valid FIN must be to the left of the window.
1563                          * At this point the FIN must be a duplicate or out
1564                          * of sequence; drop it.
1565                          */
1566                         thflags &= ~TH_FIN;
1567
1568                         /*
1569                          * Send an ACK to resynchronize and drop any data.
1570                          * But keep on processing for RST or ACK.
1571                          */
1572                         tp->t_flags |= TF_ACKNOW;
1573                         todrop = tlen;
1574                         tcpstat.tcps_rcvduppack++;
1575                         tcpstat.tcps_rcvdupbyte += todrop;
1576                 } else {
1577                         tcpstat.tcps_rcvpartduppack++;
1578                         tcpstat.tcps_rcvpartdupbyte += todrop;
1579                 }
1580                 drop_hdrlen += todrop;  /* drop from the top afterwards */
1581                 th->th_seq += todrop;
1582                 tlen -= todrop;
1583                 if (th->th_urp > todrop)
1584                         th->th_urp -= todrop;
1585                 else {
1586                         thflags &= ~TH_URG;
1587                         th->th_urp = 0;
1588                 }
1589         }
1590
1591         /*
1592          * If new data are received on a connection after the
1593          * user processes are gone, then RST the other end.
1594          */
1595         if ((so->so_state & SS_NOFDREF) &&
1596             tp->t_state > TCPS_CLOSE_WAIT && tlen) {
1597                 tp = tcp_close(tp);
1598                 tcpstat.tcps_rcvafterclose++;
1599                 goto dropwithreset;
1600         }
1601
1602         /*
1603          * If segment ends after window, drop trailing data
1604          * (and PUSH and FIN); if nothing left, just ACK.
1605          */
1606         todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
1607         if (todrop > 0) {
1608                 tcpstat.tcps_rcvpackafterwin++;
1609                 if (todrop >= tlen) {
1610                         tcpstat.tcps_rcvbyteafterwin += tlen;
1611                         /*
1612                          * If a new connection request is received
1613                          * while in TIME_WAIT, drop the old connection
1614                          * and start over if the sequence numbers
1615                          * are above the previous ones.
1616                          */
1617                         if (thflags & TH_SYN &&
1618                             tp->t_state == TCPS_TIME_WAIT &&
1619                             SEQ_GT(th->th_seq, tp->rcv_nxt)) {
1620                                 iss = tp->snd_nxt + TCP_ISSINCR;
1621                                 tp = tcp_close(tp);
1622                                 goto findpcb;
1623                         }
1624                         /*
1625                          * If window is closed can only take segments at
1626                          * window edge, and have to drop data and PUSH from
1627                          * incoming segments.  Continue processing, but
1628                          * remember to ack.  Otherwise, drop segment
1629                          * and ack.
1630                          */
1631                         if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1632                                 tp->t_flags |= TF_ACKNOW;
1633                                 tcpstat.tcps_rcvwinprobe++;
1634                         } else
1635                                 goto dropafterack;
1636                 } else
1637                         tcpstat.tcps_rcvbyteafterwin += todrop;
1638                 m_adj(m, -todrop);
1639                 tlen -= todrop;
1640                 thflags &= ~(TH_PUSH|TH_FIN);
1641         }
1642
1643         /*
1644          * If last ACK falls within this segment's sequence numbers,
1645          * record its timestamp.
1646          * NOTE that the test is modified according to the latest
1647          * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1648          */
1649         if ((to.to_flag & TOF_TS) != 0 &&
1650             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1651                 tp->ts_recent_age = ticks;
1652                 tp->ts_recent = to.to_tsval;
1653         }
1654
1655         /*
1656          * If a SYN is in the window, then this is an
1657          * error and we send an RST and drop the connection.
1658          */
1659         if (thflags & TH_SYN) {
1660                 tp = tcp_drop(tp, ECONNRESET);
1661                 goto dropwithreset;
1662         }
1663
1664         /*
1665          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN
1666          * flag is on (half-synchronized state), then queue data for
1667          * later processing; else drop segment and return.
1668          */
1669         if ((thflags & TH_ACK) == 0) {
1670                 if (tp->t_state == TCPS_SYN_RECEIVED ||
1671                     (tp->t_flags & TF_NEEDSYN))
1672                         goto step6;
1673                 else
1674                         goto drop;
1675         }
1676
1677         /*
1678          * Ack processing.
1679          */
1680         switch (tp->t_state) {
1681
1682         /*
1683          * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
1684          * ESTABLISHED state and continue processing.
1685          * The ACK was checked above.
1686          */
1687         case TCPS_SYN_RECEIVED:
1688
1689                 tcpstat.tcps_connects++;
1690                 soisconnected(so);
1691                 /* Do window scaling? */
1692                 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1693                         (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1694                         tp->snd_scale = tp->requested_s_scale;
1695                         tp->rcv_scale = tp->request_r_scale;
1696                 }
1697                 /*
1698                  * Upon successful completion of 3-way handshake,
1699                  * update cache.CC if it was undefined, pass any queued
1700                  * data to the user, and advance state appropriately.
1701                  */
1702                 if ((taop = tcp_gettaocache(inp)) != NULL &&
1703                     taop->tao_cc == 0)
1704                         taop->tao_cc = tp->cc_recv;
1705
1706                 /*
1707                  * Make transitions:
1708                  *      SYN-RECEIVED  -> ESTABLISHED
1709                  *      SYN-RECEIVED* -> FIN-WAIT-1
1710                  */
1711                 tp->t_starttime = ticks;
1712                 if (tp->t_flags & TF_NEEDFIN) {
1713                         tp->t_state = TCPS_FIN_WAIT_1;
1714                         tp->t_flags &= ~TF_NEEDFIN;
1715                 } else {
1716                         tp->t_state = TCPS_ESTABLISHED;
1717                         callout_reset(tp->tt_keep, tcp_keepidle, 
1718                                       tcp_timer_keep, tp);
1719                 }
1720                 /*
1721                  * If segment contains data or FIN, will call tcp_reass()
1722                  * later; if not, do so now to pass queued data to user.
1723                  */
1724                 if (tlen == 0 && (thflags & TH_FIN) == 0)
1725                         (void) tcp_reass(tp, (struct tcphdr *)0, 0,
1726                             (struct mbuf *)0);
1727                 tp->snd_wl1 = th->th_seq - 1;
1728                 /* fall into ... */
1729
1730         /*
1731          * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1732          * ACKs.  If the ack is in the range
1733          *      tp->snd_una < th->th_ack <= tp->snd_max
1734          * then advance tp->snd_una to th->th_ack and drop
1735          * data from the retransmission queue.  If this ACK reflects
1736          * more up to date window information we update our window information.
1737          */
1738         case TCPS_ESTABLISHED:
1739         case TCPS_FIN_WAIT_1:
1740         case TCPS_FIN_WAIT_2:
1741         case TCPS_CLOSE_WAIT:
1742         case TCPS_CLOSING:
1743         case TCPS_LAST_ACK:
1744         case TCPS_TIME_WAIT:
1745
1746                 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
1747                         if (tlen == 0 && tiwin == tp->snd_wnd) {
1748                                 tcpstat.tcps_rcvdupack++;
1749                                 /*
1750                                  * If we have outstanding data (other than
1751                                  * a window probe), this is a completely
1752                                  * duplicate ack (ie, window info didn't
1753                                  * change), the ack is the biggest we've
1754                                  * seen and we've seen exactly our rexmt
1755                                  * threshold of them, assume a packet
1756                                  * has been dropped and retransmit it.
1757                                  * Kludge snd_nxt & the congestion
1758                                  * window so we send only this one
1759                                  * packet.
1760                                  *
1761                                  * We know we're losing at the current
1762                                  * window size so do congestion avoidance
1763                                  * (set ssthresh to half the current window
1764                                  * and pull our congestion window back to
1765                                  * the new ssthresh).
1766                                  *
1767                                  * Dup acks mean that packets have left the
1768                                  * network (they're now cached at the receiver)
1769                                  * so bump cwnd by the amount in the receiver
1770                                  * to keep a constant cwnd packets in the
1771                                  * network.
1772                                  */
1773                                 if (!callout_active(tp->tt_rexmt) ||
1774                                     th->th_ack != tp->snd_una)
1775                                         tp->t_dupacks = 0;
1776                                 else if (++tp->t_dupacks == tcprexmtthresh) {
1777                                         tcp_seq onxt = tp->snd_nxt;
1778                                         u_int win =
1779                                             min(tp->snd_wnd, tp->snd_cwnd) / 2 /
1780                                                 tp->t_maxseg;
1781                                         if (tcp_do_newreno && SEQ_LT(th->th_ack,
1782                                             tp->snd_recover)) {
1783                                                 /* False retransmit, should not
1784                                                  * cut window
1785                                                  */
1786                                                 tp->snd_cwnd += tp->t_maxseg;
1787                                                 tp->t_dupacks = 0;
1788                                                 (void) tcp_output(tp);
1789                                                 goto drop;
1790                                         }
1791                                         if (win < 2)
1792                                                 win = 2;
1793                                         tp->snd_ssthresh = win * tp->t_maxseg;
1794                                         tp->snd_recover = tp->snd_max;
1795                                         callout_stop(tp->tt_rexmt);
1796                                         tp->t_rtttime = 0;
1797                                         tp->snd_nxt = th->th_ack;
1798                                         tp->snd_cwnd = tp->t_maxseg;
1799                                         (void) tcp_output(tp);
1800                                         tp->snd_cwnd = tp->snd_ssthresh +
1801                                                tp->t_maxseg * tp->t_dupacks;
1802                                         if (SEQ_GT(onxt, tp->snd_nxt))
1803                                                 tp->snd_nxt = onxt;
1804                                         goto drop;
1805                                 } else if (tp->t_dupacks > tcprexmtthresh) {
1806                                         tp->snd_cwnd += tp->t_maxseg;
1807                                         (void) tcp_output(tp);
1808                                         goto drop;
1809                                 }
1810                         } else
1811                                 tp->t_dupacks = 0;
1812                         break;
1813                 }
1814                 /*
1815                  * If the congestion window was inflated to account
1816                  * for the other side's cached packets, retract it.
1817                  */
1818                 if (tcp_do_newreno == 0) {
1819                         if (tp->t_dupacks >= tcprexmtthresh &&
1820                                 tp->snd_cwnd > tp->snd_ssthresh)
1821                                 tp->snd_cwnd = tp->snd_ssthresh;
1822                         tp->t_dupacks = 0;
1823                 } else if (tp->t_dupacks >= tcprexmtthresh &&
1824                     !tcp_newreno(tp, th)) {
1825                         /*
1826                          * Window inflation should have left us with approx.
1827                          * snd_ssthresh outstanding data.  But in case we
1828                          * would be inclined to send a burst, better to do
1829                          * it via the slow start mechanism.
1830                          */
1831                         if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max))
1832                                 tp->snd_cwnd =
1833                                     tp->snd_max - th->th_ack + tp->t_maxseg;
1834                         else
1835                                 tp->snd_cwnd = tp->snd_ssthresh;
1836                         tp->t_dupacks = 0;
1837                 }
1838                 if (SEQ_GT(th->th_ack, tp->snd_max)) {
1839                         tcpstat.tcps_rcvacktoomuch++;
1840                         goto dropafterack;
1841                 }
1842                 /*
1843                  *  If we reach this point, ACK is not a duplicate,
1844                  *     i.e., it ACKs something we sent.
1845                  */
1846                 if (tp->t_flags & TF_NEEDSYN) {
1847                         /*
1848                          * T/TCP: Connection was half-synchronized, and our
1849                          * SYN has been ACK'd (so connection is now fully
1850                          * synchronized).  Go to non-starred state,
1851                          * increment snd_una for ACK of SYN, and check if
1852                          * we can do window scaling.
1853                          */
1854                         tp->t_flags &= ~TF_NEEDSYN;
1855                         tp->snd_una++;
1856                         /* Do window scaling? */
1857                         if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1858                                 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1859                                 tp->snd_scale = tp->requested_s_scale;
1860                                 tp->rcv_scale = tp->request_r_scale;
1861                         }
1862                 }
1863
1864 process_ACK:
1865                 acked = th->th_ack - tp->snd_una;
1866                 tcpstat.tcps_rcvackpack++;
1867                 tcpstat.tcps_rcvackbyte += acked;
1868
1869                 /*
1870                  * If we just performed our first retransmit, and the ACK
1871                  * arrives within our recovery window, then it was a mistake
1872                  * to do the retransmit in the first place.  Recover our
1873                  * original cwnd and ssthresh, and proceed to transmit where
1874                  * we left off.
1875                  */
1876                 if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) {
1877                         tp->snd_cwnd = tp->snd_cwnd_prev;
1878                         tp->snd_ssthresh = tp->snd_ssthresh_prev;
1879                         tp->snd_nxt = tp->snd_max;
1880                         tp->t_badrxtwin = 0;    /* XXX probably not required */ 
1881                 }
1882
1883                 /*
1884                  * If we have a timestamp reply, update smoothed
1885                  * round trip time.  If no timestamp is present but
1886                  * transmit timer is running and timed sequence
1887                  * number was acked, update smoothed round trip time.
1888                  * Since we now have an rtt measurement, cancel the
1889                  * timer backoff (cf., Phil Karn's retransmit alg.).
1890                  * Recompute the initial retransmit timer.
1891                  */
1892                 if (to.to_flag & TOF_TS)
1893                         tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
1894                 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
1895                         tcp_xmit_timer(tp, ticks - tp->t_rtttime);
1896
1897                 /*
1898                  * If all outstanding data is acked, stop retransmit
1899                  * timer and remember to restart (more output or persist).
1900                  * If there is more data to be acked, restart retransmit
1901                  * timer, using current (possibly backed-off) value.
1902                  */
1903                 if (th->th_ack == tp->snd_max) {
1904                         callout_stop(tp->tt_rexmt);
1905                         needoutput = 1;
1906                 } else if (!callout_active(tp->tt_persist))
1907                         callout_reset(tp->tt_rexmt, tp->t_rxtcur,
1908                                       tcp_timer_rexmt, tp);
1909
1910                 /*
1911                  * If no data (only SYN) was ACK'd,
1912                  *    skip rest of ACK processing.
1913                  */
1914                 if (acked == 0)
1915                         goto step6;
1916
1917                 /*
1918                  * When new data is acked, open the congestion window.
1919                  * If the window gives us less than ssthresh packets
1920                  * in flight, open exponentially (maxseg per packet).
1921                  * Otherwise open linearly: maxseg per window
1922                  * (maxseg^2 / cwnd per packet).
1923                  */
1924                 {
1925                 register u_int cw = tp->snd_cwnd;
1926                 register u_int incr = tp->t_maxseg;
1927
1928                 if (cw > tp->snd_ssthresh)
1929                         incr = incr * incr / cw;
1930                 if (tcp_do_newreno == 0 || SEQ_GEQ(th->th_ack, tp->snd_recover))
1931                         tp->snd_cwnd = min(cw + incr,TCP_MAXWIN<<tp->snd_scale);
1932                 }
1933                 if (acked > so->so_snd.sb_cc) {
1934                         tp->snd_wnd -= so->so_snd.sb_cc;
1935                         sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
1936                         ourfinisacked = 1;
1937                 } else {
1938                         sbdrop(&so->so_snd, acked);
1939                         tp->snd_wnd -= acked;
1940                         ourfinisacked = 0;
1941                 }
1942                 sowwakeup(so);
1943                 tp->snd_una = th->th_ack;
1944                 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1945                         tp->snd_nxt = tp->snd_una;
1946
1947                 switch (tp->t_state) {
1948
1949                 /*
1950                  * In FIN_WAIT_1 STATE in addition to the processing
1951                  * for the ESTABLISHED state if our FIN is now acknowledged
1952                  * then enter FIN_WAIT_2.
1953                  */
1954                 case TCPS_FIN_WAIT_1:
1955                         if (ourfinisacked) {
1956                                 /*
1957                                  * If we can't receive any more
1958                                  * data, then closing user can proceed.
1959                                  * Starting the timer is contrary to the
1960                                  * specification, but if we don't get a FIN
1961                                  * we'll hang forever.
1962                                  */
1963                                 if (so->so_state & SS_CANTRCVMORE) {
1964                                         soisdisconnected(so);
1965                                         callout_reset(tp->tt_2msl, tcp_maxidle,
1966                                                       tcp_timer_2msl, tp);
1967                                 }
1968                                 tp->t_state = TCPS_FIN_WAIT_2;
1969                         }
1970                         break;
1971
1972                 /*
1973                  * In CLOSING STATE in addition to the processing for
1974                  * the ESTABLISHED state if the ACK acknowledges our FIN
1975                  * then enter the TIME-WAIT state, otherwise ignore
1976                  * the segment.
1977                  */
1978                 case TCPS_CLOSING:
1979                         if (ourfinisacked) {
1980                                 tp->t_state = TCPS_TIME_WAIT;
1981                                 tcp_canceltimers(tp);
1982                                 /* Shorten TIME_WAIT [RFC-1644, p.28] */
1983                                 if (tp->cc_recv != 0 &&
1984                                     (ticks - tp->t_starttime) < tcp_msl)
1985                                         callout_reset(tp->tt_2msl,
1986                                                       tp->t_rxtcur *
1987                                                       TCPTV_TWTRUNC,
1988                                                       tcp_timer_2msl, tp);
1989                                 else
1990                                         callout_reset(tp->tt_2msl, 2 * tcp_msl,
1991                                                       tcp_timer_2msl, tp);
1992                                 soisdisconnected(so);
1993                         }
1994                         break;
1995
1996                 /*
1997                  * In LAST_ACK, we may still be waiting for data to drain
1998                  * and/or to be acked, as well as for the ack of our FIN.
1999                  * If our FIN is now acknowledged, delete the TCB,
2000                  * enter the closed state and return.
2001                  */
2002                 case TCPS_LAST_ACK:
2003                         if (ourfinisacked) {
2004                                 tp = tcp_close(tp);
2005                                 goto drop;
2006                         }
2007                         break;
2008
2009                 /*
2010                  * In TIME_WAIT state the only thing that should arrive
2011                  * is a retransmission of the remote FIN.  Acknowledge
2012                  * it and restart the finack timer.
2013                  */
2014                 case TCPS_TIME_WAIT:
2015                         callout_reset(tp->tt_2msl, 2 * tcp_msl,
2016                                       tcp_timer_2msl, tp);
2017                         goto dropafterack;
2018                 }
2019         }
2020
2021 step6:
2022         /*
2023          * Update window information.
2024          * Don't look at window if no ACK: TAC's send garbage on first SYN.
2025          */
2026         if ((thflags & TH_ACK) &&
2027             (SEQ_LT(tp->snd_wl1, th->th_seq) ||
2028             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
2029              (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
2030                 /* keep track of pure window updates */
2031                 if (tlen == 0 &&
2032                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
2033                         tcpstat.tcps_rcvwinupd++;
2034                 tp->snd_wnd = tiwin;
2035                 tp->snd_wl1 = th->th_seq;
2036                 tp->snd_wl2 = th->th_ack;
2037                 if (tp->snd_wnd > tp->max_sndwnd)
2038                         tp->max_sndwnd = tp->snd_wnd;
2039                 needoutput = 1;
2040         }
2041
2042         /*
2043          * Process segments with URG.
2044          */
2045         if ((thflags & TH_URG) && th->th_urp &&
2046             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2047                 /*
2048                  * This is a kludge, but if we receive and accept
2049                  * random urgent pointers, we'll crash in
2050                  * soreceive.  It's hard to imagine someone
2051                  * actually wanting to send this much urgent data.
2052                  */
2053                 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
2054                         th->th_urp = 0;                 /* XXX */
2055                         thflags &= ~TH_URG;             /* XXX */
2056                         goto dodata;                    /* XXX */
2057                 }
2058                 /*
2059                  * If this segment advances the known urgent pointer,
2060                  * then mark the data stream.  This should not happen
2061                  * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2062                  * a FIN has been received from the remote side.
2063                  * In these states we ignore the URG.
2064                  *
2065                  * According to RFC961 (Assigned Protocols),
2066                  * the urgent pointer points to the last octet
2067                  * of urgent data.  We continue, however,
2068                  * to consider it to indicate the first octet
2069                  * of data past the urgent section as the original
2070                  * spec states (in one of two places).
2071                  */
2072                 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
2073                         tp->rcv_up = th->th_seq + th->th_urp;
2074                         so->so_oobmark = so->so_rcv.sb_cc +
2075                             (tp->rcv_up - tp->rcv_nxt) - 1;
2076                         if (so->so_oobmark == 0)
2077                                 so->so_state |= SS_RCVATMARK;
2078                         sohasoutofband(so);
2079                         tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2080                 }
2081                 /*
2082                  * Remove out of band data so doesn't get presented to user.
2083                  * This can happen independent of advancing the URG pointer,
2084                  * but if two URG's are pending at once, some out-of-band
2085                  * data may creep in... ick.
2086                  */
2087                 if (th->th_urp <= (u_long)tlen
2088 #ifdef SO_OOBINLINE
2089                      && (so->so_options & SO_OOBINLINE) == 0
2090 #endif
2091                      )
2092                         tcp_pulloutofband(so, th, m,
2093                                 drop_hdrlen);   /* hdr drop is delayed */
2094         } else
2095                 /*
2096                  * If no out of band data is expected,
2097                  * pull receive urgent pointer along
2098                  * with the receive window.
2099                  */
2100                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2101                         tp->rcv_up = tp->rcv_nxt;
2102 dodata:                                                 /* XXX */
2103
2104         /*
2105          * Process the segment text, merging it into the TCP sequencing queue,
2106          * and arranging for acknowledgment of receipt if necessary.
2107          * This process logically involves adjusting tp->rcv_wnd as data
2108          * is presented to the user (this happens in tcp_usrreq.c,
2109          * case PRU_RCVD).  If a FIN has already been received on this
2110          * connection then we just ignore the text.
2111          */
2112         if ((tlen || (thflags&TH_FIN)) &&
2113             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2114                 m_adj(m, drop_hdrlen);  /* delayed header drop */
2115                 TCP_REASS(tp, th, &tlen, m, so, thflags);
2116                 /*
2117                  * Note the amount of data that peer has sent into
2118                  * our window, in order to estimate the sender's
2119                  * buffer size.
2120                  */
2121                 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2122         } else {
2123                 m_freem(m);
2124                 thflags &= ~TH_FIN;
2125         }
2126
2127         /*
2128          * If FIN is received ACK the FIN and let the user know
2129          * that the connection is closing.
2130          */
2131         if (thflags & TH_FIN) {
2132                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2133                         socantrcvmore(so);
2134                         /*
2135                          *  If connection is half-synchronized
2136                          *  (ie NEEDSYN flag on) then delay ACK,
2137                          *  so it may be piggybacked when SYN is sent.
2138                          *  Otherwise, since we received a FIN then no
2139                          *  more input can be expected, send ACK now.
2140                          */
2141                         if (tcp_delack_enabled && (tp->t_flags & TF_NEEDSYN))
2142                                 callout_reset(tp->tt_delack, tcp_delacktime,  
2143                                     tcp_timer_delack, tp);  
2144                         else
2145                                 tp->t_flags |= TF_ACKNOW;
2146                         tp->rcv_nxt++;
2147                 }
2148                 switch (tp->t_state) {
2149
2150                 /*
2151                  * In SYN_RECEIVED and ESTABLISHED STATES
2152                  * enter the CLOSE_WAIT state.
2153                  */
2154                 case TCPS_SYN_RECEIVED:
2155                         tp->t_starttime = ticks;
2156                         /*FALLTHROUGH*/
2157                 case TCPS_ESTABLISHED:
2158                         tp->t_state = TCPS_CLOSE_WAIT;
2159                         break;
2160
2161                 /*
2162                  * If still in FIN_WAIT_1 STATE FIN has not been acked so
2163                  * enter the CLOSING state.
2164                  */
2165                 case TCPS_FIN_WAIT_1:
2166                         tp->t_state = TCPS_CLOSING;
2167                         break;
2168
2169                 /*
2170                  * In FIN_WAIT_2 state enter the TIME_WAIT state,
2171                  * starting the time-wait timer, turning off the other
2172                  * standard timers.
2173                  */
2174                 case TCPS_FIN_WAIT_2:
2175                         tp->t_state = TCPS_TIME_WAIT;
2176                         tcp_canceltimers(tp);
2177                         /* Shorten TIME_WAIT [RFC-1644, p.28] */
2178                         if (tp->cc_recv != 0 &&
2179                             (ticks - tp->t_starttime) < tcp_msl) {
2180                                 callout_reset(tp->tt_2msl,
2181                                               tp->t_rxtcur * TCPTV_TWTRUNC,
2182                                               tcp_timer_2msl, tp);
2183                                 /* For transaction client, force ACK now. */
2184                                 tp->t_flags |= TF_ACKNOW;
2185                         }
2186                         else
2187                                 callout_reset(tp->tt_2msl, 2 * tcp_msl,
2188                                               tcp_timer_2msl, tp);
2189                         soisdisconnected(so);
2190                         break;
2191
2192                 /*
2193                  * In TIME_WAIT state restart the 2 MSL time_wait timer.
2194                  */
2195                 case TCPS_TIME_WAIT:
2196                         callout_reset(tp->tt_2msl, 2 * tcp_msl,
2197                                       tcp_timer_2msl, tp);
2198                         break;
2199                 }
2200         }
2201 #ifdef TCPDEBUG
2202         if (so->so_options & SO_DEBUG)
2203                 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
2204                           &tcp_savetcp, 0);
2205 #endif
2206
2207         /*
2208          * Return any desired output.
2209          */
2210         if (needoutput || (tp->t_flags & TF_ACKNOW))
2211                 (void) tcp_output(tp);
2212         return;
2213
2214 dropafterack:
2215         /*
2216          * Generate an ACK dropping incoming segment if it occupies
2217          * sequence space, where the ACK reflects our state.
2218          *
2219          * We can now skip the test for the RST flag since all
2220          * paths to this code happen after packets containing
2221          * RST have been dropped.
2222          *
2223          * In the SYN-RECEIVED state, don't send an ACK unless the
2224          * segment we received passes the SYN-RECEIVED ACK test.
2225          * If it fails send a RST.  This breaks the loop in the
2226          * "LAND" DoS attack, and also prevents an ACK storm
2227          * between two listening ports that have been sent forged
2228          * SYN segments, each with the source address of the other.
2229          */
2230         if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
2231             (SEQ_GT(tp->snd_una, th->th_ack) ||
2232              SEQ_GT(th->th_ack, tp->snd_max)) )
2233                 goto maybedropwithreset;
2234 #ifdef TCPDEBUG
2235         if (so->so_options & SO_DEBUG)
2236                 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2237                           &tcp_savetcp, 0);
2238 #endif
2239         m_freem(m);
2240         tp->t_flags |= TF_ACKNOW;
2241         (void) tcp_output(tp);
2242         return;
2243
2244
2245         /*
2246          * Conditionally drop with reset or just drop depending on whether
2247          * we think we are under attack or not.
2248          */
2249 maybedropwithreset:
2250         if (badport_bandlim(1) < 0)
2251                 goto drop;
2252         /* fall through */
2253 dropwithreset:
2254 #ifdef TCP_RESTRICT_RST
2255         if (restrict_rst)
2256                 goto drop;
2257 #endif
2258         /*
2259          * Generate a RST, dropping incoming segment.
2260          * Make ACK acceptable to originator of segment.
2261          * Don't bother to respond if destination was broadcast/multicast.
2262          */
2263         if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
2264                 goto drop;
2265 #ifdef INET6
2266         if (isipv6) {
2267                 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
2268                     IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
2269                         goto drop;
2270         } else
2271 #endif /* INET6 */
2272         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
2273             IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
2274             ip->ip_src.s_addr == htonl(INADDR_BROADCAST))
2275                 goto drop;
2276         /* IPv6 anycast check is done at tcp6_input() */
2277 #ifdef TCPDEBUG
2278         if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
2279                 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2280                           &tcp_savetcp, 0);
2281 #endif
2282         if (thflags & TH_ACK)
2283                 /* mtod() below is safe as long as hdr dropping is delayed */
2284                 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
2285                             TH_RST);
2286         else {
2287                 if (thflags & TH_SYN)
2288                         tlen++;
2289                 /* mtod() below is safe as long as hdr dropping is delayed */
2290                 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
2291                             (tcp_seq)0, TH_RST|TH_ACK);
2292         }
2293         /* destroy temporarily created socket */
2294         if (dropsocket)
2295                 (void) soabort(so);
2296         return;
2297
2298 drop:
2299         /*
2300          * Drop space held by incoming segment and return.
2301          */
2302 #ifdef TCPDEBUG
2303         if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
2304                 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2305                           &tcp_savetcp, 0);
2306 #endif
2307         m_freem(m);
2308         /* destroy temporarily created socket */
2309         if (dropsocket)
2310                 (void) soabort(so);
2311         return;
2312 }
2313
2314 static void
2315 tcp_dooptions(tp, cp, cnt, th, to)
2316         struct tcpcb *tp;
2317         u_char *cp;
2318         int cnt;
2319         struct tcphdr *th;
2320         struct tcpopt *to;
2321 {
2322         u_short mss = 0;
2323         int opt, optlen;
2324
2325         for (; cnt > 0; cnt -= optlen, cp += optlen) {
2326                 opt = cp[0];
2327                 if (opt == TCPOPT_EOL)
2328                         break;
2329                 if (opt == TCPOPT_NOP)
2330                         optlen = 1;
2331                 else {
2332                         if (cnt < 2)
2333                                 break;
2334                         optlen = cp[1];
2335                         if (optlen < 2 || optlen > cnt)
2336                                 break;
2337                 }
2338                 switch (opt) {
2339
2340                 default:
2341                         continue;
2342
2343                 case TCPOPT_MAXSEG:
2344                         if (optlen != TCPOLEN_MAXSEG)
2345                                 continue;
2346                         if (!(th->th_flags & TH_SYN))
2347                                 continue;
2348                         bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
2349                         NTOHS(mss);
2350                         break;
2351
2352                 case TCPOPT_WINDOW:
2353                         if (optlen != TCPOLEN_WINDOW)
2354                                 continue;
2355                         if (!(th->th_flags & TH_SYN))
2356                                 continue;
2357                         tp->t_flags |= TF_RCVD_SCALE;
2358                         tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
2359                         break;
2360
2361                 case TCPOPT_TIMESTAMP:
2362                         if (optlen != TCPOLEN_TIMESTAMP)
2363                                 continue;
2364                         to->to_flag |= TOF_TS;
2365                         bcopy((char *)cp + 2,
2366                             (char *)&to->to_tsval, sizeof(to->to_tsval));
2367                         NTOHL(to->to_tsval);
2368                         bcopy((char *)cp + 6,
2369                             (char *)&to->to_tsecr, sizeof(to->to_tsecr));
2370                         NTOHL(to->to_tsecr);
2371
2372                         /*
2373                          * A timestamp received in a SYN makes
2374                          * it ok to send timestamp requests and replies.
2375                          */
2376                         if (th->th_flags & TH_SYN) {
2377                                 tp->t_flags |= TF_RCVD_TSTMP;
2378                                 tp->ts_recent = to->to_tsval;
2379                                 tp->ts_recent_age = ticks;
2380                         }
2381                         break;
2382                 case TCPOPT_CC:
2383                         if (optlen != TCPOLEN_CC)
2384                                 continue;
2385                         to->to_flag |= TOF_CC;
2386                         bcopy((char *)cp + 2,
2387                             (char *)&to->to_cc, sizeof(to->to_cc));
2388                         NTOHL(to->to_cc);
2389                         /*
2390                          * A CC or CC.new option received in a SYN makes
2391                          * it ok to send CC in subsequent segments.
2392                          */
2393                         if (th->th_flags & TH_SYN)
2394                                 tp->t_flags |= TF_RCVD_CC;
2395                         break;
2396                 case TCPOPT_CCNEW:
2397                         if (optlen != TCPOLEN_CC)
2398                                 continue;
2399                         if (!(th->th_flags & TH_SYN))
2400                                 continue;
2401                         to->to_flag |= TOF_CCNEW;
2402                         bcopy((char *)cp + 2,
2403                             (char *)&to->to_cc, sizeof(to->to_cc));
2404                         NTOHL(to->to_cc);
2405                         /*
2406                          * A CC or CC.new option received in a SYN makes
2407                          * it ok to send CC in subsequent segments.
2408                          */
2409                         tp->t_flags |= TF_RCVD_CC;
2410                         break;
2411                 case TCPOPT_CCECHO:
2412                         if (optlen != TCPOLEN_CC)
2413                                 continue;
2414                         if (!(th->th_flags & TH_SYN))
2415                                 continue;
2416                         to->to_flag |= TOF_CCECHO;
2417                         bcopy((char *)cp + 2,
2418                             (char *)&to->to_ccecho, sizeof(to->to_ccecho));
2419                         NTOHL(to->to_ccecho);
2420                         break;
2421                 }
2422         }
2423         if (th->th_flags & TH_SYN)
2424                 tcp_mss(tp, mss);       /* sets t_maxseg */
2425 }
2426
2427 /*
2428  * Pull out of band byte out of a segment so
2429  * it doesn't appear in the user's data queue.
2430  * It is still reflected in the segment length for
2431  * sequencing purposes.
2432  */
2433 static void
2434 tcp_pulloutofband(so, th, m, off)
2435         struct socket *so;
2436         struct tcphdr *th;
2437         register struct mbuf *m;
2438         int off;                /* delayed to be droped hdrlen */
2439 {
2440         int cnt = off + th->th_urp - 1;
2441
2442         while (cnt >= 0) {
2443                 if (m->m_len > cnt) {
2444                         char *cp = mtod(m, caddr_t) + cnt;
2445                         struct tcpcb *tp = sototcpcb(so);
2446
2447                         tp->t_iobc = *cp;
2448                         tp->t_oobflags |= TCPOOB_HAVEDATA;
2449                         bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
2450                         m->m_len--;
2451                         if (m->m_flags & M_PKTHDR)
2452                                 m->m_pkthdr.len--;
2453                         return;
2454                 }
2455                 cnt -= m->m_len;
2456                 m = m->m_next;
2457                 if (m == 0)
2458                         break;
2459         }
2460         panic("tcp_pulloutofband");
2461 }
2462
2463 /*
2464  * Collect new round-trip time estimate
2465  * and update averages and current timeout.
2466  */
2467 static void
2468 tcp_xmit_timer(tp, rtt)
2469         register struct tcpcb *tp;
2470         int rtt;
2471 {
2472         register int delta;
2473
2474         tcpstat.tcps_rttupdated++;
2475         tp->t_rttupdated++;
2476         if (tp->t_srtt != 0) {
2477                 /*
2478                  * srtt is stored as fixed point with 5 bits after the
2479                  * binary point (i.e., scaled by 8).  The following magic
2480                  * is equivalent to the smoothing algorithm in rfc793 with
2481                  * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
2482                  * point).  Adjust rtt to origin 0.
2483                  */
2484                 delta = ((rtt - 1) << TCP_DELTA_SHIFT)
2485                         - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
2486
2487                 if ((tp->t_srtt += delta) <= 0)
2488                         tp->t_srtt = 1;
2489
2490                 /*
2491                  * We accumulate a smoothed rtt variance (actually, a
2492                  * smoothed mean difference), then set the retransmit
2493                  * timer to smoothed rtt + 4 times the smoothed variance.
2494                  * rttvar is stored as fixed point with 4 bits after the
2495                  * binary point (scaled by 16).  The following is
2496                  * equivalent to rfc793 smoothing with an alpha of .75
2497                  * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
2498                  * rfc793's wired-in beta.
2499                  */
2500                 if (delta < 0)
2501                         delta = -delta;
2502                 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
2503                 if ((tp->t_rttvar += delta) <= 0)
2504                         tp->t_rttvar = 1;
2505         } else {
2506                 /*
2507                  * No rtt measurement yet - use the unsmoothed rtt.
2508                  * Set the variance to half the rtt (so our first
2509                  * retransmit happens at 3*rtt).
2510                  */
2511                 tp->t_srtt = rtt << TCP_RTT_SHIFT;
2512                 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
2513         }
2514         tp->t_rtttime = 0;
2515         tp->t_rxtshift = 0;
2516
2517         /*
2518          * the retransmit should happen at rtt + 4 * rttvar.
2519          * Because of the way we do the smoothing, srtt and rttvar
2520          * will each average +1/2 tick of bias.  When we compute
2521          * the retransmit timer, we want 1/2 tick of rounding and
2522          * 1 extra tick because of +-1/2 tick uncertainty in the
2523          * firing of the timer.  The bias will give us exactly the
2524          * 1.5 tick we need.  But, because the bias is
2525          * statistical, we have to test that we don't drop below
2526          * the minimum feasible timer (which is 2 ticks).
2527          */
2528         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
2529                       max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
2530
2531         /*
2532          * We received an ack for a packet that wasn't retransmitted;
2533          * it is probably safe to discard any error indications we've
2534          * received recently.  This isn't quite right, but close enough
2535          * for now (a route might have failed after we sent a segment,
2536          * and the return path might not be symmetrical).
2537          */
2538         tp->t_softerror = 0;
2539 }
2540
2541 /*
2542  * Determine a reasonable value for maxseg size.
2543  * If the route is known, check route for mtu.
2544  * If none, use an mss that can be handled on the outgoing
2545  * interface without forcing IP to fragment; if bigger than
2546  * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2547  * to utilize large mbufs.  If no route is found, route has no mtu,
2548  * or the destination isn't local, use a default, hopefully conservative
2549  * size (usually 512 or the default IP max size, but no more than the mtu
2550  * of the interface), as we can't discover anything about intervening
2551  * gateways or networks.  We also initialize the congestion/slow start
2552  * window to be a single segment if the destination isn't local.
2553  * While looking at the routing entry, we also initialize other path-dependent
2554  * parameters from pre-set or cached values in the routing entry.
2555  *
2556  * Also take into account the space needed for options that we
2557  * send regularly.  Make maxseg shorter by that amount to assure
2558  * that we can send maxseg amount of data even when the options
2559  * are present.  Store the upper limit of the length of options plus
2560  * data in maxopd.
2561  *
2562  * NOTE that this routine is only called when we process an incoming
2563  * segment, for outgoing segments only tcp_mssopt is called.
2564  *
2565  * In case of T/TCP, we call this routine during implicit connection
2566  * setup as well (offer = -1), to initialize maxseg from the cached
2567  * MSS of our peer.
2568  */
2569 void
2570 tcp_mss(tp, offer)
2571         struct tcpcb *tp;
2572         int offer;
2573 {
2574         register struct rtentry *rt;
2575         struct ifnet *ifp;
2576         register int rtt, mss;
2577         u_long bufsize;
2578         struct inpcb *inp;
2579         struct socket *so;
2580         struct rmxp_tao *taop;
2581         int origoffer = offer;
2582 #ifdef INET6
2583         int isipv6;
2584         int min_protoh;
2585 #endif
2586
2587         inp = tp->t_inpcb;
2588 #ifdef INET6
2589         isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
2590         min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
2591                             : sizeof (struct tcpiphdr);
2592 #else
2593 #define min_protoh  (sizeof (struct tcpiphdr))
2594 #endif
2595 #ifdef INET6
2596         if (isipv6)
2597                 rt = tcp_rtlookup6(inp);
2598         else
2599 #endif
2600         rt = tcp_rtlookup(inp);
2601         if (rt == NULL) {
2602                 tp->t_maxopd = tp->t_maxseg =
2603 #ifdef INET6
2604                 isipv6 ? tcp_v6mssdflt :
2605 #endif /* INET6 */
2606                 tcp_mssdflt;
2607                 return;
2608         }
2609         ifp = rt->rt_ifp;
2610         so = inp->inp_socket;
2611
2612         taop = rmx_taop(rt->rt_rmx);
2613         /*
2614          * Offer == -1 means that we didn't receive SYN yet,
2615          * use cached value in that case;
2616          */
2617         if (offer == -1)
2618                 offer = taop->tao_mssopt;
2619         /*
2620          * Offer == 0 means that there was no MSS on the SYN segment,
2621          * in this case we use tcp_mssdflt.
2622          */
2623         if (offer == 0)
2624                 offer =
2625 #ifdef INET6
2626                         isipv6 ? tcp_v6mssdflt :
2627 #endif /* INET6 */
2628                         tcp_mssdflt;
2629         else
2630                 /*
2631                  * Sanity check: make sure that maxopd will be large
2632                  * enough to allow some data on segments even is the
2633                  * all the option space is used (40bytes).  Otherwise
2634                  * funny things may happen in tcp_output.
2635                  */
2636                 offer = max(offer, 64);
2637         taop->tao_mssopt = offer;
2638
2639         /*
2640          * While we're here, check if there's an initial rtt
2641          * or rttvar.  Convert from the route-table units
2642          * to scaled multiples of the slow timeout timer.
2643          */
2644         if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
2645                 /*
2646                  * XXX the lock bit for RTT indicates that the value
2647                  * is also a minimum value; this is subject to time.
2648                  */
2649                 if (rt->rt_rmx.rmx_locks & RTV_RTT)
2650                         tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
2651                 tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
2652                 tcpstat.tcps_usedrtt++;
2653                 if (rt->rt_rmx.rmx_rttvar) {
2654                         tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
2655                             (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
2656                         tcpstat.tcps_usedrttvar++;
2657                 } else {
2658                         /* default variation is +- 1 rtt */
2659                         tp->t_rttvar =
2660                             tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
2661                 }
2662                 TCPT_RANGESET(tp->t_rxtcur,
2663                               ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
2664                               tp->t_rttmin, TCPTV_REXMTMAX);
2665         }
2666         /*
2667          * if there's an mtu associated with the route, use it
2668          * else, use the link mtu.
2669          */
2670         if (rt->rt_rmx.rmx_mtu)
2671                 mss = rt->rt_rmx.rmx_mtu - min_protoh;
2672         else
2673         {
2674                 mss =
2675 #ifdef INET6
2676                         (isipv6 ? nd_ifinfo[rt->rt_ifp->if_index].linkmtu :
2677 #endif
2678                          ifp->if_mtu
2679 #ifdef INET6
2680                          )
2681 #endif
2682                         - min_protoh;
2683 #ifdef INET6
2684                 if (isipv6) {
2685                         if (!in6_localaddr(&inp->in6p_faddr))
2686                                 mss = min(mss, tcp_v6mssdflt);
2687                 } else
2688 #endif
2689                 if (!in_localaddr(inp->inp_faddr))
2690                         mss = min(mss, tcp_mssdflt);
2691         }
2692         mss = min(mss, offer);
2693         /*
2694          * maxopd stores the maximum length of data AND options
2695          * in a segment; maxseg is the amount of data in a normal
2696          * segment.  We need to store this value (maxopd) apart
2697          * from maxseg, because now every segment carries options
2698          * and thus we normally have somewhat less data in segments.
2699          */
2700         tp->t_maxopd = mss;
2701
2702         /*
2703          * In case of T/TCP, origoffer==-1 indicates, that no segments
2704          * were received yet.  In this case we just guess, otherwise
2705          * we do the same as before T/TCP.
2706          */
2707         if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
2708             (origoffer == -1 ||
2709              (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
2710                 mss -= TCPOLEN_TSTAMP_APPA;
2711         if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
2712             (origoffer == -1 ||
2713              (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
2714                 mss -= TCPOLEN_CC_APPA;
2715
2716 #if     (MCLBYTES & (MCLBYTES - 1)) == 0
2717                 if (mss > MCLBYTES)
2718                         mss &= ~(MCLBYTES-1);
2719 #else
2720                 if (mss > MCLBYTES)
2721                         mss = mss / MCLBYTES * MCLBYTES;
2722 #endif
2723         /*
2724          * If there's a pipesize, change the socket buffer
2725          * to that size.  Make the socket buffers an integral
2726          * number of mss units; if the mss is larger than
2727          * the socket buffer, decrease the mss.
2728          */
2729 #ifdef RTV_SPIPE
2730         if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
2731 #endif
2732                 bufsize = so->so_snd.sb_hiwat;
2733         if (bufsize < mss)
2734                 mss = bufsize;
2735         else {
2736                 bufsize = roundup(bufsize, mss);
2737                 if (bufsize > sb_max)
2738                         bufsize = sb_max;
2739                 (void)sbreserve(&so->so_snd, bufsize, so, NULL);
2740         }
2741         tp->t_maxseg = mss;
2742
2743 #ifdef RTV_RPIPE
2744         if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
2745 #endif
2746                 bufsize = so->so_rcv.sb_hiwat;
2747         if (bufsize > mss) {
2748                 bufsize = roundup(bufsize, mss);
2749                 if (bufsize > sb_max)
2750                         bufsize = sb_max;
2751                 (void)sbreserve(&so->so_rcv, bufsize, so, NULL);
2752         }
2753
2754         /*
2755          * Set the slow-start flight size depending on whether this
2756          * is a local network or not.
2757          */
2758         if (
2759 #ifdef INET6
2760             (isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
2761             (!isipv6 &&
2762 #endif
2763              in_localaddr(inp->inp_faddr)
2764 #ifdef INET6
2765              )
2766 #endif
2767             )
2768                 tp->snd_cwnd = mss * ss_fltsz_local;
2769         else 
2770                 tp->snd_cwnd = mss * ss_fltsz;
2771
2772         if (rt->rt_rmx.rmx_ssthresh) {
2773                 /*
2774                  * There's some sort of gateway or interface
2775                  * buffer limit on the path.  Use this to set
2776                  * the slow start threshold, but set the
2777                  * threshold to no less than 2*mss.
2778                  */
2779                 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
2780                 tcpstat.tcps_usedssthresh++;
2781         }
2782 }
2783
2784 /*
2785  * Determine the MSS option to send on an outgoing SYN.
2786  */
2787 int
2788 tcp_mssopt(tp)
2789         struct tcpcb *tp;
2790 {
2791         struct rtentry *rt;
2792 #ifdef INET6
2793         int isipv6;
2794         int min_protoh;
2795 #endif
2796
2797 #ifdef INET6
2798         isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
2799         min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
2800                             : sizeof (struct tcpiphdr);
2801 #else
2802 #define min_protoh  (sizeof (struct tcpiphdr))
2803 #endif
2804 #ifdef INET6
2805         if (isipv6)
2806                 rt = tcp_rtlookup6(tp->t_inpcb);
2807         else
2808 #endif /* INET6 */
2809         rt = tcp_rtlookup(tp->t_inpcb);
2810         if (rt == NULL)
2811                 return
2812 #ifdef INET6
2813                         isipv6 ? tcp_v6mssdflt :
2814 #endif /* INET6 */
2815                         tcp_mssdflt;
2816
2817         return rt->rt_ifp->if_mtu - min_protoh;
2818 }
2819
2820
2821 /*
2822  * Checks for partial ack.  If partial ack arrives, force the retransmission
2823  * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
2824  * 1.  By setting snd_nxt to ti_ack, this forces retransmission timer to
2825  * be started again.  If the ack advances at least to tp->snd_recover, return 0.
2826  */
2827 static int
2828 tcp_newreno(tp, th)
2829         struct tcpcb *tp;
2830         struct tcphdr *th;
2831 {
2832         if (SEQ_LT(th->th_ack, tp->snd_recover)) {
2833                 tcp_seq onxt = tp->snd_nxt;
2834                 u_long  ocwnd = tp->snd_cwnd;
2835
2836                 callout_stop(tp->tt_rexmt);
2837                 tp->t_rtttime = 0;
2838                 tp->snd_nxt = th->th_ack;
2839                 /*
2840                  * Set snd_cwnd to one segment beyond acknowledged offset
2841                  * (tp->snd_una has not yet been updated when this function 
2842                  *  is called)
2843                  */
2844                 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
2845                 (void) tcp_output(tp);
2846                 tp->snd_cwnd = ocwnd;
2847                 if (SEQ_GT(onxt, tp->snd_nxt))
2848                         tp->snd_nxt = onxt;
2849                 /*
2850                  * Partial window deflation.  Relies on fact that tp->snd_una
2851                  * not updated yet.
2852                  */
2853                 tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg);
2854                 return (1);
2855         }
2856         return (0);
2857 }