]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/tcp_usrreq.c
This commit was generated by cvs2svn to compensate for changes in r155094,
[FreeBSD/FreeBSD.git] / sys / netinet / tcp_usrreq.c
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 4. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *      From: @(#)tcp_usrreq.c  8.2 (Berkeley) 1/3/94
30  * $FreeBSD$
31  */
32
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35 #include "opt_tcpdebug.h"
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/malloc.h>
40 #include <sys/kernel.h>
41 #include <sys/sysctl.h>
42 #include <sys/mbuf.h>
43 #ifdef INET6
44 #include <sys/domain.h>
45 #endif /* INET6 */
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/protosw.h>
49 #include <sys/proc.h>
50 #include <sys/jail.h>
51
52 #include <net/if.h>
53 #include <net/route.h>
54
55 #include <netinet/in.h>
56 #include <netinet/in_systm.h>
57 #ifdef INET6
58 #include <netinet/ip6.h>
59 #endif
60 #include <netinet/in_pcb.h>
61 #ifdef INET6
62 #include <netinet6/in6_pcb.h>
63 #endif
64 #include <netinet/in_var.h>
65 #include <netinet/ip_var.h>
66 #ifdef INET6
67 #include <netinet6/ip6_var.h>
68 #include <netinet6/scope6_var.h>
69 #endif
70 #include <netinet/tcp.h>
71 #include <netinet/tcp_fsm.h>
72 #include <netinet/tcp_seq.h>
73 #include <netinet/tcp_timer.h>
74 #include <netinet/tcp_var.h>
75 #include <netinet/tcpip.h>
76 #ifdef TCPDEBUG
77 #include <netinet/tcp_debug.h>
78 #endif
79
80 /*
81  * TCP protocol interface to socket abstraction.
82  */
/* Declared here, defined outside this file.  XXX ??? */
extern char *tcpstates[];

/*
 * Prototypes for the file-local helpers that the pru_*() handlers below
 * call before the helpers themselves are defined.
 */
static int tcp_attach(struct socket *so);
static int tcp_connect(struct tcpcb *tp, struct sockaddr *nam,
    struct thread *td);
#ifdef INET6
static int tcp6_connect(struct tcpcb *tp, struct sockaddr *nam,
    struct thread *td);
#endif /* INET6 */
static struct tcpcb *tcp_disconnect(struct tcpcb *tp);
static struct tcpcb *tcp_usrclosed(struct tcpcb *tp);
static void tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti);
97
/*
 * Tracing hooks used by the pru_*() handlers.
 *
 *   TCPDEBUG0      declares the local "ostate" that remembers the TCP state
 *                  on entry (it is a declaration, so it must sit where a
 *                  declaration is legal).
 *   TCPDEBUG1()    snapshots tp->t_state into ostate (0 when tp is NULL).
 *   TCPDEBUG2(req) calls tcp_trace() for request "req" when tp is non-NULL
 *                  and the socket "so" has SO_DEBUG set.
 *
 * The macros rely on locals named "tp", "so" and "ostate" being in scope at
 * the point of use.  TCPDEBUG2 is wrapped in do { } while (0) so that it
 * behaves as exactly one statement and cannot capture a following "else".
 * Without TCPDEBUG all three expand to nothing.
 */
#ifdef TCPDEBUG
#define TCPDEBUG0       int ostate = 0
#define TCPDEBUG1()     ostate = tp ? tp->t_state : 0
#define TCPDEBUG2(req)  do {                                            \
        if (tp && (so->so_options & SO_DEBUG))                          \
                tcp_trace(TA_USER, ostate, tp, 0, 0, req);              \
} while (0)
#else
#define TCPDEBUG0
#define TCPDEBUG1()
#define TCPDEBUG2(req)
#endif
108
109 /*
110  * TCP attaches to socket via pru_attach(), reserving space,
111  * and an internet control block.
112  */
113 static int
114 tcp_usr_attach(struct socket *so, int proto, struct thread *td)
115 {
116         int error;
117         struct inpcb *inp;
118         struct tcpcb *tp = 0;
119         TCPDEBUG0;
120
121         INP_INFO_WLOCK(&tcbinfo);
122         TCPDEBUG1();
123         inp = sotoinpcb(so);
124         if (inp) {
125                 error = EISCONN;
126                 goto out;
127         }
128
129         error = tcp_attach(so);
130         if (error)
131                 goto out;
132
133         if ((so->so_options & SO_LINGER) && so->so_linger == 0)
134                 so->so_linger = TCP_LINGERTIME;
135
136         inp = sotoinpcb(so);
137         tp = intotcpcb(inp);
138 out:
139         TCPDEBUG2(PRU_ATTACH);
140         INP_INFO_WUNLOCK(&tcbinfo);
141         return error;
142 }
143
144 /*
145  * pru_detach() detaches the TCP protocol from the socket.
146  * If the protocol state is non-embryonic, then can't
147  * do this directly: have to initiate a pru_disconnect(),
148  * which may finish later; embryonic TCB's can just
149  * be discarded here.
150  */
151 static int
152 tcp_usr_detach(struct socket *so)
153 {
154         int error = 0;
155         struct inpcb *inp;
156         struct tcpcb *tp;
157         TCPDEBUG0;
158
159         INP_INFO_WLOCK(&tcbinfo);
160         inp = sotoinpcb(so);
161         if (inp == NULL) {
162                 INP_INFO_WUNLOCK(&tcbinfo);
163                 return error;
164         }
165         INP_LOCK(inp);
166         tp = intotcpcb(inp);
167         TCPDEBUG1();
168         tp = tcp_disconnect(tp);
169
170         TCPDEBUG2(PRU_DETACH);
171         if (tp)
172                 INP_UNLOCK(inp);
173         INP_INFO_WUNLOCK(&tcbinfo);
174         return error;
175 }
176
/*
 * How a handler wants the global tcbinfo lock taken.  Each handler that
 * uses COMMON_START()/COMMON_END() keeps one of these in a local "inirw".
 */
#define INI_NOLOCK      0
#define INI_READ        1
#define INI_WRITE       2

/*
 * Shared prologue for the pru_*() handlers.  Expects locals "so", "inp",
 * "tp" and "inirw" in the enclosing function.
 *
 * Takes the tcbinfo lock in the mode named by inirw, looks up the inpcb
 * and returns EINVAL from the enclosing function (after undoing the
 * tcbinfo lock) if there is none.  Otherwise the inpcb is locked, a
 * tcbinfo read lock is released again straight away (a write lock stays
 * held for COMMON_END()), and tp is loaded from the inpcb.
 */
#define COMMON_START()                                                  \
        TCPDEBUG0;                                                      \
        do {                                                            \
                switch (inirw) {                                        \
                case INI_READ:                                          \
                        INP_INFO_RLOCK(&tcbinfo);                       \
                        break;                                          \
                case INI_WRITE:                                         \
                        INP_INFO_WLOCK(&tcbinfo);                       \
                        break;                                          \
                default:                                                \
                        break;                                          \
                }                                                       \
                inp = sotoinpcb(so);                                    \
                if (inp == NULL) {                                      \
                        if (inirw == INI_READ) {                        \
                                INP_INFO_RUNLOCK(&tcbinfo);             \
                        } else if (inirw == INI_WRITE) {                \
                                INP_INFO_WUNLOCK(&tcbinfo);             \
                        }                                               \
                        return (EINVAL);                                \
                }                                                       \
                INP_LOCK(inp);                                          \
                if (inirw == INI_READ) {                                \
                        INP_INFO_RUNLOCK(&tcbinfo);                     \
                }                                                       \
                tp = intotcpcb(inp);                                    \
                TCPDEBUG1();                                            \
        } while (0)
202
/*
 * Shared epilogue matching COMMON_START().  Defines the "out" label,
 * emits the trace record for "req", drops the inpcb lock when tp is still
 * non-NULL, drops the tcbinfo write lock when one was taken, and returns
 * "error" from the enclosing function.
 *
 * The "goto out" after the return can never execute; it only references
 * the label (presumably so that handlers with no explicit "goto out" do
 * not leave the label unused).
 */
#define COMMON_END(req)                                                 \
out:    TCPDEBUG2(req);                                                 \
        do {                                                            \
                if (tp != NULL) {                                       \
                        INP_UNLOCK(inp);                                \
                }                                                       \
                if (inirw == INI_WRITE) {                               \
                        INP_INFO_WUNLOCK(&tcbinfo);                     \
                }                                                       \
                return (error);                                         \
                goto out;                                               \
        } while (0)
213
214 /*
215  * Give the socket an address.
216  */
217 static int
218 tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
219 {
220         int error = 0;
221         struct inpcb *inp;
222         struct tcpcb *tp;
223         struct sockaddr_in *sinp;
224         const int inirw = INI_WRITE;
225
226         sinp = (struct sockaddr_in *)nam;
227         if (nam->sa_len != sizeof (*sinp))
228                 return (EINVAL);
229         /*
230          * Must check for multicast addresses and disallow binding
231          * to them.
232          */
233         if (sinp->sin_family == AF_INET &&
234             IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
235                 return (EAFNOSUPPORT);
236
237         COMMON_START();
238         error = in_pcbbind(inp, nam, td->td_ucred);
239         if (error)
240                 goto out;
241         COMMON_END(PRU_BIND);
242 }
243
244 #ifdef INET6
245 static int
246 tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
247 {
248         int error = 0;
249         struct inpcb *inp;
250         struct tcpcb *tp;
251         struct sockaddr_in6 *sin6p;
252         const int inirw = INI_WRITE;
253
254         sin6p = (struct sockaddr_in6 *)nam;
255         if (nam->sa_len != sizeof (*sin6p))
256                 return (EINVAL);
257         /*
258          * Must check for multicast addresses and disallow binding
259          * to them.
260          */
261         if (sin6p->sin6_family == AF_INET6 &&
262             IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
263                 return (EAFNOSUPPORT);
264
265         COMMON_START();
266         inp->inp_vflag &= ~INP_IPV4;
267         inp->inp_vflag |= INP_IPV6;
268         if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
269                 if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr))
270                         inp->inp_vflag |= INP_IPV4;
271                 else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
272                         struct sockaddr_in sin;
273
274                         in6_sin6_2_sin(&sin, sin6p);
275                         inp->inp_vflag |= INP_IPV4;
276                         inp->inp_vflag &= ~INP_IPV6;
277                         error = in_pcbbind(inp, (struct sockaddr *)&sin,
278                             td->td_ucred);
279                         goto out;
280                 }
281         }
282         error = in6_pcbbind(inp, nam, td->td_ucred);
283         if (error)
284                 goto out;
285         COMMON_END(PRU_BIND);
286 }
287 #endif /* INET6 */
288
289 /*
290  * Prepare to accept connections.
291  */
292 static int
293 tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
294 {
295         int error = 0;
296         struct inpcb *inp;
297         struct tcpcb *tp;
298         const int inirw = INI_WRITE;
299
300         COMMON_START();
301         SOCK_LOCK(so);
302         error = solisten_proto_check(so);
303         if (error == 0 && inp->inp_lport == 0)
304                 error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
305         if (error == 0) {
306                 tp->t_state = TCPS_LISTEN;
307                 solisten_proto(so, backlog);
308         }
309         SOCK_UNLOCK(so);
310         COMMON_END(PRU_LISTEN);
311 }
312
313 #ifdef INET6
314 static int
315 tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
316 {
317         int error = 0;
318         struct inpcb *inp;
319         struct tcpcb *tp;
320         const int inirw = INI_WRITE;
321
322         COMMON_START();
323         SOCK_LOCK(so);
324         error = solisten_proto_check(so);
325         if (error == 0 && inp->inp_lport == 0) {
326                 inp->inp_vflag &= ~INP_IPV4;
327                 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
328                         inp->inp_vflag |= INP_IPV4;
329                 error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
330         }
331         if (error == 0) {
332                 tp->t_state = TCPS_LISTEN;
333                 solisten_proto(so, backlog);
334         }
335         SOCK_UNLOCK(so);
336         COMMON_END(PRU_LISTEN);
337 }
338 #endif /* INET6 */
339
340 /*
341  * Initiate connection to peer.
342  * Create a template for use in transmissions on this connection.
343  * Enter SYN_SENT state, and mark socket as connecting.
344  * Start keep-alive timer, and seed output sequence space.
345  * Send initial segment on connection.
346  */
347 static int
348 tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
349 {
350         int error = 0;
351         struct inpcb *inp;
352         struct tcpcb *tp;
353         struct sockaddr_in *sinp;
354         const int inirw = INI_WRITE;
355
356         sinp = (struct sockaddr_in *)nam;
357         if (nam->sa_len != sizeof (*sinp))
358                 return (EINVAL);
359         /*
360          * Must disallow TCP ``connections'' to multicast addresses.
361          */
362         if (sinp->sin_family == AF_INET
363             && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
364                 return (EAFNOSUPPORT);
365         if (jailed(td->td_ucred))
366                 prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr);
367
368         COMMON_START();
369         if ((error = tcp_connect(tp, nam, td)) != 0)
370                 goto out;
371         error = tcp_output(tp);
372         COMMON_END(PRU_CONNECT);
373 }
374
375 #ifdef INET6
376 static int
377 tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
378 {
379         int error = 0;
380         struct inpcb *inp;
381         struct tcpcb *tp;
382         struct sockaddr_in6 *sin6p;
383         const int inirw = INI_WRITE;
384
385         sin6p = (struct sockaddr_in6 *)nam;
386         if (nam->sa_len != sizeof (*sin6p))
387                 return (EINVAL);
388         /*
389          * Must disallow TCP ``connections'' to multicast addresses.
390          */
391         if (sin6p->sin6_family == AF_INET6
392             && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
393                 return (EAFNOSUPPORT);
394
395         COMMON_START();
396         if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
397                 struct sockaddr_in sin;
398
399                 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
400                         error = EINVAL;
401                         goto out;
402                 }
403
404                 in6_sin6_2_sin(&sin, sin6p);
405                 inp->inp_vflag |= INP_IPV4;
406                 inp->inp_vflag &= ~INP_IPV6;
407                 if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
408                         goto out;
409                 error = tcp_output(tp);
410                 goto out;
411         }
412         inp->inp_vflag &= ~INP_IPV4;
413         inp->inp_vflag |= INP_IPV6;
414         inp->inp_inc.inc_isipv6 = 1;
415         if ((error = tcp6_connect(tp, nam, td)) != 0)
416                 goto out;
417         error = tcp_output(tp);
418         COMMON_END(PRU_CONNECT);
419 }
420 #endif /* INET6 */
421
422 /*
423  * Initiate disconnect from peer.
424  * If connection never passed embryonic stage, just drop;
425  * else if don't need to let data drain, then can just drop anyways,
426  * else have to begin TCP shutdown process: mark socket disconnecting,
427  * drain unread data, state switch to reflect user close, and
428  * send segment (e.g. FIN) to peer.  Socket will be really disconnected
429  * when peer sends FIN and acks ours.
430  *
431  * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
432  */
433 static int
434 tcp_usr_disconnect(struct socket *so)
435 {
436         int error = 0;
437         struct inpcb *inp;
438         struct tcpcb *tp;
439         const int inirw = INI_WRITE;
440
441         COMMON_START();
442         tp = tcp_disconnect(tp);
443         COMMON_END(PRU_DISCONNECT);
444 }
445
446 /*
447  * Accept a connection.  Essentially all the work is
448  * done at higher levels; just return the address
449  * of the peer, storing through addr.
450  */
451 static int
452 tcp_usr_accept(struct socket *so, struct sockaddr **nam)
453 {
454         int error = 0;
455         struct inpcb *inp = NULL;
456         struct tcpcb *tp = NULL;
457         struct in_addr addr;
458         in_port_t port = 0;
459         TCPDEBUG0;
460
461         if (so->so_state & SS_ISDISCONNECTED) {
462                 error = ECONNABORTED;
463                 goto out;
464         }
465
466         INP_INFO_RLOCK(&tcbinfo);
467         inp = sotoinpcb(so);
468         if (!inp) {
469                 INP_INFO_RUNLOCK(&tcbinfo);
470                 return (EINVAL);
471         }
472         INP_LOCK(inp);
473         INP_INFO_RUNLOCK(&tcbinfo);
474         tp = intotcpcb(inp);
475         TCPDEBUG1();
476
477         /*
478          * We inline in_setpeeraddr and COMMON_END here, so that we can
479          * copy the data of interest and defer the malloc until after we
480          * release the lock.
481          */
482         port = inp->inp_fport;
483         addr = inp->inp_faddr;
484
485 out:    TCPDEBUG2(PRU_ACCEPT);
486         if (tp)
487                 INP_UNLOCK(inp);
488         if (error == 0)
489                 *nam = in_sockaddr(port, &addr);
490         return error;
491 }
492
493 #ifdef INET6
494 static int
495 tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
496 {
497         struct inpcb *inp = NULL;
498         int error = 0;
499         struct tcpcb *tp = NULL;
500         struct in_addr addr;
501         struct in6_addr addr6;
502         in_port_t port = 0;
503         int v4 = 0;
504         TCPDEBUG0;
505
506         if (so->so_state & SS_ISDISCONNECTED) {
507                 error = ECONNABORTED;
508                 goto out;
509         }
510
511         INP_INFO_RLOCK(&tcbinfo);
512         inp = sotoinpcb(so);
513         if (inp == 0) {
514                 INP_INFO_RUNLOCK(&tcbinfo);
515                 return (EINVAL);
516         }
517         INP_LOCK(inp);
518         INP_INFO_RUNLOCK(&tcbinfo);
519         tp = intotcpcb(inp);
520         TCPDEBUG1();
521         /*
522          * We inline in6_mapped_peeraddr and COMMON_END here, so that we can
523          * copy the data of interest and defer the malloc until after we
524          * release the lock.
525          */
526         if (inp->inp_vflag & INP_IPV4) {
527                 v4 = 1;
528                 port = inp->inp_fport;
529                 addr = inp->inp_faddr;
530         } else {
531                 port = inp->inp_fport;
532                 addr6 = inp->in6p_faddr;
533         }
534
535 out:    TCPDEBUG2(PRU_ACCEPT);
536         if (tp)
537                 INP_UNLOCK(inp);
538         if (error == 0) {
539                 if (v4)
540                         *nam = in6_v4mapsin6_sockaddr(port, &addr);
541                 else
542                         *nam = in6_sockaddr(port, &addr6);
543         }
544         return error;
545 }
546 #endif /* INET6 */
547
548 /*
549  * This is the wrapper function for in_setsockaddr. We just pass down
550  * the pcbinfo for in_setsockaddr to lock. We don't want to do the locking
551  * here because in_setsockaddr will call malloc and can block.
552  */
553 static int
554 tcp_sockaddr(struct socket *so, struct sockaddr **nam)
555 {
556         return (in_setsockaddr(so, nam, &tcbinfo));
557 }
558
559 /*
560  * This is the wrapper function for in_setpeeraddr. We just pass down
561  * the pcbinfo for in_setpeeraddr to lock.
562  */
563 static int
564 tcp_peeraddr(struct socket *so, struct sockaddr **nam)
565 {
566         return (in_setpeeraddr(so, nam, &tcbinfo));
567 }
568
569 /*
570  * Mark the connection as being incapable of further output.
571  */
572 static int
573 tcp_usr_shutdown(struct socket *so)
574 {
575         int error = 0;
576         struct inpcb *inp;
577         struct tcpcb *tp;
578         const int inirw = INI_WRITE;
579
580         COMMON_START();
581         socantsendmore(so);
582         tp = tcp_usrclosed(tp);
583         if (tp)
584                 error = tcp_output(tp);
585         COMMON_END(PRU_SHUTDOWN);
586 }
587
588 /*
589  * After a receive, possibly send window update to peer.
590  */
591 static int
592 tcp_usr_rcvd(struct socket *so, int flags)
593 {
594         int error = 0;
595         struct inpcb *inp;
596         struct tcpcb *tp;
597         const int inirw = INI_READ;
598
599         COMMON_START();
600         tcp_output(tp);
601         COMMON_END(PRU_RCVD);
602 }
603
604 /*
605  * Do a send by putting data in output queue and updating urgent
606  * marker if URG set.  Possibly send more data.  Unlike the other
607  * pru_*() routines, the mbuf chains are our responsibility.  We
608  * must either enqueue them or free them.  The other pru_* routines
609  * generally are caller-frees.
610  */
611 static int
612 tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
613              struct sockaddr *nam, struct mbuf *control, struct thread *td)
614 {
615         int error = 0;
616         struct inpcb *inp;
617         struct tcpcb *tp;
618         int unlocked = 0;
619 #ifdef INET6
620         int isipv6;
621 #endif
622         TCPDEBUG0;
623
624         /*
625          * Need write lock here because this function might call
626          * tcp_connect or tcp_usrclosed.
627          * We really want to have to this function upgrade from read lock
628          * to write lock.  XXX
629          */
630         INP_INFO_WLOCK(&tcbinfo);
631         inp = sotoinpcb(so);
632         if (inp == NULL) {
633                 /*
634                  * OOPS! we lost a race, the TCP session got reset after
635                  * we checked SBS_CANTSENDMORE, eg: while doing uiomove or a
636                  * network interrupt in the non-splnet() section of sosend().
637                  */
638                 if (m)
639                         m_freem(m);
640                 if (control)
641                         m_freem(control);
642                 error = ECONNRESET;     /* XXX EPIPE? */
643                 tp = NULL;
644                 TCPDEBUG1();
645                 goto out;
646         }
647         INP_LOCK(inp);
648 #ifdef INET6
649         isipv6 = nam && nam->sa_family == AF_INET6;
650 #endif /* INET6 */
651         tp = intotcpcb(inp);
652         TCPDEBUG1();
653         if (control) {
654                 /* TCP doesn't do control messages (rights, creds, etc) */
655                 if (control->m_len) {
656                         m_freem(control);
657                         if (m)
658                                 m_freem(m);
659                         error = EINVAL;
660                         goto out;
661                 }
662                 m_freem(control);       /* empty control, just free it */
663         }
664         if (!(flags & PRUS_OOB)) {
665                 sbappendstream(&so->so_snd, m);
666                 if (nam && tp->t_state < TCPS_SYN_SENT) {
667                         /*
668                          * Do implied connect if not yet connected,
669                          * initialize window to default value, and
670                          * initialize maxseg/maxopd using peer's cached
671                          * MSS.
672                          */
673 #ifdef INET6
674                         if (isipv6)
675                                 error = tcp6_connect(tp, nam, td);
676                         else
677 #endif /* INET6 */
678                         error = tcp_connect(tp, nam, td);
679                         if (error)
680                                 goto out;
681                         tp->snd_wnd = TTCP_CLIENT_SND_WND;
682                         tcp_mss(tp, -1);
683                 }
684
685                 if (flags & PRUS_EOF) {
686                         /*
687                          * Close the send side of the connection after
688                          * the data is sent.
689                          */
690                         socantsendmore(so);
691                         tp = tcp_usrclosed(tp);
692                 }
693                 INP_INFO_WUNLOCK(&tcbinfo);
694                 unlocked = 1;
695                 if (tp != NULL) {
696                         if (flags & PRUS_MORETOCOME)
697                                 tp->t_flags |= TF_MORETOCOME;
698                         error = tcp_output(tp);
699                         if (flags & PRUS_MORETOCOME)
700                                 tp->t_flags &= ~TF_MORETOCOME;
701                 }
702         } else {
703                 SOCKBUF_LOCK(&so->so_snd);
704                 if (sbspace(&so->so_snd) < -512) {
705                         SOCKBUF_UNLOCK(&so->so_snd);
706                         m_freem(m);
707                         error = ENOBUFS;
708                         goto out;
709                 }
710                 /*
711                  * According to RFC961 (Assigned Protocols),
712                  * the urgent pointer points to the last octet
713                  * of urgent data.  We continue, however,
714                  * to consider it to indicate the first octet
715                  * of data past the urgent section.
716                  * Otherwise, snd_up should be one lower.
717                  */
718                 sbappendstream_locked(&so->so_snd, m);
719                 SOCKBUF_UNLOCK(&so->so_snd);
720                 if (nam && tp->t_state < TCPS_SYN_SENT) {
721                         /*
722                          * Do implied connect if not yet connected,
723                          * initialize window to default value, and
724                          * initialize maxseg/maxopd using peer's cached
725                          * MSS.
726                          */
727 #ifdef INET6
728                         if (isipv6)
729                                 error = tcp6_connect(tp, nam, td);
730                         else
731 #endif /* INET6 */
732                         error = tcp_connect(tp, nam, td);
733                         if (error)
734                                 goto out;
735                         tp->snd_wnd = TTCP_CLIENT_SND_WND;
736                         tcp_mss(tp, -1);
737                 }
738                 INP_INFO_WUNLOCK(&tcbinfo);
739                 unlocked = 1;
740                 tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
741                 tp->t_flags |= TF_FORCEDATA;
742                 error = tcp_output(tp);
743                 tp->t_flags &= ~TF_FORCEDATA;
744         }
745 out:
746         TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
747                   ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
748         if (tp)
749                 INP_UNLOCK(inp);
750         if (!unlocked)
751                 INP_INFO_WUNLOCK(&tcbinfo);
752         return (error);
753 }
754
755 /*
756  * Abort the TCP.
757  */
758 static int
759 tcp_usr_abort(struct socket *so)
760 {
761         int error = 0;
762         struct inpcb *inp;
763         struct tcpcb *tp;
764         const int inirw = INI_WRITE;
765
766         COMMON_START();
767         tp = tcp_drop(tp, ECONNABORTED);
768         COMMON_END(PRU_ABORT);
769 }
770
771 /*
772  * Receive out-of-band data.
773  */
774 static int
775 tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
776 {
777         int error = 0;
778         struct inpcb *inp;
779         struct tcpcb *tp;
780         const int inirw = INI_READ;
781
782         COMMON_START();
783         if ((so->so_oobmark == 0 &&
784              (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
785             so->so_options & SO_OOBINLINE ||
786             tp->t_oobflags & TCPOOB_HADDATA) {
787                 error = EINVAL;
788                 goto out;
789         }
790         if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
791                 error = EWOULDBLOCK;
792                 goto out;
793         }
794         m->m_len = 1;
795         *mtod(m, caddr_t) = tp->t_iobc;
796         if ((flags & MSG_PEEK) == 0)
797                 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
798         COMMON_END(PRU_RCVOOB);
799 }
800
801 struct pr_usrreqs tcp_usrreqs = {
802         .pru_abort =            tcp_usr_abort,
803         .pru_accept =           tcp_usr_accept,
804         .pru_attach =           tcp_usr_attach,
805         .pru_bind =             tcp_usr_bind,
806         .pru_connect =          tcp_usr_connect,
807         .pru_control =          in_control,
808         .pru_detach =           tcp_usr_detach,
809         .pru_disconnect =       tcp_usr_disconnect,
810         .pru_listen =           tcp_usr_listen,
811         .pru_peeraddr =         tcp_peeraddr,
812         .pru_rcvd =             tcp_usr_rcvd,
813         .pru_rcvoob =           tcp_usr_rcvoob,
814         .pru_send =             tcp_usr_send,
815         .pru_shutdown =         tcp_usr_shutdown,
816         .pru_sockaddr =         tcp_sockaddr,
817         .pru_sosetlabel =       in_pcbsosetlabel
818 };
819
820 #ifdef INET6
821 struct pr_usrreqs tcp6_usrreqs = {
822         .pru_abort =            tcp_usr_abort,
823         .pru_accept =           tcp6_usr_accept,
824         .pru_attach =           tcp_usr_attach,
825         .pru_bind =             tcp6_usr_bind,
826         .pru_connect =          tcp6_usr_connect,
827         .pru_control =          in6_control,
828         .pru_detach =           tcp_usr_detach,
829         .pru_disconnect =       tcp_usr_disconnect,
830         .pru_listen =           tcp6_usr_listen,
831         .pru_peeraddr =         in6_mapped_peeraddr,
832         .pru_rcvd =             tcp_usr_rcvd,
833         .pru_rcvoob =           tcp_usr_rcvoob,
834         .pru_send =             tcp_usr_send,
835         .pru_shutdown =         tcp_usr_shutdown,
836         .pru_sockaddr =         in6_mapped_sockaddr,
837         .pru_sosetlabel =       in_pcbsosetlabel
838 };
839 #endif /* INET6 */
840
841 /*
842  * Common subroutine to open a TCP connection to remote host specified
843  * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
844  * port number if needed.  Call in_pcbconnect_setup to do the routing and
845  * to choose a local host address (interface).  If there is an existing
846  * incarnation of the same connection in TIME-WAIT state and if the remote
847  * host was sending CC options and if the connection duration was < MSL, then
848  * truncate the previous TIME-WAIT state and proceed.
849  * Initialize connection parameters and enter SYN-SENT state.
850  */
851 static int
852 tcp_connect(tp, nam, td)
853         register struct tcpcb *tp;
854         struct sockaddr *nam;
855         struct thread *td;
856 {
857         struct inpcb *inp = tp->t_inpcb, *oinp;
858         struct socket *so = inp->inp_socket;
859         struct in_addr laddr;
860         u_short lport;
861         int error;
862
863         if (inp->inp_lport == 0) {
864                 error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
865                 if (error)
866                         return error;
867         }
868
869         /*
870          * Cannot simply call in_pcbconnect, because there might be an
871          * earlier incarnation of this same connection still in
872          * TIME_WAIT state, creating an ADDRINUSE error.
873          */
874         laddr = inp->inp_laddr;
875         lport = inp->inp_lport;
876         error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
877             &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred);
878         if (error && oinp == NULL)
879                 return error;
880         if (oinp)
881                 return EADDRINUSE;
882         inp->inp_laddr = laddr;
883         in_pcbrehash(inp);
884
885         /* Compute window scaling to request.  */
886         while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
887             (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
888                 tp->request_r_scale++;
889
890         soisconnecting(so);
891         tcpstat.tcps_connattempt++;
892         tp->t_state = TCPS_SYN_SENT;
893         callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
894         tp->iss = tcp_new_isn(tp);
895         tp->t_bw_rtseq = tp->iss;
896         tcp_sendseqinit(tp);
897
898         return 0;
899 }
900
901 #ifdef INET6
902 static int
903 tcp6_connect(tp, nam, td)
904         register struct tcpcb *tp;
905         struct sockaddr *nam;
906         struct thread *td;
907 {
908         struct inpcb *inp = tp->t_inpcb, *oinp;
909         struct socket *so = inp->inp_socket;
910         struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
911         struct in6_addr *addr6;
912         int error;
913
914         if (inp->inp_lport == 0) {
915                 error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
916                 if (error)
917                         return error;
918         }
919
920         /*
921          * Cannot simply call in_pcbconnect, because there might be an
922          * earlier incarnation of this same connection still in
923          * TIME_WAIT state, creating an ADDRINUSE error.
924          * in6_pcbladdr() also handles scope zone IDs.
925          */
926         error = in6_pcbladdr(inp, nam, &addr6);
927         if (error)
928                 return error;
929         oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
930                                   &sin6->sin6_addr, sin6->sin6_port,
931                                   IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
932                                   ? addr6
933                                   : &inp->in6p_laddr,
934                                   inp->inp_lport,  0, NULL);
935         if (oinp)
936                 return EADDRINUSE;
937         if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
938                 inp->in6p_laddr = *addr6;
939         inp->in6p_faddr = sin6->sin6_addr;
940         inp->inp_fport = sin6->sin6_port;
941         /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
942         inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK;
943         if (inp->in6p_flags & IN6P_AUTOFLOWLABEL)
944                 inp->in6p_flowinfo |=
945                     (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
946         in_pcbrehash(inp);
947
948         /* Compute window scaling to request.  */
949         while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
950             (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
951                 tp->request_r_scale++;
952
953         soisconnecting(so);
954         tcpstat.tcps_connattempt++;
955         tp->t_state = TCPS_SYN_SENT;
956         callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
957         tp->iss = tcp_new_isn(tp);
958         tp->t_bw_rtseq = tp->iss;
959         tcp_sendseqinit(tp);
960
961         return 0;
962 }
963 #endif /* INET6 */
964
965 /*
966  * Export TCP internal state information via a struct tcp_info, based on the
967  * Linux 2.6 API.  Not ABI compatible as our constants are mapped differently
968  * (TCP state machine, etc).  We export all information using FreeBSD-native
969  * constants -- for example, the numeric values for tcpi_state will differ
970  * from Linux.
971  */
972 static void
973 tcp_fill_info(tp, ti)
974         struct tcpcb *tp;
975         struct tcp_info *ti;
976 {
977
978         INP_LOCK_ASSERT(tp->t_inpcb);
979         bzero(ti, sizeof(*ti));
980
981         ti->tcpi_state = tp->t_state;
982         if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
983                 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
984         if (tp->sack_enable)
985                 ti->tcpi_options |= TCPI_OPT_SACK;
986         if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
987                 ti->tcpi_options |= TCPI_OPT_WSCALE;
988                 ti->tcpi_snd_wscale = tp->snd_scale;
989                 ti->tcpi_rcv_wscale = tp->rcv_scale;
990         }
991         ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
992         ti->tcpi_snd_cwnd = tp->snd_cwnd;
993
994         /*
995          * FreeBSD-specific extension fields for tcp_info.
996          */
997         ti->tcpi_rcv_space = tp->rcv_wnd;
998         ti->tcpi_snd_wnd = tp->snd_wnd;
999         ti->tcpi_snd_bwnd = tp->snd_bwnd;
1000 }
1001
1002 /*
1003  * The new sockopt interface makes it possible for us to block in the
1004  * copyin/out step (if we take a page fault).  Taking a page fault at
1005  * splnet() is probably a Bad Thing.  (Since sockets and pcbs both now
1006  * use TSM, there probably isn't any need for this function to run at
1007  * splnet() any more.  This needs more examination.)
1008  *
1009  * XXXRW: The locking here is wrong; we may take a page fault while holding
1010  * the inpcb lock.
1011  */
1012 int
1013 tcp_ctloutput(so, sopt)
1014         struct socket *so;
1015         struct sockopt *sopt;
1016 {
1017         int     error, opt, optval;
1018         struct  inpcb *inp;
1019         struct  tcpcb *tp;
1020         struct  tcp_info ti;
1021
1022         error = 0;
1023         INP_INFO_RLOCK(&tcbinfo);
1024         inp = sotoinpcb(so);
1025         if (inp == NULL) {
1026                 INP_INFO_RUNLOCK(&tcbinfo);
1027                 return (ECONNRESET);
1028         }
1029         INP_LOCK(inp);
1030         INP_INFO_RUNLOCK(&tcbinfo);
1031         if (sopt->sopt_level != IPPROTO_TCP) {
1032                 INP_UNLOCK(inp);
1033 #ifdef INET6
1034                 if (INP_CHECK_SOCKAF(so, AF_INET6))
1035                         error = ip6_ctloutput(so, sopt);
1036                 else
1037 #endif /* INET6 */
1038                 error = ip_ctloutput(so, sopt);
1039                 return (error);
1040         }
1041         tp = intotcpcb(inp);
1042
1043         switch (sopt->sopt_dir) {
1044         case SOPT_SET:
1045                 switch (sopt->sopt_name) {
1046 #ifdef TCP_SIGNATURE
1047                 case TCP_MD5SIG:
1048                         error = sooptcopyin(sopt, &optval, sizeof optval,
1049                                             sizeof optval);
1050                         if (error)
1051                                 break;
1052
1053                         if (optval > 0)
1054                                 tp->t_flags |= TF_SIGNATURE;
1055                         else
1056                                 tp->t_flags &= ~TF_SIGNATURE;
1057                         break;
1058 #endif /* TCP_SIGNATURE */
1059                 case TCP_NODELAY:
1060                 case TCP_NOOPT:
1061                         error = sooptcopyin(sopt, &optval, sizeof optval,
1062                                             sizeof optval);
1063                         if (error)
1064                                 break;
1065
1066                         switch (sopt->sopt_name) {
1067                         case TCP_NODELAY:
1068                                 opt = TF_NODELAY;
1069                                 break;
1070                         case TCP_NOOPT:
1071                                 opt = TF_NOOPT;
1072                                 break;
1073                         default:
1074                                 opt = 0; /* dead code to fool gcc */
1075                                 break;
1076                         }
1077
1078                         if (optval)
1079                                 tp->t_flags |= opt;
1080                         else
1081                                 tp->t_flags &= ~opt;
1082                         break;
1083
1084                 case TCP_NOPUSH:
1085                         error = sooptcopyin(sopt, &optval, sizeof optval,
1086                                             sizeof optval);
1087                         if (error)
1088                                 break;
1089
1090                         if (optval)
1091                                 tp->t_flags |= TF_NOPUSH;
1092                         else {
1093                                 tp->t_flags &= ~TF_NOPUSH;
1094                                 error = tcp_output(tp);
1095                         }
1096                         break;
1097
1098                 case TCP_MAXSEG:
1099                         error = sooptcopyin(sopt, &optval, sizeof optval,
1100                                             sizeof optval);
1101                         if (error)
1102                                 break;
1103
1104                         if (optval > 0 && optval <= tp->t_maxseg &&
1105                             optval + 40 >= tcp_minmss)
1106                                 tp->t_maxseg = optval;
1107                         else
1108                                 error = EINVAL;
1109                         break;
1110
1111                 case TCP_INFO:
1112                         error = EINVAL;
1113                         break;
1114
1115                 default:
1116                         error = ENOPROTOOPT;
1117                         break;
1118                 }
1119                 break;
1120
1121         case SOPT_GET:
1122                 switch (sopt->sopt_name) {
1123 #ifdef TCP_SIGNATURE
1124                 case TCP_MD5SIG:
1125                         optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0;
1126                         error = sooptcopyout(sopt, &optval, sizeof optval);
1127                         break;
1128 #endif
1129                 case TCP_NODELAY:
1130                         optval = tp->t_flags & TF_NODELAY;
1131                         error = sooptcopyout(sopt, &optval, sizeof optval);
1132                         break;
1133                 case TCP_MAXSEG:
1134                         optval = tp->t_maxseg;
1135                         error = sooptcopyout(sopt, &optval, sizeof optval);
1136                         break;
1137                 case TCP_NOOPT:
1138                         optval = tp->t_flags & TF_NOOPT;
1139                         error = sooptcopyout(sopt, &optval, sizeof optval);
1140                         break;
1141                 case TCP_NOPUSH:
1142                         optval = tp->t_flags & TF_NOPUSH;
1143                         error = sooptcopyout(sopt, &optval, sizeof optval);
1144                         break;
1145                 case TCP_INFO:
1146                         tcp_fill_info(tp, &ti);
1147                         error = sooptcopyout(sopt, &ti, sizeof ti);
1148                         break;
1149                 default:
1150                         error = ENOPROTOOPT;
1151                         break;
1152                 }
1153                 break;
1154         }
1155         INP_UNLOCK(inp);
1156         return (error);
1157 }
1158
1159 /*
1160  * tcp_sendspace and tcp_recvspace are the default send and receive window
1161  * sizes, respectively.  These are obsolescent (this information should
1162  * be set by the route).
1163  */
1164 u_long  tcp_sendspace = 1024*32;
1165 SYSCTL_ULONG(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
1166     &tcp_sendspace , 0, "Maximum outgoing TCP datagram size");
1167 u_long  tcp_recvspace = 1024*64;
1168 SYSCTL_ULONG(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
1169     &tcp_recvspace , 0, "Maximum incoming TCP datagram size");
1170
1171 /*
1172  * Attach TCP protocol to socket, allocating
1173  * internet protocol control block, tcp control block,
1174  * bufer space, and entering LISTEN state if to accept connections.
1175  */
1176 static int
1177 tcp_attach(so)
1178         struct socket *so;
1179 {
1180         register struct tcpcb *tp;
1181         struct inpcb *inp;
1182         int error;
1183 #ifdef INET6
1184         int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
1185 #endif
1186
1187         INP_INFO_WLOCK_ASSERT(&tcbinfo);
1188
1189         if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
1190                 error = soreserve(so, tcp_sendspace, tcp_recvspace);
1191                 if (error)
1192                         return (error);
1193         }
1194         error = in_pcballoc(so, &tcbinfo, "tcpinp");
1195         if (error)
1196                 return (error);
1197         inp = sotoinpcb(so);
1198 #ifdef INET6
1199         if (isipv6) {
1200                 inp->inp_vflag |= INP_IPV6;
1201                 inp->in6p_hops = -1;    /* use kernel default */
1202         }
1203         else
1204 #endif
1205         inp->inp_vflag |= INP_IPV4;
1206         tp = tcp_newtcpcb(inp);
1207         if (tp == 0) {
1208                 int nofd = so->so_state & SS_NOFDREF;   /* XXX */
1209
1210                 so->so_state &= ~SS_NOFDREF;    /* don't free the socket yet */
1211
1212                 INP_LOCK(inp);
1213 #ifdef INET6
1214                 if (isipv6)
1215                         in6_pcbdetach(inp);
1216                 else
1217 #endif
1218                 in_pcbdetach(inp);
1219                 so->so_state |= nofd;
1220                 return (ENOBUFS);
1221         }
1222         tp->t_state = TCPS_CLOSED;
1223         return (0);
1224 }
1225
1226 /*
1227  * Initiate (or continue) disconnect.
1228  * If embryonic state, just send reset (once).
1229  * If in ``let data drain'' option and linger null, just drop.
1230  * Otherwise (hard), mark socket disconnecting and drop
1231  * current input data; switch states based on user close, and
1232  * send segment to peer (with FIN).
1233  */
1234 static struct tcpcb *
1235 tcp_disconnect(tp)
1236         register struct tcpcb *tp;
1237 {
1238         struct inpcb *inp = tp->t_inpcb;
1239         struct socket *so = inp->inp_socket;
1240
1241         INP_INFO_WLOCK_ASSERT(&tcbinfo);
1242         INP_LOCK_ASSERT(inp);
1243
1244         if (tp->t_state < TCPS_ESTABLISHED)
1245                 tp = tcp_close(tp);
1246         else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
1247                 tp = tcp_drop(tp, 0);
1248         else {
1249                 soisdisconnecting(so);
1250                 sbflush(&so->so_rcv);
1251                 tp = tcp_usrclosed(tp);
1252                 if (tp)
1253                         (void) tcp_output(tp);
1254         }
1255         return (tp);
1256 }
1257
1258 /*
1259  * User issued close, and wish to trail through shutdown states:
1260  * if never received SYN, just forget it.  If got a SYN from peer,
1261  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
1262  * If already got a FIN from peer, then almost done; go to LAST_ACK
1263  * state.  In all other cases, have already sent FIN to peer (e.g.
1264  * after PRU_SHUTDOWN), and just have to play tedious game waiting
1265  * for peer to send FIN or not respond to keep-alives, etc.
1266  * We can let the user exit from the close as soon as the FIN is acked.
1267  */
1268 static struct tcpcb *
1269 tcp_usrclosed(tp)
1270         register struct tcpcb *tp;
1271 {
1272
1273         INP_INFO_WLOCK_ASSERT(&tcbinfo);
1274         INP_LOCK_ASSERT(tp->t_inpcb);
1275
1276         switch (tp->t_state) {
1277
1278         case TCPS_CLOSED:
1279         case TCPS_LISTEN:
1280                 tp->t_state = TCPS_CLOSED;
1281                 tp = tcp_close(tp);
1282                 break;
1283
1284         case TCPS_SYN_SENT:
1285         case TCPS_SYN_RECEIVED:
1286                 tp->t_flags |= TF_NEEDFIN;
1287                 break;
1288
1289         case TCPS_ESTABLISHED:
1290                 tp->t_state = TCPS_FIN_WAIT_1;
1291                 break;
1292
1293         case TCPS_CLOSE_WAIT:
1294                 tp->t_state = TCPS_LAST_ACK;
1295                 break;
1296         }
1297         if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
1298                 soisdisconnected(tp->t_inpcb->inp_socket);
1299                 /* To prevent the connection hanging in FIN_WAIT_2 forever. */
1300                 if (tp->t_state == TCPS_FIN_WAIT_2)
1301                         callout_reset(tp->tt_2msl, tcp_maxidle,
1302                                       tcp_timer_2msl, tp);
1303         }
1304         return (tp);
1305 }