2 * Copyright 2004-2005 Robert N. M. Watson
3 * Copyright (c) 1982, 1986, 1989, 1991, 1993
4 * The Regents of the University of California. All rights reserved.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 4. Neither the name of the University nor the names of its contributors
15 * may be used to endorse or promote products derived from this software
16 * without specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
38 #include <sys/param.h>
39 #include <sys/domain.h>
40 #include <sys/fcntl.h>
41 #include <sys/malloc.h> /* XXX must be before <sys/file.h> */
43 #include <sys/filedesc.h>
45 #include <sys/kernel.h>
49 #include <sys/mutex.h>
50 #include <sys/namei.h>
52 #include <sys/protosw.h>
53 #include <sys/resourcevar.h>
54 #include <sys/socket.h>
55 #include <sys/socketvar.h>
56 #include <sys/signalvar.h>
59 #include <sys/sysctl.h>
60 #include <sys/systm.h>
62 #include <sys/unpcb.h>
63 #include <sys/vnode.h>
67 static uma_zone_t unp_zone;
68 static unp_gen_t unp_gencnt;
69 static u_int unp_count;
71 static struct unp_head unp_shead, unp_dhead;
74 * Unix communications domain.
78 * rethink name space problems
79 * need a proper out-of-band
82 static const struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
83 static ino_t unp_ino; /* prototype for fake inode numbers */
84 struct mbuf *unp_addsockcred(struct thread *, struct mbuf *);
87 * Currently, UNIX domain sockets are protected by a single subsystem lock,
88 * which covers global data structures and variables, the contents of each
89 * per-socket unpcb structure, and the so_pcb field in sockets attached to
* the UNIX domain. This provides for a moderate degree of parallelism, as
91 * receive operations on UNIX domain sockets do not need to acquire the
92 * subsystem lock. Finer grained locking to permit send() without acquiring
93 * a global lock would be a logical next step.
* The UNIX domain socket lock precedes all socket layer locks, including the
96 * socket lock and socket buffer lock, permitting UNIX domain socket code to
97 * call into socket support routines without releasing its locks.
99 * Some caution is required in areas where the UNIX domain socket code enters
100 * VFS in order to create or find rendezvous points. This results in
101 * dropping of the UNIX domain socket subsystem lock, acquisition of the
102 * Giant lock, and potential sleeping. This increases the chances of races,
103 * and exposes weaknesses in the socket->protocol API by offering poor
106 static struct mtx unp_mtx;
107 #define UNP_LOCK_INIT() \
108 mtx_init(&unp_mtx, "unp", NULL, MTX_DEF)
109 #define UNP_LOCK() mtx_lock(&unp_mtx)
110 #define UNP_UNLOCK() mtx_unlock(&unp_mtx)
111 #define UNP_LOCK_ASSERT() mtx_assert(&unp_mtx, MA_OWNED)
112 #define UNP_UNLOCK_ASSERT() mtx_assert(&unp_mtx, MA_NOTOWNED)
114 static int unp_attach(struct socket *);
115 static void unp_detach(struct unpcb *);
116 static int unp_bind(struct unpcb *,struct sockaddr *, struct thread *);
117 static int unp_connect(struct socket *,struct sockaddr *, struct thread *);
118 static int unp_connect2(struct socket *so, struct socket *so2, int);
119 static void unp_disconnect(struct unpcb *);
120 static void unp_shutdown(struct unpcb *);
121 static void unp_drop(struct unpcb *, int);
122 static void unp_gc(void);
123 static void unp_scan(struct mbuf *, void (*)(struct file *));
124 static void unp_mark(struct file *);
125 static void unp_discard(struct file *);
126 static void unp_freerights(struct file **, int);
127 static int unp_internalize(struct mbuf **, struct thread *);
128 static int unp_listen(struct socket *, struct unpcb *, struct thread *);
/*
 * pru_abort for UNIX domain sockets (excerpted; interior lines elided).
 * Tears the connection down with ECONNABORTED via unp_drop().
 */
uipc_abort(struct socket *so)
	/* Signal the abort to the pcb/peer. */
	unp_drop(unp, ECONNABORTED);
/*
 * pru_accept: hand back the peer's bound address (if any) in a freshly
 * allocated sockaddr.  (Excerpted; interior lines elided.)
 */
uipc_accept(struct socket *so, struct sockaddr **nam)
	const struct sockaddr *sa;
	/*
	 * Pass back name of connected socket, if it was bound and we
	 * are still connected (our peer may have closed already!).
	 */
	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	/* NOTE(review): error path — release the just-allocated name. */
	free(*nam, M_SONAME);
	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL)
		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
	/* Copy whichever address was selected out to the caller. */
	bcopy(sa, *nam, sa->sa_len);
/*
 * pru_attach: delegate pcb creation to unp_attach().  (Excerpted.)
 */
uipc_attach(struct socket *so, int proto, struct thread *td)
	struct unpcb *unp = sotounpcb(so);
	return (unp_attach(so));
/*
 * pru_bind: delegate to unp_bind(), which creates the rendezvous
 * vnode in the filesystem.  (Excerpted.)
 */
uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
	error = unp_bind(unp, nam, td);
/*
 * pru_connect: delegate to unp_connect().  (Excerpted.)
 */
uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
	/* unp_connect() relies on curthread's credentials. */
	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
	error = unp_connect(so, nam, td);
/*
 * pru_connect2: wire two already-created sockets together
 * (socketpair(2) path).  (Excerpted.)
 */
uipc_connect2(struct socket *so1, struct socket *so2)
	unp = sotounpcb(so1);
	error = unp_connect2(so1, so2, PRU_CONNECT2);
242 /* control is EOPNOTSUPP */
/* pru_detach entry point.  (Excerpted; body elided in this view.) */
uipc_detach(struct socket *so)
/* pru_disconnect entry point.  (Excerpted; body elided in this view.) */
uipc_disconnect(struct socket *so)
/*
 * pru_listen: only a bound socket (unp_vnode set by bind) may listen.
 * (Excerpted.)
 */
uipc_listen(struct socket *so, struct thread *td)
	if (unp == NULL || unp->unp_vnode == NULL) {
	error = unp_listen(so, unp, td);
/*
 * pru_peeraddr: return the peer's bound address in a freshly allocated
 * sockaddr.  (Excerpted; interior lines elided.)
 */
uipc_peeraddr(struct socket *so, struct sockaddr **nam)
	const struct sockaddr *sa;
	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	/* NOTE(review): error path — release the just-allocated name. */
	free(*nam, M_SONAME);
	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr!= NULL)
		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
	/*
	 * XXX: It seems that this test always fails even when the
	 * connection is established.  So, this else clause is added
	 * as a workaround to return a PF_LOCAL sockaddr.
	 */
	bcopy(sa, *nam, sa->sa_len);
/*
 * pru_rcvd: the reader has consumed data; propagate the freed space back
 * to the sender's send buffer so backpressure is relieved.  Only
 * meaningful for SOCK_STREAM.  (Excerpted; interior lines elided.)
 */
uipc_rcvd(struct socket *so, int flags)
	switch (so->so_type) {
		/* Datagram sockets never generate pru_rcvd. */
		panic("uipc_rcvd DGRAM?");
	if (unp->unp_conn == NULL)
	so2 = unp->unp_conn->unp_socket;
	/* Lock order: peer's send buffer, then our receive buffer. */
	SOCKBUF_LOCK(&so2->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	/*
	 * Adjust backpressure on sender and wakeup any waiting to write.
	 */
	so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt;
	unp->unp_mbcnt = so->so_rcv.sb_mbcnt;
	newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc -
	(void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
	    newhiwat, RLIM_INFINITY);
	unp->unp_cc = so->so_rcv.sb_cc;
	SOCKBUF_UNLOCK(&so->so_rcv);
	/* Wake writers blocked on the (now larger) send buffer. */
	sowwakeup_locked(so2);
	panic("uipc_rcvd unknown socktype");
369 /* pru_rcvoob is EOPNOTSUPP */
/*
 * pru_send: deliver an mbuf chain (and optional control data) directly
 * into the peer's receive buffer.  DGRAM sends may carry an explicit
 * destination; STREAM sends go to the connected peer and adjust hiwater
 * marks for backpressure.  (Excerpted; interior lines elided.)
 */
uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
    struct mbuf *control, struct thread *td)
	/* Out-of-band data is not supported on UNIX domain sockets. */
	if (flags & PRUS_OOB) {
	/* Convert caller FDs/credentials into in-kernel representation. */
	if (control != NULL && (error = unp_internalize(&control, td)))
		goto dispose_release;
	switch (so->so_type) {
		const struct sockaddr *from;
		if (unp->unp_conn != NULL) {
		/* Temporarily connect for an explicitly addressed send. */
		error = unp_connect(so, nam, td);
		if (unp->unp_conn == NULL) {
		so2 = unp->unp_conn->unp_socket;
		if (unp->unp_addr != NULL)
			from = (struct sockaddr *)unp->unp_addr;
		if (unp->unp_conn->unp_flags & UNP_WANTCRED)
			control = unp_addsockcred(td, control);
		SOCKBUF_LOCK(&so2->so_rcv);
		if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) {
			sorwakeup_locked(so2);
		SOCKBUF_UNLOCK(&so2->so_rcv);
		/* Connect if not connected yet. */
		/*
		 * Note: A better implementation would complain
		 * if not equal to the peer's address.
		 */
		if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = unp_connect(so, nam, td);
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
		if (unp->unp_conn == NULL)
			panic("uipc_send connected but no connection?");
		so2 = unp->unp_conn->unp_socket;
		SOCKBUF_LOCK(&so2->so_rcv);
		if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
			/*
			 * Credentials are passed only once on
			 */
			unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
			control = unp_addsockcred(td, control);
		/*
		 * Send to paired receive port, and then reduce
		 * send buffer hiwater marks to maintain backpressure.
		 */
		if (control != NULL) {
			if (sbappendcontrol_locked(&so2->so_rcv, m, control))
			sbappend_locked(&so2->so_rcv, m);
		/* Account freed/consumed mbuf space against our send side. */
		so->so_snd.sb_mbmax -=
		    so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt;
		unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt;
		newhiwat = so->so_snd.sb_hiwat -
		    (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc);
		(void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
		    newhiwat, RLIM_INFINITY);
		SOCKBUF_UNLOCK(&so->so_snd);
		unp->unp_conn->unp_cc = so2->so_rcv.sb_cc;
		sorwakeup_locked(so2);
		panic("uipc_send unknown socktype");
	/*
	 * SEND_EOF is equivalent to a SEND followed by
	 */
	if (flags & PRUS_EOF) {
	/* On failure, drop any control mbufs we still own. */
	if (control != NULL && error != 0)
		unp_dispose(control);
/*
 * pru_sense: fill in fstat(2) fields for a UNIX domain socket, including
 * a lazily assigned fake inode number.  (Excerpted.)
 */
uipc_sense(struct socket *so, struct stat *sb)
	sb->st_blksize = so->so_snd.sb_hiwat;
	if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) {
		so2 = unp->unp_conn->unp_socket;
		sb->st_blksize += so2->so_rcv.sb_cc;
	/* Assign a fake inode number on first use; skip 0 on wrap. */
	if (unp->unp_ino == 0)
		unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino;
	sb->st_ino = unp->unp_ino;
/* pru_shutdown entry point.  (Excerpted; body elided in this view.) */
uipc_shutdown(struct socket *so)
/*
 * pru_sockaddr: return this socket's own bound address in a freshly
 * allocated sockaddr.  (Excerpted; interior lines elided.)
 */
uipc_sockaddr(struct socket *so, struct sockaddr **nam)
	const struct sockaddr *sa;
	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	/* NOTE(review): error path — release the just-allocated name. */
	free(*nam, M_SONAME);
	if (unp->unp_addr != NULL)
		sa = (struct sockaddr *) unp->unp_addr;
	bcopy(sa, *nam, sa->sa_len);
/*
 * Protocol user-request switch for UNIX domain sockets: maps the generic
 * socket-layer operations onto the uipc_* handlers above.  Generic
 * sosend/soreceive/sopoll are used directly.
 */
struct pr_usrreqs uipc_usrreqs = {
	.pru_abort = 		uipc_abort,
	.pru_accept =		uipc_accept,
	.pru_attach =		uipc_attach,
	.pru_bind =		uipc_bind,
	.pru_connect =		uipc_connect,
	.pru_connect2 =		uipc_connect2,
	.pru_detach =		uipc_detach,
	.pru_disconnect =	uipc_disconnect,
	.pru_listen =		uipc_listen,
	.pru_peeraddr =		uipc_peeraddr,
	.pru_rcvd =		uipc_rcvd,
	.pru_send =		uipc_send,
	.pru_sense =		uipc_sense,
	.pru_shutdown =		uipc_shutdown,
	.pru_sockaddr =		uipc_sockaddr,
	.pru_sosend =		sosend,
	.pru_soreceive =	soreceive,
	.pru_sopoll =		sopoll,
/*
 * Socket option handler for SOL_LOCAL: LOCAL_PEERCRED query plus the
 * LOCAL_CREDS / LOCAL_CONNWAIT boolean flags.  (Excerpted; interior
 * lines elided.)
 */
uipc_ctloutput(struct socket *so, struct sockopt *sopt)
	/* Only level 0 (SOL_LOCAL) options are handled here. */
	if (sopt->sopt_level != 0)
	switch (sopt->sopt_dir) {
	switch (sopt->sopt_name) {
		/* Peer credentials are valid only once cached (HAVEPC). */
		if (unp->unp_flags & UNP_HAVEPC)
			xu = unp->unp_peercred;
		if (so->so_type == SOCK_STREAM)
		error = sooptcopyout(sopt, &xu, sizeof(xu));
		optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0;
		error = sooptcopyout(sopt, &optval, sizeof(optval));
		optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
		error = sooptcopyout(sopt, &optval, sizeof(optval));
	switch (sopt->sopt_name) {
		error = sooptcopyin(sopt, &optval, sizeof(optval),
/* Set or clear a unp_flags bit based on the copied-in boolean. */
#define	OPTSET(bit) \
	unp->unp_flags |= bit; \
	unp->unp_flags &= ~bit.
	switch (sopt->sopt_name) {
		OPTSET(UNP_WANTCRED);
		OPTSET(UNP_CONNWAIT);
700 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
701 * for stream sockets, although the total for sender and receiver is
702 * actually only PIPSIZ.
703 * Datagram sockets really use the sendspace as the maximum datagram size,
704 * and don't really want to reserve the sendspace. Their recvspace should
705 * be large enough for at least one max-size datagram plus address.
710 static u_long unpst_sendspace = PIPSIZ;
711 static u_long unpst_recvspace = PIPSIZ;
712 static u_long unpdg_sendspace = 2*1024; /* really max datagram size */
713 static u_long unpdg_recvspace = 4*1024;
715 static int unp_rights; /* file descriptors in flight */
717 SYSCTL_DECL(_net_local_stream);
718 SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
719 &unpst_sendspace, 0, "");
720 SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
721 &unpst_recvspace, 0, "");
722 SYSCTL_DECL(_net_local_dgram);
723 SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
724 &unpdg_sendspace, 0, "");
725 SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
726 &unpdg_recvspace, 0, "");
727 SYSCTL_DECL(_net_local);
728 SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
/*
 * Allocate and initialize a new unpcb, reserve default buffer space if
 * the caller has not, and link the pcb onto the global stream/datagram
 * list.  (Excerpted; interior lines elided.)
 */
unp_attach(struct socket *so)
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		switch (so->so_type) {
			error = soreserve(so, unpst_sendspace, unpst_recvspace);
			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
	unp = uma_zalloc(unp_zone, M_WAITOK | M_ZERO);
	LIST_INIT(&unp->unp_refs);
	unp->unp_socket = so;
	/* Bump the generation so stale sysctl snapshots can detect reuse. */
	unp->unp_gencnt = ++unp_gencnt;
	LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead
	    : &unp_shead, unp, unp_link);
/*
 * Tear down a unpcb: unlink it, detach any bound vnode, disconnect the
 * peer and all refs, and free the pcb.  If file descriptors are in
 * flight, flush our receive buffer and run the garbage collector first.
 * (Excerpted; interior lines elided.)
 */
unp_detach(struct unpcb *unp)
	LIST_REMOVE(unp, unp_link);
	unp->unp_gencnt = ++unp_gencnt;
	if ((vp = unp->unp_vnode) != NULL) {
		/*
		 * XXXRW: should v_socket be frobbed only while holding
		 */
		unp->unp_vnode->v_socket = NULL;
		unp->unp_vnode = NULL;
	if (unp->unp_conn != NULL)
	/* Reset every datagram socket still aimed at us. */
	while (!LIST_EMPTY(&unp->unp_refs)) {
		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
		unp_drop(ref, ECONNRESET);
	soisdisconnected(unp->unp_socket);
	unp->unp_socket->so_pcb = NULL;
	/*
	 * Normally the receive buffer is flushed later, in sofree, but
	 * if our receive buffer holds references to descriptors that are
	 * now garbage, we will dispose of those descriptor references
	 * after the garbage collector gets them (resulting in a
	 * "panic: closef: count < 0").
	 */
	sorflush(unp->unp_socket);
	unp_gc();	/* Will unlock UNP. */
	if (unp->unp_addr != NULL)
		FREE(unp->unp_addr, M_SONAME);
	uma_zfree(unp_zone, unp);
/*
 * Bind a UNIX domain socket to a filesystem path: create a VSOCK vnode
 * at the requested path (failing with EADDRINUSE-style behavior if it
 * exists) and cross-link vnode and socket.  Enters VFS, so the
 * subsystem lock is dropped and Giant is required.  (Excerpted;
 * interior lines elided.)
 */
unp_bind(struct unpcb *unp, struct sockaddr *nam, struct thread *td)
	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
	/*
	 * XXXRW: This test-and-set of unp_vnode is non-atomic; the
	 * unlocked read here is fine, but the value of unp_vnode needs
	 * to be tested again after we do all the lookups to see if the
	 * pcb is still unbound?
	 */
	if (unp->unp_vnode != NULL)
	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
	/* NUL-terminate the (possibly unterminated) sun_path. */
	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
	strlcpy(buf, soun->sun_path, namelen + 1);
	mtx_assert(&Giant, MA_OWNED);
	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE,
	/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
	error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
	/* New node is a socket, mode masked by the process umask. */
	vattr.va_type = VSOCK;
	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
	error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vn_finished_write(mp);
	ASSERT_VOP_LOCKED(vp, "unp_bind");
	/* Keep a private copy of the name for getsockname(). */
	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
	vp->v_socket = unp->unp_socket;
	unp->unp_addr = soun;
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(mp);
/*
 * Connect to a UNIX domain socket named by a filesystem path: look up
 * the rendezvous vnode, validate type and access, and either spawn a
 * new server socket via sonewconn() (connection-oriented) or connect
 * directly (datagram).  Also sets up the LOCAL_PEERCRED credentials on
 * both sides.  (Excerpted; interior lines elided.)
 */
unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
	struct socket *so2, *so3;
	struct unpcb *unp, *unp2, *unp3;
	char buf[SOCK_MAXADDRLEN];
	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
	strlcpy(buf, soun->sun_path, len + 1);
	sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td);
	ASSERT_VOP_LOCKED(vp, "unp_connect");
	NDFREE(&nd, NDF_ONLY_PNBUF);
	/* The path must name a socket node. */
	if (vp->v_type != VSOCK) {
	/* Connecting requires write access to the rendezvous node. */
	error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
	error = ECONNREFUSED;
	if (so->so_type != so2->so_type) {
	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		if (so2->so_options & SO_ACCEPTCONN) {
			/*
			 * NB: drop locks here so unp_attach is entered
			 * w/o locks; this avoids a recursive lock
			 * of the head and holding sleep locks across
			 * a (potentially) blocking malloc.
			 */
			so3 = sonewconn(so2, 0);
			error = ECONNREFUSED;
		unp2 = sotounpcb(so2);
		unp3 = sotounpcb(so3);
		/* Propagate the listener's bound name to the new socket. */
		if (unp2->unp_addr != NULL) {
			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
			unp3->unp_addr = (struct sockaddr_un *) sa;
		/*
		 * unp_peercred management:
		 *
		 * The connecter's (client's) credentials are copied
		 * from its process structure at the time of connect()
		 */
		cru2x(td->td_ucred, &unp3->unp_peercred);
		unp3->unp_flags |= UNP_HAVEPC;
		/*
		 * The receiver's (server's) credentials are copied
		 * from the unp_peercred member of socket on which the
		 * former called listen(); unp_listen() cached that
		 * process's credentials at that time so we can use
		 */
		KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
		    ("unp_connect: listener without cached peercred"));
		memcpy(&unp->unp_peercred, &unp2->unp_peercred,
		    sizeof(unp->unp_peercred));
		unp->unp_flags |= UNP_HAVEPC;
		mac_set_socket_peer_from_socket(so, so3);
		mac_set_socket_peer_from_socket(so3, so);
	error = unp_connect2(so, so2, PRU_CONNECT);
	mtx_assert(&Giant, MA_OWNED);
/*
 * Wire two compatible sockets together at the pcb level.  Datagram
 * sockets get a one-way link plus a back-reference on unp_refs; stream
 * sockets are linked both ways.  (Excerpted; interior lines elided.)
 */
unp_connect2(struct socket *so, struct socket *so2, int req)
	struct unpcb *unp = sotounpcb(so);
	if (so2->so_type != so->so_type)
		return (EPROTOTYPE);
	unp2 = sotounpcb(so2);
	unp->unp_conn = unp2;
	switch (so->so_type) {
		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
		unp2->unp_conn = unp;
		/* CONNWAIT: defer marking connected until accept(). */
		if (req == PRU_CONNECT &&
		    ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
		panic("unp_connect2");
/*
 * Sever the link between a unpcb and its peer, updating socket state on
 * both ends.  (Excerpted; interior lines elided.)
 */
unp_disconnect(struct unpcb *unp)
	struct unpcb *unp2 = unp->unp_conn;
	unp->unp_conn = NULL;
	switch (unp->unp_socket->so_type) {
		/* Datagram: remove our back-reference from the peer. */
		LIST_REMOVE(unp, unp_reflink);
		so = unp->unp_socket;
		so->so_state &= ~SS_ISCONNECTED;
		/* Stream: disconnect both endpoints. */
		soisdisconnected(unp->unp_socket);
		unp2->unp_conn = NULL;
		soisdisconnected(unp2->unp_socket);
/* Abort helper; runs without the subsystem lock.  (Excerpted.) */
unp_abort(struct unpcb *unp)
	UNP_UNLOCK_ASSERT();
/*
 * unp_pcblist() assumes that UNIX domain socket memory is never reclaimed
 * by the zone (UMA_ZONE_NOFREE), and as such potentially stale pointers
 * are safe to reference.  It first scans the list of struct unpcb's to
 * generate a pointer list, then it rescans its list one entry at a time to
 * externalize and copyout.  It checks the generation number to see if a
 * struct unpcb has been reused, and will skip it if so.
 */
unp_pcblist(SYSCTL_HANDLER_ARGS)
	struct unpcb *unp, **unp_list;
	struct xunpgen *xug;
	struct unp_head *head;
	/* arg1 selects the datagram or stream pcb list. */
	head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead);
	/*
	 * The process of preparing the PCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		/* Size estimate only: leave slack for list growth. */
		req->oldidx = 2 * (sizeof *xug)
		    + (n + n/8) * sizeof(struct xunpcb);
	if (req->newptr != NULL)
	/*
	 * OK, now we're committed to doing something.
	 */
	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
	gencnt = unp_gencnt;
	xug->xug_len = sizeof *xug;
	xug->xug_gen = gencnt;
	xug->xug_sogen = so_gencnt;
	error = SYSCTL_OUT(req, xug, sizeof *xug);
	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
	/* Pass 1: snapshot pointers to all visible pcbs. */
	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
	     unp = LIST_NEXT(unp, unp_link)) {
		if (unp->unp_gencnt <= gencnt) {
			if (cr_cansee(req->td->td_ucred,
			    unp->unp_socket->so_cred))
			unp_list[i++] = unp;
	n = i;			/* in case we lost some during malloc */
	xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO);
	/* Pass 2: externalize each still-valid pcb and copy it out. */
	for (i = 0; i < n; i++) {
		if (unp->unp_gencnt <= gencnt) {
			xu->xu_len = sizeof *xu;
			/*
			 * XXX - need more locking here to protect against
			 * connect/disconnect races for SMP.
			 */
			if (unp->unp_addr != NULL)
				bcopy(unp->unp_addr, &xu->xu_addr,
				    unp->unp_addr->sun_len);
			if (unp->unp_conn != NULL &&
			    unp->unp_conn->unp_addr != NULL)
				bcopy(unp->unp_conn->unp_addr,
				    unp->unp_conn->unp_addr->sun_len);
			bcopy(unp, &xu->xu_unp, sizeof *unp);
			sotoxsocket(unp->unp_socket, &xu->xu_socket);
			error = SYSCTL_OUT(req, xu, sizeof *xu);
	/*
	 * Give the user an updated idea of our state.  If the generation
	 * differs from what we told her before, she knows that something
	 * happened while we were processing this request, and it might
	 * be necessary to retry.
	 */
	xug->xug_gen = unp_gencnt;
	xug->xug_sogen = so_gencnt;
	xug->xug_count = unp_count;
	error = SYSCTL_OUT(req, xug, sizeof *xug);
	free(unp_list, M_TEMP);
1220 SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD,
1221 (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
1222 "List of active local datagram sockets");
1223 SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD,
1224 (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
1225 "List of active local stream sockets");
/*
 * Propagate shutdown(2) to the connected stream peer.  (Excerpted.)
 */
unp_shutdown(struct unpcb *unp)
	if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
	    (so = unp->unp_conn->unp_socket))
/*
 * Record an error on the socket and disconnect it from its peer.
 * (Excerpted; interior lines elided.)
 */
unp_drop(struct unpcb *unp, int errno)
	struct socket *so = unp->unp_socket;
	so->so_error = errno;
	unp_disconnect(unp);
/*
 * Release an array of in-flight file pointers carried in an SCM_RIGHTS
 * message.  (Excerpted; interior lines elided.)
 */
unp_freerights(struct file **rp, int fdcount)
	for (i = 0; i < fdcount; i++) {
		/*
		 * zero the pointer before calling
		 * unp_discard since it may end up
		 */
/*
 * Convert in-kernel control data into its user-visible form: SCM_RIGHTS
 * struct file pointers become newly allocated file descriptors in the
 * receiving process; other control messages are copied through
 * unchanged.  With controlp == NULL the rights are simply discarded.
 * (Excerpted; interior lines elided.)
 */
unp_externalize(struct mbuf *control, struct mbuf **controlp)
	struct thread *td = curthread;		/* XXX */
	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
	socklen_t clen = control->m_len, datalen;
	UNP_UNLOCK_ASSERT();
	if (controlp != NULL) /* controlp == NULL => free control messages */
	/* Walk every cmsg in the control mbuf. */
	while (cm != NULL) {
		if (sizeof(*cm) > clen || cm->cmsg_len > clen) {
		data = CMSG_DATA(cm);
		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
		if (cm->cmsg_level == SOL_SOCKET
		    && cm->cmsg_type == SCM_RIGHTS) {
			newfds = datalen / sizeof(struct file *);
			/* If we're not outputting the descriptors free them. */
			if (error || controlp == NULL) {
				unp_freerights(rp, newfds);
			FILEDESC_LOCK(td->td_proc->p_fd);
			/* if the new FD's will not fit free them. */
			if (!fdavail(td, newfds)) {
				FILEDESC_UNLOCK(td->td_proc->p_fd);
				unp_freerights(rp, newfds);
			/*
			 * now change each pointer to an fd in the global
			 * table to an integer that is the index to the
			 * local fd table entry that we set up to point
			 * to the global one we are transferring.
			 */
			newlen = newfds * sizeof(int);
			*controlp = sbcreatecontrol(NULL, newlen,
			    SCM_RIGHTS, SOL_SOCKET);
			if (*controlp == NULL) {
				FILEDESC_UNLOCK(td->td_proc->p_fd);
				unp_freerights(rp, newfds);
			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
			for (i = 0; i < newfds; i++) {
				if (fdalloc(td, 0, &f))
					panic("unp_externalize fdalloc failed");
				td->td_proc->p_fd->fd_ofiles[f] = fp;
			FILEDESC_UNLOCK(td->td_proc->p_fd);
		} else { /* We can just copy anything else across */
			if (error || controlp == NULL)
			*controlp = sbcreatecontrol(NULL, datalen,
			    cm->cmsg_type, cm->cmsg_level);
			if (*controlp == NULL) {
			    CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
		controlp = &(*controlp)->m_next;
		/* Advance to the next aligned cmsg, if any remains. */
		if (CMSG_SPACE(datalen) < clen) {
			clen -= CMSG_SPACE(datalen);
			cm = (struct cmsghdr *)
			    ((caddr_t)cm + CMSG_SPACE(datalen));
	/*
	 * Subsystem initialization (function header elided in this
	 * excerpt): create the NOFREE unpcb zone — unp_pcblist depends
	 * on pcb memory never being reclaimed — and the global lists.
	 */
	unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	if (unp_zone == NULL)
	uma_zone_set_max(unp_zone, nmbclusters);
	LIST_INIT(&unp_dhead);
	LIST_INIT(&unp_shead);
/*
 * Convert user-supplied control data into its in-kernel form: SCM_CREDS
 * gets the sender's credentials filled in, SCM_RIGHTS integer FDs are
 * validated (present and DFLAG_PASSABLE) and replaced with struct file
 * pointers, and SCM_TIMESTAMP gets the current time.  (Excerpted;
 * interior lines elided.)
 */
unp_internalize(struct mbuf **controlp, struct thread *td)
	struct mbuf *control = *controlp;
	struct proc *p = td->td_proc;
	struct filedesc *fdescp = p->p_fd;
	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
	struct cmsgcred *cmcred;
	socklen_t clen = control->m_len, datalen;
	UNP_UNLOCK_ASSERT();
	/* Walk every cmsg; all must be SOL_SOCKET level. */
	while (cm != NULL) {
		if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET
		    || cm->cmsg_len > clen) {
		data = CMSG_DATA(cm);
		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
		switch (cm->cmsg_type) {
		/*
		 * Fill in credential information.
		 */
			*controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
			    SCM_CREDS, SOL_SOCKET);
			if (*controlp == NULL) {
			cmcred = (struct cmsgcred *)
			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
			cmcred->cmcred_pid = p->p_pid;
			cmcred->cmcred_uid = td->td_ucred->cr_ruid;
			cmcred->cmcred_gid = td->td_ucred->cr_rgid;
			cmcred->cmcred_euid = td->td_ucred->cr_uid;
			cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
			for (i = 0; i < cmcred->cmcred_ngroups; i++)
				cmcred->cmcred_groups[i] =
				    td->td_ucred->cr_groups[i];
			oldfds = datalen / sizeof (int);
			/*
			 * check that all the FDs passed in refer to legal
			 * files.  If not, reject the entire operation.
			 */
			FILEDESC_LOCK(fdescp);
			for (i = 0; i < oldfds; i++) {
				if ((unsigned)fd >= fdescp->fd_nfiles ||
				    fdescp->fd_ofiles[fd] == NULL) {
					FILEDESC_UNLOCK(fdescp);
				fp = fdescp->fd_ofiles[fd];
				if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
					FILEDESC_UNLOCK(fdescp);
			/*
			 * Now replace the integer FDs with pointers to
			 * the associated global file table entry..
			 */
			newlen = oldfds * sizeof(struct file *);
			*controlp = sbcreatecontrol(NULL, newlen,
			    SCM_RIGHTS, SOL_SOCKET);
			if (*controlp == NULL) {
				FILEDESC_UNLOCK(fdescp);
			rp = (struct file **)
			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
			for (i = 0; i < oldfds; i++) {
				fp = fdescp->fd_ofiles[*fdp++];
			FILEDESC_UNLOCK(fdescp);
			*controlp = sbcreatecontrol(NULL, sizeof(*tv),
			    SCM_TIMESTAMP, SOL_SOCKET);
			if (*controlp == NULL) {
			tv = (struct timeval *)
			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
		controlp = &(*controlp)->m_next;
		/* Advance to the next aligned cmsg, if any remains. */
		if (CMSG_SPACE(datalen) < clen) {
			clen -= CMSG_SPACE(datalen);
			cm = (struct cmsghdr *)
			    ((caddr_t)cm + CMSG_SPACE(datalen));
/*
 * Build an SCM_CREDS (struct sockcred) control message carrying the
 * sending thread's credentials, and append it to any existing control
 * chain.  (Excerpted; interior lines elided.)
 */
unp_addsockcred(struct thread *td, struct mbuf *control)
	struct sockcred *sc;
	/* Group list is capped at CMGROUP_MAX entries. */
	ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX);
	m = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS, SOL_SOCKET);
	sc = (struct sockcred *) CMSG_DATA(mtod(m, struct cmsghdr *));
	sc->sc_uid = td->td_ucred->cr_ruid;
	sc->sc_euid = td->td_ucred->cr_uid;
	sc->sc_gid = td->td_ucred->cr_rgid;
	sc->sc_egid = td->td_ucred->cr_gid;
	sc->sc_ngroups = ngroups;
	for (i = 0; i < sc->sc_ngroups; i++)
		sc->sc_groups[i] = td->td_ucred->cr_groups[i];
	/*
	 * If a control message already exists, append us to the end.
	 */
	if (control != NULL) {
		for (n = control; n->m_next != NULL; n = n->m_next)
1582 * unp_defer is thread-local during garbage collection, and does not require
1583 * explicit synchronization. unp_gcing prevents other threads from entering
1584 * garbage collection, and perhaps should be an sx lock instead.
1586 static int unp_defer, unp_gcing;
	/*
	 * Garbage collector for file descriptors in flight (function
	 * header elided in this excerpt).  Mark phase: find all files
	 * reachable from outside SCM_RIGHTS messages; sweep phase: any
	 * file whose only references are in-transit messages belongs to
	 * an unreachable cycle and is flushed and closed.
	 */
	struct file *fp, *nextfp;
	struct file **extra_ref, **fpp;
	int nfiles_slack = 20;
	/*
	 * Before going through all this, set all FDs to be NOT deferred
	 * and NOT externally accessible.
	 */
	sx_slock(&filelist_lock);
	LIST_FOREACH(fp, &filehead, f_list)
		fp->f_gcflag &= ~(FMARK|FDEFER);
	LIST_FOREACH(fp, &filehead, f_list) {
		/*
		 * If the file is not open, skip it.
		 */
		if (fp->f_count == 0) {
		/*
		 * If we already marked it as 'defer' in a previous
		 * pass, then try to process it this time.
		 */
		if (fp->f_gcflag & FDEFER) {
			fp->f_gcflag &= ~FDEFER;
		/*
		 * If it's not deferred, then check if it's already
		 * marked.. if so skip it.
		 */
		if (fp->f_gcflag & FMARK) {
		/*
		 * If all references are from messages in transit, then
		 * skip it.  It's not externally accessible.
		 */
		if (fp->f_count == fp->f_msgcount) {
		/*
		 * If it got this far then it must be externally
		 * accessible.
		 */
		fp->f_gcflag |= FMARK;
		/*
		 * Either it was deferred, or it is externally
		 * accessible and not already marked so.
		 * Now check if it is possibly one of OUR sockets.
		 */
		if (fp->f_type != DTYPE_SOCKET ||
		    (so = fp->f_data) == NULL) {
		if (so->so_proto->pr_domain != &localdomain ||
		    (so->so_proto->pr_flags&PR_RIGHTS) == 0)
		if (so->so_rcv.sb_flags & SB_LOCK) {
			/*
			 * This is problematical; it's not clear
			 * we need to wait for the sockbuf to be
			 * unlocked (on a uniprocessor, at least),
			 * and it's also not clear what to do
			 * if sbwait returns an error due to receipt
			 * of a signal. If sbwait does return
			 * an error, we'll go into an infinite
			 * loop. Delete all of this for now.
			 */
			(void) sbwait(&so->so_rcv);
		/*
		 * So, Ok, it's one of our sockets and it IS externally
		 * accessible (or was deferred).  Now we look to see if
		 * we hold any file descriptors in its message buffers.
		 * Follow those links and mark them as accessible too.
		 */
		SOCKBUF_LOCK(&so->so_rcv);
		unp_scan(so->so_rcv.sb_mb, unp_mark);
		SOCKBUF_UNLOCK(&so->so_rcv);
	} while (unp_defer);
	sx_sunlock(&filelist_lock);
	/*
	 * We grab an extra reference to each of the file table entries
	 * that are not otherwise accessible and then free the rights
	 * that are stored in messages on them.
	 *
	 * The bug in the original code is a little tricky, so I'll describe
	 * what's wrong with it here.
	 *
	 * It is incorrect to simply unp_discard each entry for f_msgcount
	 * times -- consider the case of sockets A and B that contain
	 * references to each other. On a last close of some other socket,
	 * we trigger a gc since the number of outstanding rights (unp_rights)
	 * is non-zero. If during the sweep phase the gc code un_discards,
	 * we end up doing a (full) closef on the descriptor. A closef on A
	 * results in the following chain. Closef calls soo_close, which
	 * calls soclose. Soclose calls first (through the switch
	 * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply
	 * returns because the previous instance had set unp_gcing, and
	 * we return all the way back to soclose, which marks the socket
	 * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush
	 * to free up the rights that are queued in messages on the socket A,
	 * i.e., the reference on B. The sorflush calls via the dom_dispose
	 * switch unp_dispose, which unp_scans with unp_discard. This second
	 * instance of unp_discard just calls closef on B.
	 *
	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
	 * which results in another closef on A. Unfortunately, A is already
	 * being closed, and the descriptor has already been marked with
	 * SS_NOFDREF, and soclose panics at this point.
	 *
	 * Here, we first take an extra reference to each inaccessible
	 * descriptor. Then, we call sorflush ourself, since we know
	 * it is a Unix domain socket anyhow. After we destroy all the
	 * rights carried in messages, we do a last closef to get rid
	 * of our extra reference. This is the last close, and the
	 * unp_detach etc will shut down the socket.
	 *
	 * 91/09/19, bsy@cs.cmu.edu
	 */
	nfiles_snap = openfiles + nfiles_slack;	/* some slack */
	extra_ref = malloc(nfiles_snap * sizeof(struct file *), M_TEMP,
	sx_slock(&filelist_lock);
	/* Snapshot went stale (files opened meanwhile): retry. */
	if (nfiles_snap < openfiles) {
		sx_sunlock(&filelist_lock);
		free(extra_ref, M_TEMP);
	for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref;
	    fp != NULL; fp = nextfp) {
		nextfp = LIST_NEXT(fp, f_list);
		/*
		 * If it's not open, skip it.
		 */
		if (fp->f_count == 0) {
		/*
		 * If all refs are from msgs, and it's not marked
		 * accessible then it must be referenced from some
		 * unreachable cycle of (shut-down) FDs, so include it
		 * in our list of FDs to remove.
		 */
		if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) {
	sx_sunlock(&filelist_lock);
	/*
	 * For each FD on our hit list, do the following two things.
	 */
	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
		struct file *tfp = *fpp;
		if (tfp->f_type == DTYPE_SOCKET &&
		    tfp->f_data != NULL) {
			sorflush(tfp->f_data);
	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
		closef(*fpp, (struct thread *) NULL);
	free(extra_ref, M_TEMP);
	UNP_UNLOCK_ASSERT();
/*
 * dom_dispose hook: discard any file references carried in an mbuf
 * chain that is being thrown away.  (Excerpted.)
 */
unp_dispose(struct mbuf *m)
	unp_scan(m, unp_discard);
/*
 * Prepare a socket for listen(2): cache the caller's credentials so
 * unp_connect() can hand them to peers via LOCAL_PEERCRED.  (Excerpted.)
 */
unp_listen(struct socket *so, struct unpcb *unp, struct thread *td)
	error = solisten_proto_check(so);
	cru2x(td->td_ucred, &unp->unp_peercred);
	unp->unp_flags |= UNP_HAVEPCCACHED;
/*
 * Walk an mbuf chain looking for SCM_RIGHTS control messages and apply
 * op (unp_mark or unp_discard) to every embedded struct file pointer.
 * (Excerpted; interior lines elided.)
 */
unp_scan(struct mbuf *m0, void (*op)(struct file *))
	socklen_t clen, datalen;
	while (m0 != NULL) {
		for (m = m0; m; m = m->m_next) {
			/* Only control mbufs can carry rights. */
			if (m->m_type != MT_CONTROL)
			cm = mtod(m, struct cmsghdr *);
			while (cm != NULL) {
				if (sizeof(*cm) > clen || cm->cmsg_len > clen)
				data = CMSG_DATA(cm);
				datalen = (caddr_t)cm + cm->cmsg_len
				if (cm->cmsg_level == SOL_SOCKET &&
				    cm->cmsg_type == SCM_RIGHTS) {
					qfds = datalen / sizeof (struct file *);
					for (i = 0; i < qfds; i++)
				/* Advance to the next aligned cmsg. */
				if (CMSG_SPACE(datalen) < clen) {
					clen -= CMSG_SPACE(datalen);
					cm = (struct cmsghdr *)
					    ((caddr_t)cm + CMSG_SPACE(datalen));
/*
 * GC mark hook: flag a file reachable and defer it for rescanning.
 * (Excerpted.)
 */
unp_mark(struct file *fp)
	if (fp->f_gcflag & FMARK)
	fp->f_gcflag |= (FMARK|FDEFER);
/*
 * GC discard hook: drop an in-flight file reference by closing it.
 * (Excerpted.)
 */
unp_discard(struct file *fp)
	(void) closef(fp, (struct thread *)NULL);