sys/kern/uipc_socket.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
   3  *      The Regents of the University of California.
   4  * Copyright (c) 2004 The FreeBSD Foundation
   5  * Copyright (c) 2004-2008 Robert N. M. Watson
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  * 4. Neither the name of the University nor the names of its contributors
  17  *    may be used to endorse or promote products derived from this software
  18  *    without specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30  * SUCH DAMAGE.
  31  *
  32  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
  33  */
  34
  35 /*
  36  * Comments on the socket life cycle:
  37  *
  38  * soalloc() sets up socket layer state for a socket, called only by
  39  * socreate() and sonewconn().  Socket layer private.
  40  *
  41  * sodealloc() tears down socket layer state for a socket; called by
  42  * sofree(), and by socreate()/sonewconn() on failure.  Socket layer private.
  43  *
  44  * pru_attach() associates protocol layer state with an allocated socket;
  45  * called only once, may fail, aborting socket allocation.  This is called
  46  * from socreate() and sonewconn().  Socket layer private.
  47  *
  48  * pru_detach() disassociates protocol layer state from an attached socket,
  49  * and will be called exactly once for sockets in which pru_attach() has
  50  * been successfully called.  If pru_attach() returned an error,
  51  * pru_detach() will not be called.  Socket layer private.
  52  *
  53  * pru_abort() and pru_close() notify the protocol layer that the last
  54  * consumer of a socket is starting to tear down the socket, and that the
  55  * protocol should terminate the connection.  Historically, pru_abort() also
  56  * detached protocol state from the socket state, but this is no longer the
  57  * case.
  58  *
  59  * socreate() creates a socket and attaches protocol state.  This is a public
  60  * interface that may be used by socket layer consumers to create new
  61  * sockets.
  62  *
  63  * sonewconn() creates a socket and attaches protocol state.  This is a
  64  * public interface that may be used by protocols to create new sockets when
  65  * a new connection is received and will be available for accept() on a
  66  * listen socket.
  67  *
  68  * soclose() destroys a socket after possibly waiting for it to disconnect.
  69  * This is a public interface that socket consumers should use to close and
  70  * release a socket when done with it.
  71  *
  72  * soabort() destroys a socket without waiting for it to disconnect (used
  73  * only for incoming connections that are already partially or fully
  74  * connected).  This is used internally by the socket layer when clearing
  75  * listen socket queues (due to overflow or close on the listen socket), but
  76  * is also a public interface protocols may use to abort connections in
  77  * their incomplete listen queues should they no longer be required.  Sockets
  78  * placed in completed connection listen queues should not be aborted for
  79  * reasons described in the comment above the soclose() implementation.  This
  80  * is not a general purpose close routine, and except in the specific
  81  * circumstances described here, should not be used.
  82  *
  83  * sofree() will free a socket and its protocol state if all references on
  84  * the socket have been released, and is the public interface to attempt to
  85  * free a socket when a reference is removed.  This is a socket layer private
  86  * interface.
  87  *
  88  * NOTE: In addition to socreate() and soclose(), which provide a single
  89  * socket reference to the consumer to be managed as required, there are two
  90  * calls to explicitly manage socket references, soref(), and sorele().
  91  * Currently, these are generally required only when transitioning a socket
  92  * from a listen queue to a file descriptor, in order to prevent garbage
  93  * collection of the socket at an untimely moment.  For a number of reasons,
  94  * these interfaces are not preferred, and should be avoided.
  95  */
  96
  97 #include <sys/cdefs.h>
  98 __FBSDID("$FreeBSD$");
  99
 100 #include "opt_inet.h"
 101 #include "opt_inet6.h"
 102 #include "opt_zero.h"
 103 #include "opt_compat.h"
 104
 105 #include <sys/param.h>
 106 #include <sys/systm.h>
 107 #include <sys/fcntl.h>
 108 #include <sys/limits.h>
 109 #include <sys/lock.h>
 110 #include <sys/mac.h>
 111 #include <sys/malloc.h>
 112 #include <sys/mbuf.h>
 113 #include <sys/mutex.h>
 114 #include <sys/domain.h>
 115 #include <sys/file.h>                   /* for struct knote */
 116 #include <sys/kernel.h>
 117 #include <sys/event.h>
 118 #include <sys/eventhandler.h>
 119 #include <sys/poll.h>
 120 #include <sys/proc.h>
 121 #include <sys/protosw.h>
 122 #include <sys/socket.h>
 123 #include <sys/socketvar.h>
 124 #include <sys/resourcevar.h>
 125 #include <net/route.h>
 126 #include <sys/signalvar.h>
 127 #include <sys/stat.h>
 128 #include <sys/sx.h>
 129 #include <sys/sysctl.h>
 130 #include <sys/uio.h>
 131 #include <sys/jail.h>
 132
 133 #include <net/vnet.h>
 134
 135 #include <security/mac/mac_framework.h>
 136
 137 #include <vm/uma.h>
 138
 139 #ifdef COMPAT_FREEBSD32
 140 #include <sys/mount.h>
 141 #include <sys/sysent.h>
 142 #include <compat/freebsd32/freebsd32.h>
 143 #endif
 144
 145 static int      soreceive_rcvoob(struct socket *so, struct uio *uio,
 146                     int flags);
 147 /* knote filter callbacks; their bodies lie outside this view. */
 148 static void     filt_sordetach(struct knote *kn);
 149 static int      filt_soread(struct knote *kn, long hint);
 150 static void     filt_sowdetach(struct knote *kn);
 151 static int      filt_sowrite(struct knote *kn, long hint);
 152 static int      filt_solisten(struct knote *kn, long hint);
 153 /* NOTE(review): slots presumably {isfd, attach, detach, event} -- confirm. */
 154 static struct filterops solisten_filtops =
 155         { 1, NULL, filt_sordetach, filt_solisten };
 156 static struct filterops soread_filtops =
 157         { 1, NULL, filt_sordetach, filt_soread };
 158 static struct filterops sowrite_filtops =
 159         { 1, NULL, filt_sowdetach, filt_sowrite };
 160
 161 uma_zone_t socket_zone;         /* zone that soalloc()/sodealloc() use */
 162 so_gen_t        so_gencnt;      /* generation count for sockets */
 163
 164 int     maxsockets;             /* global limit; see sysctl_maxsockets() */
 165
 166 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
 167 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 168
 169 static int somaxconn = SOMAXCONN;       /* clamp used by solisten_proto() */
 170 static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
 171 /* XXX: we don't have SYSCTL_USHORT */
 172 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
 173     0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
 174     "queue size");
 175 static int numopensockets;      /* adjusted under so_global_mtx */
 176 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
 177     &numopensockets, 0, "Number of open sockets");
 178 #ifdef ZERO_COPY_SOCKETS
 179 /* These aren't static because they're used in other files. */
 180 int so_zero_copy_send = 1;
 181 int so_zero_copy_receive = 1;
 182 SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
 183     "Zero copy controls");
 184 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
 185     &so_zero_copy_receive, 0, "Enable zero copy receive");
 186 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
 187     &so_zero_copy_send, 0, "Enable zero copy send");
 188 #endif /* ZERO_COPY_SOCKETS */
 189
 190 /*
 191  * accept_mtx locks down per-socket fields relating to accept queues.  See
 192  * socketvar.h for an annotation of the protected fields of struct socket.
 193  */
 194 struct mtx accept_mtx;
 195 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
 196
 197 /*
 198  * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 199  * so_gencnt field.  Under VIMAGE, vnet_sockcnt is also updated under it.
 200  */
 201 static struct mtx so_global_mtx;
 202 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
 203 /* NOTE(review): lock name "so_glabel" looks like a typo for "so_global". */
 204 /*
 205  * General IPC sysctl name space, used by sockets and a variety of other IPC
 206  * types.
 207  */
 208 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
 209
 210 /*
 211  * Sysctl to get and set the maximum global sockets limit.  Notify protocols
 212  * of the change so that they can update their dependent limits as required.
 213  */
 214 static int
 215 sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
 216 {
 217         int requested, rc;
 218
 219         requested = maxsockets;
 220         rc = sysctl_handle_int(oidp, &requested, 0, req);
 221         if (rc != 0 || req->newptr == NULL)
 222                 return (rc);            /* handler error, or no new value */
 223         if (requested <= maxsockets)
 224                 return (EINVAL);        /* the limit may only be raised */
 225         maxsockets = requested;
 226         if (maxsockets > ((maxfiles / 4) * 3)) {
 227                 /* Raise the file limits along with the socket limit. */
 228                 maxfiles = (maxsockets * 5) / 4;
 229                 maxfilesperproc = (maxfiles * 9) / 10;
 230         }
 231         EVENTHANDLER_INVOKE(maxsockets_change);
 232         return (0);
 233 }
 234
 235 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
 236     &maxsockets, 0, sysctl_maxsockets, "IU",
 237     "Maximum number of sockets avaliable");
 238
 239 /*
 240  * Initialise maxsockets.  This SYSINIT must be run after
 241  * tunable_mbinit().
 242  */
 243 static void
 244 init_maxsockets(void *ignored)
 245 {
 246         /* Fetch the kern.ipc.maxsockets tunable, then clamp from below. */
 247         TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
 248         maxsockets = imax(imax(maxsockets, maxfiles), nmbclusters);
 249 }
 250 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
 251
 252 /*
 253  * Socket operation routines.  These routines are called by the routines in
 254  * sys_socket.c or from a system process, and implement the semantics of
 255  * socket operations by switching out to the protocol specific routines.
 256  */
 257
 258 /*
 259  * Get a socket structure from our zone, and initialize it.  Note that it
 260  * would probably be better to allocate socket and PCB at the same time, but
 261  * I'm not convinced that all the protocols can be easily modified to do
 262  * this.
 263  *
 264  * soalloc() returns a socket with a ref count of 0.
 265  */
 266 static struct socket *
 267 soalloc(struct vnet *vnet)
 268 {
 269         struct socket *so;
 270         /* M_NOWAIT | M_ZERO allocation; a NULL result goes to the caller. */
 271         so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
 272         if (so == NULL)
 273                 return (NULL);
 274 #ifdef MAC
 275         if (mac_socket_init(so, M_NOWAIT) != 0) {
 276                 uma_zfree(socket_zone, so);     /* undo the allocation */
 277                 return (NULL);
 278         }
 279 #endif
 280         SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
 281         SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
 282         sx_init(&so->so_snd.sb_sx, "so_snd_sx");
 283         sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
 284         TAILQ_INIT(&so->so_aiojobq);
 285         mtx_lock(&so_global_mtx);       /* covers gencnt and the counters */
 286         so->so_gencnt = ++so_gencnt;
 287         ++numopensockets;
 288 #ifdef VIMAGE
 289         vnet->vnet_sockcnt++;
 290         so->so_vnet = vnet;
 291 #endif
 292         mtx_unlock(&so_global_mtx);
 293         return (so);
 294 }
 295
 296 /*
 297  * Free the storage associated with a socket at the socket layer, tear down
 298  * locks, labels, etc.  All protocol state is assumed already to have been
 299  * torn down (and possibly never set up) by the caller.
 300  */
 301 static void
 302 sodealloc(struct socket *so)
 303 {
 304         /* Callers must arrive with no references and no attached PCB. */
 305         KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
 306         KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
 307         /* Bump the generation count and drop the global socket counters. */
 308         mtx_lock(&so_global_mtx);
 309         so->so_gencnt = ++so_gencnt;
 310         --numopensockets;       /* Could be below, but faster here. */
 311 #ifdef VIMAGE
 312         so->so_vnet->vnet_sockcnt--;
 313 #endif
 314         mtx_unlock(&so_global_mtx);
 315         if (so->so_rcv.sb_hiwat)        /* set sb_hiwat to 0 via chgsbsize() */
 316                 (void)chgsbsize(so->so_cred->cr_uidinfo,
 317                     &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
 318         if (so->so_snd.sb_hiwat)
 319                 (void)chgsbsize(so->so_cred->cr_uidinfo,
 320                     &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
 321 #ifdef INET
 322         /* Remove the accept filter if one is present. */
 323         if (so->so_accf != NULL)
 324                 do_setopt_accept_filter(so, NULL);
 325 #endif
 326 #ifdef MAC
 327         mac_socket_destroy(so);
 328 #endif
 329         crfree(so->so_cred);    /* so_cred is not used past this point */
 330         sx_destroy(&so->so_snd.sb_sx);
 331         sx_destroy(&so->so_rcv.sb_sx);
 332         SOCKBUF_LOCK_DESTROY(&so->so_snd);
 333         SOCKBUF_LOCK_DESTROY(&so->so_rcv);
 334         uma_zfree(socket_zone, so);
 335 }
 336
 337 /*
 338  * socreate returns a socket with a ref count of 1.  The socket should be
 339  * closed with soclose().
 340  */
 341 int
 342 socreate(int dom, struct socket **aso, int type, int proto,
 343     struct ucred *cred, struct thread *td)
 344 {
 345         struct protosw *prp;
 346         struct socket *so;
 347         int error;
 348         /* Look up the protocol switch entry by protocol number or by type. */
 349         if (proto)
 350                 prp = pffindproto(dom, proto, type);
 351         else
 352                 prp = pffindtype(dom, type);
 353         /* No entry, or an entry that cannot attach: EPROTONOSUPPORT. */
 354         if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
 355             prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
 356                 return (EPROTONOSUPPORT);
 357         /* prison_check_af() must accept the family for this credential. */
 358         if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
 359                 return (EPROTONOSUPPORT);
 360
 361         if (prp->pr_type != type)
 362                 return (EPROTOTYPE);
 363         so = soalloc(CRED_TO_VNET(cred));
 364         if (so == NULL)
 365                 return (ENOBUFS);
 366         /* Fill in socket-layer state before the protocol attaches. */
 367         TAILQ_INIT(&so->so_incomp);
 368         TAILQ_INIT(&so->so_comp);
 369         so->so_type = type;
 370         so->so_cred = crhold(cred);
 371         if ((prp->pr_domain->dom_family == PF_INET) ||
 372             (prp->pr_domain->dom_family == PF_ROUTE))
 373                 so->so_fibnum = td->td_proc->p_fibnum;
 374         else
 375                 so->so_fibnum = 0;
 376         so->so_proto = prp;
 377 #ifdef MAC
 378         mac_socket_create(cred, so);
 379 #endif
 380         knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
 381         knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
 382         so->so_count = 1;       /* the single reference handed to the caller */
 383         /*
 384          * Auto-sizing of socket buffers is managed by the protocols and
 385          * the appropriate flags must be set in the pru_attach function.
 386          */
 387         CURVNET_SET(so->so_vnet);
 388         error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
 389         CURVNET_RESTORE();
 390         if (error) {
 391                 KASSERT(so->so_count == 1, ("socreate: so_count %d",
 392                     so->so_count));
 393                 so->so_count = 0;       /* sodealloc() asserts a zero count */
 394                 sodealloc(so);
 395                 return (error);
 396         }
 397         *aso = so;              /* written only on success */
 398         return (0);
 399 }
 400
 401 #ifdef REGRESSION
 402 static int regression_sonewconn_earlytest = 1;
 403 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
 404     &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
 405 #endif
 406
 407 /*
 408  * When an attempt at a new connection is noted on a socket which accepts
 409  * connections, sonewconn is called.  If the connection is possible (subject
 410  * to space constraints, etc.) then we allocate a new structure, properly
 411  * linked into the data structure of the original socket, and return this.
 412  * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
 413  *
 414  * Note: the ref count on the socket is 0 on return; NULL means failure.
 415  */
 416 struct socket *
 417 sonewconn(struct socket *head, int connstatus)
 418 {
 419         struct socket *so;
 420         int over;
 421         /* Sample the over-limit test (3/2 of so_qlimit) under the lock. */
 422         ACCEPT_LOCK();
 423         over = (head->so_qlen > 3 * head->so_qlimit / 2);
 424         ACCEPT_UNLOCK();
 425 #ifdef REGRESSION
 426         if (regression_sonewconn_earlytest && over)
 427 #else
 428         if (over)
 429 #endif
 430                 return (NULL);
 431         VNET_ASSERT(head->so_vnet);
 432         so = soalloc(head->so_vnet);
 433         if (so == NULL)
 434                 return (NULL);
 435         if ((head->so_options & SO_ACCEPTFILTER) != 0)
 436                 connstatus = 0;         /* goes onto so_incomp below */
 437         so->so_head = head;
 438         so->so_type = head->so_type;
 439         so->so_options = head->so_options &~ SO_ACCEPTCONN;
 440         so->so_linger = head->so_linger;
 441         so->so_state = head->so_state | SS_NOFDREF;
 442         so->so_fibnum = head->so_fibnum;
 443         so->so_proto = head->so_proto;
 444         so->so_cred = crhold(head->so_cred);
 445 #ifdef MAC
 446         mac_socket_newconn(head, so);
 447 #endif
 448         knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
 449         knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
 450         if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
 451             (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
 452                 sodealloc(so);          /* so_count is still 0 here */
 453                 return (NULL);
 454         }
 455         so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
 456         so->so_snd.sb_lowat = head->so_snd.sb_lowat;
 457         so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
 458         so->so_snd.sb_timeo = head->so_snd.sb_timeo;
 459         so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
 460         so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
 461         so->so_state |= connstatus;
 462         ACCEPT_LOCK();
 463         if (connstatus) {
 464                 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
 465                 so->so_qstate |= SQ_COMP;
 466                 head->so_qlen++;
 467         } else {
 468                 /*
 469                  * Keep removing sockets from the head until there's room for
 470                  * us to insert on the tail.  In pre-locking revisions, this
 471                  * was a simple if(), but as we could be racing with other
 472                  * threads and soabort() requires dropping locks, we must
 473                  * loop waiting for the condition to be true.
 474                  */
 475                 while (head->so_incqlen > head->so_qlimit) {
 476                         struct socket *sp;
 477                         sp = TAILQ_FIRST(&head->so_incomp);
 478                         TAILQ_REMOVE(&head->so_incomp, sp, so_list);
 479                         head->so_incqlen--;
 480                         sp->so_qstate &= ~SQ_INCOMP;
 481                         sp->so_head = NULL;
 482                         ACCEPT_UNLOCK();        /* soabort() runs unlocked */
 483                         soabort(sp);
 484                         ACCEPT_LOCK();
 485                 }
 486                 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
 487                 so->so_qstate |= SQ_INCOMP;
 488                 head->so_incqlen++;
 489         }
 490         ACCEPT_UNLOCK();
 491         if (connstatus) {       /* queued on so_comp: wake the listener */
 492                 sorwakeup(head);
 493                 wakeup_one(&head->so_timeo);
 494         }
 495         return (so);
 496 }
 497
 498 int
 499 sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
 500 {
 501         int rc;
 502
 503         CURVNET_SET(so->so_vnet);       /* vnet context for the call */
 504         rc = so->so_proto->pr_usrreqs->pru_bind(so, nam, td);
 505         CURVNET_RESTORE();
 506         return (rc);
 507 }
 508
 509 /*
 510  * solisten() transitions a socket from a non-listening state to a listening
 511  * state, but can also be used to update the listen queue depth on an
 512  * existing listen socket.  The protocol will call back into the sockets
 513  * layer using solisten_proto_check() and solisten_proto() to check and set
 514  * socket-layer listen state.  Call backs are used so that the protocol can
 515  * acquire both protocol and socket layer locks in whatever order is required
 516  * by the protocol.
 517  *
 518  * Protocol implementors are advised to hold the socket lock across the
 519  * socket-layer test and set to avoid races at the socket layer.
 520  */
 521 int
 522 solisten(struct socket *so, int backlog, struct thread *td)
 523 {
 524         /* Listening is delegated entirely to the protocol's pru_listen. */
 525         return (so->so_proto->pr_usrreqs->pru_listen(so, backlog, td));
 526 }
 527
 528 int
 529 solisten_proto_check(struct socket *so)
 530 {
 531         const int busy = SS_ISCONNECTED | SS_ISCONNECTING |
 532             SS_ISDISCONNECTING;
 533
 534         SOCK_LOCK_ASSERT(so);
 535
 536         /* Any connection state bit set in so_state yields EINVAL. */
 537         return ((so->so_state & busy) != 0 ? EINVAL : 0);
 538 }
 539
 540 void
 541 solisten_proto(struct socket *so, int backlog)
 542 {
 543
 544         SOCK_LOCK_ASSERT(so);
 545
 546         /* Negative or oversized requests are clamped to somaxconn. */
 547         so->so_qlimit = (backlog >= 0 && backlog <= somaxconn) ?
 548             backlog : somaxconn;
 549         so->so_options |= SO_ACCEPTCONN;
 550 }
 551
 552 /*
 553  * Attempt to free a socket.  This should really be sotryfree().
 554  *
 555  * sofree() will succeed if:
 556  *
 557  * - There are no outstanding file descriptor references or related consumers
 558  *   (so_count == 0).
 559  *
 560  * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 561  *
 562  * - The protocol does not have an outstanding strong reference on the socket
 563  *   (SS_PROTOREF).
 564  *
 565  * - The socket is not in a completed connection queue, so a process has been
 566  *   notified that it is present.  If it is removed, the user process may
 567  *   block in accept() despite select() saying the socket was ready.
 568  *
 569  * Otherwise, it will quietly abort so that a future call to sofree(), when
 570  * conditions are right, can succeed.
 571  */
 572 void
 573 sofree(struct socket *so)
 574 {
 575         struct protosw *pr = so->so_proto;
 576         struct socket *head;
 577
 578         ACCEPT_LOCK_ASSERT();
 579         SOCK_LOCK_ASSERT(so);
 580         /* Not freeable yet: drop both locks so a later sofree() can retry. */
 581         if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
 582             (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
 583                 SOCK_UNLOCK(so);
 584                 ACCEPT_UNLOCK();
 585                 return;
 586         }
 587         /* SQ_COMP was ruled out above, so a queued socket is on so_incomp. */
 588         head = so->so_head;
 589         if (head != NULL) {
 590                 KASSERT((so->so_qstate & SQ_COMP) != 0 ||
 591                     (so->so_qstate & SQ_INCOMP) != 0,
 592                     ("sofree: so_head != NULL, but neither SQ_COMP nor "
 593                     "SQ_INCOMP"));
 594                 KASSERT((so->so_qstate & SQ_COMP) == 0 ||
 595                     (so->so_qstate & SQ_INCOMP) == 0,
 596                     ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
 597                 TAILQ_REMOVE(&head->so_incomp, so, so_list);
 598                 head->so_incqlen--;
 599                 so->so_qstate &= ~SQ_INCOMP;
 600                 so->so_head = NULL;
 601         }
 602         KASSERT((so->so_qstate & SQ_COMP) == 0 &&
 603             (so->so_qstate & SQ_INCOMP) == 0,
 604             ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
 605             so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
 606         if (so->so_options & SO_ACCEPTCONN) {
 607                 KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
 608                 KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
 609         }
 610         SOCK_UNLOCK(so);
 611         ACCEPT_UNLOCK();
 612         /* Both locks are dropped before calling into the domain/protocol. */
 613         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
 614                 (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
 615         if (pr->pr_usrreqs->pru_detach != NULL)
 616                 (*pr->pr_usrreqs->pru_detach)(so);
 617
 618         /*
 619          * From this point on, we assume that no other references to this
 620          * socket exist anywhere else in the stack.  Therefore, no locks need
 621          * to be acquired or held.
 622          *
 623          * We used to do a lot of socket buffer and socket locking here, as
 624          * well as invoke sorflush() and perform wakeups.  The direct call to
 625          * dom_dispose() and sbrelease_internal() are an inlining of what was
 626          * necessary from sorflush().
 627          *
 628          * Notice that pru_detach has already run by the time the socket
 629          * buffer and kqueue state are torn down below.  Protocols should
 630          * still avoid socket wakeups, etc, in their detach code.
 631          */
 632         sbdestroy(&so->so_snd, so);
 633         sbdestroy(&so->so_rcv, so);
 634         knlist_destroy(&so->so_rcv.sb_sel.si_note);
 635         knlist_destroy(&so->so_snd.sb_sel.si_note);
 636         sodealloc(so);
 637 }
 638
 639 /*
 640  * Close a socket on last file table reference removal.  Initiate disconnect
 641  * if connected.  Free socket when disconnect complete.
 642  *
 643  * This function will sorele() the socket.  Note that soclose() may be called
 644  * prior to the ref count reaching zero.  The actual socket structure will
 645  * not be freed until the ref count reaches zero.
 646  */
 647 int
 648 soclose(struct socket *so)
 649 {
 650         int error = 0;
 651
 652         KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
 653         /* Disconnect first; with SO_LINGER, sleep until no longer connected. */
 654         CURVNET_SET(so->so_vnet);
 655         funsetown(&so->so_sigio);
 656         if (so->so_state & SS_ISCONNECTED) {
 657                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
 658                         error = sodisconnect(so);
 659                         if (error)
 660                                 goto drop;
 661                 }
 662                 if (so->so_options & SO_LINGER) {
 663                         if ((so->so_state & SS_ISDISCONNECTING) &&
 664                             (so->so_state & SS_NBIO))
 665                                 goto drop;      /* non-blocking: no wait */
 666                         while (so->so_state & SS_ISCONNECTED) {
 667                                 error = tsleep(&so->so_timeo,
 668                                     PSOCK | PCATCH, "soclos", so->so_linger * hz);
 669                                 if (error)
 670                                         break;
 671                         }
 672                 }
 673         }
 674         /* Tell the protocol, then abort whatever a listen socket has queued. */
 675 drop:
 676         if (so->so_proto->pr_usrreqs->pru_close != NULL)
 677                 (*so->so_proto->pr_usrreqs->pru_close)(so);
 678         if (so->so_options & SO_ACCEPTCONN) {
 679                 struct socket *sp;
 680                 ACCEPT_LOCK();
 681                 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
 682                         TAILQ_REMOVE(&so->so_incomp, sp, so_list);
 683                         so->so_incqlen--;
 684                         sp->so_qstate &= ~SQ_INCOMP;
 685                         sp->so_head = NULL;
 686                         ACCEPT_UNLOCK();        /* soabort() runs unlocked */
 687                         soabort(sp);
 688                         ACCEPT_LOCK();
 689                 }
 690                 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
 691                         TAILQ_REMOVE(&so->so_comp, sp, so_list);
 692                         so->so_qlen--;
 693                         sp->so_qstate &= ~SQ_COMP;
 694                         sp->so_head = NULL;
 695                         ACCEPT_UNLOCK();        /* soabort() runs unlocked */
 696                         soabort(sp);
 697                         ACCEPT_LOCK();
 698                 }
 699                 ACCEPT_UNLOCK();
 700         }
 701         ACCEPT_LOCK();
 702         SOCK_LOCK(so);
 703         KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
 704         so->so_state |= SS_NOFDREF;
 705         sorele(so);     /* NOTE(review): presumably drops both locks -- confirm */
 706         CURVNET_RESTORE();
 707         return (error);
 708 }
 709
 710 /*
 711  * soabort() is used to abruptly tear down a connection, such as when a
 712  * resource limit is reached (listen queue depth exceeded), or if a listen
 713  * socket is closed while there are sockets waiting to be accepted.
 714  *
 715  * This interface is tricky, because it is called on an unreferenced socket,
 716  * and must be called only by a thread that has actually removed the socket
 717  * from the listen queue it was on, or races with other threads are risked.
 718  *
 719  * This interface will call into the protocol code, so must not be called
 720  * with any socket locks held.  Protocols do call it while holding their own
 721  * recursible protocol mutexes, but this is something that should be subject
 722  * to review in the future.
 723  */
 724 void
 725 soabort(struct socket *so)
 726 {
 727
 728         /*
 729          * In as much as is possible, assert that no references to this
 730          * socket are held.  This is not quite the same as asserting that the
 731          * current thread is responsible for arranging for no references, but
 732          * is as close as we can get for now.
 733          */
 734         KASSERT(so->so_count == 0, ("soabort: so_count"));
 735         KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
 736         KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
 737         KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
 738         KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
 739         /* Let the protocol abort, then try to free; sofree() drops the locks. */
 740         if (so->so_proto->pr_usrreqs->pru_abort != NULL)
 741                 (*so->so_proto->pr_usrreqs->pru_abort)(so);
 742         ACCEPT_LOCK();
 743         SOCK_LOCK(so);
 744         sofree(so);
 745 }
 746
 747 int
 748 soaccept(struct socket *so, struct sockaddr **nam)
 749 {
 750
 751         /* Drop SS_NOFDREF under the socket lock, then ask the protocol. */
 752         SOCK_LOCK(so);
 753         KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
 754         so->so_state &= ~SS_NOFDREF;
 755         SOCK_UNLOCK(so);
 756
 757         return (so->so_proto->pr_usrreqs->pru_accept(so, nam));
 758 }
 759
 760 int
 761 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
 762 {
 763         int error;
 764
 765         if ((so->so_options & SO_ACCEPTCONN) != 0)
 766                 return (EOPNOTSUPP);    /* listen sockets cannot connect */
 767
 768         CURVNET_SET(so->so_vnet);
 769         /*
 770          * A connection-based protocol may connect only once.  For other
 771          * protocols an existing (or pending) connection is torn down
 772          * first, which lets the user disconnect by connecting to, e.g.,
 773          * a null address.  Either a second connect or a failed
 774          * disconnect is reported as EISCONN.
 775          */
 776         error = 0;
 777         if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) != 0 &&
 778             ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0 ||
 779             sodisconnect(so) != 0))
 780                 error = EISCONN;
 781         if (error == 0) {
 782                 /* Discard any error left over from an earlier connection. */
 783                 so->so_error = 0;
 784                 error = so->so_proto->pr_usrreqs->pru_connect(so, nam, td);
 785         }
 786         CURVNET_RESTORE();
 787
 788         return (error);
 789 }
 790
 791 int
 792 soconnect2(struct socket *so1, struct socket *so2)
 793 {
 794
 795         return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
 796 }
 797
 798 int
 799 sodisconnect(struct socket *so)
 800 {
 801         int error;
 802
 803         if ((so->so_state & SS_ISCONNECTED) == 0)
 804                 return (ENOTCONN);
 805         if (so->so_state & SS_ISDISCONNECTING)
 806                 return (EALREADY);
 807         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
 808         return (error);
 809 }
 810
 811 #ifdef ZERO_COPY_SOCKETS
 812 struct so_zerocopy_stats{
 813         int size_ok;
 814         int align_ok;
 815         int found_ifp;
 816 };
 817 struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
 818 #include <netinet/in.h>
 819 #include <net/route.h>
 820 #include <netinet/in_pcb.h>
 821 #include <vm/vm.h>
 822 #include <vm/vm_page.h>
 823 #include <vm/vm_object.h>
 824
 825 /*
 826  * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
 827  * sosend_dgram() and sosend_generic() use m_uiotombuf().
 828  *
 829  * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
 830  * all of the data referenced by the uio.  If desired, it uses zero-copy.
 831  * *space will be updated to reflect data copied in.
 832  *
 833  * NB: If atomic I/O is requested, the caller must already have checked that
 834  * space can hold resid bytes.
 835  *
 * NB: In the event of an error, the caller may need to free the partial
 * chain pointed to by *retmp.  The contents of both *uio and *space may be
 * modified even in the case of an error.
 839  */
 840 static int
 841 sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
 842     int flags)
 843 {
 844         struct mbuf *m, **mp, *top;
 845         long len, resid;
 846         int error;
 847 #ifdef ZERO_COPY_SOCKETS
 848         int cow_send;
 849 #endif
 850
 851         *retmp = top = NULL;
 852         mp = &top;
 853         len = 0;
 854         resid = uio->uio_resid;
 855         error = 0;
 856         do {
 857 #ifdef ZERO_COPY_SOCKETS
 858                 cow_send = 0;
 859 #endif /* ZERO_COPY_SOCKETS */
 860                 if (resid >= MINCLSIZE) {
 861 #ifdef ZERO_COPY_SOCKETS
 862                         if (top == NULL) {
 863                                 m = m_gethdr(M_WAITOK, MT_DATA);
 864                                 m->m_pkthdr.len = 0;
 865                                 m->m_pkthdr.rcvif = NULL;
 866                         } else
 867                                 m = m_get(M_WAITOK, MT_DATA);
 868                         if (so_zero_copy_send &&
 869                             resid>=PAGE_SIZE &&
 870                             *space>=PAGE_SIZE &&
 871                             uio->uio_iov->iov_len>=PAGE_SIZE) {
 872                                 so_zerocp_stats.size_ok++;
 873                                 so_zerocp_stats.align_ok++;
 874                                 cow_send = socow_setup(m, uio);
 875                                 len = cow_send;
 876                         }
 877                         if (!cow_send) {
 878                                 m_clget(m, M_WAITOK);
 879                                 len = min(min(MCLBYTES, resid), *space);
 880                         }
 881 #else /* ZERO_COPY_SOCKETS */
 882                         if (top == NULL) {
 883                                 m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
 884                                 m->m_pkthdr.len = 0;
 885                                 m->m_pkthdr.rcvif = NULL;
 886                         } else
 887                                 m = m_getcl(M_WAIT, MT_DATA, 0);
 888                         len = min(min(MCLBYTES, resid), *space);
 889 #endif /* ZERO_COPY_SOCKETS */
 890                 } else {
 891                         if (top == NULL) {
 892                                 m = m_gethdr(M_WAIT, MT_DATA);
 893                                 m->m_pkthdr.len = 0;
 894                                 m->m_pkthdr.rcvif = NULL;
 895
 896                                 len = min(min(MHLEN, resid), *space);
 897                                 /*
 898                                  * For datagram protocols, leave room
 899                                  * for protocol headers in first mbuf.
 900                                  */
 901                                 if (atomic && m && len < MHLEN)
 902                                         MH_ALIGN(m, len);
 903                         } else {
 904                                 m = m_get(M_WAIT, MT_DATA);
 905                                 len = min(min(MLEN, resid), *space);
 906                         }
 907                 }
 908                 if (m == NULL) {
 909                         error = ENOBUFS;
 910                         goto out;
 911                 }
 912
 913                 *space -= len;
 914 #ifdef ZERO_COPY_SOCKETS
 915                 if (cow_send)
 916                         error = 0;
 917                 else
 918 #endif /* ZERO_COPY_SOCKETS */
 919                 error = uiomove(mtod(m, void *), (int)len, uio);
 920                 resid = uio->uio_resid;
 921                 m->m_len = len;
 922                 *mp = m;
 923                 top->m_pkthdr.len += len;
 924                 if (error)
 925                         goto out;
 926                 mp = &m->m_next;
 927                 if (resid <= 0) {
 928                         if (flags & MSG_EOR)
 929                                 top->m_flags |= M_EOR;
 930                         break;
 931                 }
 932         } while (*space > 0 && atomic);
 933 out:
 934         *retmp = top;
 935         return (error);
 936 }
 937 #endif /*ZERO_COPY_SOCKETS*/
 938
/*
 * Derive the wait argument passed to sblock() from a set of send/receive
 * flags: 0 when MSG_DONTWAIT is set, SBL_WAIT otherwise.
 */
#define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
 940
 941 int
 942 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
 943     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
 944 {
 945         long space, resid;
 946         int clen = 0, error, dontroute;
 947 #ifdef ZERO_COPY_SOCKETS
 948         int atomic = sosendallatonce(so) || top;
 949 #endif
 950
 951         KASSERT(so->so_type == SOCK_DGRAM, ("sodgram_send: !SOCK_DGRAM"));
 952         KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
 953             ("sodgram_send: !PR_ATOMIC"));
 954
 955         if (uio != NULL)
 956                 resid = uio->uio_resid;
 957         else
 958                 resid = top->m_pkthdr.len;
 959         /*
 960          * In theory resid should be unsigned.  However, space must be
 961          * signed, as it might be less than 0 if we over-committed, and we
 962          * must use a signed comparison of space and resid.  On the other
 963          * hand, a negative resid causes us to loop sending 0-length
 964          * segments to the protocol.
 965          */
 966         if (resid < 0) {
 967                 error = EINVAL;
 968                 goto out;
 969         }
 970
 971         dontroute =
 972             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
 973         if (td != NULL)
 974                 td->td_ru.ru_msgsnd++;
 975         if (control != NULL)
 976                 clen = control->m_len;
 977
 978         SOCKBUF_LOCK(&so->so_snd);
 979         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 980                 SOCKBUF_UNLOCK(&so->so_snd);
 981                 error = EPIPE;
 982                 goto out;
 983         }
 984         if (so->so_error) {
 985                 error = so->so_error;
 986                 so->so_error = 0;
 987                 SOCKBUF_UNLOCK(&so->so_snd);
 988                 goto out;
 989         }
 990         if ((so->so_state & SS_ISCONNECTED) == 0) {
 991                 /*
 992                  * `sendto' and `sendmsg' is allowed on a connection-based
 993                  * socket if it supports implied connect.  Return ENOTCONN if
 994                  * not connected and no address is supplied.
 995                  */
 996                 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
 997                     (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
 998                         if ((so->so_state & SS_ISCONFIRMING) == 0 &&
 999                             !(resid == 0 && clen != 0)) {
1000                                 SOCKBUF_UNLOCK(&so->so_snd);
1001                                 error = ENOTCONN;
1002                                 goto out;
1003                         }
1004                 } else if (addr == NULL) {
1005                         if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1006                                 error = ENOTCONN;
1007                         else
1008                                 error = EDESTADDRREQ;
1009                         SOCKBUF_UNLOCK(&so->so_snd);
1010                         goto out;
1011                 }
1012         }
1013
1014         /*
1015          * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
1016          * problem and need fixing.
1017          */
1018         space = sbspace(&so->so_snd);
1019         if (flags & MSG_OOB)
1020                 space += 1024;
1021         space -= clen;
1022         SOCKBUF_UNLOCK(&so->so_snd);
1023         if (resid > space) {
1024                 error = EMSGSIZE;
1025                 goto out;
1026         }
1027         if (uio == NULL) {
1028                 resid = 0;
1029                 if (flags & MSG_EOR)
1030                         top->m_flags |= M_EOR;
1031         } else {
1032 #ifdef ZERO_COPY_SOCKETS
1033                 error = sosend_copyin(uio, &top, atomic, &space, flags);
1034                 if (error)
1035                         goto out;
1036 #else
1037                 /*
1038                  * Copy the data from userland into a mbuf chain.
1039                  * If no data is to be copied in, a single empty mbuf
1040                  * is returned.
1041                  */
1042                 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1043                     (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1044                 if (top == NULL) {
1045                         error = EFAULT; /* only possible error */
1046                         goto out;
1047                 }
1048                 space -= resid - uio->uio_resid;
1049 #endif
1050                 resid = uio->uio_resid;
1051         }
1052         KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1053         /*
1054          * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1055          * than with.
1056          */
1057         if (dontroute) {
1058                 SOCK_LOCK(so);
1059                 so->so_options |= SO_DONTROUTE;
1060                 SOCK_UNLOCK(so);
1061         }
1062         /*
1063          * XXX all the SBS_CANTSENDMORE checks previously done could be out
1064          * of date.  We could have recieved a reset packet in an interrupt or
1065          * maybe we slept while doing page faults in uiomove() etc.  We could
1066          * probably recheck again inside the locking protection here, but
1067          * there are probably other places that this also happens.  We must
1068          * rethink this.
1069          */
1070         error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1071             (flags & MSG_OOB) ? PRUS_OOB :
1072         /*
1073          * If the user set MSG_EOF, the protocol understands this flag and
1074          * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
1075          */
1076             ((flags & MSG_EOF) &&
1077              (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1078              (resid <= 0)) ?
1079                 PRUS_EOF :
1080                 /* If there is more to send set PRUS_MORETOCOME */
1081                 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1082                 top, addr, control, td);
1083         if (dontroute) {
1084                 SOCK_LOCK(so);
1085                 so->so_options &= ~SO_DONTROUTE;
1086                 SOCK_UNLOCK(so);
1087         }
1088         clen = 0;
1089         control = NULL;
1090         top = NULL;
1091 out:
1092         if (top != NULL)
1093                 m_freem(top);
1094         if (control != NULL)
1095                 m_freem(control);
1096         return (error);
1097 }
1098
1099 /*
1100  * Send on a socket.  If send must go all at once and message is larger than
1101  * send buffering, then hard error.  Lock against other senders.  If must go
1102  * all at once and not enough room now, then inform user that this would
1103  * block and do nothing.  Otherwise, if nonblocking, send as much as
1104  * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1105  * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1106  * in mbuf chain must be small enough to send all at once.
1107  *
1108  * Returns nonzero on error, timeout or signal; callers must check for short
1109  * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1110  * on return.
1111  */
int
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
        long space, resid;
        int clen = 0, error, dontroute;
        /* A caller-supplied mbuf chain is always sent all at once. */
        int atomic = sosendallatonce(so) || top;

        if (uio != NULL)
                resid = uio->uio_resid;
        else
                resid = top->m_pkthdr.len;
        /*
         * In theory resid should be unsigned.  However, space must be
         * signed, as it might be less than 0 if we over-committed, and we
         * must use a signed comparison of space and resid.  On the other
         * hand, a negative resid causes us to loop sending 0-length
         * segments to the protocol.
         *
         * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
         * type sockets since that's an error.
         */
        if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
                error = EINVAL;
                goto out;
        }

        /*
         * SO_DONTROUTE is only toggled around the send for PR_ATOMIC
         * protocols, and only if the option is not already set.
         */
        dontroute =
            (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
            (so->so_proto->pr_flags & PR_ATOMIC);
        if (td != NULL)
                td->td_ru.ru_msgsnd++;
        if (control != NULL)
                clen = control->m_len;

        /*
         * Lock against other senders.  On failure the sockbuf is not
         * locked, so skip the sbunlock() at 'release'.
         */
        error = sblock(&so->so_snd, SBLOCKWAIT(flags));
        if (error)
                goto out;

        /* Socket state is re-checked from scratch after every sbwait(). */
restart:
        do {
                SOCKBUF_LOCK(&so->so_snd);
                if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
                        SOCKBUF_UNLOCK(&so->so_snd);
                        error = EPIPE;
                        goto release;
                }
                /* Report and clear any pending socket error. */
                if (so->so_error) {
                        error = so->so_error;
                        so->so_error = 0;
                        SOCKBUF_UNLOCK(&so->so_snd);
                        goto release;
                }
                if ((so->so_state & SS_ISCONNECTED) == 0) {
                        /*
                         * `sendto' and `sendmsg' is allowed on a connection-
                         * based socket if it supports implied connect.
                         * Return ENOTCONN if not connected and no address is
                         * supplied.
                         */
                        if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
                            (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
                                if ((so->so_state & SS_ISCONFIRMING) == 0 &&
                                    !(resid == 0 && clen != 0)) {
                                        SOCKBUF_UNLOCK(&so->so_snd);
                                        error = ENOTCONN;
                                        goto release;
                                }
                        } else if (addr == NULL) {
                                SOCKBUF_UNLOCK(&so->so_snd);
                                if (so->so_proto->pr_flags & PR_CONNREQUIRED)
                                        error = ENOTCONN;
                                else
                                        error = EDESTADDRREQ;
                                goto release;
                        }
                }
                space = sbspace(&so->so_snd);
                if (flags & MSG_OOB)
                        space += 1024;
                /*
                 * Hard error: an atomic message, or the control data alone,
                 * can never fit within the send buffer's high-water mark.
                 */
                if ((atomic && resid > so->so_snd.sb_hiwat) ||
                    clen > so->so_snd.sb_hiwat) {
                        SOCKBUF_UNLOCK(&so->so_snd);
                        error = EMSGSIZE;
                        goto release;
                }
                /*
                 * Not enough room at the moment: fail with EWOULDBLOCK for
                 * non-blocking senders, otherwise sleep in sbwait() and
                 * start over.
                 */
                if (space < resid + clen &&
                    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
                        if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
                                SOCKBUF_UNLOCK(&so->so_snd);
                                error = EWOULDBLOCK;
                                goto release;
                        }
                        error = sbwait(&so->so_snd);
                        SOCKBUF_UNLOCK(&so->so_snd);
                        if (error)
                                goto release;
                        goto restart;
                }
                SOCKBUF_UNLOCK(&so->so_snd);
                /* Account for the control data that accompanies the send. */
                space -= clen;
                /*
                 * Inner loop: keep handing data to the protocol while both
                 * data and space (as computed above) remain.
                 */
                do {
                        if (uio == NULL) {
                                resid = 0;
                                if (flags & MSG_EOR)
                                        top->m_flags |= M_EOR;
                        } else {
#ifdef ZERO_COPY_SOCKETS
                                error = sosend_copyin(uio, &top, atomic,
                                    &space, flags);
                                if (error != 0)
                                        goto release;
#else
                                /*
                                 * Copy the data from userland into a mbuf
                                 * chain.  If no data is to be copied in,
                                 * a single empty mbuf is returned.
                                 */
                                top = m_uiotombuf(uio, M_WAITOK, space,
                                    (atomic ? max_hdr : 0),
                                    (atomic ? M_PKTHDR : 0) |
                                    ((flags & MSG_EOR) ? M_EOR : 0));
                                if (top == NULL) {
                                        error = EFAULT; /* only possible error */
                                        goto release;
                                }
                                space -= resid - uio->uio_resid;
#endif
                                resid = uio->uio_resid;
                        }
                        if (dontroute) {
                                SOCK_LOCK(so);
                                so->so_options |= SO_DONTROUTE;
                                SOCK_UNLOCK(so);
                        }
                        /*
                         * XXX all the SBS_CANTSENDMORE checks previously
                         * done could be out of date.  We could have received
                         * a reset packet in an interrupt or maybe we slept
                         * while doing page faults in uiomove() etc.  We
                         * could probably recheck again inside the locking
                         * protection here, but there are probably other
                         * places that this also happens.  We must rethink
                         * this.
                         */
                        error = (*so->so_proto->pr_usrreqs->pru_send)(so,
                            (flags & MSG_OOB) ? PRUS_OOB :
                        /*
                         * If the user set MSG_EOF, the protocol understands
                         * this flag and nothing left to send then use
                         * PRU_SEND_EOF instead of PRU_SEND.
                         */
                            ((flags & MSG_EOF) &&
                             (so->so_proto->pr_flags & PR_IMPLOPCL) &&
                             (resid <= 0)) ?
                                PRUS_EOF :
                        /* If there is more to send set PRUS_MORETOCOME. */
                            (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
                            top, addr, control, td);
                        if (dontroute) {
                                SOCK_LOCK(so);
                                so->so_options &= ~SO_DONTROUTE;
                                SOCK_UNLOCK(so);
                        }
                        /*
                         * The mbufs were handed to pru_send; forget them so
                         * that the cleanup at 'out' does not free them and
                         * later iterations do not resend the control data.
                         */
                        clen = 0;
                        control = NULL;
                        top = NULL;
                        if (error)
                                goto release;
                } while (resid && space > 0);
        } while (resid);

release:
        sbunlock(&so->so_snd);
out:
        /* Free whatever was not passed on to the protocol. */
        if (top != NULL)
                m_freem(top);
        if (control != NULL)
                m_freem(control);
        return (error);
}
1293
1294 int
1295 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1296     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1297 {
1298         int error;
1299
1300         CURVNET_SET(so->so_vnet);
1301         error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1302             control, flags, td);
1303         CURVNET_RESTORE();
1304         return (error);
1305 }
1306
1307 /*
1308  * The part of soreceive() that implements reading non-inline out-of-band
1309  * data from a socket.  For more complete comments, see soreceive(), from
1310  * which this code originated.
1311  *
1312  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1313  * unable to return an mbuf chain to the caller.
1314  */
1315 static int
1316 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1317 {
1318         struct protosw *pr = so->so_proto;
1319         struct mbuf *m;
1320         int error;
1321
1322         KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1323
1324         m = m_get(M_WAIT, MT_DATA);
1325         error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1326         if (error)
1327                 goto bad;
1328         do {
1329 #ifdef ZERO_COPY_SOCKETS
1330                 if (so_zero_copy_receive) {
1331                         int disposable;
1332
1333                         if ((m->m_flags & M_EXT)
1334                          && (m->m_ext.ext_type == EXT_DISPOSABLE))
1335                                 disposable = 1;
1336                         else
1337                                 disposable = 0;
1338
1339                         error = uiomoveco(mtod(m, void *),
1340                                           min(uio->uio_resid, m->m_len),
1341                                           uio, disposable);
1342                 } else
1343 #endif /* ZERO_COPY_SOCKETS */
1344                 error = uiomove(mtod(m, void *),
1345                     (int) min(uio->uio_resid, m->m_len), uio);
1346                 m = m_free(m);
1347         } while (uio->uio_resid && error == 0 && m);
1348 bad:
1349         if (m != NULL)
1350                 m_freem(m);
1351         return (error);
1352 }
1353
1354 /*
1355  * Following replacement or removal of the first mbuf on the first mbuf chain
1356  * of a socket buffer, push necessary state changes back into the socket
1357  * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the caller's locally stored value of the original value of
1359  * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1360  * NOTE: 'nextrecord' may be NULL.
1361  */
1362 static __inline void
1363 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1364 {
1365
1366         SOCKBUF_LOCK_ASSERT(sb);
1367         /*
1368          * First, update for the new value of nextrecord.  If necessary, make
1369          * it the first record.
1370          */
1371         if (sb->sb_mb != NULL)
1372                 sb->sb_mb->m_nextpkt = nextrecord;
1373         else
1374                 sb->sb_mb = nextrecord;
1375
1376         /*
1377          * Now update any dependent socket buffer fields to reflect the new
1378          * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1379          * addition of a second clause that takes care of the case where
1380          * sb_mb has been updated, but remains the last record.
1381          */
1382         if (sb->sb_mb == NULL) {
1383                 sb->sb_mbtail = NULL;
1384                 sb->sb_lastrecord = NULL;
1385         } else if (sb->sb_mb->m_nextpkt == NULL)
1386                 sb->sb_lastrecord = sb->sb_mb;
1387 }
1388
1389
1390 /*
1391  * Implement receive operations on a socket.  We depend on the way that
1392  * records are added to the sockbuf by sbappend.  In particular, each record
1393  * (mbufs linked through m_next) must begin with an address if the protocol
1394  * so specifies, followed by an optional mbuf or mbufs containing ancillary
1395  * data, and then zero or more mbufs of data.  In order to allow parallelism
1396  * between network receive and copying to user space, as well as avoid
1397  * sleeping with a mutex held, we release the socket buffer mutex during the
1398  * user space copy.  Although the sockbuf is locked, new data may still be
1399  * appended, and thus we must maintain consistency of the sockbuf during that
1400  * time.
1401  *
1402  * The caller may receive the data as a single mbuf chain by supplying an
1403  * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1404  * the count in uio_resid.
1405  */
1406 int
1407 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1408     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1409 {
1410         struct mbuf *m, **mp;
1411         int flags, len, error, offset;
1412         struct protosw *pr = so->so_proto;
1413         struct mbuf *nextrecord;
1414         int moff, type = 0;
1415         int orig_resid = uio->uio_resid;
1416
1417         mp = mp0;
1418         if (psa != NULL)
1419                 *psa = NULL;
1420         if (controlp != NULL)
1421                 *controlp = NULL;
1422         if (flagsp != NULL)
1423                 flags = *flagsp &~ MSG_EOR;
1424         else
1425                 flags = 0;
1426         if (flags & MSG_OOB)
1427                 return (soreceive_rcvoob(so, uio, flags));
1428         if (mp != NULL)
1429                 *mp = NULL;
1430         if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1431             && uio->uio_resid)
1432                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1433
1434         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1435         if (error)
1436                 return (error);
1437
1438 restart:
1439         SOCKBUF_LOCK(&so->so_rcv);
1440         m = so->so_rcv.sb_mb;
1441         /*
1442          * If we have less data than requested, block awaiting more (subject
1443          * to any timeout) if:
1444          *   1. the current count is less than the low water mark, or
1445          *   2. MSG_WAITALL is set, and it is possible to do the entire
1446          *      receive operation at once if we block (resid <= hiwat).
1447          *   3. MSG_DONTWAIT is not set
1448          * If MSG_WAITALL is set but resid is larger than the receive buffer,
1449          * we have to do the receive in sections, and thus risk returning a
1450          * short count if a timeout or signal occurs after we start.
1451          */
1452         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1453             so->so_rcv.sb_cc < uio->uio_resid) &&
1454             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1455             ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1456             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1457                 KASSERT(m != NULL || !so->so_rcv.sb_cc,
1458                     ("receive: m == %p so->so_rcv.sb_cc == %u",
1459                     m, so->so_rcv.sb_cc));
1460                 if (so->so_error) {
1461                         if (m != NULL)
1462                                 goto dontblock;
1463                         error = so->so_error;
1464                         if ((flags & MSG_PEEK) == 0)
1465                                 so->so_error = 0;
1466                         SOCKBUF_UNLOCK(&so->so_rcv);
1467                         goto release;
1468                 }
1469                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1470                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1471                         if (m == NULL) {
1472                                 SOCKBUF_UNLOCK(&so->so_rcv);
1473                                 goto release;
1474                         } else
1475                                 goto dontblock;
1476                 }
1477                 for (; m != NULL; m = m->m_next)
1478                         if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1479                                 m = so->so_rcv.sb_mb;
1480                                 goto dontblock;
1481                         }
1482                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1483                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1484                         SOCKBUF_UNLOCK(&so->so_rcv);
1485                         error = ENOTCONN;
1486                         goto release;
1487                 }
1488                 if (uio->uio_resid == 0) {
1489                         SOCKBUF_UNLOCK(&so->so_rcv);
1490                         goto release;
1491                 }
1492                 if ((so->so_state & SS_NBIO) ||
1493                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1494                         SOCKBUF_UNLOCK(&so->so_rcv);
1495                         error = EWOULDBLOCK;
1496                         goto release;
1497                 }
1498                 SBLASTRECORDCHK(&so->so_rcv);
1499                 SBLASTMBUFCHK(&so->so_rcv);
1500                 error = sbwait(&so->so_rcv);
1501                 SOCKBUF_UNLOCK(&so->so_rcv);
1502                 if (error)
1503                         goto release;
1504                 goto restart;
1505         }
1506 dontblock:
1507         /*
1508          * From this point onward, we maintain 'nextrecord' as a cache of the
1509          * pointer to the next record in the socket buffer.  We must keep the
1510          * various socket buffer pointers and local stack versions of the
1511          * pointers in sync, pushing out modifications before dropping the
1512          * socket buffer mutex, and re-reading them when picking it up.
1513          *
1514          * Otherwise, we will race with the network stack appending new data
1515          * or records onto the socket buffer by using inconsistent/stale
1516          * versions of the field, possibly resulting in socket buffer
1517          * corruption.
1518          *
1519          * By holding the high-level sblock(), we prevent simultaneous
1520          * readers from pulling off the front of the socket buffer.
1521          */
1522         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1523         if (uio->uio_td)
1524                 uio->uio_td->td_ru.ru_msgrcv++;
1525         KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1526         SBLASTRECORDCHK(&so->so_rcv);
1527         SBLASTMBUFCHK(&so->so_rcv);
1528         nextrecord = m->m_nextpkt;
1529         if (pr->pr_flags & PR_ADDR) {
1530                 KASSERT(m->m_type == MT_SONAME,
1531                     ("m->m_type == %d", m->m_type));
1532                 orig_resid = 0;
1533                 if (psa != NULL)
1534                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1535                             M_NOWAIT);
1536                 if (flags & MSG_PEEK) {
1537                         m = m->m_next;
1538                 } else {
1539                         sbfree(&so->so_rcv, m);
1540                         so->so_rcv.sb_mb = m_free(m);
1541                         m = so->so_rcv.sb_mb;
1542                         sockbuf_pushsync(&so->so_rcv, nextrecord);
1543                 }
1544         }
1545
1546         /*
1547          * Process one or more MT_CONTROL mbufs present before any data mbufs
1548          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1549          * just copy the data; if !MSG_PEEK, we call into the protocol to
1550          * perform externalization (or freeing if controlp == NULL).
1551          */
1552         if (m != NULL && m->m_type == MT_CONTROL) {
1553                 struct mbuf *cm = NULL, *cmn;
1554                 struct mbuf **cme = &cm;
1555
1556                 do {
1557                         if (flags & MSG_PEEK) {
1558                                 if (controlp != NULL) {
1559                                         *controlp = m_copy(m, 0, m->m_len);
1560                                         controlp = &(*controlp)->m_next;
1561                                 }
1562                                 m = m->m_next;
1563                         } else {
1564                                 sbfree(&so->so_rcv, m);
1565                                 so->so_rcv.sb_mb = m->m_next;
1566                                 m->m_next = NULL;
1567                                 *cme = m;
1568                                 cme = &(*cme)->m_next;
1569                                 m = so->so_rcv.sb_mb;
1570                         }
1571                 } while (m != NULL && m->m_type == MT_CONTROL);
1572                 if ((flags & MSG_PEEK) == 0)
1573                         sockbuf_pushsync(&so->so_rcv, nextrecord);
1574                 while (cm != NULL) {
1575                         cmn = cm->m_next;
1576                         cm->m_next = NULL;
1577                         if (pr->pr_domain->dom_externalize != NULL) {
1578                                 SOCKBUF_UNLOCK(&so->so_rcv);
1579                                 error = (*pr->pr_domain->dom_externalize)
1580                                     (cm, controlp);
1581                                 SOCKBUF_LOCK(&so->so_rcv);
1582                         } else if (controlp != NULL)
1583                                 *controlp = cm;
1584                         else
1585                                 m_freem(cm);
1586                         if (controlp != NULL) {
1587                                 orig_resid = 0;
1588                                 while (*controlp != NULL)
1589                                         controlp = &(*controlp)->m_next;
1590                         }
1591                         cm = cmn;
1592                 }
1593                 if (m != NULL)
1594                         nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1595                 else
1596                         nextrecord = so->so_rcv.sb_mb;
1597                 orig_resid = 0;
1598         }
1599         if (m != NULL) {
1600                 if ((flags & MSG_PEEK) == 0) {
1601                         KASSERT(m->m_nextpkt == nextrecord,
1602                             ("soreceive: post-control, nextrecord !sync"));
1603                         if (nextrecord == NULL) {
1604                                 KASSERT(so->so_rcv.sb_mb == m,
1605                                     ("soreceive: post-control, sb_mb!=m"));
1606                                 KASSERT(so->so_rcv.sb_lastrecord == m,
1607                                     ("soreceive: post-control, lastrecord!=m"));
1608                         }
1609                 }
1610                 type = m->m_type;
1611                 if (type == MT_OOBDATA)
1612                         flags |= MSG_OOB;
1613         } else {
1614                 if ((flags & MSG_PEEK) == 0) {
1615                         KASSERT(so->so_rcv.sb_mb == nextrecord,
1616                             ("soreceive: sb_mb != nextrecord"));
1617                         if (so->so_rcv.sb_mb == NULL) {
1618                                 KASSERT(so->so_rcv.sb_lastrecord == NULL,
1619                                     ("soreceive: sb_lastercord != NULL"));
1620                         }
1621                 }
1622         }
1623         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1624         SBLASTRECORDCHK(&so->so_rcv);
1625         SBLASTMBUFCHK(&so->so_rcv);
1626
1627         /*
1628          * Now continue to read any data mbufs off of the head of the socket
1629          * buffer until the read request is satisfied.  Note that 'type' is
1630          * used to store the type of any mbuf reads that have happened so far
1631          * such that soreceive() can stop reading if the type changes, which
1632          * causes soreceive() to return only one of regular data and inline
1633          * out-of-band data in a single socket receive operation.
1634          */
1635         moff = 0;
1636         offset = 0;
1637         while (m != NULL && uio->uio_resid > 0 && error == 0) {
1638                 /*
1639                  * If the type of mbuf has changed since the last mbuf
1640                  * examined ('type'), end the receive operation.
1641                  */
1642                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1643                 if (m->m_type == MT_OOBDATA) {
1644                         if (type != MT_OOBDATA)
1645                                 break;
1646                 } else if (type == MT_OOBDATA)
1647                         break;
1648                 else
1649                     KASSERT(m->m_type == MT_DATA,
1650                         ("m->m_type == %d", m->m_type));
1651                 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1652                 len = uio->uio_resid;
1653                 if (so->so_oobmark && len > so->so_oobmark - offset)
1654                         len = so->so_oobmark - offset;
1655                 if (len > m->m_len - moff)
1656                         len = m->m_len - moff;
1657                 /*
1658                  * If mp is set, just pass back the mbufs.  Otherwise copy
1659                  * them out via the uio, then free.  Sockbuf must be
1660                  * consistent here (points to current mbuf, it points to next
1661                  * record) when we drop priority; we must note any additions
1662                  * to the sockbuf when we block interrupts again.
1663                  */
1664                 if (mp == NULL) {
1665                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1666                         SBLASTRECORDCHK(&so->so_rcv);
1667                         SBLASTMBUFCHK(&so->so_rcv);
1668                         SOCKBUF_UNLOCK(&so->so_rcv);
1669 #ifdef ZERO_COPY_SOCKETS
1670                         if (so_zero_copy_receive) {
1671                                 int disposable;
1672
1673                                 if ((m->m_flags & M_EXT)
1674                                  && (m->m_ext.ext_type == EXT_DISPOSABLE))
1675                                         disposable = 1;
1676                                 else
1677                                         disposable = 0;
1678
1679                                 error = uiomoveco(mtod(m, char *) + moff,
1680                                                   (int)len, uio,
1681                                                   disposable);
1682                         } else
1683 #endif /* ZERO_COPY_SOCKETS */
1684                         error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1685                         SOCKBUF_LOCK(&so->so_rcv);
1686                         if (error) {
1687                                 /*
1688                                  * The MT_SONAME mbuf has already been removed
1689                                  * from the record, so it is necessary to
1690                                  * remove the data mbufs, if any, to preserve
1691                                  * the invariant in the case of PR_ADDR that
1692                                  * requires MT_SONAME mbufs at the head of
1693                                  * each record.
1694                                  */
1695                                 if (m && pr->pr_flags & PR_ATOMIC &&
1696                                     ((flags & MSG_PEEK) == 0))
1697                                         (void)sbdroprecord_locked(&so->so_rcv);
1698                                 SOCKBUF_UNLOCK(&so->so_rcv);
1699                                 goto release;
1700                         }
1701                 } else
1702                         uio->uio_resid -= len;
1703                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1704                 if (len == m->m_len - moff) {
1705                         if (m->m_flags & M_EOR)
1706                                 flags |= MSG_EOR;
1707                         if (flags & MSG_PEEK) {
1708                                 m = m->m_next;
1709                                 moff = 0;
1710                         } else {
1711                                 nextrecord = m->m_nextpkt;
1712                                 sbfree(&so->so_rcv, m);
1713                                 if (mp != NULL) {
1714                                         *mp = m;
1715                                         mp = &m->m_next;
1716                                         so->so_rcv.sb_mb = m = m->m_next;
1717                                         *mp = NULL;
1718                                 } else {
1719                                         so->so_rcv.sb_mb = m_free(m);
1720                                         m = so->so_rcv.sb_mb;
1721                                 }
1722                                 sockbuf_pushsync(&so->so_rcv, nextrecord);
1723                                 SBLASTRECORDCHK(&so->so_rcv);
1724                                 SBLASTMBUFCHK(&so->so_rcv);
1725                         }
1726                 } else {
1727                         if (flags & MSG_PEEK)
1728                                 moff += len;
1729                         else {
1730                                 if (mp != NULL) {
1731                                         int copy_flag;
1732
1733                                         if (flags & MSG_DONTWAIT)
1734                                                 copy_flag = M_DONTWAIT;
1735                                         else
1736                                                 copy_flag = M_WAIT;
1737                                         if (copy_flag == M_WAIT)
1738                                                 SOCKBUF_UNLOCK(&so->so_rcv);
1739                                         *mp = m_copym(m, 0, len, copy_flag);
1740                                         if (copy_flag == M_WAIT)
1741                                                 SOCKBUF_LOCK(&so->so_rcv);
1742                                         if (*mp == NULL) {
1743                                                 /*
1744                                                  * m_copym() couldn't
1745                                                  * allocate an mbuf.  Adjust
1746                                                  * uio_resid back (it was
1747                                                  * adjusted down by len
1748                                                  * bytes, which we didn't end
1749                                                  * up "copying" over).
1750                                                  */
1751                                                 uio->uio_resid += len;
1752                                                 break;
1753                                         }
1754                                 }
1755                                 m->m_data += len;
1756                                 m->m_len -= len;
1757                                 so->so_rcv.sb_cc -= len;
1758                         }
1759                 }
1760                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1761                 if (so->so_oobmark) {
1762                         if ((flags & MSG_PEEK) == 0) {
1763                                 so->so_oobmark -= len;
1764                                 if (so->so_oobmark == 0) {
1765                                         so->so_rcv.sb_state |= SBS_RCVATMARK;
1766                                         break;
1767                                 }
1768                         } else {
1769                                 offset += len;
1770                                 if (offset == so->so_oobmark)
1771                                         break;
1772                         }
1773                 }
1774                 if (flags & MSG_EOR)
1775                         break;
1776                 /*
1777                  * If the MSG_WAITALL flag is set (for non-atomic socket), we
1778                  * must not quit until "uio->uio_resid == 0" or an error
1779                  * termination.  If a signal/timeout occurs, return with a
1780                  * short count but without error.  Keep sockbuf locked
1781                  * against other readers.
1782                  */
1783                 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1784                     !sosendallatonce(so) && nextrecord == NULL) {
1785                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1786                         if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1787                                 break;
1788                         /*
1789                          * Notify the protocol that some data has been
1790                          * drained before blocking.
1791                          */
1792                         if (pr->pr_flags & PR_WANTRCVD) {
1793                                 SOCKBUF_UNLOCK(&so->so_rcv);
1794                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1795                                 SOCKBUF_LOCK(&so->so_rcv);
1796                         }
1797                         SBLASTRECORDCHK(&so->so_rcv);
1798                         SBLASTMBUFCHK(&so->so_rcv);
1799                         error = sbwait(&so->so_rcv);
1800                         if (error) {
1801                                 SOCKBUF_UNLOCK(&so->so_rcv);
1802                                 goto release;
1803                         }
1804                         m = so->so_rcv.sb_mb;
1805                         if (m != NULL)
1806                                 nextrecord = m->m_nextpkt;
1807                 }
1808         }
1809
1810         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1811         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1812                 flags |= MSG_TRUNC;
1813                 if ((flags & MSG_PEEK) == 0)
1814                         (void) sbdroprecord_locked(&so->so_rcv);
1815         }
1816         if ((flags & MSG_PEEK) == 0) {
1817                 if (m == NULL) {
1818                         /*
1819                          * First part is an inline SB_EMPTY_FIXUP().  Second
1820                          * part makes sure sb_lastrecord is up-to-date if
1821                          * there is still data in the socket buffer.
1822                          */
1823                         so->so_rcv.sb_mb = nextrecord;
1824                         if (so->so_rcv.sb_mb == NULL) {
1825                                 so->so_rcv.sb_mbtail = NULL;
1826                                 so->so_rcv.sb_lastrecord = NULL;
1827                         } else if (nextrecord->m_nextpkt == NULL)
1828                                 so->so_rcv.sb_lastrecord = nextrecord;
1829                 }
1830                 SBLASTRECORDCHK(&so->so_rcv);
1831                 SBLASTMBUFCHK(&so->so_rcv);
1832                 /*
1833                  * If soreceive() is being done from the socket callback,
1834                  * then don't need to generate ACK to peer to update window,
1835                  * since ACK will be generated on return to TCP.
1836                  */
1837                 if (!(flags & MSG_SOCALLBCK) &&
1838                     (pr->pr_flags & PR_WANTRCVD)) {
1839                         SOCKBUF_UNLOCK(&so->so_rcv);
1840                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1841                         SOCKBUF_LOCK(&so->so_rcv);
1842                 }
1843         }
1844         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1845         if (orig_resid == uio->uio_resid && orig_resid &&
1846             (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1847                 SOCKBUF_UNLOCK(&so->so_rcv);
1848                 goto restart;
1849         }
1850         SOCKBUF_UNLOCK(&so->so_rcv);
1851
1852         if (flagsp != NULL)
1853                 *flagsp |= flags;
1854 release:
1855         sbunlock(&so->so_rcv);
1856         return (error);
1857 }
1858
1859 /*
1860  * Optimized version of soreceive() for stream (TCP) sockets.
1861  */
1862 int
1863 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
1864     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1865 {
1866         int len = 0, error = 0, flags, oresid;
1867         struct sockbuf *sb;
1868         struct mbuf *m, *n = NULL;
1869
1870         /* We only do stream sockets. */
1871         if (so->so_type != SOCK_STREAM)
1872                 return (EINVAL);
1873         if (psa != NULL)
1874                 *psa = NULL;
1875         if (controlp != NULL)
1876                 return (EINVAL);
1877         if (flagsp != NULL)
1878                 flags = *flagsp &~ MSG_EOR;
1879         else
1880                 flags = 0;
1881         if (flags & MSG_OOB)
1882                 return (soreceive_rcvoob(so, uio, flags));
1883         if (mp0 != NULL)
1884                 *mp0 = NULL;
1885
1886         sb = &so->so_rcv;
1887
1888         /* Prevent other readers from entering the socket. */
1889         error = sblock(sb, SBLOCKWAIT(flags));
1890         if (error)
1891                 goto out;
1892         SOCKBUF_LOCK(sb);
1893
1894         /* Easy one, no space to copyout anything. */
1895         if (uio->uio_resid == 0) {
1896                 error = EINVAL;
1897                 goto out;
1898         }
1899         oresid = uio->uio_resid;
1900
1901         /* We will never ever get anything unless we are connected. */
1902         if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1903                 /* When disconnecting there may be still some data left. */
1904                 if (sb->sb_cc > 0)
1905                         goto deliver;
1906                 if (!(so->so_state & SS_ISDISCONNECTED))
1907                         error = ENOTCONN;
1908                 goto out;
1909         }
1910
1911         /* Socket buffer is empty and we shall not block. */
1912         if (sb->sb_cc == 0 &&
1913             ((sb->sb_flags & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
1914                 error = EAGAIN;
1915                 goto out;
1916         }
1917
1918 restart:
1919         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1920
1921         /* Abort if socket has reported problems. */
1922         if (so->so_error) {
1923                 if (sb->sb_cc > 0)
1924                         goto deliver;
1925                 if (oresid > uio->uio_resid)
1926                         goto out;
1927                 error = so->so_error;
1928                 if (!(flags & MSG_PEEK))
1929                         so->so_error = 0;
1930                 goto out;
1931         }
1932
1933         /* Door is closed.  Deliver what is left, if any. */
1934         if (sb->sb_state & SBS_CANTRCVMORE) {
1935                 if (sb->sb_cc > 0)
1936                         goto deliver;
1937                 else
1938                         goto out;
1939         }
1940
1941         /* Socket buffer got some data that we shall deliver now. */
1942         if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
1943             ((sb->sb_flags & SS_NBIO) ||
1944              (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
1945              sb->sb_cc >= sb->sb_lowat ||
1946              sb->sb_cc >= uio->uio_resid ||
1947              sb->sb_cc >= sb->sb_hiwat) ) {
1948                 goto deliver;
1949         }
1950
1951         /* On MSG_WAITALL we must wait until all data or error arrives. */
1952         if ((flags & MSG_WAITALL) &&
1953             (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
1954                 goto deliver;
1955
1956         /*
1957          * Wait and block until (more) data comes in.
1958          * NB: Drops the sockbuf lock during wait.
1959          */
1960         error = sbwait(sb);
1961         if (error)
1962                 goto out;
1963         goto restart;
1964
1965 deliver:
1966         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1967         KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
1968         KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
1969
1970         /* Statistics. */
1971         if (uio->uio_td)
1972                 uio->uio_td->td_ru.ru_msgrcv++;
1973
1974         /* Fill uio until full or current end of socket buffer is reached. */
1975         len = min(uio->uio_resid, sb->sb_cc);
1976         if (mp0 != NULL) {
1977                 /* Dequeue as many mbufs as possible. */
1978                 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
1979                         for (*mp0 = m = sb->sb_mb;
1980                              m != NULL && m->m_len <= len;
1981                              m = m->m_next) {
1982                                 len -= m->m_len;
1983                                 uio->uio_resid -= m->m_len;
1984                                 sbfree(sb, m);
1985                                 n = m;
1986                         }
1987                         sb->sb_mb = m;
1988                         if (sb->sb_mb == NULL)
1989                                 SB_EMPTY_FIXUP(sb);
1990                         n->m_next = NULL;
1991                 }
1992                 /* Copy the remainder. */
1993                 if (len > 0) {
1994                         KASSERT(sb->sb_mb != NULL,
1995                             ("%s: len > 0 && sb->sb_mb empty", __func__));
1996
1997                         m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT);
1998                         if (m == NULL)
1999                                 len = 0;        /* Don't flush data from sockbuf. */
2000                         else
2001                                 uio->uio_resid -= m->m_len;
2002                         if (*mp0 != NULL)
2003                                 n->m_next = m;
2004                         else
2005                                 *mp0 = m;
2006                         if (*mp0 == NULL) {
2007                                 error = ENOBUFS;
2008                                 goto out;
2009                         }
2010                 }
2011         } else {
2012                 /* NB: Must unlock socket buffer as uiomove may sleep. */
2013                 SOCKBUF_UNLOCK(sb);
2014                 error = m_mbuftouio(uio, sb->sb_mb, len);
2015                 SOCKBUF_LOCK(sb);
2016                 if (error)
2017                         goto out;
2018         }
2019         SBLASTRECORDCHK(sb);
2020         SBLASTMBUFCHK(sb);
2021
2022         /*
2023          * Remove the delivered data from the socket buffer unless we
2024          * were only peeking.
2025          */
2026         if (!(flags & MSG_PEEK)) {
2027                 if (len > 0)
2028                         sbdrop_locked(sb, len);
2029
2030                 /* Notify protocol that we drained some data. */
2031                 if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2032                     (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2033                      !(flags & MSG_SOCALLBCK))) {
2034                         SOCKBUF_UNLOCK(sb);
2035                         (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
2036                         SOCKBUF_LOCK(sb);
2037                 }
2038         }
2039
2040         /*
2041          * For MSG_WAITALL we may have to loop again and wait for
2042          * more data to come in.
2043          */
2044         if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2045                 goto restart;
2046 out:
2047         SOCKBUF_LOCK_ASSERT(sb);
2048         SBLASTRECORDCHK(sb);
2049         SBLASTMBUFCHK(sb);
2050         SOCKBUF_UNLOCK(sb);
2051         sbunlock(sb);
2052         return (error);
2053 }
2054
2055 /*
2056  * Optimized version of soreceive() for simple datagram cases from userspace.
2057  * Unlike in the stream case, we're able to drop a datagram if copyout()
2058  * fails, and because we handle datagrams atomically, we don't need to use a
2059  * sleep lock to prevent I/O interlacing.
2060  */
2061 int
2062 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2063     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2064 {
2065         struct mbuf *m, *m2;
2066         int flags, len, error;
2067         struct protosw *pr = so->so_proto;
2068         struct mbuf *nextrecord;
2069
2070         if (psa != NULL)
2071                 *psa = NULL;
2072         if (controlp != NULL)
2073                 *controlp = NULL;
2074         if (flagsp != NULL)
2075                 flags = *flagsp &~ MSG_EOR;
2076         else
2077                 flags = 0;
2078
2079         /*
2080          * For any complicated cases, fall back to the full
2081          * soreceive_generic().
2082          */
2083         if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
2084                 return (soreceive_generic(so, psa, uio, mp0, controlp,
2085                     flagsp));
2086
2087         /*
2088          * Enforce restrictions on use.
2089          */
2090         KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2091             ("soreceive_dgram: wantrcvd"));
2092         KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2093         KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2094             ("soreceive_dgram: SBS_RCVATMARK"));
2095         KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
2096             ("soreceive_dgram: P_CONNREQUIRED"));
2097
2098         /*
2099          * Loop blocking while waiting for a datagram.
2100          */
2101         SOCKBUF_LOCK(&so->so_rcv);
2102         while ((m = so->so_rcv.sb_mb) == NULL) {
2103                 KASSERT(so->so_rcv.sb_cc == 0,
2104                     ("soreceive_dgram: sb_mb NULL but sb_cc %u",
2105                     so->so_rcv.sb_cc));
2106                 if (so->so_error) {
2107                         error = so->so_error;
2108                         so->so_error = 0;
2109                         SOCKBUF_UNLOCK(&so->so_rcv);
2110                         return (error);
2111                 }
2112                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2113                     uio->uio_resid == 0) {
2114                         SOCKBUF_UNLOCK(&so->so_rcv);
2115                         return (0);
2116                 }
2117                 if ((so->so_state & SS_NBIO) ||
2118                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2119                         SOCKBUF_UNLOCK(&so->so_rcv);
2120                         return (EWOULDBLOCK);
2121                 }
2122                 SBLASTRECORDCHK(&so->so_rcv);
2123                 SBLASTMBUFCHK(&so->so_rcv);
2124                 error = sbwait(&so->so_rcv);
2125                 if (error) {
2126                         SOCKBUF_UNLOCK(&so->so_rcv);
2127                         return (error);
2128                 }
2129         }
2130         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2131
2132         if (uio->uio_td)
2133                 uio->uio_td->td_ru.ru_msgrcv++;
2134         SBLASTRECORDCHK(&so->so_rcv);
2135         SBLASTMBUFCHK(&so->so_rcv);
2136         nextrecord = m->m_nextpkt;
2137         if (nextrecord == NULL) {
2138                 KASSERT(so->so_rcv.sb_lastrecord == m,
2139                     ("soreceive_dgram: lastrecord != m"));
2140         }
2141
2142         KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2143             ("soreceive_dgram: m_nextpkt != nextrecord"));
2144
2145         /*
2146          * Pull 'm' and its chain off the front of the packet queue.
2147          */
2148         so->so_rcv.sb_mb = NULL;
2149         sockbuf_pushsync(&so->so_rcv, nextrecord);
2150
2151         /*
2152          * Walk 'm's chain and free that many bytes from the socket buffer.
2153          */
2154         for (m2 = m; m2 != NULL; m2 = m2->m_next)
2155                 sbfree(&so->so_rcv, m2);
2156
2157         /*
2158          * Do a few last checks before we let go of the lock.
2159          */
2160         SBLASTRECORDCHK(&so->so_rcv);
2161         SBLASTMBUFCHK(&so->so_rcv);
2162         SOCKBUF_UNLOCK(&so->so_rcv);
2163
2164         if (pr->pr_flags & PR_ADDR) {
2165                 KASSERT(m->m_type == MT_SONAME,
2166                     ("m->m_type == %d", m->m_type));
2167                 if (psa != NULL)
2168                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
2169                             M_NOWAIT);
2170                 m = m_free(m);
2171         }
2172         if (m == NULL) {
2173                 /* XXXRW: Can this happen? */
2174                 return (0);
2175         }
2176
2177         /*
2178          * Packet to copyout() is now in 'm' and it is disconnected from the
2179          * queue.
2180          *
2181          * Process one or more MT_CONTROL mbufs present before any data mbufs
2182          * in the first mbuf chain on the socket buffer.  We call into the
2183          * protocol to perform externalization (or freeing if controlp ==
2184          * NULL).
2185          */
2186         if (m->m_type == MT_CONTROL) {
2187                 struct mbuf *cm = NULL, *cmn;
2188                 struct mbuf **cme = &cm;
2189
2190                 do {
2191                         m2 = m->m_next;
2192                         m->m_next = NULL;
2193                         *cme = m;
2194                         cme = &(*cme)->m_next;
2195                         m = m2;
2196                 } while (m != NULL && m->m_type == MT_CONTROL);
2197                 while (cm != NULL) {
2198                         cmn = cm->m_next;
2199                         cm->m_next = NULL;
2200                         if (pr->pr_domain->dom_externalize != NULL) {
2201                                 error = (*pr->pr_domain->dom_externalize)
2202                                     (cm, controlp);
2203                         } else if (controlp != NULL)
2204                                 *controlp = cm;
2205                         else
2206                                 m_freem(cm);
2207                         if (controlp != NULL) {
2208                                 while (*controlp != NULL)
2209                                         controlp = &(*controlp)->m_next;
2210                         }
2211                         cm = cmn;
2212                 }
2213         }
2214         KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
2215
2216         while (m != NULL && uio->uio_resid > 0) {
2217                 len = uio->uio_resid;
2218                 if (len > m->m_len)
2219                         len = m->m_len;
2220                 error = uiomove(mtod(m, char *), (int)len, uio);
2221                 if (error) {
2222                         m_freem(m);
2223                         return (error);
2224                 }
2225                 m = m_free(m);
2226         }
2227         if (m != NULL)
2228                 flags |= MSG_TRUNC;
2229         m_freem(m);
2230         if (flagsp != NULL)
2231                 *flagsp |= flags;
2232         return (0);
2233 }
2234
2235 int
2236 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2237     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2238 {
2239
2240         return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
2241             controlp, flagsp));
2242 }
2243
2244 int
2245 soshutdown(struct socket *so, int how)
2246 {
2247         struct protosw *pr = so->so_proto;
2248         int error;
2249
2250         if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2251                 return (EINVAL);
2252         if (pr->pr_usrreqs->pru_flush != NULL) {
2253                 (*pr->pr_usrreqs->pru_flush)(so, how);
2254         }
2255         if (how != SHUT_WR)
2256                 sorflush(so);
2257         if (how != SHUT_RD) {
2258                 CURVNET_SET(so->so_vnet);
2259                 error = (*pr->pr_usrreqs->pru_shutdown)(so);
2260                 CURVNET_RESTORE();
2261                 return (error);
2262         }
2263         return (0);
2264 }
2265
2266 void
2267 sorflush(struct socket *so)
2268 {
2269         struct sockbuf *sb = &so->so_rcv;
2270         struct protosw *pr = so->so_proto;
2271         struct sockbuf asb;
2272
2273         /*
2274          * In order to avoid calling dom_dispose with the socket buffer mutex
2275          * held, and in order to generally avoid holding the lock for a long
2276          * time, we make a copy of the socket buffer and clear the original
2277          * (except locks, state).  The new socket buffer copy won't have
2278          * initialized locks so we can only call routines that won't use or
2279          * assert those locks.
2280          *
2281          * Dislodge threads currently blocked in receive and wait to acquire
2282          * a lock against other simultaneous readers before clearing the
2283          * socket buffer.  Don't let our acquire be interrupted by a signal
2284          * despite any existing socket disposition on interruptable waiting.
2285          */
2286         CURVNET_SET(so->so_vnet);
2287         socantrcvmore(so);
2288         (void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2289
2290         /*
2291          * Invalidate/clear most of the sockbuf structure, but leave selinfo
2292          * and mutex data unchanged.
2293          */
2294         SOCKBUF_LOCK(sb);
2295         bzero(&asb, offsetof(struct sockbuf, sb_startzero));
2296         bcopy(&sb->sb_startzero, &asb.sb_startzero,
2297             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2298         bzero(&sb->sb_startzero,
2299             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2300         SOCKBUF_UNLOCK(sb);
2301         sbunlock(sb);
2302
2303         /*
2304          * Dispose of special rights and flush the socket buffer.  Don't call
2305          * any unsafe routines (that rely on locks being initialized) on asb.
2306          */
2307         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2308                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
2309         sbrelease_internal(&asb, so);
2310         CURVNET_RESTORE();
2311 }
2312
2313 /*
2314  * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2315  * additional variant to handle the case where the option value needs to be
2316  * some kind of integer, but not a specific size.  In addition to their use
2317  * here, these functions are also called by the protocol-level pr_ctloutput()
2318  * routines.
2319  */
2320 int
2321 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2322 {
2323         size_t  valsize;
2324
2325         /*
2326          * If the user gives us more than we wanted, we ignore it, but if we
2327          * don't get the minimum length the caller wants, we return EINVAL.
2328          * On success, sopt->sopt_valsize is set to however much we actually
2329          * retrieved.
2330          */
2331         if ((valsize = sopt->sopt_valsize) < minlen)
2332                 return EINVAL;
2333         if (valsize > len)
2334                 sopt->sopt_valsize = valsize = len;
2335
2336         if (sopt->sopt_td != NULL)
2337                 return (copyin(sopt->sopt_val, buf, valsize));
2338
2339         bcopy(sopt->sopt_val, buf, valsize);
2340         return (0);
2341 }
2342
2343 /*
2344  * Kernel version of setsockopt(2).
2345  *
2346  * XXX: optlen is size_t, not socklen_t
2347  */
2348 int
2349 so_setsockopt(struct socket *so, int level, int optname, void *optval,
2350     size_t optlen)
2351 {
2352         struct sockopt sopt;
2353
2354         sopt.sopt_level = level;
2355         sopt.sopt_name = optname;
2356         sopt.sopt_dir = SOPT_SET;
2357         sopt.sopt_val = optval;
2358         sopt.sopt_valsize = optlen;
2359         sopt.sopt_td = NULL;
2360         return (sosetopt(so, &sopt));
2361 }
2362
2363 int
2364 sosetopt(struct socket *so, struct sockopt *sopt)
2365 {
2366         int     error, optval;
2367         struct  linger l;
2368         struct  timeval tv;
2369         u_long  val;
2370 #ifdef MAC
2371         struct mac extmac;
2372 #endif
2373
2374         error = 0;
2375         if (sopt->sopt_level != SOL_SOCKET) {
2376                 if (so->so_proto && so->so_proto->pr_ctloutput)
2377                         return ((*so->so_proto->pr_ctloutput)
2378                                   (so, sopt));
2379                 error = ENOPROTOOPT;
2380         } else {
2381                 switch (sopt->sopt_name) {
2382 #ifdef INET
2383                 case SO_ACCEPTFILTER:
2384                         error = do_setopt_accept_filter(so, sopt);
2385                         if (error)
2386                                 goto bad;
2387                         break;
2388 #endif
2389                 case SO_LINGER:
2390                         error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2391                         if (error)
2392                                 goto bad;
2393
2394                         SOCK_LOCK(so);
2395                         so->so_linger = l.l_linger;
2396                         if (l.l_onoff)
2397                                 so->so_options |= SO_LINGER;
2398                         else
2399                                 so->so_options &= ~SO_LINGER;
2400                         SOCK_UNLOCK(so);
2401                         break;
2402
2403                 case SO_DEBUG:
2404                 case SO_KEEPALIVE:
2405                 case SO_DONTROUTE:
2406                 case SO_USELOOPBACK:
2407                 case SO_BROADCAST:
2408                 case SO_REUSEADDR:
2409                 case SO_REUSEPORT:
2410                 case SO_OOBINLINE:
2411                 case SO_TIMESTAMP:
2412                 case SO_BINTIME:
2413                 case SO_NOSIGPIPE:
2414                 case SO_NO_DDP:
2415                 case SO_NO_OFFLOAD:
2416                         error = sooptcopyin(sopt, &optval, sizeof optval,
2417                                             sizeof optval);
2418                         if (error)
2419                                 goto bad;
2420                         SOCK_LOCK(so);
2421                         if (optval)
2422                                 so->so_options |= sopt->sopt_name;
2423                         else
2424                                 so->so_options &= ~sopt->sopt_name;
2425                         SOCK_UNLOCK(so);
2426                         break;
2427
2428                 case SO_SETFIB:
2429                         error = sooptcopyin(sopt, &optval, sizeof optval,
2430                                             sizeof optval);
2431                         if (optval < 1 || optval > rt_numfibs) {
2432                                 error = EINVAL;
2433                                 goto bad;
2434                         }
2435                         if ((so->so_proto->pr_domain->dom_family == PF_INET) ||
2436                             (so->so_proto->pr_domain->dom_family == PF_ROUTE)) {
2437                                 so->so_fibnum = optval;
2438                                 /* Note: ignore error */
2439                                 if (so->so_proto && so->so_proto->pr_ctloutput)
2440                                         (*so->so_proto->pr_ctloutput)(so, sopt);
2441                         } else {
2442                                 so->so_fibnum = 0;
2443                         }
2444                         break;
2445                 case SO_SNDBUF:
2446                 case SO_RCVBUF:
2447                 case SO_SNDLOWAT:
2448                 case SO_RCVLOWAT:
2449                         error = sooptcopyin(sopt, &optval, sizeof optval,
2450                                             sizeof optval);
2451                         if (error)
2452                                 goto bad;
2453
2454                         /*
2455                          * Values < 1 make no sense for any of these options,
2456                          * so disallow them.
2457                          */
2458                         if (optval < 1) {
2459                                 error = EINVAL;
2460                                 goto bad;
2461                         }
2462
2463                         switch (sopt->sopt_name) {
2464                         case SO_SNDBUF:
2465                         case SO_RCVBUF:
2466                                 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2467                                     &so->so_snd : &so->so_rcv, (u_long)optval,
2468                                     so, curthread) == 0) {
2469                                         error = ENOBUFS;
2470                                         goto bad;
2471                                 }
2472                                 (sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2473                                     &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2474                                 break;
2475
2476                         /*
2477                          * Make sure the low-water is never greater than the
2478                          * high-water.
2479                          */
2480                         case SO_SNDLOWAT:
2481                                 SOCKBUF_LOCK(&so->so_snd);
2482                                 so->so_snd.sb_lowat =
2483                                     (optval > so->so_snd.sb_hiwat) ?
2484                                     so->so_snd.sb_hiwat : optval;
2485                                 SOCKBUF_UNLOCK(&so->so_snd);
2486                                 break;
2487                         case SO_RCVLOWAT:
2488                                 SOCKBUF_LOCK(&so->so_rcv);
2489                                 so->so_rcv.sb_lowat =
2490                                     (optval > so->so_rcv.sb_hiwat) ?
2491                                     so->so_rcv.sb_hiwat : optval;
2492                                 SOCKBUF_UNLOCK(&so->so_rcv);
2493                                 break;
2494                         }
2495                         break;
2496
2497                 case SO_SNDTIMEO:
2498                 case SO_RCVTIMEO:
2499 #ifdef COMPAT_FREEBSD32
2500                         if (SV_CURPROC_FLAG(SV_ILP32)) {
2501                                 struct timeval32 tv32;
2502
2503                                 error = sooptcopyin(sopt, &tv32, sizeof tv32,
2504                                     sizeof tv32);
2505                                 CP(tv32, tv, tv_sec);
2506                                 CP(tv32, tv, tv_usec);
2507                         } else
2508 #endif
2509                                 error = sooptcopyin(sopt, &tv, sizeof tv,
2510                                     sizeof tv);
2511                         if (error)
2512                                 goto bad;
2513
2514                         /* assert(hz > 0); */
2515                         if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2516                             tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2517                                 error = EDOM;
2518                                 goto bad;
2519                         }
2520                         /* assert(tick > 0); */
2521                         /* assert(ULONG_MAX - INT_MAX >= 1000000); */
2522                         val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2523                         if (val > INT_MAX) {
2524                                 error = EDOM;
2525                                 goto bad;
2526                         }
2527                         if (val == 0 && tv.tv_usec != 0)
2528                                 val = 1;
2529
2530                         switch (sopt->sopt_name) {
2531                         case SO_SNDTIMEO:
2532                                 so->so_snd.sb_timeo = val;
2533                                 break;
2534                         case SO_RCVTIMEO:
2535                                 so->so_rcv.sb_timeo = val;
2536                                 break;
2537                         }
2538                         break;
2539
2540                 case SO_LABEL:
2541 #ifdef MAC
2542                         error = sooptcopyin(sopt, &extmac, sizeof extmac,
2543                             sizeof extmac);
2544                         if (error)
2545                                 goto bad;
2546                         error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2547                             so, &extmac);
2548 #else
2549                         error = EOPNOTSUPP;
2550 #endif
2551                         break;
2552
2553                 default:
2554                         error = ENOPROTOOPT;
2555                         break;
2556                 }
2557                 if (error == 0 && so->so_proto != NULL &&
2558                     so->so_proto->pr_ctloutput != NULL) {
2559                         (void) ((*so->so_proto->pr_ctloutput)
2560                                   (so, sopt));
2561                 }
2562         }
2563 bad:
2564         return (error);
2565 }
2566
2567 /*
2568  * Helper routine for getsockopt.
2569  */
2570 int
2571 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2572 {
2573         int     error;
2574         size_t  valsize;
2575
2576         error = 0;
2577
2578         /*
2579          * Documented get behavior is that we always return a value, possibly
2580          * truncated to fit in the user's buffer.  Traditional behavior is
2581          * that we always tell the user precisely how much we copied, rather
2582          * than something useful like the total amount we had available for
2583          * her.  Note that this interface is not idempotent; the entire
2584          * answer must generated ahead of time.
2585          */
2586         valsize = min(len, sopt->sopt_valsize);
2587         sopt->sopt_valsize = valsize;
2588         if (sopt->sopt_val != NULL) {
2589                 if (sopt->sopt_td != NULL)
2590                         error = copyout(buf, sopt->sopt_val, valsize);
2591                 else
2592                         bcopy(buf, sopt->sopt_val, valsize);
2593         }
2594         return (error);
2595 }
2596
2597 int
2598 sogetopt(struct socket *so, struct sockopt *sopt)
2599 {
2600         int     error, optval;
2601         struct  linger l;
2602         struct  timeval tv;
2603 #ifdef MAC
2604         struct mac extmac;
2605 #endif
2606
2607         error = 0;
2608         if (sopt->sopt_level != SOL_SOCKET) {
2609                 if (so->so_proto && so->so_proto->pr_ctloutput) {
2610                         return ((*so->so_proto->pr_ctloutput)
2611                                   (so, sopt));
2612                 } else
2613                         return (ENOPROTOOPT);
2614         } else {
2615                 switch (sopt->sopt_name) {
2616 #ifdef INET
2617                 case SO_ACCEPTFILTER:
2618                         error = do_getopt_accept_filter(so, sopt);
2619                         break;
2620 #endif
2621                 case SO_LINGER:
2622                         SOCK_LOCK(so);
2623                         l.l_onoff = so->so_options & SO_LINGER;
2624                         l.l_linger = so->so_linger;
2625                         SOCK_UNLOCK(so);
2626                         error = sooptcopyout(sopt, &l, sizeof l);
2627                         break;
2628
2629                 case SO_USELOOPBACK:
2630                 case SO_DONTROUTE:
2631                 case SO_DEBUG:
2632                 case SO_KEEPALIVE:
2633                 case SO_REUSEADDR:
2634                 case SO_REUSEPORT:
2635                 case SO_BROADCAST:
2636                 case SO_OOBINLINE:
2637                 case SO_ACCEPTCONN:
2638                 case SO_TIMESTAMP:
2639                 case SO_BINTIME:
2640                 case SO_NOSIGPIPE:
2641                         optval = so->so_options & sopt->sopt_name;
2642 integer:
2643                         error = sooptcopyout(sopt, &optval, sizeof optval);
2644                         break;
2645
2646                 case SO_TYPE:
2647                         optval = so->so_type;
2648                         goto integer;
2649
2650                 case SO_ERROR:
2651                         SOCK_LOCK(so);
2652                         optval = so->so_error;
2653                         so->so_error = 0;
2654                         SOCK_UNLOCK(so);
2655                         goto integer;
2656
2657                 case SO_SNDBUF:
2658                         optval = so->so_snd.sb_hiwat;
2659                         goto integer;
2660
2661                 case SO_RCVBUF:
2662                         optval = so->so_rcv.sb_hiwat;
2663                         goto integer;
2664
2665                 case SO_SNDLOWAT:
2666                         optval = so->so_snd.sb_lowat;
2667                         goto integer;
2668
2669                 case SO_RCVLOWAT:
2670                         optval = so->so_rcv.sb_lowat;
2671                         goto integer;
2672
2673                 case SO_SNDTIMEO:
2674                 case SO_RCVTIMEO:
2675                         optval = (sopt->sopt_name == SO_SNDTIMEO ?
2676                                   so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2677
2678                         tv.tv_sec = optval / hz;
2679                         tv.tv_usec = (optval % hz) * tick;
2680 #ifdef COMPAT_FREEBSD32
2681                         if (SV_CURPROC_FLAG(SV_ILP32)) {
2682                                 struct timeval32 tv32;
2683
2684                                 CP(tv, tv32, tv_sec);
2685                                 CP(tv, tv32, tv_usec);
2686                                 error = sooptcopyout(sopt, &tv32, sizeof tv32);
2687                         } else
2688 #endif
2689                                 error = sooptcopyout(sopt, &tv, sizeof tv);
2690                         break;
2691
2692                 case SO_LABEL:
2693 #ifdef MAC
2694                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2695                             sizeof(extmac));
2696                         if (error)
2697                                 return (error);
2698                         error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2699                             so, &extmac);
2700                         if (error)
2701                                 return (error);
2702                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
2703 #else
2704                         error = EOPNOTSUPP;
2705 #endif
2706                         break;
2707
2708                 case SO_PEERLABEL:
2709 #ifdef MAC
2710                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2711                             sizeof(extmac));
2712                         if (error)
2713                                 return (error);
2714                         error = mac_getsockopt_peerlabel(
2715                             sopt->sopt_td->td_ucred, so, &extmac);
2716                         if (error)
2717                                 return (error);
2718                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
2719 #else
2720                         error = EOPNOTSUPP;
2721 #endif
2722                         break;
2723
2724                 case SO_LISTENQLIMIT:
2725                         optval = so->so_qlimit;
2726                         goto integer;
2727
2728                 case SO_LISTENQLEN:
2729                         optval = so->so_qlen;
2730                         goto integer;
2731
2732                 case SO_LISTENINCQLEN:
2733                         optval = so->so_incqlen;
2734                         goto integer;
2735
2736                 default:
2737                         error = ENOPROTOOPT;
2738                         break;
2739                 }
2740                 return (error);
2741         }
2742 }
2743
2744 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
2745 int
2746 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2747 {
2748         struct mbuf *m, *m_prev;
2749         int sopt_size = sopt->sopt_valsize;
2750
2751         MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2752         if (m == NULL)
2753                 return ENOBUFS;
2754         if (sopt_size > MLEN) {
2755                 MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
2756                 if ((m->m_flags & M_EXT) == 0) {
2757                         m_free(m);
2758                         return ENOBUFS;
2759                 }
2760                 m->m_len = min(MCLBYTES, sopt_size);
2761         } else {
2762                 m->m_len = min(MLEN, sopt_size);
2763         }
2764         sopt_size -= m->m_len;
2765         *mp = m;
2766         m_prev = m;
2767
2768         while (sopt_size) {
2769                 MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2770                 if (m == NULL) {
2771                         m_freem(*mp);
2772                         return ENOBUFS;
2773                 }
2774                 if (sopt_size > MLEN) {
2775                         MCLGET(m, sopt->sopt_td != NULL ? M_WAIT :
2776                             M_DONTWAIT);
2777                         if ((m->m_flags & M_EXT) == 0) {
2778                                 m_freem(m);
2779                                 m_freem(*mp);
2780                                 return ENOBUFS;
2781                         }
2782                         m->m_len = min(MCLBYTES, sopt_size);
2783                 } else {
2784                         m->m_len = min(MLEN, sopt_size);
2785                 }
2786                 sopt_size -= m->m_len;
2787                 m_prev->m_next = m;
2788                 m_prev = m;
2789         }
2790         return (0);
2791 }
2792
2793 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2794 int
2795 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2796 {
2797         struct mbuf *m0 = m;
2798
2799         if (sopt->sopt_val == NULL)
2800                 return (0);
2801         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2802                 if (sopt->sopt_td != NULL) {
2803                         int error;
2804
2805                         error = copyin(sopt->sopt_val, mtod(m, char *),
2806                                        m->m_len);
2807                         if (error != 0) {
2808                                 m_freem(m0);
2809                                 return(error);
2810                         }
2811                 } else
2812                         bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2813                 sopt->sopt_valsize -= m->m_len;
2814                 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2815                 m = m->m_next;
2816         }
2817         if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
2818                 panic("ip6_sooptmcopyin");
2819         return (0);
2820 }
2821
2822 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2823 int
2824 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2825 {
2826         struct mbuf *m0 = m;
2827         size_t valsize = 0;
2828
2829         if (sopt->sopt_val == NULL)
2830                 return (0);
2831         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2832                 if (sopt->sopt_td != NULL) {
2833                         int error;
2834
2835                         error = copyout(mtod(m, char *), sopt->sopt_val,
2836                                        m->m_len);
2837                         if (error != 0) {
2838                                 m_freem(m0);
2839                                 return(error);
2840                         }
2841                 } else
2842                         bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2843                sopt->sopt_valsize -= m->m_len;
2844                sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2845                valsize += m->m_len;
2846                m = m->m_next;
2847         }
2848         if (m != NULL) {
2849                 /* enough soopt buffer should be given from user-land */
2850                 m_freem(m0);
2851                 return(EINVAL);
2852         }
2853         sopt->sopt_valsize = valsize;
2854         return (0);
2855 }
2856
2857 /*
2858  * sohasoutofband(): protocol notifies socket layer of the arrival of new
2859  * out-of-band data, which will then notify socket consumers.
2860  */
2861 void
2862 sohasoutofband(struct socket *so)
2863 {
2864
2865         if (so->so_sigio != NULL)
2866                 pgsigio(&so->so_sigio, SIGURG, 0);
2867         selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2868 }
2869
2870 int
2871 sopoll(struct socket *so, int events, struct ucred *active_cred,
2872     struct thread *td)
2873 {
2874
2875         return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2876             td));
2877 }
2878
2879 int
2880 sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2881     struct thread *td)
2882 {
2883         int revents = 0;
2884
2885         SOCKBUF_LOCK(&so->so_snd);
2886         SOCKBUF_LOCK(&so->so_rcv);
2887         if (events & (POLLIN | POLLRDNORM))
2888                 if (soreadabledata(so))
2889                         revents |= events & (POLLIN | POLLRDNORM);
2890
2891         if (events & (POLLOUT | POLLWRNORM))
2892                 if (sowriteable(so))
2893                         revents |= events & (POLLOUT | POLLWRNORM);
2894
2895         if (events & (POLLPRI | POLLRDBAND))
2896                 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2897                         revents |= events & (POLLPRI | POLLRDBAND);
2898
2899         if ((events & POLLINIGNEOF) == 0) {
2900                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2901                         revents |= events & (POLLIN | POLLRDNORM);
2902                         if (so->so_snd.sb_state & SBS_CANTSENDMORE)
2903                                 revents |= POLLHUP;
2904                 }
2905         }
2906
2907         if (revents == 0) {
2908                 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
2909                         selrecord(td, &so->so_rcv.sb_sel);
2910                         so->so_rcv.sb_flags |= SB_SEL;
2911                 }
2912
2913                 if (events & (POLLOUT | POLLWRNORM)) {
2914                         selrecord(td, &so->so_snd.sb_sel);
2915                         so->so_snd.sb_flags |= SB_SEL;
2916                 }
2917         }
2918
2919         SOCKBUF_UNLOCK(&so->so_rcv);
2920         SOCKBUF_UNLOCK(&so->so_snd);
2921         return (revents);
2922 }
2923
2924 int
2925 soo_kqfilter(struct file *fp, struct knote *kn)
2926 {
2927         struct socket *so = kn->kn_fp->f_data;
2928         struct sockbuf *sb;
2929
2930         switch (kn->kn_filter) {
2931         case EVFILT_READ:
2932                 if (so->so_options & SO_ACCEPTCONN)
2933                         kn->kn_fop = &solisten_filtops;
2934                 else
2935                         kn->kn_fop = &soread_filtops;
2936                 sb = &so->so_rcv;
2937                 break;
2938         case EVFILT_WRITE:
2939                 kn->kn_fop = &sowrite_filtops;
2940                 sb = &so->so_snd;
2941                 break;
2942         default:
2943                 return (EINVAL);
2944         }
2945
2946         SOCKBUF_LOCK(sb);
2947         knlist_add(&sb->sb_sel.si_note, kn, 1);
2948         sb->sb_flags |= SB_KNOTE;
2949         SOCKBUF_UNLOCK(sb);
2950         return (0);
2951 }
2952
2953 /*
2954  * Some routines that return EOPNOTSUPP for entry points that are not
2955  * supported by a protocol.  Fill in as needed.
2956  */
/*
 * Accept handler for protocols that do not support the operation: the
 * arguments are ignored and EOPNOTSUPP is returned.
 */
int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{

        return (EOPNOTSUPP);
}
2963
/*
 * Attach handler for protocols that do not support the operation: the
 * arguments are ignored and EOPNOTSUPP is returned.
 */
int
pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
{

        return (EOPNOTSUPP);
}
2970
/*
 * Bind handler for protocols that do not support the operation: the
 * arguments are ignored and EOPNOTSUPP is returned.
 */
int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

        return (EOPNOTSUPP);
}
2977
/*
 * Connect handler for protocols that do not support the operation: the
 * arguments are ignored and EOPNOTSUPP is returned.
 */
int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

        return (EOPNOTSUPP);
}
2984
/*
 * Connect2 handler for protocols that do not support the operation: the
 * arguments are ignored and EOPNOTSUPP is returned.
 */
int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{

        return (EOPNOTSUPP);
}
2991
2992 int
2993 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
2994     struct ifnet *ifp, struct thread *td)
2995 {
2996
2997         return EOPNOTSUPP;
2998 }
2999
/*
 * Disconnect handler for protocols that do not support the operation: the
 * argument is ignored and EOPNOTSUPP is returned.
 */
int
pru_disconnect_notsupp(struct socket *so)
{

        return (EOPNOTSUPP);
}
3006
/*
 * Stub listen entry point for protocols that do not implement it.
 * All arguments are ignored; the result is always EOPNOTSUPP.
 */
int
pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
{

	return (EOPNOTSUPP);
}
3013
/*
 * Stub peeraddr entry point for protocols that do not implement it.
 * Both arguments are ignored (*nam is left untouched); the result is
 * always EOPNOTSUPP.
 */
int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{

	return (EOPNOTSUPP);
}
3020
/*
 * Stub rcvd entry point for protocols that do not implement it.
 * Both arguments are ignored; the result is always EOPNOTSUPP.
 */
int
pru_rcvd_notsupp(struct socket *so, int flags)
{

	return (EOPNOTSUPP);
}
3027
/*
 * Stub rcvoob entry point for protocols that do not implement it.
 * All arguments are ignored (the mbuf is not touched); the result is
 * always EOPNOTSUPP.
 */
int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{

	return (EOPNOTSUPP);
}
3034
/*
 * Stub send entry point for protocols that do not implement it.
 * All arguments are ignored; in particular neither m nor control is
 * freed or otherwise touched here.  The result is always EOPNOTSUPP.
 */
int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *addr, struct mbuf *control, struct thread *td)
{

	return (EOPNOTSUPP);
}
3042
3043 /*
3044  * This isn't really a ``null'' operation, but it's the default one and
3045  * doesn't do anything destructive.
3046  */
3047 int
3048 pru_sense_null(struct socket *so, struct stat *sb)
3049 {
3050
3051         sb->st_blksize = so->so_snd.sb_hiwat;
3052         return 0;
3053 }
3054
/*
 * Stub shutdown entry point for protocols that do not implement it.
 * The socket is ignored; the result is always EOPNOTSUPP.
 */
int
pru_shutdown_notsupp(struct socket *so)
{

	return (EOPNOTSUPP);
}
3061
/*
 * Stub sockaddr entry point for protocols that do not implement it.
 * Both arguments are ignored (*nam is left untouched); the result is
 * always EOPNOTSUPP.
 */
int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{

	return (EOPNOTSUPP);
}
3068
/*
 * Stub sosend entry point for protocols that do not implement it.
 * All arguments are ignored; neither top nor control is freed or
 * otherwise touched here.  The result is always EOPNOTSUPP.
 */
int
pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{

	return (EOPNOTSUPP);
}
3076
/*
 * Stub soreceive entry point for protocols that do not implement it.
 * All arguments are ignored and none of the output pointers is written.
 * The result is always EOPNOTSUPP.
 */
int
pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{

	return (EOPNOTSUPP);
}
3084
/*
 * Stub sopoll entry point for protocols that do not implement it.
 * All arguments are ignored; the result is always EOPNOTSUPP (an errno
 * value, not a poll event mask).
 */
int
pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
    struct thread *td)
{

	return (EOPNOTSUPP);
}
3092
3093 static void
3094 filt_sordetach(struct knote *kn)
3095 {
3096         struct socket *so = kn->kn_fp->f_data;
3097
3098         SOCKBUF_LOCK(&so->so_rcv);
3099         knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
3100         if (knlist_empty(&so->so_rcv.sb_sel.si_note))
3101                 so->so_rcv.sb_flags &= ~SB_KNOTE;
3102         SOCKBUF_UNLOCK(&so->so_rcv);
3103 }
3104
3105 /*ARGSUSED*/
3106 static int
3107 filt_soread(struct knote *kn, long hint)
3108 {
3109         struct socket *so;
3110
3111         so = kn->kn_fp->f_data;
3112         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3113
3114         kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3115         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3116                 kn->kn_flags |= EV_EOF;
3117                 kn->kn_fflags = so->so_error;
3118                 return (1);
3119         } else if (so->so_error)        /* temporary udp error */
3120                 return (1);
3121         else if (kn->kn_sfflags & NOTE_LOWAT)
3122                 return (kn->kn_data >= kn->kn_sdata);
3123         else
3124                 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
3125 }
3126
3127 static void
3128 filt_sowdetach(struct knote *kn)
3129 {
3130         struct socket *so = kn->kn_fp->f_data;
3131
3132         SOCKBUF_LOCK(&so->so_snd);
3133         knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
3134         if (knlist_empty(&so->so_snd.sb_sel.si_note))
3135                 so->so_snd.sb_flags &= ~SB_KNOTE;
3136         SOCKBUF_UNLOCK(&so->so_snd);
3137 }
3138
3139 /*ARGSUSED*/
3140 static int
3141 filt_sowrite(struct knote *kn, long hint)
3142 {
3143         struct socket *so;
3144
3145         so = kn->kn_fp->f_data;
3146         SOCKBUF_LOCK_ASSERT(&so->so_snd);
3147         kn->kn_data = sbspace(&so->so_snd);
3148         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3149                 kn->kn_flags |= EV_EOF;
3150                 kn->kn_fflags = so->so_error;
3151                 return (1);
3152         } else if (so->so_error)        /* temporary udp error */
3153                 return (1);
3154         else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3155             (so->so_proto->pr_flags & PR_CONNREQUIRED))
3156                 return (0);
3157         else if (kn->kn_sfflags & NOTE_LOWAT)
3158                 return (kn->kn_data >= kn->kn_sdata);
3159         else
3160                 return (kn->kn_data >= so->so_snd.sb_lowat);
3161 }
3162
3163 /*ARGSUSED*/
3164 static int
3165 filt_solisten(struct knote *kn, long hint)
3166 {
3167         struct socket *so = kn->kn_fp->f_data;
3168
3169         kn->kn_data = so->so_qlen;
3170         return (! TAILQ_EMPTY(&so->so_comp));
3171 }
3172
3173 int
3174 socheckuid(struct socket *so, uid_t uid)
3175 {
3176
3177         if (so == NULL)
3178                 return (EPERM);
3179         if (so->so_cred->cr_uid != uid)
3180                 return (EPERM);
3181         return (0);
3182 }
3183
3184 static int
3185 sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
3186 {
3187         int error;
3188         int val;
3189
3190         val = somaxconn;
3191         error = sysctl_handle_int(oidp, &val, 0, req);
3192         if (error || !req->newptr )
3193                 return (error);
3194
3195         if (val < 1 || val > USHRT_MAX)
3196                 return (EINVAL);
3197
3198         somaxconn = val;
3199         return (0);
3200 }
3201
3202 /*
3203  * These functions are used by protocols to notify the socket layer (and its
3204  * consumers) of state changes in the sockets driven by protocol-side events.
3205  */
3206
3207 /*
3208  * Procedures to manipulate state flags of socket and do appropriate wakeups.
3209  *
3210  * Normal sequence from the active (originating) side is that
3211  * soisconnecting() is called during processing of connect() call, resulting
3212  * in an eventual call to soisconnected() if/when the connection is
3213  * established.  When the connection is torn down soisdisconnecting() is
3214  * called during processing of disconnect() call, and soisdisconnected() is
3215  * called when the connection to the peer is totally severed.  The semantics
3216  * of these routines are such that connectionless protocols can call
3217  * soisconnected() and soisdisconnected() only, bypassing the in-progress
3218  * calls when setting up a ``connection'' takes no time.
3219  *
3220  * From the passive side, a socket is created with two queues of sockets:
3221  * so_incomp for connections in progress and so_comp for connections already
3222  * made and awaiting user acceptance.  As a protocol is preparing incoming
3223  * connections, it creates a socket structure queued on so_incomp by calling
3224  * sonewconn().  When the connection is established, soisconnected() is
3225  * called, and transfers the socket structure to so_comp, making it available
3226  * to accept().
3227  *
3228  * If a socket is closed with sockets on either so_incomp or so_comp, these
3229  * sockets are dropped.
3230  *
3231  * If higher-level protocols are implemented in the kernel, the wakeups done
3232  * here will sometimes cause software-interrupt process scheduling.
3233  */
3234 void
3235 soisconnecting(struct socket *so)
3236 {
3237
3238         SOCK_LOCK(so);
3239         so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3240         so->so_state |= SS_ISCONNECTING;
3241         SOCK_UNLOCK(so);
3242 }
3243
/*
 * Mark a socket as connected and wake up whoever is waiting on it.
 *
 * With the accept lock and the socket lock held, the in-progress state
 * bits are cleared and SS_ISCONNECTED is set.  What happens next depends
 * on whether the socket sits on a listening socket's incomplete queue
 * (so_head != NULL and SQ_INCOMP set):
 *
 *  - On the incomplete queue, no accept filter: the socket is moved from
 *    head->so_incomp to head->so_comp (adjusting so_incqlen / so_qlen and
 *    the SQ_* state) and the listening socket is woken up.
 *  - On the incomplete queue, accept filter set: the filter callback is
 *    installed as the receive upcall, SO_ACCEPTFILTER is cleared, and the
 *    callback is invoked once.  If it answers SU_ISCONNECTED the upcall
 *    is removed again and the function restarts, this time taking the
 *    no-filter branch because the option bit is now clear.
 *  - Otherwise: sleepers on so_timeo and on both socket buffers of the
 *    socket itself are woken up.
 */
void
soisconnected(struct socket *so)
{
	struct socket *head;
	int ret;

restart:
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;
	head = so->so_head;
	if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
			/*
			 * Queue manipulation is done with only the accept
			 * lock held; the socket lock is dropped first.
			 */
			SOCK_UNLOCK(so);
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			so->so_qstate &= ~SQ_INCOMP;
			TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
			head->so_qlen++;
			so->so_qstate |= SQ_COMP;
			ACCEPT_UNLOCK();
			/* Notify the listening socket, not the new one. */
			sorwakeup(head);
			wakeup_one(&head->so_timeo);
		} else {
			/*
			 * The accept lock is released here, but the socket
			 * lock stays held across the upcall registration and
			 * the callback invocation below.
			 */
			ACCEPT_UNLOCK();
			soupcall_set(so, SO_RCV,
			    head->so_accf->so_accept_filter->accf_callback,
			    head->so_accf->so_accept_filter_arg);
			so->so_options &= ~SO_ACCEPTFILTER;
			ret = head->so_accf->so_accept_filter->accf_callback(so,
			    head->so_accf->so_accept_filter_arg, M_DONTWAIT);
			if (ret == SU_ISCONNECTED)
				soupcall_clear(so, SO_RCV);
			SOCK_UNLOCK(so);
			/*
			 * The filter is already satisfied: start over so the
			 * socket is moved to the completed queue.
			 */
			if (ret == SU_ISCONNECTED)
				goto restart;
		}
		return;
	}
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();
	wakeup(&so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
}
3290
/*
 * Mark a socket as being in the process of disconnecting.
 *
 * SS_ISCONNECTING is cleared and SS_ISDISCONNECTING set, both socket
 * buffers are flagged as unable to carry more data (SBS_CANTRCVMORE /
 * SBS_CANTSENDMORE), and sleepers on each buffer and on so_timeo are
 * woken up.
 *
 * NOTE(review): no SOCKBUF_UNLOCK appears in this function, so the
 * *_locked wakeup helpers presumably release the buffer lock they are
 * entered with -- confirm against their definitions.
 */
void
soisdisconnecting(struct socket *so)
{

	/*
	 * Note: This code assumes that SOCK_LOCK(so) and
	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= SS_ISDISCONNECTING;
	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
	sorwakeup_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_state |= SBS_CANTSENDMORE;
	sowwakeup_locked(so);
	wakeup(&so->so_timeo);
}
3309
/*
 * Mark a socket as fully disconnected.
 *
 * All of SS_ISCONNECTING, SS_ISCONNECTED and SS_ISDISCONNECTING are
 * cleared and SS_ISDISCONNECTED is set.  Both socket buffers are flagged
 * as unable to carry more data, sb_cc bytes are dropped from the send
 * buffer, and sleepers on each buffer and on so_timeo are woken up.
 *
 * NOTE(review): no SOCKBUF_UNLOCK appears in this function, so the
 * *_locked wakeup helpers presumably release the buffer lock they are
 * entered with -- confirm against their definitions.
 */
void
soisdisconnected(struct socket *so)
{

	/*
	 * Note: This code assumes that SOCK_LOCK(so) and
	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISDISCONNECTED;
	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
	sorwakeup_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_state |= SBS_CANTSENDMORE;
	/* Discard whatever is still queued for sending. */
	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
	sowwakeup_locked(so);
	wakeup(&so->so_timeo);
}
3329
3330 /*
3331  * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3332  */
3333 struct sockaddr *
3334 sodupsockaddr(const struct sockaddr *sa, int mflags)
3335 {
3336         struct sockaddr *sa2;
3337
3338         sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3339         if (sa2)
3340                 bcopy(sa, sa2, sa->sa_len);
3341         return sa2;
3342 }
3343
3344 /*
3345  * Register per-socket buffer upcalls.
3346  */
3347 void
3348 soupcall_set(struct socket *so, int which,
3349     int (*func)(struct socket *, void *, int), void *arg)
3350 {
3351         struct sockbuf *sb;
3352
3353         switch (which) {
3354         case SO_RCV:
3355                 sb = &so->so_rcv;
3356                 break;
3357         case SO_SND:
3358                 sb = &so->so_snd;
3359                 break;
3360         default:
3361                 panic("soupcall_set: bad which");
3362         }
3363         SOCKBUF_LOCK_ASSERT(sb);
3364 #if 0
3365         /* XXX: accf_http actually wants to do this on purpose. */
3366         KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
3367 #endif
3368         sb->sb_upcall = func;
3369         sb->sb_upcallarg = arg;
3370         sb->sb_flags |= SB_UPCALL;
3371 }
3372
3373 void
3374 soupcall_clear(struct socket *so, int which)
3375 {
3376         struct sockbuf *sb;
3377
3378         switch (which) {
3379         case SO_RCV:
3380                 sb = &so->so_rcv;
3381                 break;
3382         case SO_SND:
3383                 sb = &so->so_snd;
3384                 break;
3385         default:
3386                 panic("soupcall_clear: bad which");
3387         }
3388         SOCKBUF_LOCK_ASSERT(sb);
3389         KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
3390         sb->sb_upcall = NULL;
3391         sb->sb_upcallarg = NULL;
3392         sb->sb_flags &= ~SB_UPCALL;
3393 }
3394
3395 /*
3396  * Create an external-format (``xsocket'') structure using the information in
3397  * the kernel-format socket structure pointed to by so.  This is done to
3398  * reduce the spew of irrelevant information over this interface, to isolate
3399  * user code from changes in the kernel structure, and potentially to provide
3400  * information-hiding if we decide that some of this information should be
3401  * hidden from users.
3402  */
3403 void
3404 sotoxsocket(struct socket *so, struct xsocket *xso)
3405 {
3406
3407         xso->xso_len = sizeof *xso;
3408         xso->xso_so = so;
3409         xso->so_type = so->so_type;
3410         xso->so_options = so->so_options;
3411         xso->so_linger = so->so_linger;
3412         xso->so_state = so->so_state;
3413         xso->so_pcb = so->so_pcb;
3414         xso->xso_protocol = so->so_proto->pr_protocol;
3415         xso->xso_family = so->so_proto->pr_domain->dom_family;
3416         xso->so_qlen = so->so_qlen;
3417         xso->so_incqlen = so->so_incqlen;
3418         xso->so_qlimit = so->so_qlimit;
3419         xso->so_timeo = so->so_timeo;
3420         xso->so_error = so->so_error;
3421         xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
3422         xso->so_oobmark = so->so_oobmark;
3423         sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3424         sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3425         xso->so_uid = so->so_cred->cr_uid;
3426 }
3427
3428
3429 /*
3430  * Socket accessor functions to provide external consumers with
3431  * a safe interface to socket state
3432  *
3433  */
3434
3435 void
3436 so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg)
3437 {
3438
3439         TAILQ_FOREACH(so, &so->so_comp, so_list)
3440                 func(so, arg);
3441 }
3442
/*
 * Accessor: return the address of the socket's receive buffer
 * (so->so_rcv).  No lock is taken.
 */
struct sockbuf *
so_sockbuf_rcv(struct socket *so)
{

	return (&so->so_rcv);
}
3449
/*
 * Accessor: return the address of the socket's send buffer
 * (so->so_snd).  No lock is taken.
 */
struct sockbuf *
so_sockbuf_snd(struct socket *so)
{

	return (&so->so_snd);
}
3456
/*
 * Accessor: return the current value of so->so_state.  No lock is taken.
 */
int
so_state_get(const struct socket *so)
{

	return (so->so_state);
}
3463
/*
 * Accessor: overwrite so->so_state with val.  The whole field is
 * replaced, not individual bits, and no lock is taken.
 */
void
so_state_set(struct socket *so, int val)
{

	so->so_state = val;
}
3470
/*
 * Accessor: return the current value of so->so_options.  No lock is
 * taken.
 */
int
so_options_get(const struct socket *so)
{

	return (so->so_options);
}
3477
/*
 * Accessor: overwrite so->so_options with val.  The whole field is
 * replaced, not individual bits, and no lock is taken.
 */
void
so_options_set(struct socket *so, int val)
{

	so->so_options = val;
}
3484
/*
 * Accessor: return the current value of so->so_error.  The field is only
 * read, not cleared, and no lock is taken.
 */
int
so_error_get(const struct socket *so)
{

	return (so->so_error);
}
3491
/*
 * Accessor: overwrite so->so_error with val.  No lock is taken and no
 * wakeup is issued.
 */
void
so_error_set(struct socket *so, int val)
{

	so->so_error = val;
}
3498
/*
 * Accessor: return the current value of so->so_linger.  No lock is
 * taken.
 */
int
so_linger_get(const struct socket *so)
{

	return (so->so_linger);
}
3505
/*
 * Accessor: overwrite so->so_linger with val.  No lock is taken.
 */
void
so_linger_set(struct socket *so, int val)
{

	so->so_linger = val;
}
3512
/*
 * Accessor: return the socket's protocol switch pointer (so->so_proto).
 * No lock is taken.
 */
struct protosw *
so_protosw_get(const struct socket *so)
{

	return (so->so_proto);
}
3519
/*
 * Accessor: point so->so_proto at val.  Only the pointer is stored; no
 * lock is taken.
 */
void
so_protosw_set(struct socket *so, struct protosw *val)
{

	so->so_proto = val;
}
3526
/*
 * Function wrapper that invokes sorwakeup() on the socket, so external
 * consumers get it as a callable symbol.
 */
void
so_sorwakeup(struct socket *so)
{

	sorwakeup(so);
}
3533
/*
 * Function wrapper that invokes sowwakeup() on the socket, so external
 * consumers get it as a callable symbol.
 */
void
so_sowwakeup(struct socket *so)
{

	sowwakeup(so);
}
3540
/*
 * Function wrapper that invokes sorwakeup_locked() on the socket.
 * NOTE(review): the _locked suffix suggests the caller must already hold
 * the receive buffer lock -- confirm against sorwakeup_locked().
 */
void
so_sorwakeup_locked(struct socket *so)
{

	sorwakeup_locked(so);
}
3547
/*
 * Function wrapper that invokes sowwakeup_locked() on the socket.
 * NOTE(review): the _locked suffix suggests the caller must already hold
 * the send buffer lock -- confirm against sowwakeup_locked().
 */
void
so_sowwakeup_locked(struct socket *so)
{

	sowwakeup_locked(so);
}
3554
/*
 * Function wrapper that applies SOCK_LOCK() to the socket.  Pair with
 * so_unlock().
 */
void
so_lock(struct socket *so)
{
	SOCK_LOCK(so);
}
3560
/*
 * Function wrapper that applies SOCK_UNLOCK() to the socket.  Pair with
 * so_lock().
 */
void
so_unlock(struct socket *so)
{
	SOCK_UNLOCK(so);
}