sys/kern/uipc_socket.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
   3  *      The Regents of the University of California.
   4  * Copyright (c) 2004 The FreeBSD Foundation
   5  * Copyright (c) 2004-2008 Robert N. M. Watson
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  * 4. Neither the name of the University nor the names of its contributors
  17  *    may be used to endorse or promote products derived from this software
  18  *    without specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30  * SUCH DAMAGE.
  31  *
  32  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
  33  */
  34
  35 /*
  36  * Comments on the socket life cycle:
  37  *
   38  * soalloc() sets up socket layer state for a socket, called only by
  39  * socreate() and sonewconn().  Socket layer private.
  40  *
   41  * sodealloc() tears down socket layer state for a socket, called only by
   42  * sofree(), socreate() and sonewconn().  Socket layer private.
  43  *
  44  * pru_attach() associates protocol layer state with an allocated socket;
  45  * called only once, may fail, aborting socket allocation.  This is called
  46  * from socreate() and sonewconn().  Socket layer private.
  47  *
  48  * pru_detach() disassociates protocol layer state from an attached socket,
  49  * and will be called exactly once for sockets in which pru_attach() has
  50  * been successfully called.  If pru_attach() returned an error,
  51  * pru_detach() will not be called.  Socket layer private.
  52  *
  53  * pru_abort() and pru_close() notify the protocol layer that the last
  54  * consumer of a socket is starting to tear down the socket, and that the
  55  * protocol should terminate the connection.  Historically, pru_abort() also
  56  * detached protocol state from the socket state, but this is no longer the
  57  * case.
  58  *
  59  * socreate() creates a socket and attaches protocol state.  This is a public
  60  * interface that may be used by socket layer consumers to create new
  61  * sockets.
  62  *
  63  * sonewconn() creates a socket and attaches protocol state.  This is a
  64  * public interface  that may be used by protocols to create new sockets when
  65  * a new connection is received and will be available for accept() on a
  66  * listen socket.
  67  *
  68  * soclose() destroys a socket after possibly waiting for it to disconnect.
  69  * This is a public interface that socket consumers should use to close and
  70  * release a socket when done with it.
  71  *
  72  * soabort() destroys a socket without waiting for it to disconnect (used
  73  * only for incoming connections that are already partially or fully
  74  * connected).  This is used internally by the socket layer when clearing
  75  * listen socket queues (due to overflow or close on the listen socket), but
  76  * is also a public interface protocols may use to abort connections in
  77  * their incomplete listen queues should they no longer be required.  Sockets
  78  * placed in completed connection listen queues should not be aborted for
  79  * reasons described in the comment above the soclose() implementation.  This
  80  * is not a general purpose close routine, and except in the specific
  81  * circumstances described here, should not be used.
  82  *
  83  * sofree() will free a socket and its protocol state if all references on
  84  * the socket have been released, and is the public interface to attempt to
  85  * free a socket when a reference is removed.  This is a socket layer private
  86  * interface.
  87  *
  88  * NOTE: In addition to socreate() and soclose(), which provide a single
  89  * socket reference to the consumer to be managed as required, there are two
  90  * calls to explicitly manage socket references, soref(), and sorele().
  91  * Currently, these are generally required only when transitioning a socket
  92  * from a listen queue to a file descriptor, in order to prevent garbage
  93  * collection of the socket at an untimely moment.  For a number of reasons,
  94  * these interfaces are not preferred, and should be avoided.
  95  *
  96  * NOTE: With regard to VNETs the general rule is that callers do not set
  97  * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
  98  * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
  99  * and sorflush(), which are usually called from a pre-set VNET context.
 100  * sopoll() currently does not need a VNET context to be set.
 101  */
 102
 103 #include <sys/cdefs.h>
 104 __FBSDID("$FreeBSD$");
 105
 106 #include "opt_inet.h"
 107 #include "opt_inet6.h"
 108 #include "opt_zero.h"
 109 #include "opt_compat.h"
 110
 111 #include <sys/param.h>
 112 #include <sys/systm.h>
 113 #include <sys/fcntl.h>
 114 #include <sys/limits.h>
 115 #include <sys/lock.h>
 116 #include <sys/mac.h>
 117 #include <sys/malloc.h>
 118 #include <sys/mbuf.h>
 119 #include <sys/mutex.h>
 120 #include <sys/domain.h>
 121 #include <sys/file.h>                   /* for struct knote */
 122 #include <sys/kernel.h>
 123 #include <sys/event.h>
 124 #include <sys/eventhandler.h>
 125 #include <sys/poll.h>
 126 #include <sys/proc.h>
 127 #include <sys/protosw.h>
 128 #include <sys/socket.h>
 129 #include <sys/socketvar.h>
 130 #include <sys/resourcevar.h>
 131 #include <net/route.h>
 132 #include <sys/signalvar.h>
 133 #include <sys/stat.h>
 134 #include <sys/sx.h>
 135 #include <sys/sysctl.h>
 136 #include <sys/uio.h>
 137 #include <sys/jail.h>
 138
 139 #include <net/vnet.h>
 140
 141 #include <security/mac/mac_framework.h>
 142
 143 #include <vm/uma.h>
 144
 145 #ifdef COMPAT_FREEBSD32
 146 #include <sys/mount.h>
 147 #include <sys/sysent.h>
 148 #include <compat/freebsd32/freebsd32.h>
 149 #endif
 150
 151 static int      soreceive_rcvoob(struct socket *so, struct uio *uio,
 152                     int flags);
 153
 154 static void     filt_sordetach(struct knote *kn);
 155 static int      filt_soread(struct knote *kn, long hint);
 156 static void     filt_sowdetach(struct knote *kn);
 157 static int      filt_sowrite(struct knote *kn, long hint);
 158 static int      filt_solisten(struct knote *kn, long hint);
 159
 160 static struct filterops solisten_filtops = {
 161         .f_isfd = 1,
 162         .f_detach = filt_sordetach,
 163         .f_event = filt_solisten,
 164 };
 165 static struct filterops soread_filtops = {
 166         .f_isfd = 1,
 167         .f_detach = filt_sordetach,
 168         .f_event = filt_soread,
 169 };
 170 static struct filterops sowrite_filtops = {
 171         .f_isfd = 1,
 172         .f_detach = filt_sowdetach,
 173         .f_event = filt_sowrite,
 174 };
 175
 176 uma_zone_t socket_zone;
 177 so_gen_t        so_gencnt;      /* generation count for sockets */
 178
 179 int     maxsockets;
 180
 181 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
 182 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 183
 184 #define VNET_SO_ASSERT(so)                                              \
 185         VNET_ASSERT(curvnet != NULL,                                    \
 186             ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
 187
 188 static int somaxconn = SOMAXCONN;
 189 static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
 190 /* XXX: we dont have SYSCTL_USHORT */
 191 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
 192     0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
 193     "queue size");
 194 static int numopensockets;
 195 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
 196     &numopensockets, 0, "Number of open sockets");
 197 #ifdef ZERO_COPY_SOCKETS
 198 /* These aren't static because they're used in other files. */
 199 int so_zero_copy_send = 1;
 200 int so_zero_copy_receive = 1;
 201 SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
 202     "Zero copy controls");
 203 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
 204     &so_zero_copy_receive, 0, "Enable zero copy receive");
 205 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
 206     &so_zero_copy_send, 0, "Enable zero copy send");
 207 #endif /* ZERO_COPY_SOCKETS */
 208
 209 /*
 210  * accept_mtx locks down per-socket fields relating to accept queues.  See
 211  * socketvar.h for an annotation of the protected fields of struct socket.
 212  */
 213 struct mtx accept_mtx;
 214 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
 215
 216 /*
 217  * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 218  * so_gencnt field.
 219  */
 220 static struct mtx so_global_mtx;
 221 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
 222
 223 /*
 224  * General IPC sysctl name space, used by sockets and a variety of other IPC
 225  * types.
 226  */
 227 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
 228
 229 /*
 230  * Sysctl to get and set the maximum global sockets limit.  Notify protocols
 231  * of the change so that they can update their dependent limits as required.
 232  */
 233 static int
 234 sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
 235 {
 236         int error, newmaxsockets;
 237
 238         newmaxsockets = maxsockets;
 239         error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
 240         if (error == 0 && req->newptr) {
 241                 if (newmaxsockets > maxsockets) {
 242                         maxsockets = newmaxsockets;
 243                         if (maxsockets > ((maxfiles / 4) * 3)) {
 244                                 maxfiles = (maxsockets * 5) / 4;
 245                                 maxfilesperproc = (maxfiles * 9) / 10;
 246                         }
 247                         EVENTHANDLER_INVOKE(maxsockets_change);
 248                 } else
 249                         error = EINVAL;
 250         }
 251         return (error);
 252 }
 253
 254 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
 255     &maxsockets, 0, sysctl_maxsockets, "IU",
 256     "Maximum number of sockets avaliable");
 257
 258 /*
 259  * Initialise maxsockets.  This SYSINIT must be run after
 260  * tunable_mbinit().
 261  */
 262 static void
 263 init_maxsockets(void *ignored)
 264 {
 265
 266         TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
 267         maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
 268 }
 269 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
 270
 271 /*
 272  * Socket operation routines.  These routines are called by the routines in
 273  * sys_socket.c or from a system process, and implement the semantics of
 274  * socket operations by switching out to the protocol specific routines.
 275  */
 276
 277 /*
 278  * Get a socket structure from our zone, and initialize it.  Note that it
 279  * would probably be better to allocate socket and PCB at the same time, but
 280  * I'm not convinced that all the protocols can be easily modified to do
 281  * this.
 282  *
 283  * soalloc() returns a socket with a ref count of 0.
 284  */
 285 static struct socket *
 286 soalloc(struct vnet *vnet)
 287 {
 288         struct socket *so;
 289
 290         so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
 291         if (so == NULL)
 292                 return (NULL);
 293 #ifdef MAC
 294         if (mac_socket_init(so, M_NOWAIT) != 0) {
 295                 uma_zfree(socket_zone, so);
 296                 return (NULL);
 297         }
 298 #endif
 299         SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
 300         SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
 301         sx_init(&so->so_snd.sb_sx, "so_snd_sx");
 302         sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
 303         TAILQ_INIT(&so->so_aiojobq);
 304         mtx_lock(&so_global_mtx);
 305         so->so_gencnt = ++so_gencnt;
 306         ++numopensockets;
 307 #ifdef VIMAGE
 308         VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
 309             __func__, __LINE__, so));
 310         vnet->vnet_sockcnt++;
 311         so->so_vnet = vnet;
 312 #endif
 313         mtx_unlock(&so_global_mtx);
 314         return (so);
 315 }
 316
 317 /*
 318  * Free the storage associated with a socket at the socket layer, tear down
 319  * locks, labels, etc.  All protocol state is assumed already to have been
 320  * torn down (and possibly never set up) by the caller.
 321  */
 322 static void
 323 sodealloc(struct socket *so)
 324 {
 325
 326         KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
 327         KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
 328
 329         mtx_lock(&so_global_mtx);
 330         so->so_gencnt = ++so_gencnt;
 331         --numopensockets;       /* Could be below, but faster here. */
 332 #ifdef VIMAGE
 333         VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
 334             __func__, __LINE__, so));
 335         so->so_vnet->vnet_sockcnt--;
 336 #endif
 337         mtx_unlock(&so_global_mtx);
 338         if (so->so_rcv.sb_hiwat)
 339                 (void)chgsbsize(so->so_cred->cr_uidinfo,
 340                     &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
 341         if (so->so_snd.sb_hiwat)
 342                 (void)chgsbsize(so->so_cred->cr_uidinfo,
 343                     &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
 344 #ifdef INET
 345         /* remove acccept filter if one is present. */
 346         if (so->so_accf != NULL)
 347                 do_setopt_accept_filter(so, NULL);
 348 #endif
 349 #ifdef MAC
 350         mac_socket_destroy(so);
 351 #endif
 352         crfree(so->so_cred);
 353         sx_destroy(&so->so_snd.sb_sx);
 354         sx_destroy(&so->so_rcv.sb_sx);
 355         SOCKBUF_LOCK_DESTROY(&so->so_snd);
 356         SOCKBUF_LOCK_DESTROY(&so->so_rcv);
 357         uma_zfree(socket_zone, so);
 358 }
 359
 360 /*
 361  * socreate returns a socket with a ref count of 1.  The socket should be
 362  * closed with soclose().
 363  */
 364 int
 365 socreate(int dom, struct socket **aso, int type, int proto,
 366     struct ucred *cred, struct thread *td)
 367 {
 368         struct protosw *prp;
 369         struct socket *so;
 370         int error;
 371
 372         if (proto)
 373                 prp = pffindproto(dom, proto, type);
 374         else
 375                 prp = pffindtype(dom, type);
 376
 377         if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
 378             prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
 379                 return (EPROTONOSUPPORT);
 380
 381         if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
 382                 return (EPROTONOSUPPORT);
 383
 384         if (prp->pr_type != type)
 385                 return (EPROTOTYPE);
 386         so = soalloc(CRED_TO_VNET(cred));
 387         if (so == NULL)
 388                 return (ENOBUFS);
 389
 390         TAILQ_INIT(&so->so_incomp);
 391         TAILQ_INIT(&so->so_comp);
 392         so->so_type = type;
 393         so->so_cred = crhold(cred);
 394         if ((prp->pr_domain->dom_family == PF_INET) ||
 395             (prp->pr_domain->dom_family == PF_ROUTE))
 396                 so->so_fibnum = td->td_proc->p_fibnum;
 397         else
 398                 so->so_fibnum = 0;
 399         so->so_proto = prp;
 400 #ifdef MAC
 401         mac_socket_create(cred, so);
 402 #endif
 403         knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
 404         knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
 405         so->so_count = 1;
 406         /*
 407          * Auto-sizing of socket buffers is managed by the protocols and
 408          * the appropriate flags must be set in the pru_attach function.
 409          */
 410         CURVNET_SET(so->so_vnet);
 411         error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
 412         CURVNET_RESTORE();
 413         if (error) {
 414                 KASSERT(so->so_count == 1, ("socreate: so_count %d",
 415                     so->so_count));
 416                 so->so_count = 0;
 417                 sodealloc(so);
 418                 return (error);
 419         }
 420         *aso = so;
 421         return (0);
 422 }
 423
 424 #ifdef REGRESSION
 425 static int regression_sonewconn_earlytest = 1;
 426 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
 427     &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
 428 #endif
 429
 430 /*
 431  * When an attempt at a new connection is noted on a socket which accepts
 432  * connections, sonewconn is called.  If the connection is possible (subject
 433  * to space constraints, etc.) then we allocate a new structure, propoerly
 434  * linked into the data structure of the original socket, and return this.
 435  * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 436  *
 437  * Note: the ref count on the socket is 0 on return.
 438  */
 439 struct socket *
 440 sonewconn(struct socket *head, int connstatus)
 441 {
 442         struct socket *so;
 443         int over;
 444
 445         ACCEPT_LOCK();
 446         over = (head->so_qlen > 3 * head->so_qlimit / 2);
 447         ACCEPT_UNLOCK();
 448 #ifdef REGRESSION
 449         if (regression_sonewconn_earlytest && over)
 450 #else
 451         if (over)
 452 #endif
 453                 return (NULL);
 454         VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
 455             __func__, __LINE__, head));
 456         so = soalloc(head->so_vnet);
 457         if (so == NULL)
 458                 return (NULL);
 459         if ((head->so_options & SO_ACCEPTFILTER) != 0)
 460                 connstatus = 0;
 461         so->so_head = head;
 462         so->so_type = head->so_type;
 463         so->so_options = head->so_options &~ SO_ACCEPTCONN;
 464         so->so_linger = head->so_linger;
 465         so->so_state = head->so_state | SS_NOFDREF;
 466         so->so_fibnum = head->so_fibnum;
 467         so->so_proto = head->so_proto;
 468         so->so_cred = crhold(head->so_cred);
 469 #ifdef MAC
 470         mac_socket_newconn(head, so);
 471 #endif
 472         knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
 473         knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
 474         VNET_SO_ASSERT(head);
 475         if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
 476             (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
 477                 sodealloc(so);
 478                 return (NULL);
 479         }
 480         so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
 481         so->so_snd.sb_lowat = head->so_snd.sb_lowat;
 482         so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
 483         so->so_snd.sb_timeo = head->so_snd.sb_timeo;
 484         so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
 485         so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
 486         so->so_state |= connstatus;
 487         ACCEPT_LOCK();
 488         if (connstatus) {
 489                 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
 490                 so->so_qstate |= SQ_COMP;
 491                 head->so_qlen++;
 492         } else {
 493                 /*
 494                  * Keep removing sockets from the head until there's room for
 495                  * us to insert on the tail.  In pre-locking revisions, this
 496                  * was a simple if(), but as we could be racing with other
 497                  * threads and soabort() requires dropping locks, we must
 498                  * loop waiting for the condition to be true.
 499                  */
 500                 while (head->so_incqlen > head->so_qlimit) {
 501                         struct socket *sp;
 502                         sp = TAILQ_FIRST(&head->so_incomp);
 503                         TAILQ_REMOVE(&head->so_incomp, sp, so_list);
 504                         head->so_incqlen--;
 505                         sp->so_qstate &= ~SQ_INCOMP;
 506                         sp->so_head = NULL;
 507                         ACCEPT_UNLOCK();
 508                         soabort(sp);
 509                         ACCEPT_LOCK();
 510                 }
 511                 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
 512                 so->so_qstate |= SQ_INCOMP;
 513                 head->so_incqlen++;
 514         }
 515         ACCEPT_UNLOCK();
 516         if (connstatus) {
 517                 sorwakeup(head);
 518                 wakeup_one(&head->so_timeo);
 519         }
 520         return (so);
 521 }
 522
 523 int
 524 sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
 525 {
 526         int error;
 527
 528         CURVNET_SET(so->so_vnet);
 529         error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
 530         CURVNET_RESTORE();
 531         return error;
 532 }
 533
 534 /*
 535  * solisten() transitions a socket from a non-listening state to a listening
 536  * state, but can also be used to update the listen queue depth on an
 537  * existing listen socket.  The protocol will call back into the sockets
 538  * layer using solisten_proto_check() and solisten_proto() to check and set
 539  * socket-layer listen state.  Call backs are used so that the protocol can
 540  * acquire both protocol and socket layer locks in whatever order is required
 541  * by the protocol.
 542  *
 543  * Protocol implementors are advised to hold the socket lock across the
 544  * socket-layer test and set to avoid races at the socket layer.
 545  */
 546 int
 547 solisten(struct socket *so, int backlog, struct thread *td)
 548 {
 549         int error;
 550
 551         CURVNET_SET(so->so_vnet);
 552         error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
 553         CURVNET_RESTORE();
 554         return error;
 555 }
 556
 557 int
 558 solisten_proto_check(struct socket *so)
 559 {
 560
 561         SOCK_LOCK_ASSERT(so);
 562
 563         if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
 564             SS_ISDISCONNECTING))
 565                 return (EINVAL);
 566         return (0);
 567 }
 568
 569 void
 570 solisten_proto(struct socket *so, int backlog)
 571 {
 572
 573         SOCK_LOCK_ASSERT(so);
 574
 575         if (backlog < 0 || backlog > somaxconn)
 576                 backlog = somaxconn;
 577         so->so_qlimit = backlog;
 578         so->so_options |= SO_ACCEPTCONN;
 579 }
 580
 581 /*
 582  * Evaluate the reference count and named references on a socket; if no
 583  * references remain, free it.  This should be called whenever a reference is
 584  * released, such as in sorele(), but also when named reference flags are
 585  * cleared in socket or protocol code.
 586  *
 587  * sofree() will free the socket if:
 588  *
 589  * - There are no outstanding file descriptor references or related consumers
 590  *   (so_count == 0).
 591  *
 592  * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 593  *
 594  * - The protocol does not have an outstanding strong reference on the socket
 595  *   (SS_PROTOREF).
 596  *
 597  * - The socket is not in a completed connection queue, so a process has been
 598  *   notified that it is present.  If it is removed, the user process may
 599  *   block in accept() despite select() saying the socket was ready.
 600  */
 601 void
 602 sofree(struct socket *so)
 603 {
 604         struct protosw *pr = so->so_proto;
 605         struct socket *head;
 606
 607         ACCEPT_LOCK_ASSERT();
 608         SOCK_LOCK_ASSERT(so);
 609
 610         if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
 611             (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
 612                 SOCK_UNLOCK(so);
 613                 ACCEPT_UNLOCK();
 614                 return;
 615         }
 616
 617         head = so->so_head;
 618         if (head != NULL) {
 619                 KASSERT((so->so_qstate & SQ_COMP) != 0 ||
 620                     (so->so_qstate & SQ_INCOMP) != 0,
 621                     ("sofree: so_head != NULL, but neither SQ_COMP nor "
 622                     "SQ_INCOMP"));
 623                 KASSERT((so->so_qstate & SQ_COMP) == 0 ||
 624                     (so->so_qstate & SQ_INCOMP) == 0,
 625                     ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
 626                 TAILQ_REMOVE(&head->so_incomp, so, so_list);
 627                 head->so_incqlen--;
 628                 so->so_qstate &= ~SQ_INCOMP;
 629                 so->so_head = NULL;
 630         }
 631         KASSERT((so->so_qstate & SQ_COMP) == 0 &&
 632             (so->so_qstate & SQ_INCOMP) == 0,
 633             ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
 634             so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
 635         if (so->so_options & SO_ACCEPTCONN) {
 636                 KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
 637                 KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_comp populated"));
 638         }
 639         SOCK_UNLOCK(so);
 640         ACCEPT_UNLOCK();
 641
 642         VNET_SO_ASSERT(so);
 643         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
 644                 (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
 645         if (pr->pr_usrreqs->pru_detach != NULL)
 646                 (*pr->pr_usrreqs->pru_detach)(so);
 647
 648         /*
 649          * From this point on, we assume that no other references to this
 650          * socket exist anywhere else in the stack.  Therefore, no locks need
 651          * to be acquired or held.
 652          *
 653          * We used to do a lot of socket buffer and socket locking here, as
 654          * well as invoke sorflush() and perform wakeups.  The direct call to
 655          * dom_dispose() and sbrelease_internal() are an inlining of what was
 656          * necessary from sorflush().
 657          *
 658          * Notice that the socket buffer and kqueue state are torn down
 659          * before calling pru_detach.  This means that protocols shold not
 660          * assume they can perform socket wakeups, etc, in their detach code.
 661          */
 662         sbdestroy(&so->so_snd, so);
 663         sbdestroy(&so->so_rcv, so);
 664         knlist_destroy(&so->so_rcv.sb_sel.si_note);
 665         knlist_destroy(&so->so_snd.sb_sel.si_note);
 666         sodealloc(so);
 667 }
 668
 669 /*
 670  * Close a socket on last file table reference removal.  Initiate disconnect
 671  * if connected.  Free socket when disconnect complete.
 672  *
 673  * This function will sorele() the socket.  Note that soclose() may be called
 674  * prior to the ref count reaching zero.  The actual socket structure will
 675  * not be freed until the ref count reaches zero.
 676  */
 677 int
 678 soclose(struct socket *so)
 679 {
 680         int error = 0;
 681
 682         KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
 683
 684         CURVNET_SET(so->so_vnet);
 685         funsetown(&so->so_sigio);
 686         if (so->so_state & SS_ISCONNECTED) {
 687                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
 688                         error = sodisconnect(so);
 689                         if (error) {
 690                                 if (error == ENOTCONN)
 691                                         error = 0;
 692                                 goto drop;
 693                         }
 694                 }
 695                 if (so->so_options & SO_LINGER) {
 696                         if ((so->so_state & SS_ISDISCONNECTING) &&
 697                             (so->so_state & SS_NBIO))
 698                                 goto drop;
 699                         while (so->so_state & SS_ISCONNECTED) {
 700                                 error = tsleep(&so->so_timeo,
 701                                     PSOCK | PCATCH, "soclos", so->so_linger * hz);
 702                                 if (error)
 703                                         break;
 704                         }
 705                 }
 706         }
 707
 708 drop:
 709         if (so->so_proto->pr_usrreqs->pru_close != NULL)
 710                 (*so->so_proto->pr_usrreqs->pru_close)(so);
 711         if (so->so_options & SO_ACCEPTCONN) {
 712                 struct socket *sp;
 713                 ACCEPT_LOCK();
 714                 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
 715                         TAILQ_REMOVE(&so->so_incomp, sp, so_list);
 716                         so->so_incqlen--;
 717                         sp->so_qstate &= ~SQ_INCOMP;
 718                         sp->so_head = NULL;
 719                         ACCEPT_UNLOCK();
 720                         soabort(sp);
 721                         ACCEPT_LOCK();
 722                 }
 723                 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
 724                         TAILQ_REMOVE(&so->so_comp, sp, so_list);
 725                         so->so_qlen--;
 726                         sp->so_qstate &= ~SQ_COMP;
 727                         sp->so_head = NULL;
 728                         ACCEPT_UNLOCK();
 729                         soabort(sp);
 730                         ACCEPT_LOCK();
 731                 }
 732                 ACCEPT_UNLOCK();
 733         }
 734         ACCEPT_LOCK();
 735         SOCK_LOCK(so);
 736         KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
 737         so->so_state |= SS_NOFDREF;
 738         sorele(so);
 739         CURVNET_RESTORE();
 740         return (error);
 741 }
 742
 743 /*
 744  * soabort() is used to abruptly tear down a connection, such as when a
 745  * resource limit is reached (listen queue depth exceeded), or if a listen
 746  * socket is closed while there are sockets waiting to be accepted.
 747  *
 748  * This interface is tricky, because it is called on an unreferenced socket,
 749  * and must be called only by a thread that has actually removed the socket
 750  * from the listen queue it was on, or races with other threads are risked.
 751  *
 752  * This interface will call into the protocol code, so must not be called
 753  * with any socket locks held.  Protocols do call it while holding their own
 754  * recursible protocol mutexes, but this is something that should be subject
 755  * to review in the future.
 756  */
 757 void
 758 soabort(struct socket *so)
 759 {
 760
 761         /*
 762          * In as much as is possible, assert that no references to this
 763          * socket are held.  This is not quite the same as asserting that the
 764          * current thread is responsible for arranging for no references, but
 765          * is as close as we can get for now.
 766          */
 767         KASSERT(so->so_count == 0, ("soabort: so_count"));
 768         KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
 769         KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
 770         KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP"));
 771         KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
 772         VNET_SO_ASSERT(so);
 773
 774         if (so->so_proto->pr_usrreqs->pru_abort != NULL)
 775                 (*so->so_proto->pr_usrreqs->pru_abort)(so);
 776         ACCEPT_LOCK();
 777         SOCK_LOCK(so);
 778         sofree(so);
 779 }
 780
 781 int
 782 soaccept(struct socket *so, struct sockaddr **nam)
 783 {
 784         int error;
 785
 786         SOCK_LOCK(so);
 787         KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
 788         so->so_state &= ~SS_NOFDREF;
 789         SOCK_UNLOCK(so);
 790
 791         CURVNET_SET(so->so_vnet);
 792         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
 793         CURVNET_RESTORE();
 794         return (error);
 795 }
 796
 797 int
 798 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
 799 {
 800         int error;
 801
 802         if (so->so_options & SO_ACCEPTCONN)
 803                 return (EOPNOTSUPP);
 804
 805         CURVNET_SET(so->so_vnet);
 806         /*
 807          * If protocol is connection-based, can only connect once.
 808          * Otherwise, if connected, try to disconnect first.  This allows
 809          * user to disconnect by connecting to, e.g., a null address.
 810          */
 811         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
 812             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
 813             (error = sodisconnect(so)))) {
 814                 error = EISCONN;
 815         } else {
 816                 /*
 817                  * Prevent accumulated error from previous connection from
 818                  * biting us.
 819                  */
 820                 so->so_error = 0;
 821                 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
 822         }
 823         CURVNET_RESTORE();
 824
 825         return (error);
 826 }
 827
 828 int
 829 soconnect2(struct socket *so1, struct socket *so2)
 830 {
 831         int error;
 832
 833         CURVNET_SET(so1->so_vnet);
 834         error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
 835         CURVNET_RESTORE();
 836         return (error);
 837 }
 838
 839 int
 840 sodisconnect(struct socket *so)
 841 {
 842         int error;
 843
 844         if ((so->so_state & SS_ISCONNECTED) == 0)
 845                 return (ENOTCONN);
 846         if (so->so_state & SS_ISDISCONNECTING)
 847                 return (EALREADY);
 848         VNET_SO_ASSERT(so);
 849         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
 850         return (error);
 851 }
 852
 853 #ifdef ZERO_COPY_SOCKETS
 854 struct so_zerocopy_stats{
 855         int size_ok;
 856         int align_ok;
 857         int found_ifp;
 858 };
 859 struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
 860 #include <netinet/in.h>
 861 #include <net/route.h>
 862 #include <netinet/in_pcb.h>
 863 #include <vm/vm.h>
 864 #include <vm/vm_page.h>
 865 #include <vm/vm_object.h>
 866
 867 /*
 868  * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
 869  * sosend_dgram() and sosend_generic() use m_uiotombuf().
 870  *
 871  * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
 872  * all of the data referenced by the uio.  If desired, it uses zero-copy.
 873  * *space will be updated to reflect data copied in.
 874  *
 875  * NB: If atomic I/O is requested, the caller must already have checked that
 876  * space can hold resid bytes.
 877  *
 878  * NB: In the event of an error, the caller may need to free the partial
 879  * chain pointed to by *mpp.  The contents of both *uio and *space may be
 880  * modified even in the case of an error.
 881  */
 882 static int
 883 sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
 884     int flags)
 885 {
 886         struct mbuf *m, **mp, *top;
 887         long len, resid;
 888         int error;
 889 #ifdef ZERO_COPY_SOCKETS
 890         int cow_send;
 891 #endif
 892
 893         *retmp = top = NULL;
 894         mp = &top;
 895         len = 0;
 896         resid = uio->uio_resid;
 897         error = 0;
 898         do {
 899 #ifdef ZERO_COPY_SOCKETS
 900                 cow_send = 0;
 901 #endif /* ZERO_COPY_SOCKETS */
 902                 if (resid >= MINCLSIZE) {
 903 #ifdef ZERO_COPY_SOCKETS
 904                         if (top == NULL) {
 905                                 m = m_gethdr(M_WAITOK, MT_DATA);
 906                                 m->m_pkthdr.len = 0;
 907                                 m->m_pkthdr.rcvif = NULL;
 908                         } else
 909                                 m = m_get(M_WAITOK, MT_DATA);
 910                         if (so_zero_copy_send &&
 911                             resid>=PAGE_SIZE &&
 912                             *space>=PAGE_SIZE &&
 913                             uio->uio_iov->iov_len>=PAGE_SIZE) {
 914                                 so_zerocp_stats.size_ok++;
 915                                 so_zerocp_stats.align_ok++;
 916                                 cow_send = socow_setup(m, uio);
 917                                 len = cow_send;
 918                         }
 919                         if (!cow_send) {
 920                                 m_clget(m, M_WAITOK);
 921                                 len = min(min(MCLBYTES, resid), *space);
 922                         }
 923 #else /* ZERO_COPY_SOCKETS */
 924                         if (top == NULL) {
 925                                 m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
 926                                 m->m_pkthdr.len = 0;
 927                                 m->m_pkthdr.rcvif = NULL;
 928                         } else
 929                                 m = m_getcl(M_WAIT, MT_DATA, 0);
 930                         len = min(min(MCLBYTES, resid), *space);
 931 #endif /* ZERO_COPY_SOCKETS */
 932                 } else {
 933                         if (top == NULL) {
 934                                 m = m_gethdr(M_WAIT, MT_DATA);
 935                                 m->m_pkthdr.len = 0;
 936                                 m->m_pkthdr.rcvif = NULL;
 937
 938                                 len = min(min(MHLEN, resid), *space);
 939                                 /*
 940                                  * For datagram protocols, leave room
 941                                  * for protocol headers in first mbuf.
 942                                  */
 943                                 if (atomic && m && len < MHLEN)
 944                                         MH_ALIGN(m, len);
 945                         } else {
 946                                 m = m_get(M_WAIT, MT_DATA);
 947                                 len = min(min(MLEN, resid), *space);
 948                         }
 949                 }
 950                 if (m == NULL) {
 951                         error = ENOBUFS;
 952                         goto out;
 953                 }
 954
 955                 *space -= len;
 956 #ifdef ZERO_COPY_SOCKETS
 957                 if (cow_send)
 958                         error = 0;
 959                 else
 960 #endif /* ZERO_COPY_SOCKETS */
 961                 error = uiomove(mtod(m, void *), (int)len, uio);
 962                 resid = uio->uio_resid;
 963                 m->m_len = len;
 964                 *mp = m;
 965                 top->m_pkthdr.len += len;
 966                 if (error)
 967                         goto out;
 968                 mp = &m->m_next;
 969                 if (resid <= 0) {
 970                         if (flags & MSG_EOR)
 971                                 top->m_flags |= M_EOR;
 972                         break;
 973                 }
 974         } while (*space > 0 && atomic);
 975 out:
 976         *retmp = top;
 977         return (error);
 978 }
 979 #endif /*ZERO_COPY_SOCKETS*/
 980
 981 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
 982
 983 int
 984 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
 985     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
 986 {
 987         long space, resid;
 988         int clen = 0, error, dontroute;
 989 #ifdef ZERO_COPY_SOCKETS
 990         int atomic = sosendallatonce(so) || top;
 991 #endif
 992
 993         KASSERT(so->so_type == SOCK_DGRAM, ("sodgram_send: !SOCK_DGRAM"));
 994         KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
 995             ("sodgram_send: !PR_ATOMIC"));
 996
 997         if (uio != NULL)
 998                 resid = uio->uio_resid;
 999         else
1000                 resid = top->m_pkthdr.len;
1001         /*
1002          * In theory resid should be unsigned.  However, space must be
1003          * signed, as it might be less than 0 if we over-committed, and we
1004          * must use a signed comparison of space and resid.  On the other
1005          * hand, a negative resid causes us to loop sending 0-length
1006          * segments to the protocol.
1007          */
1008         if (resid < 0) {
1009                 error = EINVAL;
1010                 goto out;
1011         }
1012
1013         dontroute =
1014             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
1015         if (td != NULL)
1016                 td->td_ru.ru_msgsnd++;
1017         if (control != NULL)
1018                 clen = control->m_len;
1019
1020         SOCKBUF_LOCK(&so->so_snd);
1021         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1022                 SOCKBUF_UNLOCK(&so->so_snd);
1023                 error = EPIPE;
1024                 goto out;
1025         }
1026         if (so->so_error) {
1027                 error = so->so_error;
1028                 so->so_error = 0;
1029                 SOCKBUF_UNLOCK(&so->so_snd);
1030                 goto out;
1031         }
1032         if ((so->so_state & SS_ISCONNECTED) == 0) {
1033                 /*
1034                  * `sendto' and `sendmsg' is allowed on a connection-based
1035                  * socket if it supports implied connect.  Return ENOTCONN if
1036                  * not connected and no address is supplied.
1037                  */
1038                 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1039                     (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1040                         if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1041                             !(resid == 0 && clen != 0)) {
1042                                 SOCKBUF_UNLOCK(&so->so_snd);
1043                                 error = ENOTCONN;
1044                                 goto out;
1045                         }
1046                 } else if (addr == NULL) {
1047                         if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1048                                 error = ENOTCONN;
1049                         else
1050                                 error = EDESTADDRREQ;
1051                         SOCKBUF_UNLOCK(&so->so_snd);
1052                         goto out;
1053                 }
1054         }
1055
1056         /*
1057          * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
1058          * problem and need fixing.
1059          */
1060         space = sbspace(&so->so_snd);
1061         if (flags & MSG_OOB)
1062                 space += 1024;
1063         space -= clen;
1064         SOCKBUF_UNLOCK(&so->so_snd);
1065         if (resid > space) {
1066                 error = EMSGSIZE;
1067                 goto out;
1068         }
1069         if (uio == NULL) {
1070                 resid = 0;
1071                 if (flags & MSG_EOR)
1072                         top->m_flags |= M_EOR;
1073         } else {
1074 #ifdef ZERO_COPY_SOCKETS
1075                 error = sosend_copyin(uio, &top, atomic, &space, flags);
1076                 if (error)
1077                         goto out;
1078 #else
1079                 /*
1080                  * Copy the data from userland into a mbuf chain.
1081                  * If no data is to be copied in, a single empty mbuf
1082                  * is returned.
1083                  */
1084                 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1085                     (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1086                 if (top == NULL) {
1087                         error = EFAULT; /* only possible error */
1088                         goto out;
1089                 }
1090                 space -= resid - uio->uio_resid;
1091 #endif
1092                 resid = uio->uio_resid;
1093         }
1094         KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1095         /*
1096          * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1097          * than with.
1098          */
1099         if (dontroute) {
1100                 SOCK_LOCK(so);
1101                 so->so_options |= SO_DONTROUTE;
1102                 SOCK_UNLOCK(so);
1103         }
1104         /*
1105          * XXX all the SBS_CANTSENDMORE checks previously done could be out
1106          * of date.  We could have recieved a reset packet in an interrupt or
1107          * maybe we slept while doing page faults in uiomove() etc.  We could
1108          * probably recheck again inside the locking protection here, but
1109          * there are probably other places that this also happens.  We must
1110          * rethink this.
1111          */
1112         VNET_SO_ASSERT(so);
1113         error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1114             (flags & MSG_OOB) ? PRUS_OOB :
1115         /*
1116          * If the user set MSG_EOF, the protocol understands this flag and
1117          * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
1118          */
1119             ((flags & MSG_EOF) &&
1120              (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1121              (resid <= 0)) ?
1122                 PRUS_EOF :
1123                 /* If there is more to send set PRUS_MORETOCOME */
1124                 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1125                 top, addr, control, td);
1126         if (dontroute) {
1127                 SOCK_LOCK(so);
1128                 so->so_options &= ~SO_DONTROUTE;
1129                 SOCK_UNLOCK(so);
1130         }
1131         clen = 0;
1132         control = NULL;
1133         top = NULL;
1134 out:
1135         if (top != NULL)
1136                 m_freem(top);
1137         if (control != NULL)
1138                 m_freem(control);
1139         return (error);
1140 }
1141
1142 /*
1143  * Send on a socket.  If send must go all at once and message is larger than
1144  * send buffering, then hard error.  Lock against other senders.  If must go
1145  * all at once and not enough room now, then inform user that this would
1146  * block and do nothing.  Otherwise, if nonblocking, send as much as
1147  * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1148  * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1149  * in mbuf chain must be small enough to send all at once.
1150  *
1151  * Returns nonzero on error, timeout or signal; callers must check for short
1152  * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1153  * on return.
1154  */
1155 int
1156 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1157     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1158 {
1159         long space, resid;
1160         int clen = 0, error, dontroute;
1161         int atomic = sosendallatonce(so) || top;
1162
1163         if (uio != NULL)
1164                 resid = uio->uio_resid;
1165         else
1166                 resid = top->m_pkthdr.len;
1167         /*
1168          * In theory resid should be unsigned.  However, space must be
1169          * signed, as it might be less than 0 if we over-committed, and we
1170          * must use a signed comparison of space and resid.  On the other
1171          * hand, a negative resid causes us to loop sending 0-length
1172          * segments to the protocol.
1173          *
1174          * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1175          * type sockets since that's an error.
1176          */
1177         if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1178                 error = EINVAL;
1179                 goto out;
1180         }
1181
1182         dontroute =
1183             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1184             (so->so_proto->pr_flags & PR_ATOMIC);
1185         if (td != NULL)
1186                 td->td_ru.ru_msgsnd++;
1187         if (control != NULL)
1188                 clen = control->m_len;
1189
1190         error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1191         if (error)
1192                 goto out;
1193
1194 restart:
1195         do {
1196                 SOCKBUF_LOCK(&so->so_snd);
1197                 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1198                         SOCKBUF_UNLOCK(&so->so_snd);
1199                         error = EPIPE;
1200                         goto release;
1201                 }
1202                 if (so->so_error) {
1203                         error = so->so_error;
1204                         so->so_error = 0;
1205                         SOCKBUF_UNLOCK(&so->so_snd);
1206                         goto release;
1207                 }
1208                 if ((so->so_state & SS_ISCONNECTED) == 0) {
1209                         /*
1210                          * `sendto' and `sendmsg' is allowed on a connection-
1211                          * based socket if it supports implied connect.
1212                          * Return ENOTCONN if not connected and no address is
1213                          * supplied.
1214                          */
1215                         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1216                             (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1217                                 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1218                                     !(resid == 0 && clen != 0)) {
1219                                         SOCKBUF_UNLOCK(&so->so_snd);
1220                                         error = ENOTCONN;
1221                                         goto release;
1222                                 }
1223                         } else if (addr == NULL) {
1224                                 SOCKBUF_UNLOCK(&so->so_snd);
1225                                 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1226                                         error = ENOTCONN;
1227                                 else
1228                                         error = EDESTADDRREQ;
1229                                 goto release;
1230                         }
1231                 }
1232                 space = sbspace(&so->so_snd);
1233                 if (flags & MSG_OOB)
1234                         space += 1024;
1235                 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1236                     clen > so->so_snd.sb_hiwat) {
1237                         SOCKBUF_UNLOCK(&so->so_snd);
1238                         error = EMSGSIZE;
1239                         goto release;
1240                 }
1241                 if (space < resid + clen &&
1242                     (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1243                         if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
1244                                 SOCKBUF_UNLOCK(&so->so_snd);
1245                                 error = EWOULDBLOCK;
1246                                 goto release;
1247                         }
1248                         error = sbwait(&so->so_snd);
1249                         SOCKBUF_UNLOCK(&so->so_snd);
1250                         if (error)
1251                                 goto release;
1252                         goto restart;
1253                 }
1254                 SOCKBUF_UNLOCK(&so->so_snd);
1255                 space -= clen;
1256                 do {
1257                         if (uio == NULL) {
1258                                 resid = 0;
1259                                 if (flags & MSG_EOR)
1260                                         top->m_flags |= M_EOR;
1261                         } else {
1262 #ifdef ZERO_COPY_SOCKETS
1263                                 error = sosend_copyin(uio, &top, atomic,
1264                                     &space, flags);
1265                                 if (error != 0)
1266                                         goto release;
1267 #else
1268                                 /*
1269                                  * Copy the data from userland into a mbuf
1270                                  * chain.  If no data is to be copied in,
1271                                  * a single empty mbuf is returned.
1272                                  */
1273                                 top = m_uiotombuf(uio, M_WAITOK, space,
1274                                     (atomic ? max_hdr : 0),
1275                                     (atomic ? M_PKTHDR : 0) |
1276                                     ((flags & MSG_EOR) ? M_EOR : 0));
1277                                 if (top == NULL) {
1278                                         error = EFAULT; /* only possible error */
1279                                         goto release;
1280                                 }
1281                                 space -= resid - uio->uio_resid;
1282 #endif
1283                                 resid = uio->uio_resid;
1284                         }
1285                         if (dontroute) {
1286                                 SOCK_LOCK(so);
1287                                 so->so_options |= SO_DONTROUTE;
1288                                 SOCK_UNLOCK(so);
1289                         }
1290                         /*
1291                          * XXX all the SBS_CANTSENDMORE checks previously
1292                          * done could be out of date.  We could have recieved
1293                          * a reset packet in an interrupt or maybe we slept
1294                          * while doing page faults in uiomove() etc.  We
1295                          * could probably recheck again inside the locking
1296                          * protection here, but there are probably other
1297                          * places that this also happens.  We must rethink
1298                          * this.
1299                          */
1300                         VNET_SO_ASSERT(so);
1301                         error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1302                             (flags & MSG_OOB) ? PRUS_OOB :
1303                         /*
1304                          * If the user set MSG_EOF, the protocol understands
1305                          * this flag and nothing left to send then use
1306                          * PRU_SEND_EOF instead of PRU_SEND.
1307                          */
1308                             ((flags & MSG_EOF) &&
1309                              (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1310                              (resid <= 0)) ?
1311                                 PRUS_EOF :
1312                         /* If there is more to send set PRUS_MORETOCOME. */
1313                             (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1314                             top, addr, control, td);
1315                         if (dontroute) {
1316                                 SOCK_LOCK(so);
1317                                 so->so_options &= ~SO_DONTROUTE;
1318                                 SOCK_UNLOCK(so);
1319                         }
1320                         clen = 0;
1321                         control = NULL;
1322                         top = NULL;
1323                         if (error)
1324                                 goto release;
1325                 } while (resid && space > 0);
1326         } while (resid);
1327
1328 release:
1329         sbunlock(&so->so_snd);
1330 out:
1331         if (top != NULL)
1332                 m_freem(top);
1333         if (control != NULL)
1334                 m_freem(control);
1335         return (error);
1336 }
1337
1338 int
1339 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1340     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1341 {
1342         int error;
1343
1344         CURVNET_SET(so->so_vnet);
1345         error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1346             control, flags, td);
1347         CURVNET_RESTORE();
1348         return (error);
1349 }
1350
1351 /*
1352  * The part of soreceive() that implements reading non-inline out-of-band
1353  * data from a socket.  For more complete comments, see soreceive(), from
1354  * which this code originated.
1355  *
1356  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1357  * unable to return an mbuf chain to the caller.
1358  */
1359 static int
1360 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1361 {
1362         struct protosw *pr = so->so_proto;
1363         struct mbuf *m;
1364         int error;
1365
1366         KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1367         VNET_SO_ASSERT(so);
1368
1369         m = m_get(M_WAIT, MT_DATA);
1370         error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1371         if (error)
1372                 goto bad;
1373         do {
1374 #ifdef ZERO_COPY_SOCKETS
1375                 if (so_zero_copy_receive) {
1376                         int disposable;
1377
1378                         if ((m->m_flags & M_EXT)
1379                          && (m->m_ext.ext_type == EXT_DISPOSABLE))
1380                                 disposable = 1;
1381                         else
1382                                 disposable = 0;
1383
1384                         error = uiomoveco(mtod(m, void *),
1385                                           min(uio->uio_resid, m->m_len),
1386                                           uio, disposable);
1387                 } else
1388 #endif /* ZERO_COPY_SOCKETS */
1389                 error = uiomove(mtod(m, void *),
1390                     (int) min(uio->uio_resid, m->m_len), uio);
1391                 m = m_free(m);
1392         } while (uio->uio_resid && error == 0 && m);
1393 bad:
1394         if (m != NULL)
1395                 m_freem(m);
1396         return (error);
1397 }
1398
1399 /*
1400  * Following replacement or removal of the first mbuf on the first mbuf chain
1401  * of a socket buffer, push necessary state changes back into the socket
1402  * buffer so that other consumers see the values consistently.  'nextrecord'
1403  * is the callers locally stored value of the original value of
1404  * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1405  * NOTE: 'nextrecord' may be NULL.
1406  */
1407 static __inline void
1408 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1409 {
1410
1411         SOCKBUF_LOCK_ASSERT(sb);
1412         /*
1413          * First, update for the new value of nextrecord.  If necessary, make
1414          * it the first record.
1415          */
1416         if (sb->sb_mb != NULL)
1417                 sb->sb_mb->m_nextpkt = nextrecord;
1418         else
1419                 sb->sb_mb = nextrecord;
1420
1421         /*
1422          * Now update any dependent socket buffer fields to reflect the new
1423          * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1424          * addition of a second clause that takes care of the case where
1425          * sb_mb has been updated, but remains the last record.
1426          */
1427         if (sb->sb_mb == NULL) {
1428                 sb->sb_mbtail = NULL;
1429                 sb->sb_lastrecord = NULL;
1430         } else if (sb->sb_mb->m_nextpkt == NULL)
1431                 sb->sb_lastrecord = sb->sb_mb;
1432 }
1433
1434
1435 /*
1436  * Implement receive operations on a socket.  We depend on the way that
1437  * records are added to the sockbuf by sbappend.  In particular, each record
1438  * (mbufs linked through m_next) must begin with an address if the protocol
1439  * so specifies, followed by an optional mbuf or mbufs containing ancillary
1440  * data, and then zero or more mbufs of data.  In order to allow parallelism
1441  * between network receive and copying to user space, as well as avoid
1442  * sleeping with a mutex held, we release the socket buffer mutex during the
1443  * user space copy.  Although the sockbuf is locked, new data may still be
1444  * appended, and thus we must maintain consistency of the sockbuf during that
1445  * time.
1446  *
1447  * The caller may receive the data as a single mbuf chain by supplying an
1448  * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1449  * the count in uio_resid.
1450  */
1451 int
1452 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1453     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1454 {
1455         struct mbuf *m, **mp;
1456         int flags, len, error, offset;
1457         struct protosw *pr = so->so_proto;
1458         struct mbuf *nextrecord;
1459         int moff, type = 0;
1460         int orig_resid = uio->uio_resid;
1461
1462         mp = mp0;
1463         if (psa != NULL)
1464                 *psa = NULL;
1465         if (controlp != NULL)
1466                 *controlp = NULL;
1467         if (flagsp != NULL)
1468                 flags = *flagsp &~ MSG_EOR;
1469         else
1470                 flags = 0;
1471         if (flags & MSG_OOB)
1472                 return (soreceive_rcvoob(so, uio, flags));
1473         if (mp != NULL)
1474                 *mp = NULL;
1475         if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1476             && uio->uio_resid) {
1477                 VNET_SO_ASSERT(so);
1478                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1479         }
1480
1481         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1482         if (error)
1483                 return (error);
1484
1485 restart:
1486         SOCKBUF_LOCK(&so->so_rcv);
1487         m = so->so_rcv.sb_mb;
1488         /*
1489          * If we have less data than requested, block awaiting more (subject
1490          * to any timeout) if:
1491          *   1. the current count is less than the low water mark, or
1492          *   2. MSG_WAITALL is set, and it is possible to do the entire
1493          *      receive operation at once if we block (resid <= hiwat).
1494          *   3. MSG_DONTWAIT is not set
1495          * If MSG_WAITALL is set but resid is larger than the receive buffer,
1496          * we have to do the receive in sections, and thus risk returning a
1497          * short count if a timeout or signal occurs after we start.
1498          */
1499         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1500             so->so_rcv.sb_cc < uio->uio_resid) &&
1501             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1502             ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1503             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1504                 KASSERT(m != NULL || !so->so_rcv.sb_cc,
1505                     ("receive: m == %p so->so_rcv.sb_cc == %u",
1506                     m, so->so_rcv.sb_cc));
1507                 if (so->so_error) {
1508                         if (m != NULL)
1509                                 goto dontblock;
1510                         error = so->so_error;
1511                         if ((flags & MSG_PEEK) == 0)
1512                                 so->so_error = 0;
1513                         SOCKBUF_UNLOCK(&so->so_rcv);
1514                         goto release;
1515                 }
1516                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1517                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1518                         if (m == NULL) {
1519                                 SOCKBUF_UNLOCK(&so->so_rcv);
1520                                 goto release;
1521                         } else
1522                                 goto dontblock;
1523                 }
1524                 for (; m != NULL; m = m->m_next)
1525                         if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1526                                 m = so->so_rcv.sb_mb;
1527                                 goto dontblock;
1528                         }
1529                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1530                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1531                         SOCKBUF_UNLOCK(&so->so_rcv);
1532                         error = ENOTCONN;
1533                         goto release;
1534                 }
1535                 if (uio->uio_resid == 0) {
1536                         SOCKBUF_UNLOCK(&so->so_rcv);
1537                         goto release;
1538                 }
1539                 if ((so->so_state & SS_NBIO) ||
1540                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1541                         SOCKBUF_UNLOCK(&so->so_rcv);
1542                         error = EWOULDBLOCK;
1543                         goto release;
1544                 }
1545                 SBLASTRECORDCHK(&so->so_rcv);
1546                 SBLASTMBUFCHK(&so->so_rcv);
1547                 error = sbwait(&so->so_rcv);
1548                 SOCKBUF_UNLOCK(&so->so_rcv);
1549                 if (error)
1550                         goto release;
1551                 goto restart;
1552         }
1553 dontblock:
1554         /*
1555          * From this point onward, we maintain 'nextrecord' as a cache of the
1556          * pointer to the next record in the socket buffer.  We must keep the
1557          * various socket buffer pointers and local stack versions of the
1558          * pointers in sync, pushing out modifications before dropping the
1559          * socket buffer mutex, and re-reading them when picking it up.
1560          *
1561          * Otherwise, we will race with the network stack appending new data
1562          * or records onto the socket buffer by using inconsistent/stale
1563          * versions of the field, possibly resulting in socket buffer
1564          * corruption.
1565          *
1566          * By holding the high-level sblock(), we prevent simultaneous
1567          * readers from pulling off the front of the socket buffer.
1568          */
1569         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1570         if (uio->uio_td)
1571                 uio->uio_td->td_ru.ru_msgrcv++;
1572         KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1573         SBLASTRECORDCHK(&so->so_rcv);
1574         SBLASTMBUFCHK(&so->so_rcv);
1575         nextrecord = m->m_nextpkt;
1576         if (pr->pr_flags & PR_ADDR) {
1577                 KASSERT(m->m_type == MT_SONAME,
1578                     ("m->m_type == %d", m->m_type));
1579                 orig_resid = 0;
1580                 if (psa != NULL)
1581                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1582                             M_NOWAIT);
1583                 if (flags & MSG_PEEK) {
1584                         m = m->m_next;
1585                 } else {
1586                         sbfree(&so->so_rcv, m);
1587                         so->so_rcv.sb_mb = m_free(m);
1588                         m = so->so_rcv.sb_mb;
1589                         sockbuf_pushsync(&so->so_rcv, nextrecord);
1590                 }
1591         }
1592
1593         /*
1594          * Process one or more MT_CONTROL mbufs present before any data mbufs
1595          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1596          * just copy the data; if !MSG_PEEK, we call into the protocol to
1597          * perform externalization (or freeing if controlp == NULL).
1598          */
1599         if (m != NULL && m->m_type == MT_CONTROL) {
1600                 struct mbuf *cm = NULL, *cmn;
1601                 struct mbuf **cme = &cm;
1602
1603                 do {
1604                         if (flags & MSG_PEEK) {
1605                                 if (controlp != NULL) {
1606                                         *controlp = m_copy(m, 0, m->m_len);
1607                                         controlp = &(*controlp)->m_next;
1608                                 }
1609                                 m = m->m_next;
1610                         } else {
1611                                 sbfree(&so->so_rcv, m);
1612                                 so->so_rcv.sb_mb = m->m_next;
1613                                 m->m_next = NULL;
1614                                 *cme = m;
1615                                 cme = &(*cme)->m_next;
1616                                 m = so->so_rcv.sb_mb;
1617                         }
1618                 } while (m != NULL && m->m_type == MT_CONTROL);
1619                 if ((flags & MSG_PEEK) == 0)
1620                         sockbuf_pushsync(&so->so_rcv, nextrecord);
1621                 while (cm != NULL) {
1622                         cmn = cm->m_next;
1623                         cm->m_next = NULL;
1624                         if (pr->pr_domain->dom_externalize != NULL) {
1625                                 SOCKBUF_UNLOCK(&so->so_rcv);
1626                                 VNET_SO_ASSERT(so);
1627                                 error = (*pr->pr_domain->dom_externalize)
1628                                     (cm, controlp);
1629                                 SOCKBUF_LOCK(&so->so_rcv);
1630                         } else if (controlp != NULL)
1631                                 *controlp = cm;
1632                         else
1633                                 m_freem(cm);
1634                         if (controlp != NULL) {
1635                                 orig_resid = 0;
1636                                 while (*controlp != NULL)
1637                                         controlp = &(*controlp)->m_next;
1638                         }
1639                         cm = cmn;
1640                 }
1641                 if (m != NULL)
1642                         nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1643                 else
1644                         nextrecord = so->so_rcv.sb_mb;
1645                 orig_resid = 0;
1646         }
1647         if (m != NULL) {
1648                 if ((flags & MSG_PEEK) == 0) {
1649                         KASSERT(m->m_nextpkt == nextrecord,
1650                             ("soreceive: post-control, nextrecord !sync"));
1651                         if (nextrecord == NULL) {
1652                                 KASSERT(so->so_rcv.sb_mb == m,
1653                                     ("soreceive: post-control, sb_mb!=m"));
1654                                 KASSERT(so->so_rcv.sb_lastrecord == m,
1655                                     ("soreceive: post-control, lastrecord!=m"));
1656                         }
1657                 }
1658                 type = m->m_type;
1659                 if (type == MT_OOBDATA)
1660                         flags |= MSG_OOB;
1661         } else {
1662                 if ((flags & MSG_PEEK) == 0) {
1663                         KASSERT(so->so_rcv.sb_mb == nextrecord,
1664                             ("soreceive: sb_mb != nextrecord"));
1665                         if (so->so_rcv.sb_mb == NULL) {
1666                                 KASSERT(so->so_rcv.sb_lastrecord == NULL,
1667                                     ("soreceive: sb_lastercord != NULL"));
1668                         }
1669                 }
1670         }
1671         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1672         SBLASTRECORDCHK(&so->so_rcv);
1673         SBLASTMBUFCHK(&so->so_rcv);
1674
1675         /*
1676          * Now continue to read any data mbufs off of the head of the socket
1677          * buffer until the read request is satisfied.  Note that 'type' is
1678          * used to store the type of any mbuf reads that have happened so far
1679          * such that soreceive() can stop reading if the type changes, which
1680          * causes soreceive() to return only one of regular data and inline
1681          * out-of-band data in a single socket receive operation.
1682          */
1683         moff = 0;
1684         offset = 0;
1685         while (m != NULL && uio->uio_resid > 0 && error == 0) {
1686                 /*
1687                  * If the type of mbuf has changed since the last mbuf
1688                  * examined ('type'), end the receive operation.
1689                  */
1690                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1691                 if (m->m_type == MT_OOBDATA) {
1692                         if (type != MT_OOBDATA)
1693                                 break;
1694                 } else if (type == MT_OOBDATA)
1695                         break;
1696                 else
1697                     KASSERT(m->m_type == MT_DATA,
1698                         ("m->m_type == %d", m->m_type));
1699                 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1700                 len = uio->uio_resid;
1701                 if (so->so_oobmark && len > so->so_oobmark - offset)
1702                         len = so->so_oobmark - offset;
1703                 if (len > m->m_len - moff)
1704                         len = m->m_len - moff;
1705                 /*
1706                  * If mp is set, just pass back the mbufs.  Otherwise copy
1707                  * them out via the uio, then free.  Sockbuf must be
1708                  * consistent here (points to current mbuf, it points to next
1709                  * record) when we drop priority; we must note any additions
1710                  * to the sockbuf when we block interrupts again.
1711                  */
1712                 if (mp == NULL) {
1713                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1714                         SBLASTRECORDCHK(&so->so_rcv);
1715                         SBLASTMBUFCHK(&so->so_rcv);
1716                         SOCKBUF_UNLOCK(&so->so_rcv);
1717 #ifdef ZERO_COPY_SOCKETS
1718                         if (so_zero_copy_receive) {
1719                                 int disposable;
1720
1721                                 if ((m->m_flags & M_EXT)
1722                                  && (m->m_ext.ext_type == EXT_DISPOSABLE))
1723                                         disposable = 1;
1724                                 else
1725                                         disposable = 0;
1726
1727                                 error = uiomoveco(mtod(m, char *) + moff,
1728                                                   (int)len, uio,
1729                                                   disposable);
1730                         } else
1731 #endif /* ZERO_COPY_SOCKETS */
1732                         error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1733                         SOCKBUF_LOCK(&so->so_rcv);
1734                         if (error) {
1735                                 /*
1736                                  * The MT_SONAME mbuf has already been removed
1737                                  * from the record, so it is necessary to
1738                                  * remove the data mbufs, if any, to preserve
1739                                  * the invariant in the case of PR_ADDR that
1740                                  * requires MT_SONAME mbufs at the head of
1741                                  * each record.
1742                                  */
1743                                 if (m && pr->pr_flags & PR_ATOMIC &&
1744                                     ((flags & MSG_PEEK) == 0))
1745                                         (void)sbdroprecord_locked(&so->so_rcv);
1746                                 SOCKBUF_UNLOCK(&so->so_rcv);
1747                                 goto release;
1748                         }
1749                 } else
1750                         uio->uio_resid -= len;
1751                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1752                 if (len == m->m_len - moff) {
1753                         if (m->m_flags & M_EOR)
1754                                 flags |= MSG_EOR;
1755                         if (flags & MSG_PEEK) {
1756                                 m = m->m_next;
1757                                 moff = 0;
1758                         } else {
1759                                 nextrecord = m->m_nextpkt;
1760                                 sbfree(&so->so_rcv, m);
1761                                 if (mp != NULL) {
1762                                         *mp = m;
1763                                         mp = &m->m_next;
1764                                         so->so_rcv.sb_mb = m = m->m_next;
1765                                         *mp = NULL;
1766                                 } else {
1767                                         so->so_rcv.sb_mb = m_free(m);
1768                                         m = so->so_rcv.sb_mb;
1769                                 }
1770                                 sockbuf_pushsync(&so->so_rcv, nextrecord);
1771                                 SBLASTRECORDCHK(&so->so_rcv);
1772                                 SBLASTMBUFCHK(&so->so_rcv);
1773                         }
1774                 } else {
1775                         if (flags & MSG_PEEK)
1776                                 moff += len;
1777                         else {
1778                                 if (mp != NULL) {
1779                                         int copy_flag;
1780
1781                                         if (flags & MSG_DONTWAIT)
1782                                                 copy_flag = M_DONTWAIT;
1783                                         else
1784                                                 copy_flag = M_WAIT;
1785                                         if (copy_flag == M_WAIT)
1786                                                 SOCKBUF_UNLOCK(&so->so_rcv);
1787                                         *mp = m_copym(m, 0, len, copy_flag);
1788                                         if (copy_flag == M_WAIT)
1789                                                 SOCKBUF_LOCK(&so->so_rcv);
1790                                         if (*mp == NULL) {
1791                                                 /*
1792                                                  * m_copym() couldn't
1793                                                  * allocate an mbuf.  Adjust
1794                                                  * uio_resid back (it was
1795                                                  * adjusted down by len
1796                                                  * bytes, which we didn't end
1797                                                  * up "copying" over).
1798                                                  */
1799                                                 uio->uio_resid += len;
1800                                                 break;
1801                                         }
1802                                 }
1803                                 m->m_data += len;
1804                                 m->m_len -= len;
1805                                 so->so_rcv.sb_cc -= len;
1806                         }
1807                 }
1808                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1809                 if (so->so_oobmark) {
1810                         if ((flags & MSG_PEEK) == 0) {
1811                                 so->so_oobmark -= len;
1812                                 if (so->so_oobmark == 0) {
1813                                         so->so_rcv.sb_state |= SBS_RCVATMARK;
1814                                         break;
1815                                 }
1816                         } else {
1817                                 offset += len;
1818                                 if (offset == so->so_oobmark)
1819                                         break;
1820                         }
1821                 }
1822                 if (flags & MSG_EOR)
1823                         break;
1824                 /*
1825                  * If the MSG_WAITALL flag is set (for non-atomic socket), we
1826                  * must not quit until "uio->uio_resid == 0" or an error
1827                  * termination.  If a signal/timeout occurs, return with a
1828                  * short count but without error.  Keep sockbuf locked
1829                  * against other readers.
1830                  */
1831                 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1832                     !sosendallatonce(so) && nextrecord == NULL) {
1833                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1834                         if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1835                                 break;
1836                         /*
1837                          * Notify the protocol that some data has been
1838                          * drained before blocking.
1839                          */
1840                         if (pr->pr_flags & PR_WANTRCVD) {
1841                                 SOCKBUF_UNLOCK(&so->so_rcv);
1842                                 VNET_SO_ASSERT(so);
1843                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1844                                 SOCKBUF_LOCK(&so->so_rcv);
1845                         }
1846                         SBLASTRECORDCHK(&so->so_rcv);
1847                         SBLASTMBUFCHK(&so->so_rcv);
1848                         /*
1849                          * We could receive some data while was notifying
1850                          * the protocol. Skip blocking in this case.
1851                          */
1852                         if (so->so_rcv.sb_mb == NULL) {
1853                                 error = sbwait(&so->so_rcv);
1854                                 if (error) {
1855                                         SOCKBUF_UNLOCK(&so->so_rcv);
1856                                         goto release;
1857                                 }
1858                         }
1859                         m = so->so_rcv.sb_mb;
1860                         if (m != NULL)
1861                                 nextrecord = m->m_nextpkt;
1862                 }
1863         }
1864
1865         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1866         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1867                 flags |= MSG_TRUNC;
1868                 if ((flags & MSG_PEEK) == 0)
1869                         (void) sbdroprecord_locked(&so->so_rcv);
1870         }
1871         if ((flags & MSG_PEEK) == 0) {
1872                 if (m == NULL) {
1873                         /*
1874                          * First part is an inline SB_EMPTY_FIXUP().  Second
1875                          * part makes sure sb_lastrecord is up-to-date if
1876                          * there is still data in the socket buffer.
1877                          */
1878                         so->so_rcv.sb_mb = nextrecord;
1879                         if (so->so_rcv.sb_mb == NULL) {
1880                                 so->so_rcv.sb_mbtail = NULL;
1881                                 so->so_rcv.sb_lastrecord = NULL;
1882                         } else if (nextrecord->m_nextpkt == NULL)
1883                                 so->so_rcv.sb_lastrecord = nextrecord;
1884                 }
1885                 SBLASTRECORDCHK(&so->so_rcv);
1886                 SBLASTMBUFCHK(&so->so_rcv);
1887                 /*
1888                  * If soreceive() is being done from the socket callback,
1889                  * then don't need to generate ACK to peer to update window,
1890                  * since ACK will be generated on return to TCP.
1891                  */
1892                 if (!(flags & MSG_SOCALLBCK) &&
1893                     (pr->pr_flags & PR_WANTRCVD)) {
1894                         SOCKBUF_UNLOCK(&so->so_rcv);
1895                         VNET_SO_ASSERT(so);
1896                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1897                         SOCKBUF_LOCK(&so->so_rcv);
1898                 }
1899         }
1900         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1901         if (orig_resid == uio->uio_resid && orig_resid &&
1902             (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1903                 SOCKBUF_UNLOCK(&so->so_rcv);
1904                 goto restart;
1905         }
1906         SOCKBUF_UNLOCK(&so->so_rcv);
1907
1908         if (flagsp != NULL)
1909                 *flagsp |= flags;
1910 release:
1911         sbunlock(&so->so_rcv);
1912         return (error);
1913 }
1914
1915 /*
1916  * Optimized version of soreceive() for stream (TCP) sockets.
1917  */
1918 #ifdef TCP_SORECEIVE_STREAM
1919 int
1920 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
1921     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1922 {
1923         int len = 0, error = 0, flags, oresid;
1924         struct sockbuf *sb;
1925         struct mbuf *m, *n = NULL;
1926
1927         /* We only do stream sockets. */
1928         if (so->so_type != SOCK_STREAM)
1929                 return (EINVAL);
1930         if (psa != NULL)
1931                 *psa = NULL;
1932         if (controlp != NULL)
1933                 return (EINVAL);
1934         if (flagsp != NULL)
1935                 flags = *flagsp &~ MSG_EOR;
1936         else
1937                 flags = 0;
1938         if (flags & MSG_OOB)
1939                 return (soreceive_rcvoob(so, uio, flags));
1940         if (mp0 != NULL)
1941                 *mp0 = NULL;
1942
1943         sb = &so->so_rcv;
1944
1945         /* Prevent other readers from entering the socket. */
1946         error = sblock(sb, SBLOCKWAIT(flags));
1947         if (error)
1948                 goto out;
1949         SOCKBUF_LOCK(sb);
1950
1951         /* Easy one, no space to copyout anything. */
1952         if (uio->uio_resid == 0) {
1953                 error = EINVAL;
1954                 goto out;
1955         }
1956         oresid = uio->uio_resid;
1957
1958         /* We will never ever get anything unless we are connected. */
1959         if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1960                 /* When disconnecting there may be still some data left. */
1961                 if (sb->sb_cc > 0)
1962                         goto deliver;
1963                 if (!(so->so_state & SS_ISDISCONNECTED))
1964                         error = ENOTCONN;
1965                 goto out;
1966         }
1967
1968         /* Socket buffer is empty and we shall not block. */
1969         if (sb->sb_cc == 0 &&
1970             ((sb->sb_flags & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
1971                 error = EAGAIN;
1972                 goto out;
1973         }
1974
1975 restart:
1976         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1977
1978         /* Abort if socket has reported problems. */
1979         if (so->so_error) {
1980                 if (sb->sb_cc > 0)
1981                         goto deliver;
1982                 if (oresid > uio->uio_resid)
1983                         goto out;
1984                 error = so->so_error;
1985                 if (!(flags & MSG_PEEK))
1986                         so->so_error = 0;
1987                 goto out;
1988         }
1989
1990         /* Door is closed.  Deliver what is left, if any. */
1991         if (sb->sb_state & SBS_CANTRCVMORE) {
1992                 if (sb->sb_cc > 0)
1993                         goto deliver;
1994                 else
1995                         goto out;
1996         }
1997
1998         /* Socket buffer got some data that we shall deliver now. */
1999         if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
2000             ((sb->sb_flags & SS_NBIO) ||
2001              (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
2002              sb->sb_cc >= sb->sb_lowat ||
2003              sb->sb_cc >= uio->uio_resid ||
2004              sb->sb_cc >= sb->sb_hiwat) ) {
2005                 goto deliver;
2006         }
2007
2008         /* On MSG_WAITALL we must wait until all data or error arrives. */
2009         if ((flags & MSG_WAITALL) &&
2010             (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
2011                 goto deliver;
2012
2013         /*
2014          * Wait and block until (more) data comes in.
2015          * NB: Drops the sockbuf lock during wait.
2016          */
2017         error = sbwait(sb);
2018         if (error)
2019                 goto out;
2020         goto restart;
2021
2022 deliver:
2023         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2024         KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
2025         KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
2026
2027         /* Statistics. */
2028         if (uio->uio_td)
2029                 uio->uio_td->td_ru.ru_msgrcv++;
2030
2031         /* Fill uio until full or current end of socket buffer is reached. */
2032         len = min(uio->uio_resid, sb->sb_cc);
2033         if (mp0 != NULL) {
2034                 /* Dequeue as many mbufs as possible. */
2035                 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
2036                         for (*mp0 = m = sb->sb_mb;
2037                              m != NULL && m->m_len <= len;
2038                              m = m->m_next) {
2039                                 len -= m->m_len;
2040                                 uio->uio_resid -= m->m_len;
2041                                 sbfree(sb, m);
2042                                 n = m;
2043                         }
2044                         sb->sb_mb = m;
2045                         if (sb->sb_mb == NULL)
2046                                 SB_EMPTY_FIXUP(sb);
2047                         n->m_next = NULL;
2048                 }
2049                 /* Copy the remainder. */
2050                 if (len > 0) {
2051                         KASSERT(sb->sb_mb != NULL,
2052                             ("%s: len > 0 && sb->sb_mb empty", __func__));
2053
2054                         m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT);
2055                         if (m == NULL)
2056                                 len = 0;        /* Don't flush data from sockbuf. */
2057                         else
2058                                 uio->uio_resid -= m->m_len;
2059                         if (*mp0 != NULL)
2060                                 n->m_next = m;
2061                         else
2062                                 *mp0 = m;
2063                         if (*mp0 == NULL) {
2064                                 error = ENOBUFS;
2065                                 goto out;
2066                         }
2067                 }
2068         } else {
2069                 /* NB: Must unlock socket buffer as uiomove may sleep. */
2070                 SOCKBUF_UNLOCK(sb);
2071                 error = m_mbuftouio(uio, sb->sb_mb, len);
2072                 SOCKBUF_LOCK(sb);
2073                 if (error)
2074                         goto out;
2075         }
2076         SBLASTRECORDCHK(sb);
2077         SBLASTMBUFCHK(sb);
2078
2079         /*
2080          * Remove the delivered data from the socket buffer unless we
2081          * were only peeking.
2082          */
2083         if (!(flags & MSG_PEEK)) {
2084                 if (len > 0)
2085                         sbdrop_locked(sb, len);
2086
2087                 /* Notify protocol that we drained some data. */
2088                 if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2089                     (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2090                      !(flags & MSG_SOCALLBCK))) {
2091                         SOCKBUF_UNLOCK(sb);
2092                         VNET_SO_ASSERT(so);
2093                         (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
2094                         SOCKBUF_LOCK(sb);
2095                 }
2096         }
2097
2098         /*
2099          * For MSG_WAITALL we may have to loop again and wait for
2100          * more data to come in.
2101          */
2102         if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2103                 goto restart;
2104 out:
2105         SOCKBUF_LOCK_ASSERT(sb);
2106         SBLASTRECORDCHK(sb);
2107         SBLASTMBUFCHK(sb);
2108         SOCKBUF_UNLOCK(sb);
2109         sbunlock(sb);
2110         return (error);
2111 }
2112 #endif /* TCP_SORECEIVE_STREAM */
2113
2114 /*
2115  * Optimized version of soreceive() for simple datagram cases from userspace.
2116  * Unlike in the stream case, we're able to drop a datagram if copyout()
2117  * fails, and because we handle datagrams atomically, we don't need to use a
2118  * sleep lock to prevent I/O interlacing.
2119  */
2120 int
2121 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2122     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2123 {
2124         struct mbuf *m, *m2;
2125         int flags, len, error;
2126         struct protosw *pr = so->so_proto;
2127         struct mbuf *nextrecord;
2128
2129         if (psa != NULL)
2130                 *psa = NULL;
2131         if (controlp != NULL)
2132                 *controlp = NULL;
2133         if (flagsp != NULL)
2134                 flags = *flagsp &~ MSG_EOR;
2135         else
2136                 flags = 0;
2137
2138         /*
2139          * For any complicated cases, fall back to the full
2140          * soreceive_generic().
2141          */
2142         if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
2143                 return (soreceive_generic(so, psa, uio, mp0, controlp,
2144                     flagsp));
2145
2146         /*
2147          * Enforce restrictions on use.
2148          */
2149         KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2150             ("soreceive_dgram: wantrcvd"));
2151         KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2152         KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2153             ("soreceive_dgram: SBS_RCVATMARK"));
2154         KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
2155             ("soreceive_dgram: P_CONNREQUIRED"));
2156
2157         /*
2158          * Loop blocking while waiting for a datagram.
2159          */
2160         SOCKBUF_LOCK(&so->so_rcv);
2161         while ((m = so->so_rcv.sb_mb) == NULL) {
2162                 KASSERT(so->so_rcv.sb_cc == 0,
2163                     ("soreceive_dgram: sb_mb NULL but sb_cc %u",
2164                     so->so_rcv.sb_cc));
2165                 if (so->so_error) {
2166                         error = so->so_error;
2167                         so->so_error = 0;
2168                         SOCKBUF_UNLOCK(&so->so_rcv);
2169                         return (error);
2170                 }
2171                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2172                     uio->uio_resid == 0) {
2173                         SOCKBUF_UNLOCK(&so->so_rcv);
2174                         return (0);
2175                 }
2176                 if ((so->so_state & SS_NBIO) ||
2177                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2178                         SOCKBUF_UNLOCK(&so->so_rcv);
2179                         return (EWOULDBLOCK);
2180                 }
2181                 SBLASTRECORDCHK(&so->so_rcv);
2182                 SBLASTMBUFCHK(&so->so_rcv);
2183                 error = sbwait(&so->so_rcv);
2184                 if (error) {
2185                         SOCKBUF_UNLOCK(&so->so_rcv);
2186                         return (error);
2187                 }
2188         }
2189         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2190
2191         if (uio->uio_td)
2192                 uio->uio_td->td_ru.ru_msgrcv++;
2193         SBLASTRECORDCHK(&so->so_rcv);
2194         SBLASTMBUFCHK(&so->so_rcv);
2195         nextrecord = m->m_nextpkt;
2196         if (nextrecord == NULL) {
2197                 KASSERT(so->so_rcv.sb_lastrecord == m,
2198                     ("soreceive_dgram: lastrecord != m"));
2199         }
2200
2201         KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2202             ("soreceive_dgram: m_nextpkt != nextrecord"));
2203
2204         /*
2205          * Pull 'm' and its chain off the front of the packet queue.
2206          */
2207         so->so_rcv.sb_mb = NULL;
2208         sockbuf_pushsync(&so->so_rcv, nextrecord);
2209
2210         /*
2211          * Walk 'm's chain and free that many bytes from the socket buffer.
2212          */
2213         for (m2 = m; m2 != NULL; m2 = m2->m_next)
2214                 sbfree(&so->so_rcv, m2);
2215
2216         /*
2217          * Do a few last checks before we let go of the lock.
2218          */
2219         SBLASTRECORDCHK(&so->so_rcv);
2220         SBLASTMBUFCHK(&so->so_rcv);
2221         SOCKBUF_UNLOCK(&so->so_rcv);
2222
2223         if (pr->pr_flags & PR_ADDR) {
2224                 KASSERT(m->m_type == MT_SONAME,
2225                     ("m->m_type == %d", m->m_type));
2226                 if (psa != NULL)
2227                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
2228                             M_NOWAIT);
2229                 m = m_free(m);
2230         }
2231         if (m == NULL) {
2232                 /* XXXRW: Can this happen? */
2233                 return (0);
2234         }
2235
2236         /*
2237          * Packet to copyout() is now in 'm' and it is disconnected from the
2238          * queue.
2239          *
2240          * Process one or more MT_CONTROL mbufs present before any data mbufs
2241          * in the first mbuf chain on the socket buffer.  We call into the
2242          * protocol to perform externalization (or freeing if controlp ==
2243          * NULL).
2244          */
2245         if (m->m_type == MT_CONTROL) {
2246                 struct mbuf *cm = NULL, *cmn;
2247                 struct mbuf **cme = &cm;
2248
2249                 do {
2250                         m2 = m->m_next;
2251                         m->m_next = NULL;
2252                         *cme = m;
2253                         cme = &(*cme)->m_next;
2254                         m = m2;
2255                 } while (m != NULL && m->m_type == MT_CONTROL);
2256                 while (cm != NULL) {
2257                         cmn = cm->m_next;
2258                         cm->m_next = NULL;
2259                         if (pr->pr_domain->dom_externalize != NULL) {
2260                                 error = (*pr->pr_domain->dom_externalize)
2261                                     (cm, controlp);
2262                         } else if (controlp != NULL)
2263                                 *controlp = cm;
2264                         else
2265                                 m_freem(cm);
2266                         if (controlp != NULL) {
2267                                 while (*controlp != NULL)
2268                                         controlp = &(*controlp)->m_next;
2269                         }
2270                         cm = cmn;
2271                 }
2272         }
2273         KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
2274
2275         while (m != NULL && uio->uio_resid > 0) {
2276                 len = uio->uio_resid;
2277                 if (len > m->m_len)
2278                         len = m->m_len;
2279                 error = uiomove(mtod(m, char *), (int)len, uio);
2280                 if (error) {
2281                         m_freem(m);
2282                         return (error);
2283                 }
2284                 if (len == m->m_len)
2285                         m = m_free(m);
2286                 else {
2287                         m->m_data += len;
2288                         m->m_len -= len;
2289                 }
2290         }
2291         if (m != NULL)
2292                 flags |= MSG_TRUNC;
2293         m_freem(m);
2294         if (flagsp != NULL)
2295                 *flagsp |= flags;
2296         return (0);
2297 }
2298
2299 int
2300 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2301     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2302 {
2303         int error;
2304
2305         CURVNET_SET(so->so_vnet);
2306         error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
2307             controlp, flagsp));
2308         CURVNET_RESTORE();
2309         return (error);
2310 }
2311
2312 int
2313 soshutdown(struct socket *so, int how)
2314 {
2315         struct protosw *pr = so->so_proto;
2316         int error;
2317
2318         if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2319                 return (EINVAL);
2320
2321         CURVNET_SET(so->so_vnet);
2322         if (pr->pr_usrreqs->pru_flush != NULL) {
2323                 (*pr->pr_usrreqs->pru_flush)(so, how);
2324         }
2325         if (how != SHUT_WR)
2326                 sorflush(so);
2327         if (how != SHUT_RD) {
2328                 error = (*pr->pr_usrreqs->pru_shutdown)(so);
2329                 CURVNET_RESTORE();
2330                 return (error);
2331         }
2332         CURVNET_RESTORE();
2333         return (0);
2334 }
2335
2336 void
2337 sorflush(struct socket *so)
2338 {
2339         struct sockbuf *sb = &so->so_rcv;
2340         struct protosw *pr = so->so_proto;
2341         struct sockbuf asb;
2342
2343         VNET_SO_ASSERT(so);
2344
2345         /*
2346          * In order to avoid calling dom_dispose with the socket buffer mutex
2347          * held, and in order to generally avoid holding the lock for a long
2348          * time, we make a copy of the socket buffer and clear the original
2349          * (except locks, state).  The new socket buffer copy won't have
2350          * initialized locks so we can only call routines that won't use or
2351          * assert those locks.
2352          *
2353          * Dislodge threads currently blocked in receive and wait to acquire
2354          * a lock against other simultaneous readers before clearing the
2355          * socket buffer.  Don't let our acquire be interrupted by a signal
2356          * despite any existing socket disposition on interruptable waiting.
2357          */
2358         socantrcvmore(so);
2359         (void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2360
2361         /*
2362          * Invalidate/clear most of the sockbuf structure, but leave selinfo
2363          * and mutex data unchanged.
2364          */
2365         SOCKBUF_LOCK(sb);
2366         bzero(&asb, offsetof(struct sockbuf, sb_startzero));
2367         bcopy(&sb->sb_startzero, &asb.sb_startzero,
2368             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2369         bzero(&sb->sb_startzero,
2370             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2371         SOCKBUF_UNLOCK(sb);
2372         sbunlock(sb);
2373
2374         /*
2375          * Dispose of special rights and flush the socket buffer.  Don't call
2376          * any unsafe routines (that rely on locks being initialized) on asb.
2377          */
2378         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2379                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
2380         sbrelease_internal(&asb, so);
2381 }
2382
2383 /*
2384  * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2385  * additional variant to handle the case where the option value needs to be
2386  * some kind of integer, but not a specific size.  In addition to their use
2387  * here, these functions are also called by the protocol-level pr_ctloutput()
2388  * routines.
2389  */
2390 int
2391 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2392 {
2393         size_t  valsize;
2394
2395         /*
2396          * If the user gives us more than we wanted, we ignore it, but if we
2397          * don't get the minimum length the caller wants, we return EINVAL.
2398          * On success, sopt->sopt_valsize is set to however much we actually
2399          * retrieved.
2400          */
2401         if ((valsize = sopt->sopt_valsize) < minlen)
2402                 return EINVAL;
2403         if (valsize > len)
2404                 sopt->sopt_valsize = valsize = len;
2405
2406         if (sopt->sopt_td != NULL)
2407                 return (copyin(sopt->sopt_val, buf, valsize));
2408
2409         bcopy(sopt->sopt_val, buf, valsize);
2410         return (0);
2411 }
2412
2413 /*
2414  * Kernel version of setsockopt(2).
2415  *
2416  * XXX: optlen is size_t, not socklen_t
2417  */
2418 int
2419 so_setsockopt(struct socket *so, int level, int optname, void *optval,
2420     size_t optlen)
2421 {
2422         struct sockopt sopt;
2423
2424         sopt.sopt_level = level;
2425         sopt.sopt_name = optname;
2426         sopt.sopt_dir = SOPT_SET;
2427         sopt.sopt_val = optval;
2428         sopt.sopt_valsize = optlen;
2429         sopt.sopt_td = NULL;
2430         return (sosetopt(so, &sopt));
2431 }
2432
2433 int
2434 sosetopt(struct socket *so, struct sockopt *sopt)
2435 {
2436         int     error, optval;
2437         struct  linger l;
2438         struct  timeval tv;
2439         u_long  val;
2440         uint32_t val32;
2441 #ifdef MAC
2442         struct mac extmac;
2443 #endif
2444
2445         CURVNET_SET(so->so_vnet);
2446         error = 0;
2447         if (sopt->sopt_level != SOL_SOCKET) {
2448                 if (so->so_proto && so->so_proto->pr_ctloutput) {
2449                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
2450                         CURVNET_RESTORE();
2451                         return (error);
2452                 }
2453                 error = ENOPROTOOPT;
2454         } else {
2455                 switch (sopt->sopt_name) {
2456 #ifdef INET
2457                 case SO_ACCEPTFILTER:
2458                         error = do_setopt_accept_filter(so, sopt);
2459                         if (error)
2460                                 goto bad;
2461                         break;
2462 #endif
2463                 case SO_LINGER:
2464                         error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2465                         if (error)
2466                                 goto bad;
2467
2468                         SOCK_LOCK(so);
2469                         so->so_linger = l.l_linger;
2470                         if (l.l_onoff)
2471                                 so->so_options |= SO_LINGER;
2472                         else
2473                                 so->so_options &= ~SO_LINGER;
2474                         SOCK_UNLOCK(so);
2475                         break;
2476
2477                 case SO_DEBUG:
2478                 case SO_KEEPALIVE:
2479                 case SO_DONTROUTE:
2480                 case SO_USELOOPBACK:
2481                 case SO_BROADCAST:
2482                 case SO_REUSEADDR:
2483                 case SO_REUSEPORT:
2484                 case SO_OOBINLINE:
2485                 case SO_TIMESTAMP:
2486                 case SO_BINTIME:
2487                 case SO_NOSIGPIPE:
2488                 case SO_NO_DDP:
2489                 case SO_NO_OFFLOAD:
2490                         error = sooptcopyin(sopt, &optval, sizeof optval,
2491                                             sizeof optval);
2492                         if (error)
2493                                 goto bad;
2494                         SOCK_LOCK(so);
2495                         if (optval)
2496                                 so->so_options |= sopt->sopt_name;
2497                         else
2498                                 so->so_options &= ~sopt->sopt_name;
2499                         SOCK_UNLOCK(so);
2500                         break;
2501
2502                 case SO_SETFIB:
2503                         error = sooptcopyin(sopt, &optval, sizeof optval,
2504                                             sizeof optval);
2505                         if (optval < 0 || optval > rt_numfibs) {
2506                                 error = EINVAL;
2507                                 goto bad;
2508                         }
2509                         if (so->so_proto != NULL &&
2510                            ((so->so_proto->pr_domain->dom_family == PF_INET) ||
2511                            (so->so_proto->pr_domain->dom_family == PF_ROUTE))) {
2512                                 so->so_fibnum = optval;
2513                                 /* Note: ignore error */
2514                                 if (so->so_proto->pr_ctloutput)
2515                                         (*so->so_proto->pr_ctloutput)(so, sopt);
2516                         } else {
2517                                 so->so_fibnum = 0;
2518                         }
2519                         break;
2520
2521                 case SO_USER_COOKIE:
2522                         error = sooptcopyin(sopt, &val32, sizeof val32,
2523                                             sizeof val32);
2524                         if (error)
2525                                 goto bad;
2526                         so->so_user_cookie = val32;
2527                         break;
2528
2529                 case SO_SNDBUF:
2530                 case SO_RCVBUF:
2531                 case SO_SNDLOWAT:
2532                 case SO_RCVLOWAT:
2533                         error = sooptcopyin(sopt, &optval, sizeof optval,
2534                                             sizeof optval);
2535                         if (error)
2536                                 goto bad;
2537
2538                         /*
2539                          * Values < 1 make no sense for any of these options,
2540                          * so disallow them.
2541                          */
2542                         if (optval < 1) {
2543                                 error = EINVAL;
2544                                 goto bad;
2545                         }
2546
2547                         switch (sopt->sopt_name) {
2548                         case SO_SNDBUF:
2549                         case SO_RCVBUF:
2550                                 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2551                                     &so->so_snd : &so->so_rcv, (u_long)optval,
2552                                     so, curthread) == 0) {
2553                                         error = ENOBUFS;
2554                                         goto bad;
2555                                 }
2556                                 (sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2557                                     &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2558                                 break;
2559
2560                         /*
2561                          * Make sure the low-water is never greater than the
2562                          * high-water.
2563                          */
2564                         case SO_SNDLOWAT:
2565                                 SOCKBUF_LOCK(&so->so_snd);
2566                                 so->so_snd.sb_lowat =
2567                                     (optval > so->so_snd.sb_hiwat) ?
2568                                     so->so_snd.sb_hiwat : optval;
2569                                 SOCKBUF_UNLOCK(&so->so_snd);
2570                                 break;
2571                         case SO_RCVLOWAT:
2572                                 SOCKBUF_LOCK(&so->so_rcv);
2573                                 so->so_rcv.sb_lowat =
2574                                     (optval > so->so_rcv.sb_hiwat) ?
2575                                     so->so_rcv.sb_hiwat : optval;
2576                                 SOCKBUF_UNLOCK(&so->so_rcv);
2577                                 break;
2578                         }
2579                         break;
2580
2581                 case SO_SNDTIMEO:
2582                 case SO_RCVTIMEO:
2583 #ifdef COMPAT_FREEBSD32
2584                         if (SV_CURPROC_FLAG(SV_ILP32)) {
2585                                 struct timeval32 tv32;
2586
2587                                 error = sooptcopyin(sopt, &tv32, sizeof tv32,
2588                                     sizeof tv32);
2589                                 CP(tv32, tv, tv_sec);
2590                                 CP(tv32, tv, tv_usec);
2591                         } else
2592 #endif
2593                                 error = sooptcopyin(sopt, &tv, sizeof tv,
2594                                     sizeof tv);
2595                         if (error)
2596                                 goto bad;
2597
2598                         /* assert(hz > 0); */
2599                         if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2600                             tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2601                                 error = EDOM;
2602                                 goto bad;
2603                         }
2604                         /* assert(tick > 0); */
2605                         /* assert(ULONG_MAX - INT_MAX >= 1000000); */
2606                         val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2607                         if (val > INT_MAX) {
2608                                 error = EDOM;
2609                                 goto bad;
2610                         }
2611                         if (val == 0 && tv.tv_usec != 0)
2612                                 val = 1;
2613
2614                         switch (sopt->sopt_name) {
2615                         case SO_SNDTIMEO:
2616                                 so->so_snd.sb_timeo = val;
2617                                 break;
2618                         case SO_RCVTIMEO:
2619                                 so->so_rcv.sb_timeo = val;
2620                                 break;
2621                         }
2622                         break;
2623
2624                 case SO_LABEL:
2625 #ifdef MAC
2626                         error = sooptcopyin(sopt, &extmac, sizeof extmac,
2627                             sizeof extmac);
2628                         if (error)
2629                                 goto bad;
2630                         error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2631                             so, &extmac);
2632 #else
2633                         error = EOPNOTSUPP;
2634 #endif
2635                         break;
2636
2637                 default:
2638                         error = ENOPROTOOPT;
2639                         break;
2640                 }
2641                 if (error == 0 && so->so_proto != NULL &&
2642                     so->so_proto->pr_ctloutput != NULL) {
2643                         (void) ((*so->so_proto->pr_ctloutput)
2644                                   (so, sopt));
2645                 }
2646         }
2647 bad:
2648         CURVNET_RESTORE();
2649         return (error);
2650 }
2651
2652 /*
2653  * Helper routine for getsockopt.
2654  */
2655 int
2656 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2657 {
2658         int     error;
2659         size_t  valsize;
2660
2661         error = 0;
2662
2663         /*
2664          * Documented get behavior is that we always return a value, possibly
2665          * truncated to fit in the user's buffer.  Traditional behavior is
2666          * that we always tell the user precisely how much we copied, rather
2667          * than something useful like the total amount we had available for
2668          * her.  Note that this interface is not idempotent; the entire
2669          * answer must generated ahead of time.
2670          */
2671         valsize = min(len, sopt->sopt_valsize);
2672         sopt->sopt_valsize = valsize;
2673         if (sopt->sopt_val != NULL) {
2674                 if (sopt->sopt_td != NULL)
2675                         error = copyout(buf, sopt->sopt_val, valsize);
2676                 else
2677                         bcopy(buf, sopt->sopt_val, valsize);
2678         }
2679         return (error);
2680 }
2681
2682 int
2683 sogetopt(struct socket *so, struct sockopt *sopt)
2684 {
2685         int     error, optval;
2686         struct  linger l;
2687         struct  timeval tv;
2688 #ifdef MAC
2689         struct mac extmac;
2690 #endif
2691
2692         CURVNET_SET(so->so_vnet);
2693         error = 0;
2694         if (sopt->sopt_level != SOL_SOCKET) {
2695                 if (so->so_proto && so->so_proto->pr_ctloutput)
2696                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
2697                 else
2698                         error = ENOPROTOOPT;
2699                 CURVNET_RESTORE();
2700                 return (error);
2701         } else {
2702                 switch (sopt->sopt_name) {
2703 #ifdef INET
2704                 case SO_ACCEPTFILTER:
2705                         error = do_getopt_accept_filter(so, sopt);
2706                         break;
2707 #endif
2708                 case SO_LINGER:
2709                         SOCK_LOCK(so);
2710                         l.l_onoff = so->so_options & SO_LINGER;
2711                         l.l_linger = so->so_linger;
2712                         SOCK_UNLOCK(so);
2713                         error = sooptcopyout(sopt, &l, sizeof l);
2714                         break;
2715
2716                 case SO_USELOOPBACK:
2717                 case SO_DONTROUTE:
2718                 case SO_DEBUG:
2719                 case SO_KEEPALIVE:
2720                 case SO_REUSEADDR:
2721                 case SO_REUSEPORT:
2722                 case SO_BROADCAST:
2723                 case SO_OOBINLINE:
2724                 case SO_ACCEPTCONN:
2725                 case SO_TIMESTAMP:
2726                 case SO_BINTIME:
2727                 case SO_NOSIGPIPE:
2728                         optval = so->so_options & sopt->sopt_name;
2729 integer:
2730                         error = sooptcopyout(sopt, &optval, sizeof optval);
2731                         break;
2732
2733                 case SO_TYPE:
2734                         optval = so->so_type;
2735                         goto integer;
2736
2737                 case SO_ERROR:
2738                         SOCK_LOCK(so);
2739                         optval = so->so_error;
2740                         so->so_error = 0;
2741                         SOCK_UNLOCK(so);
2742                         goto integer;
2743
2744                 case SO_SNDBUF:
2745                         optval = so->so_snd.sb_hiwat;
2746                         goto integer;
2747
2748                 case SO_RCVBUF:
2749                         optval = so->so_rcv.sb_hiwat;
2750                         goto integer;
2751
2752                 case SO_SNDLOWAT:
2753                         optval = so->so_snd.sb_lowat;
2754                         goto integer;
2755
2756                 case SO_RCVLOWAT:
2757                         optval = so->so_rcv.sb_lowat;
2758                         goto integer;
2759
2760                 case SO_SNDTIMEO:
2761                 case SO_RCVTIMEO:
2762                         optval = (sopt->sopt_name == SO_SNDTIMEO ?
2763                                   so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2764
2765                         tv.tv_sec = optval / hz;
2766                         tv.tv_usec = (optval % hz) * tick;
2767 #ifdef COMPAT_FREEBSD32
2768                         if (SV_CURPROC_FLAG(SV_ILP32)) {
2769                                 struct timeval32 tv32;
2770
2771                                 CP(tv, tv32, tv_sec);
2772                                 CP(tv, tv32, tv_usec);
2773                                 error = sooptcopyout(sopt, &tv32, sizeof tv32);
2774                         } else
2775 #endif
2776                                 error = sooptcopyout(sopt, &tv, sizeof tv);
2777                         break;
2778
2779                 case SO_LABEL:
2780 #ifdef MAC
2781                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2782                             sizeof(extmac));
2783                         if (error)
2784                                 goto bad;
2785                         error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2786                             so, &extmac);
2787                         if (error)
2788                                 goto bad;
2789                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
2790 #else
2791                         error = EOPNOTSUPP;
2792 #endif
2793                         break;
2794
2795                 case SO_PEERLABEL:
2796 #ifdef MAC
2797                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2798                             sizeof(extmac));
2799                         if (error)
2800                                 goto bad;
2801                         error = mac_getsockopt_peerlabel(
2802                             sopt->sopt_td->td_ucred, so, &extmac);
2803                         if (error)
2804                                 goto bad;
2805                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
2806 #else
2807                         error = EOPNOTSUPP;
2808 #endif
2809                         break;
2810
2811                 case SO_LISTENQLIMIT:
2812                         optval = so->so_qlimit;
2813                         goto integer;
2814
2815                 case SO_LISTENQLEN:
2816                         optval = so->so_qlen;
2817                         goto integer;
2818
2819                 case SO_LISTENINCQLEN:
2820                         optval = so->so_incqlen;
2821                         goto integer;
2822
2823                 default:
2824                         error = ENOPROTOOPT;
2825                         break;
2826                 }
2827         }
2828 #ifdef MAC
2829 bad:
2830 #endif
2831         CURVNET_RESTORE();
2832         return (error);
2833 }
2834
2835 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
2836 int
2837 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2838 {
2839         struct mbuf *m, *m_prev;
2840         int sopt_size = sopt->sopt_valsize;
2841
2842         MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2843         if (m == NULL)
2844                 return ENOBUFS;
2845         if (sopt_size > MLEN) {
2846                 MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
2847                 if ((m->m_flags & M_EXT) == 0) {
2848                         m_free(m);
2849                         return ENOBUFS;
2850                 }
2851                 m->m_len = min(MCLBYTES, sopt_size);
2852         } else {
2853                 m->m_len = min(MLEN, sopt_size);
2854         }
2855         sopt_size -= m->m_len;
2856         *mp = m;
2857         m_prev = m;
2858
2859         while (sopt_size) {
2860                 MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2861                 if (m == NULL) {
2862                         m_freem(*mp);
2863                         return ENOBUFS;
2864                 }
2865                 if (sopt_size > MLEN) {
2866                         MCLGET(m, sopt->sopt_td != NULL ? M_WAIT :
2867                             M_DONTWAIT);
2868                         if ((m->m_flags & M_EXT) == 0) {
2869                                 m_freem(m);
2870                                 m_freem(*mp);
2871                                 return ENOBUFS;
2872                         }
2873                         m->m_len = min(MCLBYTES, sopt_size);
2874                 } else {
2875                         m->m_len = min(MLEN, sopt_size);
2876                 }
2877                 sopt_size -= m->m_len;
2878                 m_prev->m_next = m;
2879                 m_prev = m;
2880         }
2881         return (0);
2882 }
2883
2884 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2885 int
2886 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2887 {
2888         struct mbuf *m0 = m;
2889
2890         if (sopt->sopt_val == NULL)
2891                 return (0);
2892         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2893                 if (sopt->sopt_td != NULL) {
2894                         int error;
2895
2896                         error = copyin(sopt->sopt_val, mtod(m, char *),
2897                                        m->m_len);
2898                         if (error != 0) {
2899                                 m_freem(m0);
2900                                 return(error);
2901                         }
2902                 } else
2903                         bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2904                 sopt->sopt_valsize -= m->m_len;
2905                 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2906                 m = m->m_next;
2907         }
2908         if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
2909                 panic("ip6_sooptmcopyin");
2910         return (0);
2911 }
2912
2913 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2914 int
2915 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2916 {
2917         struct mbuf *m0 = m;
2918         size_t valsize = 0;
2919
2920         if (sopt->sopt_val == NULL)
2921                 return (0);
2922         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2923                 if (sopt->sopt_td != NULL) {
2924                         int error;
2925
2926                         error = copyout(mtod(m, char *), sopt->sopt_val,
2927                                        m->m_len);
2928                         if (error != 0) {
2929                                 m_freem(m0);
2930                                 return(error);
2931                         }
2932                 } else
2933                         bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2934                sopt->sopt_valsize -= m->m_len;
2935                sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2936                valsize += m->m_len;
2937                m = m->m_next;
2938         }
2939         if (m != NULL) {
2940                 /* enough soopt buffer should be given from user-land */
2941                 m_freem(m0);
2942                 return(EINVAL);
2943         }
2944         sopt->sopt_valsize = valsize;
2945         return (0);
2946 }
2947
2948 /*
2949  * sohasoutofband(): protocol notifies socket layer of the arrival of new
2950  * out-of-band data, which will then notify socket consumers.
2951  */
2952 void
2953 sohasoutofband(struct socket *so)
2954 {
2955
2956         if (so->so_sigio != NULL)
2957                 pgsigio(&so->so_sigio, SIGURG, 0);
2958         selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2959 }
2960
2961 int
2962 sopoll(struct socket *so, int events, struct ucred *active_cred,
2963     struct thread *td)
2964 {
2965
2966         /*
2967          * We do not need to set or assert curvnet as long as everyone uses
2968          * sopoll_generic().
2969          */
2970         return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2971             td));
2972 }
2973
2974 int
2975 sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2976     struct thread *td)
2977 {
2978         int revents = 0;
2979
2980         SOCKBUF_LOCK(&so->so_snd);
2981         SOCKBUF_LOCK(&so->so_rcv);
2982         if (events & (POLLIN | POLLRDNORM))
2983                 if (soreadabledata(so))
2984                         revents |= events & (POLLIN | POLLRDNORM);
2985
2986         if (events & (POLLOUT | POLLWRNORM))
2987                 if (sowriteable(so))
2988                         revents |= events & (POLLOUT | POLLWRNORM);
2989
2990         if (events & (POLLPRI | POLLRDBAND))
2991                 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2992                         revents |= events & (POLLPRI | POLLRDBAND);
2993
2994         if ((events & POLLINIGNEOF) == 0) {
2995                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2996                         revents |= events & (POLLIN | POLLRDNORM);
2997                         if (so->so_snd.sb_state & SBS_CANTSENDMORE)
2998                                 revents |= POLLHUP;
2999                 }
3000         }
3001
3002         if (revents == 0) {
3003                 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
3004                         selrecord(td, &so->so_rcv.sb_sel);
3005                         so->so_rcv.sb_flags |= SB_SEL;
3006                 }
3007
3008                 if (events & (POLLOUT | POLLWRNORM)) {
3009                         selrecord(td, &so->so_snd.sb_sel);
3010                         so->so_snd.sb_flags |= SB_SEL;
3011                 }
3012         }
3013
3014         SOCKBUF_UNLOCK(&so->so_rcv);
3015         SOCKBUF_UNLOCK(&so->so_snd);
3016         return (revents);
3017 }
3018
3019 int
3020 soo_kqfilter(struct file *fp, struct knote *kn)
3021 {
3022         struct socket *so = kn->kn_fp->f_data;
3023         struct sockbuf *sb;
3024
3025         switch (kn->kn_filter) {
3026         case EVFILT_READ:
3027                 if (so->so_options & SO_ACCEPTCONN)
3028                         kn->kn_fop = &solisten_filtops;
3029                 else
3030                         kn->kn_fop = &soread_filtops;
3031                 sb = &so->so_rcv;
3032                 break;
3033         case EVFILT_WRITE:
3034                 kn->kn_fop = &sowrite_filtops;
3035                 sb = &so->so_snd;
3036                 break;
3037         default:
3038                 return (EINVAL);
3039         }
3040
3041         SOCKBUF_LOCK(sb);
3042         knlist_add(&sb->sb_sel.si_note, kn, 1);
3043         sb->sb_flags |= SB_KNOTE;
3044         SOCKBUF_UNLOCK(sb);
3045         return (0);
3046 }
3047
3048 /*
3049  * Some routines that return EOPNOTSUPP for entry points that are not
3050  * supported by a protocol.  Fill in as needed.
3051  */
/* Stub for the accept entry point: ignores its arguments, fails with EOPNOTSUPP. */
int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{

	return (EOPNOTSUPP);
}
3058
/* Stub for the attach entry point: ignores its arguments, fails with EOPNOTSUPP. */
int
pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
{

	return (EOPNOTSUPP);
}
3065
/* Stub for the bind entry point: ignores its arguments, fails with EOPNOTSUPP. */
int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (EOPNOTSUPP);
}
3072
/* Stub for the connect entry point: ignores its arguments, fails with EOPNOTSUPP. */
int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (EOPNOTSUPP);
}
3079
/* Stub for the connect2 entry point: ignores both sockets, fails with EOPNOTSUPP. */
int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{

	return (EOPNOTSUPP);
}
3086
3087 int
3088 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
3089     struct ifnet *ifp, struct thread *td)
3090 {
3091
3092         return EOPNOTSUPP;
3093 }
3094
/* Stub for the disconnect entry point: ignores the socket, fails with EOPNOTSUPP. */
int
pru_disconnect_notsupp(struct socket *so)
{

	return (EOPNOTSUPP);
}
3101
/* Stub for the listen entry point: ignores its arguments, fails with EOPNOTSUPP. */
int
pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
{

	return (EOPNOTSUPP);
}
3108
/* Stub for the peeraddr entry point: ignores its arguments, fails with EOPNOTSUPP. */
int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{

	return (EOPNOTSUPP);
}
3115
/* Stub for the rcvd entry point: ignores its arguments, fails with EOPNOTSUPP. */
int
pru_rcvd_notsupp(struct socket *so, int flags)
{

	return (EOPNOTSUPP);
}
3122
/* Stub for the rcvoob entry point: ignores its arguments, fails with EOPNOTSUPP. */
int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{

	return (EOPNOTSUPP);
}
3129
/*
 * Stub for the send entry point: ignores its arguments and fails with
 * EOPNOTSUPP.  The mbufs passed in are not touched here.
 */
int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *addr, struct mbuf *control, struct thread *td)
{

	return (EOPNOTSUPP);
}
3137
3138 /*
3139  * This isn't really a ``null'' operation, but it's the default one and
3140  * doesn't do anything destructive.
3141  */
3142 int
3143 pru_sense_null(struct socket *so, struct stat *sb)
3144 {
3145
3146         sb->st_blksize = so->so_snd.sb_hiwat;
3147         return 0;
3148 }
3149
/* Stub for the shutdown entry point: ignores the socket, fails with EOPNOTSUPP. */
int
pru_shutdown_notsupp(struct socket *so)
{

	return (EOPNOTSUPP);
}
3156
/* Stub for the sockaddr entry point: ignores its arguments, fails with EOPNOTSUPP. */
int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{

	return (EOPNOTSUPP);
}
3163
/*
 * Stub for the sosend entry point: ignores its arguments and fails with
 * EOPNOTSUPP.  Neither the uio nor the mbuf chains are touched here.
 */
int
pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{

	return (EOPNOTSUPP);
}
3171
/*
 * Stub for the soreceive entry point: ignores its arguments and fails with
 * EOPNOTSUPP.  None of the output pointers are written.
 */
int
pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{

	return (EOPNOTSUPP);
}
3179
/*
 * Stub for the sopoll entry point: ignores its arguments and returns
 * EOPNOTSUPP.
 */
int
pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
    struct thread *td)
{

	return (EOPNOTSUPP);
}
3187
3188 static void
3189 filt_sordetach(struct knote *kn)
3190 {
3191         struct socket *so = kn->kn_fp->f_data;
3192
3193         SOCKBUF_LOCK(&so->so_rcv);
3194         knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
3195         if (knlist_empty(&so->so_rcv.sb_sel.si_note))
3196                 so->so_rcv.sb_flags &= ~SB_KNOTE;
3197         SOCKBUF_UNLOCK(&so->so_rcv);
3198 }
3199
3200 /*ARGSUSED*/
3201 static int
3202 filt_soread(struct knote *kn, long hint)
3203 {
3204         struct socket *so;
3205
3206         so = kn->kn_fp->f_data;
3207         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3208
3209         kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3210         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3211                 kn->kn_flags |= EV_EOF;
3212                 kn->kn_fflags = so->so_error;
3213                 return (1);
3214         } else if (so->so_error)        /* temporary udp error */
3215                 return (1);
3216         else if (kn->kn_sfflags & NOTE_LOWAT)
3217                 return (kn->kn_data >= kn->kn_sdata);
3218         else
3219                 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
3220 }
3221
3222 static void
3223 filt_sowdetach(struct knote *kn)
3224 {
3225         struct socket *so = kn->kn_fp->f_data;
3226
3227         SOCKBUF_LOCK(&so->so_snd);
3228         knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
3229         if (knlist_empty(&so->so_snd.sb_sel.si_note))
3230                 so->so_snd.sb_flags &= ~SB_KNOTE;
3231         SOCKBUF_UNLOCK(&so->so_snd);
3232 }
3233
3234 /*ARGSUSED*/
3235 static int
3236 filt_sowrite(struct knote *kn, long hint)
3237 {
3238         struct socket *so;
3239
3240         so = kn->kn_fp->f_data;
3241         SOCKBUF_LOCK_ASSERT(&so->so_snd);
3242         kn->kn_data = sbspace(&so->so_snd);
3243         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3244                 kn->kn_flags |= EV_EOF;
3245                 kn->kn_fflags = so->so_error;
3246                 return (1);
3247         } else if (so->so_error)        /* temporary udp error */
3248                 return (1);
3249         else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3250             (so->so_proto->pr_flags & PR_CONNREQUIRED))
3251                 return (0);
3252         else if (kn->kn_sfflags & NOTE_LOWAT)
3253                 return (kn->kn_data >= kn->kn_sdata);
3254         else
3255                 return (kn->kn_data >= so->so_snd.sb_lowat);
3256 }
3257
3258 /*ARGSUSED*/
3259 static int
3260 filt_solisten(struct knote *kn, long hint)
3261 {
3262         struct socket *so = kn->kn_fp->f_data;
3263
3264         kn->kn_data = so->so_qlen;
3265         return (! TAILQ_EMPTY(&so->so_comp));
3266 }
3267
3268 int
3269 socheckuid(struct socket *so, uid_t uid)
3270 {
3271
3272         if (so == NULL)
3273                 return (EPERM);
3274         if (so->so_cred->cr_uid != uid)
3275                 return (EPERM);
3276         return (0);
3277 }
3278
3279 static int
3280 sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
3281 {
3282         int error;
3283         int val;
3284
3285         val = somaxconn;
3286         error = sysctl_handle_int(oidp, &val, 0, req);
3287         if (error || !req->newptr )
3288                 return (error);
3289
3290         if (val < 1 || val > USHRT_MAX)
3291                 return (EINVAL);
3292
3293         somaxconn = val;
3294         return (0);
3295 }
3296
3297 /*
3298  * These functions are used by protocols to notify the socket layer (and its
3299  * consumers) of state changes in the sockets driven by protocol-side events.
3300  */
3301
3302 /*
3303  * Procedures to manipulate state flags of socket and do appropriate wakeups.
3304  *
3305  * Normal sequence from the active (originating) side is that
3306  * soisconnecting() is called during processing of connect() call, resulting
3307  * in an eventual call to soisconnected() if/when the connection is
3308  * established.  When the connection is torn down soisdisconnecting() is
3309  * called during processing of disconnect() call, and soisdisconnected() is
3310  * called when the connection to the peer is totally severed.  The semantics
3311  * of these routines are such that connectionless protocols can call
3312  * soisconnected() and soisdisconnected() only, bypassing the in-progress
3313  * calls when setting up a ``connection'' takes no time.
3314  *
3315  * From the passive side, a socket is created with two queues of sockets:
3316  * so_incomp for connections in progress and so_comp for connections already
3317  * made and awaiting user acceptance.  As a protocol is preparing incoming
3318  * connections, it creates a socket structure queued on so_incomp by calling
3319  * sonewconn().  When the connection is established, soisconnected() is
3320  * called, and transfers the socket structure to so_comp, making it available
3321  * to accept().
3322  *
3323  * If a socket is closed with sockets on either so_incomp or so_comp, these
3324  * sockets are dropped.
3325  *
3326  * If higher-level protocols are implemented in the kernel, the wakeups done
3327  * here will sometimes cause software-interrupt process scheduling.
3328  */
3329 void
3330 soisconnecting(struct socket *so)
3331 {
3332
3333         SOCK_LOCK(so);
3334         so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3335         so->so_state |= SS_ISCONNECTING;
3336         SOCK_UNLOCK(so);
3337 }
3338
3339 void
3340 soisconnected(struct socket *so)
3341 {
3342         struct socket *head;
3343         int ret;
3344
3345 restart:
3346         ACCEPT_LOCK();
3347         SOCK_LOCK(so);
3348         so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3349         so->so_state |= SS_ISCONNECTED;
3350         head = so->so_head;
3351         if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
3352                 if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3353                         SOCK_UNLOCK(so);
3354                         TAILQ_REMOVE(&head->so_incomp, so, so_list);
3355                         head->so_incqlen--;
3356                         so->so_qstate &= ~SQ_INCOMP;
3357                         TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
3358                         head->so_qlen++;
3359                         so->so_qstate |= SQ_COMP;
3360                         ACCEPT_UNLOCK();
3361                         sorwakeup(head);
3362                         wakeup_one(&head->so_timeo);
3363                 } else {
3364                         ACCEPT_UNLOCK();
3365                         soupcall_set(so, SO_RCV,
3366                             head->so_accf->so_accept_filter->accf_callback,
3367                             head->so_accf->so_accept_filter_arg);
3368                         so->so_options &= ~SO_ACCEPTFILTER;
3369                         ret = head->so_accf->so_accept_filter->accf_callback(so,
3370                             head->so_accf->so_accept_filter_arg, M_DONTWAIT);
3371                         if (ret == SU_ISCONNECTED)
3372                                 soupcall_clear(so, SO_RCV);
3373                         SOCK_UNLOCK(so);
3374                         if (ret == SU_ISCONNECTED)
3375                                 goto restart;
3376                 }
3377                 return;
3378         }
3379         SOCK_UNLOCK(so);
3380         ACCEPT_UNLOCK();
3381         wakeup(&so->so_timeo);
3382         sorwakeup(so);
3383         sowwakeup(so);
3384 }
3385
3386 void
3387 soisdisconnecting(struct socket *so)
3388 {
3389
3390         /*
3391          * Note: This code assumes that SOCK_LOCK(so) and
3392          * SOCKBUF_LOCK(&so->so_rcv) are the same.
3393          */
3394         SOCKBUF_LOCK(&so->so_rcv);
3395         so->so_state &= ~SS_ISCONNECTING;
3396         so->so_state |= SS_ISDISCONNECTING;
3397         so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3398         sorwakeup_locked(so);
3399         SOCKBUF_LOCK(&so->so_snd);
3400         so->so_snd.sb_state |= SBS_CANTSENDMORE;
3401         sowwakeup_locked(so);
3402         wakeup(&so->so_timeo);
3403 }
3404
3405 void
3406 soisdisconnected(struct socket *so)
3407 {
3408
3409         /*
3410          * Note: This code assumes that SOCK_LOCK(so) and
3411          * SOCKBUF_LOCK(&so->so_rcv) are the same.
3412          */
3413         SOCKBUF_LOCK(&so->so_rcv);
3414         so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3415         so->so_state |= SS_ISDISCONNECTED;
3416         so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3417         sorwakeup_locked(so);
3418         SOCKBUF_LOCK(&so->so_snd);
3419         so->so_snd.sb_state |= SBS_CANTSENDMORE;
3420         sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
3421         sowwakeup_locked(so);
3422         wakeup(&so->so_timeo);
3423 }
3424
3425 /*
3426  * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3427  */
3428 struct sockaddr *
3429 sodupsockaddr(const struct sockaddr *sa, int mflags)
3430 {
3431         struct sockaddr *sa2;
3432
3433         sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3434         if (sa2)
3435                 bcopy(sa, sa2, sa->sa_len);
3436         return sa2;
3437 }
3438
3439 /*
3440  * Register per-socket buffer upcalls.
3441  */
3442 void
3443 soupcall_set(struct socket *so, int which,
3444     int (*func)(struct socket *, void *, int), void *arg)
3445 {
3446         struct sockbuf *sb;
3447
3448         switch (which) {
3449         case SO_RCV:
3450                 sb = &so->so_rcv;
3451                 break;
3452         case SO_SND:
3453                 sb = &so->so_snd;
3454                 break;
3455         default:
3456                 panic("soupcall_set: bad which");
3457         }
3458         SOCKBUF_LOCK_ASSERT(sb);
3459 #if 0
3460         /* XXX: accf_http actually wants to do this on purpose. */
3461         KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
3462 #endif
3463         sb->sb_upcall = func;
3464         sb->sb_upcallarg = arg;
3465         sb->sb_flags |= SB_UPCALL;
3466 }
3467
3468 void
3469 soupcall_clear(struct socket *so, int which)
3470 {
3471         struct sockbuf *sb;
3472
3473         switch (which) {
3474         case SO_RCV:
3475                 sb = &so->so_rcv;
3476                 break;
3477         case SO_SND:
3478                 sb = &so->so_snd;
3479                 break;
3480         default:
3481                 panic("soupcall_clear: bad which");
3482         }
3483         SOCKBUF_LOCK_ASSERT(sb);
3484         KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
3485         sb->sb_upcall = NULL;
3486         sb->sb_upcallarg = NULL;
3487         sb->sb_flags &= ~SB_UPCALL;
3488 }
3489
3490 /*
3491  * Create an external-format (``xsocket'') structure using the information in
3492  * the kernel-format socket structure pointed to by so.  This is done to
3493  * reduce the spew of irrelevant information over this interface, to isolate
3494  * user code from changes in the kernel structure, and potentially to provide
3495  * information-hiding if we decide that some of this information should be
3496  * hidden from users.
3497  */
3498 void
3499 sotoxsocket(struct socket *so, struct xsocket *xso)
3500 {
3501
3502         xso->xso_len = sizeof *xso;
3503         xso->xso_so = so;
3504         xso->so_type = so->so_type;
3505         xso->so_options = so->so_options;
3506         xso->so_linger = so->so_linger;
3507         xso->so_state = so->so_state;
3508         xso->so_pcb = so->so_pcb;
3509         xso->xso_protocol = so->so_proto->pr_protocol;
3510         xso->xso_family = so->so_proto->pr_domain->dom_family;
3511         xso->so_qlen = so->so_qlen;
3512         xso->so_incqlen = so->so_incqlen;
3513         xso->so_qlimit = so->so_qlimit;
3514         xso->so_timeo = so->so_timeo;
3515         xso->so_error = so->so_error;
3516         xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
3517         xso->so_oobmark = so->so_oobmark;
3518         sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3519         sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3520         xso->so_uid = so->so_cred->cr_uid;
3521 }
3522
3523
3524 /*
3525  * Socket accessor functions to provide external consumers with
3526  * a safe interface to socket state
3527  *
3528  */
3529
3530 void
3531 so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg)
3532 {
3533
3534         TAILQ_FOREACH(so, &so->so_comp, so_list)
3535                 func(so, arg);
3536 }
3537
3538 struct sockbuf *
3539 so_sockbuf_rcv(struct socket *so)
3540 {
3541
3542         return (&so->so_rcv);
3543 }
3544
3545 struct sockbuf *
3546 so_sockbuf_snd(struct socket *so)
3547 {
3548
3549         return (&so->so_snd);
3550 }
3551
3552 int
3553 so_state_get(const struct socket *so)
3554 {
3555
3556         return (so->so_state);
3557 }
3558
3559 void
3560 so_state_set(struct socket *so, int val)
3561 {
3562
3563         so->so_state = val;
3564 }
3565
3566 int
3567 so_options_get(const struct socket *so)
3568 {
3569
3570         return (so->so_options);
3571 }
3572
3573 void
3574 so_options_set(struct socket *so, int val)
3575 {
3576
3577         so->so_options = val;
3578 }
3579
3580 int
3581 so_error_get(const struct socket *so)
3582 {
3583
3584         return (so->so_error);
3585 }
3586
3587 void
3588 so_error_set(struct socket *so, int val)
3589 {
3590
3591         so->so_error = val;
3592 }
3593
3594 int
3595 so_linger_get(const struct socket *so)
3596 {
3597
3598         return (so->so_linger);
3599 }
3600
3601 void
3602 so_linger_set(struct socket *so, int val)
3603 {
3604
3605         so->so_linger = val;
3606 }
3607
3608 struct protosw *
3609 so_protosw_get(const struct socket *so)
3610 {
3611
3612         return (so->so_proto);
3613 }
3614
3615 void
3616 so_protosw_set(struct socket *so, struct protosw *val)
3617 {
3618
3619         so->so_proto = val;
3620 }
3621
/* Function-call wrapper that forwards to sorwakeup() on the same socket. */
void
so_sorwakeup(struct socket *so)
{

	sorwakeup(so);
}
3628
/* Function-call wrapper that forwards to sowwakeup() on the same socket. */
void
so_sowwakeup(struct socket *so)
{

	sowwakeup(so);
}
3635
/*
 * Function-call wrapper that forwards to sorwakeup_locked().  Going by the
 * name, the caller presumably already holds the receive buffer lock --
 * TODO confirm against sorwakeup_locked().
 */
void
so_sorwakeup_locked(struct socket *so)
{

	sorwakeup_locked(so);
}
3642
/*
 * Function-call wrapper that forwards to sowwakeup_locked().  Going by the
 * name, the caller presumably already holds the send buffer lock -- TODO
 * confirm against sowwakeup_locked().
 */
void
so_sowwakeup_locked(struct socket *so)
{

	sowwakeup_locked(so);
}
3649
/* Function-call wrapper that applies SOCK_LOCK() to the socket. */
void
so_lock(struct socket *so)
{

	SOCK_LOCK(so);
}
3655
/* Function-call wrapper that applies SOCK_UNLOCK() to the socket. */
void
so_unlock(struct socket *so)
{

	SOCK_UNLOCK(so);
}