sys/kern/uipc_socket.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
   3  *      The Regents of the University of California.
   4  * Copyright (c) 2004 The FreeBSD Foundation
   5  * Copyright (c) 2004-2008 Robert N. M. Watson
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  * 4. Neither the name of the University nor the names of its contributors
  17  *    may be used to endorse or promote products derived from this software
  18  *    without specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30  * SUCH DAMAGE.
  31  *
  32  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
  33  */
  34
  35 /*
  36  * Comments on the socket life cycle:
  37  *
   38  * soalloc() sets up socket layer state for a socket, called only by
  39  * socreate() and sonewconn().  Socket layer private.
  40  *
  41  * sodealloc() tears down socket layer state for a socket, called only by
   42  * sofree(), socreate() and sonewconn().  Socket layer private.
  43  *
  44  * pru_attach() associates protocol layer state with an allocated socket;
  45  * called only once, may fail, aborting socket allocation.  This is called
  46  * from socreate() and sonewconn().  Socket layer private.
  47  *
  48  * pru_detach() disassociates protocol layer state from an attached socket,
  49  * and will be called exactly once for sockets in which pru_attach() has
  50  * been successfully called.  If pru_attach() returned an error,
  51  * pru_detach() will not be called.  Socket layer private.
  52  *
  53  * pru_abort() and pru_close() notify the protocol layer that the last
  54  * consumer of a socket is starting to tear down the socket, and that the
  55  * protocol should terminate the connection.  Historically, pru_abort() also
  56  * detached protocol state from the socket state, but this is no longer the
  57  * case.
  58  *
  59  * socreate() creates a socket and attaches protocol state.  This is a public
  60  * interface that may be used by socket layer consumers to create new
  61  * sockets.
  62  *
  63  * sonewconn() creates a socket and attaches protocol state.  This is a
  64  * public interface  that may be used by protocols to create new sockets when
  65  * a new connection is received and will be available for accept() on a
  66  * listen socket.
  67  *
  68  * soclose() destroys a socket after possibly waiting for it to disconnect.
  69  * This is a public interface that socket consumers should use to close and
  70  * release a socket when done with it.
  71  *
  72  * soabort() destroys a socket without waiting for it to disconnect (used
  73  * only for incoming connections that are already partially or fully
  74  * connected).  This is used internally by the socket layer when clearing
  75  * listen socket queues (due to overflow or close on the listen socket), but
  76  * is also a public interface protocols may use to abort connections in
  77  * their incomplete listen queues should they no longer be required.  Sockets
  78  * placed in completed connection listen queues should not be aborted for
  79  * reasons described in the comment above the soclose() implementation.  This
  80  * is not a general purpose close routine, and except in the specific
  81  * circumstances described here, should not be used.
  82  *
  83  * sofree() will free a socket and its protocol state if all references on
  84  * the socket have been released, and is the public interface to attempt to
  85  * free a socket when a reference is removed.  This is a socket layer private
  86  * interface.
  87  *
  88  * NOTE: In addition to socreate() and soclose(), which provide a single
  89  * socket reference to the consumer to be managed as required, there are two
  90  * calls to explicitly manage socket references, soref(), and sorele().
  91  * Currently, these are generally required only when transitioning a socket
  92  * from a listen queue to a file descriptor, in order to prevent garbage
  93  * collection of the socket at an untimely moment.  For a number of reasons,
  94  * these interfaces are not preferred, and should be avoided.
  95  *
  96  * NOTE: With regard to VNETs the general rule is that callers do not set
  97  * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
  98  * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
  99  * and sorflush(), which are usually called from a pre-set VNET context.
 100  * sopoll() currently does not need a VNET context to be set.
 101  */
 102
 103 #include <sys/cdefs.h>
 104 __FBSDID("$FreeBSD$");
 105
 106 #include "opt_inet.h"
 107 #include "opt_inet6.h"
 108 #include "opt_zero.h"
 109 #include "opt_compat.h"
 110
 111 #include <sys/param.h>
 112 #include <sys/systm.h>
 113 #include <sys/fcntl.h>
 114 #include <sys/limits.h>
 115 #include <sys/lock.h>
 116 #include <sys/mac.h>
 117 #include <sys/malloc.h>
 118 #include <sys/mbuf.h>
 119 #include <sys/mutex.h>
 120 #include <sys/domain.h>
 121 #include <sys/file.h>                   /* for struct knote */
 122 #include <sys/kernel.h>
 123 #include <sys/event.h>
 124 #include <sys/eventhandler.h>
 125 #include <sys/poll.h>
 126 #include <sys/proc.h>
 127 #include <sys/protosw.h>
 128 #include <sys/socket.h>
 129 #include <sys/socketvar.h>
 130 #include <sys/resourcevar.h>
 131 #include <net/route.h>
 132 #include <sys/signalvar.h>
 133 #include <sys/stat.h>
 134 #include <sys/sx.h>
 135 #include <sys/sysctl.h>
 136 #include <sys/uio.h>
 137 #include <sys/jail.h>
 138
 139 #include <net/vnet.h>
 140
 141 #include <security/mac/mac_framework.h>
 142
 143 #include <vm/uma.h>
 144
 145 #ifdef COMPAT_FREEBSD32
 146 #include <sys/mount.h>
 147 #include <sys/sysent.h>
 148 #include <compat/freebsd32/freebsd32.h>
 149 #endif
 150
      /* Out-of-band receive helper; defined later in this file. */
  151 static int      soreceive_rcvoob(struct socket *so, struct uio *uio,
  152                     int flags);
  153
      /*
       * kqueue filter callbacks for sockets.  The detach routines are split by
       * direction (filt_sordetach / filt_sowdetach); the event routines are
       * split into read, write and listen variants.
       */
  154 static void     filt_sordetach(struct knote *kn);
  155 static int      filt_soread(struct knote *kn, long hint);
  156 static void     filt_sowdetach(struct knote *kn);
  157 static int      filt_sowrite(struct knote *kn, long hint);
  158 static int      filt_solisten(struct knote *kn, long hint);
  159
      /*
       * Filter operation tables.  All three set f_isfd.  The listen and read
       * tables share filt_sordetach as their detach routine; the write table
       * uses filt_sowdetach.
       */
  160 static struct filterops solisten_filtops = {
  161         .f_isfd = 1,
  162         .f_detach = filt_sordetach,
  163         .f_event = filt_solisten,
  164 };
  165 static struct filterops soread_filtops = {
  166         .f_isfd = 1,
  167         .f_detach = filt_sordetach,
  168         .f_event = filt_soread,
  169 };
  170 static struct filterops sowrite_filtops = {
  171         .f_isfd = 1,
  172         .f_detach = filt_sowdetach,
  173         .f_event = filt_sowrite,
  174 };
 175
      /* UMA zone that soalloc() allocates from and sodealloc() frees to. */
  176 uma_zone_t socket_zone;
  177 so_gen_t        so_gencnt;      /* generation count for sockets */
  178
      /*
       * Global socket limit.  Seeded by init_maxsockets() and only ever raised
       * at run time through sysctl_maxsockets().
       */
  179 int     maxsockets;
  180
  181 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
  182 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
  183
      /*
       * Assert that a VNET context (curvnet) is set; the socket pointer is only
       * used for the diagnostic message.  Note that the expansion already ends
       * in a semicolon.
       */
  184 #define VNET_SO_ASSERT(so)                                              \
  185         VNET_ASSERT(curvnet != NULL,                                    \
  186             ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
  187
      /* Upper bound applied to listen backlogs by solisten_proto(). */
  188 static int somaxconn = SOMAXCONN;
  189 static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
  190 /* XXX: we don't have SYSCTL_USHORT */
  191 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
  192     0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
  193     "queue size");
      /*
       * Count of allocated sockets: incremented in soalloc() and decremented in
       * sodealloc(), both under so_global_mtx.  Exported read-only.
       */
  194 static int numopensockets;
  195 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
  196     &numopensockets, 0, "Number of open sockets");
  197 #ifdef ZERO_COPY_SOCKETS
  198 /* These aren't static because they're used in other files. */
  199 int so_zero_copy_send = 1;
  200 int so_zero_copy_receive = 1;
  201 SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
  202     "Zero copy controls");
  203 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
  204     &so_zero_copy_receive, 0, "Enable zero copy receive");
  205 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
  206     &so_zero_copy_send, 0, "Enable zero copy send");
  207 #endif /* ZERO_COPY_SOCKETS */
 208
  209 /*
  210  * accept_mtx locks down per-socket fields relating to accept queues.  See
  211  * socketvar.h for an annotation of the protected fields of struct socket.
  212  */
  213 struct mtx accept_mtx;
  214 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
  215
  216 /*
  217  * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
  218  * so_gencnt field.
       *
       * Under VIMAGE, soalloc() and sodealloc() also adjust the per-vnet
       * vnet_sockcnt counter while holding this mutex.
       *
       * NOTE(review): the lock name string "so_glabel" below looks like a typo
       * for "so_global".  It is a runtime-visible name, so it is left as is;
       * confirm nothing matches on it before changing.
  219  */
  220 static struct mtx so_global_mtx;
  221 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
  222
  223 /*
  224  * General IPC sysctl name space, used by sockets and a variety of other IPC
  225  * types.
  226  */
  227 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
 228
 229 /*
 230  * Sysctl to get and set the maximum global sockets limit.  Notify protocols
 231  * of the change so that they can update their dependent limits as required.
 232  */
 233 static int
 234 sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
 235 {
 236         int error, newmaxsockets;
 237
 238         newmaxsockets = maxsockets;
 239         error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
 240         if (error == 0 && req->newptr) {
 241                 if (newmaxsockets > maxsockets) {
 242                         maxsockets = newmaxsockets;
 243                         if (maxsockets > ((maxfiles / 4) * 3)) {
 244                                 maxfiles = (maxsockets * 5) / 4;
 245                                 maxfilesperproc = (maxfiles * 9) / 10;
 246                         }
 247                         EVENTHANDLER_INVOKE(maxsockets_change);
 248                 } else
 249                         error = EINVAL;
 250         }
 251         return (error);
 252 }
 253
 254 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
 255     &maxsockets, 0, sysctl_maxsockets, "IU",
 256     "Maximum number of sockets avaliable");
 257
 258 /*
 259  * Initialise maxsockets.  This SYSINIT must be run after
 260  * tunable_mbinit().
 261  */
 262 static void
 263 init_maxsockets(void *ignored)
 264 {
 265
 266         TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
 267         maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
 268 }
 269 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
 270
 271 /*
 272  * Socket operation routines.  These routines are called by the routines in
 273  * sys_socket.c or from a system process, and implement the semantics of
 274  * socket operations by switching out to the protocol specific routines.
 275  */
 276
 277 /*
 278  * Get a socket structure from our zone, and initialize it.  Note that it
 279  * would probably be better to allocate socket and PCB at the same time, but
 280  * I'm not convinced that all the protocols can be easily modified to do
 281  * this.
 282  *
 283  * soalloc() returns a socket with a ref count of 0.
 284  */
 285 static struct socket *
 286 soalloc(struct vnet *vnet)
 287 {
 288         struct socket *so;
 289
 290         so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
 291         if (so == NULL)
 292                 return (NULL);
 293 #ifdef MAC
 294         if (mac_socket_init(so, M_NOWAIT) != 0) {
 295                 uma_zfree(socket_zone, so);
 296                 return (NULL);
 297         }
 298 #endif
 299         SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
 300         SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
 301         sx_init(&so->so_snd.sb_sx, "so_snd_sx");
 302         sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
 303         TAILQ_INIT(&so->so_aiojobq);
 304         mtx_lock(&so_global_mtx);
 305         so->so_gencnt = ++so_gencnt;
 306         ++numopensockets;
 307 #ifdef VIMAGE
 308         VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
 309             __func__, __LINE__, so));
 310         vnet->vnet_sockcnt++;
 311         so->so_vnet = vnet;
 312 #endif
 313         mtx_unlock(&so_global_mtx);
 314         return (so);
 315 }
 316
 317 /*
 318  * Free the storage associated with a socket at the socket layer, tear down
 319  * locks, labels, etc.  All protocol state is assumed already to have been
 320  * torn down (and possibly never set up) by the caller.
 321  */
 322 static void
 323 sodealloc(struct socket *so)
 324 {
 325
 326         KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
 327         KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
 328
 329         mtx_lock(&so_global_mtx);
 330         so->so_gencnt = ++so_gencnt;
 331         --numopensockets;       /* Could be below, but faster here. */
 332 #ifdef VIMAGE
 333         VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
 334             __func__, __LINE__, so));
 335         so->so_vnet->vnet_sockcnt--;
 336 #endif
 337         mtx_unlock(&so_global_mtx);
 338         if (so->so_rcv.sb_hiwat)
 339                 (void)chgsbsize(so->so_cred->cr_uidinfo,
 340                     &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
 341         if (so->so_snd.sb_hiwat)
 342                 (void)chgsbsize(so->so_cred->cr_uidinfo,
 343                     &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
 344 #ifdef INET
 345         /* remove acccept filter if one is present. */
 346         if (so->so_accf != NULL)
 347                 do_setopt_accept_filter(so, NULL);
 348 #endif
 349 #ifdef MAC
 350         mac_socket_destroy(so);
 351 #endif
 352         crfree(so->so_cred);
 353         sx_destroy(&so->so_snd.sb_sx);
 354         sx_destroy(&so->so_rcv.sb_sx);
 355         SOCKBUF_LOCK_DESTROY(&so->so_snd);
 356         SOCKBUF_LOCK_DESTROY(&so->so_rcv);
 357         uma_zfree(socket_zone, so);
 358 }
 359
 360 /*
 361  * socreate returns a socket with a ref count of 1.  The socket should be
 362  * closed with soclose().
 363  */
 364 int
 365 socreate(int dom, struct socket **aso, int type, int proto,
 366     struct ucred *cred, struct thread *td)
 367 {
 368         struct protosw *prp;
 369         struct socket *so;
 370         int error;
 371
 372         if (proto)
 373                 prp = pffindproto(dom, proto, type);
 374         else
 375                 prp = pffindtype(dom, type);
 376
 377         if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
 378             prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
 379                 return (EPROTONOSUPPORT);
 380
 381         if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
 382                 return (EPROTONOSUPPORT);
 383
 384         if (prp->pr_type != type)
 385                 return (EPROTOTYPE);
 386         so = soalloc(CRED_TO_VNET(cred));
 387         if (so == NULL)
 388                 return (ENOBUFS);
 389
 390         TAILQ_INIT(&so->so_incomp);
 391         TAILQ_INIT(&so->so_comp);
 392         so->so_type = type;
 393         so->so_cred = crhold(cred);
 394         if ((prp->pr_domain->dom_family == PF_INET) ||
 395             (prp->pr_domain->dom_family == PF_INET6) ||
 396             (prp->pr_domain->dom_family == PF_ROUTE))
 397                 so->so_fibnum = td->td_proc->p_fibnum;
 398         else
 399                 so->so_fibnum = 0;
 400         so->so_proto = prp;
 401 #ifdef MAC
 402         mac_socket_create(cred, so);
 403 #endif
 404         knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
 405         knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
 406         so->so_count = 1;
 407         /*
 408          * Auto-sizing of socket buffers is managed by the protocols and
 409          * the appropriate flags must be set in the pru_attach function.
 410          */
 411         CURVNET_SET(so->so_vnet);
 412         error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
 413         CURVNET_RESTORE();
 414         if (error) {
 415                 KASSERT(so->so_count == 1, ("socreate: so_count %d",
 416                     so->so_count));
 417                 so->so_count = 0;
 418                 sodealloc(so);
 419                 return (error);
 420         }
 421         *aso = so;
 422         return (0);
 423 }
 424
 425 #ifdef REGRESSION
 426 static int regression_sonewconn_earlytest = 1;
 427 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
 428     &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
 429 #endif
 430
 431 /*
 432  * When an attempt at a new connection is noted on a socket which accepts
 433  * connections, sonewconn is called.  If the connection is possible (subject
  434  * to space constraints, etc.) then we allocate a new structure, properly
 435  * linked into the data structure of the original socket, and return this.
 436  * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 437  *
 438  * Note: the ref count on the socket is 0 on return.
 439  */
  440 struct socket *
  441 sonewconn(struct socket *head, int connstatus)
  442 {
  443         struct socket *so;
  444         int over;
  445
              /*
               * Early admission test: refuse when the completed queue already
               * holds more than 1.5 times the listen limit.  The accept lock
               * is dropped right after sampling, so the result is a snapshot.
               */
  446         ACCEPT_LOCK();
  447         over = (head->so_qlen > 3 * head->so_qlimit / 2);
  448         ACCEPT_UNLOCK();
              /*
               * With REGRESSION the early test only applies while the
               * regression.sonewconn_earlytest sysctl is non-zero.
               */
  449 #ifdef REGRESSION
  450         if (regression_sonewconn_earlytest && over)
  451 #else
  452         if (over)
  453 #endif
  454                 return (NULL);
  455         VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
  456             __func__, __LINE__, head));
  457         so = soalloc(head->so_vnet);
  458         if (so == NULL)
  459                 return (NULL);
              /*
               * A listener with SO_ACCEPTFILTER set overrides the caller's
               * connstatus with 0, which sends the new socket to the
               * incomplete queue below.
               */
  460         if ((head->so_options & SO_ACCEPTFILTER) != 0)
  461                 connstatus = 0;
              /* Inherit settings from the listener, minus SO_ACCEPTCONN. */
  462         so->so_head = head;
  463         so->so_type = head->so_type;
  464         so->so_options = head->so_options &~ SO_ACCEPTCONN;
  465         so->so_linger = head->so_linger;
  466         so->so_state = head->so_state | SS_NOFDREF;
  467         so->so_fibnum = head->so_fibnum;
  468         so->so_proto = head->so_proto;
  469         so->so_cred = crhold(head->so_cred);
  470 #ifdef MAC
  471         mac_socket_newconn(head, so);
  472 #endif
  473         knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
  474         knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
  475         VNET_SO_ASSERT(head);
              /*
               * Reserve buffer space using the listener's high-water marks and
               * attach protocol state.  If either step fails the socket is
               * released with sodealloc() and NULL is returned.
               */
  476         if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
  477             (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
  478                 sodealloc(so);
  479                 return (NULL);
  480         }
  481         so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
  482         so->so_snd.sb_lowat = head->so_snd.sb_lowat;
  483         so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
  484         so->so_snd.sb_timeo = head->so_snd.sb_timeo;
  485         so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
  486         so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
  487         so->so_state |= connstatus;
  488         ACCEPT_LOCK();
  489         if (connstatus) {
                      /* Non-zero connstatus: queue as a completed connection. */
  490                 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
  491                 so->so_qstate |= SQ_COMP;
  492                 head->so_qlen++;
  493         } else {
  494                 /*
  495                  * Keep removing sockets from the head until there's room for
  496                  * us to insert on the tail.  In pre-locking revisions, this
  497                  * was a simple if(), but as we could be racing with other
  498                  * threads and soabort() requires dropping locks, we must
  499                  * loop waiting for the condition to be true.
  500                  */
  501                 while (head->so_incqlen > head->so_qlimit) {
  502                         struct socket *sp;
  503                         sp = TAILQ_FIRST(&head->so_incomp);
  504                         TAILQ_REMOVE(&head->so_incomp, sp, so_list);
  505                         head->so_incqlen--;
  506                         sp->so_qstate &= ~SQ_INCOMP;
  507                         sp->so_head = NULL;
                              /* soabort() is called without the accept lock. */
  508                         ACCEPT_UNLOCK();
  509                         soabort(sp);
  510                         ACCEPT_LOCK();
  511                 }
  512                 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
  513                 so->so_qstate |= SQ_INCOMP;
  514                 head->so_incqlen++;
  515         }
  516         ACCEPT_UNLOCK();
              /*
               * For a completed connection, signal readability on the listener
               * and wake one thread sleeping on its so_timeo channel.
               */
  517         if (connstatus) {
  518                 sorwakeup(head);
  519                 wakeup_one(&head->so_timeo);
  520         }
  521         return (so);
  522 }
 523
 524 int
 525 sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
 526 {
 527         int error;
 528
 529         CURVNET_SET(so->so_vnet);
 530         error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
 531         CURVNET_RESTORE();
 532         return error;
 533 }
 534
 535 /*
 536  * solisten() transitions a socket from a non-listening state to a listening
 537  * state, but can also be used to update the listen queue depth on an
 538  * existing listen socket.  The protocol will call back into the sockets
 539  * layer using solisten_proto_check() and solisten_proto() to check and set
 540  * socket-layer listen state.  Call backs are used so that the protocol can
 541  * acquire both protocol and socket layer locks in whatever order is required
 542  * by the protocol.
 543  *
 544  * Protocol implementors are advised to hold the socket lock across the
 545  * socket-layer test and set to avoid races at the socket layer.
 546  */
 547 int
 548 solisten(struct socket *so, int backlog, struct thread *td)
 549 {
 550         int error;
 551
 552         CURVNET_SET(so->so_vnet);
 553         error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
 554         CURVNET_RESTORE();
 555         return error;
 556 }
 557
 558 int
 559 solisten_proto_check(struct socket *so)
 560 {
 561
 562         SOCK_LOCK_ASSERT(so);
 563
 564         if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
 565             SS_ISDISCONNECTING))
 566                 return (EINVAL);
 567         return (0);
 568 }
 569
 570 void
 571 solisten_proto(struct socket *so, int backlog)
 572 {
 573
 574         SOCK_LOCK_ASSERT(so);
 575
 576         if (backlog < 0 || backlog > somaxconn)
 577                 backlog = somaxconn;
 578         so->so_qlimit = backlog;
 579         so->so_options |= SO_ACCEPTCONN;
 580 }
 581
 582 /*
 583  * Evaluate the reference count and named references on a socket; if no
 584  * references remain, free it.  This should be called whenever a reference is
 585  * released, such as in sorele(), but also when named reference flags are
 586  * cleared in socket or protocol code.
 587  *
 588  * sofree() will free the socket if:
 589  *
 590  * - There are no outstanding file descriptor references or related consumers
 591  *   (so_count == 0).
 592  *
 593  * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 594  *
 595  * - The protocol does not have an outstanding strong reference on the socket
 596  *   (SS_PROTOREF).
 597  *
 598  * - The socket is not in a completed connection queue, so a process has been
 599  *   notified that it is present.  If it is removed, the user process may
 600  *   block in accept() despite select() saying the socket was ready.
 601  */
 602 void
 603 sofree(struct socket *so)
 604 {
 605         struct protosw *pr = so->so_proto;
 606         struct socket *head;
 607
 608         ACCEPT_LOCK_ASSERT();
 609         SOCK_LOCK_ASSERT(so);
 610
 611         if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
 612             (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
 613                 SOCK_UNLOCK(so);
 614                 ACCEPT_UNLOCK();
 615                 return;
 616         }
 617
 618         head = so->so_head;
 619         if (head != NULL) {
 620                 KASSERT((so->so_qstate & SQ_COMP) != 0 ||
 621                     (so->so_qstate & SQ_INCOMP) != 0,
 622                     ("sofree: so_head != NULL, but neither SQ_COMP nor "
 623                     "SQ_INCOMP"));
 624                 KASSERT((so->so_qstate & SQ_COMP) == 0 ||
 625                     (so->so_qstate & SQ_INCOMP) == 0,
 626                     ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
 627                 TAILQ_REMOVE(&head->so_incomp, so, so_list);
 628                 head->so_incqlen--;
 629                 so->so_qstate &= ~SQ_INCOMP;
 630                 so->so_head = NULL;
 631         }
 632         KASSERT((so->so_qstate & SQ_COMP) == 0 &&
 633             (so->so_qstate & SQ_INCOMP) == 0,
 634             ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
 635             so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
 636         if (so->so_options & SO_ACCEPTCONN) {
 637                 KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
 638                 KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_comp populated"));
 639         }
 640         SOCK_UNLOCK(so);
 641         ACCEPT_UNLOCK();
 642
 643         VNET_SO_ASSERT(so);
 644         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
 645                 (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
 646         if (pr->pr_usrreqs->pru_detach != NULL)
 647                 (*pr->pr_usrreqs->pru_detach)(so);
 648
 649         /*
 650          * From this point on, we assume that no other references to this
 651          * socket exist anywhere else in the stack.  Therefore, no locks need
 652          * to be acquired or held.
 653          *
 654          * We used to do a lot of socket buffer and socket locking here, as
 655          * well as invoke sorflush() and perform wakeups.  The direct call to
 656          * dom_dispose() and sbrelease_internal() are an inlining of what was
 657          * necessary from sorflush().
 658          *
 659          * Notice that the socket buffer and kqueue state are torn down
 660          * before calling pru_detach.  This means that protocols shold not
 661          * assume they can perform socket wakeups, etc, in their detach code.
 662          */
 663         sbdestroy(&so->so_snd, so);
 664         sbdestroy(&so->so_rcv, so);
 665         seldrain(&so->so_snd.sb_sel);
 666         seldrain(&so->so_rcv.sb_sel);
 667         knlist_destroy(&so->so_rcv.sb_sel.si_note);
 668         knlist_destroy(&so->so_snd.sb_sel.si_note);
 669         sodealloc(so);
 670 }
 671
 672 /*
 673  * Close a socket on last file table reference removal.  Initiate disconnect
 674  * if connected.  Free socket when disconnect complete.
 675  *
 676  * This function will sorele() the socket.  Note that soclose() may be called
 677  * prior to the ref count reaching zero.  The actual socket structure will
 678  * not be freed until the ref count reaches zero.
 679  */
  680 int
  681 soclose(struct socket *so)
  682 {
  683         int error = 0;
  684
  685         KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
  686
  687         CURVNET_SET(so->so_vnet);
  688         funsetown(&so->so_sigio);
  689         if (so->so_state & SS_ISCONNECTED) {
                      /*
                       * Start a disconnect unless one is already in progress.
                       * ENOTCONN from sodisconnect() is not reported to the
                       * caller; any error skips the linger wait.
                       */
  690                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
  691                         error = sodisconnect(so);
  692                         if (error) {
  693                                 if (error == ENOTCONN)
  694                                         error = 0;
  695                                 goto drop;
  696                         }
  697                 }
  698                 if (so->so_options & SO_LINGER) {
                              /* Non-blocking sockets do not wait for the disconnect. */
  699                         if ((so->so_state & SS_ISDISCONNECTING) &&
  700                             (so->so_state & SS_NBIO))
  701                                 goto drop;
                              /*
                               * Sleep on so_timeo, so_linger * hz ticks at a
                               * time, until the socket is no longer connected.
                               * Any non-zero tsleep() result ends the wait and
                               * becomes the return value.
                               */
  702                         while (so->so_state & SS_ISCONNECTED) {
  703                                 error = tsleep(&so->so_timeo,
  704                                     PSOCK | PCATCH, "soclos", so->so_linger * hz);
  705                                 if (error)
  706                                         break;
  707                         }
  708                 }
  709         }
  710
  711 drop:
  712         if (so->so_proto->pr_usrreqs->pru_close != NULL)
  713                 (*so->so_proto->pr_usrreqs->pru_close)(so);
              /*
               * For a listening socket, empty both accept queues.  Each entry
               * is unlinked under the accept lock, which is then dropped
               * around the soabort() call and retaken for the next entry.
               */
  714         if (so->so_options & SO_ACCEPTCONN) {
  715                 struct socket *sp;
  716                 ACCEPT_LOCK();
  717                 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
  718                         TAILQ_REMOVE(&so->so_incomp, sp, so_list);
  719                         so->so_incqlen--;
  720                         sp->so_qstate &= ~SQ_INCOMP;
  721                         sp->so_head = NULL;
  722                         ACCEPT_UNLOCK();
  723                         soabort(sp);
  724                         ACCEPT_LOCK();
  725                 }
  726                 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
  727                         TAILQ_REMOVE(&so->so_comp, sp, so_list);
  728                         so->so_qlen--;
  729                         sp->so_qstate &= ~SQ_COMP;
  730                         sp->so_head = NULL;
  731                         ACCEPT_UNLOCK();
  732                         soabort(sp);
  733                         ACCEPT_LOCK();
  734                 }
  735                 ACCEPT_UNLOCK();
  736         }
              /*
               * Mark the file descriptor reference as gone and release our
               * reference.  sorele() is entered with the accept lock and the
               * socket lock held and neither is released here afterwards;
               * presumably sorele() drops them -- confirm against its
               * definition.
               */
  737         ACCEPT_LOCK();
  738         SOCK_LOCK(so);
  739         KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
  740         so->so_state |= SS_NOFDREF;
  741         sorele(so);
  742         CURVNET_RESTORE();
  743         return (error);
  744 }
 745
 746 /*
 747  * soabort() is used to abruptly tear down a connection, such as when a
 748  * resource limit is reached (listen queue depth exceeded), or if a listen
 749  * socket is closed while there are sockets waiting to be accepted.
 750  *
 751  * This interface is tricky, because it is called on an unreferenced socket,
 752  * and must be called only by a thread that has actually removed the socket
 753  * from the listen queue it was on, or races with other threads are risked.
 754  *
 755  * This interface will call into the protocol code, so must not be called
 756  * with any socket locks held.  Protocols do call it while holding their own
 757  * recursible protocol mutexes, but this is something that should be subject
 758  * to review in the future.
 759  */
 760 void
 761 soabort(struct socket *so)
 762 {
 763
 764         /*
 765          * In as much as is possible, assert that no references to this
 766          * socket are held; that is as close as we can get to asserting that
 767          * the current thread arranged for there to be none.  The listen
 768          * queue flags SQ_COMP and SQ_INCOMP live in so_qstate, not so_state.
 769          */
 770         KASSERT(so->so_count == 0, ("soabort: so_count"));
 771         KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
 772         KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
 773         KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
 774         KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
 775         VNET_SO_ASSERT(so);
 776
 777         if (so->so_proto->pr_usrreqs->pru_abort != NULL)
 778                 (*so->so_proto->pr_usrreqs->pru_abort)(so);
 779         ACCEPT_LOCK();
 780         SOCK_LOCK(so);
 781         sofree(so);                     /* called with both locks held */
 782 }
 783
 784 int
 785 soaccept(struct socket *so, struct sockaddr **nam)
 786 {
 787         int rv;
 788
 789         /* Clear SS_NOFDREF, asserted to be set on entry, under the lock. */
 790         SOCK_LOCK(so);
 791         KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
 792         so->so_state &= ~SS_NOFDREF;
 793         SOCK_UNLOCK(so);
 794         CURVNET_SET(so->so_vnet);       /* socket's vnet for pru_accept */
 795         rv = so->so_proto->pr_usrreqs->pru_accept(so, nam);
 796         CURVNET_RESTORE();
 797         return (rv);
 798 }
 799
 800 int
 801 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
 802 {
 803         struct protosw *pr = so->so_proto;
 804         int rv;
 805
 806         /* Sockets marked SO_ACCEPTCONN do not support connect. */
 807         if ((so->so_options & SO_ACCEPTCONN) != 0)
 808                 return (EOPNOTSUPP);
 809
 810         CURVNET_SET(so->so_vnet);
 811         /*
 812          * A connection-based protocol may connect only once.  Any other
 813          * protocol is disconnected first if it is already connected or
 814          * connecting, which lets the user disconnect by connecting to,
 815          * e.g., a null address.  A failed disconnect is reported as EISCONN.
 816          */
 817         if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) != 0 &&
 818             ((pr->pr_flags & PR_CONNREQUIRED) != 0 ||
 819             sodisconnect(so) != 0)) {
 820                 rv = EISCONN;
 821         } else {
 822                 /* Do not let a stale error from an earlier connection bite. */
 823                 so->so_error = 0;
 824                 rv = (*pr->pr_usrreqs->pru_connect)(so, nam, td);
 825         }
 826         CURVNET_RESTORE();
 827
 828         return (rv);
 829 }
 830
 831 int
 832 soconnect2(struct socket *so1, struct socket *so2)
 833 {
 834         int rv;         /* whatever so1's pru_connect2 handler returns */
 835
 836         CURVNET_SET(so1->so_vnet);
 837         rv = so1->so_proto->pr_usrreqs->pru_connect2(so1, so2);
 838         CURVNET_RESTORE();
 839         return (rv);
 840 }
 841
 842 int
 843 sodisconnect(struct socket *so)
 844 {
 845
 846         /* Only a connected socket that is not already disconnecting. */
 847         if (!(so->so_state & SS_ISCONNECTED))
 848                 return (ENOTCONN);
 849         if ((so->so_state & SS_ISDISCONNECTING) != 0)
 850                 return (EALREADY);
 851         VNET_SO_ASSERT(so);
 852         /* Hand the request to the protocol and pass its result back. */
 853         return ((*so->so_proto->pr_usrreqs->pru_disconnect)(so));
 854 }
 855
 856 #ifdef ZERO_COPY_SOCKETS
 857 struct so_zerocopy_stats {
 858         int size_ok;    /* bumped by sosend_copyin() on page-sized sends */
 859         int align_ok;   /* bumped together with size_ok */
 860         int found_ifp;  /* no update visible near this definition */
 861 };
 862 struct so_zerocopy_stats so_zerocp_stats = { 0, 0, 0 };
 863 #include <netinet/in.h>
 864 #include <net/route.h>
 865 #include <netinet/in_pcb.h>
 866 #include <vm/vm.h>
 867 #include <vm/vm_page.h>
 868 #include <vm/vm_object.h>
 869
 870 /*
 871  * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
 872  * sosend_dgram() and sosend_generic() use m_uiotombuf().
 873  *
 874  * sosend_copyin() accepts a uio and prepares an mbuf chain, returned via
 875  * *retmp, holding part or all of the data referenced by the uio.  If desired,
 876  * it uses zero-copy.  *space will be updated to reflect data copied in.
 877  *
 878  * NB: If atomic I/O is requested, the caller must already have checked that
 879  * space can hold resid bytes.  Without atomic, only one mbuf is filled.
 880  *
 881  * NB: In the event of an error, the caller may need to free the partial
 882  * chain pointed to by *retmp.  The contents of both *uio and *space may be
 883  * modified even in the case of an error.
 884  */
 885 static int
 886 sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
 887     int flags)
 888 {
 889         struct mbuf *m, **mp, *top;
 890         long len;
 891         ssize_t resid;
 892         int error;
 893 #ifdef ZERO_COPY_SOCKETS        /* NOTE: the whole function is already inside one */
 894         int cow_send;
 895 #endif
 896
 897         *retmp = top = NULL;
 898         mp = &top;              /* where the next mbuf gets linked in */
 899         len = 0;
 900         resid = uio->uio_resid;
 901         error = 0;
 902         do {
 903 #ifdef ZERO_COPY_SOCKETS
 904                 cow_send = 0;
 905 #endif /* ZERO_COPY_SOCKETS */
 906                 if (resid >= MINCLSIZE) {
 907 #ifdef ZERO_COPY_SOCKETS
 908                         if (top == NULL) {
 909                                 m = m_gethdr(M_WAITOK, MT_DATA);
 910                                 m->m_pkthdr.len = 0;
 911                                 m->m_pkthdr.rcvif = NULL;
 912                         } else
 913                                 m = m_get(M_WAITOK, MT_DATA);
 914                         if (so_zero_copy_send &&
 915                             resid>=PAGE_SIZE &&
 916                             *space>=PAGE_SIZE &&
 917                             uio->uio_iov->iov_len>=PAGE_SIZE) {
 918                                 so_zerocp_stats.size_ok++;
 919                                 so_zerocp_stats.align_ok++;
 920                                 cow_send = socow_setup(m, uio);
 921                                 len = cow_send; /* nonzero result is the length */
 922                         }
 923                         if (!cow_send) {        /* no zero-copy: use a cluster */
 924                                 m_clget(m, M_WAITOK);
 925                                 len = min(min(MCLBYTES, resid), *space);
 926                         }
 927 #else /* ZERO_COPY_SOCKETS */
 928                         if (top == NULL) {
 929                                 m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
 930                                 m->m_pkthdr.len = 0;
 931                                 m->m_pkthdr.rcvif = NULL;
 932                         } else
 933                                 m = m_getcl(M_WAIT, MT_DATA, 0);
 934                         len = min(min(MCLBYTES, resid), *space);
 935 #endif /* ZERO_COPY_SOCKETS */
 936                 } else {
 937                         if (top == NULL) {
 938                                 m = m_gethdr(M_WAIT, MT_DATA);
 939                                 m->m_pkthdr.len = 0;
 940                                 m->m_pkthdr.rcvif = NULL;
 941
 942                                 len = min(min(MHLEN, resid), *space);
 943                                 /*
 944                                  * For datagram protocols, leave room
 945                                  * for protocol headers in first mbuf.
 946                                  */
 947                                 if (atomic && m && len < MHLEN)
 948                                         MH_ALIGN(m, len);
 949                         } else {
 950                                 m = m_get(M_WAIT, MT_DATA);
 951                                 len = min(min(MLEN, resid), *space);
 952                         }
 953                 }
 954                 if (m == NULL) {        /* NOTE(review): m was dereferenced above */
 955                         error = ENOBUFS;
 956                         goto out;
 957                 }
 958
 959                 *space -= len;
 960 #ifdef ZERO_COPY_SOCKETS
 961                 if (cow_send)
 962                         error = 0;
 963                 else
 964 #endif /* ZERO_COPY_SOCKETS */
 965                 error = uiomove(mtod(m, void *), (int)len, uio);
 966                 resid = uio->uio_resid;
 967                 m->m_len = len;
 968                 *mp = m;        /* linked before the error check: caller frees */
 969                 top->m_pkthdr.len += len;
 970                 if (error)
 971                         goto out;
 972                 mp = &m->m_next;
 973                 if (resid <= 0) {       /* uio drained: chain is complete */
 974                         if (flags & MSG_EOR)
 975                                 top->m_flags |= M_EOR;
 976                         break;
 977                 }
 978         } while (*space > 0 && atomic);        /* non-atomic: single pass */
 979 out:
 980         *retmp = top;
 981         return (error);
 982 }
 983 #endif /*ZERO_COPY_SOCKETS*/
 984 /* sblock() flags for a request: no SBL_WAIT when MSG_DONTWAIT is set. */
 985 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) != 0 ? 0 : SBL_WAIT)
 986 /*
      * Send one datagram on a SOCK_DGRAM socket whose protocol is PR_ATOMIC
      * (both are asserted).  The payload is described by "uio", or by the
      * mbuf chain "top" when uio is NULL; "control" is an optional control
      * mbuf.  Unlike sosend_generic(), this path calls neither sblock() nor
      * sbwait(): data that does not fit in the send buffer space fails with
      * EMSGSIZE.  Returns 0 or an error number.  "top" and "control" are
      * freed here on error; otherwise they are passed to pru_send.
      */
 987 int
 988 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
 989     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
 990 {
 991         long space;
 992         ssize_t resid;
 993         int clen = 0, error, dontroute;
 994 #ifdef ZERO_COPY_SOCKETS
 995         int atomic = sosendallatonce(so) || top;
 996 #endif
 997
 998         KASSERT(so->so_type == SOCK_DGRAM, ("sodgram_send: !SOCK_DGRAM"));
 999         KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
1000             ("sodgram_send: !PR_ATOMIC"));
1001
1002         if (uio != NULL)
1003                 resid = uio->uio_resid;
1004         else
1005                 resid = top->m_pkthdr.len;
1006         /*
1007          * In theory resid should be unsigned.  However, space must be
1008          * signed, as it might be less than 0 if we over-committed, and we
1009          * must use a signed comparison of space and resid.  On the other
1010          * hand, a negative resid causes us to loop sending 0-length
1011          * segments to the protocol.
1012          */
1013         if (resid < 0) {
1014                 error = EINVAL;
1015                 goto out;
1016         }
1017
1018         dontroute =
1019             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
1020         if (td != NULL)
1021                 td->td_ru.ru_msgsnd++;
1022         if (control != NULL)
1023                 clen = control->m_len;  /* m_len of this one mbuf only */
1024
1025         SOCKBUF_LOCK(&so->so_snd);
1026         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1027                 SOCKBUF_UNLOCK(&so->so_snd);
1028                 error = EPIPE;
1029                 goto out;
1030         }
1031         if (so->so_error) {
1032                 error = so->so_error;
1033                 so->so_error = 0;       /* pending error is reported once */
1034                 SOCKBUF_UNLOCK(&so->so_snd);
1035                 goto out;
1036         }
1037         if ((so->so_state & SS_ISCONNECTED) == 0) {
1038                 /*
1039                  * `sendto' and `sendmsg' are allowed on a connection-based
1040                  * socket if it supports implied connect.  Return ENOTCONN if
1041                  * not connected and no address is supplied.
1042                  */
1043                 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1044                     (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1045                         if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1046                             !(resid == 0 && clen != 0)) {
1047                                 SOCKBUF_UNLOCK(&so->so_snd);
1048                                 error = ENOTCONN;
1049                                 goto out;
1050                         }
1051                 } else if (addr == NULL) {
1052                         if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1053                                 error = ENOTCONN;
1054                         else
1055                                 error = EDESTADDRREQ;
1056                         SOCKBUF_UNLOCK(&so->so_snd);
1057                         goto out;
1058                 }
1059         }
1060
1061         /*
1062          * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
1063          * problem and need fixing.
1064          */
1065         space = sbspace(&so->so_snd);
1066         if (flags & MSG_OOB)
1067                 space += 1024;
1068         space -= clen;
1069         SOCKBUF_UNLOCK(&so->so_snd);
1070         if (resid > space) {            /* no waiting for room on this path */
1071                 error = EMSGSIZE;
1072                 goto out;
1073         }
1074         if (uio == NULL) {
1075                 resid = 0;
1076                 if (flags & MSG_EOR)
1077                         top->m_flags |= M_EOR;
1078         } else {
1079 #ifdef ZERO_COPY_SOCKETS
1080                 error = sosend_copyin(uio, &top, atomic, &space, flags);
1081                 if (error)
1082                         goto out;
1083 #else
1084                 /*
1085                  * Copy the data from userland into a mbuf chain.
1086                  * If no data is to be copied in, a single empty mbuf
1087                  * is returned.
1088                  */
1089                 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1090                     (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1091                 if (top == NULL) {
1092                         error = EFAULT; /* only possible error */
1093                         goto out;
1094                 }
1095                 space -= resid - uio->uio_resid;
1096 #endif
1097                 resid = uio->uio_resid;
1098         }
1099         KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1100         /*
1101          * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1102          * than with.
1103          */
1104         if (dontroute) {
1105                 SOCK_LOCK(so);
1106                 so->so_options |= SO_DONTROUTE;
1107                 SOCK_UNLOCK(so);
1108         }
1109         /*
1110          * XXX all the SBS_CANTSENDMORE checks previously done could be out
1111          * of date.  We could have received a reset packet in an interrupt or
1112          * maybe we slept while doing page faults in uiomove() etc.  We could
1113          * probably recheck again inside the locking protection here, but
1114          * there are probably other places that this also happens.  We must
1115          * rethink this.
1116          */
1117         VNET_SO_ASSERT(so);
1118         error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1119             (flags & MSG_OOB) ? PRUS_OOB :
1120         /*
1121          * If the user set MSG_EOF, the protocol understands this flag and
1122          * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
1123          */
1124             ((flags & MSG_EOF) &&
1125              (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1126              (resid <= 0)) ?
1127                 PRUS_EOF :
1128                 /* If there is more to send set PRUS_MORETOCOME */
1129                 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1130                 top, addr, control, td);
1131         if (dontroute) {
1132                 SOCK_LOCK(so);
1133                 so->so_options &= ~SO_DONTROUTE;
1134                 SOCK_UNLOCK(so);
1135         }
1136         clen = 0;
1137         control = NULL;
1138         top = NULL;             /* passed to pru_send: not freed below */
1139 out:
1140         if (top != NULL)
1141                 m_freem(top);
1142         if (control != NULL)
1143                 m_freem(control);
1144         return (error);
1145 }
1146
1147 /*
1148  * Send on a socket.  If send must go all at once and message is larger than
1149  * send buffering, then hard error.  Lock against other senders.  If must go
1150  * all at once and not enough room now, then inform user that this would
1151  * block and do nothing.  Otherwise, if nonblocking, send as much as
1152  * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1153  * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1154  * in mbuf chain must be small enough to send all at once.
1155  *
1156  * Returns nonzero on error, timeout or signal; callers must check for short
1157  * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1158  * on return.  MSG_EOR on a SOCK_STREAM socket is rejected with EINVAL.
1159  */
1160 int
1161 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1162     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1163 {
1164         long space;
1165         ssize_t resid;
1166         int clen = 0, error, dontroute;
1167         int atomic = sosendallatonce(so) || top;        /* also set for "top" */
1168
1169         if (uio != NULL)
1170                 resid = uio->uio_resid;
1171         else
1172                 resid = top->m_pkthdr.len;
1173         /*
1174          * In theory resid should be unsigned.  However, space must be
1175          * signed, as it might be less than 0 if we over-committed, and we
1176          * must use a signed comparison of space and resid.  On the other
1177          * hand, a negative resid causes us to loop sending 0-length
1178          * segments to the protocol.
1179          *
1180          * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1181          * type sockets since that's an error.
1182          */
1183         if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1184                 error = EINVAL;
1185                 goto out;
1186         }
1187
1188         dontroute =
1189             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1190             (so->so_proto->pr_flags & PR_ATOMIC);
1191         if (td != NULL)
1192                 td->td_ru.ru_msgsnd++;
1193         if (control != NULL)
1194                 clen = control->m_len;  /* m_len of this one mbuf only */
1195
1196         error = sblock(&so->so_snd, SBLOCKWAIT(flags)); /* vs. other senders */
1197         if (error)
1198                 goto out;
1199
1200 restart:                        /* all checks are redone after sbwait() */
1201         do {
1202                 SOCKBUF_LOCK(&so->so_snd);
1203                 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1204                         SOCKBUF_UNLOCK(&so->so_snd);
1205                         error = EPIPE;
1206                         goto release;
1207                 }
1208                 if (so->so_error) {
1209                         error = so->so_error;
1210                         so->so_error = 0;
1211                         SOCKBUF_UNLOCK(&so->so_snd);
1212                         goto release;
1213                 }
1214                 if ((so->so_state & SS_ISCONNECTED) == 0) {
1215                         /*
1216                          * `sendto' and `sendmsg' are allowed on a connection-
1217                          * based socket if it supports implied connect.
1218                          * Return ENOTCONN if not connected and no address is
1219                          * supplied.
1220                          */
1221                         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1222                             (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1223                                 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1224                                     !(resid == 0 && clen != 0)) {
1225                                         SOCKBUF_UNLOCK(&so->so_snd);
1226                                         error = ENOTCONN;
1227                                         goto release;
1228                                 }
1229                         } else if (addr == NULL) {
1230                                 SOCKBUF_UNLOCK(&so->so_snd);
1231                                 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1232                                         error = ENOTCONN;
1233                                 else
1234                                         error = EDESTADDRREQ;
1235                                 goto release;
1236                         }
1237                 }
1238                 space = sbspace(&so->so_snd);
1239                 if (flags & MSG_OOB)
1240                         space += 1024;
1241                 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1242                     clen > so->so_snd.sb_hiwat) {
1243                         SOCKBUF_UNLOCK(&so->so_snd);
1244                         error = EMSGSIZE;       /* can never fit: hard error */
1245                         goto release;
1246                 }
1247                 if (space < resid + clen &&
1248                     (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1249                         if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
1250                                 SOCKBUF_UNLOCK(&so->so_snd);
1251                                 error = EWOULDBLOCK;
1252                                 goto release;
1253                         }
1254                         error = sbwait(&so->so_snd);
1255                         SOCKBUF_UNLOCK(&so->so_snd);
1256                         if (error)
1257                                 goto release;
1258                         goto restart;
1259                 }
1260                 SOCKBUF_UNLOCK(&so->so_snd);
1261                 space -= clen;          /* set room aside for control data */
1262                 do {
1263                         if (uio == NULL) {
1264                                 resid = 0;
1265                                 if (flags & MSG_EOR)
1266                                         top->m_flags |= M_EOR;
1267                         } else {
1268 #ifdef ZERO_COPY_SOCKETS
1269                                 error = sosend_copyin(uio, &top, atomic,
1270                                     &space, flags);
1271                                 if (error != 0)
1272                                         goto release;
1273 #else
1274                                 /*
1275                                  * Copy the data from userland into a mbuf
1276                                  * chain.  If no data is to be copied in,
1277                                  * a single empty mbuf is returned.
1278                                  */
1279                                 top = m_uiotombuf(uio, M_WAITOK, space,
1280                                     (atomic ? max_hdr : 0),
1281                                     (atomic ? M_PKTHDR : 0) |
1282                                     ((flags & MSG_EOR) ? M_EOR : 0));
1283                                 if (top == NULL) {
1284                                         error = EFAULT; /* only possible error */
1285                                         goto release;
1286                                 }
1287                                 space -= resid - uio->uio_resid;
1288 #endif
1289                                 resid = uio->uio_resid;
1290                         }
1291                         if (dontroute) {
1292                                 SOCK_LOCK(so);
1293                                 so->so_options |= SO_DONTROUTE;
1294                                 SOCK_UNLOCK(so);
1295                         }
1296                         /*
1297                          * XXX all the SBS_CANTSENDMORE checks previously
1298                          * done could be out of date.  We could have received
1299                          * a reset packet in an interrupt or maybe we slept
1300                          * while doing page faults in uiomove() etc.  We
1301                          * could probably recheck again inside the locking
1302                          * protection here, but there are probably other
1303                          * places that this also happens.  We must rethink
1304                          * this.
1305                          */
1306                         VNET_SO_ASSERT(so);
1307                         error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1308                             (flags & MSG_OOB) ? PRUS_OOB :
1309                         /*
1310                          * If the user set MSG_EOF, the protocol understands
1311                          * this flag and nothing left to send then use
1312                          * PRU_SEND_EOF instead of PRU_SEND.
1313                          */
1314                             ((flags & MSG_EOF) &&
1315                              (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1316                              (resid <= 0)) ?
1317                                 PRUS_EOF :
1318                         /* If there is more to send set PRUS_MORETOCOME. */
1319                             (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1320                             top, addr, control, td);
1321                         if (dontroute) {
1322                                 SOCK_LOCK(so);
1323                                 so->so_options &= ~SO_DONTROUTE;
1324                                 SOCK_UNLOCK(so);
1325                         }
1326                         clen = 0;       /* control goes out with first send */
1327                         control = NULL;
1328                         top = NULL;     /* passed to pru_send: not freed below */
1329                         if (error)
1330                                 goto release;
1331                 } while (resid && space > 0);   /* more data and room left */
1332         } while (resid);                /* else go check for room again */
1333
1334 release:
1335         sbunlock(&so->so_snd);
1336 out:
1337         if (top != NULL)
1338                 m_freem(top);
1339         if (control != NULL)
1340                 m_freem(control);
1341         return (error);
1342 }
1343
1344 int
1345 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1346     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1347 {
1348         struct protosw *pr = so->so_proto;
1349         int rv;                 /* result of the protocol's pru_sosend */
1350
1351         CURVNET_SET(so->so_vnet);
1352         rv = pr->pr_usrreqs->pru_sosend(so, addr, uio, top, control, flags, td);
1353         CURVNET_RESTORE();
1354         return (rv);
1355 }
1356
1357 /*
1358  * The part of soreceive() that implements reading non-inline out-of-band
1359  * data from a socket.  For more complete comments, see soreceive(), from
1360  * which this code originated.
1361  *
1362  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1363  * unable to return an mbuf chain to the caller.
1364  */
1365 static int
1366 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1367 {
1368         struct protosw *pr = so->so_proto;
1369         struct mbuf *m;
1370         int error;
1371
1372         KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1373         VNET_SO_ASSERT(so);
1374
1375         m = m_get(M_WAIT, MT_DATA);
1376         error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1377         if (error)
1378                 goto bad;
1379         do {
1380 #ifdef ZERO_COPY_SOCKETS
1381                 if (so_zero_copy_receive) {
1382                         int disposable;
1383
1384                         if ((m->m_flags & M_EXT)
1385                          && (m->m_ext.ext_type == EXT_DISPOSABLE))
1386                                 disposable = 1;
1387                         else
1388                                 disposable = 0;
1389
1390                         error = uiomoveco(mtod(m, void *),
1391                                           min(uio->uio_resid, m->m_len),
1392                                           uio, disposable);
1393                 } else
1394 #endif /* ZERO_COPY_SOCKETS */
1395                 error = uiomove(mtod(m, void *),
1396                     (int) min(uio->uio_resid, m->m_len), uio);
1397                 m = m_free(m);
1398         } while (uio->uio_resid && error == 0 && m);
1399 bad:
1400         if (m != NULL)
1401                 m_freem(m);
1402         return (error);
1403 }
1404
1405 /*
1406  * Resynchronize a socket buffer after the first mbuf of its first record
1407  * has been replaced or removed, so that other consumers see consistent
1408  * values.  'nextrecord' is the caller's saved copy of the original
1409  * sb->sb_mb->m_nextpkt; it is restored onto the new lead mbuf, or becomes
1410  * the first record when no lead mbuf remains.  'nextrecord' may be NULL.
1411  * The socket buffer lock must be held (asserted).
1412  */
1413 static __inline void
1414 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1415 {
1416         struct mbuf *lead;
1417
1418         SOCKBUF_LOCK_ASSERT(sb);
1419         /*
1420          * Re-attach nextrecord behind the lead mbuf, or promote it to first
1421          * record when no lead mbuf remains.
1422          */
1423         lead = sb->sb_mb;
1424         if (lead == NULL)
1425                 lead = sb->sb_mb = nextrecord;
1426         else
1427                 lead->m_nextpkt = nextrecord;
1428
1429         /*
1430          * Fix up dependent fields as SB_EMPTY_FIXUP() would, and also handle
1431          * the case where the updated lead mbuf remains the last record.
1432          */
1433         if (lead == NULL) {
1434                 sb->sb_mbtail = NULL;
1435                 sb->sb_lastrecord = NULL;
1436         } else if (lead->m_nextpkt == NULL)
1437                 sb->sb_lastrecord = lead;
1438 }
1439
1440
1441 /*
1442  * Implement receive operations on a socket.  We depend on the way that
1443  * records are added to the sockbuf by sbappend.  In particular, each record
1444  * (mbufs linked through m_next) must begin with an address if the protocol
1445  * so specifies, followed by an optional mbuf or mbufs containing ancillary
1446  * data, and then zero or more mbufs of data.  In order to allow parallelism
1447  * between network receive and copying to user space, as well as avoid
1448  * sleeping with a mutex held, we release the socket buffer mutex during the
1449  * user space copy.  Although the sockbuf is locked, new data may still be
1450  * appended, and thus we must maintain consistency of the sockbuf during that
1451  * time.
1452  *
1453  * The caller may receive the data as a single mbuf chain by supplying an
1454  * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1455  * the count in uio_resid.
1456  */
1457 int
1458 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1459     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1460 {
1461         struct mbuf *m, **mp;
1462         int flags, error, offset;
1463         ssize_t len;
1464         struct protosw *pr = so->so_proto;
1465         struct mbuf *nextrecord;
1466         int moff, type = 0;
1467         ssize_t orig_resid = uio->uio_resid;
1468
1469         mp = mp0;
1470         if (psa != NULL)
1471                 *psa = NULL;
1472         if (controlp != NULL)
1473                 *controlp = NULL;
1474         if (flagsp != NULL)
1475                 flags = *flagsp &~ MSG_EOR;
1476         else
1477                 flags = 0;
1478         if (flags & MSG_OOB)
1479                 return (soreceive_rcvoob(so, uio, flags));
1480         if (mp != NULL)
1481                 *mp = NULL;
1482         if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1483             && uio->uio_resid) {
1484                 VNET_SO_ASSERT(so);
1485                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1486         }
1487
1488         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1489         if (error)
1490                 return (error);
1491
1492 restart:
1493         SOCKBUF_LOCK(&so->so_rcv);
1494         m = so->so_rcv.sb_mb;
1495         /*
1496          * If we have less data than requested, block awaiting more (subject
1497          * to any timeout) if:
1498          *   1. the current count is less than the low water mark, or
1499          *   2. MSG_WAITALL is set, and it is possible to do the entire
1500          *      receive operation at once if we block (resid <= hiwat).
1501          *   3. MSG_DONTWAIT is not set
1502          * If MSG_WAITALL is set but resid is larger than the receive buffer,
1503          * we have to do the receive in sections, and thus risk returning a
1504          * short count if a timeout or signal occurs after we start.
1505          */
1506         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1507             so->so_rcv.sb_cc < uio->uio_resid) &&
1508             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1509             ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1510             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1511                 KASSERT(m != NULL || !so->so_rcv.sb_cc,
1512                     ("receive: m == %p so->so_rcv.sb_cc == %u",
1513                     m, so->so_rcv.sb_cc));
1514                 if (so->so_error) {
1515                         if (m != NULL)
1516                                 goto dontblock;
1517                         error = so->so_error;
1518                         if ((flags & MSG_PEEK) == 0)
1519                                 so->so_error = 0;
1520                         SOCKBUF_UNLOCK(&so->so_rcv);
1521                         goto release;
1522                 }
1523                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1524                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1525                         if (m == NULL) {
1526                                 SOCKBUF_UNLOCK(&so->so_rcv);
1527                                 goto release;
1528                         } else
1529                                 goto dontblock;
1530                 }
1531                 for (; m != NULL; m = m->m_next)
1532                         if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1533                                 m = so->so_rcv.sb_mb;
1534                                 goto dontblock;
1535                         }
1536                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1537                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1538                         SOCKBUF_UNLOCK(&so->so_rcv);
1539                         error = ENOTCONN;
1540                         goto release;
1541                 }
1542                 if (uio->uio_resid == 0) {
1543                         SOCKBUF_UNLOCK(&so->so_rcv);
1544                         goto release;
1545                 }
1546                 if ((so->so_state & SS_NBIO) ||
1547                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1548                         SOCKBUF_UNLOCK(&so->so_rcv);
1549                         error = EWOULDBLOCK;
1550                         goto release;
1551                 }
1552                 SBLASTRECORDCHK(&so->so_rcv);
1553                 SBLASTMBUFCHK(&so->so_rcv);
1554                 error = sbwait(&so->so_rcv);
1555                 SOCKBUF_UNLOCK(&so->so_rcv);
1556                 if (error)
1557                         goto release;
1558                 goto restart;
1559         }
1560 dontblock:
1561         /*
1562          * From this point onward, we maintain 'nextrecord' as a cache of the
1563          * pointer to the next record in the socket buffer.  We must keep the
1564          * various socket buffer pointers and local stack versions of the
1565          * pointers in sync, pushing out modifications before dropping the
1566          * socket buffer mutex, and re-reading them when picking it up.
1567          *
1568          * Otherwise, we will race with the network stack appending new data
1569          * or records onto the socket buffer by using inconsistent/stale
1570          * versions of the field, possibly resulting in socket buffer
1571          * corruption.
1572          *
1573          * By holding the high-level sblock(), we prevent simultaneous
1574          * readers from pulling off the front of the socket buffer.
1575          */
1576         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1577         if (uio->uio_td)
1578                 uio->uio_td->td_ru.ru_msgrcv++;
1579         KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1580         SBLASTRECORDCHK(&so->so_rcv);
1581         SBLASTMBUFCHK(&so->so_rcv);
1582         nextrecord = m->m_nextpkt;
1583         if (pr->pr_flags & PR_ADDR) {
1584                 KASSERT(m->m_type == MT_SONAME,
1585                     ("m->m_type == %d", m->m_type));
1586                 orig_resid = 0;
1587                 if (psa != NULL)
1588                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1589                             M_NOWAIT);
1590                 if (flags & MSG_PEEK) {
1591                         m = m->m_next;
1592                 } else {
1593                         sbfree(&so->so_rcv, m);
1594                         so->so_rcv.sb_mb = m_free(m);
1595                         m = so->so_rcv.sb_mb;
1596                         sockbuf_pushsync(&so->so_rcv, nextrecord);
1597                 }
1598         }
1599
1600         /*
1601          * Process one or more MT_CONTROL mbufs present before any data mbufs
1602          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1603          * just copy the data; if !MSG_PEEK, we call into the protocol to
1604          * perform externalization (or freeing if controlp == NULL).
1605          */
1606         if (m != NULL && m->m_type == MT_CONTROL) {
1607                 struct mbuf *cm = NULL, *cmn;
1608                 struct mbuf **cme = &cm;
1609
1610                 do {
1611                         if (flags & MSG_PEEK) {
1612                                 if (controlp != NULL) {
1613                                         *controlp = m_copy(m, 0, m->m_len);
1614                                         controlp = &(*controlp)->m_next;
1615                                 }
1616                                 m = m->m_next;
1617                         } else {
1618                                 sbfree(&so->so_rcv, m);
1619                                 so->so_rcv.sb_mb = m->m_next;
1620                                 m->m_next = NULL;
1621                                 *cme = m;
1622                                 cme = &(*cme)->m_next;
1623                                 m = so->so_rcv.sb_mb;
1624                         }
1625                 } while (m != NULL && m->m_type == MT_CONTROL);
1626                 if ((flags & MSG_PEEK) == 0)
1627                         sockbuf_pushsync(&so->so_rcv, nextrecord);
1628                 while (cm != NULL) {
1629                         cmn = cm->m_next;
1630                         cm->m_next = NULL;
1631                         if (pr->pr_domain->dom_externalize != NULL) {
1632                                 SOCKBUF_UNLOCK(&so->so_rcv);
1633                                 VNET_SO_ASSERT(so);
1634                                 error = (*pr->pr_domain->dom_externalize)
1635                                     (cm, controlp);
1636                                 SOCKBUF_LOCK(&so->so_rcv);
1637                         } else if (controlp != NULL)
1638                                 *controlp = cm;
1639                         else
1640                                 m_freem(cm);
1641                         if (controlp != NULL) {
1642                                 orig_resid = 0;
1643                                 while (*controlp != NULL)
1644                                         controlp = &(*controlp)->m_next;
1645                         }
1646                         cm = cmn;
1647                 }
1648                 if (m != NULL)
1649                         nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1650                 else
1651                         nextrecord = so->so_rcv.sb_mb;
1652                 orig_resid = 0;
1653         }
1654         if (m != NULL) {
1655                 if ((flags & MSG_PEEK) == 0) {
1656                         KASSERT(m->m_nextpkt == nextrecord,
1657                             ("soreceive: post-control, nextrecord !sync"));
1658                         if (nextrecord == NULL) {
1659                                 KASSERT(so->so_rcv.sb_mb == m,
1660                                     ("soreceive: post-control, sb_mb!=m"));
1661                                 KASSERT(so->so_rcv.sb_lastrecord == m,
1662                                     ("soreceive: post-control, lastrecord!=m"));
1663                         }
1664                 }
1665                 type = m->m_type;
1666                 if (type == MT_OOBDATA)
1667                         flags |= MSG_OOB;
1668         } else {
1669                 if ((flags & MSG_PEEK) == 0) {
1670                         KASSERT(so->so_rcv.sb_mb == nextrecord,
1671                             ("soreceive: sb_mb != nextrecord"));
1672                         if (so->so_rcv.sb_mb == NULL) {
1673                                 KASSERT(so->so_rcv.sb_lastrecord == NULL,
1674                                     ("soreceive: sb_lastercord != NULL"));
1675                         }
1676                 }
1677         }
1678         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1679         SBLASTRECORDCHK(&so->so_rcv);
1680         SBLASTMBUFCHK(&so->so_rcv);
1681
1682         /*
1683          * Now continue to read any data mbufs off of the head of the socket
1684          * buffer until the read request is satisfied.  Note that 'type' is
1685          * used to store the type of any mbuf reads that have happened so far
1686          * such that soreceive() can stop reading if the type changes, which
1687          * causes soreceive() to return only one of regular data and inline
1688          * out-of-band data in a single socket receive operation.
1689          */
1690         moff = 0;
1691         offset = 0;
1692         while (m != NULL && uio->uio_resid > 0 && error == 0) {
1693                 /*
1694                  * If the type of mbuf has changed since the last mbuf
1695                  * examined ('type'), end the receive operation.
1696                  */
1697                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1698                 if (m->m_type == MT_OOBDATA) {
1699                         if (type != MT_OOBDATA)
1700                                 break;
1701                 } else if (type == MT_OOBDATA)
1702                         break;
1703                 else
1704                     KASSERT(m->m_type == MT_DATA,
1705                         ("m->m_type == %d", m->m_type));
1706                 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1707                 len = uio->uio_resid;
1708                 if (so->so_oobmark && len > so->so_oobmark - offset)
1709                         len = so->so_oobmark - offset;
1710                 if (len > m->m_len - moff)
1711                         len = m->m_len - moff;
1712                 /*
1713                  * If mp is set, just pass back the mbufs.  Otherwise copy
1714                  * them out via the uio, then free.  Sockbuf must be
1715                  * consistent here (points to current mbuf, it points to next
1716                  * record) when we drop priority; we must note any additions
1717                  * to the sockbuf when we block interrupts again.
1718                  */
1719                 if (mp == NULL) {
1720                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1721                         SBLASTRECORDCHK(&so->so_rcv);
1722                         SBLASTMBUFCHK(&so->so_rcv);
1723                         SOCKBUF_UNLOCK(&so->so_rcv);
1724 #ifdef ZERO_COPY_SOCKETS
1725                         if (so_zero_copy_receive) {
1726                                 int disposable;
1727
1728                                 if ((m->m_flags & M_EXT)
1729                                  && (m->m_ext.ext_type == EXT_DISPOSABLE))
1730                                         disposable = 1;
1731                                 else
1732                                         disposable = 0;
1733
1734                                 error = uiomoveco(mtod(m, char *) + moff,
1735                                                   (int)len, uio,
1736                                                   disposable);
1737                         } else
1738 #endif /* ZERO_COPY_SOCKETS */
1739                         error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1740                         SOCKBUF_LOCK(&so->so_rcv);
1741                         if (error) {
1742                                 /*
1743                                  * The MT_SONAME mbuf has already been removed
1744                                  * from the record, so it is necessary to
1745                                  * remove the data mbufs, if any, to preserve
1746                                  * the invariant in the case of PR_ADDR that
1747                                  * requires MT_SONAME mbufs at the head of
1748                                  * each record.
1749                                  */
1750                                 if (m && pr->pr_flags & PR_ATOMIC &&
1751                                     ((flags & MSG_PEEK) == 0))
1752                                         (void)sbdroprecord_locked(&so->so_rcv);
1753                                 SOCKBUF_UNLOCK(&so->so_rcv);
1754                                 goto release;
1755                         }
1756                 } else
1757                         uio->uio_resid -= len;
1758                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1759                 if (len == m->m_len - moff) {
1760                         if (m->m_flags & M_EOR)
1761                                 flags |= MSG_EOR;
1762                         if (flags & MSG_PEEK) {
1763                                 m = m->m_next;
1764                                 moff = 0;
1765                         } else {
1766                                 nextrecord = m->m_nextpkt;
1767                                 sbfree(&so->so_rcv, m);
1768                                 if (mp != NULL) {
1769                                         *mp = m;
1770                                         mp = &m->m_next;
1771                                         so->so_rcv.sb_mb = m = m->m_next;
1772                                         *mp = NULL;
1773                                 } else {
1774                                         so->so_rcv.sb_mb = m_free(m);
1775                                         m = so->so_rcv.sb_mb;
1776                                 }
1777                                 sockbuf_pushsync(&so->so_rcv, nextrecord);
1778                                 SBLASTRECORDCHK(&so->so_rcv);
1779                                 SBLASTMBUFCHK(&so->so_rcv);
1780                         }
1781                 } else {
1782                         if (flags & MSG_PEEK)
1783                                 moff += len;
1784                         else {
1785                                 if (mp != NULL) {
1786                                         int copy_flag;
1787
1788                                         if (flags & MSG_DONTWAIT)
1789                                                 copy_flag = M_DONTWAIT;
1790                                         else
1791                                                 copy_flag = M_WAIT;
1792                                         if (copy_flag == M_WAIT)
1793                                                 SOCKBUF_UNLOCK(&so->so_rcv);
1794                                         *mp = m_copym(m, 0, len, copy_flag);
1795                                         if (copy_flag == M_WAIT)
1796                                                 SOCKBUF_LOCK(&so->so_rcv);
1797                                         if (*mp == NULL) {
1798                                                 /*
1799                                                  * m_copym() couldn't
1800                                                  * allocate an mbuf.  Adjust
1801                                                  * uio_resid back (it was
1802                                                  * adjusted down by len
1803                                                  * bytes, which we didn't end
1804                                                  * up "copying" over).
1805                                                  */
1806                                                 uio->uio_resid += len;
1807                                                 break;
1808                                         }
1809                                 }
1810                                 m->m_data += len;
1811                                 m->m_len -= len;
1812                                 so->so_rcv.sb_cc -= len;
1813                         }
1814                 }
1815                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1816                 if (so->so_oobmark) {
1817                         if ((flags & MSG_PEEK) == 0) {
1818                                 so->so_oobmark -= len;
1819                                 if (so->so_oobmark == 0) {
1820                                         so->so_rcv.sb_state |= SBS_RCVATMARK;
1821                                         break;
1822                                 }
1823                         } else {
1824                                 offset += len;
1825                                 if (offset == so->so_oobmark)
1826                                         break;
1827                         }
1828                 }
1829                 if (flags & MSG_EOR)
1830                         break;
1831                 /*
1832                  * If the MSG_WAITALL flag is set (for non-atomic socket), we
1833                  * must not quit until "uio->uio_resid == 0" or an error
1834                  * termination.  If a signal/timeout occurs, return with a
1835                  * short count but without error.  Keep sockbuf locked
1836                  * against other readers.
1837                  */
1838                 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1839                     !sosendallatonce(so) && nextrecord == NULL) {
1840                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1841                         if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1842                                 break;
1843                         /*
1844                          * Notify the protocol that some data has been
1845                          * drained before blocking.
1846                          */
1847                         if (pr->pr_flags & PR_WANTRCVD) {
1848                                 SOCKBUF_UNLOCK(&so->so_rcv);
1849                                 VNET_SO_ASSERT(so);
1850                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1851                                 SOCKBUF_LOCK(&so->so_rcv);
1852                         }
1853                         SBLASTRECORDCHK(&so->so_rcv);
1854                         SBLASTMBUFCHK(&so->so_rcv);
1855                         /*
1856                          * We could receive some data while was notifying
1857                          * the protocol. Skip blocking in this case.
1858                          */
1859                         if (so->so_rcv.sb_mb == NULL) {
1860                                 error = sbwait(&so->so_rcv);
1861                                 if (error) {
1862                                         SOCKBUF_UNLOCK(&so->so_rcv);
1863                                         goto release;
1864                                 }
1865                         }
1866                         m = so->so_rcv.sb_mb;
1867                         if (m != NULL)
1868                                 nextrecord = m->m_nextpkt;
1869                 }
1870         }
1871
1872         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1873         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1874                 flags |= MSG_TRUNC;
1875                 if ((flags & MSG_PEEK) == 0)
1876                         (void) sbdroprecord_locked(&so->so_rcv);
1877         }
1878         if ((flags & MSG_PEEK) == 0) {
1879                 if (m == NULL) {
1880                         /*
1881                          * First part is an inline SB_EMPTY_FIXUP().  Second
1882                          * part makes sure sb_lastrecord is up-to-date if
1883                          * there is still data in the socket buffer.
1884                          */
1885                         so->so_rcv.sb_mb = nextrecord;
1886                         if (so->so_rcv.sb_mb == NULL) {
1887                                 so->so_rcv.sb_mbtail = NULL;
1888                                 so->so_rcv.sb_lastrecord = NULL;
1889                         } else if (nextrecord->m_nextpkt == NULL)
1890                                 so->so_rcv.sb_lastrecord = nextrecord;
1891                 }
1892                 SBLASTRECORDCHK(&so->so_rcv);
1893                 SBLASTMBUFCHK(&so->so_rcv);
1894                 /*
1895                  * If soreceive() is being done from the socket callback,
1896                  * then don't need to generate ACK to peer to update window,
1897                  * since ACK will be generated on return to TCP.
1898                  */
1899                 if (!(flags & MSG_SOCALLBCK) &&
1900                     (pr->pr_flags & PR_WANTRCVD)) {
1901                         SOCKBUF_UNLOCK(&so->so_rcv);
1902                         VNET_SO_ASSERT(so);
1903                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1904                         SOCKBUF_LOCK(&so->so_rcv);
1905                 }
1906         }
1907         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1908         if (orig_resid == uio->uio_resid && orig_resid &&
1909             (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1910                 SOCKBUF_UNLOCK(&so->so_rcv);
1911                 goto restart;
1912         }
1913         SOCKBUF_UNLOCK(&so->so_rcv);
1914
1915         if (flagsp != NULL)
1916                 *flagsp |= flags;
1917 release:
1918         sbunlock(&so->so_rcv);
1919         return (error);
1920 }
1921
1922 /*
1923  * Optimized version of soreceive() for stream (TCP) sockets.
1924  */
1925 int
1926 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
1927     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1928 {
1929         int len = 0, error = 0, flags, oresid;
1930         struct sockbuf *sb;
1931         struct mbuf *m, *n = NULL;
1932
1933         /* We only do stream sockets. */
1934         if (so->so_type != SOCK_STREAM)
1935                 return (EINVAL);
1936         if (psa != NULL)
1937                 *psa = NULL;
1938         if (controlp != NULL)
1939                 return (EINVAL);
1940         if (flagsp != NULL)
1941                 flags = *flagsp &~ MSG_EOR;
1942         else
1943                 flags = 0;
1944         if (flags & MSG_OOB)
1945                 return (soreceive_rcvoob(so, uio, flags));
1946         if (mp0 != NULL)
1947                 *mp0 = NULL;
1948
1949         sb = &so->so_rcv;
1950
1951         /* Prevent other readers from entering the socket. */
1952         error = sblock(sb, SBLOCKWAIT(flags));
1953         if (error)
1954                 goto out;
1955         SOCKBUF_LOCK(sb);
1956
1957         /* Easy one, no space to copyout anything. */
1958         if (uio->uio_resid == 0) {
1959                 error = EINVAL;
1960                 goto out;
1961         }
1962         oresid = uio->uio_resid;
1963
1964         /* We will never ever get anything unless we are or were connected. */
1965         if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1966                 error = ENOTCONN;
1967                 goto out;
1968         }
1969
1970 restart:
1971         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1972
1973         /* Abort if socket has reported problems. */
1974         if (so->so_error) {
1975                 if (sb->sb_cc > 0)
1976                         goto deliver;
1977                 if (oresid > uio->uio_resid)
1978                         goto out;
1979                 error = so->so_error;
1980                 if (!(flags & MSG_PEEK))
1981                         so->so_error = 0;
1982                 goto out;
1983         }
1984
1985         /* Door is closed.  Deliver what is left, if any. */
1986         if (sb->sb_state & SBS_CANTRCVMORE) {
1987                 if (sb->sb_cc > 0)
1988                         goto deliver;
1989                 else
1990                         goto out;
1991         }
1992
1993         /* Socket buffer is empty and we shall not block. */
1994         if (sb->sb_cc == 0 &&
1995             ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
1996                 error = EAGAIN;
1997                 goto out;
1998         }
1999
2000         /* Socket buffer got some data that we shall deliver now. */
2001         if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
2002             ((sb->sb_flags & SS_NBIO) ||
2003              (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
2004              sb->sb_cc >= sb->sb_lowat ||
2005              sb->sb_cc >= uio->uio_resid ||
2006              sb->sb_cc >= sb->sb_hiwat) ) {
2007                 goto deliver;
2008         }
2009
2010         /* On MSG_WAITALL we must wait until all data or error arrives. */
2011         if ((flags & MSG_WAITALL) &&
2012             (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
2013                 goto deliver;
2014
2015         /*
2016          * Wait and block until (more) data comes in.
2017          * NB: Drops the sockbuf lock during wait.
2018          */
2019         error = sbwait(sb);
2020         if (error)
2021                 goto out;
2022         goto restart;
2023
2024 deliver:
2025         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2026         KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
2027         KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
2028
2029         /* Statistics. */
2030         if (uio->uio_td)
2031                 uio->uio_td->td_ru.ru_msgrcv++;
2032
2033         /* Fill uio until full or current end of socket buffer is reached. */
2034         len = min(uio->uio_resid, sb->sb_cc);
2035         if (mp0 != NULL) {
2036                 /* Dequeue as many mbufs as possible. */
2037                 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
2038                         for (*mp0 = m = sb->sb_mb;
2039                              m != NULL && m->m_len <= len;
2040                              m = m->m_next) {
2041                                 len -= m->m_len;
2042                                 uio->uio_resid -= m->m_len;
2043                                 sbfree(sb, m);
2044                                 n = m;
2045                         }
2046                         sb->sb_mb = m;
2047                         if (sb->sb_mb == NULL)
2048                                 SB_EMPTY_FIXUP(sb);
2049                         n->m_next = NULL;
2050                 }
2051                 /* Copy the remainder. */
2052                 if (len > 0) {
2053                         KASSERT(sb->sb_mb != NULL,
2054                             ("%s: len > 0 && sb->sb_mb empty", __func__));
2055
2056                         m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT);
2057                         if (m == NULL)
2058                                 len = 0;        /* Don't flush data from sockbuf. */
2059                         else
2060                                 uio->uio_resid -= m->m_len;
2061                         if (*mp0 != NULL)
2062                                 n->m_next = m;
2063                         else
2064                                 *mp0 = m;
2065                         if (*mp0 == NULL) {
2066                                 error = ENOBUFS;
2067                                 goto out;
2068                         }
2069                 }
2070         } else {
2071                 /* NB: Must unlock socket buffer as uiomove may sleep. */
2072                 SOCKBUF_UNLOCK(sb);
2073                 error = m_mbuftouio(uio, sb->sb_mb, len);
2074                 SOCKBUF_LOCK(sb);
2075                 if (error)
2076                         goto out;
2077         }
2078         SBLASTRECORDCHK(sb);
2079         SBLASTMBUFCHK(sb);
2080
2081         /*
2082          * Remove the delivered data from the socket buffer unless we
2083          * were only peeking.
2084          */
2085         if (!(flags & MSG_PEEK)) {
2086                 if (len > 0)
2087                         sbdrop_locked(sb, len);
2088
2089                 /* Notify protocol that we drained some data. */
2090                 if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2091                     (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2092                      !(flags & MSG_SOCALLBCK))) {
2093                         SOCKBUF_UNLOCK(sb);
2094                         VNET_SO_ASSERT(so);
2095                         (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
2096                         SOCKBUF_LOCK(sb);
2097                 }
2098         }
2099
2100         /*
2101          * For MSG_WAITALL we may have to loop again and wait for
2102          * more data to come in.
2103          */
2104         if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2105                 goto restart;
2106 out:
2107         SOCKBUF_LOCK_ASSERT(sb);
2108         SBLASTRECORDCHK(sb);
2109         SBLASTMBUFCHK(sb);
2110         SOCKBUF_UNLOCK(sb);
2111         sbunlock(sb);
2112         return (error);
2113 }
2114
2115 /*
2116  * Optimized version of soreceive() for simple datagram cases from userspace.
2117  * Unlike in the stream case, we're able to drop a datagram if copyout()
2118  * fails, and because we handle datagrams atomically, we don't need to use a
2119  * sleep lock to prevent I/O interlacing.
2120  */
2121 int
2122 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2123     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2124 {
2125         struct mbuf *m, *m2;
2126         int flags, error;
2127         ssize_t len;
2128         struct protosw *pr = so->so_proto;
2129         struct mbuf *nextrecord;
2130
2131         if (psa != NULL)
2132                 *psa = NULL;
2133         if (controlp != NULL)
2134                 *controlp = NULL;
2135         if (flagsp != NULL)
2136                 flags = *flagsp &~ MSG_EOR;
2137         else
2138                 flags = 0;
2139
2140         /*
2141          * For any complicated cases, fall back to the full
2142          * soreceive_generic().
2143          */
2144         if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
2145                 return (soreceive_generic(so, psa, uio, mp0, controlp,
2146                     flagsp));
2147
2148         /*
2149          * Enforce restrictions on use.
2150          */
2151         KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2152             ("soreceive_dgram: wantrcvd"));
2153         KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2154         KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2155             ("soreceive_dgram: SBS_RCVATMARK"));
2156         KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
2157             ("soreceive_dgram: P_CONNREQUIRED"));
2158
2159         /*
2160          * Loop blocking while waiting for a datagram.
2161          */
2162         SOCKBUF_LOCK(&so->so_rcv);
2163         while ((m = so->so_rcv.sb_mb) == NULL) {
2164                 KASSERT(so->so_rcv.sb_cc == 0,
2165                     ("soreceive_dgram: sb_mb NULL but sb_cc %u",
2166                     so->so_rcv.sb_cc));
2167                 if (so->so_error) {
2168                         error = so->so_error;
2169                         so->so_error = 0;
2170                         SOCKBUF_UNLOCK(&so->so_rcv);
2171                         return (error);
2172                 }
2173                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2174                     uio->uio_resid == 0) {
2175                         SOCKBUF_UNLOCK(&so->so_rcv);
2176                         return (0);
2177                 }
2178                 if ((so->so_state & SS_NBIO) ||
2179                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2180                         SOCKBUF_UNLOCK(&so->so_rcv);
2181                         return (EWOULDBLOCK);
2182                 }
2183                 SBLASTRECORDCHK(&so->so_rcv);
2184                 SBLASTMBUFCHK(&so->so_rcv);
2185                 error = sbwait(&so->so_rcv);
2186                 if (error) {
2187                         SOCKBUF_UNLOCK(&so->so_rcv);
2188                         return (error);
2189                 }
2190         }
2191         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2192
2193         if (uio->uio_td)
2194                 uio->uio_td->td_ru.ru_msgrcv++;
2195         SBLASTRECORDCHK(&so->so_rcv);
2196         SBLASTMBUFCHK(&so->so_rcv);
2197         nextrecord = m->m_nextpkt;
2198         if (nextrecord == NULL) {
2199                 KASSERT(so->so_rcv.sb_lastrecord == m,
2200                     ("soreceive_dgram: lastrecord != m"));
2201         }
2202
2203         KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2204             ("soreceive_dgram: m_nextpkt != nextrecord"));
2205
2206         /*
2207          * Pull 'm' and its chain off the front of the packet queue.
2208          */
2209         so->so_rcv.sb_mb = NULL;
2210         sockbuf_pushsync(&so->so_rcv, nextrecord);
2211
2212         /*
2213          * Walk 'm's chain and free that many bytes from the socket buffer.
2214          */
2215         for (m2 = m; m2 != NULL; m2 = m2->m_next)
2216                 sbfree(&so->so_rcv, m2);
2217
2218         /*
2219          * Do a few last checks before we let go of the lock.
2220          */
2221         SBLASTRECORDCHK(&so->so_rcv);
2222         SBLASTMBUFCHK(&so->so_rcv);
2223         SOCKBUF_UNLOCK(&so->so_rcv);
2224
2225         if (pr->pr_flags & PR_ADDR) {
2226                 KASSERT(m->m_type == MT_SONAME,
2227                     ("m->m_type == %d", m->m_type));
2228                 if (psa != NULL)
2229                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
2230                             M_NOWAIT);
2231                 m = m_free(m);
2232         }
2233         if (m == NULL) {
2234                 /* XXXRW: Can this happen? */
2235                 return (0);
2236         }
2237
2238         /*
2239          * Packet to copyout() is now in 'm' and it is disconnected from the
2240          * queue.
2241          *
2242          * Process one or more MT_CONTROL mbufs present before any data mbufs
2243          * in the first mbuf chain on the socket buffer.  We call into the
2244          * protocol to perform externalization (or freeing if controlp ==
2245          * NULL).
2246          */
2247         if (m->m_type == MT_CONTROL) {
2248                 struct mbuf *cm = NULL, *cmn;
2249                 struct mbuf **cme = &cm;
2250
2251                 do {
2252                         m2 = m->m_next;
2253                         m->m_next = NULL;
2254                         *cme = m;
2255                         cme = &(*cme)->m_next;
2256                         m = m2;
2257                 } while (m != NULL && m->m_type == MT_CONTROL);
2258                 while (cm != NULL) {
2259                         cmn = cm->m_next;
2260                         cm->m_next = NULL;
2261                         if (pr->pr_domain->dom_externalize != NULL) {
2262                                 error = (*pr->pr_domain->dom_externalize)
2263                                     (cm, controlp);
2264                         } else if (controlp != NULL)
2265                                 *controlp = cm;
2266                         else
2267                                 m_freem(cm);
2268                         if (controlp != NULL) {
2269                                 while (*controlp != NULL)
2270                                         controlp = &(*controlp)->m_next;
2271                         }
2272                         cm = cmn;
2273                 }
2274         }
2275         KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
2276
2277         while (m != NULL && uio->uio_resid > 0) {
2278                 len = uio->uio_resid;
2279                 if (len > m->m_len)
2280                         len = m->m_len;
2281                 error = uiomove(mtod(m, char *), (int)len, uio);
2282                 if (error) {
2283                         m_freem(m);
2284                         return (error);
2285                 }
2286                 if (len == m->m_len)
2287                         m = m_free(m);
2288                 else {
2289                         m->m_data += len;
2290                         m->m_len -= len;
2291                 }
2292         }
2293         if (m != NULL)
2294                 flags |= MSG_TRUNC;
2295         m_freem(m);
2296         if (flagsp != NULL)
2297                 *flagsp |= flags;
2298         return (0);
2299 }
2300
2301 int
2302 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2303     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2304 {
2305         int error;
2306
2307         CURVNET_SET(so->so_vnet);
2308         error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
2309             controlp, flagsp));
2310         CURVNET_RESTORE();
2311         return (error);
2312 }
2313
2314 int
2315 soshutdown(struct socket *so, int how)
2316 {
2317         struct protosw *pr = so->so_proto;
2318         int error;
2319
2320         if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2321                 return (EINVAL);
2322
2323         CURVNET_SET(so->so_vnet);
2324         if (pr->pr_usrreqs->pru_flush != NULL) {
2325                 (*pr->pr_usrreqs->pru_flush)(so, how);
2326         }
2327         if (how != SHUT_WR)
2328                 sorflush(so);
2329         if (how != SHUT_RD) {
2330                 error = (*pr->pr_usrreqs->pru_shutdown)(so);
2331                 CURVNET_RESTORE();
2332                 return (error);
2333         }
2334         CURVNET_RESTORE();
2335         return (0);
2336 }
2337
2338 void
2339 sorflush(struct socket *so)
2340 {
2341         struct sockbuf *sb = &so->so_rcv;
2342         struct protosw *pr = so->so_proto;
2343         struct sockbuf asb;
2344
2345         VNET_SO_ASSERT(so);
2346
2347         /*
2348          * In order to avoid calling dom_dispose with the socket buffer mutex
2349          * held, and in order to generally avoid holding the lock for a long
2350          * time, we make a copy of the socket buffer and clear the original
2351          * (except locks, state).  The new socket buffer copy won't have
2352          * initialized locks so we can only call routines that won't use or
2353          * assert those locks.
2354          *
2355          * Dislodge threads currently blocked in receive and wait to acquire
2356          * a lock against other simultaneous readers before clearing the
2357          * socket buffer.  Don't let our acquire be interrupted by a signal
2358          * despite any existing socket disposition on interruptable waiting.
2359          */
2360         socantrcvmore(so);
2361         (void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2362
2363         /*
2364          * Invalidate/clear most of the sockbuf structure, but leave selinfo
2365          * and mutex data unchanged.
2366          */
2367         SOCKBUF_LOCK(sb);
2368         bzero(&asb, offsetof(struct sockbuf, sb_startzero));
2369         bcopy(&sb->sb_startzero, &asb.sb_startzero,
2370             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2371         bzero(&sb->sb_startzero,
2372             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2373         SOCKBUF_UNLOCK(sb);
2374         sbunlock(sb);
2375
2376         /*
2377          * Dispose of special rights and flush the socket buffer.  Don't call
2378          * any unsafe routines (that rely on locks being initialized) on asb.
2379          */
2380         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2381                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
2382         sbrelease_internal(&asb, so);
2383 }
2384
2385 /*
2386  * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2387  * additional variant to handle the case where the option value needs to be
2388  * some kind of integer, but not a specific size.  In addition to their use
2389  * here, these functions are also called by the protocol-level pr_ctloutput()
2390  * routines.
2391  */
2392 int
2393 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2394 {
2395         size_t  valsize;
2396
2397         /*
2398          * If the user gives us more than we wanted, we ignore it, but if we
2399          * don't get the minimum length the caller wants, we return EINVAL.
2400          * On success, sopt->sopt_valsize is set to however much we actually
2401          * retrieved.
2402          */
2403         if ((valsize = sopt->sopt_valsize) < minlen)
2404                 return EINVAL;
2405         if (valsize > len)
2406                 sopt->sopt_valsize = valsize = len;
2407
2408         if (sopt->sopt_td != NULL)
2409                 return (copyin(sopt->sopt_val, buf, valsize));
2410
2411         bcopy(sopt->sopt_val, buf, valsize);
2412         return (0);
2413 }
2414
2415 /*
2416  * Kernel version of setsockopt(2).
2417  *
2418  * XXX: optlen is size_t, not socklen_t
2419  */
2420 int
2421 so_setsockopt(struct socket *so, int level, int optname, void *optval,
2422     size_t optlen)
2423 {
2424         struct sockopt sopt;
2425
2426         sopt.sopt_level = level;
2427         sopt.sopt_name = optname;
2428         sopt.sopt_dir = SOPT_SET;
2429         sopt.sopt_val = optval;
2430         sopt.sopt_valsize = optlen;
2431         sopt.sopt_td = NULL;
2432         return (sosetopt(so, &sopt));
2433 }
2434
2435 int
2436 sosetopt(struct socket *so, struct sockopt *sopt)
2437 {
2438         int     error, optval;
2439         struct  linger l;
2440         struct  timeval tv;
2441         u_long  val;
2442         uint32_t val32;
2443 #ifdef MAC
2444         struct mac extmac;
2445 #endif
2446
2447         CURVNET_SET(so->so_vnet);
2448         error = 0;
2449         if (sopt->sopt_level != SOL_SOCKET) {
2450                 if (so->so_proto->pr_ctloutput != NULL) {
2451                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
2452                         CURVNET_RESTORE();
2453                         return (error);
2454                 }
2455                 error = ENOPROTOOPT;
2456         } else {
2457                 switch (sopt->sopt_name) {
2458 #ifdef INET
2459                 case SO_ACCEPTFILTER:
2460                         error = do_setopt_accept_filter(so, sopt);
2461                         if (error)
2462                                 goto bad;
2463                         break;
2464 #endif
2465                 case SO_LINGER:
2466                         error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2467                         if (error)
2468                                 goto bad;
2469
2470                         SOCK_LOCK(so);
2471                         so->so_linger = l.l_linger;
2472                         if (l.l_onoff)
2473                                 so->so_options |= SO_LINGER;
2474                         else
2475                                 so->so_options &= ~SO_LINGER;
2476                         SOCK_UNLOCK(so);
2477                         break;
2478
2479                 case SO_DEBUG:
2480                 case SO_KEEPALIVE:
2481                 case SO_DONTROUTE:
2482                 case SO_USELOOPBACK:
2483                 case SO_BROADCAST:
2484                 case SO_REUSEADDR:
2485                 case SO_REUSEPORT:
2486                 case SO_OOBINLINE:
2487                 case SO_TIMESTAMP:
2488                 case SO_BINTIME:
2489                 case SO_NOSIGPIPE:
2490                 case SO_NO_DDP:
2491                 case SO_NO_OFFLOAD:
2492                         error = sooptcopyin(sopt, &optval, sizeof optval,
2493                                             sizeof optval);
2494                         if (error)
2495                                 goto bad;
2496                         SOCK_LOCK(so);
2497                         if (optval)
2498                                 so->so_options |= sopt->sopt_name;
2499                         else
2500                                 so->so_options &= ~sopt->sopt_name;
2501                         SOCK_UNLOCK(so);
2502                         break;
2503
2504                 case SO_SETFIB:
2505                         error = sooptcopyin(sopt, &optval, sizeof optval,
2506                                             sizeof optval);
2507                         if (optval < 0 || optval >= rt_numfibs) {
2508                                 error = EINVAL;
2509                                 goto bad;
2510                         }
2511                         if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
2512                            (so->so_proto->pr_domain->dom_family == PF_INET6) ||
2513                            (so->so_proto->pr_domain->dom_family == PF_ROUTE))) {
2514                                 so->so_fibnum = optval;
2515                                 /* Note: ignore error */
2516                                 if (so->so_proto->pr_ctloutput)
2517                                         (*so->so_proto->pr_ctloutput)(so, sopt);
2518                         } else {
2519                                 so->so_fibnum = 0;
2520                         }
2521                         break;
2522
2523                 case SO_USER_COOKIE:
2524                         error = sooptcopyin(sopt, &val32, sizeof val32,
2525                                             sizeof val32);
2526                         if (error)
2527                                 goto bad;
2528                         so->so_user_cookie = val32;
2529                         break;
2530
2531                 case SO_SNDBUF:
2532                 case SO_RCVBUF:
2533                 case SO_SNDLOWAT:
2534                 case SO_RCVLOWAT:
2535                         error = sooptcopyin(sopt, &optval, sizeof optval,
2536                                             sizeof optval);
2537                         if (error)
2538                                 goto bad;
2539
2540                         /*
2541                          * Values < 1 make no sense for any of these options,
2542                          * so disallow them.
2543                          */
2544                         if (optval < 1) {
2545                                 error = EINVAL;
2546                                 goto bad;
2547                         }
2548
2549                         switch (sopt->sopt_name) {
2550                         case SO_SNDBUF:
2551                         case SO_RCVBUF:
2552                                 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2553                                     &so->so_snd : &so->so_rcv, (u_long)optval,
2554                                     so, curthread) == 0) {
2555                                         error = ENOBUFS;
2556                                         goto bad;
2557                                 }
2558                                 (sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2559                                     &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2560                                 break;
2561
2562                         /*
2563                          * Make sure the low-water is never greater than the
2564                          * high-water.
2565                          */
2566                         case SO_SNDLOWAT:
2567                                 SOCKBUF_LOCK(&so->so_snd);
2568                                 so->so_snd.sb_lowat =
2569                                     (optval > so->so_snd.sb_hiwat) ?
2570                                     so->so_snd.sb_hiwat : optval;
2571                                 SOCKBUF_UNLOCK(&so->so_snd);
2572                                 break;
2573                         case SO_RCVLOWAT:
2574                                 SOCKBUF_LOCK(&so->so_rcv);
2575                                 so->so_rcv.sb_lowat =
2576                                     (optval > so->so_rcv.sb_hiwat) ?
2577                                     so->so_rcv.sb_hiwat : optval;
2578                                 SOCKBUF_UNLOCK(&so->so_rcv);
2579                                 break;
2580                         }
2581                         break;
2582
2583                 case SO_SNDTIMEO:
2584                 case SO_RCVTIMEO:
2585 #ifdef COMPAT_FREEBSD32
2586                         if (SV_CURPROC_FLAG(SV_ILP32)) {
2587                                 struct timeval32 tv32;
2588
2589                                 error = sooptcopyin(sopt, &tv32, sizeof tv32,
2590                                     sizeof tv32);
2591                                 CP(tv32, tv, tv_sec);
2592                                 CP(tv32, tv, tv_usec);
2593                         } else
2594 #endif
2595                                 error = sooptcopyin(sopt, &tv, sizeof tv,
2596                                     sizeof tv);
2597                         if (error)
2598                                 goto bad;
2599
2600                         /* assert(hz > 0); */
2601                         if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2602                             tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2603                                 error = EDOM;
2604                                 goto bad;
2605                         }
2606                         /* assert(tick > 0); */
2607                         /* assert(ULONG_MAX - INT_MAX >= 1000000); */
2608                         val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2609                         if (val > INT_MAX) {
2610                                 error = EDOM;
2611                                 goto bad;
2612                         }
2613                         if (val == 0 && tv.tv_usec != 0)
2614                                 val = 1;
2615
2616                         switch (sopt->sopt_name) {
2617                         case SO_SNDTIMEO:
2618                                 so->so_snd.sb_timeo = val;
2619                                 break;
2620                         case SO_RCVTIMEO:
2621                                 so->so_rcv.sb_timeo = val;
2622                                 break;
2623                         }
2624                         break;
2625
2626                 case SO_LABEL:
2627 #ifdef MAC
2628                         error = sooptcopyin(sopt, &extmac, sizeof extmac,
2629                             sizeof extmac);
2630                         if (error)
2631                                 goto bad;
2632                         error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2633                             so, &extmac);
2634 #else
2635                         error = EOPNOTSUPP;
2636 #endif
2637                         break;
2638
2639                 default:
2640                         error = ENOPROTOOPT;
2641                         break;
2642                 }
2643                 if (error == 0 && so->so_proto->pr_ctloutput != NULL)
2644                         (void)(*so->so_proto->pr_ctloutput)(so, sopt);
2645         }
2646 bad:
2647         CURVNET_RESTORE();
2648         return (error);
2649 }
2650
2651 /*
2652  * Helper routine for getsockopt.
2653  */
2654 int
2655 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2656 {
2657         int     error;
2658         size_t  valsize;
2659
2660         error = 0;
2661
2662         /*
2663          * Documented get behavior is that we always return a value, possibly
2664          * truncated to fit in the user's buffer.  Traditional behavior is
2665          * that we always tell the user precisely how much we copied, rather
2666          * than something useful like the total amount we had available for
2667          * her.  Note that this interface is not idempotent; the entire
2668          * answer must generated ahead of time.
2669          */
2670         valsize = min(len, sopt->sopt_valsize);
2671         sopt->sopt_valsize = valsize;
2672         if (sopt->sopt_val != NULL) {
2673                 if (sopt->sopt_td != NULL)
2674                         error = copyout(buf, sopt->sopt_val, valsize);
2675                 else
2676                         bcopy(buf, sopt->sopt_val, valsize);
2677         }
2678         return (error);
2679 }
2680
2681 int
2682 sogetopt(struct socket *so, struct sockopt *sopt)
2683 {
2684         int     error, optval;
2685         struct  linger l;
2686         struct  timeval tv;
2687 #ifdef MAC
2688         struct mac extmac;
2689 #endif
2690
2691         CURVNET_SET(so->so_vnet);
2692         error = 0;
2693         if (sopt->sopt_level != SOL_SOCKET) {
2694                 if (so->so_proto->pr_ctloutput != NULL)
2695                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
2696                 else
2697                         error = ENOPROTOOPT;
2698                 CURVNET_RESTORE();
2699                 return (error);
2700         } else {
2701                 switch (sopt->sopt_name) {
2702 #ifdef INET
2703                 case SO_ACCEPTFILTER:
2704                         error = do_getopt_accept_filter(so, sopt);
2705                         break;
2706 #endif
2707                 case SO_LINGER:
2708                         SOCK_LOCK(so);
2709                         l.l_onoff = so->so_options & SO_LINGER;
2710                         l.l_linger = so->so_linger;
2711                         SOCK_UNLOCK(so);
2712                         error = sooptcopyout(sopt, &l, sizeof l);
2713                         break;
2714
2715                 case SO_USELOOPBACK:
2716                 case SO_DONTROUTE:
2717                 case SO_DEBUG:
2718                 case SO_KEEPALIVE:
2719                 case SO_REUSEADDR:
2720                 case SO_REUSEPORT:
2721                 case SO_BROADCAST:
2722                 case SO_OOBINLINE:
2723                 case SO_ACCEPTCONN:
2724                 case SO_TIMESTAMP:
2725                 case SO_BINTIME:
2726                 case SO_NOSIGPIPE:
2727                         optval = so->so_options & sopt->sopt_name;
2728 integer:
2729                         error = sooptcopyout(sopt, &optval, sizeof optval);
2730                         break;
2731
2732                 case SO_TYPE:
2733                         optval = so->so_type;
2734                         goto integer;
2735
2736                 case SO_PROTOCOL:
2737                         optval = so->so_proto->pr_protocol;
2738                         goto integer;
2739
2740                 case SO_ERROR:
2741                         SOCK_LOCK(so);
2742                         optval = so->so_error;
2743                         so->so_error = 0;
2744                         SOCK_UNLOCK(so);
2745                         goto integer;
2746
2747                 case SO_SNDBUF:
2748                         optval = so->so_snd.sb_hiwat;
2749                         goto integer;
2750
2751                 case SO_RCVBUF:
2752                         optval = so->so_rcv.sb_hiwat;
2753                         goto integer;
2754
2755                 case SO_SNDLOWAT:
2756                         optval = so->so_snd.sb_lowat;
2757                         goto integer;
2758
2759                 case SO_RCVLOWAT:
2760                         optval = so->so_rcv.sb_lowat;
2761                         goto integer;
2762
2763                 case SO_SNDTIMEO:
2764                 case SO_RCVTIMEO:
2765                         optval = (sopt->sopt_name == SO_SNDTIMEO ?
2766                                   so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2767
2768                         tv.tv_sec = optval / hz;
2769                         tv.tv_usec = (optval % hz) * tick;
2770 #ifdef COMPAT_FREEBSD32
2771                         if (SV_CURPROC_FLAG(SV_ILP32)) {
2772                                 struct timeval32 tv32;
2773
2774                                 CP(tv, tv32, tv_sec);
2775                                 CP(tv, tv32, tv_usec);
2776                                 error = sooptcopyout(sopt, &tv32, sizeof tv32);
2777                         } else
2778 #endif
2779                                 error = sooptcopyout(sopt, &tv, sizeof tv);
2780                         break;
2781
2782                 case SO_LABEL:
2783 #ifdef MAC
2784                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2785                             sizeof(extmac));
2786                         if (error)
2787                                 goto bad;
2788                         error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2789                             so, &extmac);
2790                         if (error)
2791                                 goto bad;
2792                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
2793 #else
2794                         error = EOPNOTSUPP;
2795 #endif
2796                         break;
2797
2798                 case SO_PEERLABEL:
2799 #ifdef MAC
2800                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2801                             sizeof(extmac));
2802                         if (error)
2803                                 goto bad;
2804                         error = mac_getsockopt_peerlabel(
2805                             sopt->sopt_td->td_ucred, so, &extmac);
2806                         if (error)
2807                                 goto bad;
2808                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
2809 #else
2810                         error = EOPNOTSUPP;
2811 #endif
2812                         break;
2813
2814                 case SO_LISTENQLIMIT:
2815                         optval = so->so_qlimit;
2816                         goto integer;
2817
2818                 case SO_LISTENQLEN:
2819                         optval = so->so_qlen;
2820                         goto integer;
2821
2822                 case SO_LISTENINCQLEN:
2823                         optval = so->so_incqlen;
2824                         goto integer;
2825
2826                 default:
2827                         error = ENOPROTOOPT;
2828                         break;
2829                 }
2830         }
2831 #ifdef MAC
2832 bad:
2833 #endif
2834         CURVNET_RESTORE();
2835         return (error);
2836 }
2837
2838 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
2839 int
2840 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2841 {
2842         struct mbuf *m, *m_prev;
2843         int sopt_size = sopt->sopt_valsize;
2844
2845         MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2846         if (m == NULL)
2847                 return ENOBUFS;
2848         if (sopt_size > MLEN) {
2849                 MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
2850                 if ((m->m_flags & M_EXT) == 0) {
2851                         m_free(m);
2852                         return ENOBUFS;
2853                 }
2854                 m->m_len = min(MCLBYTES, sopt_size);
2855         } else {
2856                 m->m_len = min(MLEN, sopt_size);
2857         }
2858         sopt_size -= m->m_len;
2859         *mp = m;
2860         m_prev = m;
2861
2862         while (sopt_size) {
2863                 MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2864                 if (m == NULL) {
2865                         m_freem(*mp);
2866                         return ENOBUFS;
2867                 }
2868                 if (sopt_size > MLEN) {
2869                         MCLGET(m, sopt->sopt_td != NULL ? M_WAIT :
2870                             M_DONTWAIT);
2871                         if ((m->m_flags & M_EXT) == 0) {
2872                                 m_freem(m);
2873                                 m_freem(*mp);
2874                                 return ENOBUFS;
2875                         }
2876                         m->m_len = min(MCLBYTES, sopt_size);
2877                 } else {
2878                         m->m_len = min(MLEN, sopt_size);
2879                 }
2880                 sopt_size -= m->m_len;
2881                 m_prev->m_next = m;
2882                 m_prev = m;
2883         }
2884         return (0);
2885 }
2886
2887 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2888 int
2889 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2890 {
2891         struct mbuf *m0 = m;
2892
2893         if (sopt->sopt_val == NULL)
2894                 return (0);
2895         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2896                 if (sopt->sopt_td != NULL) {
2897                         int error;
2898
2899                         error = copyin(sopt->sopt_val, mtod(m, char *),
2900                                        m->m_len);
2901                         if (error != 0) {
2902                                 m_freem(m0);
2903                                 return(error);
2904                         }
2905                 } else
2906                         bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2907                 sopt->sopt_valsize -= m->m_len;
2908                 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2909                 m = m->m_next;
2910         }
2911         if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
2912                 panic("ip6_sooptmcopyin");
2913         return (0);
2914 }
2915
2916 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2917 int
2918 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2919 {
2920         struct mbuf *m0 = m;
2921         size_t valsize = 0;
2922
2923         if (sopt->sopt_val == NULL)
2924                 return (0);
2925         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2926                 if (sopt->sopt_td != NULL) {
2927                         int error;
2928
2929                         error = copyout(mtod(m, char *), sopt->sopt_val,
2930                                        m->m_len);
2931                         if (error != 0) {
2932                                 m_freem(m0);
2933                                 return(error);
2934                         }
2935                 } else
2936                         bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2937                sopt->sopt_valsize -= m->m_len;
2938                sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2939                valsize += m->m_len;
2940                m = m->m_next;
2941         }
2942         if (m != NULL) {
2943                 /* enough soopt buffer should be given from user-land */
2944                 m_freem(m0);
2945                 return(EINVAL);
2946         }
2947         sopt->sopt_valsize = valsize;
2948         return (0);
2949 }
2950
2951 /*
2952  * sohasoutofband(): protocol notifies socket layer of the arrival of new
2953  * out-of-band data, which will then notify socket consumers.
2954  */
2955 void
2956 sohasoutofband(struct socket *so)
2957 {
2958
2959         if (so->so_sigio != NULL)
2960                 pgsigio(&so->so_sigio, SIGURG, 0);
2961         selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2962 }
2963
2964 int
2965 sopoll(struct socket *so, int events, struct ucred *active_cred,
2966     struct thread *td)
2967 {
2968
2969         /*
2970          * We do not need to set or assert curvnet as long as everyone uses
2971          * sopoll_generic().
2972          */
2973         return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2974             td));
2975 }
2976
2977 int
2978 sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2979     struct thread *td)
2980 {
2981         int revents = 0;
2982
2983         SOCKBUF_LOCK(&so->so_snd);
2984         SOCKBUF_LOCK(&so->so_rcv);
2985         if (events & (POLLIN | POLLRDNORM))
2986                 if (soreadabledata(so))
2987                         revents |= events & (POLLIN | POLLRDNORM);
2988
2989         if (events & (POLLOUT | POLLWRNORM))
2990                 if (sowriteable(so))
2991                         revents |= events & (POLLOUT | POLLWRNORM);
2992
2993         if (events & (POLLPRI | POLLRDBAND))
2994                 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2995                         revents |= events & (POLLPRI | POLLRDBAND);
2996
2997         if ((events & POLLINIGNEOF) == 0) {
2998                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2999                         revents |= events & (POLLIN | POLLRDNORM);
3000                         if (so->so_snd.sb_state & SBS_CANTSENDMORE)
3001                                 revents |= POLLHUP;
3002                 }
3003         }
3004
3005         if (revents == 0) {
3006                 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
3007                         selrecord(td, &so->so_rcv.sb_sel);
3008                         so->so_rcv.sb_flags |= SB_SEL;
3009                 }
3010
3011                 if (events & (POLLOUT | POLLWRNORM)) {
3012                         selrecord(td, &so->so_snd.sb_sel);
3013                         so->so_snd.sb_flags |= SB_SEL;
3014                 }
3015         }
3016
3017         SOCKBUF_UNLOCK(&so->so_rcv);
3018         SOCKBUF_UNLOCK(&so->so_snd);
3019         return (revents);
3020 }
3021
3022 int
3023 soo_kqfilter(struct file *fp, struct knote *kn)
3024 {
3025         struct socket *so = kn->kn_fp->f_data;
3026         struct sockbuf *sb;
3027
3028         switch (kn->kn_filter) {
3029         case EVFILT_READ:
3030                 if (so->so_options & SO_ACCEPTCONN)
3031                         kn->kn_fop = &solisten_filtops;
3032                 else
3033                         kn->kn_fop = &soread_filtops;
3034                 sb = &so->so_rcv;
3035                 break;
3036         case EVFILT_WRITE:
3037                 kn->kn_fop = &sowrite_filtops;
3038                 sb = &so->so_snd;
3039                 break;
3040         default:
3041                 return (EINVAL);
3042         }
3043
3044         SOCKBUF_LOCK(sb);
3045         knlist_add(&sb->sb_sel.si_note, kn, 1);
3046         sb->sb_flags |= SB_KNOTE;
3047         SOCKBUF_UNLOCK(sb);
3048         return (0);
3049 }
3050
3051 /*
3052  * Some routines that return EOPNOTSUPP for entry points that are not
3053  * supported by a protocol.  Fill in as needed.
3054  */
int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{

        /* Accept entry point for protocols that do not support it. */
        return (EOPNOTSUPP);
}
3061
/*
 * Attach entry point for protocols that do not support it: the arguments
 * are ignored and EOPNOTSUPP is returned.
 */
int
pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
{

        return (EOPNOTSUPP);
}
3068
/*
 * Bind entry point for protocols that do not support it: the arguments are
 * ignored and EOPNOTSUPP is returned.
 */
int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

        return (EOPNOTSUPP);
}
3075
/*
 * Connect entry point for protocols that do not support it: the arguments
 * are ignored and EOPNOTSUPP is returned.
 */
int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

        return (EOPNOTSUPP);
}
3082
/*
 * Connect2 (socket-to-socket connect) entry point for protocols that do not
 * support it: the arguments are ignored and EOPNOTSUPP is returned.
 */
int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{

        return (EOPNOTSUPP);
}
3089
3090 int
3091 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
3092     struct ifnet *ifp, struct thread *td)
3093 {
3094
3095         return EOPNOTSUPP;
3096 }
3097
/*
 * Disconnect entry point for protocols that do not support it: the argument
 * is ignored and EOPNOTSUPP is returned.
 */
int
pru_disconnect_notsupp(struct socket *so)
{

        return (EOPNOTSUPP);
}
3104
/*
 * Listen entry point for protocols that do not support it: the arguments
 * are ignored and EOPNOTSUPP is returned.
 */
int
pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
{

        return (EOPNOTSUPP);
}
3111
/*
 * Peer-address entry point for protocols that do not support it: the
 * arguments are ignored and EOPNOTSUPP is returned.
 */
int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{

        return (EOPNOTSUPP);
}
3118
/*
 * Rcvd entry point for protocols that do not support it: the arguments are
 * ignored and EOPNOTSUPP is returned.
 */
int
pru_rcvd_notsupp(struct socket *so, int flags)
{

        return (EOPNOTSUPP);
}
3125
/*
 * Receive-out-of-band entry point for protocols that do not support it: the
 * arguments are ignored and EOPNOTSUPP is returned.
 */
int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{

        return (EOPNOTSUPP);
}
3132
/*
 * Send entry point for protocols that do not support it: the arguments are
 * ignored (nothing is consumed or freed here) and EOPNOTSUPP is returned.
 */
int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *addr, struct mbuf *control, struct thread *td)
{

        return (EOPNOTSUPP);
}
3140
3141 /*
3142  * This isn't really a ``null'' operation, but it's the default one and
3143  * doesn't do anything destructive.
3144  */
3145 int
3146 pru_sense_null(struct socket *so, struct stat *sb)
3147 {
3148
3149         sb->st_blksize = so->so_snd.sb_hiwat;
3150         return 0;
3151 }
3152
/*
 * Shutdown entry point for protocols that do not support it: the argument
 * is ignored and EOPNOTSUPP is returned.
 */
int
pru_shutdown_notsupp(struct socket *so)
{

        return (EOPNOTSUPP);
}
3159
/*
 * Socket-address entry point for protocols that do not support it: the
 * arguments are ignored and EOPNOTSUPP is returned.
 */
int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{

        return (EOPNOTSUPP);
}
3166
/*
 * Sosend entry point for protocols that do not support it: the arguments
 * are ignored (nothing is consumed or freed here) and EOPNOTSUPP is
 * returned.
 */
int
pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{

        return (EOPNOTSUPP);
}
3174
/*
 * Soreceive entry point for protocols that do not support it: the arguments
 * are ignored (no output parameter is written) and EOPNOTSUPP is returned.
 */
int
pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{

        return (EOPNOTSUPP);
}
3182
/*
 * Sopoll entry point for protocols that do not support it: the arguments
 * are ignored and EOPNOTSUPP is returned.
 */
int
pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
    struct thread *td)
{

        return (EOPNOTSUPP);
}
3190
3191 static void
3192 filt_sordetach(struct knote *kn)
3193 {
3194         struct socket *so = kn->kn_fp->f_data;
3195
3196         SOCKBUF_LOCK(&so->so_rcv);
3197         knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
3198         if (knlist_empty(&so->so_rcv.sb_sel.si_note))
3199                 so->so_rcv.sb_flags &= ~SB_KNOTE;
3200         SOCKBUF_UNLOCK(&so->so_rcv);
3201 }
3202
3203 /*ARGSUSED*/
3204 static int
3205 filt_soread(struct knote *kn, long hint)
3206 {
3207         struct socket *so;
3208
3209         so = kn->kn_fp->f_data;
3210         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3211
3212         kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3213         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3214                 kn->kn_flags |= EV_EOF;
3215                 kn->kn_fflags = so->so_error;
3216                 return (1);
3217         } else if (so->so_error)        /* temporary udp error */
3218                 return (1);
3219         else if (kn->kn_sfflags & NOTE_LOWAT)
3220                 return (kn->kn_data >= kn->kn_sdata);
3221         else
3222                 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
3223 }
3224
3225 static void
3226 filt_sowdetach(struct knote *kn)
3227 {
3228         struct socket *so = kn->kn_fp->f_data;
3229
3230         SOCKBUF_LOCK(&so->so_snd);
3231         knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
3232         if (knlist_empty(&so->so_snd.sb_sel.si_note))
3233                 so->so_snd.sb_flags &= ~SB_KNOTE;
3234         SOCKBUF_UNLOCK(&so->so_snd);
3235 }
3236
3237 /*ARGSUSED*/
3238 static int
3239 filt_sowrite(struct knote *kn, long hint)
3240 {
3241         struct socket *so;
3242
3243         so = kn->kn_fp->f_data;
3244         SOCKBUF_LOCK_ASSERT(&so->so_snd);
3245         kn->kn_data = sbspace(&so->so_snd);
3246         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3247                 kn->kn_flags |= EV_EOF;
3248                 kn->kn_fflags = so->so_error;
3249                 return (1);
3250         } else if (so->so_error)        /* temporary udp error */
3251                 return (1);
3252         else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3253             (so->so_proto->pr_flags & PR_CONNREQUIRED))
3254                 return (0);
3255         else if (kn->kn_sfflags & NOTE_LOWAT)
3256                 return (kn->kn_data >= kn->kn_sdata);
3257         else
3258                 return (kn->kn_data >= so->so_snd.sb_lowat);
3259 }
3260
3261 /*ARGSUSED*/
3262 static int
3263 filt_solisten(struct knote *kn, long hint)
3264 {
3265         struct socket *so = kn->kn_fp->f_data;
3266
3267         kn->kn_data = so->so_qlen;
3268         return (! TAILQ_EMPTY(&so->so_comp));
3269 }
3270
3271 int
3272 socheckuid(struct socket *so, uid_t uid)
3273 {
3274
3275         if (so == NULL)
3276                 return (EPERM);
3277         if (so->so_cred->cr_uid != uid)
3278                 return (EPERM);
3279         return (0);
3280 }
3281
3282 static int
3283 sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
3284 {
3285         int error;
3286         int val;
3287
3288         val = somaxconn;
3289         error = sysctl_handle_int(oidp, &val, 0, req);
3290         if (error || !req->newptr )
3291                 return (error);
3292
3293         if (val < 1 || val > USHRT_MAX)
3294                 return (EINVAL);
3295
3296         somaxconn = val;
3297         return (0);
3298 }
3299
3300 /*
3301  * These functions are used by protocols to notify the socket layer (and its
3302  * consumers) of state changes in the sockets driven by protocol-side events.
3303  */
3304
3305 /*
3306  * Procedures to manipulate state flags of socket and do appropriate wakeups.
3307  *
3308  * Normal sequence from the active (originating) side is that
3309  * soisconnecting() is called during processing of connect() call, resulting
3310  * in an eventual call to soisconnected() if/when the connection is
3311  * established.  When the connection is torn down soisdisconnecting() is
3312  * called during processing of disconnect() call, and soisdisconnected() is
3313  * called when the connection to the peer is totally severed.  The semantics
3314  * of these routines are such that connectionless protocols can call
3315  * soisconnected() and soisdisconnected() only, bypassing the in-progress
3316  * calls when setting up a ``connection'' takes no time.
3317  *
3318  * From the passive side, a socket is created with two queues of sockets:
3319  * so_incomp for connections in progress and so_comp for connections already
3320  * made and awaiting user acceptance.  As a protocol is preparing incoming
3321  * connections, it creates a socket structure queued on so_incomp by calling
3322  * sonewconn().  When the connection is established, soisconnected() is
3323  * called, and transfers the socket structure to so_comp, making it available
3324  * to accept().
3325  *
3326  * If a socket is closed with sockets on either so_incomp or so_comp, these
3327  * sockets are dropped.
3328  *
3329  * If higher-level protocols are implemented in the kernel, the wakeups done
3330  * here will sometimes cause software-interrupt process scheduling.
3331  */
3332 void
3333 soisconnecting(struct socket *so)
3334 {
3335
3336         SOCK_LOCK(so);
3337         so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3338         so->so_state |= SS_ISCONNECTING;
3339         SOCK_UNLOCK(so);
3340 }
3341
3342 void
3343 soisconnected(struct socket *so)
3344 {
3345         struct socket *head;
3346         int ret;
3347
3348 restart:
3349         ACCEPT_LOCK();
3350         SOCK_LOCK(so);
3351         so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3352         so->so_state |= SS_ISCONNECTED;
3353         head = so->so_head;
3354         if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
3355                 if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3356                         SOCK_UNLOCK(so);
3357                         TAILQ_REMOVE(&head->so_incomp, so, so_list);
3358                         head->so_incqlen--;
3359                         so->so_qstate &= ~SQ_INCOMP;
3360                         TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
3361                         head->so_qlen++;
3362                         so->so_qstate |= SQ_COMP;
3363                         ACCEPT_UNLOCK();
3364                         sorwakeup(head);
3365                         wakeup_one(&head->so_timeo);
3366                 } else {
3367                         ACCEPT_UNLOCK();
3368                         soupcall_set(so, SO_RCV,
3369                             head->so_accf->so_accept_filter->accf_callback,
3370                             head->so_accf->so_accept_filter_arg);
3371                         so->so_options &= ~SO_ACCEPTFILTER;
3372                         ret = head->so_accf->so_accept_filter->accf_callback(so,
3373                             head->so_accf->so_accept_filter_arg, M_DONTWAIT);
3374                         if (ret == SU_ISCONNECTED)
3375                                 soupcall_clear(so, SO_RCV);
3376                         SOCK_UNLOCK(so);
3377                         if (ret == SU_ISCONNECTED)
3378                                 goto restart;
3379                 }
3380                 return;
3381         }
3382         SOCK_UNLOCK(so);
3383         ACCEPT_UNLOCK();
3384         wakeup(&so->so_timeo);
3385         sorwakeup(so);
3386         sowwakeup(so);
3387 }
3388
3389 void
3390 soisdisconnecting(struct socket *so)
3391 {
3392
3393         /*
3394          * Note: This code assumes that SOCK_LOCK(so) and
3395          * SOCKBUF_LOCK(&so->so_rcv) are the same.
3396          */
3397         SOCKBUF_LOCK(&so->so_rcv);
3398         so->so_state &= ~SS_ISCONNECTING;
3399         so->so_state |= SS_ISDISCONNECTING;
3400         so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3401         sorwakeup_locked(so);
3402         SOCKBUF_LOCK(&so->so_snd);
3403         so->so_snd.sb_state |= SBS_CANTSENDMORE;
3404         sowwakeup_locked(so);
3405         wakeup(&so->so_timeo);
3406 }
3407
3408 void
3409 soisdisconnected(struct socket *so)
3410 {
3411
3412         /*
3413          * Note: This code assumes that SOCK_LOCK(so) and
3414          * SOCKBUF_LOCK(&so->so_rcv) are the same.
3415          */
3416         SOCKBUF_LOCK(&so->so_rcv);
3417         so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3418         so->so_state |= SS_ISDISCONNECTED;
3419         so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3420         sorwakeup_locked(so);
3421         SOCKBUF_LOCK(&so->so_snd);
3422         so->so_snd.sb_state |= SBS_CANTSENDMORE;
3423         sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
3424         sowwakeup_locked(so);
3425         wakeup(&so->so_timeo);
3426 }
3427
3428 /*
3429  * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3430  */
3431 struct sockaddr *
3432 sodupsockaddr(const struct sockaddr *sa, int mflags)
3433 {
3434         struct sockaddr *sa2;
3435
3436         sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3437         if (sa2)
3438                 bcopy(sa, sa2, sa->sa_len);
3439         return sa2;
3440 }
3441
3442 /*
3443  * Register per-socket buffer upcalls.
3444  */
3445 void
3446 soupcall_set(struct socket *so, int which,
3447     int (*func)(struct socket *, void *, int), void *arg)
3448 {
3449         struct sockbuf *sb;
3450
3451         switch (which) {
3452         case SO_RCV:
3453                 sb = &so->so_rcv;
3454                 break;
3455         case SO_SND:
3456                 sb = &so->so_snd;
3457                 break;
3458         default:
3459                 panic("soupcall_set: bad which");
3460         }
3461         SOCKBUF_LOCK_ASSERT(sb);
3462 #if 0
3463         /* XXX: accf_http actually wants to do this on purpose. */
3464         KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
3465 #endif
3466         sb->sb_upcall = func;
3467         sb->sb_upcallarg = arg;
3468         sb->sb_flags |= SB_UPCALL;
3469 }
3470
3471 void
3472 soupcall_clear(struct socket *so, int which)
3473 {
3474         struct sockbuf *sb;
3475
3476         switch (which) {
3477         case SO_RCV:
3478                 sb = &so->so_rcv;
3479                 break;
3480         case SO_SND:
3481                 sb = &so->so_snd;
3482                 break;
3483         default:
3484                 panic("soupcall_clear: bad which");
3485         }
3486         SOCKBUF_LOCK_ASSERT(sb);
3487         KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
3488         sb->sb_upcall = NULL;
3489         sb->sb_upcallarg = NULL;
3490         sb->sb_flags &= ~SB_UPCALL;
3491 }
3492
3493 /*
3494  * Create an external-format (``xsocket'') structure using the information in
3495  * the kernel-format socket structure pointed to by so.  This is done to
3496  * reduce the spew of irrelevant information over this interface, to isolate
3497  * user code from changes in the kernel structure, and potentially to provide
3498  * information-hiding if we decide that some of this information should be
3499  * hidden from users.
3500  */
3501 void
3502 sotoxsocket(struct socket *so, struct xsocket *xso)
3503 {
3504
3505         xso->xso_len = sizeof *xso;
3506         xso->xso_so = so;
3507         xso->so_type = so->so_type;
3508         xso->so_options = so->so_options;
3509         xso->so_linger = so->so_linger;
3510         xso->so_state = so->so_state;
3511         xso->so_pcb = so->so_pcb;
3512         xso->xso_protocol = so->so_proto->pr_protocol;
3513         xso->xso_family = so->so_proto->pr_domain->dom_family;
3514         xso->so_qlen = so->so_qlen;
3515         xso->so_incqlen = so->so_incqlen;
3516         xso->so_qlimit = so->so_qlimit;
3517         xso->so_timeo = so->so_timeo;
3518         xso->so_error = so->so_error;
3519         xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
3520         xso->so_oobmark = so->so_oobmark;
3521         sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3522         sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3523         xso->so_uid = so->so_cred->cr_uid;
3524 }
3525
3526
/*
 * Socket accessor functions that provide external consumers with a safe
 * interface to socket state.
 */
3532
3533 void
3534 so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg)
3535 {
3536
3537         TAILQ_FOREACH(so, &so->so_comp, so_list)
3538                 func(so, arg);
3539 }
3540
/*
 * Return a pointer to the socket's receive buffer (so_rcv).
 */
struct sockbuf *
so_sockbuf_rcv(struct socket *so)
{

        return (&so->so_rcv);
}
3547
/*
 * Return a pointer to the socket's send buffer (so_snd).
 */
struct sockbuf *
so_sockbuf_snd(struct socket *so)
{

        return (&so->so_snd);
}
3554
/*
 * Return the socket's so_state field.  No lock is taken here.
 */
int
so_state_get(const struct socket *so)
{

        return (so->so_state);
}
3561
/*
 * Overwrite the socket's so_state field with val.  The whole field is
 * replaced, not individual flags, and no lock is taken here.
 */
void
so_state_set(struct socket *so, int val)
{

        so->so_state = val;
}
3568
/*
 * Return the socket's so_options field.  No lock is taken here.
 */
int
so_options_get(const struct socket *so)
{

        return (so->so_options);
}
3575
/*
 * Overwrite the socket's so_options field with val.  The whole field is
 * replaced, not individual options, and no lock is taken here.
 */
void
so_options_set(struct socket *so, int val)
{

        so->so_options = val;
}
3582
/*
 * Return the socket's so_error field.  The field is not cleared and no
 * lock is taken here.
 */
int
so_error_get(const struct socket *so)
{

        return (so->so_error);
}
3589
/*
 * Set the socket's so_error field to val.  No lock is taken and no wakeup
 * is issued here.
 */
void
so_error_set(struct socket *so, int val)
{

        so->so_error = val;
}
3596
/*
 * Return the socket's so_linger field.  No lock is taken here.
 */
int
so_linger_get(const struct socket *so)
{

        return (so->so_linger);
}
3603
/*
 * Set the socket's so_linger field to val.  No range check is made and no
 * lock is taken here.
 */
void
so_linger_set(struct socket *so, int val)
{

        so->so_linger = val;
}
3610
/*
 * Return the socket's protocol switch pointer (so_proto).
 */
struct protosw *
so_protosw_get(const struct socket *so)
{

        return (so->so_proto);
}
3617
/*
 * Replace the socket's protocol switch pointer (so_proto) with val.  No
 * lock is taken here.
 */
void
so_protosw_set(struct socket *so, struct protosw *val)
{

        so->so_proto = val;
}
3624
/*
 * Function wrapper that invokes sorwakeup() on the socket.
 */
void
so_sorwakeup(struct socket *so)
{

        sorwakeup(so);
}
3631
/*
 * Function wrapper that invokes sowwakeup() on the socket.
 */
void
so_sowwakeup(struct socket *so)
{

        sowwakeup(so);
}
3638
/*
 * Function wrapper that invokes sorwakeup_locked() on the socket.
 * NOTE(review): presumably the caller must already hold the receive buffer
 * lock, as the _locked suffix suggests -- confirm against sorwakeup_locked().
 */
void
so_sorwakeup_locked(struct socket *so)
{

        sorwakeup_locked(so);
}
3645
/*
 * Function wrapper that invokes sowwakeup_locked() on the socket.
 * NOTE(review): presumably the caller must already hold the send buffer
 * lock, as the _locked suffix suggests -- confirm against sowwakeup_locked().
 */
void
so_sowwakeup_locked(struct socket *so)
{

        sowwakeup_locked(so);
}
3652
/*
 * Acquire the socket lock by way of SOCK_LOCK().
 */
void
so_lock(struct socket *so)
{
        SOCK_LOCK(so);
}
3658
/*
 * Release the socket lock by way of SOCK_UNLOCK().
 */
void
so_unlock(struct socket *so)
{
        SOCK_UNLOCK(so);
}