sys/kern/uipc_socket.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
   3  *      The Regents of the University of California.
   4  * Copyright (c) 2004 The FreeBSD Foundation
   5  * Copyright (c) 2004-2008 Robert N. M. Watson
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  * 4. Neither the name of the University nor the names of its contributors
  17  *    may be used to endorse or promote products derived from this software
  18  *    without specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30  * SUCH DAMAGE.
  31  *
  32  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
  33  */
  34
  35 /*
  36  * Comments on the socket life cycle:
  37  *
  38  * soalloc() sets up socket layer state for a socket, called only by
  39  * socreate() and sonewconn().  Socket layer private.
  40  *
  41  * sodealloc() tears down socket layer state for a socket, called only by
  42  * sofree(), socreate() and sonewconn().  Socket layer private.
  43  *
  44  * pru_attach() associates protocol layer state with an allocated socket;
  45  * called only once, may fail, aborting socket allocation.  This is called
  46  * from socreate() and sonewconn().  Socket layer private.
  47  *
  48  * pru_detach() disassociates protocol layer state from an attached socket,
  49  * and will be called exactly once for sockets in which pru_attach() has
  50  * been successfully called.  If pru_attach() returned an error,
  51  * pru_detach() will not be called.  Socket layer private.
  52  *
  53  * pru_abort() and pru_close() notify the protocol layer that the last
  54  * consumer of a socket is starting to tear down the socket, and that the
  55  * protocol should terminate the connection.  Historically, pru_abort() also
  56  * detached protocol state from the socket state, but this is no longer the
  57  * case.
  58  *
  59  * socreate() creates a socket and attaches protocol state.  This is a public
  60  * interface that may be used by socket layer consumers to create new
  61  * sockets.
  62  *
  63  * sonewconn() creates a socket and attaches protocol state.  This is a
  64  * public interface  that may be used by protocols to create new sockets when
  65  * a new connection is received and will be available for accept() on a
  66  * listen socket.
  67  *
  68  * soclose() destroys a socket after possibly waiting for it to disconnect.
  69  * This is a public interface that socket consumers should use to close and
  70  * release a socket when done with it.
  71  *
  72  * soabort() destroys a socket without waiting for it to disconnect (used
  73  * only for incoming connections that are already partially or fully
  74  * connected).  This is used internally by the socket layer when clearing
  75  * listen socket queues (due to overflow or close on the listen socket), but
  76  * is also a public interface protocols may use to abort connections in
  77  * their incomplete listen queues should they no longer be required.  Sockets
  78  * placed in completed connection listen queues should not be aborted for
  79  * reasons described in the comment above the soclose() implementation.  This
  80  * is not a general purpose close routine, and except in the specific
  81  * circumstances described here, should not be used.
  82  *
  83  * sofree() will free a socket and its protocol state if all references on
  84  * the socket have been released, and is the public interface to attempt to
  85  * free a socket when a reference is removed.  This is a socket layer private
  86  * interface.
  87  *
  88  * NOTE: In addition to socreate() and soclose(), which provide a single
  89  * socket reference to the consumer to be managed as required, there are two
  90  * calls to explicitly manage socket references, soref(), and sorele().
  91  * Currently, these are generally required only when transitioning a socket
  92  * from a listen queue to a file descriptor, in order to prevent garbage
  93  * collection of the socket at an untimely moment.  For a number of reasons,
  94  * these interfaces are not preferred, and should be avoided.
  95  *
  96  * NOTE: With regard to VNETs the general rule is that callers do not set
  97  * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
  98  * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
  99  * and sorflush(), which are usually called from a pre-set VNET context.
 100  * sopoll() currently does not need a VNET context to be set.
 101  */
 102
 103 #include <sys/cdefs.h>
 104 __FBSDID("$FreeBSD$");
 105
 106 #include "opt_inet.h"
 107 #include "opt_inet6.h"
 108 #include "opt_compat.h"
 109
 110 #include <sys/param.h>
 111 #include <sys/systm.h>
 112 #include <sys/fcntl.h>
 113 #include <sys/limits.h>
 114 #include <sys/lock.h>
 115 #include <sys/mac.h>
 116 #include <sys/malloc.h>
 117 #include <sys/mbuf.h>
 118 #include <sys/mutex.h>
 119 #include <sys/domain.h>
 120 #include <sys/file.h>                   /* for struct knote */
 121 #include <sys/hhook.h>
 122 #include <sys/kernel.h>
 123 #include <sys/khelp.h>
 124 #include <sys/event.h>
 125 #include <sys/eventhandler.h>
 126 #include <sys/poll.h>
 127 #include <sys/proc.h>
 128 #include <sys/protosw.h>
 129 #include <sys/socket.h>
 130 #include <sys/socketvar.h>
 131 #include <sys/resourcevar.h>
 132 #include <net/route.h>
 133 #include <sys/signalvar.h>
 134 #include <sys/stat.h>
 135 #include <sys/sx.h>
 136 #include <sys/sysctl.h>
 137 #include <sys/uio.h>
 138 #include <sys/jail.h>
 139 #include <sys/syslog.h>
 140 #include <netinet/in.h>
 141
 142 #include <net/vnet.h>
 143
 144 #include <security/mac/mac_framework.h>
 145
 146 #include <vm/uma.h>
 147
 148 #ifdef COMPAT_FREEBSD32
 149 #include <sys/mount.h>
 150 #include <sys/sysent.h>
 151 #include <compat/freebsd32/freebsd32.h>
 152 #endif
 153
 154 static int      soreceive_rcvoob(struct socket *so, struct uio *uio,
 155                     int flags);
 156
     /*
      * kqueue filter callbacks.  They are installed in the filterops tables
      * defined just below.
      */
 157 static void     filt_sordetach(struct knote *kn);
 158 static int      filt_soread(struct knote *kn, long hint);
 159 static void     filt_sowdetach(struct knote *kn);
 160 static int      filt_sowrite(struct knote *kn, long hint);
 161 static int      filt_solisten(struct knote *kn, long hint);
     /*
      * Forward declaration; the definition is outside this part of the file.
      * soalloc() and sodealloc() call it with HHOOK_SOCKET_CREATE and
      * HHOOK_SOCKET_CLOSE respectively, and soalloc() treats a non-zero return
      * as failure.
      */
 162 static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
 163
     /*
      * Filter operation tables for sockets.  Each table pairs a detach
      * callback with an event callback; the listen and read tables share
      * filt_sordetach(), the write table uses filt_sowdetach().
      */
 164 static struct filterops solisten_filtops = {
 165         .f_isfd = 1,
 166         .f_detach = filt_sordetach,
 167         .f_event = filt_solisten,
 168 };
 169 static struct filterops soread_filtops = {
 170         .f_isfd = 1,
 171         .f_detach = filt_sordetach,
 172         .f_event = filt_soread,
 173 };
 174 static struct filterops sowrite_filtops = {
 175         .f_isfd = 1,
 176         .f_detach = filt_sowdetach,
 177         .f_event = filt_sowrite,
 178 };
 179
 180 so_gen_t        so_gencnt;      /* generation count for sockets */
 181
     /* malloc types named "soname" and "pcb". */
 182 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
 183 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 184
     /*
      * Assert that curvnet is non-NULL.  The `so' argument is only used in
      * the assertion message, alongside the calling function and line.
      */
 185 #define VNET_SO_ASSERT(so)                                              \
 186         VNET_ASSERT(curvnet != NULL,                                    \
 187             ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
 188
     /*
      * Per-vnet array of socket hhook head pointers, one slot for each hook
      * subtype in 0..HHOOK_SOCKET_LAST.  The slots are filled in by
      * socket_hhook_register().
      */
 189 VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]);
 190 #define V_socket_hhh            VNET(socket_hhh)
 191
 192 /*
 193  * Limit on the number of connections in the listen queue waiting
 194  * for accept(2).
 195  * NB: The original sysctl somaxconn is still available but hidden
 196  * to prevent confusion about the actual purpose of this number.
 197  */
 198 static int somaxconn = SOMAXCONN;
 199
     /*
      * Sysctl handler shared by the soacceptqueue node and the compat
      * somaxconn node declared below.  It exports the current somaxconn and,
      * when a new value is supplied, stores it only if it lies in
      * [1, USHRT_MAX]; any other value returns EINVAL and leaves somaxconn
      * unchanged.
      */
 200 static int
 201 sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
 202 {
 203         int error;
 204         int val;
 205
             /* Work on a copy so a rejected write cannot disturb somaxconn. */
 206         val = somaxconn;
 207         error = sysctl_handle_int(oidp, &val, 0, req);
             /* Stop on a handler error or when the request carries no new value. */
 208         if (error || !req->newptr )
 209                 return (error);
 210
 211         if (val < 1 || val > USHRT_MAX)
 212                 return (EINVAL);
 213
 214         somaxconn = val;
 215         return (0);
 216 }
 217 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW,
 218     0, sizeof(int), sysctl_somaxconn, "I",
 219     "Maximum listen socket pending connection accept queue size");
     /* Compat node for the same value; registered with CTLFLAG_SKIP. */
 220 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
 221     CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP,
 222     0, sizeof(int), sysctl_somaxconn, "I",
 223     "Maximum listen socket pending connection accept queue size (compat)");
 224
 225 static int numopensockets;
     /*
      * Read-only export of numopensockets.  The counter is incremented in
      * soalloc() and decremented in sodealloc(), both under so_global_mtx.
      */
 226 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
 227     &numopensockets, 0, "Number of open sockets");
 228
 229 /*
 230  * accept_mtx locks down per-socket fields relating to accept queues.  See
 231  * socketvar.h for an annotation of the protected fields of struct socket.
 232  */
 233 struct mtx accept_mtx;
 234 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
 235
 236 /*
 237  * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 238  * so_gencnt field.
      *
      * NOTE(review): the lock name string below reads "so_glabel", which looks
      * like a typo for "so_global".  It is a runtime string, so confirm nothing
      * depends on it before renaming.
 239  */
 240 static struct mtx so_global_mtx;
 241 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
 242
 243 /*
 244  * General IPC sysctl name space, used by sockets and a variety of other IPC
 245  * types.
 246  */
 247 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
 248
 249 /*
 250  * Initialize the socket subsystem and set up the socket
 251  * memory allocator.
      *
      * socket_zone is the UMA zone that soalloc() allocates struct socket
      * from; maxsockets is the limit applied to that zone.
 252  */
 253 static uma_zone_t socket_zone;
 254 int     maxsockets;
 255
     /*
      * Handler registered by socket_init() for maxsockets_change events.
      * Re-applies maxsockets as the zone limit and stores back whatever
      * uma_zone_set_max() returns.  The tag argument is unused.
      */
 256 static void
 257 socket_zone_change(void *tag)
 258 {
 259
 260         maxsockets = uma_zone_set_max(socket_zone, maxsockets);
 261 }
 262
     /*
      * Register the socket hhook head for `subtype' and have the resulting
      * head pointer stored in V_socket_hhh[subtype].  A registration failure
      * is only reported with printf(); nothing is returned to the caller.
      */
 263 static void
 264 socket_hhook_register(int subtype)
 265 {
 266
 267         if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype,
 268             &V_socket_hhh[subtype],
 269             HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 270                 printf("%s: WARNING: unable to register hook\n", __func__);
 271 }
 272
     /*
      * Deregister the socket hhook head stored in V_socket_hhh[subtype].  A
      * failure is only reported with printf(); the array slot itself is not
      * modified here.
      */
 273 static void
 274 socket_hhook_deregister(int subtype)
 275 {
 276
 277         if (hhook_head_deregister(V_socket_hhh[subtype]) != 0)
 278                 printf("%s: WARNING: unable to deregister hook\n", __func__);
 279 }
 280
     /*
      * One-time socket layer setup, run through the SYSINIT below.  Creates
      * the "socket" UMA zone sized for struct socket, applies maxsockets as
      * its limit (keeping the value uma_zone_set_max() returns), sets the
      * warning text used for the zone limit, and registers
      * socket_zone_change() for maxsockets_change events.  The tag argument
      * is unused.
      */
 281 static void
 282 socket_init(void *tag)
 283 {
 284
 285         socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
 286             NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 287         maxsockets = uma_zone_set_max(socket_zone, maxsockets);
 288         uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
 289         EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
 290             EVENTHANDLER_PRI_FIRST);
 291 }
 292 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
 293
     /*
      * Per-vnet initialisation, run through the VNET_SYSINIT below: register
      * one socket hhook head for every subtype in 0..HHOOK_SOCKET_LAST.
      */
 294 static void
 295 socket_vnet_init(const void *unused __unused)
 296 {
 297         int i;
 298
 299         /* We expect a contiguous range */
 300         for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
 301                 socket_hhook_register(i);
 302 }
 303 VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
 304     socket_vnet_init, NULL);
 305
     /*
      * Per-vnet teardown, run through the VNET_SYSUNINIT below: deregister
      * the socket hhook head of every subtype in 0..HHOOK_SOCKET_LAST, the
      * same range socket_vnet_init() registered.
      */
 306 static void
 307 socket_vnet_uninit(const void *unused __unused)
 308 {
 309         int i;
 310
 311         for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
 312                 socket_hhook_deregister(i);
 313 }
 314 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
 315     socket_vnet_uninit, NULL);
 316
 317 /*
 318  * Initialise maxsockets.  This SYSINIT must be run after
 319  * tunable_mbinit().
      *
      * The kern.ipc.maxsockets tunable is fetched into maxsockets, and the
      * result is then raised to at least maxfiles.  The argument is unused.
 320  */
 321 static void
 322 init_maxsockets(void *ignored)
 323 {
 324
 325         TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
             /* Never start with fewer sockets than maxfiles. */
 326         maxsockets = imax(maxsockets, maxfiles);
 327 }
 328 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
 329
 330 /*
 331  * Sysctl to get and set the maximum global sockets limit.  Notify protocols
 332  * of the change so that they can update their dependent limits as required.
      *
      * A written value is accepted only when it is strictly greater than the
      * current maxsockets and no greater than maxfiles.  Any other written
      * value, including the current one, is rejected with EINVAL, so this
      * handler can only raise the limit.  On acceptance the maxsockets_change
      * event handlers are invoked.
 333  */
 334 static int
 335 sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
 336 {
 337         int error, newmaxsockets;
 338
             /* Work on a copy so a rejected write cannot disturb maxsockets. */
 339         newmaxsockets = maxsockets;
 340         error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
             /* Only validate when the handler succeeded and a new value was given. */
 341         if (error == 0 && req->newptr) {
 342                 if (newmaxsockets > maxsockets &&
 343                     newmaxsockets <= maxfiles) {
 344                         maxsockets = newmaxsockets;
 345                         EVENTHANDLER_INVOKE(maxsockets_change);
 346                 } else
 347                         error = EINVAL;
 348         }
 349         return (error);
 350 }
     /*
      * NOTE(review): the description string below spells "avaliable".  It is
      * a runtime string and is left untouched here.
      */
 351 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
 352     &maxsockets, 0, sysctl_maxsockets, "IU",
 353     "Maximum number of sockets avaliable");
 354
 355 /*
 356  * Socket operation routines.  These routines are called by the routines in
 357  * sys_socket.c or from a system process, and implement the semantics of
 358  * socket operations by switching out to the protocol specific routines.
 359  */
 360
 361 /*
 362  * Get a socket structure from our zone, and initialize it.  Note that it
 363  * would probably be better to allocate socket and PCB at the same time, but
 364  * I'm not convinced that all the protocols can be easily modified to do
 365  * this.
 366  *
 367  * soalloc() returns a socket with a ref count of 0.
      *
      * vnet is recorded in so_vnet (VIMAGE kernels) and is the vnet context in
      * which the HHOOK_SOCKET_CREATE hooks are run.
      *
      * Returns NULL when the zone allocation, the MAC label initialisation,
      * the khelp OSD initialisation, or a create hook fails.  Every failure
      * path undoes what had been set up so far, so a NULL return leaves no
      * zone item, MAC label, OSD, lock or socket count behind.
 368  */
 369 static struct socket *
 370 soalloc(struct vnet *vnet)
 371 {
 372         struct socket *so;
             int hookerr;
 373
 374         so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
 375         if (so == NULL)
 376                 return (NULL);
 377 #ifdef MAC
 378         if (mac_socket_init(so, M_NOWAIT) != 0) {
 379                 uma_zfree(socket_zone, so);
 380                 return (NULL);
 381         }
 382 #endif
 383         if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) {
     #ifdef MAC
                     /* Do not leak what mac_socket_init() set up above. */
                     mac_socket_destroy(so);
     #endif
 384                 uma_zfree(socket_zone, so);
 385                 return (NULL);
 386         }
 387
 388         SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
 389         SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
 390         sx_init(&so->so_snd.sb_sx, "so_snd_sx");
 391         sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
 392         TAILQ_INIT(&so->so_aiojobq);
             /* Global bookkeeping, protected by so_global_mtx. */
 393         mtx_lock(&so_global_mtx);
 394         so->so_gencnt = ++so_gencnt;
 395         ++numopensockets;
 396 #ifdef VIMAGE
 397         VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
 398             __func__, __LINE__, so));
 399         vnet->vnet_sockcnt++;
 400         so->so_vnet = vnet;
 401 #endif
 402         mtx_unlock(&so_global_mtx);
 403
 404         CURVNET_SET(vnet);
 405         /* We shouldn't need the so_global_mtx */
             /*
              * socket_hhook_register() only warns when registration fails, so
              * the head pointer may be unset; check it before reading
              * hhh_nhooks.
              */
             hookerr = 0;
             if (V_socket_hhh[HHOOK_SOCKET_CREATE] != NULL &&
                 V_socket_hhh[HHOOK_SOCKET_CREATE]->hhh_nhooks > 0)
                     hookerr = hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE);
 411         CURVNET_RESTORE();
             if (hookerr == 0)
                     return (so);
 412
             /*
              * A create hook rejected the socket.  Do we need more
              * comprehensive error returns?  For now report plain failure, but
              * first undo everything done above, mirroring the socket-layer
              * part of sodealloc().  sodealloc() itself cannot be used here:
              * so_cred has not been set yet and the close hooks must not run
              * for a socket that was never handed out.
              */
             mtx_lock(&so_global_mtx);
             so->so_gencnt = ++so_gencnt;
             --numopensockets;
     #ifdef VIMAGE
             vnet->vnet_sockcnt--;
     #endif
             mtx_unlock(&so_global_mtx);
     #ifdef MAC
             mac_socket_destroy(so);
     #endif
             khelp_destroy_osd(&so->osd);
             sx_destroy(&so->so_snd.sb_sx);
             sx_destroy(&so->so_rcv.sb_sx);
             SOCKBUF_LOCK_DESTROY(&so->so_snd);
             SOCKBUF_LOCK_DESTROY(&so->so_rcv);
             uma_zfree(socket_zone, so);
             return (NULL);
 414 }
 415
 416 /*
 417  * Free the storage associated with a socket at the socket layer, tear down
 418  * locks, labels, etc.  All protocol state is assumed already to have been
 419  * torn down (and possibly never set up) by the caller.
      *
      * Asserted preconditions: so_count is 0 and so_pcb is NULL.
      *
      * NOTE(review): so_cred is passed to crfree() unconditionally, so callers
      * presumably must have set it before calling; confirm.
 420  */
 421 static void
 422 sodealloc(struct socket *so)
 423 {
 424
 425         KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
 426         KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
 427
             /*
              * Global bookkeeping under so_global_mtx: bump the generation
              * count and drop the open-socket counts.
              */
 428         mtx_lock(&so_global_mtx);
 429         so->so_gencnt = ++so_gencnt;
 430         --numopensockets;       /* Could be below, but faster here. */
 431 #ifdef VIMAGE
 432         VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
 433             __func__, __LINE__, so));
 434         so->so_vnet->vnet_sockcnt--;
 435 #endif
 436         mtx_unlock(&so_global_mtx);
             /*
              * Reset any non-zero buffer high-water mark to 0 through
              * chgsbsize(), against the uidinfo of the socket's credential.
              */
 437         if (so->so_rcv.sb_hiwat)
 438                 (void)chgsbsize(so->so_cred->cr_uidinfo,
 439                     &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
 440         if (so->so_snd.sb_hiwat)
 441                 (void)chgsbsize(so->so_cred->cr_uidinfo,
 442                     &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
 443         /* remove accept filter if one is present. */
 444         if (so->so_accf != NULL)
 445                 do_setopt_accept_filter(so, NULL);
 446 #ifdef MAC
 447         mac_socket_destroy(so);
 448 #endif
             /* Run the close hooks, if any, in the socket's own vnet context. */
 449         CURVNET_SET(so->so_vnet);
 450         if (V_socket_hhh[HHOOK_SOCKET_CLOSE]->hhh_nhooks > 0)
 451                 hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE);
 452         CURVNET_RESTORE();
 453
             /*
              * Release the credential, then the per-socket state soalloc() set
              * up (OSD, sx locks, sockbuf locks), and finally the zone item.
              */
 454         crfree(so->so_cred);
 455         khelp_destroy_osd(&so->osd);
 456         sx_destroy(&so->so_snd.sb_sx);
 457         sx_destroy(&so->so_rcv.sb_sx);
 458         SOCKBUF_LOCK_DESTROY(&so->so_snd);
 459         SOCKBUF_LOCK_DESTROY(&so->so_rcv);
 460         uma_zfree(socket_zone, so);
 461 }
 462
 463 /*
 464  * socreate returns a socket with a ref count of 1.  The socket should be
 465  * closed with soclose().
      *
      * dom, type and proto select the protocol switch entry: pffindproto() is
      * used when proto is non-zero, pffindtype() otherwise.  cred is held
      * (crhold) as the socket's credential and supplies the vnet handed to
      * soalloc().  td is passed to pru_attach() and supplies the FIB number
      * for PF_INET, PF_INET6 and PF_ROUTE sockets; other families get FIB 0.
      *
      * On success 0 is returned and *aso points to the new socket.  On
      * failure *aso is not written and one of the following is returned:
      *   EAFNOSUPPORT    - the lookup failed and no domain matches dom.
      *   EPROTOTYPE      - the lookup failed with proto == 0 and type != 0,
      *                     or the entry found has a pr_type other than type.
      *   EPROTONOSUPPORT - any other lookup failure, an entry whose
      *                     pru_attach is NULL or pru_attach_notsupp, or a
      *                     family rejected by prison_check_af().
      *   ENOBUFS         - soalloc() returned NULL.
      *   other           - whatever the protocol's pru_attach() returned; the
      *                     half-built socket is released with sodealloc().
 466  */
 467 int
 468 socreate(int dom, struct socket **aso, int type, int proto,
 469     struct ucred *cred, struct thread *td)
 470 {
 471         struct protosw *prp;
 472         struct socket *so;
 473         int error;
 474
 475         if (proto)
 476                 prp = pffindproto(dom, proto, type);
 477         else
 478                 prp = pffindtype(dom, type);
 479
 480         if (prp == NULL) {
 481                 /* No support for domain. */
 482                 if (pffinddomain(dom) == NULL)
 483                         return (EAFNOSUPPORT);
 484                 /* No support for socket type. */
 485                 if (proto == 0 && type != 0)
 486                         return (EPROTOTYPE);
 487                 return (EPROTONOSUPPORT);
 488         }
             /* The protocol must provide a real attach method. */
 489         if (prp->pr_usrreqs->pru_attach == NULL ||
 490             prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
 491                 return (EPROTONOSUPPORT);
 492
 493         if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
 494                 return (EPROTONOSUPPORT);
 495
 496         if (prp->pr_type != type)
 497                 return (EPROTOTYPE);
 498         so = soalloc(CRED_TO_VNET(cred));
 499         if (so == NULL)
 500                 return (ENOBUFS);
 501
 502         TAILQ_INIT(&so->so_incomp);
 503         TAILQ_INIT(&so->so_comp);
 504         so->so_type = type;
 505         so->so_cred = crhold(cred);
 506         if ((prp->pr_domain->dom_family == PF_INET) ||
 507             (prp->pr_domain->dom_family == PF_INET6) ||
 508             (prp->pr_domain->dom_family == PF_ROUTE))
 509                 so->so_fibnum = td->td_proc->p_fibnum;
 510         else
 511                 so->so_fibnum = 0;
 512         so->so_proto = prp;
 513 #ifdef MAC
 514         mac_socket_create(cred, so);
 515 #endif
 516         knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
 517         knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
             /* The single reference returned to the caller. */
 518         so->so_count = 1;
 519         /*
 520          * Auto-sizing of socket buffers is managed by the protocols and
 521          * the appropriate flags must be set in the pru_attach function.
 522          */
 523         CURVNET_SET(so->so_vnet);
 524         error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
 525         CURVNET_RESTORE();
 526         if (error) {
 527                 KASSERT(so->so_count == 1, ("socreate: so_count %d",
 528                     so->so_count));
                     /* sodealloc() asserts so_count == 0, so drop our reference. */
 529                 so->so_count = 0;
 530                 sodealloc(so);
 531                 return (error);
 532         }
 533         *aso = so;
 534         return (0);
 535 }
 536
 537 #ifdef REGRESSION
     /*
      * Regression-test knob, enabled by default.  When cleared, sonewconn()
      * skips its early listen-queue overflow rejection.
      */
 538 static int regression_sonewconn_earlytest = 1;
 539 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
 540     &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
 541 #endif
 542
 543 /*
 544  * When an attempt at a new connection is noted on a socket which accepts
 545  * connections, sonewconn is called.  If the connection is possible (subject
 546  * to space constraints, etc.) then we allocate a new structure, properly
 547  * linked into the data structure of the original socket, and return this.
 548  * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
 549  *
 550  * Note: the ref count on the socket is 0 on return.
      *
      * A non-zero connstatus places the new socket on the head's completed
      * queue and wakes up the head; a zero connstatus places it on the
      * incomplete queue.  If the head has SO_ACCEPTFILTER set, connstatus is
      * forced to 0.
      *
      * Returns NULL when the head's queue is over its limit, when soalloc(),
      * soreserve() or pru_attach() fails, or when the head is found to be no
      * longer accepting connections once the accept lock is taken.
 551  */
 552 struct socket *
 553 sonewconn(struct socket *head, int connstatus)
 554 {
             /*
              * State for rate-limiting the overflow log message to one per
              * overinterval (60 seconds).
              * NOTE(review): these statics are updated without any lock visible
              * in this function; confirm that is acceptable.
              */
 555         static struct timeval lastover;
 556         static struct timeval overinterval = { 60, 0 };
 557         static int overcount;
 558
 559         struct socket *so;
 560         int over;
 561
             /* Over the limit once so_qlen exceeds 3/2 of so_qlimit (integer math). */
 562         ACCEPT_LOCK();
 563         over = (head->so_qlen > 3 * head->so_qlimit / 2);
 564         ACCEPT_UNLOCK();
 565 #ifdef REGRESSION
 566         if (regression_sonewconn_earlytest && over) {
 567 #else
 568         if (over) {
 569 #endif
 570                 overcount++;
 571
 572                 if (ratecheck(&lastover, &overinterval)) {
 573                         log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
 574                             "%i already in queue awaiting acceptance "
 575                             "(%d occurrences)\n",
 576                             __func__, head->so_pcb, head->so_qlen, overcount);
 577
 578                         overcount = 0;
 579                 }
 580
 581                 return (NULL);
 582         }
 583         VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
 584             __func__, __LINE__, head));
 585         so = soalloc(head->so_vnet);
 586         if (so == NULL) {
 587                 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
 588                     "limit reached or out of memory\n",
 589                     __func__, head->so_pcb);
 590                 return (NULL);
 591         }
             /* With an accept filter on the head, start on the incomplete queue. */
 592         if ((head->so_options & SO_ACCEPTFILTER) != 0)
 593                 connstatus = 0;
             /*
              * Inherit type, options (minus SO_ACCEPTCONN), linger, state, FIB,
              * protocol and credential from the listening socket.
              */
 594         so->so_head = head;
 595         so->so_type = head->so_type;
 596         so->so_options = head->so_options &~ SO_ACCEPTCONN;
 597         so->so_linger = head->so_linger;
 598         so->so_state = head->so_state | SS_NOFDREF;
 599         so->so_fibnum = head->so_fibnum;
 600         so->so_proto = head->so_proto;
 601         so->so_cred = crhold(head->so_cred);
 602 #ifdef MAC
 603         mac_socket_newconn(head, so);
 604 #endif
 605         knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
 606         knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
 607         VNET_SO_ASSERT(head);
             /* Reserve the same buffer high-water marks as the head. */
 608         if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
 609                 sodealloc(so);
 610                 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
 611                     __func__, head->so_pcb);
 612                 return (NULL);
 613         }
 614         if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
 615                 sodealloc(so);
 616                 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
 617                     __func__, head->so_pcb);
 618                 return (NULL);
 619         }
             /* Copy low-water marks, timeouts and the SB_AUTOSIZE flag. */
 620         so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
 621         so->so_snd.sb_lowat = head->so_snd.sb_lowat;
 622         so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
 623         so->so_snd.sb_timeo = head->so_snd.sb_timeo;
 624         so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
 625         so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
 626         so->so_state |= connstatus;
 627         ACCEPT_LOCK();
 628         /*
 629          * The accept socket may be tearing down but we just
 630          * won a race on the ACCEPT_LOCK.
 631          * However, if sctp_peeloff() is called on a 1-to-many
 632          * style socket, the SO_ACCEPTCONN doesn't need to be set.
 633          */
 634         if (!(head->so_options & SO_ACCEPTCONN) &&
 635             ((head->so_proto->pr_protocol != IPPROTO_SCTP) ||
 636              (head->so_type != SOCK_SEQPACKET))) {
 637                 SOCK_LOCK(so);
 638                 so->so_head = NULL;
 639                 sofree(so);             /* NB: returns ACCEPT_UNLOCK'ed. */
 640                 return (NULL);
 641         }
 642         if (connstatus) {
                     /* Already connected or confirming: completed queue. */
 643                 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
 644                 so->so_qstate |= SQ_COMP;
 645                 head->so_qlen++;
 646         } else {
 647                 /*
 648                  * Keep removing sockets from the head until there's room for
 649                  * us to insert on the tail.  In pre-locking revisions, this
 650                  * was a simple if(), but as we could be racing with other
 651                  * threads and soabort() requires dropping locks, we must
 652                  * loop waiting for the condition to be true.
 653                  */
 654                 while (head->so_incqlen > head->so_qlimit) {
 655                         struct socket *sp;
 656                         sp = TAILQ_FIRST(&head->so_incomp);
 657                         TAILQ_REMOVE(&head->so_incomp, sp, so_list);
 658                         head->so_incqlen--;
 659                         sp->so_qstate &= ~SQ_INCOMP;
 660                         sp->so_head = NULL;
 661                         ACCEPT_UNLOCK();
 662                         soabort(sp);
 663                         ACCEPT_LOCK();
 664                 }
 665                 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
 666                 so->so_qstate |= SQ_INCOMP;
 667                 head->so_incqlen++;
 668         }
 669         ACCEPT_UNLOCK();
             /* A completed connection is ready: wake up the listening socket. */
 670         if (connstatus) {
 671                 sorwakeup(head);
 672                 wakeup_one(&head->so_timeo);
 673         }
 674         return (so);
 675 }
 676
     /*
      * Bind the address `nam' to the socket by calling the protocol's
      * pru_bind method with the socket's vnet set as the current vnet.
      * Returns whatever pru_bind returns.
      */
 677 int
 678 sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
 679 {
 680         int error;
 681
 682         CURVNET_SET(so->so_vnet);
 683         error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
 684         CURVNET_RESTORE();
 685         return (error);
 686 }
 687
     /*
      * Variant of sobind() that additionally passes the descriptor `fd'
      * through to the protocol's pru_bindat method.  The call is made with
      * the socket's vnet set as the current vnet.  Returns whatever
      * pru_bindat returns.
      */
 688 int
 689 sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
 690 {
 691         int error;
 692
 693         CURVNET_SET(so->so_vnet);
 694         error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td);
 695         CURVNET_RESTORE();
 696         return (error);
 697 }
 698
 699 /*
 700  * solisten() transitions a socket from a non-listening state to a listening
 701  * state, but can also be used to update the listen queue depth on an
 702  * existing listen socket.  The protocol will call back into the sockets
 703  * layer using solisten_proto_check() and solisten_proto() to check and set
 704  * socket-layer listen state.  Call backs are used so that the protocol can
 705  * acquire both protocol and socket layer locks in whatever order is required
 706  * by the protocol.
 707  *
 708  * Protocol implementors are advised to hold the socket lock across the
 709  * socket-layer test and set to avoid races at the socket layer.
      *
      * This function itself only calls the protocol's pru_listen method, with
      * the socket's vnet set as the current vnet, and returns its result.
 710  */
 711 int
 712 solisten(struct socket *so, int backlog, struct thread *td)
 713 {
 714         int error;
 715
 716         CURVNET_SET(so->so_vnet);
 717         error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
 718         CURVNET_RESTORE();
 719         return (error);
 720 }
 721
     /*
      * Socket-layer check made before a socket is put into the listening
      * state.  Returns EINVAL if the socket state has any of SS_ISCONNECTED,
      * SS_ISCONNECTING or SS_ISDISCONNECTING set, and 0 otherwise.  The caller
      * is expected to satisfy SOCK_LOCK_ASSERT().
      */
 722 int
 723 solisten_proto_check(struct socket *so)
 724 {
 725
 726         SOCK_LOCK_ASSERT(so);
 727
 728         if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
 729             SS_ISDISCONNECTING))
 730                 return (EINVAL);
 731         return (0);
 732 }
 733
     /*
      * Set the socket-layer listen state: store the queue limit and mark the
      * socket as accepting connections.  A backlog that is negative or larger
      * than somaxconn is replaced by somaxconn.  The caller is expected to
      * satisfy SOCK_LOCK_ASSERT().
      */
 734 void
 735 solisten_proto(struct socket *so, int backlog)
 736 {
 737
 738         SOCK_LOCK_ASSERT(so);
 739
 740         if (backlog < 0 || backlog > somaxconn)
 741                 backlog = somaxconn;
 742         so->so_qlimit = backlog;
 743         so->so_options |= SO_ACCEPTCONN;
 744 }
 745
 746 /*
 747  * Evaluate the reference count and named references on a socket; if no
 748  * references remain, free it.  This should be called whenever a reference is
 749  * released, such as in sorele(), but also when named reference flags are
 750  * cleared in socket or protocol code.
 751  *
 752  * sofree() will free the socket if:
 753  *
 754  * - There are no outstanding file descriptor references or related consumers
 755  *   (so_count == 0).
 756  *
 757  * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 758  *
 759  * - The protocol does not have an outstanding strong reference on the socket
 760  *   (SS_PROTOREF).
 761  *
 762  * - The socket is not in a completed connection queue, so a process has been
 763  *   notified that it is present.  If it is removed, the user process may
 764  *   block in accept() despite select() saying the socket was ready.
      *
      * Locking: entered with both the accept lock and the socket lock held
      * (asserted); both are released on every path before returning.
 765  */
 766 void
 767 sofree(struct socket *so)
 768 {
 769         struct protosw *pr = so->so_proto;
 770         struct socket *head;
 771
 772         ACCEPT_LOCK_ASSERT();
 773         SOCK_LOCK_ASSERT(so);
 774
             /* Still referenced in one of the ways listed above: just unlock. */
 775         if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
 776             (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
 777                 SOCK_UNLOCK(so);
 778                 ACCEPT_UNLOCK();
 779                 return;
 780         }
 781
             /*
              * SQ_COMP sockets returned above, so a socket that still has a
              * head here is unlinked from the head's incomplete queue.
              */
 782         head = so->so_head;
 783         if (head != NULL) {
 784                 KASSERT((so->so_qstate & SQ_COMP) != 0 ||
 785                     (so->so_qstate & SQ_INCOMP) != 0,
 786                     ("sofree: so_head != NULL, but neither SQ_COMP nor "
 787                     "SQ_INCOMP"));
 788                 KASSERT((so->so_qstate & SQ_COMP) == 0 ||
 789                     (so->so_qstate & SQ_INCOMP) == 0,
 790                     ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
 791                 TAILQ_REMOVE(&head->so_incomp, so, so_list);
 792                 head->so_incqlen--;
 793                 so->so_qstate &= ~SQ_INCOMP;
 794                 so->so_head = NULL;
 795         }
 796         KASSERT((so->so_qstate & SQ_COMP) == 0 &&
 797             (so->so_qstate & SQ_INCOMP) == 0,
 798             ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
 799             so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
             /* A listening socket must have empty queues by the time it is freed. */
 800         if (so->so_options & SO_ACCEPTCONN) {
 801                 KASSERT((TAILQ_EMPTY(&so->so_comp)),
 802                     ("sofree: so_comp populated"));
 803                 KASSERT((TAILQ_EMPTY(&so->so_incomp)),
 804                     ("sofree: so_incomp populated"));
 805         }
 806         SOCK_UNLOCK(so);
 807         ACCEPT_UNLOCK();
 808
 809         VNET_SO_ASSERT(so);
             /*
              * For PR_RIGHTS protocols whose domain provides dom_dispose, hand
              * the receive buffer's mbuf chain to it, then let the protocol
              * detach.
              */
 810         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
 811                 (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
 812         if (pr->pr_usrreqs->pru_detach != NULL)
 813                 (*pr->pr_usrreqs->pru_detach)(so);
 814
 815         /*
 816          * From this point on, we assume that no other references to this
 817          * socket exist anywhere else in the stack.  Therefore, no locks need
 818          * to be acquired or held.
 819          *
 820          * We used to do a lot of socket buffer and socket locking here, as
 821          * well as invoke sorflush() and perform wakeups.  The direct call to
 822          * dom_dispose() and sbrelease_internal() are an inlining of what was
 823          * necessary from sorflush().
 824          *
 825          * Notice that the socket buffer and kqueue state are torn down
 826          * before calling pru_detach.  This means that protocols should not
 827          * assume they can perform socket wakeups, etc, in their detach code.
              *
              * NOTE(review): as the code is written, pru_detach() is called
              * above, before the sbdestroy()/seldrain()/knlist_destroy() calls
              * below, so the preceding paragraph does not match the statement
              * order.  Confirm which is intended.
 828          */
 829         sbdestroy(&so->so_snd, so);
 830         sbdestroy(&so->so_rcv, so);
 831         seldrain(&so->so_snd.sb_sel);
 832         seldrain(&so->so_rcv.sb_sel);
 833         knlist_destroy(&so->so_rcv.sb_sel.si_note);
 834         knlist_destroy(&so->so_snd.sb_sel.si_note);
 835         sodealloc(so);
 836 }
 837
 838 /*
 839  * Close a socket on last file table reference removal.  Initiate disconnect
 840  * if connected.  Free socket when disconnect complete.
 841  *
 842  * This function will sorele() the socket.  Note that soclose() may be called
 843  * prior to the ref count reaching zero.  The actual socket structure will
 844  * not be freed until the ref count reaches zero.
 845  */
 846 int
 847 soclose(struct socket *so)
 848 {
 849         int error = 0;
 850
 851         KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
 852
 853         CURVNET_SET(so->so_vnet);
 854         funsetown(&so->so_sigio);
 855         if (so->so_state & SS_ISCONNECTED) {
 856                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
 857                         error = sodisconnect(so);
 858                         if (error) {
 859                                 if (error == ENOTCONN)
 860                                         error = 0;
 861                                 goto drop;
 862                         }
 863                 }
 864                 if (so->so_options & SO_LINGER) {
 865                         if ((so->so_state & SS_ISDISCONNECTING) &&
 866                             (so->so_state & SS_NBIO))
 867                                 goto drop;
 868                         while (so->so_state & SS_ISCONNECTED) {
 869                                 error = tsleep(&so->so_timeo,
 870                                     PSOCK | PCATCH, "soclos",
 871                                     so->so_linger * hz);
 872                                 if (error)
 873                                         break;
 874                         }
 875                 }
 876         }
 877
 878 drop:
 879         if (so->so_proto->pr_usrreqs->pru_close != NULL)
 880                 (*so->so_proto->pr_usrreqs->pru_close)(so);
 881         ACCEPT_LOCK();
 882         if (so->so_options & SO_ACCEPTCONN) {
 883                 struct socket *sp;
 884                 /*
 885                  * Prevent new additions to the accept queues due
 886                  * to ACCEPT_LOCK races while we are draining them.
 887                  */
 888                 so->so_options &= ~SO_ACCEPTCONN;
 889                 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
 890                         TAILQ_REMOVE(&so->so_incomp, sp, so_list);
 891                         so->so_incqlen--;
 892                         sp->so_qstate &= ~SQ_INCOMP;
 893                         sp->so_head = NULL;
 894                         ACCEPT_UNLOCK();
 895                         soabort(sp);
 896                         ACCEPT_LOCK();
 897                 }
 898                 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
 899                         TAILQ_REMOVE(&so->so_comp, sp, so_list);
 900                         so->so_qlen--;
 901                         sp->so_qstate &= ~SQ_COMP;
 902                         sp->so_head = NULL;
 903                         ACCEPT_UNLOCK();
 904                         soabort(sp);
 905                         ACCEPT_LOCK();
 906                 }
 907                 KASSERT((TAILQ_EMPTY(&so->so_comp)),
 908                     ("%s: so_comp populated", __func__));
 909                 KASSERT((TAILQ_EMPTY(&so->so_incomp)),
 910                     ("%s: so_incomp populated", __func__));
 911         }
 912         SOCK_LOCK(so);
 913         KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
 914         so->so_state |= SS_NOFDREF;
 915         sorele(so);                     /* NB: Returns with ACCEPT_UNLOCK(). */
 916         CURVNET_RESTORE();
 917         return (error);
 918 }
 919
 920 /*
 921  * soabort() is used to abruptly tear down a connection, such as when a
 922  * resource limit is reached (listen queue depth exceeded), or if a listen
 923  * socket is closed while there are sockets waiting to be accepted.
 924  *
 925  * This interface is tricky, because it is called on an unreferenced socket,
 926  * and must be called only by a thread that has actually removed the socket
 927  * from the listen queue it was on, or races with other threads are risked.
 928  *
 929  * This interface will call into the protocol code, so must not be called
 930  * with any socket locks held.  Protocols do call it while holding their own
 931  * recursible protocol mutexes, but this is something that should be subject
 932  * to review in the future.
 933  */
 934 void
 935 soabort(struct socket *so)
 936 {
 937
 938         /*
 939          * In as much as is possible, assert that no references to this
 940          * socket are held.  This is not quite the same as asserting that the
 941          * current thread is responsible for arranging for no references, but
 942          * is as close as we can get for now.
 943          */
 944         KASSERT(so->so_count == 0, ("soabort: so_count"));
 945         KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
 946         KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
 947         KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP"));
 948         KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
 949         VNET_SO_ASSERT(so);
 950
 951         if (so->so_proto->pr_usrreqs->pru_abort != NULL)
 952                 (*so->so_proto->pr_usrreqs->pru_abort)(so);
 953         ACCEPT_LOCK();
 954         SOCK_LOCK(so);
 955         sofree(so);
 956 }
 957
 958 int
 959 soaccept(struct socket *so, struct sockaddr **nam)
 960 {
 961         int error;
 962
 963         SOCK_LOCK(so);
 964         KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
 965         so->so_state &= ~SS_NOFDREF;
 966         SOCK_UNLOCK(so);
 967
 968         CURVNET_SET(so->so_vnet);
 969         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
 970         CURVNET_RESTORE();
 971         return (error);
 972 }
 973
/*
 * Connect a socket to the address 'nam' on behalf of thread 'td'.  This is
 * soconnectat() with AT_FDCWD as the descriptor argument; the return value
 * is whatever soconnectat() returns.
 */
int
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
{

        return (soconnectat(AT_FDCWD, so, nam, td));
}
 980
 981 int
 982 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
 983 {
 984         int error;
 985
 986         if (so->so_options & SO_ACCEPTCONN)
 987                 return (EOPNOTSUPP);
 988
 989         CURVNET_SET(so->so_vnet);
 990         /*
 991          * If protocol is connection-based, can only connect once.
 992          * Otherwise, if connected, try to disconnect first.  This allows
 993          * user to disconnect by connecting to, e.g., a null address.
 994          */
 995         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
 996             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
 997             (error = sodisconnect(so)))) {
 998                 error = EISCONN;
 999         } else {
1000                 /*
1001                  * Prevent accumulated error from previous connection from
1002                  * biting us.
1003                  */
1004                 so->so_error = 0;
1005                 if (fd == AT_FDCWD) {
1006                         error = (*so->so_proto->pr_usrreqs->pru_connect)(so,
1007                             nam, td);
1008                 } else {
1009                         error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd,
1010                             so, nam, td);
1011                 }
1012         }
1013         CURVNET_RESTORE();
1014
1015         return (error);
1016 }
1017
/*
 * Connect two sockets to each other by calling so1's protocol pru_connect2
 * hook with both sockets.  The call is made in so1's vnet context and the
 * hook's return value is passed back unchanged.
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
        int error;

        CURVNET_SET(so1->so_vnet);
        error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
        CURVNET_RESTORE();
        return (error);
}
1028
1029 int
1030 sodisconnect(struct socket *so)
1031 {
1032         int error;
1033
1034         if ((so->so_state & SS_ISCONNECTED) == 0)
1035                 return (ENOTCONN);
1036         if (so->so_state & SS_ISDISCONNECTING)
1037                 return (EALREADY);
1038         VNET_SO_ASSERT(so);
1039         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1040         return (error);
1041 }
1042
/*
 * Translate message flags into the flag argument for sblock(): 0 when
 * MSG_DONTWAIT is set in 'f', SBL_WAIT otherwise.
 */
#define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1044
/*
 * Send one message on a SOCK_DGRAM socket whose protocol is PR_ATOMIC (both
 * properties are asserted).  The data is described by 'uio' if it is
 * non-NULL, otherwise by the mbuf chain 'top'.
 *
 * Unlike sosend_generic(), this path does not call sblock() or sbwait(): a
 * message that does not fit in the space currently available in the send
 * buffer fails with EMSGSIZE.
 *
 * Returns 0 or an errno value: EINVAL (negative residual), EPIPE, a pending
 * so_error, ENOTCONN, EDESTADDRREQ, EMSGSIZE, EFAULT, or whatever the
 * protocol's pru_send hook returns.  On the error paths taken before
 * pru_send, any 'top' and 'control' chains are freed here; after pru_send
 * the local pointers are cleared and not freed (presumably the protocol
 * takes ownership of them).
 */
int
sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
        long space;
        ssize_t resid;
        int clen = 0, error, dontroute;

        KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
        KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
            ("sosend_dgram: !PR_ATOMIC"));

        /* Amount of data to send: from the uio, else from the mbuf chain. */
        if (uio != NULL)
                resid = uio->uio_resid;
        else
                resid = top->m_pkthdr.len;
        /*
         * In theory resid should be unsigned.  However, space must be
         * signed, as it might be less than 0 if we over-committed, and we
         * must use a signed comparison of space and resid.  On the other
         * hand, a negative resid causes us to loop sending 0-length
         * segments to the protocol.
         */
        if (resid < 0) {
                error = EINVAL;
                goto out;
        }

        /*
         * Only toggle SO_DONTROUTE around the send if the caller asked for
         * MSG_DONTROUTE and the option is not already set on the socket.
         */
        dontroute =
            (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
        if (td != NULL)
                td->td_ru.ru_msgsnd++;
        if (control != NULL)
                clen = control->m_len;

        /* State checks and the space computation are done under the lock. */
        SOCKBUF_LOCK(&so->so_snd);
        if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
                SOCKBUF_UNLOCK(&so->so_snd);
                error = EPIPE;
                goto out;
        }
        /* A pending socket error is reported once and then cleared. */
        if (so->so_error) {
                error = so->so_error;
                so->so_error = 0;
                SOCKBUF_UNLOCK(&so->so_snd);
                goto out;
        }
        if ((so->so_state & SS_ISCONNECTED) == 0) {
                /*
                 * `sendto' and `sendmsg' are allowed on a connection-based
                 * socket if it supports implied connect.  Return ENOTCONN if
                 * not connected and no address is supplied.
                 */
                if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
                    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
                        if ((so->so_state & SS_ISCONFIRMING) == 0 &&
                            !(resid == 0 && clen != 0)) {
                                SOCKBUF_UNLOCK(&so->so_snd);
                                error = ENOTCONN;
                                goto out;
                        }
                } else if (addr == NULL) {
                        if (so->so_proto->pr_flags & PR_CONNREQUIRED)
                                error = ENOTCONN;
                        else
                                error = EDESTADDRREQ;
                        SOCKBUF_UNLOCK(&so->so_snd);
                        goto out;
                }
        }

        /*
         * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
         * problem and need fixing.
         */
        space = sbspace(&so->so_snd);
        if (flags & MSG_OOB)
                space += 1024;
        space -= clen;
        SOCKBUF_UNLOCK(&so->so_snd);
        /* The whole message must fit in the space that is left. */
        if (resid > space) {
                error = EMSGSIZE;
                goto out;
        }
        if (uio == NULL) {
                /* Caller-supplied chain: everything is sent in one go. */
                resid = 0;
                if (flags & MSG_EOR)
                        top->m_flags |= M_EOR;
        } else {
                /*
                 * Copy the data from userland into a mbuf chain.
                 * If no data is to be copied in, a single empty mbuf
                 * is returned.
                 */
                top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
                    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
                if (top == NULL) {
                        error = EFAULT; /* only possible error */
                        goto out;
                }
                /* Account for what was copied and refresh the residual. */
                space -= resid - uio->uio_resid;
                resid = uio->uio_resid;
        }
        KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
        /*
         * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
         * than with.
         */
        if (dontroute) {
                SOCK_LOCK(so);
                so->so_options |= SO_DONTROUTE;
                SOCK_UNLOCK(so);
        }
        /*
         * XXX all the SBS_CANTSENDMORE checks previously done could be out
         * of date.  We could have received a reset packet in an interrupt or
         * maybe we slept while doing page faults in uiomove() etc.  We could
         * probably recheck again inside the locking protection here, but
         * there are probably other places that this also happens.  We must
         * rethink this.
         */
        VNET_SO_ASSERT(so);
        error = (*so->so_proto->pr_usrreqs->pru_send)(so,
            (flags & MSG_OOB) ? PRUS_OOB :
        /*
         * If the user set MSG_EOF, the protocol understands this flag and
         * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
         */
            ((flags & MSG_EOF) &&
             (so->so_proto->pr_flags & PR_IMPLOPCL) &&
             (resid <= 0)) ?
                PRUS_EOF :
                /* If there is more to send set PRUS_MORETOCOME */
                (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
                top, addr, control, td);
        /* Undo the temporary SO_DONTROUTE set above. */
        if (dontroute) {
                SOCK_LOCK(so);
                so->so_options &= ~SO_DONTROUTE;
                SOCK_UNLOCK(so);
        }
        /*
         * Clear the local pointers so the cleanup code below does not free
         * the chains that were passed to pru_send.
         */
        clen = 0;
        control = NULL;
        top = NULL;
out:
        if (top != NULL)
                m_freem(top);
        if (control != NULL)
                m_freem(control);
        return (error);
}
1195
/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 *
 * Errors generated directly by this function are EINVAL (negative residual,
 * or MSG_EOR on a SOCK_STREAM socket), EPIPE, a pending so_error, ENOTCONN,
 * EDESTADDRREQ, EMSGSIZE, EWOULDBLOCK and EFAULT; errors from sblock(),
 * sbwait() and the protocol's pru_send hook are passed through.
 */
int
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
        long space;
        ssize_t resid;
        int clen = 0, error, dontroute;
        /*
         * The send is treated as atomic when sosendallatonce() says so for
         * this socket or when the caller supplied a ready-made mbuf chain.
         */
        int atomic = sosendallatonce(so) || top;

        /* Amount of data to send: from the uio, else from the mbuf chain. */
        if (uio != NULL)
                resid = uio->uio_resid;
        else
                resid = top->m_pkthdr.len;
        /*
         * In theory resid should be unsigned.  However, space must be
         * signed, as it might be less than 0 if we over-committed, and we
         * must use a signed comparison of space and resid.  On the other
         * hand, a negative resid causes us to loop sending 0-length
         * segments to the protocol.
         *
         * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
         * type sockets since that's an error.
         */
        if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
                error = EINVAL;
                goto out;
        }

        /*
         * Only toggle SO_DONTROUTE around the send if the caller asked for
         * MSG_DONTROUTE, the option is not already set on the socket, and
         * the protocol is PR_ATOMIC.
         */
        dontroute =
            (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
            (so->so_proto->pr_flags & PR_ATOMIC);
        if (td != NULL)
                td->td_ru.ru_msgsnd++;
        if (control != NULL)
                clen = control->m_len;

        /* Serialize against other senders; released at 'release' below. */
        error = sblock(&so->so_snd, SBLOCKWAIT(flags));
        if (error)
                goto out;

restart:
        /*
         * Outer loop: one pass per window of buffer space.  Each pass
         * re-validates the socket state under the sockbuf mutex.
         */
        do {
                SOCKBUF_LOCK(&so->so_snd);
                if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
                        SOCKBUF_UNLOCK(&so->so_snd);
                        error = EPIPE;
                        goto release;
                }
                /* A pending socket error is reported once and cleared. */
                if (so->so_error) {
                        error = so->so_error;
                        so->so_error = 0;
                        SOCKBUF_UNLOCK(&so->so_snd);
                        goto release;
                }
                if ((so->so_state & SS_ISCONNECTED) == 0) {
                        /*
                         * `sendto' and `sendmsg' are allowed on a
                         * connection-based socket if it supports implied
                         * connect.  Return ENOTCONN if not connected and no
                         * address is supplied.
                         */
                        if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
                            (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
                                if ((so->so_state & SS_ISCONFIRMING) == 0 &&
                                    !(resid == 0 && clen != 0)) {
                                        SOCKBUF_UNLOCK(&so->so_snd);
                                        error = ENOTCONN;
                                        goto release;
                                }
                        } else if (addr == NULL) {
                                SOCKBUF_UNLOCK(&so->so_snd);
                                if (so->so_proto->pr_flags & PR_CONNREQUIRED)
                                        error = ENOTCONN;
                                else
                                        error = EDESTADDRREQ;
                                goto release;
                        }
                }
                space = sbspace(&so->so_snd);
                if (flags & MSG_OOB)
                        space += 1024;
                /*
                 * Hard failure: an atomic message, or the control data on
                 * its own, is larger than the send buffer's high-water mark.
                 */
                if ((atomic && resid > so->so_snd.sb_hiwat) ||
                    clen > so->so_snd.sb_hiwat) {
                        SOCKBUF_UNLOCK(&so->so_snd);
                        error = EMSGSIZE;
                        goto release;
                }
                /*
                 * Not everything fits right now.  Wait for space (or fail
                 * with EWOULDBLOCK when non-blocking) if the send is atomic,
                 * the space is below the low-water mark, or the control data
                 * alone does not fit; otherwise send what fits.
                 */
                if (space < resid + clen &&
                    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
                        if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
                                SOCKBUF_UNLOCK(&so->so_snd);
                                error = EWOULDBLOCK;
                                goto release;
                        }
                        /* The mutex is still held when sbwait() returns. */
                        error = sbwait(&so->so_snd);
                        SOCKBUF_UNLOCK(&so->so_snd);
                        if (error)
                                goto release;
                        goto restart;
                }
                SOCKBUF_UNLOCK(&so->so_snd);
                space -= clen;
                /*
                 * Inner loop: build and hand chunks to the protocol while
                 * data remains and the space computed above is not used up.
                 */
                do {
                        if (uio == NULL) {
                                /* Caller-supplied chain goes out whole. */
                                resid = 0;
                                if (flags & MSG_EOR)
                                        top->m_flags |= M_EOR;
                        } else {
                                /*
                                 * Copy the data from userland into a mbuf
                                 * chain.  If no data is to be copied in,
                                 * a single empty mbuf is returned.
                                 */
                                top = m_uiotombuf(uio, M_WAITOK, space,
                                    (atomic ? max_hdr : 0),
                                    (atomic ? M_PKTHDR : 0) |
                                    ((flags & MSG_EOR) ? M_EOR : 0));
                                if (top == NULL) {
                                        error = EFAULT; /* only possible error */
                                        goto release;
                                }
                                /* Account for what was copied in. */
                                space -= resid - uio->uio_resid;
                                resid = uio->uio_resid;
                        }
                        if (dontroute) {
                                SOCK_LOCK(so);
                                so->so_options |= SO_DONTROUTE;
                                SOCK_UNLOCK(so);
                        }
                        /*
                         * XXX all the SBS_CANTSENDMORE checks previously
                         * done could be out of date.  We could have received
                         * a reset packet in an interrupt or maybe we slept
                         * while doing page faults in uiomove() etc.  We
                         * could probably recheck again inside the locking
                         * protection here, but there are probably other
                         * places that this also happens.  We must rethink
                         * this.
                         */
                        VNET_SO_ASSERT(so);
                        error = (*so->so_proto->pr_usrreqs->pru_send)(so,
                            (flags & MSG_OOB) ? PRUS_OOB :
                        /*
                         * If the user set MSG_EOF, the protocol understands
                         * this flag and nothing left to send then use
                         * PRU_SEND_EOF instead of PRU_SEND.
                         */
                            ((flags & MSG_EOF) &&
                             (so->so_proto->pr_flags & PR_IMPLOPCL) &&
                             (resid <= 0)) ?
                                PRUS_EOF :
                        /* If there is more to send set PRUS_MORETOCOME. */
                            (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
                            top, addr, control, td);
                        /* Undo the temporary SO_DONTROUTE set above. */
                        if (dontroute) {
                                SOCK_LOCK(so);
                                so->so_options &= ~SO_DONTROUTE;
                                SOCK_UNLOCK(so);
                        }
                        /*
                         * Clear the local pointers so the cleanup code does
                         * not free chains that were passed to pru_send, and
                         * so control data is only sent with the first chunk.
                         */
                        clen = 0;
                        control = NULL;
                        top = NULL;
                        if (error)
                                goto release;
                } while (resid && space > 0);
        } while (resid);

release:
        sbunlock(&so->so_snd);
out:
        if (top != NULL)
                m_freem(top);
        if (control != NULL)
                m_freem(control);
        return (error);
}
1385
/*
 * Protocol-independent send entry point: dispatch to the pru_sosend hook of
 * the socket's protocol, passing every argument through unchanged.  The
 * call is made in the socket's vnet context and the hook's return value is
 * returned to the caller.
 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
        int error;

        CURVNET_SET(so->so_vnet);
        error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
            control, flags, td);
        CURVNET_RESTORE();
        return (error);
}
1398
1399 /*
1400  * The part of soreceive() that implements reading non-inline out-of-band
1401  * data from a socket.  For more complete comments, see soreceive(), from
1402  * which this code originated.
1403  *
1404  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1405  * unable to return an mbuf chain to the caller.
1406  */
1407 static int
1408 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1409 {
1410         struct protosw *pr = so->so_proto;
1411         struct mbuf *m;
1412         int error;
1413
1414         KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1415         VNET_SO_ASSERT(so);
1416
1417         m = m_get(M_WAITOK, MT_DATA);
1418         error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1419         if (error)
1420                 goto bad;
1421         do {
1422                 error = uiomove(mtod(m, void *),
1423                     (int) min(uio->uio_resid, m->m_len), uio);
1424                 m = m_free(m);
1425         } while (uio->uio_resid && error == 0 && m);
1426 bad:
1427         if (m != NULL)
1428                 m_freem(m);
1429         return (error);
1430 }
1431
1432 /*
1433  * Following replacement or removal of the first mbuf on the first mbuf chain
1434  * of a socket buffer, push necessary state changes back into the socket
1435  * buffer so that other consumers see the values consistently.  'nextrecord'
1436  * is the callers locally stored value of the original value of
1437  * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1438  * NOTE: 'nextrecord' may be NULL.
1439  */
1440 static __inline void
1441 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1442 {
1443
1444         SOCKBUF_LOCK_ASSERT(sb);
1445         /*
1446          * First, update for the new value of nextrecord.  If necessary, make
1447          * it the first record.
1448          */
1449         if (sb->sb_mb != NULL)
1450                 sb->sb_mb->m_nextpkt = nextrecord;
1451         else
1452                 sb->sb_mb = nextrecord;
1453
1454         /*
1455          * Now update any dependent socket buffer fields to reflect the new
1456          * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1457          * addition of a second clause that takes care of the case where
1458          * sb_mb has been updated, but remains the last record.
1459          */
1460         if (sb->sb_mb == NULL) {
1461                 sb->sb_mbtail = NULL;
1462                 sb->sb_lastrecord = NULL;
1463         } else if (sb->sb_mb->m_nextpkt == NULL)
1464                 sb->sb_lastrecord = sb->sb_mb;
1465 }
1466
1467 /*
1468  * Implement receive operations on a socket.  We depend on the way that
1469  * records are added to the sockbuf by sbappend.  In particular, each record
1470  * (mbufs linked through m_next) must begin with an address if the protocol
1471  * so specifies, followed by an optional mbuf or mbufs containing ancillary
1472  * data, and then zero or more mbufs of data.  In order to allow parallelism
1473  * between network receive and copying to user space, as well as avoid
1474  * sleeping with a mutex held, we release the socket buffer mutex during the
1475  * user space copy.  Although the sockbuf is locked, new data may still be
1476  * appended, and thus we must maintain consistency of the sockbuf during that
1477  * time.
1478  *
1479  * The caller may receive the data as a single mbuf chain by supplying an
1480  * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1481  * the count in uio_resid.
1482  */
1483 int
1484 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1485     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1486 {
             /*
              * Outline of the code below:
              *  - *psa, *controlp and *mp0 are cleared up front when supplied.
              *  - A MSG_OOB request is handed straight to soreceive_rcvoob().
              *  - sblock() on so_rcv is held from here until the 'release'
              *    label, where sbunlock() drops it.
              *  - 'restart' decides whether to sleep in sbwait() or to fall
              *    through to 'dontblock'.
              *  - 'dontblock' processes the first record: an MT_SONAME mbuf
              *    (PR_ADDR protocols), any MT_CONTROL mbufs, then data mbufs.
              * The return value is 0 or an errno value.  On the normal exit
              * path the accumulated flags are OR-ed into *flagsp when flagsp is
              * not NULL; the early 'goto release' exits skip that step.
              */
1487         struct mbuf *m, **mp;
1488         int flags, error, offset;
1489         ssize_t len;
1490         struct protosw *pr = so->so_proto;
1491         struct mbuf *nextrecord;
1492         int moff, type = 0;
1493         ssize_t orig_resid = uio->uio_resid;
1494
1495         mp = mp0;
1496         if (psa != NULL)
1497                 *psa = NULL;
1498         if (controlp != NULL)
1499                 *controlp = NULL;
             /*
              * MSG_EOR is stripped from the caller's flags here; it is set
              * again further down only when an mbuf carrying M_EOR has been
              * read (or peeked) to its end.
              */
1500         if (flagsp != NULL)
1501                 flags = *flagsp &~ MSG_EOR;
1502         else
1503                 flags = 0;
1504         if (flags & MSG_OOB)
1505                 return (soreceive_rcvoob(so, uio, flags));
1506         if (mp != NULL)
1507                 *mp = NULL;
1508         if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1509             && uio->uio_resid) {
1510                 VNET_SO_ASSERT(so);
1511                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1512         }
1513
1514         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1515         if (error)
1516                 return (error);
1517
1518 restart:
1519         SOCKBUF_LOCK(&so->so_rcv);
1520         m = so->so_rcv.sb_mb;
1521         /*
1522          * If we have less data than requested, block awaiting more (subject
1523          * to any timeout) if:
1524          *   1. the current count is less than the low water mark, or
1525          *   2. MSG_DONTWAIT is not set
1526          */
1527         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1528             so->so_rcv.sb_cc < uio->uio_resid) &&
1529             so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
1530             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1531                 KASSERT(m != NULL || !so->so_rcv.sb_cc,
1532                     ("receive: m == %p so->so_rcv.sb_cc == %u",
1533                     m, so->so_rcv.sb_cc));
                     /*
                      * A pending socket error does not pre-empt queued data:
                      * with data present we go and deliver it.  Otherwise the
                      * error is returned, and cleared unless the caller is only
                      * peeking.
                      */
1534                 if (so->so_error) {
1535                         if (m != NULL)
1536                                 goto dontblock;
1537                         error = so->so_error;
1538                         if ((flags & MSG_PEEK) == 0)
1539                                 so->so_error = 0;
1540                         SOCKBUF_UNLOCK(&so->so_rcv);
1541                         goto release;
1542                 }
1543                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1544                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1545                         if (m == NULL) {
1546                                 SOCKBUF_UNLOCK(&so->so_rcv);
1547                                 goto release;
1548                         } else
1549                                 goto dontblock;
1550                 }
                     /*
                      * Scan the first chain: out-of-band data or an
                      * end-of-record mark means there is something to deliver
                      * now.  'm' is rewound to the head of the buffer because
                      * 'dontblock' asserts m == sb_mb.
                      */
1551                 for (; m != NULL; m = m->m_next)
1552                         if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1553                                 m = so->so_rcv.sb_mb;
1554                                 goto dontblock;
1555                         }
1556                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1557                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1558                         SOCKBUF_UNLOCK(&so->so_rcv);
1559                         error = ENOTCONN;
1560                         goto release;
1561                 }
1562                 if (uio->uio_resid == 0) {
1563                         SOCKBUF_UNLOCK(&so->so_rcv);
1564                         goto release;
1565                 }
1566                 if ((so->so_state & SS_NBIO) ||
1567                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1568                         SOCKBUF_UNLOCK(&so->so_rcv);
1569                         error = EWOULDBLOCK;
1570                         goto release;
1571                 }
1572                 SBLASTRECORDCHK(&so->so_rcv);
1573                 SBLASTMBUFCHK(&so->so_rcv);
                     /*
                      * Sleep for more data; on success every check above is
                      * repeated from 'restart', which re-takes the mutex.
                      */
1574                 error = sbwait(&so->so_rcv);
1575                 SOCKBUF_UNLOCK(&so->so_rcv);
1576                 if (error)
1577                         goto release;
1578                 goto restart;
1579         }
1580 dontblock:
1581         /*
1582          * From this point onward, we maintain 'nextrecord' as a cache of the
1583          * pointer to the next record in the socket buffer.  We must keep the
1584          * various socket buffer pointers and local stack versions of the
1585          * pointers in sync, pushing out modifications before dropping the
1586          * socket buffer mutex, and re-reading them when picking it up.
1587          *
1588          * Otherwise, we will race with the network stack appending new data
1589          * or records onto the socket buffer by using inconsistent/stale
1590          * versions of the field, possibly resulting in socket buffer
1591          * corruption.
1592          *
1593          * By holding the high-level sblock(), we prevent simultaneous
1594          * readers from pulling off the front of the socket buffer.
1595          */
1596         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1597         if (uio->uio_td)
1598                 uio->uio_td->td_ru.ru_msgrcv++;
1599         KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1600         SBLASTRECORDCHK(&so->so_rcv);
1601         SBLASTMBUFCHK(&so->so_rcv);
1602         nextrecord = m->m_nextpkt;
1603         if (pr->pr_flags & PR_ADDR) {
1604                 KASSERT(m->m_type == MT_SONAME,
1605                     ("m->m_type == %d", m->m_type));
                     /*
                      * Clearing orig_resid disables the "nothing transferred,
                      * go around again" test at the bottom of the function,
                      * which requires orig_resid to be non-zero.
                      */
1606                 orig_resid = 0;
1607                 if (psa != NULL)
1608                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1609                             M_NOWAIT);
1610                 if (flags & MSG_PEEK) {
1611                         m = m->m_next;
1612                 } else {
1613                         sbfree(&so->so_rcv, m);
1614                         so->so_rcv.sb_mb = m_free(m);
1615                         m = so->so_rcv.sb_mb;
1616                         sockbuf_pushsync(&so->so_rcv, nextrecord);
1617                 }
1618         }
1619
1620         /*
1621          * Process one or more MT_CONTROL mbufs present before any data mbufs
1622          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1623          * just copy the data; if !MSG_PEEK, we call into the protocol to
1624          * perform externalization (or freeing if controlp == NULL).
1625          */
1626         if (m != NULL && m->m_type == MT_CONTROL) {
1627                 struct mbuf *cm = NULL, *cmn;
1628                 struct mbuf **cme = &cm;
1629
1630                 do {
1631                         if (flags & MSG_PEEK) {
1632                                 if (controlp != NULL) {
1633                                         *controlp = m_copy(m, 0, m->m_len);
1634                                         controlp = &(*controlp)->m_next;
1635                                 }
1636                                 m = m->m_next;
1637                         } else {
                                     /*
                                      * Unlink the control mbuf from the socket
                                      * buffer and append it to the private
                                      * 'cm' list via the tail pointer 'cme'.
                                      */
1638                                 sbfree(&so->so_rcv, m);
1639                                 so->so_rcv.sb_mb = m->m_next;
1640                                 m->m_next = NULL;
1641                                 *cme = m;
1642                                 cme = &(*cme)->m_next;
1643                                 m = so->so_rcv.sb_mb;
1644                         }
1645                 } while (m != NULL && m->m_type == MT_CONTROL);
1646                 if ((flags & MSG_PEEK) == 0)
1647                         sockbuf_pushsync(&so->so_rcv, nextrecord);
                     /*
                      * Hand each detached control mbuf, one at a time, to the
                      * domain's externalize hook if there is one (called with
                      * the sockbuf mutex dropped); otherwise pass it back
                      * through controlp or free it.
                      */
1648                 while (cm != NULL) {
1649                         cmn = cm->m_next;
1650                         cm->m_next = NULL;
1651                         if (pr->pr_domain->dom_externalize != NULL) {
1652                                 SOCKBUF_UNLOCK(&so->so_rcv);
1653                                 VNET_SO_ASSERT(so);
1654                                 error = (*pr->pr_domain->dom_externalize)
1655                                     (cm, controlp, flags);
1656                                 SOCKBUF_LOCK(&so->so_rcv);
1657                         } else if (controlp != NULL)
1658                                 *controlp = cm;
1659                         else
1660                                 m_freem(cm);
1661                         if (controlp != NULL) {
1662                                 orig_resid = 0;
1663                                 while (*controlp != NULL)
1664                                         controlp = &(*controlp)->m_next;
1665                         }
1666                         cm = cmn;
1667                 }
                     /*
                      * The mutex may have been dropped above, so refresh the
                      * cached 'nextrecord' from the socket buffer.
                      */
1668                 if (m != NULL)
1669                         nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1670                 else
1671                         nextrecord = so->so_rcv.sb_mb;
1672                 orig_resid = 0;
1673         }
1674         if (m != NULL) {
1675                 if ((flags & MSG_PEEK) == 0) {
1676                         KASSERT(m->m_nextpkt == nextrecord,
1677                             ("soreceive: post-control, nextrecord !sync"));
1678                         if (nextrecord == NULL) {
1679                                 KASSERT(so->so_rcv.sb_mb == m,
1680                                     ("soreceive: post-control, sb_mb!=m"));
1681                                 KASSERT(so->so_rcv.sb_lastrecord == m,
1682                                     ("soreceive: post-control, lastrecord!=m"));
1683                         }
1684                 }
1685                 type = m->m_type;
1686                 if (type == MT_OOBDATA)
1687                         flags |= MSG_OOB;
1688         } else {
1689                 if ((flags & MSG_PEEK) == 0) {
1690                         KASSERT(so->so_rcv.sb_mb == nextrecord,
1691                             ("soreceive: sb_mb != nextrecord"));
1692                         if (so->so_rcv.sb_mb == NULL) {
1693                                 KASSERT(so->so_rcv.sb_lastrecord == NULL,
1694                                     ("soreceive: sb_lastercord != NULL"));
1695                         }
1696                 }
1697         }
1698         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1699         SBLASTRECORDCHK(&so->so_rcv);
1700         SBLASTMBUFCHK(&so->so_rcv);
1701
1702         /*
1703          * Now continue to read any data mbufs off of the head of the socket
1704          * buffer until the read request is satisfied.  Note that 'type' is
1705          * used to store the type of any mbuf reads that have happened so far
1706          * such that soreceive() can stop reading if the type changes, which
1707          * causes soreceive() to return only one of regular data and inline
1708          * out-of-band data in a single socket receive operation.
1709          */
1710         moff = 0;
1711         offset = 0;
1712         while (m != NULL && uio->uio_resid > 0 && error == 0) {
1713                 /*
1714                  * If the type of mbuf has changed since the last mbuf
1715                  * examined ('type'), end the receive operation.
1716                  */
1717                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1718                 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
1719                         if (type != m->m_type)
1720                                 break;
1721                 } else if (type == MT_OOBDATA)
1722                         break;
1723                 else
1724                     KASSERT(m->m_type == MT_DATA,
1725                         ("m->m_type == %d", m->m_type));
1726                 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
                     /*
                      * Clamp this pass so that it stops at the out-of-band
                      * mark, and at the end of the current mbuf.
                      */
1727                 len = uio->uio_resid;
1728                 if (so->so_oobmark && len > so->so_oobmark - offset)
1729                         len = so->so_oobmark - offset;
1730                 if (len > m->m_len - moff)
1731                         len = m->m_len - moff;
1732                 /*
1733                  * If mp is set, just pass back the mbufs.  Otherwise copy
1734                  * them out via the uio, then free.  Sockbuf must be
1735                  * consistent here (points to current mbuf, it points to next
1736                  * record) when we drop priority; we must note any additions
1737                  * to the sockbuf when we block interrupts again.
1738                  */
1739                 if (mp == NULL) {
1740                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1741                         SBLASTRECORDCHK(&so->so_rcv);
1742                         SBLASTMBUFCHK(&so->so_rcv);
1743                         SOCKBUF_UNLOCK(&so->so_rcv);
1744                         error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1745                         SOCKBUF_LOCK(&so->so_rcv);
1746                         if (error) {
1747                                 /*
1748                                  * The MT_SONAME mbuf has already been removed
1749                                  * from the record, so it is necessary to
1750                                  * remove the data mbufs, if any, to preserve
1751                                  * the invariant in the case of PR_ADDR that
1752                                  * requires MT_SONAME mbufs at the head of
1753                                  * each record.
1754                                  */
1755                                 if (m && pr->pr_flags & PR_ATOMIC &&
1756                                     ((flags & MSG_PEEK) == 0))
1757                                         (void)sbdroprecord_locked(&so->so_rcv);
1758                                 SOCKBUF_UNLOCK(&so->so_rcv);
1759                                 goto release;
1760                         }
1761                 } else
                             /*
                              * mbuf-return mode: nothing is copied here, only
                              * the byte count in the uio is adjusted.
                              */
1762                         uio->uio_resid -= len;
1763                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1764                 if (len == m->m_len - moff) {
                             /* The rest of this mbuf was taken in one go. */
1765                         if (m->m_flags & M_EOR)
1766                                 flags |= MSG_EOR;
1767                         if (flags & MSG_PEEK) {
1768                                 m = m->m_next;
1769                                 moff = 0;
1770                         } else {
1771                                 nextrecord = m->m_nextpkt;
1772                                 sbfree(&so->so_rcv, m);
1773                                 if (mp != NULL) {
1774                                         m->m_nextpkt = NULL;
1775                                         *mp = m;
1776                                         mp = &m->m_next;
1777                                         so->so_rcv.sb_mb = m = m->m_next;
1778                                         *mp = NULL;
1779                                 } else {
1780                                         so->so_rcv.sb_mb = m_free(m);
1781                                         m = so->so_rcv.sb_mb;
1782                                 }
1783                                 sockbuf_pushsync(&so->so_rcv, nextrecord);
1784                                 SBLASTRECORDCHK(&so->so_rcv);
1785                                 SBLASTMBUFCHK(&so->so_rcv);
1786                         }
1787                 } else {
1788                         if (flags & MSG_PEEK)
1789                                 moff += len;
1790                         else {
1791                                 if (mp != NULL) {
1792                                         if (flags & MSG_DONTWAIT) {
1793                                                 *mp = m_copym(m, 0, len,
1794                                                     M_NOWAIT);
1795                                                 if (*mp == NULL) {
1796                                                         /*
1797                                                          * m_copym() couldn't
1798                                                          * allocate an mbuf.
1799                                                          * Adjust uio_resid back
1800                                                          * (it was adjusted
1801                                                          * down by len bytes,
1802                                                          * which we didn't end
1803                                                          * up "copying" over).
1804                                                          */
1805                                                         uio->uio_resid += len;
1806                                                         break;
1807                                                 }
1808                                         } else {
1809                                                 SOCKBUF_UNLOCK(&so->so_rcv);
1810                                                 *mp = m_copym(m, 0, len,
1811                                                     M_WAITOK);
1812                                                 SOCKBUF_LOCK(&so->so_rcv);
1813                                         }
1814                                 }
                                     /*
                                      * Partial consumption: trim 'len' bytes
                                      * off the front of the mbuf in place and
                                      * lower the buffer's byte count to match.
                                      */
1815                                 m->m_data += len;
1816                                 m->m_len -= len;
1817                                 so->so_rcv.sb_cc -= len;
1818                         }
1819                 }
1820                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
                     /*
                      * Out-of-band mark bookkeeping: a consuming read moves the
                      * mark itself, a peek only advances the local 'offset'.
                      * Either way the loop ends once the mark is reached.
                      */
1821                 if (so->so_oobmark) {
1822                         if ((flags & MSG_PEEK) == 0) {
1823                                 so->so_oobmark -= len;
1824                                 if (so->so_oobmark == 0) {
1825                                         so->so_rcv.sb_state |= SBS_RCVATMARK;
1826                                         break;
1827                                 }
1828                         } else {
1829                                 offset += len;
1830                                 if (offset == so->so_oobmark)
1831                                         break;
1832                         }
1833                 }
1834                 if (flags & MSG_EOR)
1835                         break;
1836                 /*
1837                  * If the MSG_WAITALL flag is set (for non-atomic socket), we
1838                  * must not quit until "uio->uio_resid == 0" or an error
1839                  * termination.  If a signal/timeout occurs, return with a
1840                  * short count but without error.  Keep sockbuf locked
1841                  * against other readers.
1842                  */
1843                 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1844                     !sosendallatonce(so) && nextrecord == NULL) {
1845                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1846                         if (so->so_error ||
1847                             so->so_rcv.sb_state & SBS_CANTRCVMORE)
1848                                 break;
1849                         /*
1850                          * Notify the protocol that some data has been
1851                          * drained before blocking.
1852                          */
1853                         if (pr->pr_flags & PR_WANTRCVD) {
1854                                 SOCKBUF_UNLOCK(&so->so_rcv);
1855                                 VNET_SO_ASSERT(so);
1856                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1857                                 SOCKBUF_LOCK(&so->so_rcv);
1858                         }
1859                         SBLASTRECORDCHK(&so->so_rcv);
1860                         SBLASTMBUFCHK(&so->so_rcv);
1861                         /*
1862                          * We could have received some data while we were
1863                          * notifying the protocol.  Skip blocking in this case.
1864                          */
1865                         if (so->so_rcv.sb_mb == NULL) {
1866                                 error = sbwait(&so->so_rcv);
1867                                 if (error) {
1868                                         SOCKBUF_UNLOCK(&so->so_rcv);
1869                                         goto release;
1870                                 }
1871                         }
1872                         m = so->so_rcv.sb_mb;
1873                         if (m != NULL)
1874                                 nextrecord = m->m_nextpkt;
1875                 }
1876         }
1877
1878         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
             /*
              * Atomic protocol with part of the record still unread: report
              * truncation and, unless peeking, discard the rest of the record.
              */
1879         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1880                 flags |= MSG_TRUNC;
1881                 if ((flags & MSG_PEEK) == 0)
1882                         (void) sbdroprecord_locked(&so->so_rcv);
1883         }
1884         if ((flags & MSG_PEEK) == 0) {
1885                 if (m == NULL) {
1886                         /*
1887                          * First part is an inline SB_EMPTY_FIXUP().  Second
1888                          * part makes sure sb_lastrecord is up-to-date if
1889                          * there is still data in the socket buffer.
1890                          */
1891                         so->so_rcv.sb_mb = nextrecord;
1892                         if (so->so_rcv.sb_mb == NULL) {
1893                                 so->so_rcv.sb_mbtail = NULL;
1894                                 so->so_rcv.sb_lastrecord = NULL;
1895                         } else if (nextrecord->m_nextpkt == NULL)
1896                                 so->so_rcv.sb_lastrecord = nextrecord;
1897                 }
1898                 SBLASTRECORDCHK(&so->so_rcv);
1899                 SBLASTMBUFCHK(&so->so_rcv);
1900                 /*
1901                  * If soreceive() is being done from the socket callback,
1902                  * then don't need to generate ACK to peer to update window,
1903                  * since ACK will be generated on return to TCP.
1904                  */
1905                 if (!(flags & MSG_SOCALLBCK) &&
1906                     (pr->pr_flags & PR_WANTRCVD)) {
1907                         SOCKBUF_UNLOCK(&so->so_rcv);
1908                         VNET_SO_ASSERT(so);
1909                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1910                         SOCKBUF_LOCK(&so->so_rcv);
1911                 }
1912         }
1913         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
             /*
              * Nothing was transferred although the caller asked for data, no
              * end-of-record was seen and the receive side is still open: go
              * around again from 'restart'.
              */
1914         if (orig_resid == uio->uio_resid && orig_resid &&
1915             (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1916                 SOCKBUF_UNLOCK(&so->so_rcv);
1917                 goto restart;
1918         }
1919         SOCKBUF_UNLOCK(&so->so_rcv);
1920
1921         if (flagsp != NULL)
1922                 *flagsp |= flags;
1923 release:
1924         sbunlock(&so->so_rcv);
1925         return (error);
1926 }
1927
1928 /*
1929  * Optimized version of soreceive() for stream (TCP) sockets.
1930  * XXXAO: (MSG_WAITALL | MSG_PEEK) isn't properly handled.
1931  */
1932 int
1933 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
1934     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1935 {
             /*
              * Receive from the receive buffer of a SOCK_STREAM socket.
              *
              *  so        socket to read from; any other so_type yields EINVAL.
              *  psa       if not NULL, *psa is set to NULL (no address is
              *            returned for streams).
              *  uio       destination; uio_resid is the number of bytes wanted.
              *  mp0       if not NULL, data is returned as an mbuf chain in
              *            *mp0 instead of being copied out through the uio.
              *  controlp  must be NULL, otherwise EINVAL is returned.
              *  flagsp    if not NULL, supplies the MSG_* flags; it is only
              *            read here, never written back.
              *
              * Returns 0 or an errno value.  Once sblock() has succeeded, every
              * exit goes through 'out', which expects both the sockbuf mutex
              * and the sblock to be held and releases them.
              */
1936         int len = 0, error = 0, flags, oresid;
1937         struct sockbuf *sb;
1938         struct mbuf *m, *n = NULL;
1939
1940         /* We only do stream sockets. */
1941         if (so->so_type != SOCK_STREAM)
1942                 return (EINVAL);
1943         if (psa != NULL)
1944                 *psa = NULL;
1945         if (controlp != NULL)
1946                 return (EINVAL);
1947         if (flagsp != NULL)
1948                 flags = *flagsp &~ MSG_EOR;
1949         else
1950                 flags = 0;
1951         if (flags & MSG_OOB)
1952                 return (soreceive_rcvoob(so, uio, flags));
1953         if (mp0 != NULL)
1954                 *mp0 = NULL;
1955
1956         sb = &so->so_rcv;
1957
1958         /* Prevent other readers from entering the socket. */
1959         error = sblock(sb, SBLOCKWAIT(flags));
             /*
              * If sblock() failed we hold neither the sblock nor the sockbuf
              * mutex, so we must not go through 'out', which releases both.
              */
1960         if (error)
1961                 return (error);
1962         SOCKBUF_LOCK(sb);
1963
1964         /* Easy one, no space to copyout anything. */
1965         if (uio->uio_resid == 0) {
1966                 error = EINVAL;
1967                 goto out;
1968         }
1969         oresid = uio->uio_resid;
1970
1971         /* We will never ever get anything unless we are or were connected. */
1972         if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1973                 error = ENOTCONN;
1974                 goto out;
1975         }
1976
1977 restart:
1978         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1979
1980         /* Abort if socket has reported problems. */
1981         if (so->so_error) {
1982                 if (sb->sb_cc > 0)
1983                         goto deliver;
                     /* Some data was already delivered: report that, not the error. */
1984                 if (oresid > uio->uio_resid)
1985                         goto out;
1986                 error = so->so_error;
1987                 if (!(flags & MSG_PEEK))
1988                         so->so_error = 0;
1989                 goto out;
1990         }
1991
1992         /* Door is closed.  Deliver what is left, if any. */
1993         if (sb->sb_state & SBS_CANTRCVMORE) {
1994                 if (sb->sb_cc > 0)
1995                         goto deliver;
1996                 else
1997                         goto out;
1998         }
1999
2000         /* Socket buffer is empty and we shall not block. */
2001         if (sb->sb_cc == 0 &&
2002             ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
2003                 error = EAGAIN;
2004                 goto out;
2005         }
2006
2007         /* Socket buffer got some data that we shall deliver now. */
             /*
              * SS_NBIO is a socket state bit, so it is tested against
              * so->so_state (as in the check just above), not sb->sb_flags.
              */
2008         if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
2009             ((so->so_state & SS_NBIO) ||
2010              (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
2011              sb->sb_cc >= sb->sb_lowat ||
2012              sb->sb_cc >= uio->uio_resid ||
2013              sb->sb_cc >= sb->sb_hiwat) ) {
2014                 goto deliver;
2015         }
2016
2017         /* On MSG_WAITALL we must wait until all data or error arrives. */
2018         if ((flags & MSG_WAITALL) &&
2019             (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat))
2020                 goto deliver;
2021
2022         /*
2023          * Wait and block until (more) data comes in.
2024          * NB: Drops the sockbuf lock during wait.
2025          */
2026         error = sbwait(sb);
2027         if (error)
2028                 goto out;
2029         goto restart;
2030
2031 deliver:
2032         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2033         KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
2034         KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
2035
2036         /* Statistics. */
2037         if (uio->uio_td)
2038                 uio->uio_td->td_ru.ru_msgrcv++;
2039
2040         /* Fill uio until full or current end of socket buffer is reached. */
2041         len = min(uio->uio_resid, sb->sb_cc);
2042         if (mp0 != NULL) {
2043                 /* Dequeue as many mbufs as possible. */
2044                 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
2045                         if (*mp0 == NULL)
2046                                 *mp0 = sb->sb_mb;
2047                         else
2048                                 m_cat(*mp0, sb->sb_mb);
                             /*
                              * Walk the whole mbufs that fit in 'len'; 'n' ends
                              * up as the last one taken and 'm' as the first
                              * one left behind.  The guard above ensures at
                              * least one iteration, so 'n' is not NULL below.
                              */
2049                         for (m = sb->sb_mb;
2050                              m != NULL && m->m_len <= len;
2051                              m = m->m_next) {
2052                                 len -= m->m_len;
2053                                 uio->uio_resid -= m->m_len;
2054                                 sbfree(sb, m);
2055                                 n = m;
2056                         }
2057                         n->m_next = NULL;
2058                         sb->sb_mb = m;
2059                         sb->sb_lastrecord = sb->sb_mb;
2060                         if (sb->sb_mb == NULL)
2061                                 SB_EMPTY_FIXUP(sb);
2062                 }
2063                 /* Copy the remainder. */
2064                 if (len > 0) {
2065                         KASSERT(sb->sb_mb != NULL,
2066                             ("%s: len > 0 && sb->sb_mb empty", __func__));
2067
2068                         m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
2069                         if (m == NULL)
2070                                 len = 0;        /* Don't flush data from sockbuf. */
2071                         else
2072                                 uio->uio_resid -= len;
2073                         if (*mp0 != NULL)
2074                                 m_cat(*mp0, m);
2075                         else
2076                                 *mp0 = m;
2077                         if (*mp0 == NULL) {
2078                                 error = ENOBUFS;
2079                                 goto out;
2080                         }
2081                 }
2082         } else {
2083                 /* NB: Must unlock socket buffer as uiomove may sleep. */
2084                 SOCKBUF_UNLOCK(sb);
2085                 error = m_mbuftouio(uio, sb->sb_mb, len);
2086                 SOCKBUF_LOCK(sb);
2087                 if (error)
2088                         goto out;
2089         }
2090         SBLASTRECORDCHK(sb);
2091         SBLASTMBUFCHK(sb);
2092
2093         /*
2094          * Remove the delivered data from the socket buffer unless we
2095          * were only peeking.
2096          */
2097         if (!(flags & MSG_PEEK)) {
2098                 if (len > 0)
2099                         sbdrop_locked(sb, len);
2100
2101                 /* Notify protocol that we drained some data. */
2102                 if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2103                     (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2104                      !(flags & MSG_SOCALLBCK))) {
2105                         SOCKBUF_UNLOCK(sb);
2106                         VNET_SO_ASSERT(so);
2107                         (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
2108                         SOCKBUF_LOCK(sb);
2109                 }
2110         }
2111
2112         /*
2113          * For MSG_WAITALL we may have to loop again and wait for
2114          * more data to come in.
2115          */
2116         if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2117                 goto restart;
2118 out:
2119         SOCKBUF_LOCK_ASSERT(sb);
2120         SBLASTRECORDCHK(sb);
2121         SBLASTMBUFCHK(sb);
2122         SOCKBUF_UNLOCK(sb);
2123         sbunlock(sb);
2124         return (error);
2125 }
2126
2127 /*
2128  * Optimized version of soreceive() for simple datagram cases from userspace.
2129  * Unlike in the stream case, we're able to drop a datagram if copyout()
2130  * fails, and because we handle datagrams atomically, we don't need to use a
2131  * sleep lock to prevent I/O interlacing.
2132  */
2133 int
2134 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2135     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2136 {
2137         struct mbuf *m, *m2;
2138         int flags, error;
2139         ssize_t len;
2140         struct protosw *pr = so->so_proto;
2141         struct mbuf *nextrecord;
2142
2143         if (psa != NULL)
2144                 *psa = NULL;
2145         if (controlp != NULL)
2146                 *controlp = NULL;
2147         if (flagsp != NULL)
2148                 flags = *flagsp &~ MSG_EOR;
2149         else
2150                 flags = 0;
2151
2152         /*
2153          * For any complicated cases, fall back to the full
2154          * soreceive_generic().
2155          */
2156         if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
2157                 return (soreceive_generic(so, psa, uio, mp0, controlp,
2158                     flagsp));
2159
2160         /*
2161          * Enforce restrictions on use.
2162          */
2163         KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2164             ("soreceive_dgram: wantrcvd"));
2165         KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2166         KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2167             ("soreceive_dgram: SBS_RCVATMARK"));
2168         KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
2169             ("soreceive_dgram: P_CONNREQUIRED"));
2170
2171         /*
2172          * Loop blocking while waiting for a datagram.
2173          */
2174         SOCKBUF_LOCK(&so->so_rcv);
2175         while ((m = so->so_rcv.sb_mb) == NULL) {
2176                 KASSERT(so->so_rcv.sb_cc == 0,
2177                     ("soreceive_dgram: sb_mb NULL but sb_cc %u",
2178                     so->so_rcv.sb_cc));
2179                 if (so->so_error) {
2180                         error = so->so_error;
2181                         so->so_error = 0;
2182                         SOCKBUF_UNLOCK(&so->so_rcv);
2183                         return (error);
2184                 }
2185                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2186                     uio->uio_resid == 0) {
2187                         SOCKBUF_UNLOCK(&so->so_rcv);
2188                         return (0);
2189                 }
2190                 if ((so->so_state & SS_NBIO) ||
2191                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2192                         SOCKBUF_UNLOCK(&so->so_rcv);
2193                         return (EWOULDBLOCK);
2194                 }
2195                 SBLASTRECORDCHK(&so->so_rcv);
2196                 SBLASTMBUFCHK(&so->so_rcv);
2197                 error = sbwait(&so->so_rcv);
2198                 if (error) {
2199                         SOCKBUF_UNLOCK(&so->so_rcv);
2200                         return (error);
2201                 }
2202         }
2203         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2204
2205         if (uio->uio_td)
2206                 uio->uio_td->td_ru.ru_msgrcv++;
2207         SBLASTRECORDCHK(&so->so_rcv);
2208         SBLASTMBUFCHK(&so->so_rcv);
2209         nextrecord = m->m_nextpkt;
2210         if (nextrecord == NULL) {
2211                 KASSERT(so->so_rcv.sb_lastrecord == m,
2212                     ("soreceive_dgram: lastrecord != m"));
2213         }
2214
2215         KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2216             ("soreceive_dgram: m_nextpkt != nextrecord"));
2217
2218         /*
2219          * Pull 'm' and its chain off the front of the packet queue.
2220          */
2221         so->so_rcv.sb_mb = NULL;
2222         sockbuf_pushsync(&so->so_rcv, nextrecord);
2223
2224         /*
2225          * Walk 'm's chain and free that many bytes from the socket buffer.
2226          */
2227         for (m2 = m; m2 != NULL; m2 = m2->m_next)
2228                 sbfree(&so->so_rcv, m2);
2229
2230         /*
2231          * Do a few last checks before we let go of the lock.
2232          */
2233         SBLASTRECORDCHK(&so->so_rcv);
2234         SBLASTMBUFCHK(&so->so_rcv);
2235         SOCKBUF_UNLOCK(&so->so_rcv);
2236
2237         if (pr->pr_flags & PR_ADDR) {
2238                 KASSERT(m->m_type == MT_SONAME,
2239                     ("m->m_type == %d", m->m_type));
2240                 if (psa != NULL)
2241                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
2242                             M_NOWAIT);
2243                 m = m_free(m);
2244         }
2245         if (m == NULL) {
2246                 /* XXXRW: Can this happen? */
2247                 return (0);
2248         }
2249
2250         /*
2251          * Packet to copyout() is now in 'm' and it is disconnected from the
2252          * queue.
2253          *
2254          * Process one or more MT_CONTROL mbufs present before any data mbufs
2255          * in the first mbuf chain on the socket buffer.  We call into the
2256          * protocol to perform externalization (or freeing if controlp ==
2257          * NULL).
2258          */
2259         if (m->m_type == MT_CONTROL) {
2260                 struct mbuf *cm = NULL, *cmn;
2261                 struct mbuf **cme = &cm;
2262
2263                 do {
2264                         m2 = m->m_next;
2265                         m->m_next = NULL;
2266                         *cme = m;
2267                         cme = &(*cme)->m_next;
2268                         m = m2;
2269                 } while (m != NULL && m->m_type == MT_CONTROL);
2270                 while (cm != NULL) {
2271                         cmn = cm->m_next;
2272                         cm->m_next = NULL;
2273                         if (pr->pr_domain->dom_externalize != NULL) {
2274                                 error = (*pr->pr_domain->dom_externalize)
2275                                     (cm, controlp, flags);
2276                         } else if (controlp != NULL)
2277                                 *controlp = cm;
2278                         else
2279                                 m_freem(cm);
2280                         if (controlp != NULL) {
2281                                 while (*controlp != NULL)
2282                                         controlp = &(*controlp)->m_next;
2283                         }
2284                         cm = cmn;
2285                 }
2286         }
2287         KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
2288
2289         while (m != NULL && uio->uio_resid > 0) {
2290                 len = uio->uio_resid;
2291                 if (len > m->m_len)
2292                         len = m->m_len;
2293                 error = uiomove(mtod(m, char *), (int)len, uio);
2294                 if (error) {
2295                         m_freem(m);
2296                         return (error);
2297                 }
2298                 if (len == m->m_len)
2299                         m = m_free(m);
2300                 else {
2301                         m->m_data += len;
2302                         m->m_len -= len;
2303                 }
2304         }
2305         if (m != NULL)
2306                 flags |= MSG_TRUNC;
2307         m_freem(m);
2308         if (flagsp != NULL)
2309                 *flagsp |= flags;
2310         return (0);
2311 }
2312
2313 int
2314 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2315     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2316 {
2317         int error;
2318
2319         CURVNET_SET(so->so_vnet);
2320         error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
2321             controlp, flagsp));
2322         CURVNET_RESTORE();
2323         return (error);
2324 }
2325
2326 int
2327 soshutdown(struct socket *so, int how)
2328 {
2329         struct protosw *pr = so->so_proto;
2330         int error;
2331
2332         if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2333                 return (EINVAL);
2334
2335         CURVNET_SET(so->so_vnet);
2336         if (pr->pr_usrreqs->pru_flush != NULL)
2337                 (*pr->pr_usrreqs->pru_flush)(so, how);
2338         if (how != SHUT_WR)
2339                 sorflush(so);
2340         if (how != SHUT_RD) {
2341                 error = (*pr->pr_usrreqs->pru_shutdown)(so);
2342                 wakeup(&so->so_timeo);
2343                 CURVNET_RESTORE();
2344                 return (error);
2345         }
2346         wakeup(&so->so_timeo);
2347         CURVNET_RESTORE();
2348         return (0);
2349 }
2350
2351 void
2352 sorflush(struct socket *so)
2353 {
2354         struct sockbuf *sb = &so->so_rcv;
2355         struct protosw *pr = so->so_proto;
2356         struct sockbuf asb;
2357
2358         VNET_SO_ASSERT(so);
2359
2360         /*
2361          * In order to avoid calling dom_dispose with the socket buffer mutex
2362          * held, and in order to generally avoid holding the lock for a long
2363          * time, we make a copy of the socket buffer and clear the original
2364          * (except locks, state).  The new socket buffer copy won't have
2365          * initialized locks so we can only call routines that won't use or
2366          * assert those locks.
2367          *
2368          * Dislodge threads currently blocked in receive and wait to acquire
2369          * a lock against other simultaneous readers before clearing the
2370          * socket buffer.  Don't let our acquire be interrupted by a signal
2371          * despite any existing socket disposition on interruptable waiting.
2372          */
2373         socantrcvmore(so);
2374         (void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2375
2376         /*
2377          * Invalidate/clear most of the sockbuf structure, but leave selinfo
2378          * and mutex data unchanged.
2379          */
2380         SOCKBUF_LOCK(sb);
2381         bzero(&asb, offsetof(struct sockbuf, sb_startzero));
2382         bcopy(&sb->sb_startzero, &asb.sb_startzero,
2383             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2384         bzero(&sb->sb_startzero,
2385             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2386         SOCKBUF_UNLOCK(sb);
2387         sbunlock(sb);
2388
2389         /*
2390          * Dispose of special rights and flush the socket buffer.  Don't call
2391          * any unsafe routines (that rely on locks being initialized) on asb.
2392          */
2393         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2394                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
2395         sbrelease_internal(&asb, so);
2396 }
2397
2398 /*
2399  * Wrapper for Socket established helper hook.
2400  * Parameters: socket, context of the hook point, hook id.
2401  */
2402 static int inline
2403 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id)
2404 {
2405         struct socket_hhook_data hhook_data = {
2406                 .so = so,
2407                 .hctx = hctx,
2408                 .m = NULL
2409         };
2410
2411         hhook_run_hooks(V_socket_hhh[h_id], &hhook_data, &so->osd);
2412
2413         /* Ugly but needed, since hhooks return void for now */
2414         return (hhook_data.status);
2415 }
2416
2417 /*
2418  * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2419  * additional variant to handle the case where the option value needs to be
2420  * some kind of integer, but not a specific size.  In addition to their use
2421  * here, these functions are also called by the protocol-level pr_ctloutput()
2422  * routines.
2423  */
2424 int
2425 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2426 {
2427         size_t  valsize;
2428
2429         /*
2430          * If the user gives us more than we wanted, we ignore it, but if we
2431          * don't get the minimum length the caller wants, we return EINVAL.
2432          * On success, sopt->sopt_valsize is set to however much we actually
2433          * retrieved.
2434          */
2435         if ((valsize = sopt->sopt_valsize) < minlen)
2436                 return EINVAL;
2437         if (valsize > len)
2438                 sopt->sopt_valsize = valsize = len;
2439
2440         if (sopt->sopt_td != NULL)
2441                 return (copyin(sopt->sopt_val, buf, valsize));
2442
2443         bcopy(sopt->sopt_val, buf, valsize);
2444         return (0);
2445 }
2446
2447 /*
2448  * Kernel version of setsockopt(2).
2449  *
2450  * XXX: optlen is size_t, not socklen_t
2451  */
2452 int
2453 so_setsockopt(struct socket *so, int level, int optname, void *optval,
2454     size_t optlen)
2455 {
2456         struct sockopt sopt;
2457
2458         sopt.sopt_level = level;
2459         sopt.sopt_name = optname;
2460         sopt.sopt_dir = SOPT_SET;
2461         sopt.sopt_val = optval;
2462         sopt.sopt_valsize = optlen;
2463         sopt.sopt_td = NULL;
2464         return (sosetopt(so, &sopt));
2465 }
2466
2467 int
2468 sosetopt(struct socket *so, struct sockopt *sopt)
2469 {
2470         int     error, optval;
2471         struct  linger l;
2472         struct  timeval tv;
2473         sbintime_t val;
2474         uint32_t val32;
2475 #ifdef MAC
2476         struct mac extmac;
2477 #endif
2478
2479         CURVNET_SET(so->so_vnet);
2480         error = 0;
2481         if (sopt->sopt_level != SOL_SOCKET) {
2482                 if (so->so_proto->pr_ctloutput != NULL) {
2483                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
2484                         CURVNET_RESTORE();
2485                         return (error);
2486                 }
2487                 error = ENOPROTOOPT;
2488         } else {
2489                 switch (sopt->sopt_name) {
2490                 case SO_ACCEPTFILTER:
2491                         error = do_setopt_accept_filter(so, sopt);
2492                         if (error)
2493                                 goto bad;
2494                         break;
2495
2496                 case SO_LINGER:
2497                         error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2498                         if (error)
2499                                 goto bad;
2500
2501                         SOCK_LOCK(so);
2502                         so->so_linger = l.l_linger;
2503                         if (l.l_onoff)
2504                                 so->so_options |= SO_LINGER;
2505                         else
2506                                 so->so_options &= ~SO_LINGER;
2507                         SOCK_UNLOCK(so);
2508                         break;
2509
2510                 case SO_DEBUG:
2511                 case SO_KEEPALIVE:
2512                 case SO_DONTROUTE:
2513                 case SO_USELOOPBACK:
2514                 case SO_BROADCAST:
2515                 case SO_REUSEADDR:
2516                 case SO_REUSEPORT:
2517                 case SO_OOBINLINE:
2518                 case SO_TIMESTAMP:
2519                 case SO_BINTIME:
2520                 case SO_NOSIGPIPE:
2521                 case SO_NO_DDP:
2522                 case SO_NO_OFFLOAD:
2523                         error = sooptcopyin(sopt, &optval, sizeof optval,
2524                             sizeof optval);
2525                         if (error)
2526                                 goto bad;
2527                         SOCK_LOCK(so);
2528                         if (optval)
2529                                 so->so_options |= sopt->sopt_name;
2530                         else
2531                                 so->so_options &= ~sopt->sopt_name;
2532                         SOCK_UNLOCK(so);
2533                         break;
2534
2535                 case SO_SETFIB:
2536                         error = sooptcopyin(sopt, &optval, sizeof optval,
2537                             sizeof optval);
2538                         if (error)
2539                                 goto bad;
2540
2541                         if (optval < 0 || optval >= rt_numfibs) {
2542                                 error = EINVAL;
2543                                 goto bad;
2544                         }
2545                         if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
2546                            (so->so_proto->pr_domain->dom_family == PF_INET6) ||
2547                            (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
2548                                 so->so_fibnum = optval;
2549                         else
2550                                 so->so_fibnum = 0;
2551                         break;
2552
2553                 case SO_USER_COOKIE:
2554                         error = sooptcopyin(sopt, &val32, sizeof val32,
2555                             sizeof val32);
2556                         if (error)
2557                                 goto bad;
2558                         so->so_user_cookie = val32;
2559                         break;
2560
2561                 case SO_SNDBUF:
2562                 case SO_RCVBUF:
2563                 case SO_SNDLOWAT:
2564                 case SO_RCVLOWAT:
2565                         error = sooptcopyin(sopt, &optval, sizeof optval,
2566                             sizeof optval);
2567                         if (error)
2568                                 goto bad;
2569
2570                         /*
2571                          * Values < 1 make no sense for any of these options,
2572                          * so disallow them.
2573                          */
2574                         if (optval < 1) {
2575                                 error = EINVAL;
2576                                 goto bad;
2577                         }
2578
2579                         switch (sopt->sopt_name) {
2580                         case SO_SNDBUF:
2581                         case SO_RCVBUF:
2582                                 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2583                                     &so->so_snd : &so->so_rcv, (u_long)optval,
2584                                     so, curthread) == 0) {
2585                                         error = ENOBUFS;
2586                                         goto bad;
2587                                 }
2588                                 (sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2589                                     &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2590                                 break;
2591
2592                         /*
2593                          * Make sure the low-water is never greater than the
2594                          * high-water.
2595                          */
2596                         case SO_SNDLOWAT:
2597                                 SOCKBUF_LOCK(&so->so_snd);
2598                                 so->so_snd.sb_lowat =
2599                                     (optval > so->so_snd.sb_hiwat) ?
2600                                     so->so_snd.sb_hiwat : optval;
2601                                 SOCKBUF_UNLOCK(&so->so_snd);
2602                                 break;
2603                         case SO_RCVLOWAT:
2604                                 SOCKBUF_LOCK(&so->so_rcv);
2605                                 so->so_rcv.sb_lowat =
2606                                     (optval > so->so_rcv.sb_hiwat) ?
2607                                     so->so_rcv.sb_hiwat : optval;
2608                                 SOCKBUF_UNLOCK(&so->so_rcv);
2609                                 break;
2610                         }
2611                         break;
2612
2613                 case SO_SNDTIMEO:
2614                 case SO_RCVTIMEO:
2615 #ifdef COMPAT_FREEBSD32
2616                         if (SV_CURPROC_FLAG(SV_ILP32)) {
2617                                 struct timeval32 tv32;
2618
2619                                 error = sooptcopyin(sopt, &tv32, sizeof tv32,
2620                                     sizeof tv32);
2621                                 CP(tv32, tv, tv_sec);
2622                                 CP(tv32, tv, tv_usec);
2623                         } else
2624 #endif
2625                                 error = sooptcopyin(sopt, &tv, sizeof tv,
2626                                     sizeof tv);
2627                         if (error)
2628                                 goto bad;
2629                         if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
2630                             tv.tv_usec >= 1000000) {
2631                                 error = EDOM;
2632                                 goto bad;
2633                         }
2634                         if (tv.tv_sec > INT32_MAX)
2635                                 val = SBT_MAX;
2636                         else
2637                                 val = tvtosbt(tv);
2638                         switch (sopt->sopt_name) {
2639                         case SO_SNDTIMEO:
2640                                 so->so_snd.sb_timeo = val;
2641                                 break;
2642                         case SO_RCVTIMEO:
2643                                 so->so_rcv.sb_timeo = val;
2644                                 break;
2645                         }
2646                         break;
2647
2648                 case SO_LABEL:
2649 #ifdef MAC
2650                         error = sooptcopyin(sopt, &extmac, sizeof extmac,
2651                             sizeof extmac);
2652                         if (error)
2653                                 goto bad;
2654                         error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2655                             so, &extmac);
2656 #else
2657                         error = EOPNOTSUPP;
2658 #endif
2659                         break;
2660
2661                 default:
2662                         if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
2663                                 error = hhook_run_socket(so, sopt,
2664                                     HHOOK_SOCKET_OPT);
2665                         else
2666                                 error = ENOPROTOOPT;
2667                         break;
2668                 }
2669                 if (error == 0 && so->so_proto->pr_ctloutput != NULL)
2670                         (void)(*so->so_proto->pr_ctloutput)(so, sopt);
2671         }
2672 bad:
2673         CURVNET_RESTORE();
2674         return (error);
2675 }
2676
2677 /*
2678  * Helper routine for getsockopt.
2679  */
2680 int
2681 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2682 {
2683         int     error;
2684         size_t  valsize;
2685
2686         error = 0;
2687
2688         /*
2689          * Documented get behavior is that we always return a value, possibly
2690          * truncated to fit in the user's buffer.  Traditional behavior is
2691          * that we always tell the user precisely how much we copied, rather
2692          * than something useful like the total amount we had available for
2693          * her.  Note that this interface is not idempotent; the entire
2694          * answer must generated ahead of time.
2695          */
2696         valsize = min(len, sopt->sopt_valsize);
2697         sopt->sopt_valsize = valsize;
2698         if (sopt->sopt_val != NULL) {
2699                 if (sopt->sopt_td != NULL)
2700                         error = copyout(buf, sopt->sopt_val, valsize);
2701                 else
2702                         bcopy(buf, sopt->sopt_val, valsize);
2703         }
2704         return (error);
2705 }
2706
2707 int
2708 sogetopt(struct socket *so, struct sockopt *sopt)
2709 {
2710         int     error, optval;
2711         struct  linger l;
2712         struct  timeval tv;
2713 #ifdef MAC
2714         struct mac extmac;
2715 #endif
2716
2717         CURVNET_SET(so->so_vnet);
2718         error = 0;
2719         if (sopt->sopt_level != SOL_SOCKET) {
2720                 if (so->so_proto->pr_ctloutput != NULL)
2721                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
2722                 else
2723                         error = ENOPROTOOPT;
2724                 CURVNET_RESTORE();
2725                 return (error);
2726         } else {
2727                 switch (sopt->sopt_name) {
2728                 case SO_ACCEPTFILTER:
2729                         error = do_getopt_accept_filter(so, sopt);
2730                         break;
2731
2732                 case SO_LINGER:
2733                         SOCK_LOCK(so);
2734                         l.l_onoff = so->so_options & SO_LINGER;
2735                         l.l_linger = so->so_linger;
2736                         SOCK_UNLOCK(so);
2737                         error = sooptcopyout(sopt, &l, sizeof l);
2738                         break;
2739
2740                 case SO_USELOOPBACK:
2741                 case SO_DONTROUTE:
2742                 case SO_DEBUG:
2743                 case SO_KEEPALIVE:
2744                 case SO_REUSEADDR:
2745                 case SO_REUSEPORT:
2746                 case SO_BROADCAST:
2747                 case SO_OOBINLINE:
2748                 case SO_ACCEPTCONN:
2749                 case SO_TIMESTAMP:
2750                 case SO_BINTIME:
2751                 case SO_NOSIGPIPE:
2752                         optval = so->so_options & sopt->sopt_name;
2753 integer:
2754                         error = sooptcopyout(sopt, &optval, sizeof optval);
2755                         break;
2756
2757                 case SO_TYPE:
2758                         optval = so->so_type;
2759                         goto integer;
2760
2761                 case SO_PROTOCOL:
2762                         optval = so->so_proto->pr_protocol;
2763                         goto integer;
2764
2765                 case SO_ERROR:
2766                         SOCK_LOCK(so);
2767                         optval = so->so_error;
2768                         so->so_error = 0;
2769                         SOCK_UNLOCK(so);
2770                         goto integer;
2771
2772                 case SO_SNDBUF:
2773                         optval = so->so_snd.sb_hiwat;
2774                         goto integer;
2775
2776                 case SO_RCVBUF:
2777                         optval = so->so_rcv.sb_hiwat;
2778                         goto integer;
2779
2780                 case SO_SNDLOWAT:
2781                         optval = so->so_snd.sb_lowat;
2782                         goto integer;
2783
2784                 case SO_RCVLOWAT:
2785                         optval = so->so_rcv.sb_lowat;
2786                         goto integer;
2787
2788                 case SO_SNDTIMEO:
2789                 case SO_RCVTIMEO:
2790                         tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
2791                             so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2792 #ifdef COMPAT_FREEBSD32
2793                         if (SV_CURPROC_FLAG(SV_ILP32)) {
2794                                 struct timeval32 tv32;
2795
2796                                 CP(tv, tv32, tv_sec);
2797                                 CP(tv, tv32, tv_usec);
2798                                 error = sooptcopyout(sopt, &tv32, sizeof tv32);
2799                         } else
2800 #endif
2801                                 error = sooptcopyout(sopt, &tv, sizeof tv);
2802                         break;
2803
2804                 case SO_LABEL:
2805 #ifdef MAC
2806                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2807                             sizeof(extmac));
2808                         if (error)
2809                                 goto bad;
2810                         error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2811                             so, &extmac);
2812                         if (error)
2813                                 goto bad;
2814                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
2815 #else
2816                         error = EOPNOTSUPP;
2817 #endif
2818                         break;
2819
2820                 case SO_PEERLABEL:
2821 #ifdef MAC
2822                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2823                             sizeof(extmac));
2824                         if (error)
2825                                 goto bad;
2826                         error = mac_getsockopt_peerlabel(
2827                             sopt->sopt_td->td_ucred, so, &extmac);
2828                         if (error)
2829                                 goto bad;
2830                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
2831 #else
2832                         error = EOPNOTSUPP;
2833 #endif
2834                         break;
2835
2836                 case SO_LISTENQLIMIT:
2837                         optval = so->so_qlimit;
2838                         goto integer;
2839
2840                 case SO_LISTENQLEN:
2841                         optval = so->so_qlen;
2842                         goto integer;
2843
2844                 case SO_LISTENINCQLEN:
2845                         optval = so->so_incqlen;
2846                         goto integer;
2847
2848                 default:
2849                         if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
2850                                 error = hhook_run_socket(so, sopt,
2851                                     HHOOK_SOCKET_OPT);
2852                         else
2853                                 error = ENOPROTOOPT;
2854                         break;
2855                 }
2856         }
2857 #ifdef MAC
2858 bad:
2859 #endif
2860         CURVNET_RESTORE();
2861         return (error);
2862 }
2863
2864 int
2865 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2866 {
2867         struct mbuf *m, *m_prev;
2868         int sopt_size = sopt->sopt_valsize;
2869
2870         MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
2871         if (m == NULL)
2872                 return ENOBUFS;
2873         if (sopt_size > MLEN) {
2874                 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
2875                 if ((m->m_flags & M_EXT) == 0) {
2876                         m_free(m);
2877                         return ENOBUFS;
2878                 }
2879                 m->m_len = min(MCLBYTES, sopt_size);
2880         } else {
2881                 m->m_len = min(MLEN, sopt_size);
2882         }
2883         sopt_size -= m->m_len;
2884         *mp = m;
2885         m_prev = m;
2886
2887         while (sopt_size) {
2888                 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
2889                 if (m == NULL) {
2890                         m_freem(*mp);
2891                         return ENOBUFS;
2892                 }
2893                 if (sopt_size > MLEN) {
2894                         MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
2895                             M_NOWAIT);
2896                         if ((m->m_flags & M_EXT) == 0) {
2897                                 m_freem(m);
2898                                 m_freem(*mp);
2899                                 return ENOBUFS;
2900                         }
2901                         m->m_len = min(MCLBYTES, sopt_size);
2902                 } else {
2903                         m->m_len = min(MLEN, sopt_size);
2904                 }
2905                 sopt_size -= m->m_len;
2906                 m_prev->m_next = m;
2907                 m_prev = m;
2908         }
2909         return (0);
2910 }
2911
2912 int
2913 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2914 {
2915         struct mbuf *m0 = m;
2916
2917         if (sopt->sopt_val == NULL)
2918                 return (0);
2919         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2920                 if (sopt->sopt_td != NULL) {
2921                         int error;
2922
2923                         error = copyin(sopt->sopt_val, mtod(m, char *),
2924                             m->m_len);
2925                         if (error != 0) {
2926                                 m_freem(m0);
2927                                 return(error);
2928                         }
2929                 } else
2930                         bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2931                 sopt->sopt_valsize -= m->m_len;
2932                 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2933                 m = m->m_next;
2934         }
2935         if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
2936                 panic("ip6_sooptmcopyin");
2937         return (0);
2938 }
2939
2940 int
2941 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2942 {
2943         struct mbuf *m0 = m;
2944         size_t valsize = 0;
2945
2946         if (sopt->sopt_val == NULL)
2947                 return (0);
2948         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2949                 if (sopt->sopt_td != NULL) {
2950                         int error;
2951
2952                         error = copyout(mtod(m, char *), sopt->sopt_val,
2953                             m->m_len);
2954                         if (error != 0) {
2955                                 m_freem(m0);
2956                                 return(error);
2957                         }
2958                 } else
2959                         bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2960                 sopt->sopt_valsize -= m->m_len;
2961                 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2962                 valsize += m->m_len;
2963                 m = m->m_next;
2964         }
2965         if (m != NULL) {
2966                 /* enough soopt buffer should be given from user-land */
2967                 m_freem(m0);
2968                 return(EINVAL);
2969         }
2970         sopt->sopt_valsize = valsize;
2971         return (0);
2972 }
2973
2974 /*
2975  * sohasoutofband(): protocol notifies socket layer of the arrival of new
2976  * out-of-band data, which will then notify socket consumers.
2977  */
2978 void
2979 sohasoutofband(struct socket *so)
2980 {
2981
2982         if (so->so_sigio != NULL)
2983                 pgsigio(&so->so_sigio, SIGURG, 0);
2984         selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2985 }
2986
2987 int
2988 sopoll(struct socket *so, int events, struct ucred *active_cred,
2989     struct thread *td)
2990 {
2991
2992         /*
2993          * We do not need to set or assert curvnet as long as everyone uses
2994          * sopoll_generic().
2995          */
2996         return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2997             td));
2998 }
2999
3000 int
3001 sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
3002     struct thread *td)
3003 {
3004         int revents = 0;
3005
3006         SOCKBUF_LOCK(&so->so_snd);
3007         SOCKBUF_LOCK(&so->so_rcv);
3008         if (events & (POLLIN | POLLRDNORM))
3009                 if (soreadabledata(so))
3010                         revents |= events & (POLLIN | POLLRDNORM);
3011
3012         if (events & (POLLOUT | POLLWRNORM))
3013                 if (sowriteable(so))
3014                         revents |= events & (POLLOUT | POLLWRNORM);
3015
3016         if (events & (POLLPRI | POLLRDBAND))
3017                 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
3018                         revents |= events & (POLLPRI | POLLRDBAND);
3019
3020         if ((events & POLLINIGNEOF) == 0) {
3021                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3022                         revents |= events & (POLLIN | POLLRDNORM);
3023                         if (so->so_snd.sb_state & SBS_CANTSENDMORE)
3024                                 revents |= POLLHUP;
3025                 }
3026         }
3027
3028         if (revents == 0) {
3029                 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
3030                         selrecord(td, &so->so_rcv.sb_sel);
3031                         so->so_rcv.sb_flags |= SB_SEL;
3032                 }
3033
3034                 if (events & (POLLOUT | POLLWRNORM)) {
3035                         selrecord(td, &so->so_snd.sb_sel);
3036                         so->so_snd.sb_flags |= SB_SEL;
3037                 }
3038         }
3039
3040         SOCKBUF_UNLOCK(&so->so_rcv);
3041         SOCKBUF_UNLOCK(&so->so_snd);
3042         return (revents);
3043 }
3044
3045 int
3046 soo_kqfilter(struct file *fp, struct knote *kn)
3047 {
3048         struct socket *so = kn->kn_fp->f_data;
3049         struct sockbuf *sb;
3050
3051         switch (kn->kn_filter) {
3052         case EVFILT_READ:
3053                 if (so->so_options & SO_ACCEPTCONN)
3054                         kn->kn_fop = &solisten_filtops;
3055                 else
3056                         kn->kn_fop = &soread_filtops;
3057                 sb = &so->so_rcv;
3058                 break;
3059         case EVFILT_WRITE:
3060                 kn->kn_fop = &sowrite_filtops;
3061                 sb = &so->so_snd;
3062                 break;
3063         default:
3064                 return (EINVAL);
3065         }
3066
3067         SOCKBUF_LOCK(sb);
3068         knlist_add(&sb->sb_sel.si_note, kn, 1);
3069         sb->sb_flags |= SB_KNOTE;
3070         SOCKBUF_UNLOCK(sb);
3071         return (0);
3072 }
3073
3074 /*
3075  * Some routines that return EOPNOTSUPP for entry points that are not
3076  * supported by a protocol.  Fill in as needed.
3077  */
/*
 * Stub for the accept entry point of a protocol that does not support it:
 * the arguments are ignored and EOPNOTSUPP is always returned.
 */
int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{

	return (EOPNOTSUPP);
}
3084
/*
 * Stub for the attach entry point of a protocol that does not support it:
 * the arguments are ignored and EOPNOTSUPP is always returned.
 */
int
pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
{

	return (EOPNOTSUPP);
}
3091
/*
 * Stub for the bind entry point of a protocol that does not support it:
 * the arguments are ignored and EOPNOTSUPP is always returned.
 */
int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (EOPNOTSUPP);
}
3098
/*
 * Stub for the bindat entry point of a protocol that does not support it:
 * the arguments are ignored and EOPNOTSUPP is always returned.
 */
int
pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
    struct thread *td)
{

	return (EOPNOTSUPP);
}
3106
/*
 * Stub for the connect entry point of a protocol that does not support it:
 * the arguments are ignored and EOPNOTSUPP is always returned.
 */
int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (EOPNOTSUPP);
}
3113
/*
 * Stub for the connectat entry point of a protocol that does not support
 * it: the arguments are ignored and EOPNOTSUPP is always returned.
 */
int
pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
    struct thread *td)
{

	return (EOPNOTSUPP);
}
3121
/*
 * Stub for the connect2 entry point of a protocol that does not support
 * it: both socket arguments are ignored and EOPNOTSUPP is always returned.
 */
int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{

	return (EOPNOTSUPP);
}
3128
3129 int
3130 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
3131     struct ifnet *ifp, struct thread *td)
3132 {
3133
3134         return EOPNOTSUPP;
3135 }
3136
/*
 * Stub for the disconnect entry point of a protocol that does not support
 * it: the argument is ignored and EOPNOTSUPP is always returned.
 */
int
pru_disconnect_notsupp(struct socket *so)
{

	return (EOPNOTSUPP);
}
3143
/*
 * Stub for the listen entry point of a protocol that does not support it:
 * the arguments are ignored and EOPNOTSUPP is always returned.
 */
int
pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
{

	return (EOPNOTSUPP);
}
3150
/*
 * Stub for the peeraddr entry point of a protocol that does not support it:
 * the arguments are ignored (*nam is not written) and EOPNOTSUPP is always
 * returned.
 */
int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{

	return (EOPNOTSUPP);
}
3157
/*
 * Stub for the rcvd entry point of a protocol that does not support it:
 * the arguments are ignored and EOPNOTSUPP is always returned.
 */
int
pru_rcvd_notsupp(struct socket *so, int flags)
{

	return (EOPNOTSUPP);
}
3164
/*
 * Stub for the rcvoob entry point of a protocol that does not support it:
 * the arguments are ignored and EOPNOTSUPP is always returned.
 */
int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{

	return (EOPNOTSUPP);
}
3171
/*
 * Stub for the send entry point of a protocol that does not support it:
 * the arguments are ignored (the mbufs passed in are not touched here) and
 * EOPNOTSUPP is always returned.
 */
int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *addr, struct mbuf *control, struct thread *td)
{

	return (EOPNOTSUPP);
}
3179
3180 /*
3181  * This isn't really a ``null'' operation, but it's the default one and
3182  * doesn't do anything destructive.
3183  */
3184 int
3185 pru_sense_null(struct socket *so, struct stat *sb)
3186 {
3187
3188         sb->st_blksize = so->so_snd.sb_hiwat;
3189         return 0;
3190 }
3191
/*
 * Stub for the shutdown entry point of a protocol that does not support it:
 * the argument is ignored and EOPNOTSUPP is always returned.
 */
int
pru_shutdown_notsupp(struct socket *so)
{

	return (EOPNOTSUPP);
}
3198
/*
 * Stub for the sockaddr entry point of a protocol that does not support it:
 * the arguments are ignored (*nam is not written) and EOPNOTSUPP is always
 * returned.
 */
int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{

	return (EOPNOTSUPP);
}
3205
/*
 * Stub for the sosend entry point of a protocol that does not support it:
 * the arguments are ignored (neither uio nor the mbufs are touched here)
 * and EOPNOTSUPP is always returned.
 */
int
pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{

	return (EOPNOTSUPP);
}
3213
/*
 * Stub for the soreceive entry point of a protocol that does not support
 * it: the arguments are ignored (no output parameter is written) and
 * EOPNOTSUPP is always returned.
 */
int
pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{

	return (EOPNOTSUPP);
}
3221
/*
 * Stub for the sopoll entry point of a protocol that does not support it:
 * the arguments are ignored and EOPNOTSUPP is always returned.  Note that
 * sopoll() passes a pru_sopoll result straight back to its caller, so the
 * caller sees this errno value as the poll result.
 */
int
pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
    struct thread *td)
{

	return (EOPNOTSUPP);
}
3229
3230 static void
3231 filt_sordetach(struct knote *kn)
3232 {
3233         struct socket *so = kn->kn_fp->f_data;
3234
3235         SOCKBUF_LOCK(&so->so_rcv);
3236         knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
3237         if (knlist_empty(&so->so_rcv.sb_sel.si_note))
3238                 so->so_rcv.sb_flags &= ~SB_KNOTE;
3239         SOCKBUF_UNLOCK(&so->so_rcv);
3240 }
3241
3242 /*ARGSUSED*/
3243 static int
3244 filt_soread(struct knote *kn, long hint)
3245 {
3246         struct socket *so;
3247
3248         so = kn->kn_fp->f_data;
3249         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3250
3251         kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3252         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3253                 kn->kn_flags |= EV_EOF;
3254                 kn->kn_fflags = so->so_error;
3255                 return (1);
3256         } else if (so->so_error)        /* temporary udp error */
3257                 return (1);
3258
3259         if (kn->kn_sfflags & NOTE_LOWAT) {
3260                 if (kn->kn_data >= kn->kn_sdata)
3261                         return 1;
3262         } else {
3263                 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat)
3264                         return 1;
3265         }
3266
3267         if (V_socket_hhh[HHOOK_FILT_SOREAD]->hhh_nhooks > 0)
3268                 /* This hook returning non-zero indicates an event, not error */
3269                 return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD));
3270
3271         return (0);
3272 }
3273
3274 static void
3275 filt_sowdetach(struct knote *kn)
3276 {
3277         struct socket *so = kn->kn_fp->f_data;
3278
3279         SOCKBUF_LOCK(&so->so_snd);
3280         knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
3281         if (knlist_empty(&so->so_snd.sb_sel.si_note))
3282                 so->so_snd.sb_flags &= ~SB_KNOTE;
3283         SOCKBUF_UNLOCK(&so->so_snd);
3284 }
3285
3286 /*ARGSUSED*/
3287 static int
3288 filt_sowrite(struct knote *kn, long hint)
3289 {
3290         struct socket *so;
3291
3292         so = kn->kn_fp->f_data;
3293         SOCKBUF_LOCK_ASSERT(&so->so_snd);
3294         kn->kn_data = sbspace(&so->so_snd);
3295
3296         if (V_socket_hhh[HHOOK_FILT_SOWRITE]->hhh_nhooks > 0)
3297                 hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE);
3298
3299         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3300                 kn->kn_flags |= EV_EOF;
3301                 kn->kn_fflags = so->so_error;
3302                 return (1);
3303         } else if (so->so_error)        /* temporary udp error */
3304                 return (1);
3305         else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3306             (so->so_proto->pr_flags & PR_CONNREQUIRED))
3307                 return (0);
3308         else if (kn->kn_sfflags & NOTE_LOWAT)
3309                 return (kn->kn_data >= kn->kn_sdata);
3310         else
3311                 return (kn->kn_data >= so->so_snd.sb_lowat);
3312 }
3313
3314 /*ARGSUSED*/
3315 static int
3316 filt_solisten(struct knote *kn, long hint)
3317 {
3318         struct socket *so = kn->kn_fp->f_data;
3319
3320         kn->kn_data = so->so_qlen;
3321         return (!TAILQ_EMPTY(&so->so_comp));
3322 }
3323
3324 int
3325 socheckuid(struct socket *so, uid_t uid)
3326 {
3327
3328         if (so == NULL)
3329                 return (EPERM);
3330         if (so->so_cred->cr_uid != uid)
3331                 return (EPERM);
3332         return (0);
3333 }
3334
3335 /*
3336  * These functions are used by protocols to notify the socket layer (and its
3337  * consumers) of state changes in the sockets driven by protocol-side events.
3338  */
3339
3340 /*
3341  * Procedures to manipulate state flags of socket and do appropriate wakeups.
3342  *
3343  * Normal sequence from the active (originating) side is that
3344  * soisconnecting() is called during processing of connect() call, resulting
3345  * in an eventual call to soisconnected() if/when the connection is
3346  * established.  When the connection is torn down soisdisconnecting() is
3347  * called during processing of disconnect() call, and soisdisconnected() is
3348  * called when the connection to the peer is totally severed.  The semantics
3349  * of these routines are such that connectionless protocols can call
3350  * soisconnected() and soisdisconnected() only, bypassing the in-progress
3351  * calls when setting up a ``connection'' takes no time.
3352  *
3353  * From the passive side, a socket is created with two queues of sockets:
3354  * so_incomp for connections in progress and so_comp for connections already
3355  * made and awaiting user acceptance.  As a protocol is preparing incoming
3356  * connections, it creates a socket structure queued on so_incomp by calling
3357  * sonewconn().  When the connection is established, soisconnected() is
3358  * called, and transfers the socket structure to so_comp, making it available
3359  * to accept().
3360  *
3361  * If a socket is closed with sockets on either so_incomp or so_comp, these
3362  * sockets are dropped.
3363  *
3364  * If higher-level protocols are implemented in the kernel, the wakeups done
3365  * here will sometimes cause software-interrupt process scheduling.
3366  */
3367 void
3368 soisconnecting(struct socket *so)
3369 {
3370
3371         SOCK_LOCK(so);
3372         so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3373         so->so_state |= SS_ISCONNECTING;
3374         SOCK_UNLOCK(so);
3375 }
3376
3377 void
3378 soisconnected(struct socket *so)
3379 {
3380         struct socket *head;
3381         int ret;
3382
3383 restart:
3384         ACCEPT_LOCK();
3385         SOCK_LOCK(so);
3386         so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3387         so->so_state |= SS_ISCONNECTED;
3388         head = so->so_head;
3389         if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
3390                 if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3391                         SOCK_UNLOCK(so);
3392                         TAILQ_REMOVE(&head->so_incomp, so, so_list);
3393                         head->so_incqlen--;
3394                         so->so_qstate &= ~SQ_INCOMP;
3395                         TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
3396                         head->so_qlen++;
3397                         so->so_qstate |= SQ_COMP;
3398                         ACCEPT_UNLOCK();
3399                         sorwakeup(head);
3400                         wakeup_one(&head->so_timeo);
3401                 } else {
3402                         ACCEPT_UNLOCK();
3403                         soupcall_set(so, SO_RCV,
3404                             head->so_accf->so_accept_filter->accf_callback,
3405                             head->so_accf->so_accept_filter_arg);
3406                         so->so_options &= ~SO_ACCEPTFILTER;
3407                         ret = head->so_accf->so_accept_filter->accf_callback(so,
3408                             head->so_accf->so_accept_filter_arg, M_NOWAIT);
3409                         if (ret == SU_ISCONNECTED)
3410                                 soupcall_clear(so, SO_RCV);
3411                         SOCK_UNLOCK(so);
3412                         if (ret == SU_ISCONNECTED)
3413                                 goto restart;
3414                 }
3415                 return;
3416         }
3417         SOCK_UNLOCK(so);
3418         ACCEPT_UNLOCK();
3419         wakeup(&so->so_timeo);
3420         sorwakeup(so);
3421         sowwakeup(so);
3422 }
3423
3424 void
3425 soisdisconnecting(struct socket *so)
3426 {
3427
3428         /*
3429          * Note: This code assumes that SOCK_LOCK(so) and
3430          * SOCKBUF_LOCK(&so->so_rcv) are the same.
3431          */
3432         SOCKBUF_LOCK(&so->so_rcv);
3433         so->so_state &= ~SS_ISCONNECTING;
3434         so->so_state |= SS_ISDISCONNECTING;
3435         so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3436         sorwakeup_locked(so);
3437         SOCKBUF_LOCK(&so->so_snd);
3438         so->so_snd.sb_state |= SBS_CANTSENDMORE;
3439         sowwakeup_locked(so);
3440         wakeup(&so->so_timeo);
3441 }
3442
3443 void
3444 soisdisconnected(struct socket *so)
3445 {
3446
3447         /*
3448          * Note: This code assumes that SOCK_LOCK(so) and
3449          * SOCKBUF_LOCK(&so->so_rcv) are the same.
3450          */
3451         SOCKBUF_LOCK(&so->so_rcv);
3452         so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3453         so->so_state |= SS_ISDISCONNECTED;
3454         so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3455         sorwakeup_locked(so);
3456         SOCKBUF_LOCK(&so->so_snd);
3457         so->so_snd.sb_state |= SBS_CANTSENDMORE;
3458         sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
3459         sowwakeup_locked(so);
3460         wakeup(&so->so_timeo);
3461 }
3462
3463 /*
3464  * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3465  */
3466 struct sockaddr *
3467 sodupsockaddr(const struct sockaddr *sa, int mflags)
3468 {
3469         struct sockaddr *sa2;
3470
3471         sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3472         if (sa2)
3473                 bcopy(sa, sa2, sa->sa_len);
3474         return sa2;
3475 }
3476
3477 /*
3478  * Register per-socket buffer upcalls.
3479  */
3480 void
3481 soupcall_set(struct socket *so, int which,
3482     int (*func)(struct socket *, void *, int), void *arg)
3483 {
3484         struct sockbuf *sb;
3485
3486         switch (which) {
3487         case SO_RCV:
3488                 sb = &so->so_rcv;
3489                 break;
3490         case SO_SND:
3491                 sb = &so->so_snd;
3492                 break;
3493         default:
3494                 panic("soupcall_set: bad which");
3495         }
3496         SOCKBUF_LOCK_ASSERT(sb);
3497 #if 0
3498         /* XXX: accf_http actually wants to do this on purpose. */
3499         KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
3500 #endif
3501         sb->sb_upcall = func;
3502         sb->sb_upcallarg = arg;
3503         sb->sb_flags |= SB_UPCALL;
3504 }
3505
3506 void
3507 soupcall_clear(struct socket *so, int which)
3508 {
3509         struct sockbuf *sb;
3510
3511         switch (which) {
3512         case SO_RCV:
3513                 sb = &so->so_rcv;
3514                 break;
3515         case SO_SND:
3516                 sb = &so->so_snd;
3517                 break;
3518         default:
3519                 panic("soupcall_clear: bad which");
3520         }
3521         SOCKBUF_LOCK_ASSERT(sb);
3522         KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
3523         sb->sb_upcall = NULL;
3524         sb->sb_upcallarg = NULL;
3525         sb->sb_flags &= ~SB_UPCALL;
3526 }
3527
3528 /*
3529  * Create an external-format (``xsocket'') structure using the information in
3530  * the kernel-format socket structure pointed to by so.  This is done to
3531  * reduce the spew of irrelevant information over this interface, to isolate
3532  * user code from changes in the kernel structure, and potentially to provide
3533  * information-hiding if we decide that some of this information should be
3534  * hidden from users.
3535  */
3536 void
3537 sotoxsocket(struct socket *so, struct xsocket *xso)
3538 {
3539
3540         xso->xso_len = sizeof *xso;
3541         xso->xso_so = so;
3542         xso->so_type = so->so_type;
3543         xso->so_options = so->so_options;
3544         xso->so_linger = so->so_linger;
3545         xso->so_state = so->so_state;
3546         xso->so_pcb = so->so_pcb;
3547         xso->xso_protocol = so->so_proto->pr_protocol;
3548         xso->xso_family = so->so_proto->pr_domain->dom_family;
3549         xso->so_qlen = so->so_qlen;
3550         xso->so_incqlen = so->so_incqlen;
3551         xso->so_qlimit = so->so_qlimit;
3552         xso->so_timeo = so->so_timeo;
3553         xso->so_error = so->so_error;
3554         xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
3555         xso->so_oobmark = so->so_oobmark;
3556         sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3557         sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3558         xso->so_uid = so->so_cred->cr_uid;
3559 }
3560
3561
3562 /*
3563  * Socket accessor functions to provide external consumers with
3564  * a safe interface to socket state
3565  *
3566  */
3567
3568 void
3569 so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *),
3570     void *arg)
3571 {
3572
3573         TAILQ_FOREACH(so, &so->so_comp, so_list)
3574                 func(so, arg);
3575 }
3576
3577 struct sockbuf *
3578 so_sockbuf_rcv(struct socket *so)
3579 {
3580
3581         return (&so->so_rcv);
3582 }
3583
3584 struct sockbuf *
3585 so_sockbuf_snd(struct socket *so)
3586 {
3587
3588         return (&so->so_snd);
3589 }
3590
3591 int
3592 so_state_get(const struct socket *so)
3593 {
3594
3595         return (so->so_state);
3596 }
3597
3598 void
3599 so_state_set(struct socket *so, int val)
3600 {
3601
3602         so->so_state = val;
3603 }
3604
3605 int
3606 so_options_get(const struct socket *so)
3607 {
3608
3609         return (so->so_options);
3610 }
3611
3612 void
3613 so_options_set(struct socket *so, int val)
3614 {
3615
3616         so->so_options = val;
3617 }
3618
3619 int
3620 so_error_get(const struct socket *so)
3621 {
3622
3623         return (so->so_error);
3624 }
3625
3626 void
3627 so_error_set(struct socket *so, int val)
3628 {
3629
3630         so->so_error = val;
3631 }
3632
3633 int
3634 so_linger_get(const struct socket *so)
3635 {
3636
3637         return (so->so_linger);
3638 }
3639
3640 void
3641 so_linger_set(struct socket *so, int val)
3642 {
3643
3644         so->so_linger = val;
3645 }
3646
3647 struct protosw *
3648 so_protosw_get(const struct socket *so)
3649 {
3650
3651         return (so->so_proto);
3652 }
3653
3654 void
3655 so_protosw_set(struct socket *so, struct protosw *val)
3656 {
3657
3658         so->so_proto = val;
3659 }
3660
/*
 * Function wrapper around sorwakeup(so) for external consumers.
 */
void
so_sorwakeup(struct socket *so)
{

	sorwakeup(so);
}
3667
/*
 * Function wrapper around sowwakeup(so) for external consumers.
 */
void
so_sowwakeup(struct socket *so)
{

	sowwakeup(so);
}
3674
/*
 * Function wrapper around sorwakeup_locked(so) for external consumers.
 * Whatever locking sorwakeup_locked() expects from its caller applies
 * here unchanged.
 */
void
so_sorwakeup_locked(struct socket *so)
{

	sorwakeup_locked(so);
}
3681
/*
 * Function wrapper around sowwakeup_locked(so) for external consumers.
 * Whatever locking sowwakeup_locked() expects from its caller applies
 * here unchanged.
 */
void
so_sowwakeup_locked(struct socket *so)
{

	sowwakeup_locked(so);
}
3688
/*
 * Function wrapper around SOCK_LOCK(so): acquire the socket lock.
 */
void
so_lock(struct socket *so)
{

	SOCK_LOCK(so);
}
3695
/*
 * Function wrapper around SOCK_UNLOCK(so): release the socket lock.
 */
void
so_unlock(struct socket *so)
{

	SOCK_UNLOCK(so);
}