sys/kern/uipc_socket.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1982, 1986, 1988, 1990, 1993
   5  *      The Regents of the University of California.
   6  * Copyright (c) 2004 The FreeBSD Foundation
   7  * Copyright (c) 2004-2008 Robert N. M. Watson
   8  * All rights reserved.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  */
  34
  35 /*
  36  * Comments on the socket life cycle:
  37  *
   38  * soalloc() sets up socket layer state for a socket, called only by
  39  * socreate() and sonewconn().  Socket layer private.
  40  *
  41  * sodealloc() tears down socket layer state for a socket, called only by
  42  * sofree() and sonewconn().  Socket layer private.
  43  *
  44  * pru_attach() associates protocol layer state with an allocated socket;
  45  * called only once, may fail, aborting socket allocation.  This is called
  46  * from socreate() and sonewconn().  Socket layer private.
  47  *
  48  * pru_detach() disassociates protocol layer state from an attached socket,
  49  * and will be called exactly once for sockets in which pru_attach() has
  50  * been successfully called.  If pru_attach() returned an error,
  51  * pru_detach() will not be called.  Socket layer private.
  52  *
  53  * pru_abort() and pru_close() notify the protocol layer that the last
  54  * consumer of a socket is starting to tear down the socket, and that the
  55  * protocol should terminate the connection.  Historically, pru_abort() also
  56  * detached protocol state from the socket state, but this is no longer the
  57  * case.
  58  *
  59  * socreate() creates a socket and attaches protocol state.  This is a public
  60  * interface that may be used by socket layer consumers to create new
  61  * sockets.
  62  *
  63  * sonewconn() creates a socket and attaches protocol state.  This is a
   64  * public interface that may be used by protocols to create new sockets when
  65  * a new connection is received and will be available for accept() on a
  66  * listen socket.
  67  *
  68  * soclose() destroys a socket after possibly waiting for it to disconnect.
  69  * This is a public interface that socket consumers should use to close and
  70  * release a socket when done with it.
  71  *
  72  * soabort() destroys a socket without waiting for it to disconnect (used
  73  * only for incoming connections that are already partially or fully
  74  * connected).  This is used internally by the socket layer when clearing
  75  * listen socket queues (due to overflow or close on the listen socket), but
  76  * is also a public interface protocols may use to abort connections in
  77  * their incomplete listen queues should they no longer be required.  Sockets
  78  * placed in completed connection listen queues should not be aborted for
  79  * reasons described in the comment above the soclose() implementation.  This
  80  * is not a general purpose close routine, and except in the specific
  81  * circumstances described here, should not be used.
  82  *
  83  * sofree() will free a socket and its protocol state if all references on
  84  * the socket have been released, and is the public interface to attempt to
  85  * free a socket when a reference is removed.  This is a socket layer private
  86  * interface.
  87  *
  88  * NOTE: In addition to socreate() and soclose(), which provide a single
  89  * socket reference to the consumer to be managed as required, there are two
  90  * calls to explicitly manage socket references, soref(), and sorele().
  91  * Currently, these are generally required only when transitioning a socket
  92  * from a listen queue to a file descriptor, in order to prevent garbage
  93  * collection of the socket at an untimely moment.  For a number of reasons,
  94  * these interfaces are not preferred, and should be avoided.
  95  *
  96  * NOTE: With regard to VNETs the general rule is that callers do not set
  97  * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
  98  * sofree(), sorele(), sonewconn() and sorflush(), which are usually called
  99  * from a pre-set VNET context.  sopoll() currently does not need a VNET
 100  * context to be set.
 101  */
 102
 103 #include <sys/cdefs.h>
 104 #include "opt_inet.h"
 105 #include "opt_inet6.h"
 106 #include "opt_kern_tls.h"
 107 #include "opt_sctp.h"
 108
 109 #include <sys/param.h>
 110 #include <sys/systm.h>
 111 #include <sys/capsicum.h>
 112 #include <sys/fcntl.h>
 113 #include <sys/limits.h>
 114 #include <sys/lock.h>
 115 #include <sys/mac.h>
 116 #include <sys/malloc.h>
 117 #include <sys/mbuf.h>
 118 #include <sys/mutex.h>
 119 #include <sys/domain.h>
 120 #include <sys/file.h>                   /* for struct knote */
 121 #include <sys/hhook.h>
 122 #include <sys/kernel.h>
 123 #include <sys/khelp.h>
 124 #include <sys/ktls.h>
 125 #include <sys/event.h>
 126 #include <sys/eventhandler.h>
 127 #include <sys/poll.h>
 128 #include <sys/proc.h>
 129 #include <sys/protosw.h>
 130 #include <sys/sbuf.h>
 131 #include <sys/socket.h>
 132 #include <sys/socketvar.h>
 133 #include <sys/resourcevar.h>
 134 #include <net/route.h>
 135 #include <sys/signalvar.h>
 136 #include <sys/stat.h>
 137 #include <sys/sx.h>
 138 #include <sys/sysctl.h>
 139 #include <sys/taskqueue.h>
 140 #include <sys/uio.h>
 141 #include <sys/un.h>
 142 #include <sys/unpcb.h>
 143 #include <sys/jail.h>
 144 #include <sys/syslog.h>
 145 #include <netinet/in.h>
 146 #include <netinet/in_pcb.h>
 147 #include <netinet/tcp.h>
 148
 149 #include <net/vnet.h>
 150
 151 #include <security/mac/mac_framework.h>
 152
 153 #include <vm/uma.h>
 154
 155 #ifdef COMPAT_FREEBSD32
 156 #include <sys/mount.h>
 157 #include <sys/sysent.h>
 158 #include <compat/freebsd32/freebsd32.h>
 159 #endif
 160
/*
 * Forward declarations.  The definitions of these helpers are not part of
 * this section of the file.
 */
static int      soreceive_rcvoob(struct socket *so, struct uio *uio,
                    int flags);
/*
 * Lock, unlock and lock-assertion callbacks handed to knlist_init() for a
 * socket's read-side (so_rdsel) and write-side (so_wrsel) knote lists.
 */
static void     so_rdknl_lock(void *);
static void     so_rdknl_unlock(void *);
static void     so_rdknl_assert_lock(void *, int);
static void     so_wrknl_lock(void *);
static void     so_wrknl_unlock(void *);
static void     so_wrknl_assert_lock(void *, int);

/* Detach and event callbacks wired into the socket filterops tables. */
static void     filt_sordetach(struct knote *kn);
static int      filt_soread(struct knote *kn, long hint);
static void     filt_sowdetach(struct knote *kn);
static int      filt_sowrite(struct knote *kn, long hint);
static int      filt_soempty(struct knote *kn, long hint);
/*
 * Runs the socket helper hook identified by h_id; soalloc() treats a
 * non-zero return as a failure.
 */
static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
/* Non-static function declared through the fo_kqfilter_t function type. */
fo_kqfilter_t   soo_kqfilter;
 177
/* Read-side filter: filt_soread() evaluates, filt_sordetach() detaches. */
static struct filterops soread_filtops = {
        .f_isfd = 1,    /* NOTE(review): presumably "attached to a file descriptor" -- confirm */
        .f_detach = filt_sordetach,
        .f_event = filt_soread,
};
/* Write-side filter: filt_sowrite() evaluates, filt_sowdetach() detaches. */
static struct filterops sowrite_filtops = {
        .f_isfd = 1,
        .f_detach = filt_sowdetach,
        .f_event = filt_sowrite,
};
/*
 * "Empty" filter: evaluated by filt_soempty(); it shares the write-side
 * detach routine with sowrite_filtops.
 */
static struct filterops soempty_filtops = {
        .f_isfd = 1,
        .f_detach = filt_sowdetach,
        .f_event = filt_soempty,
};
 193
so_gen_t        so_gencnt;      /* generation count for sockets */

/* Malloc types for socket names and protocol control blocks. */
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

/*
 * Assert that curvnet is set, reporting the calling function, the line and
 * the socket pointer in the assertion message.  The definition itself ends
 * in a semicolon, so a use written as "VNET_SO_ASSERT(so);" expands to an
 * additional empty statement.
 */
#define VNET_SO_ASSERT(so)                                              \
        VNET_ASSERT(curvnet != NULL,                                    \
            ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));

/*
 * Per-VNET array of socket helper hook heads, indexed by hook subtype
 * (0 .. HHOOK_SOCKET_LAST inclusive).
 */
VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]);
#define V_socket_hhh            VNET(socket_hhh)
 205
 206 /*
 207  * Limit on the number of connections in the listen queue waiting
 208  * for accept(2).
 209  * NB: The original sysctl somaxconn is still available but hidden
 210  * to prevent confusion about the actual purpose of this number.
 211  */
 212 static u_int somaxconn = SOMAXCONN;
 213
 214 static int
 215 sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
 216 {
 217         int error;
 218         int val;
 219
 220         val = somaxconn;
 221         error = sysctl_handle_int(oidp, &val, 0, req);
 222         if (error || !req->newptr )
 223                 return (error);
 224
 225         /*
 226          * The purpose of the UINT_MAX / 3 limit, is so that the formula
 227          *   3 * so_qlimit / 2
 228          * below, will not overflow.
 229          */
 230
 231         if (val < 1 || val > UINT_MAX / 3)
 232                 return (EINVAL);
 233
 234         somaxconn = val;
 235         return (0);
 236 }
/*
 * kern.ipc.soacceptqueue: read/write access to somaxconn through
 * sysctl_somaxconn().
 */
SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int),
    sysctl_somaxconn, "I",
    "Maximum listen socket pending connection accept queue size");
/*
 * Compatibility name for the same limit, registered under the fixed
 * KIPC_SOMAXCONN id with the same handler and flagged CTLFLAG_SKIP.
 */
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0,
    sizeof(int), sysctl_somaxconn, "I",
    "Maximum listen socket pending connection accept queue size (compat)");

/*
 * Count of sockets currently allocated: incremented in soalloc() and
 * decremented in sodealloc(), both under so_global_mtx.  Exported read-only.
 */
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
 249
 250 /*
 251  * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 252  * so_gencnt field.
 253  */
static struct mtx so_global_mtx;
/*
 * NOTE(review): the lock name "so_glabel" looks like a typo for "so_global";
 * it is a runtime-visible string, so confirm nothing depends on it before
 * changing it.
 */
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * General IPC sysctl name space, used by sockets and a variety of other IPC
 * types.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPC");
 263
 264 /*
 265  * Initialize the socket subsystem and set up the socket
 266  * memory allocator.
 267  */
 268 static uma_zone_t socket_zone;
 269 int     maxsockets;
 270
 271 static void
 272 socket_zone_change(void *tag)
 273 {
 274
 275         maxsockets = uma_zone_set_max(socket_zone, maxsockets);
 276 }
 277
 278 static void
 279 socket_hhook_register(int subtype)
 280 {
 281
 282         if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype,
 283             &V_socket_hhh[subtype],
 284             HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 285                 printf("%s: WARNING: unable to register hook\n", __func__);
 286 }
 287
 288 static void
 289 socket_hhook_deregister(int subtype)
 290 {
 291
 292         if (hhook_head_deregister(V_socket_hhh[subtype]) != 0)
 293                 printf("%s: WARNING: unable to deregister hook\n", __func__);
 294 }
 295
 296 static void
 297 socket_init(void *tag)
 298 {
 299
 300         socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
 301             NULL, NULL, UMA_ALIGN_PTR, 0);
 302         maxsockets = uma_zone_set_max(socket_zone, maxsockets);
 303         uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
 304         EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
 305             EVENTHANDLER_PRI_FIRST);
 306 }
 307 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
 308
 309 static void
 310 socket_vnet_init(const void *unused __unused)
 311 {
 312         int i;
 313
 314         /* We expect a contiguous range */
 315         for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
 316                 socket_hhook_register(i);
 317 }
 318 VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
 319     socket_vnet_init, NULL);
 320
 321 static void
 322 socket_vnet_uninit(const void *unused __unused)
 323 {
 324         int i;
 325
 326         for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
 327                 socket_hhook_deregister(i);
 328 }
 329 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
 330     socket_vnet_uninit, NULL);
 331
 332 /*
 333  * Initialise maxsockets.  This SYSINIT must be run after
 334  * tunable_mbinit().
 335  */
 336 static void
 337 init_maxsockets(void *ignored)
 338 {
 339
 340         TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
 341         maxsockets = imax(maxsockets, maxfiles);
 342 }
 343 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
 344
 345 /*
 346  * Sysctl to get and set the maximum global sockets limit.  Notify protocols
 347  * of the change so that they can update their dependent limits as required.
 348  */
 349 static int
 350 sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
 351 {
 352         int error, newmaxsockets;
 353
 354         newmaxsockets = maxsockets;
 355         error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
 356         if (error == 0 && req->newptr && newmaxsockets != maxsockets) {
 357                 if (newmaxsockets > maxsockets &&
 358                     newmaxsockets <= maxfiles) {
 359                         maxsockets = newmaxsockets;
 360                         EVENTHANDLER_INVOKE(maxsockets_change);
 361                 } else
 362                         error = EINVAL;
 363         }
 364         return (error);
 365 }
 366 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets,
 367     CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
 368     &maxsockets, 0, sysctl_maxsockets, "IU",
 369     "Maximum number of sockets available");
 370
 371 /*
 372  * Socket operation routines.  These routines are called by the routines in
 373  * sys_socket.c or from a system process, and implement the semantics of
 374  * socket operations by switching out to the protocol specific routines.
 375  */
 376
 377 /*
 378  * Get a socket structure from our zone, and initialize it.  Note that it
 379  * would probably be better to allocate socket and PCB at the same time, but
 380  * I'm not convinced that all the protocols can be easily modified to do
 381  * this.
 382  *
 383  * soalloc() returns a socket with a ref count of 0.
 384  */
 385 static struct socket *
 386 soalloc(struct vnet *vnet)
 387 {
 388         struct socket *so;
 389
 390         so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
 391         if (so == NULL)
 392                 return (NULL);
 393 #ifdef MAC
 394         if (mac_socket_init(so, M_NOWAIT) != 0) {
 395                 uma_zfree(socket_zone, so);
 396                 return (NULL);
 397         }
 398 #endif
 399         if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) {
 400                 uma_zfree(socket_zone, so);
 401                 return (NULL);
 402         }
 403
 404         /*
 405          * The socket locking protocol allows to lock 2 sockets at a time,
 406          * however, the first one must be a listening socket.  WITNESS lacks
 407          * a feature to change class of an existing lock, so we use DUPOK.
 408          */
 409         mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
 410         mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF);
 411         mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF);
 412         so->so_rcv.sb_sel = &so->so_rdsel;
 413         so->so_snd.sb_sel = &so->so_wrsel;
 414         sx_init(&so->so_snd_sx, "so_snd_sx");
 415         sx_init(&so->so_rcv_sx, "so_rcv_sx");
 416         TAILQ_INIT(&so->so_snd.sb_aiojobq);
 417         TAILQ_INIT(&so->so_rcv.sb_aiojobq);
 418         TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so);
 419         TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so);
 420 #ifdef VIMAGE
 421         VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
 422             __func__, __LINE__, so));
 423         so->so_vnet = vnet;
 424 #endif
 425         /* We shouldn't need the so_global_mtx */
 426         if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) {
 427                 /* Do we need more comprehensive error returns? */
 428                 uma_zfree(socket_zone, so);
 429                 return (NULL);
 430         }
 431         mtx_lock(&so_global_mtx);
 432         so->so_gencnt = ++so_gencnt;
 433         ++numopensockets;
 434 #ifdef VIMAGE
 435         vnet->vnet_sockcnt++;
 436 #endif
 437         mtx_unlock(&so_global_mtx);
 438
 439         return (so);
 440 }
 441
 442 /*
 443  * Free the storage associated with a socket at the socket layer, tear down
 444  * locks, labels, etc.  All protocol state is assumed already to have been
 445  * torn down (and possibly never set up) by the caller.
 446  */
 447 void
 448 sodealloc(struct socket *so)
 449 {
 450
 451         KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
 452         KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
 453
 454         mtx_lock(&so_global_mtx);
 455         so->so_gencnt = ++so_gencnt;
 456         --numopensockets;       /* Could be below, but faster here. */
 457 #ifdef VIMAGE
 458         VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
 459             __func__, __LINE__, so));
 460         so->so_vnet->vnet_sockcnt--;
 461 #endif
 462         mtx_unlock(&so_global_mtx);
 463 #ifdef MAC
 464         mac_socket_destroy(so);
 465 #endif
 466         hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE);
 467
 468         khelp_destroy_osd(&so->osd);
 469         if (SOLISTENING(so)) {
 470                 if (so->sol_accept_filter != NULL)
 471                         accept_filt_setopt(so, NULL);
 472         } else {
 473                 if (so->so_rcv.sb_hiwat)
 474                         (void)chgsbsize(so->so_cred->cr_uidinfo,
 475                             &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
 476                 if (so->so_snd.sb_hiwat)
 477                         (void)chgsbsize(so->so_cred->cr_uidinfo,
 478                             &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
 479                 sx_destroy(&so->so_snd_sx);
 480                 sx_destroy(&so->so_rcv_sx);
 481                 mtx_destroy(&so->so_snd_mtx);
 482                 mtx_destroy(&so->so_rcv_mtx);
 483         }
 484         crfree(so->so_cred);
 485         mtx_destroy(&so->so_lock);
 486         uma_zfree(socket_zone, so);
 487 }
 488
 489 /*
 490  * socreate returns a socket with a ref count of 1 and a file descriptor
 491  * reference.  The socket should be closed with soclose().
 492  */
 493 int
 494 socreate(int dom, struct socket **aso, int type, int proto,
 495     struct ucred *cred, struct thread *td)
 496 {
 497         struct protosw *prp;
 498         struct socket *so;
 499         int error;
 500
 501         /*
 502          * XXX: divert(4) historically abused PF_INET.  Keep this compatibility
 503          * shim until all applications have been updated.
 504          */
 505         if (__predict_false(dom == PF_INET && type == SOCK_RAW &&
 506             proto == IPPROTO_DIVERT)) {
 507                 dom = PF_DIVERT;
 508                 printf("%s uses obsolete way to create divert(4) socket\n",
 509                     td->td_proc->p_comm);
 510         }
 511
 512         prp = pffindproto(dom, type, proto);
 513         if (prp == NULL) {
 514                 /* No support for domain. */
 515                 if (pffinddomain(dom) == NULL)
 516                         return (EAFNOSUPPORT);
 517                 /* No support for socket type. */
 518                 if (proto == 0 && type != 0)
 519                         return (EPROTOTYPE);
 520                 return (EPROTONOSUPPORT);
 521         }
 522
 523         MPASS(prp->pr_attach);
 524
 525         if (IN_CAPABILITY_MODE(td) && (prp->pr_flags & PR_CAPATTACH) == 0)
 526                 return (ECAPMODE);
 527
 528         if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
 529                 return (EPROTONOSUPPORT);
 530
 531         so = soalloc(CRED_TO_VNET(cred));
 532         if (so == NULL)
 533                 return (ENOBUFS);
 534
 535         so->so_type = type;
 536         so->so_cred = crhold(cred);
 537         if ((prp->pr_domain->dom_family == PF_INET) ||
 538             (prp->pr_domain->dom_family == PF_INET6) ||
 539             (prp->pr_domain->dom_family == PF_ROUTE))
 540                 so->so_fibnum = td->td_proc->p_fibnum;
 541         else
 542                 so->so_fibnum = 0;
 543         so->so_proto = prp;
 544 #ifdef MAC
 545         mac_socket_create(cred, so);
 546 #endif
 547         knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
 548             so_rdknl_assert_lock);
 549         knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
 550             so_wrknl_assert_lock);
 551         if ((prp->pr_flags & PR_SOCKBUF) == 0) {
 552                 so->so_snd.sb_mtx = &so->so_snd_mtx;
 553                 so->so_rcv.sb_mtx = &so->so_rcv_mtx;
 554         }
 555         /*
 556          * Auto-sizing of socket buffers is managed by the protocols and
 557          * the appropriate flags must be set in the pru_attach function.
 558          */
 559         CURVNET_SET(so->so_vnet);
 560         error = prp->pr_attach(so, proto, td);
 561         CURVNET_RESTORE();
 562         if (error) {
 563                 sodealloc(so);
 564                 return (error);
 565         }
 566         soref(so);
 567         *aso = so;
 568         return (0);
 569 }
 570
#ifdef REGRESSION
/*
 * Regression-test knob: solisten_clone() applies its listen queue overflow
 * check only while this is non-zero (the default).
 */
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif

/*
 * Priority passed to log() for listen queue overflow messages; a negative
 * value suppresses the message.
 */
static int sooverprio = LOG_DEBUG;
SYSCTL_INT(_kern_ipc, OID_AUTO, sooverprio, CTLFLAG_RW,
    &sooverprio, 0, "Log priority for listen socket overflows: 0..7 or -1 to disable");

/*
 * Interval handed to ratecheck() together with the listener's sol_lastover
 * to rate-limit overflow messages; defaults to 60 seconds.
 */
static struct timeval overinterval = { 60, 0 };
SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW,
    &overinterval,
    "Delay in seconds between warnings for listen socket overflows");
 585
 586 /*
 587  * When an attempt at a new connection is noted on a socket which supports
 588  * accept(2), the protocol has two options:
 589  * 1) Call legacy sonewconn() function, which would call protocol attach
 590  *    method, same as used for socket(2).
 591  * 2) Call solisten_clone(), do attach that is specific to a cloned connection,
 592  *    and then call solisten_enqueue().
 593  *
 594  * Note: the ref count on the socket is 0 on return.
 595  */
 596 struct socket *
 597 solisten_clone(struct socket *head)
 598 {
 599         struct sbuf descrsb;
 600         struct socket *so;
 601         int len, overcount;
 602         u_int qlen;
 603         const char localprefix[] = "local:";
 604         char descrbuf[SUNPATHLEN + sizeof(localprefix)];
 605 #if defined(INET6)
 606         char addrbuf[INET6_ADDRSTRLEN];
 607 #elif defined(INET)
 608         char addrbuf[INET_ADDRSTRLEN];
 609 #endif
 610         bool dolog, over;
 611
 612         SOLISTEN_LOCK(head);
 613         over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
 614 #ifdef REGRESSION
 615         if (regression_sonewconn_earlytest && over) {
 616 #else
 617         if (over) {
 618 #endif
 619                 head->sol_overcount++;
 620                 dolog = (sooverprio >= 0) &&
 621                         !!ratecheck(&head->sol_lastover, &overinterval);
 622
 623                 /*
 624                  * If we're going to log, copy the overflow count and queue
 625                  * length from the listen socket before dropping the lock.
 626                  * Also, reset the overflow count.
 627                  */
 628                 if (dolog) {
 629                         overcount = head->sol_overcount;
 630                         head->sol_overcount = 0;
 631                         qlen = head->sol_qlen;
 632                 }
 633                 SOLISTEN_UNLOCK(head);
 634
 635                 if (dolog) {
 636                         /*
 637                          * Try to print something descriptive about the
 638                          * socket for the error message.
 639                          */
 640                         sbuf_new(&descrsb, descrbuf, sizeof(descrbuf),
 641                             SBUF_FIXEDLEN);
 642                         switch (head->so_proto->pr_domain->dom_family) {
 643 #if defined(INET) || defined(INET6)
 644 #ifdef INET
 645                         case AF_INET:
 646 #endif
 647 #ifdef INET6
 648                         case AF_INET6:
 649                                 if (head->so_proto->pr_domain->dom_family ==
 650                                     AF_INET6 ||
 651                                     (sotoinpcb(head)->inp_inc.inc_flags &
 652                                     INC_ISIPV6)) {
 653                                         ip6_sprintf(addrbuf,
 654                                             &sotoinpcb(head)->inp_inc.inc6_laddr);
 655                                         sbuf_printf(&descrsb, "[%s]", addrbuf);
 656                                 } else
 657 #endif
 658                                 {
 659 #ifdef INET
 660                                         inet_ntoa_r(
 661                                             sotoinpcb(head)->inp_inc.inc_laddr,
 662                                             addrbuf);
 663                                         sbuf_cat(&descrsb, addrbuf);
 664 #endif
 665                                 }
 666                                 sbuf_printf(&descrsb, ":%hu (proto %u)",
 667                                     ntohs(sotoinpcb(head)->inp_inc.inc_lport),
 668                                     head->so_proto->pr_protocol);
 669                                 break;
 670 #endif /* INET || INET6 */
 671                         case AF_UNIX:
 672                                 sbuf_cat(&descrsb, localprefix);
 673                                 if (sotounpcb(head)->unp_addr != NULL)
 674                                         len =
 675                                             sotounpcb(head)->unp_addr->sun_len -
 676                                             offsetof(struct sockaddr_un,
 677                                             sun_path);
 678                                 else
 679                                         len = 0;
 680                                 if (len > 0)
 681                                         sbuf_bcat(&descrsb,
 682                                             sotounpcb(head)->unp_addr->sun_path,
 683                                             len);
 684                                 else
 685                                         sbuf_cat(&descrsb, "(unknown)");
 686                                 break;
 687                         }
 688
 689                         /*
 690                          * If we can't print something more specific, at least
 691                          * print the domain name.
 692                          */
 693                         if (sbuf_finish(&descrsb) != 0 ||
 694                             sbuf_len(&descrsb) <= 0) {
 695                                 sbuf_clear(&descrsb);
 696                                 sbuf_cat(&descrsb,
 697                                     head->so_proto->pr_domain->dom_name ?:
 698                                     "unknown");
 699                                 sbuf_finish(&descrsb);
 700                         }
 701                         KASSERT(sbuf_len(&descrsb) > 0,
 702                             ("%s: sbuf creation failed", __func__));
 703                         /*
 704                          * Preserve the historic listen queue overflow log
 705                          * message, that starts with "sonewconn:".  It has
 706                          * been known to sysadmins for years and also test
 707                          * sys/kern/sonewconn_overflow checks for it.
 708                          */
 709                         if (head->so_cred == 0) {
 710                                 log(LOG_PRI(sooverprio),
 711                                     "sonewconn: pcb %p (%s): "
 712                                     "Listen queue overflow: %i already in "
 713                                     "queue awaiting acceptance (%d "
 714                                     "occurrences)\n", head->so_pcb,
 715                                     sbuf_data(&descrsb),
 716                                 qlen, overcount);
 717                         } else {
 718                                 log(LOG_PRI(sooverprio),
 719                                     "sonewconn: pcb %p (%s): "
 720                                     "Listen queue overflow: "
 721                                     "%i already in queue awaiting acceptance "
 722                                     "(%d occurrences), euid %d, rgid %d, jail %s\n",
 723                                     head->so_pcb, sbuf_data(&descrsb), qlen,
 724                                     overcount, head->so_cred->cr_uid,
 725                                     head->so_cred->cr_rgid,
 726                                     head->so_cred->cr_prison ?
 727                                         head->so_cred->cr_prison->pr_name :
 728                                         "not_jailed");
 729                         }
 730                         sbuf_delete(&descrsb);
 731
 732                         overcount = 0;
 733                 }
 734
 735                 return (NULL);
 736         }
 737         SOLISTEN_UNLOCK(head);
 738         VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
 739             __func__, head));
 740         so = soalloc(head->so_vnet);
 741         if (so == NULL) {
 742                 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
 743                     "limit reached or out of memory\n",
 744                     __func__, head->so_pcb);
 745                 return (NULL);
 746         }
 747         so->so_listen = head;
 748         so->so_type = head->so_type;
 749         /*
 750          * POSIX is ambiguous on what options an accept(2)ed socket should
 751          * inherit from the listener.  Words "create a new socket" may be
 752          * interpreted as not inheriting anything.  Best programming practice
 753          * for application developers is to not rely on such inheritance.
 754          * FreeBSD had historically inherited all so_options excluding
 755          * SO_ACCEPTCONN, which virtually means all SOL_SOCKET level options,
 756          * including those completely irrelevant to a new born socket.  For
 757          * compatibility with older versions we will inherit a list of
 758          * meaningful options.
 759          */
 760         so->so_options = head->so_options & (SO_KEEPALIVE | SO_DONTROUTE |
 761             SO_LINGER | SO_OOBINLINE | SO_NOSIGPIPE);
 762         so->so_linger = head->so_linger;
 763         so->so_state = head->so_state;
 764         so->so_fibnum = head->so_fibnum;
 765         so->so_proto = head->so_proto;
 766         so->so_cred = crhold(head->so_cred);
 767 #ifdef MAC
 768         mac_socket_newconn(head, so);
 769 #endif
 770         knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
 771             so_rdknl_assert_lock);
 772         knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
 773             so_wrknl_assert_lock);
 774         VNET_SO_ASSERT(head);
 775         if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
 776                 sodealloc(so);
 777                 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
 778                     __func__, head->so_pcb);
 779                 return (NULL);
 780         }
 781         so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
 782         so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
 783         so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
 784         so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
 785         so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE;
 786         so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE;
 787         if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
 788                 so->so_snd.sb_mtx = &so->so_snd_mtx;
 789                 so->so_rcv.sb_mtx = &so->so_rcv_mtx;
 790         }
 791
 792         return (so);
 793 }
 794
 795 /*
      * Create a socket for a new connection on the listening socket 'head'.
      *
      * The new socket is cloned from 'head' with solisten_clone(), attached to
      * the protocol with pr_attach() and then placed on one of the listener's
      * queues by solisten_enqueue().
      *
      * Connstatus may be 0 or SS_ISCONNECTED.
      *
      * Returns the new socket, or NULL if cloning or pr_attach() failed.
      */
 796 struct socket *
 797 sonewconn(struct socket *head, int connstatus)
 798 {
 799         struct socket *so;
 800
 801         if ((so = solisten_clone(head)) == NULL)
 802                 return (NULL);
 803
 804         if (so->so_proto->pr_attach(so, 0, NULL) != 0) {
                     /* The clone was never enqueued; free it directly. */
 805                 sodealloc(so);
 806                 log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n",
 807                     __func__, head->so_pcb);
 808                 return (NULL);
 809         }
 810
             /* Which queue the socket landed on is of no interest here. */
 811         (void)solisten_enqueue(so, connstatus);
 812
 813         return (so);
 814 }
 815
 816 /*
 817  * Enqueue socket cloned by solisten_clone() to the listen queue of the
 818  * listener it has been cloned from.
 819  *
      * The listener is found through so->so_listen and is locked and unlocked
      * internally.  The socket's reference count must be zero on entry and is
      * initialized to one here; the listener gains one reference for the queue
      * entry.  If the listener has an accept filter, connstatus is ignored and
      * the socket always goes to the incomplete queue.
      *
 820  * Return 'true' if socket landed on complete queue, otherwise 'false'.
 821  */
 822 bool
 823 solisten_enqueue(struct socket *so, int connstatus)
 824 {
 825         struct socket *head = so->so_listen;
 826
 827         MPASS(refcount_load(&so->so_count) == 0);
 828         refcount_init(&so->so_count, 1);
 829
 830         SOLISTEN_LOCK(head);
             /* An accept filter forces the socket onto the incomplete queue. */
 831         if (head->sol_accept_filter != NULL)
 832                 connstatus = 0;
 833         so->so_state |= connstatus;
 834         soref(head); /* A socket on (in)complete queue refs head. */
 835         if (connstatus) {
 836                 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
 837                 so->so_qstate = SQ_COMP;
 838                 head->sol_qlen++;
 839                 solisten_wakeup(head);  /* unlocks */
 840                 return (true);
 841         } else {
 842                 /*
 843                  * Keep removing sockets from the head until there's room for
 844                  * us to insert on the tail.  In pre-locking revisions, this
 845                  * was a simple if(), but as we could be racing with other
 846                  * threads and soabort() requires dropping locks, we must
 847                  * loop waiting for the condition to be true.
 848                  */
 849                 while (head->sol_incqlen > head->sol_qlimit) {
 850                         struct socket *sp;
 851
                             /* Evict the socket at the front of the queue. */
 852                         sp = TAILQ_FIRST(&head->sol_incomp);
 853                         TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
 854                         head->sol_incqlen--;
 855                         SOCK_LOCK(sp);
 856                         sp->so_qstate = SQ_NONE;
 857                         sp->so_listen = NULL;
 858                         SOCK_UNLOCK(sp);
                             /* Give back the reference sp's queue entry held. */
 859                         sorele_locked(head);    /* does SOLISTEN_UNLOCK, head stays */
 860                         soabort(sp);
                             /* Relock before the queue length is tested again. */
 861                         SOLISTEN_LOCK(head);
 862                 }
 863                 TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
 864                 so->so_qstate = SQ_INCOMP;
 865                 head->sol_incqlen++;
 866                 SOLISTEN_UNLOCK(head);
 867                 return (false);
 868         }
 869 }
 870
 871 #if defined(SCTP) || defined(SCTP_SUPPORT)
 872 /*
 873  * Socket part of sctp_peeloff().  Detach a new socket from an
 874  * association.  The new socket is returned with a reference.
 875  *
      * Type, options, linger time, FIB number, protocol and credentials are
      * copied from 'head'; the state is SS_ISCONNECTED plus head's SS_NBIO bit.
      * Buffer space is reserved from head's high-water marks and the protocol is
      * attached before the low-water marks, timeouts and SB_AUTOSIZE flag are
      * copied.
      *
      * Returns NULL if socket allocation, soreserve() or pr_attach() fails.
      *
 876  * XXXGL: reduce copy-paste with solisten_clone().
 877  */
 878 struct socket *
 879 sopeeloff(struct socket *head)
 880 {
 881         struct socket *so;
 882
 883         VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
 884             __func__, __LINE__, head));
 885         so = soalloc(head->so_vnet);
 886         if (so == NULL) {
 887                 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
 888                     "limit reached or out of memory\n",
 889                     __func__, head->so_pcb);
 890                 return (NULL);
 891         }
 892         so->so_type = head->so_type;
 893         so->so_options = head->so_options;
 894         so->so_linger = head->so_linger;
             /* Only the non-blocking bit of head's state is carried over. */
 895         so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
 896         so->so_fibnum = head->so_fibnum;
 897         so->so_proto = head->so_proto;
 898         so->so_cred = crhold(head->so_cred);
 899 #ifdef MAC
 900         mac_socket_newconn(head, so);
 901 #endif
 902         knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
 903             so_rdknl_assert_lock);
 904         knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
 905             so_wrknl_assert_lock);
 906         VNET_SO_ASSERT(head);
 907         if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
 908                 sodealloc(so);
 909                 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
 910                     __func__, head->so_pcb);
 911                 return (NULL);
 912         }
 913         if ((*so->so_proto->pr_attach)(so, 0, NULL)) {
 914                 sodealloc(so);
                     /* Name the method that was called, as sonewconn() does. */
 915                 log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n",
 916                     __func__, head->so_pcb);
 917                 return (NULL);
 918         }
 919         so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
 920         so->so_snd.sb_lowat = head->so_snd.sb_lowat;
 921         so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
 922         so->so_snd.sb_timeo = head->so_snd.sb_timeo;
             /* Add SB_AUTOSIZE to whatever flags are already set on so. */
 923         so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
 924         so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
 925
             /* This is the reference handed to the caller. */
 926         soref(so);
 927
 928         return (so);
 929 }
 930 #endif  /* SCTP */
 931
     /*
      * Hand a bind request for address 'nam' to the protocol's pr_bind method,
      * on behalf of thread 'td'.  The call is made between
      * CURVNET_SET(so->so_vnet) and CURVNET_RESTORE().
      *
      * Returns whatever pr_bind returns.
      */
 932 int
 933 sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
 934 {
 935         int error;
 936
 937         CURVNET_SET(so->so_vnet);
 938         error = so->so_proto->pr_bind(so, nam, td);
 939         CURVNET_RESTORE();
 940         return (error);
 941 }
 942
     /*
      * Variant of sobind() that calls the protocol's pr_bindat method and
      * passes the extra descriptor 'fd' through to it unchanged.
      * NOTE(review): 'fd' is presumably the descriptor the address is resolved
      * relative to -- confirm against the protocol's pr_bindat.
      *
      * Returns whatever pr_bindat returns.
      */
 943 int
 944 sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
 945 {
 946         int error;
 947
 948         CURVNET_SET(so->so_vnet);
 949         error = so->so_proto->pr_bindat(fd, so, nam, td);
 950         CURVNET_RESTORE();
 951         return (error);
 952 }
 953
 954 /*
 955  * solisten() transitions a socket from a non-listening state to a listening
 956  * state, but can also be used to update the listen queue depth on an
 957  * existing listen socket.  The protocol will call back into the sockets
 958  * layer using solisten_proto_check() and solisten_proto() to check and set
 959  * socket-layer listen state.  Call backs are used so that the protocol can
 960  * acquire both protocol and socket layer locks in whatever order is required
 961  * by the protocol.
 962  *
 963  * Protocol implementors are advised to hold the socket lock across the
 964  * socket-layer test and set to avoid races at the socket layer.
      *
      * The function itself only calls the protocol's pr_listen method between
      * CURVNET_SET(so->so_vnet) and CURVNET_RESTORE() and returns its result.
 965  */
 966 int
 967 solisten(struct socket *so, int backlog, struct thread *td)
 968 {
 969         int error;
 970
 971         CURVNET_SET(so->so_vnet);
 972         error = so->so_proto->pr_listen(so, backlog, td);
 973         CURVNET_RESTORE();
 974         return (error);
 975 }
 976
 977 /*
 978  * Prepare for a call to solisten_proto().  Acquire all socket buffer locks in
 979  * order to interlock with socket I/O.
      *
      * The socket lock must be held.  Returns 0 with both socket buffer sx locks
      * and both socket buffer mutexes held; solisten_proto() and
      * solisten_proto_abort() are the functions that release them.  On failure
      * none of those locks is held and the return value is:
      *   EINVAL - the socket is connected, connecting or disconnecting, or a
      *            socket that is not yet listening has KTLS state or AIO flags
      *            set on one of its buffers;
      *   EAGAIN - one of the sx locks could not be taken without sleeping.
 980  */
 981 int
 982 solisten_proto_check(struct socket *so)
 983 {
 984         SOCK_LOCK_ASSERT(so);
 985
 986         if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
 987             SS_ISDISCONNECTING)) != 0)
 988                 return (EINVAL);
 989
 990         /*
 991          * Sleeping is not permitted here, so simply fail if userspace is
 992          * attempting to transmit or receive on the socket.  This kind of
 993          * transient failure is not ideal, but it should occur only if userspace
 994          * is misusing the socket interfaces.
 995          */
 996         if (!sx_try_xlock(&so->so_snd_sx))
 997                 return (EAGAIN);
 998         if (!sx_try_xlock(&so->so_rcv_sx)) {
                     /* Back out the send-side lock taken just above. */
 999                 sx_xunlock(&so->so_snd_sx);
1000                 return (EAGAIN);
1001         }
1002         mtx_lock(&so->so_snd_mtx);
1003         mtx_lock(&so->so_rcv_mtx);
1004
1005         /* Interlock with soo_aio_queue() and KTLS. */
1006         if (!SOLISTENING(so)) {
1007                 bool ktls;
1008
1009 #ifdef KERN_TLS
1010                 ktls = so->so_snd.sb_tls_info != NULL ||
1011                     so->so_rcv.sb_tls_info != NULL;
1012 #else
1013                 ktls = false;
1014 #endif
1015                 if (ktls ||
1016                     (so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 ||
1017                     (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0) {
                             /* Release the four locks taken above. */
1018                         solisten_proto_abort(so);
1019                         return (EINVAL);
1020                 }
1021         }
1022
1023         return (0);
1024 }
1025
1026 /*
1027  * Undo the setup done by solisten_proto_check().
      *
      * Releases both socket buffer mutexes and both socket buffer sx locks.
      * The socket lock itself is not touched here.
1028  */
1029 void
1030 solisten_proto_abort(struct socket *so)
1031 {
1032         mtx_unlock(&so->so_snd_mtx);
1033         mtx_unlock(&so->so_rcv_mtx);
1034         sx_xunlock(&so->so_snd_sx);
1035         sx_xunlock(&so->so_rcv_sx);
1036 }
1037
     /*
      * Put the socket into listening state, or only update the queue limit of
      * a socket that is already listening.
      *
      * Must be called with the socket lock held and with the socket not
      * connected, connecting or disconnecting (asserted).  Before returning it
      * releases the same two socket buffer mutexes and two sx locks that
      * solisten_proto_abort() releases, so the caller is expected to have
      * obtained them through solisten_proto_check().
      *
      * A 'backlog' that is negative or larger than somaxconn is replaced by
      * somaxconn.
      */
1038 void
1039 solisten_proto(struct socket *so, int backlog)
1040 {
1041         int sbrcv_lowat, sbsnd_lowat;
1042         u_int sbrcv_hiwat, sbsnd_hiwat;
1043         short sbrcv_flags, sbsnd_flags;
1044         sbintime_t sbrcv_timeo, sbsnd_timeo;
1045
1046         SOCK_LOCK_ASSERT(so);
1047         KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
1048             SS_ISDISCONNECTING)) == 0,
1049             ("%s: bad socket state %p", __func__, so));
1050
             /* Already listening: only the queue limit is updated. */
1051         if (SOLISTENING(so))
1052                 goto listening;
1053
1054         /*
1055          * Change this socket to listening state.
              *
              * The buffer parameters are copied to locals first because the
              * buffers are destroyed below before the sol_* fields are written.
              * NOTE(review): the INVARIANTS bzero() suggests the sol_* fields
              * share storage with so_rcv and what follows it -- confirm against
              * the struct socket definition.
1056          */
1057         sbrcv_lowat = so->so_rcv.sb_lowat;
1058         sbsnd_lowat = so->so_snd.sb_lowat;
1059         sbrcv_hiwat = so->so_rcv.sb_hiwat;
1060         sbsnd_hiwat = so->so_snd.sb_hiwat;
1061         sbrcv_flags = so->so_rcv.sb_flags;
1062         sbsnd_flags = so->so_snd.sb_flags;
1063         sbrcv_timeo = so->so_rcv.sb_timeo;
1064         sbsnd_timeo = so->so_snd.sb_timeo;
1065
1066         sbdestroy(so, SO_SND);
1067         sbdestroy(so, SO_RCV);
1068
1069 #ifdef INVARIANTS
             /* Debug kernels zero everything from so_rcv to the end of *so. */
1070         bzero(&so->so_rcv,
1071             sizeof(struct socket) - offsetof(struct socket, so_rcv));
1072 #endif
1073
1074         so->sol_sbrcv_lowat = sbrcv_lowat;
1075         so->sol_sbsnd_lowat = sbsnd_lowat;
1076         so->sol_sbrcv_hiwat = sbrcv_hiwat;
1077         so->sol_sbsnd_hiwat = sbsnd_hiwat;
1078         so->sol_sbrcv_flags = sbrcv_flags;
1079         so->sol_sbsnd_flags = sbsnd_flags;
1080         so->sol_sbrcv_timeo = sbrcv_timeo;
1081         so->sol_sbsnd_timeo = sbsnd_timeo;
1082
             /* Start with empty incomplete and complete queues. */
1083         so->sol_qlen = so->sol_incqlen = 0;
1084         TAILQ_INIT(&so->sol_incomp);
1085         TAILQ_INIT(&so->sol_comp);
1086
1087         so->sol_accept_filter = NULL;
1088         so->sol_accept_filter_arg = NULL;
1089         so->sol_accept_filter_str = NULL;
1090
1091         so->sol_upcall = NULL;
1092         so->sol_upcallarg = NULL;
1093
1094         so->so_options |= SO_ACCEPTCONN;
1095
1096 listening:
             /* Clamp out-of-range backlog values to somaxconn. */
1097         if (backlog < 0 || backlog > somaxconn)
1098                 backlog = somaxconn;
1099         so->sol_qlimit = backlog;
1100
1101         mtx_unlock(&so->so_snd_mtx);
1102         mtx_unlock(&so->so_rcv_mtx);
1103         sx_xunlock(&so->so_snd_sx);
1104         sx_xunlock(&so->so_rcv_sx);
1105 }
1106
1107 /*
1108  * Wakeup listeners/subsystems once we have a complete connection.
1109  * Enters with lock, returns unlocked.
      *
      * If the listener has an upcall it is invoked (its return value is
      * ignored); otherwise select and kqueue waiters on so_rdsel are notified.
      * After the lock is dropped, one thread sleeping on &sol->sol_comp is woken
      * and SIGIO is posted when SS_ASYNC is set and so_sigio is non-NULL.
1110  */
1111 void
1112 solisten_wakeup(struct socket *sol)
1113 {
1114
1115         if (sol->sol_upcall != NULL)
1116                 (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
1117         else {
1118                 selwakeuppri(&sol->so_rdsel, PSOCK);
1119                 KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
1120         }
1121         SOLISTEN_UNLOCK(sol);
             /* solisten_dequeue() sleeps on this address. */
1122         wakeup_one(&sol->sol_comp);
1123         if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL)
1124                 pgsigio(&sol->so_sigio, SIGIO, 0);
1125 }
1126
1127 /*
1128  * Return single connection off a listening socket queue.  Main consumer of
1129  * the function is kern_accept4().  Some modules, that do their own accept
1130  * management also use the function.  The socket reference held by the
1131  * listen queue is handed to the caller.
1132  *
1133  * Listening socket must be locked on entry and is returned unlocked on
1134  * return.
1135  * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT.
      *
      * Returns 0 and stores the socket in *ret on success.  Otherwise *ret is
      * left untouched and the return value is the msleep() error, the pending
      * so_error of the listener (which is then cleared), or EWOULDBLOCK if the
      * listener is non-blocking and the complete queue is empty.
1136  */
1137 int
1138 solisten_dequeue(struct socket *head, struct socket **ret, int flags)
1139 {
1140         struct socket *so;
1141         int error;
1142
1143         SOLISTEN_LOCK_ASSERT(head);
1144
             /*
              * A blocking listener sleeps until the complete queue is non-empty
              * or an error is posted.  The sleep is interruptible (PCATCH).
              */
1145         while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
1146             head->so_error == 0) {
1147                 error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH,
1148                     "accept", 0);
1149                 if (error != 0) {
1150                         SOLISTEN_UNLOCK(head);
1151                         return (error);
1152                 }
1153         }
             /* A pending error is reported once and then cleared. */
1154         if (head->so_error) {
1155                 error = head->so_error;
1156                 head->so_error = 0;
1157         } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp))
1158                 error = EWOULDBLOCK;
1159         else
1160                 error = 0;
1161         if (error) {
1162                 SOLISTEN_UNLOCK(head);
1163                 return (error);
1164         }
             /* Take the socket at the front of the complete queue. */
1165         so = TAILQ_FIRST(&head->sol_comp);
1166         SOCK_LOCK(so);
1167         KASSERT(so->so_qstate == SQ_COMP,
1168             ("%s: so %p not SQ_COMP", __func__, so));
1169         head->sol_qlen--;
1170         so->so_qstate = SQ_NONE;
1171         so->so_listen = NULL;
1172         TAILQ_REMOVE(&head->sol_comp, so, so_list);
             /*
              * SS_NBIO is either inherited from the listener or taken from the
              * SOCK_NONBLOCK bit of flags.
              */
1173         if (flags & ACCEPT4_INHERIT)
1174                 so->so_state |= (head->so_state & SS_NBIO);
1175         else
1176                 so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
1177         SOCK_UNLOCK(so);
             /* Drop the reference so's queue entry held; this unlocks head. */
1178         sorele_locked(head);
1179
1180         *ret = so;
1181         return (0);
1182 }
1183
1184 /*
1185  * Free socket upon release of the very last reference.
      *
      * Called with the socket lock held and the reference count at zero; a
      * socket that is not itself a listener must not be on a listen queue
      * (both asserted).  The lock is dropped before the destructor and the
      * protocol's pr_detach method are called.
1186  */
1187 static void
1188 sofree(struct socket *so)
1189 {
1190         struct protosw *pr = so->so_proto;
1191
1192         SOCK_LOCK_ASSERT(so);
1193         KASSERT(refcount_load(&so->so_count) == 0,
1194             ("%s: so %p has references", __func__, so));
1195         KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE,
1196             ("%s: so %p is on listen queue", __func__, so));
1197
1198         SOCK_UNLOCK(so);
1199
             /* Optional destructor hook, run before the protocol detaches. */
1200         if (so->so_dtor != NULL)
1201                 so->so_dtor(so);
1202
1203         VNET_SO_ASSERT(so);
1204         if (pr->pr_detach != NULL)
1205                 pr->pr_detach(so);
1206
1207         /*
1208          * From this point on, we assume that no other references to this
1209          * socket exist anywhere else in the stack.  Therefore, no locks need
1210          * to be acquired or held.
1211          */
             /*
              * The buffers are destroyed here only for a non-listening socket
              * whose protocol does not set PR_SOCKBUF.
              */
1212         if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) {
1213                 sbdestroy(so, SO_SND);
1214                 sbdestroy(so, SO_RCV);
1215         }
1216         seldrain(&so->so_rdsel);
1217         seldrain(&so->so_wrsel);
1218         knlist_destroy(&so->so_rdsel.si_note);
1219         knlist_destroy(&so->so_wrsel.si_note);
1220         sodealloc(so);
1221 }
1222
1223 /*
1224  * Release a reference on a socket while holding the socket lock.
1225  * Unlocks the socket lock before returning.
      *
      * If this was the last reference the socket is freed with sofree(), which
      * drops the lock itself; the caller must not touch 'so' afterwards unless
      * it knows another reference exists.
1226  */
1227 void
1228 sorele_locked(struct socket *so)
1229 {
1230         SOCK_LOCK_ASSERT(so);
1231         if (refcount_release(&so->so_count))
1232                 sofree(so);
1233         else
1234                 SOCK_UNLOCK(so);
1235 }
1236
1237 /*
1238  * Close a socket on last file table reference removal.  Initiate disconnect
1239  * if connected.  Free socket when disconnect complete.
1240  *
1241  * This function will sorele() the socket.  Note that soclose() may be called
1242  * prior to the ref count reaching zero.  The actual socket structure will
1243  * not be freed until the ref count reaches zero.
      *
      * For a listening socket, every socket still on its incomplete or complete
      * queue is detached from the listener and aborted with soabort().
      *
      * Returns 0, or the error from sodisconnect() (ENOTCONN is mapped to 0) or
      * from the linger sleep.
1244  */
1245 int
1246 soclose(struct socket *so)
1247 {
1248         struct accept_queue lqueue;
1249         int error = 0;
1250         bool listening, last __diagused;
1251
1252         CURVNET_SET(so->so_vnet);
1253         funsetown(&so->so_sigio);
1254         if (so->so_state & SS_ISCONNECTED) {
1255                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1256                         error = sodisconnect(so);
1257                         if (error) {
                                     /* Not being connected is no close error. */
1258                                 if (error == ENOTCONN)
1259                                         error = 0;
1260                                 goto drop;
1261                         }
1262                 }
1263
1264                 if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) {
                             /* A non-blocking socket does not wait here. */
1265                         if ((so->so_state & SS_ISDISCONNECTING) &&
1266                             (so->so_state & SS_NBIO))
1267                                 goto drop;
                             /*
                              * Sleep while the socket is still connected.  Any
                              * nonzero tsleep() result ends the wait and becomes
                              * the return value.
                              */
1268                         while (so->so_state & SS_ISCONNECTED) {
1269                                 error = tsleep(&so->so_timeo,
1270                                     PSOCK | PCATCH, "soclos",
1271                                     so->so_linger * hz);
1272                                 if (error)
1273                                         break;
1274                         }
1275                 }
1276         }
1277
1278 drop:
1279         if (so->so_proto->pr_close != NULL)
1280                 so->so_proto->pr_close(so);
1281
1282         SOCK_LOCK(so);
1283         if ((listening = SOLISTENING(so))) {
1284                 struct socket *sp;
1285
                     /*
                      * Move both queues onto a private list so the queued
                      * sockets can be aborted after the lock is dropped.
                      */
1286                 TAILQ_INIT(&lqueue);
1287                 TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list);
1288                 TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list);
1289
1290                 so->sol_qlen = so->sol_incqlen = 0;
1291
1292                 TAILQ_FOREACH(sp, &lqueue, so_list) {
1293                         SOCK_LOCK(sp);
1294                         sp->so_qstate = SQ_NONE;
1295                         sp->so_listen = NULL;
1296                         SOCK_UNLOCK(sp);
                             /*
                              * Drop the reference sp's queue entry held on the
                              * listener.  It is asserted not to be the last
                              * one; sorele_locked() below does a further
                              * release.
                              */
1297                         last = refcount_release(&so->so_count);
1298                         KASSERT(!last, ("%s: released last reference for %p",
1299                             __func__, so));
1300                 }
1301         }
             /* Release a reference on so and unlock it; may free so. */
1302         sorele_locked(so);
1303         if (listening) {
1304                 struct socket *sp, *tsp;
1305
                     /* _SAFE variant: soabort() may free sp. */
1306                 TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp)
1307                         soabort(sp);
1308         }
1309         CURVNET_RESTORE();
1310         return (error);
1311 }
1312
1313 /*
1314  * soabort() is used to abruptly tear down a connection, such as when a
1315  * resource limit is reached (listen queue depth exceeded), or if a listen
1316  * socket is closed while there are sockets waiting to be accepted.
1317  *
1318  * This interface is tricky, because it is called on an unreferenced socket,
1319  * and must be called only by a thread that has actually removed the socket
1320  * from the listen queue it was on.  Likely this thread holds the last
1321  * reference on the socket and soabort() will proceed with sofree().  But
1322  * it might be not the last, as the sockets on the listen queues are seen
1323  * from the protocol side.
1324  *
1325  * This interface will call into the protocol code, so must not be called
1326  * with any socket locks held.  Protocols do call it while holding their own
1327  * recursible protocol mutexes, but this is something that should be subject
1328  * to review in the future.
1329  *
1330  * Usually socket should have a single reference left, but this is not a
1331  * requirement.  In the past, when we have had named references for file
1332  * descriptor and protocol, we asserted that none of them are being held.
1333  */
1334 void
1335 soabort(struct socket *so)
1336 {
1337
1338         VNET_SO_ASSERT(so);
1339
             /* The protocol's abort method is optional. */
1340         if (so->so_proto->pr_abort != NULL)
1341                 so->so_proto->pr_abort(so);
             /* Drop one reference; the socket is freed if it was the last. */
1342         SOCK_LOCK(so);
1343         sorele_locked(so);
1344 }
1345
     /*
      * Ask the protocol's pr_accept method to fill in 'sa' for socket 'so'.
      *
      * With INVARIANTS, the sa_len found in 'sa' on entry is remembered and the
      * protocol is asserted not to have left a larger sa_len behind.
      * NOTE(review): this suggests the caller presets sa->sa_len to the size of
      * its buffer -- confirm against callers.
      *
      * Returns the result of pr_accept.
      */
1346 int
1347 soaccept(struct socket *so, struct sockaddr *sa)
1348 {
1349 #ifdef INVARIANTS
1350         u_char len = sa->sa_len;
1351 #endif
1352         int error;
1353
1354         CURVNET_SET(so->so_vnet);
1355         error = so->so_proto->pr_accept(so, sa);
1356         KASSERT(sa->sa_len <= len,
1357             ("%s: protocol %p sockaddr overflow", __func__, so->so_proto));
1358         CURVNET_RESTORE();
1359         return (error);
1360 }
1361
     /*
      * Ask the protocol's pr_peeraddr method to fill in 'sa' for socket 'so'.
      *
      * With INVARIANTS, the sa_len found in 'sa' on entry is remembered and the
      * protocol is asserted not to have left a larger sa_len behind.
      *
      * Returns the result of pr_peeraddr.
      */
1362 int
1363 sopeeraddr(struct socket *so, struct sockaddr *sa)
1364 {
1365 #ifdef INVARIANTS
1366         u_char len = sa->sa_len;
1367 #endif
1368         int error;
1369
1370         CURVNET_SET(so->so_vnet);
1371         error = so->so_proto->pr_peeraddr(so, sa);
1372         KASSERT(sa->sa_len <= len,
1373             ("%s: protocol %p sockaddr overflow", __func__, so->so_proto));
1374         CURVNET_RESTORE();
1375
1376         return (error);
1377 }
1378
     /*
      * Ask the protocol's pr_sockaddr method to fill in 'sa' for socket 'so'.
      *
      * With INVARIANTS, the sa_len found in 'sa' on entry is remembered and the
      * protocol is asserted not to have left a larger sa_len behind.
      *
      * Returns the result of pr_sockaddr.
      */
1379 int
1380 sosockaddr(struct socket *so, struct sockaddr *sa)
1381 {
1382 #ifdef INVARIANTS
1383         u_char len = sa->sa_len;
1384 #endif
1385         int error;
1386
1387         CURVNET_SET(so->so_vnet);
1388         error = so->so_proto->pr_sockaddr(so, sa);
1389         KASSERT(sa->sa_len <= len,
1390             ("%s: protocol %p sockaddr overflow", __func__, so->so_proto));
1391         CURVNET_RESTORE();
1392
1393         return (error);
1394 }
1395
     /*
      * Connect the socket to 'nam'.  Shorthand for soconnectat() with
      * AT_FDCWD as the descriptor; returns its result.
      */
1396 int
1397 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
1398 {
1399
1400         return (soconnectat(AT_FDCWD, so, nam, td));
1401 }
1402
     /*
      * Connect the socket to 'nam'.
      *
      * Fails with EISCONN if the socket is already connected or connecting and
      * either the protocol is PR_CONNREQUIRED or the attempt to disconnect it
      * first fails (the sodisconnect() error itself is not returned).
      * Otherwise so_error is cleared and the request goes to pr_connect when
      * 'fd' is AT_FDCWD, or to pr_connectat together with 'fd' when it is not.
      */
1403 int
1404 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
1405 {
1406         int error;
1407
1408         CURVNET_SET(so->so_vnet);
1409
1410         /*
1411          * If protocol is connection-based, can only connect once.
1412          * Otherwise, if connected, try to disconnect first.  This allows
1413          * user to disconnect by connecting to, e.g., a null address.
1414          *
1415          * Note, this check is racy and may need to be re-evaluated at the
1416          * protocol layer.
1417          */
1418         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1419             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1420             (error = sodisconnect(so)))) {
1421                 error = EISCONN;
1422         } else {
1423                 /*
1424                  * Prevent accumulated error from previous connection from
1425                  * biting us.
1426                  */
1427                 so->so_error = 0;
1428                 if (fd == AT_FDCWD) {
1429                         error = so->so_proto->pr_connect(so, nam, td);
1430                 } else {
1431                         error = so->so_proto->pr_connectat(fd, so, nam, td);
1432                 }
1433         }
1434         CURVNET_RESTORE();
1435
1436         return (error);
1437 }
1438
     /*
      * Hand both sockets to the pr_connect2 method of so1's protocol.  The call
      * is made between CURVNET_SET(so1->so_vnet) and CURVNET_RESTORE().
      *
      * Returns the result of pr_connect2.
      */
1439 int
1440 soconnect2(struct socket *so1, struct socket *so2)
1441 {
1442         int error;
1443
1444         CURVNET_SET(so1->so_vnet);
1445         error = so1->so_proto->pr_connect2(so1, so2);
1446         CURVNET_RESTORE();
1447         return (error);
1448 }
1449
     /*
      * Ask the protocol to disconnect the socket.
      *
      * Returns ENOTCONN if SS_ISCONNECTED is not set, EALREADY if
      * SS_ISDISCONNECTING is set, otherwise the result of pr_disconnect.
      * so_state is tested without this function taking any lock.
      */
1450 int
1451 sodisconnect(struct socket *so)
1452 {
1453         int error;
1454
1455         if ((so->so_state & SS_ISCONNECTED) == 0)
1456                 return (ENOTCONN);
1457         if (so->so_state & SS_ISDISCONNECTING)
1458                 return (EALREADY);
1459         VNET_SO_ASSERT(so);
1460         error = so->so_proto->pr_disconnect(so);
1461         return (error);
1462 }
1463
     /*
      * Send one datagram on a SOCK_DGRAM socket whose protocol is PR_ATOMIC
      * (both are asserted).
      *
      * The data comes from 'uio' or, when 'uio' is NULL, from the mbuf chain
      * 'top'.  'control' optionally carries control data and 'addr' the
      * destination.  On every path 'top' and 'control' are either passed to
      * the protocol's pr_send method or freed here.
      *
      * Returns 0 or an errno value: EINVAL for a negative residual, EPIPE if
      * the send buffer is marked SBS_CANTSENDMORE, a pending so_error (which is
      * then cleared), ENOTCONN or EDESTADDRREQ for an unconnected socket
      * without a usable destination, EMSGSIZE if the data does not fit into the
      * available send buffer space, EFAULT if copying from 'uio' fails, or the
      * result of pr_send.
      */
1464 int
1465 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
1466     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1467 {
1468         long space;
1469         ssize_t resid;
1470         int clen = 0, error, dontroute;
1471
1472         KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
1473         KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
1474             ("sosend_dgram: !PR_ATOMIC"));
1475
1476         if (uio != NULL)
1477                 resid = uio->uio_resid;
1478         else
1479                 resid = top->m_pkthdr.len;
1480         /*
1481          * In theory resid should be unsigned.  However, space must be
1482          * signed, as it might be less than 0 if we over-committed, and we
1483          * must use a signed comparison of space and resid.  On the other
1484          * hand, a negative resid causes us to loop sending 0-length
1485          * segments to the protocol.
1486          */
1487         if (resid < 0) {
1488                 error = EINVAL;
1489                 goto out;
1490         }
1491
             /*
              * MSG_DONTROUTE only needs SO_DONTROUTE to be set temporarily if
              * the option is not already set on the socket.
              */
1492         dontroute =
1493             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
1494         if (td != NULL)
1495                 td->td_ru.ru_msgsnd++;
1496         if (control != NULL)
1497                 clen = control->m_len;
1498
1499         SOCKBUF_LOCK(&so->so_snd);
1500         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1501                 SOCKBUF_UNLOCK(&so->so_snd);
1502                 error = EPIPE;
1503                 goto out;
1504         }
             /* A pending error is reported once and then cleared. */
1505         if (so->so_error) {
1506                 error = so->so_error;
1507                 so->so_error = 0;
1508                 SOCKBUF_UNLOCK(&so->so_snd);
1509                 goto out;
1510         }
1511         if ((so->so_state & SS_ISCONNECTED) == 0) {
1512                 /*
1513                  * `sendto' and `sendmsg' is allowed on a connection-based
1514                  * socket if it supports implied connect.  Return ENOTCONN if
1515                  * not connected and no address is supplied.
1516                  */
1517                 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1518                     (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
                             /* Only a control-only message is let through. */
1519                         if (!(resid == 0 && clen != 0)) {
1520                                 SOCKBUF_UNLOCK(&so->so_snd);
1521                                 error = ENOTCONN;
1522                                 goto out;
1523                         }
1524                 } else if (addr == NULL) {
1525                         if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1526                                 error = ENOTCONN;
1527                         else
1528                                 error = EDESTADDRREQ;
1529                         SOCKBUF_UNLOCK(&so->so_snd);
1530                         goto out;
1531                 }
1532         }
1533
1534         /*
1535          * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
1536          * problem and need fixing.
1537          */
1538         space = sbspace(&so->so_snd);
1539         if (flags & MSG_OOB)
1540                 space += 1024;
1541         space -= clen;
1542         SOCKBUF_UNLOCK(&so->so_snd);
             /* The data must fit into the available space as a whole. */
1543         if (resid > space) {
1544                 error = EMSGSIZE;
1545                 goto out;
1546         }
1547         if (uio == NULL) {
1548                 resid = 0;
1549                 if (flags & MSG_EOR)
1550                         top->m_flags |= M_EOR;
1551         } else {
1552                 /*
1553                  * Copy the data from userland into a mbuf chain.
1554                  * If no data is to be copied in, a single empty mbuf
1555                  * is returned.
1556                  */
1557                 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1558                     (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1559                 if (top == NULL) {
1560                         error = EFAULT; /* only possible error */
1561                         goto out;
1562                 }
                     /* Account for what was copied; resid is the remainder. */
1563                 space -= resid - uio->uio_resid;
1564                 resid = uio->uio_resid;
1565         }
1566         KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1567         /*
1568          * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1569          * than with.
1570          */
1571         if (dontroute) {
1572                 SOCK_LOCK(so);
1573                 so->so_options |= SO_DONTROUTE;
1574                 SOCK_UNLOCK(so);
1575         }
1576         /*
1577          * XXX all the SBS_CANTSENDMORE checks previously done could be out
1578          * of date.  We could have received a reset packet in an interrupt or
1579          * maybe we slept while doing page faults in uiomove() etc.  We could
1580          * probably recheck again inside the locking protection here, but
1581          * there are probably other places that this also happens.  We must
1582          * rethink this.
1583          */
1584         VNET_SO_ASSERT(so);
             /*
              * The flags argument is PRUS_OOB for MSG_OOB, else PRUS_EOF, else
              * PRUS_MORETOCOME, else 0, as selected by the nested conditional
              * below.  With resid asserted to be 0 above, the
              * (resid > 0 && space > 0) term cannot hold when that assertion
              * does.
              */
1585         error = so->so_proto->pr_send(so, (flags & MSG_OOB) ? PRUS_OOB :
1586         /*
1587          * If the user set MSG_EOF, the protocol understands this flag and
1588          * nothing is left to send, then pass PRUS_EOF to pr_send.
1589          */
1590             ((flags & MSG_EOF) &&
1591              (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1592              (resid <= 0)) ?
1593                 PRUS_EOF :
1594                 /* If there is more to send set PRUS_MORETOCOME */
1595                 (flags & MSG_MORETOCOME) ||
1596                 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1597                 top, addr, control, td);
             /* Undo the temporary SO_DONTROUTE set above. */
1598         if (dontroute) {
1599                 SOCK_LOCK(so);
1600                 so->so_options &= ~SO_DONTROUTE;
1601                 SOCK_UNLOCK(so);
1602         }
             /*
              * Both chains were passed to pr_send; keep the cleanup below from
              * freeing them.
              */
1603         clen = 0;
1604         control = NULL;
1605         top = NULL;
1606 out:
1607         if (top != NULL)
1608                 m_freem(top);
1609         if (control != NULL)
1610                 m_freem(control);
1611         return (error);
1612 }
1613
/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Parameters:
 *   so       socket to send on.
 *   addr     optional destination address.  If the socket is not connected
 *            and no usable address is available, the send fails with
 *            ENOTCONN or EDESTADDRREQ (see the connection check below).
 *   uio      description of the user data, or NULL when "top" is used.
 *   top      prebuilt mbuf chain to send when uio is NULL.
 *   control  optional control (ancillary data) mbuf, may be NULL.
 *   flags    MSG_* flags; MSG_EOR, MSG_OOB, MSG_EOF, MSG_DONTROUTE,
 *            MSG_MORETOCOME, MSG_NBIO and MSG_DONTWAIT are examined here.
 *   td       calling thread, may be NULL; used for ru_msgsnd accounting and
 *            handed through to the protocol's pr_send method.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 *
 * Errors generated directly by this function are EINVAL, EPIPE, ENOTCONN,
 * EDESTADDRREQ, EMSGSIZE, EWOULDBLOCK and EFAULT; a pending so_error, and
 * errors from SOCK_IO_SEND_LOCK(), sbwait() and pr_send, are passed through.
 */
int
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
        long space;
        ssize_t resid;
        int clen = 0, error, dontroute;
        /*
         * The send is all-or-nothing if sosendallatonce() says so for this
         * socket, or if the caller supplied a prebuilt mbuf chain.
         */
        int atomic = sosendallatonce(so) || top;
        int pr_send_flag;
#ifdef KERN_TLS
        struct ktls_session *tls;
        int tls_enq_cnt, tls_send_flag;
        uint8_t tls_rtype;

        /*
         * tls must be initialized before any "goto out", because the out
         * path tests it.  The record type defaults to application data.
         */
        tls = NULL;
        tls_rtype = TLS_RLTYPE_APP;
#endif
        /* Determine the total number of data bytes to send. */
        if (uio != NULL)
                resid = uio->uio_resid;
        else if ((top->m_flags & M_PKTHDR) != 0)
                resid = top->m_pkthdr.len;
        else
                resid = m_length(top, NULL);
        /*
         * In theory resid should be unsigned.  However, space must be
         * signed, as it might be less than 0 if we over-committed, and we
         * must use a signed comparison of space and resid.  On the other
         * hand, a negative resid causes us to loop sending 0-length
         * segments to the protocol.
         *
         * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
         * type sockets since that's an error.
         */
        if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
                error = EINVAL;
                goto out;
        }

        /*
         * SO_DONTROUTE is toggled around pr_send only when the caller asked
         * for MSG_DONTROUTE, the option is not already set on the socket,
         * and the protocol is PR_ATOMIC.
         */
        dontroute =
            (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
            (so->so_proto->pr_flags & PR_ATOMIC);
        if (td != NULL)
                td->td_ru.ru_msgsnd++;
        /* Only the length of the first control mbuf is accounted for. */
        if (control != NULL)
                clen = control->m_len;

        /* Serialize against other senders; unlocked again at "release". */
        error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
        if (error)
                goto out;

#ifdef KERN_TLS
        tls_send_flag = 0;
        /*
         * Obtain the send-side TLS session, if any; it is handed to
         * ktls_free() at "out".
         * NOTE(review): presumably ktls_hold() takes a reference -- confirm.
         */
        tls = ktls_hold(so->so_snd.sb_tls_info);
        if (tls != NULL) {
                /*
                 * Software-mode sessions pass PRUS_NOTREADY to pr_send and
                 * hand the chain to ktls_enqueue() afterwards (see below).
                 */
                if (tls->mode == TCP_TLS_MODE_SW)
                        tls_send_flag = PRUS_NOTREADY;

                if (control != NULL) {
                        struct cmsghdr *cm = mtod(control, struct cmsghdr *);

                        /*
                         * A TLS_SET_RECORD_TYPE control message selects the
                         * record type for the next frame.  It is consumed
                         * here rather than passed to the protocol, and the
                         * send becomes atomic.
                         */
                        if (clen >= sizeof(*cm) &&
                            cm->cmsg_type == TLS_SET_RECORD_TYPE) {
                                tls_rtype = *((uint8_t *)CMSG_DATA(cm));
                                clen = 0;
                                m_freem(control);
                                control = NULL;
                                atomic = 1;
                        }
                }

                /* Zero-length sends need the session's explicit permission. */
                if (resid == 0 && !ktls_permit_empty_frames(tls)) {
                        error = EINVAL;
                        goto release;
                }
        }
#endif

        /*
         * Outer loop: (re)validate socket state and available buffer space
         * under the send buffer lock.  Inner loop: build mbuf chains and
         * hand them to the protocol while space remains.
         */
restart:
        do {
                SOCKBUF_LOCK(&so->so_snd);
                if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
                        SOCKBUF_UNLOCK(&so->so_snd);
                        error = EPIPE;
                        goto release;
                }
                /* Report and clear any pending socket error. */
                if (so->so_error) {
                        error = so->so_error;
                        so->so_error = 0;
                        SOCKBUF_UNLOCK(&so->so_snd);
                        goto release;
                }
                if ((so->so_state & SS_ISCONNECTED) == 0) {
                        /*
                         * `sendto' and `sendmsg' are allowed on a connection-
                         * based socket if it supports implied connect.
                         * Return ENOTCONN if not connected and no address is
                         * supplied.
                         */
                        if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
                            (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
                                /*
                                 * A control-only send (no data, some control
                                 * bytes) is let through on an unconnected
                                 * socket.
                                 */
                                if (!(resid == 0 && clen != 0)) {
                                        SOCKBUF_UNLOCK(&so->so_snd);
                                        error = ENOTCONN;
                                        goto release;
                                }
                        } else if (addr == NULL) {
                                SOCKBUF_UNLOCK(&so->so_snd);
                                if (so->so_proto->pr_flags & PR_CONNREQUIRED)
                                        error = ENOTCONN;
                                else
                                        error = EDESTADDRREQ;
                                goto release;
                        }
                }
                space = sbspace(&so->so_snd);
                /* Out-of-band data gets 1024 bytes of extra headroom. */
                if (flags & MSG_OOB)
                        space += 1024;
                /*
                 * An atomic message, or control data, larger than the send
                 * buffer high-water mark can never fit: hard error.
                 */
                if ((atomic && resid > so->so_snd.sb_hiwat) ||
                    clen > so->so_snd.sb_hiwat) {
                        SOCKBUF_UNLOCK(&so->so_snd);
                        error = EMSGSIZE;
                        goto release;
                }
                /*
                 * Not everything fits right now.  Wait (or fail with
                 * EWOULDBLOCK when non-blocking) if the send is atomic, if
                 * space is below the low-water mark, or if the control data
                 * alone does not fit; otherwise send what fits.
                 */
                if (space < resid + clen &&
                    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
                        if ((so->so_state & SS_NBIO) ||
                            (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
                                SOCKBUF_UNLOCK(&so->so_snd);
                                error = EWOULDBLOCK;
                                goto release;
                        }
                        error = sbwait(so, SO_SND);
                        SOCKBUF_UNLOCK(&so->so_snd);
                        if (error)
                                goto release;
                        /* Re-check all of the socket state from the top. */
                        goto restart;
                }
                SOCKBUF_UNLOCK(&so->so_snd);
                /* Reserve room for the control data out of the space seen. */
                space -= clen;
                do {
                        if (uio == NULL) {
                                /* The whole prebuilt chain goes in one call. */
                                resid = 0;
                                if (flags & MSG_EOR)
                                        top->m_flags |= M_EOR;
#ifdef KERN_TLS
                                if (tls != NULL) {
                                        ktls_frame(top, tls, &tls_enq_cnt,
                                            tls_rtype);
                                        /*
                                         * A caller-selected record type
                                         * applies to one frame only.
                                         */
                                        tls_rtype = TLS_RLTYPE_APP;
                                }
#endif
                        } else {
                                /*
                                 * Copy the data from userland into a mbuf
                                 * chain.  If resid is 0, which can happen
                                 * only if we have control to send, then
                                 * a single empty mbuf is returned.  This
                                 * is a workaround to prevent protocol send
                                 * methods from panicking.
                                 */
#ifdef KERN_TLS
                                if (tls != NULL) {
                                        top = m_uiotombuf(uio, M_WAITOK, space,
                                            tls->params.max_frame_len,
                                            M_EXTPG |
                                            ((flags & MSG_EOR) ? M_EOR : 0));
                                        if (top != NULL) {
                                                ktls_frame(top, tls,
                                                    &tls_enq_cnt, tls_rtype);
                                        }
                                        tls_rtype = TLS_RLTYPE_APP;
                                } else
#endif
                                        top = m_uiotombuf(uio, M_WAITOK, space,
                                            (atomic ? max_hdr : 0),
                                            (atomic ? M_PKTHDR : 0) |
                                            ((flags & MSG_EOR) ? M_EOR : 0));
                                if (top == NULL) {
                                        error = EFAULT; /* only possible error */
                                        goto release;
                                }
                                /*
                                 * Charge the bytes just copied against the
                                 * available space and refresh the count of
                                 * bytes still to be sent.
                                 */
                                space -= resid - uio->uio_resid;
                                resid = uio->uio_resid;
                        }
                        if (dontroute) {
                                SOCK_LOCK(so);
                                so->so_options |= SO_DONTROUTE;
                                SOCK_UNLOCK(so);
                        }
                        /*
                         * XXX all the SBS_CANTSENDMORE checks previously
                         * done could be out of date.  We could have received
                         * a reset packet in an interrupt or maybe we slept
                         * while doing page faults in uiomove() etc.  We
                         * could probably recheck again inside the locking
                         * protection here, but there are probably other
                         * places that this also happens.  We must rethink
                         * this.
                         */
                        VNET_SO_ASSERT(so);

                        /*
                         * Pick exactly one of PRUS_OOB, PRUS_EOF,
                         * PRUS_MORETOCOME or 0, in that order of precedence.
                         */
                        pr_send_flag = (flags & MSG_OOB) ? PRUS_OOB :
                        /*
                         * If the user set MSG_EOF, the protocol understands
                         * this flag and nothing left to send then use
                         * PRU_SEND_EOF instead of PRU_SEND.
                         */
                            ((flags & MSG_EOF) &&
                             (so->so_proto->pr_flags & PR_IMPLOPCL) &&
                             (resid <= 0)) ?
                                PRUS_EOF :
                        /* If there is more to send set PRUS_MORETOCOME. */
                            (flags & MSG_MORETOCOME) ||
                            (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

#ifdef KERN_TLS
                        pr_send_flag |= tls_send_flag;
#endif

                        error = so->so_proto->pr_send(so, pr_send_flag, top,
                            addr, control, td);

                        /* Undo the temporary SO_DONTROUTE set above. */
                        if (dontroute) {
                                SOCK_LOCK(so);
                                so->so_options &= ~SO_DONTROUTE;
                                SOCK_UNLOCK(so);
                        }

#ifdef KERN_TLS
                        /*
                         * Software TLS: on failure the chain is freed here;
                         * on success it is queued via ktls_enqueue() after
                         * taking a socket reference.
                         * NOTE(review): presumably the reference is dropped
                         * once the queued work completes -- confirm against
                         * the KTLS code.
                         */
                        if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
                                if (error != 0) {
                                        m_freem(top);
                                        top = NULL;
                                } else {
                                        soref(so);
                                        ktls_enqueue(top, so, tls_enq_cnt);
                                }
                        }
#endif
                        /*
                         * Forget the data and control chains so that the
                         * cleanup at "out" does not free them again, and so
                         * that control is sent at most once.
                         * NOTE(review): assumes pr_send took ownership of
                         * both -- confirm against the protocol methods.
                         */
                        clen = 0;
                        control = NULL;
                        top = NULL;
                        if (error)
                                goto release;
                } while (resid && space > 0);
        } while (resid);

release:
        SOCK_IO_SEND_UNLOCK(so);
out:
#ifdef KERN_TLS
        if (tls != NULL)
                ktls_free(tls);
#endif
        /* Free whatever was not handed to the protocol. */
        if (top != NULL)
                m_freem(top);
        if (control != NULL)
                m_freem(control);
        return (error);
}
1887
1888 /*
1889  * Send to a socket from a kernel thread.
1890  *
1891  * XXXGL: in almost all cases uio is NULL and the mbuf is supplied.
1892  * Exception is nfs/bootp_subr.c.  It is arguable that the VNET context needs
1893  * to be set at all.  This function should just boil down to a static inline
1894  * calling the protocol method.
1895  */
1896 int
1897 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1898     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1899 {
1900         int error;
1901
1902         CURVNET_SET(so->so_vnet);
1903         error = so->so_proto->pr_sosend(so, addr, uio,
1904             top, control, flags, td);
1905         CURVNET_RESTORE();
1906         return (error);
1907 }
1908
1909 /*
1910  * send(2), write(2) or aio_write(2) on a socket.
1911  */
1912 int
1913 sousrsend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1914     struct mbuf *control, int flags, struct proc *userproc)
1915 {
1916         struct thread *td;
1917         ssize_t len;
1918         int error;
1919
1920         td = uio->uio_td;
1921         len = uio->uio_resid;
1922         CURVNET_SET(so->so_vnet);
1923         error = so->so_proto->pr_sosend(so, addr, uio, NULL, control, flags,
1924             td);
1925         CURVNET_RESTORE();
1926         if (error != 0) {
1927                 /*
1928                  * Clear transient errors for stream protocols if they made
1929                  * some progress.  Make exclusion for aio(4) that would
1930                  * schedule a new write in case of EWOULDBLOCK and clear
1931                  * error itself.  See soaio_process_job().
1932                  */
1933                 if (uio->uio_resid != len &&
1934                     (so->so_proto->pr_flags & PR_ATOMIC) == 0 &&
1935                     userproc == NULL &&
1936                     (error == ERESTART || error == EINTR ||
1937                     error == EWOULDBLOCK))
1938                         error = 0;
1939                 /* Generation of SIGPIPE can be controlled per socket. */
1940                 if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0 &&
1941                     (flags & MSG_NOSIGNAL) == 0) {
1942                         if (userproc != NULL) {
1943                                 /* aio(4) job */
1944                                 PROC_LOCK(userproc);
1945                                 kern_psignal(userproc, SIGPIPE);
1946                                 PROC_UNLOCK(userproc);
1947                         } else {
1948                                 PROC_LOCK(td->td_proc);
1949                                 tdsignal(td, SIGPIPE);
1950                                 PROC_UNLOCK(td->td_proc);
1951                         }
1952                 }
1953         }
1954         return (error);
1955 }
1956
1957 /*
1958  * The part of soreceive() that implements reading non-inline out-of-band
1959  * data from a socket.  For more complete comments, see soreceive(), from
1960  * which this code originated.
1961  *
1962  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1963  * unable to return an mbuf chain to the caller.
1964  */
1965 static int
1966 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1967 {
1968         struct protosw *pr = so->so_proto;
1969         struct mbuf *m;
1970         int error;
1971
1972         KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1973         VNET_SO_ASSERT(so);
1974
1975         m = m_get(M_WAITOK, MT_DATA);
1976         error = pr->pr_rcvoob(so, m, flags & MSG_PEEK);
1977         if (error)
1978                 goto bad;
1979         do {
1980                 error = uiomove(mtod(m, void *),
1981                     (int) min(uio->uio_resid, m->m_len), uio);
1982                 m = m_free(m);
1983         } while (uio->uio_resid && error == 0 && m);
1984 bad:
1985         if (m != NULL)
1986                 m_freem(m);
1987         return (error);
1988 }
1989
1990 /*
1991  * Following replacement or removal of the first mbuf on the first mbuf chain
1992  * of a socket buffer, push necessary state changes back into the socket
1993  * buffer so that other consumers see the values consistently.  'nextrecord'
1994  * is the callers locally stored value of the original value of
1995  * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1996  * NOTE: 'nextrecord' may be NULL.
1997  */
1998 static __inline void
1999 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
2000 {
2001
2002         SOCKBUF_LOCK_ASSERT(sb);
2003         /*
2004          * First, update for the new value of nextrecord.  If necessary, make
2005          * it the first record.
2006          */
2007         if (sb->sb_mb != NULL)
2008                 sb->sb_mb->m_nextpkt = nextrecord;
2009         else
2010                 sb->sb_mb = nextrecord;
2011
2012         /*
2013          * Now update any dependent socket buffer fields to reflect the new
2014          * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
2015          * addition of a second clause that takes care of the case where
2016          * sb_mb has been updated, but remains the last record.
2017          */
2018         if (sb->sb_mb == NULL) {
2019                 sb->sb_mbtail = NULL;
2020                 sb->sb_lastrecord = NULL;
2021         } else if (sb->sb_mb->m_nextpkt == NULL)
2022                 sb->sb_lastrecord = sb->sb_mb;
2023 }
2024
2025 /*
2026  * Implement receive operations on a socket.  We depend on the way that
2027  * records are added to the sockbuf by sbappend.  In particular, each record
2028  * (mbufs linked through m_next) must begin with an address if the protocol
2029  * so specifies, followed by an optional mbuf or mbufs containing ancillary
2030  * data, and then zero or more mbufs of data.  In order to allow parallelism
2031  * between network receive and copying to user space, as well as avoid
2032  * sleeping with a mutex held, we release the socket buffer mutex during the
2033  * user space copy.  Although the sockbuf is locked, new data may still be
2034  * appended, and thus we must maintain consistency of the sockbuf during that
2035  * time.
2036  *
2037  * The caller may receive the data as a single mbuf chain by supplying an
2038  * mbuf **mp0 for use in returning the chain.  The uio is then used only for
2039  * the count in uio_resid.
2040  */
2041 int
2042 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
2043     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2044 {
2045         struct mbuf *m, **mp;
2046         int flags, error, offset;
2047         ssize_t len;
2048         struct protosw *pr = so->so_proto;
2049         struct mbuf *nextrecord;
2050         int moff, type = 0;
2051         ssize_t orig_resid = uio->uio_resid;
2052         bool report_real_len = false;
2053
2054         mp = mp0;
2055         if (psa != NULL)
2056                 *psa = NULL;
2057         if (controlp != NULL)
2058                 *controlp = NULL;
2059         if (flagsp != NULL) {
2060                 report_real_len = *flagsp & MSG_TRUNC;
2061                 *flagsp &= ~MSG_TRUNC;
2062                 flags = *flagsp &~ MSG_EOR;
2063         } else
2064                 flags = 0;
2065         if (flags & MSG_OOB)
2066                 return (soreceive_rcvoob(so, uio, flags));
2067         if (mp != NULL)
2068                 *mp = NULL;
2069
2070         error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
2071         if (error)
2072                 return (error);
2073
2074 restart:
2075         SOCKBUF_LOCK(&so->so_rcv);
2076         m = so->so_rcv.sb_mb;
2077         /*
2078          * If we have less data than requested, block awaiting more (subject
2079          * to any timeout) if:
2080          *   1. the current count is less than the low water mark, or
2081          *   2. MSG_DONTWAIT is not set
2082          */
2083         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
2084             sbavail(&so->so_rcv) < uio->uio_resid) &&
2085             sbavail(&so->so_rcv) < so->so_rcv.sb_lowat &&
2086             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
2087                 KASSERT(m != NULL || !sbavail(&so->so_rcv),
2088                     ("receive: m == %p sbavail == %u",
2089                     m, sbavail(&so->so_rcv)));
2090                 if (so->so_error || so->so_rerror) {
2091                         if (m != NULL)
2092                                 goto dontblock;
2093                         if (so->so_error)
2094                                 error = so->so_error;
2095                         else
2096                                 error = so->so_rerror;
2097                         if ((flags & MSG_PEEK) == 0) {
2098                                 if (so->so_error)
2099                                         so->so_error = 0;
2100                                 else
2101                                         so->so_rerror = 0;
2102                         }
2103                         SOCKBUF_UNLOCK(&so->so_rcv);
2104                         goto release;
2105                 }
2106                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2107                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2108                         if (m != NULL)
2109                                 goto dontblock;
2110 #ifdef KERN_TLS
2111                         else if (so->so_rcv.sb_tlsdcc == 0 &&
2112                             so->so_rcv.sb_tlscc == 0) {
2113 #else
2114                         else {
2115 #endif
2116                                 SOCKBUF_UNLOCK(&so->so_rcv);
2117                                 goto release;
2118                         }
2119                 }
2120                 for (; m != NULL; m = m->m_next)
2121                         if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
2122                                 m = so->so_rcv.sb_mb;
2123                                 goto dontblock;
2124                         }
2125                 if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED |
2126                     SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 &&
2127                     (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2128                         SOCKBUF_UNLOCK(&so->so_rcv);
2129                         error = ENOTCONN;
2130                         goto release;
2131                 }
2132                 if (uio->uio_resid == 0 && !report_real_len) {
2133                         SOCKBUF_UNLOCK(&so->so_rcv);
2134                         goto release;
2135                 }
2136                 if ((so->so_state & SS_NBIO) ||
2137                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2138                         SOCKBUF_UNLOCK(&so->so_rcv);
2139                         error = EWOULDBLOCK;
2140                         goto release;
2141                 }
2142                 SBLASTRECORDCHK(&so->so_rcv);
2143                 SBLASTMBUFCHK(&so->so_rcv);
2144                 error = sbwait(so, SO_RCV);
2145                 SOCKBUF_UNLOCK(&so->so_rcv);
2146                 if (error)
2147                         goto release;
2148                 goto restart;
2149         }
2150 dontblock:
2151         /*
2152          * From this point onward, we maintain 'nextrecord' as a cache of the
2153          * pointer to the next record in the socket buffer.  We must keep the
2154          * various socket buffer pointers and local stack versions of the
2155          * pointers in sync, pushing out modifications before dropping the
2156          * socket buffer mutex, and re-reading them when picking it up.
2157          *
2158          * Otherwise, we will race with the network stack appending new data
2159          * or records onto the socket buffer by using inconsistent/stale
2160          * versions of the field, possibly resulting in socket buffer
2161          * corruption.
2162          *
2163          * By holding the high-level sblock(), we prevent simultaneous
2164          * readers from pulling off the front of the socket buffer.
2165          */
2166         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2167         if (uio->uio_td)
2168                 uio->uio_td->td_ru.ru_msgrcv++;
2169         KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
2170         SBLASTRECORDCHK(&so->so_rcv);
2171         SBLASTMBUFCHK(&so->so_rcv);
2172         nextrecord = m->m_nextpkt;
2173         if (pr->pr_flags & PR_ADDR) {
2174                 KASSERT(m->m_type == MT_SONAME,
2175                     ("m->m_type == %d", m->m_type));
2176                 orig_resid = 0;
2177                 if (psa != NULL)
2178                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
2179                             M_NOWAIT);
2180                 if (flags & MSG_PEEK) {
2181                         m = m->m_next;
2182                 } else {
2183                         sbfree(&so->so_rcv, m);
2184                         so->so_rcv.sb_mb = m_free(m);
2185                         m = so->so_rcv.sb_mb;
2186                         sockbuf_pushsync(&so->so_rcv, nextrecord);
2187                 }
2188         }
2189
2190         /*
2191          * Process one or more MT_CONTROL mbufs present before any data mbufs
2192          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
2193          * just copy the data; if !MSG_PEEK, we call into the protocol to
2194          * perform externalization (or freeing if controlp == NULL).
2195          */
2196         if (m != NULL && m->m_type == MT_CONTROL) {
2197                 struct mbuf *cm = NULL, *cmn;
2198                 struct mbuf **cme = &cm;
2199 #ifdef KERN_TLS
2200                 struct cmsghdr *cmsg;
2201                 struct tls_get_record tgr;
2202
2203                 /*
2204                  * For MSG_TLSAPPDATA, check for an alert record.
2205                  * If found, return ENXIO without removing
2206                  * it from the receive queue.  This allows a subsequent
2207                  * call without MSG_TLSAPPDATA to receive it.
2208                  * Note that, for TLS, there should only be a single
2209                  * control mbuf with the TLS_GET_RECORD message in it.
2210                  */
2211                 if (flags & MSG_TLSAPPDATA) {
2212                         cmsg = mtod(m, struct cmsghdr *);
2213                         if (cmsg->cmsg_type == TLS_GET_RECORD &&
2214                             cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) {
2215                                 memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr));
2216                                 if (__predict_false(tgr.tls_type ==
2217                                     TLS_RLTYPE_ALERT)) {
2218                                         SOCKBUF_UNLOCK(&so->so_rcv);
2219                                         error = ENXIO;
2220                                         goto release;
2221                                 }
2222                         }
2223                 }
2224 #endif
2225
2226                 do {
2227                         if (flags & MSG_PEEK) {
2228                                 if (controlp != NULL) {
2229                                         *controlp = m_copym(m, 0, m->m_len,
2230                                             M_NOWAIT);
2231                                         controlp = &(*controlp)->m_next;
2232                                 }
2233                                 m = m->m_next;
2234                         } else {
2235                                 sbfree(&so->so_rcv, m);
2236                                 so->so_rcv.sb_mb = m->m_next;
2237                                 m->m_next = NULL;
2238                                 *cme = m;
2239                                 cme = &(*cme)->m_next;
2240                                 m = so->so_rcv.sb_mb;
2241                         }
2242                 } while (m != NULL && m->m_type == MT_CONTROL);
2243                 if ((flags & MSG_PEEK) == 0)
2244                         sockbuf_pushsync(&so->so_rcv, nextrecord);
2245                 while (cm != NULL) {
2246                         cmn = cm->m_next;
2247                         cm->m_next = NULL;
2248                         if (pr->pr_domain->dom_externalize != NULL) {
2249                                 SOCKBUF_UNLOCK(&so->so_rcv);
2250                                 VNET_SO_ASSERT(so);
2251                                 error = (*pr->pr_domain->dom_externalize)
2252                                     (cm, controlp, flags);
2253                                 SOCKBUF_LOCK(&so->so_rcv);
2254                         } else if (controlp != NULL)
2255                                 *controlp = cm;
2256                         else
2257                                 m_freem(cm);
2258                         if (controlp != NULL) {
2259                                 while (*controlp != NULL)
2260                                         controlp = &(*controlp)->m_next;
2261                         }
2262                         cm = cmn;
2263                 }
2264                 if (m != NULL)
2265                         nextrecord = so->so_rcv.sb_mb->m_nextpkt;
2266                 else
2267                         nextrecord = so->so_rcv.sb_mb;
2268                 orig_resid = 0;
2269         }
2270         if (m != NULL) {
2271                 if ((flags & MSG_PEEK) == 0) {
2272                         KASSERT(m->m_nextpkt == nextrecord,
2273                             ("soreceive: post-control, nextrecord !sync"));
2274                         if (nextrecord == NULL) {
2275                                 KASSERT(so->so_rcv.sb_mb == m,
2276                                     ("soreceive: post-control, sb_mb!=m"));
2277                                 KASSERT(so->so_rcv.sb_lastrecord == m,
2278                                     ("soreceive: post-control, lastrecord!=m"));
2279                         }
2280                 }
2281                 type = m->m_type;
2282                 if (type == MT_OOBDATA)
2283                         flags |= MSG_OOB;
2284         } else {
2285                 if ((flags & MSG_PEEK) == 0) {
2286                         KASSERT(so->so_rcv.sb_mb == nextrecord,
2287                             ("soreceive: sb_mb != nextrecord"));
2288                         if (so->so_rcv.sb_mb == NULL) {
2289                                 KASSERT(so->so_rcv.sb_lastrecord == NULL,
2290                                     ("soreceive: sb_lastercord != NULL"));
2291                         }
2292                 }
2293         }
2294         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2295         SBLASTRECORDCHK(&so->so_rcv);
2296         SBLASTMBUFCHK(&so->so_rcv);
2297
2298         /*
2299          * Now continue to read any data mbufs off of the head of the socket
2300          * buffer until the read request is satisfied.  Note that 'type' is
2301          * used to store the type of any mbuf reads that have happened so far
2302          * such that soreceive() can stop reading if the type changes, which
2303          * causes soreceive() to return only one of regular data and inline
2304          * out-of-band data in a single socket receive operation.
2305          */
2306         moff = 0;
2307         offset = 0;
2308         while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0
2309             && error == 0) {
2310                 /*
2311                  * If the type of mbuf has changed since the last mbuf
2312                  * examined ('type'), end the receive operation.
2313                  */
2314                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2315                 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
2316                         if (type != m->m_type)
2317                                 break;
2318                 } else if (type == MT_OOBDATA)
2319                         break;
2320                 else
2321                     KASSERT(m->m_type == MT_DATA,
2322                         ("m->m_type == %d", m->m_type));
2323                 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
2324                 len = uio->uio_resid;
2325                 if (so->so_oobmark && len > so->so_oobmark - offset)
2326                         len = so->so_oobmark - offset;
2327                 if (len > m->m_len - moff)
2328                         len = m->m_len - moff;
2329                 /*
2330                  * If mp is set, just pass back the mbufs.  Otherwise copy
2331                  * them out via the uio, then free.  Sockbuf must be
2332                  * consistent here (points to current mbuf, it points to next
2333                  * record) when we drop priority; we must note any additions
2334                  * to the sockbuf when we block interrupts again.
2335                  */
2336                 if (mp == NULL) {
2337                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2338                         SBLASTRECORDCHK(&so->so_rcv);
2339                         SBLASTMBUFCHK(&so->so_rcv);
2340                         SOCKBUF_UNLOCK(&so->so_rcv);
2341                         if ((m->m_flags & M_EXTPG) != 0)
2342                                 error = m_unmapped_uiomove(m, moff, uio,
2343                                     (int)len);
2344                         else
2345                                 error = uiomove(mtod(m, char *) + moff,
2346                                     (int)len, uio);
2347                         SOCKBUF_LOCK(&so->so_rcv);
2348                         if (error) {
2349                                 /*
2350                                  * The MT_SONAME mbuf has already been removed
2351                                  * from the record, so it is necessary to
2352                                  * remove the data mbufs, if any, to preserve
2353                                  * the invariant in the case of PR_ADDR that
2354                                  * requires MT_SONAME mbufs at the head of
2355                                  * each record.
2356                                  */
2357                                 if (pr->pr_flags & PR_ATOMIC &&
2358                                     ((flags & MSG_PEEK) == 0))
2359                                         (void)sbdroprecord_locked(&so->so_rcv);
2360                                 SOCKBUF_UNLOCK(&so->so_rcv);
2361                                 goto release;
2362                         }
2363                 } else
2364                         uio->uio_resid -= len;
2365                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2366                 if (len == m->m_len - moff) {
2367                         if (m->m_flags & M_EOR)
2368                                 flags |= MSG_EOR;
2369                         if (flags & MSG_PEEK) {
2370                                 m = m->m_next;
2371                                 moff = 0;
2372                         } else {
2373                                 nextrecord = m->m_nextpkt;
2374                                 sbfree(&so->so_rcv, m);
2375                                 if (mp != NULL) {
2376                                         m->m_nextpkt = NULL;
2377                                         *mp = m;
2378                                         mp = &m->m_next;
2379                                         so->so_rcv.sb_mb = m = m->m_next;
2380                                         *mp = NULL;
2381                                 } else {
2382                                         so->so_rcv.sb_mb = m_free(m);
2383                                         m = so->so_rcv.sb_mb;
2384                                 }
2385                                 sockbuf_pushsync(&so->so_rcv, nextrecord);
2386                                 SBLASTRECORDCHK(&so->so_rcv);
2387                                 SBLASTMBUFCHK(&so->so_rcv);
2388                         }
2389                 } else {
2390                         if (flags & MSG_PEEK)
2391                                 moff += len;
2392                         else {
2393                                 if (mp != NULL) {
2394                                         if (flags & MSG_DONTWAIT) {
2395                                                 *mp = m_copym(m, 0, len,
2396                                                     M_NOWAIT);
2397                                                 if (*mp == NULL) {
2398                                                         /*
2399                                                          * m_copym() couldn't
2400                                                          * allocate an mbuf.
2401                                                          * Adjust uio_resid back
2402                                                          * (it was adjusted
2403                                                          * down by len bytes,
2404                                                          * which we didn't end
2405                                                          * up "copying" over).
2406                                                          */
2407                                                         uio->uio_resid += len;
2408                                                         break;
2409                                                 }
2410                                         } else {
2411                                                 SOCKBUF_UNLOCK(&so->so_rcv);
2412                                                 *mp = m_copym(m, 0, len,
2413                                                     M_WAITOK);
2414                                                 SOCKBUF_LOCK(&so->so_rcv);
2415                                         }
2416                                 }
2417                                 sbcut_locked(&so->so_rcv, len);
2418                         }
2419                 }
2420                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2421                 if (so->so_oobmark) {
2422                         if ((flags & MSG_PEEK) == 0) {
2423                                 so->so_oobmark -= len;
2424                                 if (so->so_oobmark == 0) {
2425                                         so->so_rcv.sb_state |= SBS_RCVATMARK;
2426                                         break;
2427                                 }
2428                         } else {
2429                                 offset += len;
2430                                 if (offset == so->so_oobmark)
2431                                         break;
2432                         }
2433                 }
2434                 if (flags & MSG_EOR)
2435                         break;
2436                 /*
2437                  * If the MSG_WAITALL flag is set (for non-atomic socket), we
2438                  * must not quit until "uio->uio_resid == 0" or an error
2439                  * termination.  If a signal/timeout occurs, return with a
2440                  * short count but without error.  Keep sockbuf locked
2441                  * against other readers.
2442                  */
2443                 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
2444                     !sosendallatonce(so) && nextrecord == NULL) {
2445                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2446                         if (so->so_error || so->so_rerror ||
2447                             so->so_rcv.sb_state & SBS_CANTRCVMORE)
2448                                 break;
2449                         /*
2450                          * Notify the protocol that some data has been
2451                          * drained before blocking.
2452                          */
2453                         if (pr->pr_flags & PR_WANTRCVD) {
2454                                 SOCKBUF_UNLOCK(&so->so_rcv);
2455                                 VNET_SO_ASSERT(so);
2456                                 pr->pr_rcvd(so, flags);
2457                                 SOCKBUF_LOCK(&so->so_rcv);
2458                                 if (__predict_false(so->so_rcv.sb_mb == NULL &&
2459                                     (so->so_error || so->so_rerror ||
2460                                     so->so_rcv.sb_state & SBS_CANTRCVMORE)))
2461                                         break;
2462                         }
2463                         SBLASTRECORDCHK(&so->so_rcv);
2464                         SBLASTMBUFCHK(&so->so_rcv);
2465                         /*
2466                          * We could have received some data while notifying
2467                          * the protocol.  Skip blocking in this case.
2468                          */
2469                         if (so->so_rcv.sb_mb == NULL) {
2470                                 error = sbwait(so, SO_RCV);
2471                                 if (error) {
2472                                         SOCKBUF_UNLOCK(&so->so_rcv);
2473                                         goto release;
2474                                 }
2475                         }
2476                         m = so->so_rcv.sb_mb;
2477                         if (m != NULL)
2478                                 nextrecord = m->m_nextpkt;
2479                 }
2480         }
2481
2482         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2483         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
2484                 if (report_real_len)
2485                         uio->uio_resid -= m_length(m, NULL) - moff;
2486                 flags |= MSG_TRUNC;
2487                 if ((flags & MSG_PEEK) == 0)
2488                         (void) sbdroprecord_locked(&so->so_rcv);
2489         }
2490         if ((flags & MSG_PEEK) == 0) {
2491                 if (m == NULL) {
2492                         /*
2493                          * First part is an inline SB_EMPTY_FIXUP().  Second
2494                          * part makes sure sb_lastrecord is up-to-date if
2495                          * there is still data in the socket buffer.
2496                          */
2497                         so->so_rcv.sb_mb = nextrecord;
2498                         if (so->so_rcv.sb_mb == NULL) {
2499                                 so->so_rcv.sb_mbtail = NULL;
2500                                 so->so_rcv.sb_lastrecord = NULL;
2501                         } else if (nextrecord->m_nextpkt == NULL)
2502                                 so->so_rcv.sb_lastrecord = nextrecord;
2503                 }
2504                 SBLASTRECORDCHK(&so->so_rcv);
2505                 SBLASTMBUFCHK(&so->so_rcv);
2506                 /*
2507                  * If soreceive() is being done from the socket callback,
2508                  * then don't need to generate ACK to peer to update window,
2509                  * since ACK will be generated on return to TCP.
2510                  */
2511                 if (!(flags & MSG_SOCALLBCK) &&
2512                     (pr->pr_flags & PR_WANTRCVD)) {
2513                         SOCKBUF_UNLOCK(&so->so_rcv);
2514                         VNET_SO_ASSERT(so);
2515                         pr->pr_rcvd(so, flags);
2516                         SOCKBUF_LOCK(&so->so_rcv);
2517                 }
2518         }
2519         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2520         if (orig_resid == uio->uio_resid && orig_resid &&
2521             (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
2522                 SOCKBUF_UNLOCK(&so->so_rcv);
2523                 goto restart;
2524         }
2525         SOCKBUF_UNLOCK(&so->so_rcv);
2526
2527         if (flagsp != NULL)
2528                 *flagsp |= flags;
2529 release:
2530         SOCK_IO_RECV_UNLOCK(so);
2531         return (error);
2532 }
2533
2534 /*
2535  * Optimized version of soreceive() for stream (TCP) sockets.
      *
      * Only SOCK_STREAM sockets are accepted (EINVAL otherwise).  MSG_OOB
      * requests are passed to soreceive_rcvoob(), and with KERN_TLS a socket
      * whose receive buffer has sb_tls_info set is passed to
      * soreceive_generic().  Everything else is handled here while holding
      * the receive I/O lock and the receive sockbuf lock.
      *
      * so       - socket to read from.
      * psa      - if non-NULL, *psa is set to NULL; this function itself
      *            returns no address.
      * uio      - describes the destination; uio_resid == 0 yields EINVAL.
      * mp0      - if non-NULL, data is returned as an mbuf chain in *mp0
      *            instead of being copied out through uio.
      * controlp - if non-NULL, *controlp is set to NULL; this function itself
      *            returns no control data.
      * flagsp   - if non-NULL, supplies the MSG_* flags (MSG_EOR is masked
      *            off); this function does not store flags back into it.
      *
      * Returns 0 on success or an errno value: EINVAL, ENOTCONN, EAGAIN,
      * ENOBUFS, a pending so_error, or whatever the lock, wait or copy
      * routines report.
2536  */
2537 int
2538 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
2539     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2540 {
2541         int len = 0, error = 0, flags, oresid;
2542         struct sockbuf *sb;
2543         struct mbuf *m, *n = NULL;
2544
2545         /* We only do stream sockets. */
2546         if (so->so_type != SOCK_STREAM)
2547                 return (EINVAL);
2548         if (psa != NULL)
2549                 *psa = NULL;
2550         if (flagsp != NULL)
2551                 flags = *flagsp &~ MSG_EOR;
2552         else
2553                 flags = 0;
2554         if (controlp != NULL)
2555                 *controlp = NULL;
2556         if (flags & MSG_OOB)
2557                 return (soreceive_rcvoob(so, uio, flags));
2558         if (mp0 != NULL)
2559                 *mp0 = NULL;
2560
2561         sb = &so->so_rcv;
2562
2563 #ifdef KERN_TLS
2564         /*
2565          * KTLS store TLS records as records with a control message to
2566          * describe the framing.
2567          *
2568          * We check once here before acquiring locks to optimize the
2569          * common case.
2570          */
2571         if (sb->sb_tls_info != NULL)
2572                 return (soreceive_generic(so, psa, uio, mp0, controlp,
2573                     flagsp));
2574 #endif
2575
2576         /* Prevent other readers from entering the socket. */
2577         error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
2578         if (error)
2579                 return (error);
2580         SOCKBUF_LOCK(sb);
2581
2582 #ifdef KERN_TLS
             /*
              * Repeat the sb_tls_info test now that both locks are held; drop
              * them again before handing off to soreceive_generic().
              */
2583         if (sb->sb_tls_info != NULL) {
2584                 SOCKBUF_UNLOCK(sb);
2585                 SOCK_IO_RECV_UNLOCK(so);
2586                 return (soreceive_generic(so, psa, uio, mp0, controlp,
2587                     flagsp));
2588         }
2589 #endif
2590
2591         /* Easy one, no space to copyout anything. */
2592         if (uio->uio_resid == 0) {
2593                 error = EINVAL;
2594                 goto out;
2595         }
             /*
              * Remember the size of the original request; the so_error check
              * below compares it with uio_resid to see whether this call has
              * already transferred any data.
              */
2596         oresid = uio->uio_resid;
2597
2598         /* We will never ever get anything unless we are or were connected. */
2599         if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
2600                 error = ENOTCONN;
2601                 goto out;
2602         }
2603
2604 restart:
2605         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2606
2607         /* Abort if socket has reported problems. */
2608         if (so->so_error) {
                     /*
                      * Buffered data is handed out before the error is
                      * reported.  If this call has already transferred
                      * something, return without consuming so_error.
                      * With MSG_PEEK the error is returned but left pending.
                      */
2609                 if (sbavail(sb) > 0)
2610                         goto deliver;
2611                 if (oresid > uio->uio_resid)
2612                         goto out;
2613                 error = so->so_error;
2614                 if (!(flags & MSG_PEEK))
2615                         so->so_error = 0;
2616                 goto out;
2617         }
2618
2619         /* Door is closed.  Deliver what is left, if any. */
2620         if (sb->sb_state & SBS_CANTRCVMORE) {
2621                 if (sbavail(sb) > 0)
2622                         goto deliver;
2623                 else
2624                         goto out;
2625         }
2626
2627         /* Socket buffer is empty and we shall not block. */
2628         if (sbavail(sb) == 0 &&
2629             ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
2630                 error = EAGAIN;
2631                 goto out;
2632         }
2633
2634         /* Socket buffer got some data that we shall deliver now. */
2635         if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) &&
2636             ((so->so_state & SS_NBIO) ||
2637              (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
2638              sbavail(sb) >= sb->sb_lowat ||
2639              sbavail(sb) >= uio->uio_resid ||
2640              sbavail(sb) >= sb->sb_hiwat) ) {
2641                 goto deliver;
2642         }
2643
2644         /* On MSG_WAITALL we must wait until all data or error arrives. */
2645         if ((flags & MSG_WAITALL) &&
2646             (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat))
2647                 goto deliver;
2648
2649         /*
2650          * Wait and block until (more) data comes in.
2651          * NB: Drops the sockbuf lock during wait.
2652          */
2653         error = sbwait(so, SO_RCV);
2654         if (error)
2655                 goto out;
2656         goto restart;
2657
2658 deliver:
2659         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2660         KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__));
2661         KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
2662
2663         /* Statistics. */
2664         if (uio->uio_td)
2665                 uio->uio_td->td_ru.ru_msgrcv++;
2666
2667         /* Fill uio until full or current end of socket buffer is reached. */
2668         len = min(uio->uio_resid, sbavail(sb));
2669         if (mp0 != NULL) {
2670                 /* Dequeue as many mbufs as possible. */
2671                 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
2672                         if (*mp0 == NULL)
2673                                 *mp0 = sb->sb_mb;
2674                         else
2675                                 m_cat(*mp0, sb->sb_mb);
                             /*
                              * Walk the mbufs that fit entirely within len,
                              * charging each against len and uio_resid and
                              * releasing its sockbuf accounting.  'n' trails
                              * as the last mbuf handed over.
                              */
2676                         for (m = sb->sb_mb;
2677                              m != NULL && m->m_len <= len;
2678                              m = m->m_next) {
2679                                 KASSERT(!(m->m_flags & M_NOTAVAIL),
2680                                     ("%s: m %p not available", __func__, m));
2681                                 len -= m->m_len;
2682                                 uio->uio_resid -= m->m_len;
2683                                 sbfree(sb, m);
2684                                 n = m;
2685                         }
                             /*
                              * Detach the handed-over prefix (ending at n) and
                              * make the first remaining mbuf the new head of
                              * the socket buffer.
                              */
2686                         n->m_next = NULL;
2687                         sb->sb_mb = m;
2688                         sb->sb_lastrecord = sb->sb_mb;
2689                         if (sb->sb_mb == NULL)
2690                                 SB_EMPTY_FIXUP(sb);
2691                 }
2692                 /* Copy the remainder. */
2693                 if (len > 0) {
2694                         KASSERT(sb->sb_mb != NULL,
2695                             ("%s: len > 0 && sb->sb_mb empty", __func__));
2696
2697                         m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
2698                         if (m == NULL)
2699                                 len = 0;        /* Don't flush data from sockbuf. */
2700                         else
2701                                 uio->uio_resid -= len;
2702                         if (*mp0 != NULL)
2703                                 m_cat(*mp0, m);
2704                         else
2705                                 *mp0 = m;
                             /*
                              * Nothing was dequeued above and the copy failed
                              * too, so there is no data to hand back.
                              */
2706                         if (*mp0 == NULL) {
2707                                 error = ENOBUFS;
2708                                 goto out;
2709                         }
2710                 }
2711         } else {
2712                 /* NB: Must unlock socket buffer as uiomove may sleep. */
2713                 SOCKBUF_UNLOCK(sb);
2714                 error = m_mbuftouio(uio, sb->sb_mb, len);
2715                 SOCKBUF_LOCK(sb);
                     /* On failure nothing is dropped from the socket buffer. */
2716                 if (error)
2717                         goto out;
2718         }
2719         SBLASTRECORDCHK(sb);
2720         SBLASTMBUFCHK(sb);
2721
2722         /*
2723          * Remove the delivered data from the socket buffer unless we
2724          * were only peeking.
2725          */
2726         if (!(flags & MSG_PEEK)) {
2727                 if (len > 0)
2728                         sbdrop_locked(sb, len);
2729
2730                 /* Notify protocol that we drained some data. */
                     /*
                      * The notification is skipped for MSG_SOCALLBCK callers,
                      * except when MSG_WAITALL still has data outstanding.
                      * The sockbuf lock is dropped around the pr_rcvd() call.
                      */
2731                 if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2732                     (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2733                      !(flags & MSG_SOCALLBCK))) {
2734                         SOCKBUF_UNLOCK(sb);
2735                         VNET_SO_ASSERT(so);
2736                         so->so_proto->pr_rcvd(so, flags);
2737                         SOCKBUF_LOCK(sb);
2738                 }
2739         }
2740
2741         /*
2742          * For MSG_WAITALL we may have to loop again and wait for
2743          * more data to come in.
2744          */
2745         if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2746                 goto restart;
2747 out:
             /* Common exit: both locks are held on every path that reaches here. */
2748         SBLASTRECORDCHK(sb);
2749         SBLASTMBUFCHK(sb);
2750         SOCKBUF_UNLOCK(sb);
2751         SOCK_IO_RECV_UNLOCK(so);
2752         return (error);
2753 }
2754
2755 /*
2756  * Optimized version of soreceive() for simple datagram cases from userspace.
2757  * Unlike in the stream case, we're able to drop a datagram if copyout()
2758  * fails, and because we handle datagrams atomically, we don't need to use a
2759  * sleep lock to prevent I/O interlacing.
      *
      * Requests with mp0 != NULL, or with MSG_PEEK, MSG_OOB or MSG_TRUNC set,
      * are passed to soreceive_generic().  Otherwise one record is removed
      * from the receive buffer and copied out through uio; any part that does
      * not fit is discarded and MSG_TRUNC is reported through *flagsp.
      *
      * Returns 0 on success (also when the buffer is empty and either
      * SBS_CANTRCVMORE is set or uio_resid is 0), a pending so_error,
      * EWOULDBLOCK for an empty buffer in non-blocking mode, or the error
      * from sbwait() or uiomove().  Within this function *flagsp is written
      * only on the final success path.
2760  */
2761 int
2762 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2763     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2764 {
2765         struct mbuf *m, *m2;
2766         int flags, error;
2767         ssize_t len;
2768         struct protosw *pr = so->so_proto;
2769         struct mbuf *nextrecord;
2770
2771         if (psa != NULL)
2772                 *psa = NULL;
2773         if (controlp != NULL)
2774                 *controlp = NULL;
2775         if (flagsp != NULL)
2776                 flags = *flagsp &~ MSG_EOR;
2777         else
2778                 flags = 0;
2779
2780         /*
2781          * For any complicated cases, fall back to the full
2782          * soreceive_generic().
2783          */
2784         if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC)))
2785                 return (soreceive_generic(so, psa, uio, mp0, controlp,
2786                     flagsp));
2787
2788         /*
2789          * Enforce restrictions on use.
2790          */
2791         KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2792             ("soreceive_dgram: wantrcvd"));
2793         KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2794         KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2795             ("soreceive_dgram: SBS_RCVATMARK"));
2796         KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
2797             ("soreceive_dgram: P_CONNREQUIRED"));
2798
2799         /*
2800          * Loop blocking while waiting for a datagram.
2801          */
2802         SOCKBUF_LOCK(&so->so_rcv);
2803         while ((m = so->so_rcv.sb_mb) == NULL) {
2804                 KASSERT(sbavail(&so->so_rcv) == 0,
2805                     ("soreceive_dgram: sb_mb NULL but sbavail %u",
2806                     sbavail(&so->so_rcv)));
                     /* A pending socket error is returned and cleared. */
2807                 if (so->so_error) {
2808                         error = so->so_error;
2809                         so->so_error = 0;
2810                         SOCKBUF_UNLOCK(&so->so_rcv);
2811                         return (error);
2812                 }
                     /* No more data can arrive, or nothing was asked for. */
2813                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2814                     uio->uio_resid == 0) {
2815                         SOCKBUF_UNLOCK(&so->so_rcv);
2816                         return (0);
2817                 }
2818                 if ((so->so_state & SS_NBIO) ||
2819                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2820                         SOCKBUF_UNLOCK(&so->so_rcv);
2821                         return (EWOULDBLOCK);
2822                 }
2823                 SBLASTRECORDCHK(&so->so_rcv);
2824                 SBLASTMBUFCHK(&so->so_rcv);
2825                 error = sbwait(so, SO_RCV);
2826                 if (error) {
2827                         SOCKBUF_UNLOCK(&so->so_rcv);
2828                         return (error);
2829                 }
2830         }
2831         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2832
2833         if (uio->uio_td)
2834                 uio->uio_td->td_ru.ru_msgrcv++;
2835         SBLASTRECORDCHK(&so->so_rcv);
2836         SBLASTMBUFCHK(&so->so_rcv);
2837         nextrecord = m->m_nextpkt;
2838         if (nextrecord == NULL) {
2839                 KASSERT(so->so_rcv.sb_lastrecord == m,
2840                     ("soreceive_dgram: lastrecord != m"));
2841         }
2842
2843         KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2844             ("soreceive_dgram: m_nextpkt != nextrecord"));
2845
2846         /*
2847          * Pull 'm' and its chain off the front of the packet queue.
2848          */
2849         so->so_rcv.sb_mb = NULL;
2850         sockbuf_pushsync(&so->so_rcv, nextrecord);
2851
2852         /*
2853          * Walk 'm's chain and free that many bytes from the socket buffer.
2854          */
2855         for (m2 = m; m2 != NULL; m2 = m2->m_next)
2856                 sbfree(&so->so_rcv, m2);
2857
2858         /*
2859          * Do a few last checks before we let go of the lock.
2860          */
2861         SBLASTRECORDCHK(&so->so_rcv);
2862         SBLASTMBUFCHK(&so->so_rcv);
2863         SOCKBUF_UNLOCK(&so->so_rcv);
2864
             /*
              * For PR_ADDR protocols the record starts with an MT_SONAME mbuf.
              * Its contents are passed to sodupsockaddr() when the caller
              * wants the address; the mbuf is then freed and processing
              * continues with whatever m_free() returns.
              */
2865         if (pr->pr_flags & PR_ADDR) {
2866                 KASSERT(m->m_type == MT_SONAME,
2867                     ("m->m_type == %d", m->m_type));
2868                 if (psa != NULL)
2869                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
2870                             M_NOWAIT);
2871                 m = m_free(m);
2872         }
2873         if (m == NULL) {
2874                 /* XXXRW: Can this happen? */
2875                 return (0);
2876         }
2877
2878         /*
2879          * Packet to copyout() is now in 'm' and it is disconnected from the
2880          * queue.
2881          *
2882          * Process one or more MT_CONTROL mbufs present before any data mbufs
2883          * in the first mbuf chain on the socket buffer.  We call into the
2884          * protocol to perform externalization (or freeing if controlp ==
2885          * NULL). In some cases there can be only MT_CONTROL mbufs without
2886          * MT_DATA mbufs.
2887          */
2888         if (m->m_type == MT_CONTROL) {
2889                 struct mbuf *cm = NULL, *cmn;
2890                 struct mbuf **cme = &cm;
2891
                     /*
                      * Move the leading run of MT_CONTROL mbufs onto their own
                      * chain 'cm'; 'm' ends up at the first non-control mbuf
                      * or NULL.
                      */
2892                 do {
2893                         m2 = m->m_next;
2894                         m->m_next = NULL;
2895                         *cme = m;
2896                         cme = &(*cme)->m_next;
2897                         m = m2;
2898                 } while (m != NULL && m->m_type == MT_CONTROL);
                     /*
                      * Handle the control mbufs one at a time.
                      *
                      * NOTE(review): the value dom_externalize() stores in
                      * 'error' is never examined in this function; it is
                      * overwritten by uiomove() below or lost when the
                      * function returns 0.  Confirm this is intended.
                      */
2899                 while (cm != NULL) {
2900                         cmn = cm->m_next;
2901                         cm->m_next = NULL;
2902                         if (pr->pr_domain->dom_externalize != NULL) {
2903                                 error = (*pr->pr_domain->dom_externalize)
2904                                     (cm, controlp, flags);
2905                         } else if (controlp != NULL)
2906                                 *controlp = cm;
2907                         else
2908                                 m_freem(cm);
                             /*
                              * Advance controlp past whatever was appended so
                              * the next control message chains after it.
                              */
2909                         if (controlp != NULL) {
2910                                 while (*controlp != NULL)
2911                                         controlp = &(*controlp)->m_next;
2912                         }
2913                         cm = cmn;
2914                 }
2915         }
2916         KASSERT(m == NULL || m->m_type == MT_DATA,
2917             ("soreceive_dgram: !data"));
             /*
              * Copy out data mbufs while the caller still has room.  A fully
              * copied mbuf is freed; a partially copied one is advanced past
              * the bytes already copied.  A copy error frees the rest of the
              * datagram.
              */
2918         while (m != NULL && uio->uio_resid > 0) {
2919                 len = uio->uio_resid;
2920                 if (len > m->m_len)
2921                         len = m->m_len;
2922                 error = uiomove(mtod(m, char *), (int)len, uio);
2923                 if (error) {
2924                         m_freem(m);
2925                         return (error);
2926                 }
2927                 if (len == m->m_len)
2928                         m = m_free(m);
2929                 else {
2930                         m->m_data += len;
2931                         m->m_len -= len;
2932                 }
2933         }
             /* Leftover data did not fit: flag truncation and discard it. */
2934         if (m != NULL) {
2935                 flags |= MSG_TRUNC;
2936                 m_freem(m);
2937         }
2938         if (flagsp != NULL)
2939                 *flagsp |= flags;
2940         return (0);
2941 }
2942
     /*
      * Receive entry point: dispatches to the protocol's pr_soreceive method
      * with all arguments passed through unchanged, bracketing the call with
      * CURVNET_SET(so->so_vnet) and CURVNET_RESTORE().  Returns whatever the
      * protocol method returns.
      */
2943 int
2944 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2945     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2946 {
2947         int error;
2948
2949         CURVNET_SET(so->so_vnet);
2950         error = so->so_proto->pr_soreceive(so, psa, uio, mp0, controlp, flagsp);
2951         CURVNET_RESTORE();
2952         return (error);
2953 }
2954
     /*
      * Shutdown entry point: dispatches to the protocol's pr_shutdown method
      * with 'how' passed through unchanged, bracketing the call with
      * CURVNET_SET(so->so_vnet) and CURVNET_RESTORE().  Returns whatever the
      * protocol method returns.
      */
2955 int
2956 soshutdown(struct socket *so, enum shutdown_how how)
2957 {
2958         int error;
2959
2960         CURVNET_SET(so->so_vnet);
2961         error = so->so_proto->pr_shutdown(so, how);
2962         CURVNET_RESTORE();
2963
2964         return (error);
2965 }
2966
2967 /*
2968  * Used by several pr_shutdown implementations that use generic socket buffers.
      *
      * Calls socantrcvmore() on the socket and then, while holding the receive
      * I/O lock, releases the receive buffer with sbrelease(so, SO_RCV).
      * Returns nothing; if the I/O lock cannot be taken the buffer is left
      * untouched.
2969  */
2970 void
2971 sorflush(struct socket *so)
2972 {
2973         int error;
2974
2975         VNET_SO_ASSERT(so);
2976
2977         /*
2978          * Dislodge threads currently blocked in receive and wait to acquire
2979          * a lock against other simultaneous readers before clearing the
2980          * socket buffer.  Don't let our acquire be interrupted by a signal
2981          * despite any existing socket disposition on interruptable waiting.
2982          *
2983          * The SOCK_IO_RECV_LOCK() is important here as there are some
2984          * pr_soreceive methods that read the top of the socket buffer without
2985          * acquisition of the socket buffer mutex, assuming that top of the
2986          * buffer exclusively belongs to the read(2) syscall.  This is handy
2987          * when performing MSG_PEEK.
2988          */
2989         socantrcvmore(so);
2990
2991         error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR);
             /*
              * The lock is only expected to fail for a listening socket (see
              * the assertion); nothing is flushed in that case.
              */
2992         if (error != 0) {
2993                 KASSERT(SOLISTENING(so),
2994                     ("%s: soiolock(%p) failed", __func__, so));
2995                 return;
2996         }
2997
2998         sbrelease(so, SO_RCV);
2999         SOCK_IO_RECV_UNLOCK(so);
3000
3001 }
3002
3003 /*
3004  * Wrapper for Socket established helper hook.
3005  * Parameters: socket, context of the hook point, hook id.
      *
      * Builds a struct socket_hhook_data carrying 'so' and 'hctx', with no
      * mbuf and a status of 0, and invokes HHOOKS_RUN_IF() on
      * V_socket_hhh[h_id] with that data and the socket's osd, bracketed by
      * CURVNET_SET(so->so_vnet) and CURVNET_RESTORE().
      *
      * Returns the status field of the hook data after the hooks have run.
3006  */
3007 static int inline
3008 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id)
3009 {
3010         struct socket_hhook_data hhook_data = {
3011                 .so = so,
3012                 .hctx = hctx,
3013                 .m = NULL,
3014                 .status = 0
3015         };
3016
3017         CURVNET_SET(so->so_vnet);
3018         HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd);
3019         CURVNET_RESTORE();
3020
3021         /* Ugly but needed, since hhooks return void for now */
3022         return (hhook_data.status);
3023 }
3024
3025 /*
3026  * Perhaps this routine, and sooptcopyout(), below, ought to come in an
3027  * additional variant to handle the case where the option value needs to be
3028  * some kind of integer, but not a specific size.  In addition to their use
3029  * here, these functions are also called by the protocol-level pr_ctloutput()
3030  * routines.
3031  */
3032 int
3033 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
3034 {
3035         size_t  valsize;
3036
3037         /*
3038          * If the user gives us more than we wanted, we ignore it, but if we
3039          * don't get the minimum length the caller wants, we return EINVAL.
3040          * On success, sopt->sopt_valsize is set to however much we actually
3041          * retrieved.
3042          */
3043         if ((valsize = sopt->sopt_valsize) < minlen)
3044                 return EINVAL;
3045         if (valsize > len)
3046                 sopt->sopt_valsize = valsize = len;
3047
3048         if (sopt->sopt_td != NULL)
3049                 return (copyin(sopt->sopt_val, buf, valsize));
3050
3051         bcopy(sopt->sopt_val, buf, valsize);
3052         return (0);
3053 }
3054
3055 /*
3056  * Kernel version of setsockopt(2).
3057  *
3058  * XXX: optlen is size_t, not socklen_t
3059  */
3060 int
3061 so_setsockopt(struct socket *so, int level, int optname, void *optval,
3062     size_t optlen)
3063 {
3064         struct sockopt sopt;
3065
3066         sopt.sopt_level = level;
3067         sopt.sopt_name = optname;
3068         sopt.sopt_dir = SOPT_SET;
3069         sopt.sopt_val = optval;
3070         sopt.sopt_valsize = optlen;
3071         sopt.sopt_td = NULL;
3072         return (sosetopt(so, &sopt));
3073 }
3074
/*
 * Set the socket option described by sopt on socket so.
 *
 * Options at a level other than SOL_SOCKET are handed to the protocol's
 * pr_ctloutput method, or fail with ENOPROTOOPT when the protocol has none.
 * SOL_SOCKET options are handled by the switch below.  When one of them is
 * applied successfully, the protocol's pr_ctloutput method (if any) is also
 * called with the same request and its return value is discarded.
 *
 * Returns 0 on success or an errno value.
 */
int
sosetopt(struct socket *so, struct sockopt *sopt)
{
        int     error, optval;
        struct  linger l;
        struct  timeval tv;
        sbintime_t val, *valp;
        uint32_t val32;
#ifdef MAC
        struct mac extmac;
#endif

        CURVNET_SET(so->so_vnet);
        error = 0;
        if (sopt->sopt_level != SOL_SOCKET) {
                if (so->so_proto->pr_ctloutput != NULL)
                        error = (*so->so_proto->pr_ctloutput)(so, sopt);
                else
                        error = ENOPROTOOPT;
        } else {
                switch (sopt->sopt_name) {
                case SO_ACCEPTFILTER:
                        error = accept_filt_setopt(so, sopt);
                        if (error)
                                goto bad;
                        break;

                case SO_LINGER:
                        error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
                        if (error)
                                goto bad;
                        /*
                         * A linger time that is negative, above USHRT_MAX,
                         * or above INT_MAX / hz is rejected with EDOM.
                         */
                        if (l.l_linger < 0 ||
                            l.l_linger > USHRT_MAX ||
                            l.l_linger > (INT_MAX / hz)) {
                                error = EDOM;
                                goto bad;
                        }
                        SOCK_LOCK(so);
                        so->so_linger = l.l_linger;
                        if (l.l_onoff)
                                so->so_options |= SO_LINGER;
                        else
                                so->so_options &= ~SO_LINGER;
                        SOCK_UNLOCK(so);
                        break;

                /*
                 * On/off options: the option name itself is used as the bit
                 * that is set or cleared in so_options.
                 */
                case SO_DEBUG:
                case SO_KEEPALIVE:
                case SO_DONTROUTE:
                case SO_USELOOPBACK:
                case SO_BROADCAST:
                case SO_REUSEADDR:
                case SO_REUSEPORT:
                case SO_REUSEPORT_LB:
                case SO_OOBINLINE:
                case SO_TIMESTAMP:
                case SO_BINTIME:
                case SO_NOSIGPIPE:
                case SO_NO_DDP:
                case SO_NO_OFFLOAD:
                case SO_RERROR:
                        error = sooptcopyin(sopt, &optval, sizeof optval,
                            sizeof optval);
                        if (error)
                                goto bad;
                        SOCK_LOCK(so);
                        if (optval)
                                so->so_options |= sopt->sopt_name;
                        else
                                so->so_options &= ~sopt->sopt_name;
                        SOCK_UNLOCK(so);
                        break;

                case SO_SETFIB:
                        error = sooptcopyin(sopt, &optval, sizeof optval,
                            sizeof optval);
                        if (error)
                                goto bad;

                        if (optval < 0 || optval >= rt_numfibs) {
                                error = EINVAL;
                                goto bad;
                        }
                        /*
                         * Only PF_INET, PF_INET6 and PF_ROUTE sockets keep
                         * the requested value; for any other family
                         * so_fibnum is set to 0.
                         */
                        if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
                           (so->so_proto->pr_domain->dom_family == PF_INET6) ||
                           (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
                                so->so_fibnum = optval;
                        else
                                so->so_fibnum = 0;
                        break;

                case SO_USER_COOKIE:
                        error = sooptcopyin(sopt, &val32, sizeof val32,
                            sizeof val32);
                        if (error)
                                goto bad;
                        so->so_user_cookie = val32;
                        break;

                /* Buffer sizes and low-water marks are left to the protocol. */
                case SO_SNDBUF:
                case SO_RCVBUF:
                case SO_SNDLOWAT:
                case SO_RCVLOWAT:
                        error = so->so_proto->pr_setsbopt(so, sopt);
                        if (error)
                                goto bad;
                        break;

                case SO_SNDTIMEO:
                case SO_RCVTIMEO:
#ifdef COMPAT_FREEBSD32
                        if (SV_CURPROC_FLAG(SV_ILP32)) {
                                struct timeval32 tv32;

                                /*
                                 * NOTE(review): tv32 is converted into tv
                                 * before 'error' is checked below, so on a
                                 * failed copy-in the values copied are not
                                 * meaningful.  They are never used, since the
                                 * error check follows immediately.
                                 */
                                error = sooptcopyin(sopt, &tv32, sizeof tv32,
                                    sizeof tv32);
                                CP(tv32, tv, tv_sec);
                                CP(tv32, tv, tv_usec);
                        } else
#endif
                                error = sooptcopyin(sopt, &tv, sizeof tv,
                                    sizeof tv);
                        if (error)
                                goto bad;
                        if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
                            tv.tv_usec >= 1000000) {
                                error = EDOM;
                                goto bad;
                        }
                        /* Seconds beyond INT32_MAX are stored as SBT_MAX. */
                        if (tv.tv_sec > INT32_MAX)
                                val = SBT_MAX;
                        else
                                val = tvtosbt(tv);
                        SOCK_LOCK(so);
                        /*
                         * A listening socket keeps its timeouts in the
                         * sol_sbsnd_timeo/sol_sbrcv_timeo fields; any other
                         * socket keeps them in its send/receive buffer.
                         */
                        valp = sopt->sopt_name == SO_SNDTIMEO ?
                            (SOLISTENING(so) ? &so->sol_sbsnd_timeo :
                            &so->so_snd.sb_timeo) :
                            (SOLISTENING(so) ? &so->sol_sbrcv_timeo :
                            &so->so_rcv.sb_timeo);
                        *valp = val;
                        SOCK_UNLOCK(so);
                        break;

                case SO_LABEL:
#ifdef MAC
                        error = sooptcopyin(sopt, &extmac, sizeof extmac,
                            sizeof extmac);
                        if (error)
                                goto bad;
                        /*
                         * NOTE(review): sopt_td is dereferenced here without
                         * a NULL check, while so_setsockopt() builds requests
                         * with sopt_td == NULL -- confirm no in-kernel caller
                         * uses SO_LABEL.
                         */
                        error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
                            so, &extmac);
#else
                        error = EOPNOTSUPP;
#endif
                        break;

                case SO_TS_CLOCK:
                        error = sooptcopyin(sopt, &optval, sizeof optval,
                            sizeof optval);
                        if (error)
                                goto bad;
                        if (optval < 0 || optval > SO_TS_CLOCK_MAX) {
                                error = EINVAL;
                                goto bad;
                        }
                        so->so_ts_clock = optval;
                        break;

                case SO_MAX_PACING_RATE:
                        error = sooptcopyin(sopt, &val32, sizeof(val32),
                            sizeof(val32));
                        if (error)
                                goto bad;
                        so->so_max_pacing_rate = val32;
                        break;

                default:
                        /*
                         * Unrecognized options are offered to the
                         * HHOOK_SOCKET_OPT helper hooks when at least one is
                         * registered; otherwise they fail with ENOPROTOOPT.
                         */
                        if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
                                error = hhook_run_socket(so, sopt,
                                    HHOOK_SOCKET_OPT);
                        else
                                error = ENOPROTOOPT;
                        break;
                }
                /*
                 * After a successful socket-level set, the protocol is also
                 * called with the request; its result is ignored.
                 */
                if (error == 0 && so->so_proto->pr_ctloutput != NULL)
                        (void)(*so->so_proto->pr_ctloutput)(so, sopt);
        }
bad:
        CURVNET_RESTORE();
        return (error);
}
3266
3267 /*
3268  * Helper routine for getsockopt.
3269  */
3270 int
3271 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
3272 {
3273         int     error;
3274         size_t  valsize;
3275
3276         error = 0;
3277
3278         /*
3279          * Documented get behavior is that we always return a value, possibly
3280          * truncated to fit in the user's buffer.  Traditional behavior is
3281          * that we always tell the user precisely how much we copied, rather
3282          * than something useful like the total amount we had available for
3283          * her.  Note that this interface is not idempotent; the entire
3284          * answer must be generated ahead of time.
3285          */
3286         valsize = min(len, sopt->sopt_valsize);
3287         sopt->sopt_valsize = valsize;
3288         if (sopt->sopt_val != NULL) {
3289                 if (sopt->sopt_td != NULL)
3290                         error = copyout(buf, sopt->sopt_val, valsize);
3291                 else
3292                         bcopy(buf, sopt->sopt_val, valsize);
3293         }
3294         return (error);
3295 }
3296
/*
 * Fetch the socket option described by sopt from socket so.
 *
 * Options at a level other than SOL_SOCKET are handed to the protocol's
 * pr_ctloutput method, or fail with ENOPROTOOPT when the protocol has none.
 * SOL_SOCKET options are answered by the switch below; most of them compute
 * an int in 'optval' and jump to the shared 'integer' label, which copies it
 * out with sooptcopyout().
 *
 * Returns 0 on success or an errno value.
 */
int
sogetopt(struct socket *so, struct sockopt *sopt)
{
        int     error, optval;
        struct  linger l;
        struct  timeval tv;
#ifdef MAC
        struct mac extmac;
#endif

        CURVNET_SET(so->so_vnet);
        error = 0;
        if (sopt->sopt_level != SOL_SOCKET) {
                if (so->so_proto->pr_ctloutput != NULL)
                        error = (*so->so_proto->pr_ctloutput)(so, sopt);
                else
                        error = ENOPROTOOPT;
                CURVNET_RESTORE();
                return (error);
        } else {
                switch (sopt->sopt_name) {
                case SO_ACCEPTFILTER:
                        error = accept_filt_getopt(so, sopt);
                        break;

                case SO_LINGER:
                        SOCK_LOCK(so);
                        l.l_onoff = so->so_options & SO_LINGER;
                        l.l_linger = so->so_linger;
                        SOCK_UNLOCK(so);
                        error = sooptcopyout(sopt, &l, sizeof l);
                        break;

                /*
                 * On/off options: the value reported is the option's bit
                 * masked out of so_options, not normalized to 0/1.
                 */
                case SO_USELOOPBACK:
                case SO_DONTROUTE:
                case SO_DEBUG:
                case SO_KEEPALIVE:
                case SO_REUSEADDR:
                case SO_REUSEPORT:
                case SO_REUSEPORT_LB:
                case SO_BROADCAST:
                case SO_OOBINLINE:
                case SO_ACCEPTCONN:
                case SO_TIMESTAMP:
                case SO_BINTIME:
                case SO_NOSIGPIPE:
                case SO_NO_DDP:
                case SO_NO_OFFLOAD:
                case SO_RERROR:
                        optval = so->so_options & sopt->sopt_name;
/* Shared exit for every option whose answer is the int held in optval. */
integer:
                        error = sooptcopyout(sopt, &optval, sizeof optval);
                        break;

                case SO_DOMAIN:
                        optval = so->so_proto->pr_domain->dom_family;
                        goto integer;

                case SO_TYPE:
                        optval = so->so_type;
                        goto integer;

                case SO_PROTOCOL:
                        optval = so->so_proto->pr_protocol;
                        goto integer;

                case SO_ERROR:
                        /*
                         * Reading the error clears it: so_error is reported
                         * and zeroed when set, otherwise so_rerror is.
                         */
                        SOCK_LOCK(so);
                        if (so->so_error) {
                                optval = so->so_error;
                                so->so_error = 0;
                        } else {
                                optval = so->so_rerror;
                                so->so_rerror = 0;
                        }
                        SOCK_UNLOCK(so);
                        goto integer;

                /*
                 * For the buffer limits and timeouts below, a listening
                 * socket reports its sol_sb* fields and any other socket
                 * reports the fields of its send/receive buffer.
                 */
                case SO_SNDBUF:
                        optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat :
                            so->so_snd.sb_hiwat;
                        goto integer;

                case SO_RCVBUF:
                        optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat :
                            so->so_rcv.sb_hiwat;
                        goto integer;

                case SO_SNDLOWAT:
                        optval = SOLISTENING(so) ? so->sol_sbsnd_lowat :
                            so->so_snd.sb_lowat;
                        goto integer;

                case SO_RCVLOWAT:
                        optval = SOLISTENING(so) ? so->sol_sbrcv_lowat :
                            so->so_rcv.sb_lowat;
                        goto integer;

                case SO_SNDTIMEO:
                case SO_RCVTIMEO:
                        SOCK_LOCK(so);
                        tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
                            (SOLISTENING(so) ? so->sol_sbsnd_timeo :
                            so->so_snd.sb_timeo) :
                            (SOLISTENING(so) ? so->sol_sbrcv_timeo :
                            so->so_rcv.sb_timeo));
                        SOCK_UNLOCK(so);
#ifdef COMPAT_FREEBSD32
                        /* SV_ILP32 processes get a struct timeval32. */
                        if (SV_CURPROC_FLAG(SV_ILP32)) {
                                struct timeval32 tv32;

                                CP(tv, tv32, tv_sec);
                                CP(tv, tv32, tv_usec);
                                error = sooptcopyout(sopt, &tv32, sizeof tv32);
                        } else
#endif
                                error = sooptcopyout(sopt, &tv, sizeof tv);
                        break;

                case SO_LABEL:
#ifdef MAC
                        /*
                         * The caller's struct mac is copied in and passed to
                         * the MAC framework.
                         * NOTE(review): sopt_td is dereferenced without a
                         * NULL check -- confirm callers always supply a
                         * thread for this option.
                         */
                        error = sooptcopyin(sopt, &extmac, sizeof(extmac),
                            sizeof(extmac));
                        if (error)
                                goto bad;
                        error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
                            so, &extmac);
                        if (error)
                                goto bad;
                        /* Don't copy out extmac, it is unchanged. */
#else
                        error = EOPNOTSUPP;
#endif
                        break;

                case SO_PEERLABEL:
#ifdef MAC
                        /* Same sequence as SO_LABEL, for the peer's label. */
                        error = sooptcopyin(sopt, &extmac, sizeof(extmac),
                            sizeof(extmac));
                        if (error)
                                goto bad;
                        error = mac_getsockopt_peerlabel(
                            sopt->sopt_td->td_ucred, so, &extmac);
                        if (error)
                                goto bad;
                        /* Don't copy out extmac, it is unchanged. */
#else
                        error = EOPNOTSUPP;
#endif
                        break;

                /* Listen queue statistics read as 0 on non-listening sockets. */
                case SO_LISTENQLIMIT:
                        optval = SOLISTENING(so) ? so->sol_qlimit : 0;
                        goto integer;

                case SO_LISTENQLEN:
                        optval = SOLISTENING(so) ? so->sol_qlen : 0;
                        goto integer;

                case SO_LISTENINCQLEN:
                        optval = SOLISTENING(so) ? so->sol_incqlen : 0;
                        goto integer;

                case SO_TS_CLOCK:
                        optval = so->so_ts_clock;
                        goto integer;

                case SO_MAX_PACING_RATE:
                        optval = so->so_max_pacing_rate;
                        goto integer;

                default:
                        /*
                         * Unrecognized options are offered to the
                         * HHOOK_SOCKET_OPT helper hooks when at least one is
                         * registered; otherwise they fail with ENOPROTOOPT.
                         */
                        if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
                                error = hhook_run_socket(so, sopt,
                                    HHOOK_SOCKET_OPT);
                        else
                                error = ENOPROTOOPT;
                        break;
                }
        }
/* The label only exists under MAC because only the MAC cases jump to it. */
#ifdef MAC
bad:
#endif
        CURVNET_RESTORE();
        return (error);
}
3483
3484 int
3485 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
3486 {
3487         struct mbuf *m, *m_prev;
3488         int sopt_size = sopt->sopt_valsize;
3489
3490         MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
3491         if (m == NULL)
3492                 return ENOBUFS;
3493         if (sopt_size > MLEN) {
3494                 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
3495                 if ((m->m_flags & M_EXT) == 0) {
3496                         m_free(m);
3497                         return ENOBUFS;
3498                 }
3499                 m->m_len = min(MCLBYTES, sopt_size);
3500         } else {
3501                 m->m_len = min(MLEN, sopt_size);
3502         }
3503         sopt_size -= m->m_len;
3504         *mp = m;
3505         m_prev = m;
3506
3507         while (sopt_size) {
3508                 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
3509                 if (m == NULL) {
3510                         m_freem(*mp);
3511                         return ENOBUFS;
3512                 }
3513                 if (sopt_size > MLEN) {
3514                         MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
3515                             M_NOWAIT);
3516                         if ((m->m_flags & M_EXT) == 0) {
3517                                 m_freem(m);
3518                                 m_freem(*mp);
3519                                 return ENOBUFS;
3520                         }
3521                         m->m_len = min(MCLBYTES, sopt_size);
3522                 } else {
3523                         m->m_len = min(MLEN, sopt_size);
3524                 }
3525                 sopt_size -= m->m_len;
3526                 m_prev->m_next = m;
3527                 m_prev = m;
3528         }
3529         return (0);
3530 }
3531
3532 int
3533 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
3534 {
3535         struct mbuf *m0 = m;
3536
3537         if (sopt->sopt_val == NULL)
3538                 return (0);
3539         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3540                 if (sopt->sopt_td != NULL) {
3541                         int error;
3542
3543                         error = copyin(sopt->sopt_val, mtod(m, char *),
3544                             m->m_len);
3545                         if (error != 0) {
3546                                 m_freem(m0);
3547                                 return(error);
3548                         }
3549                 } else
3550                         bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
3551                 sopt->sopt_valsize -= m->m_len;
3552                 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
3553                 m = m->m_next;
3554         }
3555         if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
3556                 panic("ip6_sooptmcopyin");
3557         return (0);
3558 }
3559
3560 int
3561 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
3562 {
3563         struct mbuf *m0 = m;
3564         size_t valsize = 0;
3565
3566         if (sopt->sopt_val == NULL)
3567                 return (0);
3568         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3569                 if (sopt->sopt_td != NULL) {
3570                         int error;
3571
3572                         error = copyout(mtod(m, char *), sopt->sopt_val,
3573                             m->m_len);
3574                         if (error != 0) {
3575                                 m_freem(m0);
3576                                 return(error);
3577                         }
3578                 } else
3579                         bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
3580                 sopt->sopt_valsize -= m->m_len;
3581                 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
3582                 valsize += m->m_len;
3583                 m = m->m_next;
3584         }
3585         if (m != NULL) {
3586                 /* enough soopt buffer should be given from user-land */
3587                 m_freem(m0);
3588                 return(EINVAL);
3589         }
3590         sopt->sopt_valsize = valsize;
3591         return (0);
3592 }
3593
3594 /*
3595  * sohasoutofband(): protocol notifies socket layer of the arrival of new
3596  * out-of-band data, which will then notify socket consumers.
3597  */
3598 void
3599 sohasoutofband(struct socket *so)
3600 {
3601
3602         if (so->so_sigio != NULL)
3603                 pgsigio(&so->so_sigio, SIGURG, 0);
3604         selwakeuppri(&so->so_rdsel, PSOCK);
3605 }
3606
3607 int
3608 sopoll(struct socket *so, int events, struct ucred *active_cred,
3609     struct thread *td)
3610 {
3611
3612         /*
3613          * We do not need to set or assert curvnet as long as everyone uses
3614          * sopoll_generic().
3615          */
3616         return (so->so_proto->pr_sopoll(so, events, active_cred, td));
3617 }
3618
3619 int
3620 sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
3621     struct thread *td)
3622 {
3623         int revents;
3624
3625         SOCK_LOCK(so);
3626         if (SOLISTENING(so)) {
3627                 if (!(events & (POLLIN | POLLRDNORM)))
3628                         revents = 0;
3629                 else if (!TAILQ_EMPTY(&so->sol_comp))
3630                         revents = events & (POLLIN | POLLRDNORM);
3631                 else if ((events & POLLINIGNEOF) == 0 && so->so_error)
3632                         revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP;
3633                 else {
3634                         selrecord(td, &so->so_rdsel);
3635                         revents = 0;
3636                 }
3637         } else {
3638                 revents = 0;
3639                 SOCK_SENDBUF_LOCK(so);
3640                 SOCK_RECVBUF_LOCK(so);
3641                 if (events & (POLLIN | POLLRDNORM))
3642                         if (soreadabledata(so))
3643                                 revents |= events & (POLLIN | POLLRDNORM);
3644                 if (events & (POLLOUT | POLLWRNORM))
3645                         if (sowriteable(so))
3646                                 revents |= events & (POLLOUT | POLLWRNORM);
3647                 if (events & (POLLPRI | POLLRDBAND))
3648                         if (so->so_oobmark ||
3649                             (so->so_rcv.sb_state & SBS_RCVATMARK))
3650                                 revents |= events & (POLLPRI | POLLRDBAND);
3651                 if ((events & POLLINIGNEOF) == 0) {
3652                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3653                                 revents |= events & (POLLIN | POLLRDNORM);
3654                                 if (so->so_snd.sb_state & SBS_CANTSENDMORE)
3655                                         revents |= POLLHUP;
3656                         }
3657                 }
3658                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
3659                         revents |= events & POLLRDHUP;
3660                 if (revents == 0) {
3661                         if (events &
3662                             (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) {
3663                                 selrecord(td, &so->so_rdsel);
3664                                 so->so_rcv.sb_flags |= SB_SEL;
3665                         }
3666                         if (events & (POLLOUT | POLLWRNORM)) {
3667                                 selrecord(td, &so->so_wrsel);
3668                                 so->so_snd.sb_flags |= SB_SEL;
3669                         }
3670                 }
3671                 SOCK_RECVBUF_UNLOCK(so);
3672                 SOCK_SENDBUF_UNLOCK(so);
3673         }
3674         SOCK_UNLOCK(so);
3675         return (revents);
3676 }
3677
3678 int
3679 soo_kqfilter(struct file *fp, struct knote *kn)
3680 {
3681         struct socket *so = kn->kn_fp->f_data;
3682         struct sockbuf *sb;
3683         sb_which which;
3684         struct knlist *knl;
3685
3686         switch (kn->kn_filter) {
3687         case EVFILT_READ:
3688                 kn->kn_fop = &soread_filtops;
3689                 knl = &so->so_rdsel.si_note;
3690                 sb = &so->so_rcv;
3691                 which = SO_RCV;
3692                 break;
3693         case EVFILT_WRITE:
3694                 kn->kn_fop = &sowrite_filtops;
3695                 knl = &so->so_wrsel.si_note;
3696                 sb = &so->so_snd;
3697                 which = SO_SND;
3698                 break;
3699         case EVFILT_EMPTY:
3700                 kn->kn_fop = &soempty_filtops;
3701                 knl = &so->so_wrsel.si_note;
3702                 sb = &so->so_snd;
3703                 which = SO_SND;
3704                 break;
3705         default:
3706                 return (EINVAL);
3707         }
3708
3709         SOCK_LOCK(so);
3710         if (SOLISTENING(so)) {
3711                 knlist_add(knl, kn, 1);
3712         } else {
3713                 SOCK_BUF_LOCK(so, which);
3714                 knlist_add(knl, kn, 1);
3715                 sb->sb_flags |= SB_KNOTE;
3716                 SOCK_BUF_UNLOCK(so, which);
3717         }
3718         SOCK_UNLOCK(so);
3719         return (0);
3720 }
3721
3722 static void
3723 filt_sordetach(struct knote *kn)
3724 {
3725         struct socket *so = kn->kn_fp->f_data;
3726
3727         so_rdknl_lock(so);
3728         knlist_remove(&so->so_rdsel.si_note, kn, 1);
3729         if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note))
3730                 so->so_rcv.sb_flags &= ~SB_KNOTE;
3731         so_rdknl_unlock(so);
3732 }
3733
3734 /*ARGSUSED*/
3735 static int
3736 filt_soread(struct knote *kn, long hint)
3737 {
3738         struct socket *so;
3739
3740         so = kn->kn_fp->f_data;
3741
3742         if (SOLISTENING(so)) {
3743                 SOCK_LOCK_ASSERT(so);
3744                 kn->kn_data = so->sol_qlen;
3745                 if (so->so_error) {
3746                         kn->kn_flags |= EV_EOF;
3747                         kn->kn_fflags = so->so_error;
3748                         return (1);
3749                 }
3750                 return (!TAILQ_EMPTY(&so->sol_comp));
3751         }
3752
3753         SOCK_RECVBUF_LOCK_ASSERT(so);
3754
3755         kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
3756         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3757                 kn->kn_flags |= EV_EOF;
3758                 kn->kn_fflags = so->so_error;
3759                 return (1);
3760         } else if (so->so_error || so->so_rerror)
3761                 return (1);
3762
3763         if (kn->kn_sfflags & NOTE_LOWAT) {
3764                 if (kn->kn_data >= kn->kn_sdata)
3765                         return (1);
3766         } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat)
3767                 return (1);
3768
3769         /* This hook returning non-zero indicates an event, not error */
3770         return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD));
3771 }
3772
3773 static void
3774 filt_sowdetach(struct knote *kn)
3775 {
3776         struct socket *so = kn->kn_fp->f_data;
3777
3778         so_wrknl_lock(so);
3779         knlist_remove(&so->so_wrsel.si_note, kn, 1);
3780         if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note))
3781                 so->so_snd.sb_flags &= ~SB_KNOTE;
3782         so_wrknl_unlock(so);
3783 }
3784
3785 /*ARGSUSED*/
3786 static int
3787 filt_sowrite(struct knote *kn, long hint)
3788 {
3789         struct socket *so;
3790
3791         so = kn->kn_fp->f_data;
3792
3793         if (SOLISTENING(so))
3794                 return (0);
3795
3796         SOCK_SENDBUF_LOCK_ASSERT(so);
3797         kn->kn_data = sbspace(&so->so_snd);
3798
3799         hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE);
3800
3801         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3802                 kn->kn_flags |= EV_EOF;
3803                 kn->kn_fflags = so->so_error;
3804                 return (1);
3805         } else if (so->so_error)        /* temporary udp error */
3806                 return (1);
3807         else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3808             (so->so_proto->pr_flags & PR_CONNREQUIRED))
3809                 return (0);
3810         else if (kn->kn_sfflags & NOTE_LOWAT)
3811                 return (kn->kn_data >= kn->kn_sdata);
3812         else
3813                 return (kn->kn_data >= so->so_snd.sb_lowat);
3814 }
3815
3816 static int
3817 filt_soempty(struct knote *kn, long hint)
3818 {
3819         struct socket *so;
3820
3821         so = kn->kn_fp->f_data;
3822
3823         if (SOLISTENING(so))
3824                 return (1);
3825
3826         SOCK_SENDBUF_LOCK_ASSERT(so);
3827         kn->kn_data = sbused(&so->so_snd);
3828
3829         if (kn->kn_data == 0)
3830                 return (1);
3831         else
3832                 return (0);
3833 }
3834
3835 int
3836 socheckuid(struct socket *so, uid_t uid)
3837 {
3838
3839         if (so == NULL)
3840                 return (EPERM);
3841         if (so->so_cred->cr_uid != uid)
3842                 return (EPERM);
3843         return (0);
3844 }
3845
3846 /*
3847  * These functions are used by protocols to notify the socket layer (and its
3848  * consumers) of state changes in the sockets driven by protocol-side events.
3849  */
3850
3851 /*
3852  * Procedures to manipulate state flags of socket and do appropriate wakeups.
3853  *
3854  * Normal sequence from the active (originating) side is that
3855  * soisconnecting() is called during processing of connect() call, resulting
3856  * in an eventual call to soisconnected() if/when the connection is
3857  * established.  When the connection is torn down soisdisconnecting() is
3858  * called during processing of disconnect() call, and soisdisconnected() is
3859  * called when the connection to the peer is totally severed.  The semantics
3860  * of these routines are such that connectionless protocols can call
3861  * soisconnected() and soisdisconnected() only, bypassing the in-progress
3862  * calls when setting up a ``connection'' takes no time.
3863  *
3864  * From the passive side, a socket is created with two queues of sockets:
3865  * so_incomp for connections in progress and so_comp for connections already
3866  * made and awaiting user acceptance.  As a protocol is preparing incoming
3867  * connections, it creates a socket structure queued on so_incomp by calling
3868  * sonewconn().  When the connection is established, soisconnected() is
3869  * called, and transfers the socket structure to so_comp, making it available
3870  * to accept().
3871  *
3872  * If a socket is closed with sockets on either so_incomp or so_comp, these
3873  * sockets are dropped.
3874  *
3875  * If higher-level protocols are implemented in the kernel, the wakeups done
3876  * here will sometimes cause software-interrupt process scheduling.
3877  */
3878 void
3879 soisconnecting(struct socket *so)
3880 {
3881
3882         SOCK_LOCK(so);
3883         so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3884         so->so_state |= SS_ISCONNECTING;
3885         SOCK_UNLOCK(so);
3886 }
3887
/*
 * Mark a socket as connected: under the socket lock, SS_ISCONNECTING and
 * SS_ISDISCONNECTING are cleared and SS_ISCONNECTED is set.
 *
 * If the socket is on a listener's incomplete queue (SQ_INCOMP) it is either
 * moved to the listener's completed queue and the listener is woken up, or,
 * when SO_ACCEPTFILTER is set on the socket, the listener's accept filter
 * callback is installed as a receive upcall and invoked once right away.
 * For any other socket, sleepers on so_timeo and the read and write sides
 * are woken up.
 */
void
soisconnected(struct socket *so)
{
        bool last __diagused;

        SOCK_LOCK(so);
        so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
        so->so_state |= SS_ISCONNECTED;

        if (so->so_qstate == SQ_INCOMP) {
                struct socket *head = so->so_listen;
                int ret;

                KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so));
                /*
                 * Promoting a socket from the incomplete queue to the
                 * complete one requires taking the listener lock while the
                 * socket lock is already held, i.e. the reverse of the usual
                 * lock order.  Try a trylock first; if that fails, go the
                 * hard way: keep a reference on the listener, drop the
                 * socket lock, take both locks in the proper order and
                 * recheck consistency.
                 */
                if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
                        soref(head);
                        SOCK_UNLOCK(so);
                        SOLISTEN_LOCK(head);
                        SOCK_LOCK(so);
                        if (__predict_false(head != so->so_listen)) {
                                /*
                                 * The socket went off the listen queue
                                 * while it was unlocked; most likely we
                                 * lost a race with close(2) of the
                                 * listening socket.  The socket is about
                                 * to soabort().
                                 */
                                SOCK_UNLOCK(so);
                                sorele_locked(head);
                                return;
                        }
                        /*
                         * Still queued on the same listener: give back the
                         * temporary reference taken above.  It must not be
                         * the last one.
                         */
                        last = refcount_release(&head->so_count);
                        KASSERT(!last, ("%s: released last reference for %p",
                            __func__, head));
                }
again:
                if ((so->so_options & SO_ACCEPTFILTER) == 0) {
                        /* Move the socket from sol_incomp to sol_comp. */
                        TAILQ_REMOVE(&head->sol_incomp, so, so_list);
                        head->sol_incqlen--;
                        TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
                        head->sol_qlen++;
                        so->so_qstate = SQ_COMP;
                        SOCK_UNLOCK(so);
                        solisten_wakeup(head);  /* unlocks */
                } else {
                        /*
                         * Install the accept filter callback as the receive
                         * upcall, clear SO_ACCEPTFILTER so this branch is
                         * not taken again, and call the filter once now.
                         */
                        SOCK_RECVBUF_LOCK(so);
                        soupcall_set(so, SO_RCV,
                            head->sol_accept_filter->accf_callback,
                            head->sol_accept_filter_arg);
                        so->so_options &= ~SO_ACCEPTFILTER;
                        ret = head->sol_accept_filter->accf_callback(so,
                            head->sol_accept_filter_arg, M_NOWAIT);
                        if (ret == SU_ISCONNECTED) {
                                /*
                                 * The filter reports the socket as
                                 * connected already: remove the upcall and
                                 * go back to promote the socket.
                                 */
                                soupcall_clear(so, SO_RCV);
                                SOCK_RECVBUF_UNLOCK(so);
                                goto again;
                        }
                        /* Leave the socket on the incomplete queue. */
                        SOCK_RECVBUF_UNLOCK(so);
                        SOCK_UNLOCK(so);
                        SOLISTEN_UNLOCK(head);
                }
                return;
        }
        /* Not on an incomplete queue: wake up everyone waiting on it. */
        SOCK_UNLOCK(so);
        wakeup(&so->so_timeo);
        sorwakeup(so);
        sowwakeup(so);
}
3961
3962 void
3963 soisdisconnecting(struct socket *so)
3964 {
3965
3966         SOCK_LOCK(so);
3967         so->so_state &= ~SS_ISCONNECTING;
3968         so->so_state |= SS_ISDISCONNECTING;
3969
3970         if (!SOLISTENING(so)) {
3971                 SOCK_RECVBUF_LOCK(so);
3972                 socantrcvmore_locked(so);
3973                 SOCK_SENDBUF_LOCK(so);
3974                 socantsendmore_locked(so);
3975         }
3976         SOCK_UNLOCK(so);
3977         wakeup(&so->so_timeo);
3978 }
3979
3980 void
3981 soisdisconnected(struct socket *so)
3982 {
3983
3984         SOCK_LOCK(so);
3985
3986         /*
3987          * There is at least one reader of so_state that does not
3988          * acquire socket lock, namely soreceive_generic().  Ensure
3989          * that it never sees all flags that track connection status
3990          * cleared, by ordering the update with a barrier semantic of
3991          * our release thread fence.
3992          */
3993         so->so_state |= SS_ISDISCONNECTED;
3994         atomic_thread_fence_rel();
3995         so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3996
3997         if (!SOLISTENING(so)) {
3998                 SOCK_UNLOCK(so);
3999                 SOCK_RECVBUF_LOCK(so);
4000                 socantrcvmore_locked(so);
4001                 SOCK_SENDBUF_LOCK(so);
4002                 sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
4003                 socantsendmore_locked(so);
4004         } else
4005                 SOCK_UNLOCK(so);
4006         wakeup(&so->so_timeo);
4007 }
4008
4009 int
4010 soiolock(struct socket *so, struct sx *sx, int flags)
4011 {
4012         int error;
4013
4014         KASSERT((flags & SBL_VALID) == flags,
4015             ("soiolock: invalid flags %#x", flags));
4016
4017         if ((flags & SBL_WAIT) != 0) {
4018                 if ((flags & SBL_NOINTR) != 0) {
4019                         sx_xlock(sx);
4020                 } else {
4021                         error = sx_xlock_sig(sx);
4022                         if (error != 0)
4023                                 return (error);
4024                 }
4025         } else if (!sx_try_xlock(sx)) {
4026                 return (EWOULDBLOCK);
4027         }
4028
4029         if (__predict_false(SOLISTENING(so))) {
4030                 sx_xunlock(sx);
4031                 return (ENOTCONN);
4032         }
4033         return (0);
4034 }
4035
/*
 * Release an exclusively held socket I/O sx lock.
 */
void
soiounlock(struct sx *sx)
{

        sx_xunlock(sx);
}
4041
4042 /*
4043  * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
4044  */
4045 struct sockaddr *
4046 sodupsockaddr(const struct sockaddr *sa, int mflags)
4047 {
4048         struct sockaddr *sa2;
4049
4050         sa2 = malloc(sa->sa_len, M_SONAME, mflags);
4051         if (sa2)
4052                 bcopy(sa, sa2, sa->sa_len);
4053         return sa2;
4054 }
4055
4056 /*
4057  * Register per-socket destructor.
4058  */
4059 void
4060 sodtor_set(struct socket *so, so_dtor_t *func)
4061 {
4062
4063         SOCK_LOCK_ASSERT(so);
4064         so->so_dtor = func;
4065 }
4066
4067 /*
4068  * Register per-socket buffer upcalls.
4069  */
4070 void
4071 soupcall_set(struct socket *so, sb_which which, so_upcall_t func, void *arg)
4072 {
4073         struct sockbuf *sb;
4074
4075         KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
4076
4077         switch (which) {
4078         case SO_RCV:
4079                 sb = &so->so_rcv;
4080                 break;
4081         case SO_SND:
4082                 sb = &so->so_snd;
4083                 break;
4084         }
4085         SOCK_BUF_LOCK_ASSERT(so, which);
4086         sb->sb_upcall = func;
4087         sb->sb_upcallarg = arg;
4088         sb->sb_flags |= SB_UPCALL;
4089 }
4090
4091 void
4092 soupcall_clear(struct socket *so, sb_which which)
4093 {
4094         struct sockbuf *sb;
4095
4096         KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
4097
4098         switch (which) {
4099         case SO_RCV:
4100                 sb = &so->so_rcv;
4101                 break;
4102         case SO_SND:
4103                 sb = &so->so_snd;
4104                 break;
4105         }
4106         SOCK_BUF_LOCK_ASSERT(so, which);
4107         KASSERT(sb->sb_upcall != NULL,
4108             ("%s: so %p no upcall to clear", __func__, so));
4109         sb->sb_upcall = NULL;
4110         sb->sb_upcallarg = NULL;
4111         sb->sb_flags &= ~SB_UPCALL;
4112 }
4113
4114 void
4115 solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg)
4116 {
4117
4118         SOLISTEN_LOCK_ASSERT(so);
4119         so->sol_upcall = func;
4120         so->sol_upcallarg = arg;
4121 }
4122
/*
 * Lock routine for the read-side knote list; arg is the socket.
 *
 * A listening socket is protected by the listen lock, any other socket by
 * its receive buffer lock.  The listening state is tested again after the
 * receive buffer lock has been taken; if the socket became listening in the
 * meantime, the buffer lock is dropped and the selection starts over.
 */
static void
so_rdknl_lock(void *arg)
{
        struct socket *so = arg;

        for (;;) {
                if (SOLISTENING(so)) {
                        SOLISTEN_LOCK(so);
                        return;
                }
                SOCK_RECVBUF_LOCK(so);
                if (!__predict_false(SOLISTENING(so)))
                        return;
                SOCK_RECVBUF_UNLOCK(so);
        }
}
4139
/*
 * Unlock routine for the read-side knote list; arg is the socket.  Releases
 * the listen lock for a listening socket and the receive buffer lock
 * otherwise.
 */
static void
so_rdknl_unlock(void *arg)
{
        struct socket *so = arg;

        if (SOLISTENING(so)) {
                SOLISTEN_UNLOCK(so);
                return;
        }
        SOCK_RECVBUF_UNLOCK(so);
}
4150
4151 static void
4152 so_rdknl_assert_lock(void *arg, int what)
4153 {
4154         struct socket *so = arg;
4155
4156         if (what == LA_LOCKED) {
4157                 if (SOLISTENING(so))
4158                         SOLISTEN_LOCK_ASSERT(so);
4159                 else
4160                         SOCK_RECVBUF_LOCK_ASSERT(so);
4161         } else {
4162                 if (SOLISTENING(so))
4163                         SOLISTEN_UNLOCK_ASSERT(so);
4164                 else
4165                         SOCK_RECVBUF_UNLOCK_ASSERT(so);
4166         }
4167 }
4168
/*
 * Lock routine for the write-side knote list; arg is the socket.
 *
 * A listening socket is protected by the listen lock, any other socket by
 * its send buffer lock.  The listening state is tested again after the send
 * buffer lock has been taken; if the socket became listening in the
 * meantime, the buffer lock is dropped and the selection starts over.
 */
static void
so_wrknl_lock(void *arg)
{
        struct socket *so = arg;

        for (;;) {
                if (SOLISTENING(so)) {
                        SOLISTEN_LOCK(so);
                        return;
                }
                SOCK_SENDBUF_LOCK(so);
                if (!__predict_false(SOLISTENING(so)))
                        return;
                SOCK_SENDBUF_UNLOCK(so);
        }
}
4185
/*
 * Unlock routine for the write-side knote list; arg is the socket.
 * Releases the listen lock for a listening socket and the send buffer lock
 * otherwise.
 */
static void
so_wrknl_unlock(void *arg)
{
        struct socket *so = arg;

        if (SOLISTENING(so)) {
                SOLISTEN_UNLOCK(so);
                return;
        }
        SOCK_SENDBUF_UNLOCK(so);
}
4196
4197 static void
4198 so_wrknl_assert_lock(void *arg, int what)
4199 {
4200         struct socket *so = arg;
4201
4202         if (what == LA_LOCKED) {
4203                 if (SOLISTENING(so))
4204                         SOLISTEN_LOCK_ASSERT(so);
4205                 else
4206                         SOCK_SENDBUF_LOCK_ASSERT(so);
4207         } else {
4208                 if (SOLISTENING(so))
4209                         SOLISTEN_UNLOCK_ASSERT(so);
4210                 else
4211                         SOCK_SENDBUF_UNLOCK_ASSERT(so);
4212         }
4213 }
4214
4215 /*
4216  * Create an external-format (``xsocket'') structure using the information in
4217  * the kernel-format socket structure pointed to by so.  This is done to
4218  * reduce the spew of irrelevant information over this interface, to isolate
4219  * user code from changes in the kernel structure, and potentially to provide
4220  * information-hiding if we decide that some of this information should be
4221  * hidden from users.
4222  */
4223 void
4224 sotoxsocket(struct socket *so, struct xsocket *xso)
4225 {
4226
4227         bzero(xso, sizeof(*xso));
4228         xso->xso_len = sizeof *xso;
4229         xso->xso_so = (uintptr_t)so;
4230         xso->so_type = so->so_type;
4231         xso->so_options = so->so_options;
4232         xso->so_linger = so->so_linger;
4233         xso->so_state = so->so_state;
4234         xso->so_pcb = (uintptr_t)so->so_pcb;
4235         xso->xso_protocol = so->so_proto->pr_protocol;
4236         xso->xso_family = so->so_proto->pr_domain->dom_family;
4237         xso->so_timeo = so->so_timeo;
4238         xso->so_error = so->so_error;
4239         xso->so_uid = so->so_cred->cr_uid;
4240         xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
4241         if (SOLISTENING(so)) {
4242                 xso->so_qlen = so->sol_qlen;
4243                 xso->so_incqlen = so->sol_incqlen;
4244                 xso->so_qlimit = so->sol_qlimit;
4245                 xso->so_oobmark = 0;
4246         } else {
4247                 xso->so_state |= so->so_qstate;
4248                 xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0;
4249                 xso->so_oobmark = so->so_oobmark;
4250                 sbtoxsockbuf(&so->so_snd, &xso->so_snd);
4251                 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
4252         }
4253 }
4254
4255 struct sockbuf *
4256 so_sockbuf_rcv(struct socket *so)
4257 {
4258
4259         return (&so->so_rcv);
4260 }
4261
4262 struct sockbuf *
4263 so_sockbuf_snd(struct socket *so)
4264 {
4265
4266         return (&so->so_snd);
4267 }
4268
4269 int
4270 so_state_get(const struct socket *so)
4271 {
4272
4273         return (so->so_state);
4274 }
4275
4276 void
4277 so_state_set(struct socket *so, int val)
4278 {
4279
4280         so->so_state = val;
4281 }
4282
4283 int
4284 so_options_get(const struct socket *so)
4285 {
4286
4287         return (so->so_options);
4288 }
4289
4290 void
4291 so_options_set(struct socket *so, int val)
4292 {
4293
4294         so->so_options = val;
4295 }
4296
4297 int
4298 so_error_get(const struct socket *so)
4299 {
4300
4301         return (so->so_error);
4302 }
4303
4304 void
4305 so_error_set(struct socket *so, int val)
4306 {
4307
4308         so->so_error = val;
4309 }
4310
4311 int
4312 so_linger_get(const struct socket *so)
4313 {
4314
4315         return (so->so_linger);
4316 }
4317
4318 void
4319 so_linger_set(struct socket *so, int val)
4320 {
4321
4322         KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz),
4323             ("%s: val %d out of range", __func__, val));
4324
4325         so->so_linger = val;
4326 }
4327
4328 struct protosw *
4329 so_protosw_get(const struct socket *so)
4330 {
4331
4332         return (so->so_proto);
4333 }
4334
4335 void
4336 so_protosw_set(struct socket *so, struct protosw *val)
4337 {
4338
4339         so->so_proto = val;
4340 }
4341
/*
 * Invoke sorwakeup() on the socket.
 */
void
so_sorwakeup(struct socket *so)
{
        sorwakeup(so);
}
4348
/*
 * Invoke sowwakeup() on the socket.
 */
void
so_sowwakeup(struct socket *so)
{
        sowwakeup(so);
}
4355
/*
 * Invoke sorwakeup_locked() on the socket.
 */
void
so_sorwakeup_locked(struct socket *so)
{
        sorwakeup_locked(so);
}
4362
/*
 * Invoke sowwakeup_locked() on the socket.
 */
void
so_sowwakeup_locked(struct socket *so)
{
        sowwakeup_locked(so);
}
4369
/*
 * Acquire the socket lock.
 */
void
so_lock(struct socket *so)
{
        SOCK_LOCK(so);
}
4376
/*
 * Release the socket lock.
 */
void
so_unlock(struct socket *so)
{
        SOCK_UNLOCK(so);
}