sys/kern/uipc_socket.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1982, 1986, 1988, 1990, 1993
   5  *      The Regents of the University of California.
   6  * Copyright (c) 2004 The FreeBSD Foundation
   7  * Copyright (c) 2004-2008 Robert N. M. Watson
   8  * All rights reserved.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
  35  */
  36
  37 /*
  38  * Comments on the socket life cycle:
  39  *
  40  * soalloc() sets up socket layer state for a socket, called only by
  41  * socreate() and sonewconn().  Socket layer private.
  42  *
  43  * sodealloc() tears down socket layer state for a socket, called only by
  44  * sofree() and sonewconn().  Socket layer private.
  45  *
  46  * pru_attach() associates protocol layer state with an allocated socket;
  47  * called only once, may fail, aborting socket allocation.  This is called
  48  * from socreate() and sonewconn().  Socket layer private.
  49  *
  50  * pru_detach() disassociates protocol layer state from an attached socket,
  51  * and will be called exactly once for sockets in which pru_attach() has
  52  * been successfully called.  If pru_attach() returned an error,
  53  * pru_detach() will not be called.  Socket layer private.
  54  *
  55  * pru_abort() and pru_close() notify the protocol layer that the last
  56  * consumer of a socket is starting to tear down the socket, and that the
  57  * protocol should terminate the connection.  Historically, pru_abort() also
  58  * detached protocol state from the socket state, but this is no longer the
  59  * case.
  60  *
  61  * socreate() creates a socket and attaches protocol state.  This is a public
  62  * interface that may be used by socket layer consumers to create new
  63  * sockets.
  64  *
  65  * sonewconn() creates a socket and attaches protocol state.  This is a
  66  * public interface that may be used by protocols to create new sockets when
  67  * a new connection is received and will be available for accept() on a
  68  * listen socket.
  69  *
  70  * soclose() destroys a socket after possibly waiting for it to disconnect.
  71  * This is a public interface that socket consumers should use to close and
  72  * release a socket when done with it.
  73  *
  74  * soabort() destroys a socket without waiting for it to disconnect (used
  75  * only for incoming connections that are already partially or fully
  76  * connected).  This is used internally by the socket layer when clearing
  77  * listen socket queues (due to overflow or close on the listen socket), but
  78  * is also a public interface protocols may use to abort connections in
  79  * their incomplete listen queues should they no longer be required.  Sockets
  80  * placed in completed connection listen queues should not be aborted for
  81  * reasons described in the comment above the soclose() implementation.  This
  82  * is not a general purpose close routine, and except in the specific
  83  * circumstances described here, should not be used.
  84  *
  85  * sofree() will free a socket and its protocol state if all references on
  86  * the socket have been released, and is the public interface to attempt to
  87  * free a socket when a reference is removed.  This is a socket layer private
  88  * interface.
  89  *
  90  * NOTE: In addition to socreate() and soclose(), which provide a single
  91  * socket reference to the consumer to be managed as required, there are two
  92  * calls to explicitly manage socket references, soref(), and sorele().
  93  * Currently, these are generally required only when transitioning a socket
  94  * from a listen queue to a file descriptor, in order to prevent garbage
  95  * collection of the socket at an untimely moment.  For a number of reasons,
  96  * these interfaces are not preferred, and should be avoided.
  97  *
  98  * NOTE: With regard to VNETs the general rule is that callers do not set
  99  * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
 100  * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
 101  * and sorflush(), which are usually called from a pre-set VNET context.
 102  * sopoll() currently does not need a VNET context to be set.
 103  */
 104
 105 #include <sys/cdefs.h>
 106 __FBSDID("$FreeBSD$");
 107
 108 #include "opt_inet.h"
 109 #include "opt_inet6.h"
 110 #include "opt_kern_tls.h"
 111 #include "opt_sctp.h"
 112
 113 #include <sys/param.h>
 114 #include <sys/systm.h>
 115 #include <sys/capsicum.h>
 116 #include <sys/fcntl.h>
 117 #include <sys/limits.h>
 118 #include <sys/lock.h>
 119 #include <sys/mac.h>
 120 #include <sys/malloc.h>
 121 #include <sys/mbuf.h>
 122 #include <sys/mutex.h>
 123 #include <sys/domain.h>
 124 #include <sys/file.h>                   /* for struct knote */
 125 #include <sys/hhook.h>
 126 #include <sys/kernel.h>
 127 #include <sys/khelp.h>
 128 #include <sys/ktls.h>
 129 #include <sys/event.h>
 130 #include <sys/eventhandler.h>
 131 #include <sys/poll.h>
 132 #include <sys/proc.h>
 133 #include <sys/protosw.h>
 134 #include <sys/sbuf.h>
 135 #include <sys/socket.h>
 136 #include <sys/socketvar.h>
 137 #include <sys/resourcevar.h>
 138 #include <net/route.h>
 139 #include <sys/signalvar.h>
 140 #include <sys/stat.h>
 141 #include <sys/sx.h>
 142 #include <sys/sysctl.h>
 143 #include <sys/taskqueue.h>
 144 #include <sys/uio.h>
 145 #include <sys/un.h>
 146 #include <sys/unpcb.h>
 147 #include <sys/jail.h>
 148 #include <sys/syslog.h>
 149 #include <netinet/in.h>
 150 #include <netinet/in_pcb.h>
 151 #include <netinet/tcp.h>
 152
 153 #include <net/vnet.h>
 154
 155 #include <security/mac/mac_framework.h>
 156
 157 #include <vm/uma.h>
 158
 159 #ifdef COMPAT_FREEBSD32
 160 #include <sys/mount.h>
 161 #include <sys/sysent.h>
 162 #include <compat/freebsd32/freebsd32.h>
 163 #endif
 164
 165 static int      soreceive_rcvoob(struct socket *so, struct uio *uio,
 166                     int flags);
 167 static void     so_rdknl_lock(void *);
 168 static void     so_rdknl_unlock(void *);
 169 static void     so_rdknl_assert_lock(void *, int);
 170 static void     so_wrknl_lock(void *);
 171 static void     so_wrknl_unlock(void *);
 172 static void     so_wrknl_assert_lock(void *, int);
 173
 174 static void     filt_sordetach(struct knote *kn);
 175 static int      filt_soread(struct knote *kn, long hint);
 176 static void     filt_sowdetach(struct knote *kn);
 177 static int      filt_sowrite(struct knote *kn, long hint);
 178 static int      filt_soempty(struct knote *kn, long hint);
 179 static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
 180 fo_kqfilter_t   soo_kqfilter;
 181
 182 static struct filterops soread_filtops = {
 183         .f_isfd = 1,
 184         .f_detach = filt_sordetach,
 185         .f_event = filt_soread,
 186 };
 187 static struct filterops sowrite_filtops = {
 188         .f_isfd = 1,
 189         .f_detach = filt_sowdetach,
 190         .f_event = filt_sowrite,
 191 };
 192 static struct filterops soempty_filtops = {
 193         .f_isfd = 1,
 194         .f_detach = filt_sowdetach,
 195         .f_event = filt_soempty,
 196 };
 197
 198 so_gen_t        so_gencnt;      /* generation count for sockets */
 199
 200 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
 201 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 202
 203 #define VNET_SO_ASSERT(so)                                              \
 204         VNET_ASSERT(curvnet != NULL,                                    \
 205             ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
 206
 207 VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]);
 208 #define V_socket_hhh            VNET(socket_hhh)
 209
 210 /*
 211  * Limit on the number of connections in the listen queue waiting
 212  * for accept(2).
 213  * NB: The original sysctl somaxconn is still available but hidden
 214  * to prevent confusion about the actual purpose of this number.
 215  */
 216 static u_int somaxconn = SOMAXCONN;
 217
 218 static int
 219 sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
 220 {
 221         int error;
 222         int val;
 223
 224         val = somaxconn;
 225         error = sysctl_handle_int(oidp, &val, 0, req);
 226         if (error || !req->newptr )
 227                 return (error);
 228
 229         /*
 230          * The purpose of the UINT_MAX / 3 limit, is so that the formula
 231          *   3 * so_qlimit / 2
 232          * below, will not overflow.
 233          */
 234
 235         if (val < 1 || val > UINT_MAX / 3)
 236                 return (EINVAL);
 237
 238         somaxconn = val;
 239         return (0);
 240 }
 241 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue,
 242     CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int),
 243     sysctl_somaxconn, "I",
 244     "Maximum listen socket pending connection accept queue size");
 245 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
 246     CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0,
 247     sizeof(int), sysctl_somaxconn, "I",
 248     "Maximum listen socket pending connection accept queue size (compat)");
 249
 250 static int numopensockets;
 251 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
 252     &numopensockets, 0, "Number of open sockets");
 253
 254 /*
 255  * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 256  * so_gencnt field.
 257  */
 258 static struct mtx so_global_mtx;
 259 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
 260
 261 /*
 262  * General IPC sysctl name space, used by sockets and a variety of other IPC
 263  * types.
 264  */
 265 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 266     "IPC");
 267
 268 /*
 269  * Initialize the socket subsystem and set up the socket
 270  * memory allocator.
 271  */
 272 static uma_zone_t socket_zone;
 273 int     maxsockets;
 274
 275 static void
 276 socket_zone_change(void *tag)
 277 {
 278
 279         maxsockets = uma_zone_set_max(socket_zone, maxsockets);
 280 }
 281
 282 static void
 283 socket_hhook_register(int subtype)
 284 {
 285
 286         if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype,
 287             &V_socket_hhh[subtype],
 288             HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 289                 printf("%s: WARNING: unable to register hook\n", __func__);
 290 }
 291
 292 static void
 293 socket_hhook_deregister(int subtype)
 294 {
 295
 296         if (hhook_head_deregister(V_socket_hhh[subtype]) != 0)
 297                 printf("%s: WARNING: unable to deregister hook\n", __func__);
 298 }
 299
 300 static void
 301 socket_init(void *tag)
 302 {
 303
 304         socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
 305             NULL, NULL, UMA_ALIGN_PTR, 0);
 306         maxsockets = uma_zone_set_max(socket_zone, maxsockets);
 307         uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
 308         EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
 309             EVENTHANDLER_PRI_FIRST);
 310 }
 311 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
 312
 313 static void
 314 socket_vnet_init(const void *unused __unused)
 315 {
 316         int i;
 317
 318         /* We expect a contiguous range */
 319         for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
 320                 socket_hhook_register(i);
 321 }
 322 VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
 323     socket_vnet_init, NULL);
 324
 325 static void
 326 socket_vnet_uninit(const void *unused __unused)
 327 {
 328         int i;
 329
 330         for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
 331                 socket_hhook_deregister(i);
 332 }
 333 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
 334     socket_vnet_uninit, NULL);
 335
 336 /*
 337  * Initialise maxsockets.  This SYSINIT must be run after
 338  * tunable_mbinit().
 339  */
 340 static void
 341 init_maxsockets(void *ignored)
 342 {
 343
 344         TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
 345         maxsockets = imax(maxsockets, maxfiles);
 346 }
 347 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
 348
 349 /*
 350  * Sysctl to get and set the maximum global sockets limit.  Notify protocols
 351  * of the change so that they can update their dependent limits as required.
 352  */
 353 static int
 354 sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
 355 {
 356         int error, newmaxsockets;
 357
 358         newmaxsockets = maxsockets;
 359         error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
 360         if (error == 0 && req->newptr && newmaxsockets != maxsockets) {
 361                 if (newmaxsockets > maxsockets &&
 362                     newmaxsockets <= maxfiles) {
 363                         maxsockets = newmaxsockets;
 364                         EVENTHANDLER_INVOKE(maxsockets_change);
 365                 } else
 366                         error = EINVAL;
 367         }
 368         return (error);
 369 }
 370 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets,
 371     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &maxsockets, 0,
 372     sysctl_maxsockets, "IU",
 373     "Maximum number of sockets available");
 374
 375 /*
 376  * Socket operation routines.  These routines are called by the routines in
 377  * sys_socket.c or from a system process, and implement the semantics of
 378  * socket operations by switching out to the protocol specific routines.
 379  */
 380
 381 /*
 382  * Get a socket structure from our zone, and initialize it.  Note that it
 383  * would probably be better to allocate socket and PCB at the same time, but
 384  * I'm not convinced that all the protocols can be easily modified to do
 385  * this.
 386  *
 387  * soalloc() returns a socket with a ref count of 0.
 388  */
 389 static struct socket *
 390 soalloc(struct vnet *vnet)
 391 {
 392         struct socket *so;
 393
 394         so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
 395         if (so == NULL)
 396                 return (NULL);
 397 #ifdef MAC
 398         if (mac_socket_init(so, M_NOWAIT) != 0) {
 399                 uma_zfree(socket_zone, so);
 400                 return (NULL);
 401         }
 402 #endif
 403         if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) {
 404                 uma_zfree(socket_zone, so);
 405                 return (NULL);
 406         }
 407
 408         /*
 409          * The socket locking protocol allows to lock 2 sockets at a time,
 410          * however, the first one must be a listening socket.  WITNESS lacks
 411          * a feature to change class of an existing lock, so we use DUPOK.
 412          */
 413         mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
 414         mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF);
 415         mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF);
 416         so->so_rcv.sb_sel = &so->so_rdsel;
 417         so->so_snd.sb_sel = &so->so_wrsel;
 418         sx_init(&so->so_snd_sx, "so_snd_sx");
 419         sx_init(&so->so_rcv_sx, "so_rcv_sx");
 420         TAILQ_INIT(&so->so_snd.sb_aiojobq);
 421         TAILQ_INIT(&so->so_rcv.sb_aiojobq);
 422         TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so);
 423         TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so);
 424 #ifdef VIMAGE
 425         VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
 426             __func__, __LINE__, so));
 427         so->so_vnet = vnet;
 428 #endif
 429         /* We shouldn't need the so_global_mtx */
 430         if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) {
 431                 /* Do we need more comprehensive error returns? */
 432                 uma_zfree(socket_zone, so);
 433                 return (NULL);
 434         }
 435         mtx_lock(&so_global_mtx);
 436         so->so_gencnt = ++so_gencnt;
 437         ++numopensockets;
 438 #ifdef VIMAGE
 439         vnet->vnet_sockcnt++;
 440 #endif
 441         mtx_unlock(&so_global_mtx);
 442
 443         return (so);
 444 }
 445
 446 /*
 447  * Free the storage associated with a socket at the socket layer, tear down
 448  * locks, labels, etc.  All protocol state is assumed already to have been
 449  * torn down (and possibly never set up) by the caller.
 450  */
 451 void
 452 sodealloc(struct socket *so)
 453 {
 454
 455         KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
 456         KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
 457
 458         mtx_lock(&so_global_mtx);
 459         so->so_gencnt = ++so_gencnt;
 460         --numopensockets;       /* Could be below, but faster here. */
 461 #ifdef VIMAGE
 462         VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
 463             __func__, __LINE__, so));
 464         so->so_vnet->vnet_sockcnt--;
 465 #endif
 466         mtx_unlock(&so_global_mtx);
 467 #ifdef MAC
 468         mac_socket_destroy(so);
 469 #endif
 470         hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE);
 471
 472         khelp_destroy_osd(&so->osd);
 473         if (SOLISTENING(so)) {
 474                 if (so->sol_accept_filter != NULL)
 475                         accept_filt_setopt(so, NULL);
 476         } else {
 477                 if (so->so_rcv.sb_hiwat)
 478                         (void)chgsbsize(so->so_cred->cr_uidinfo,
 479                             &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
 480                 if (so->so_snd.sb_hiwat)
 481                         (void)chgsbsize(so->so_cred->cr_uidinfo,
 482                             &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
 483                 sx_destroy(&so->so_snd_sx);
 484                 sx_destroy(&so->so_rcv_sx);
 485                 mtx_destroy(&so->so_snd_mtx);
 486                 mtx_destroy(&so->so_rcv_mtx);
 487         }
 488         crfree(so->so_cred);
 489         mtx_destroy(&so->so_lock);
 490         uma_zfree(socket_zone, so);
 491 }
 492
 493 /*
 494  * socreate returns a socket with a ref count of 1 and a file descriptor
 495  * reference.  The socket should be closed with soclose().
 496  */
 497 int
 498 socreate(int dom, struct socket **aso, int type, int proto,
 499     struct ucred *cred, struct thread *td)
 500 {
 501         struct protosw *prp;
 502         struct socket *so;
 503         int error;
 504
 505         /*
 506          * XXX: divert(4) historically abused PF_INET.  Keep this compatibility
 507          * shim until all applications have been updated.
 508          */
 509         if (__predict_false(dom == PF_INET && type == SOCK_RAW &&
 510             proto == IPPROTO_DIVERT)) {
 511                 dom = PF_DIVERT;
 512                 printf("%s uses obsolete way to create divert(4) socket\n",
 513                     td->td_proc->p_comm);
 514         }
 515
 516         prp = pffindproto(dom, type, proto);
 517         if (prp == NULL) {
 518                 /* No support for domain. */
 519                 if (pffinddomain(dom) == NULL)
 520                         return (EAFNOSUPPORT);
 521                 /* No support for socket type. */
 522                 if (proto == 0 && type != 0)
 523                         return (EPROTOTYPE);
 524                 return (EPROTONOSUPPORT);
 525         }
 526
 527         MPASS(prp->pr_attach);
 528
 529         if (IN_CAPABILITY_MODE(td) && (prp->pr_flags & PR_CAPATTACH) == 0)
 530                 return (ECAPMODE);
 531
 532         if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
 533                 return (EPROTONOSUPPORT);
 534
 535         so = soalloc(CRED_TO_VNET(cred));
 536         if (so == NULL)
 537                 return (ENOBUFS);
 538
 539         so->so_type = type;
 540         so->so_cred = crhold(cred);
 541         if ((prp->pr_domain->dom_family == PF_INET) ||
 542             (prp->pr_domain->dom_family == PF_INET6) ||
 543             (prp->pr_domain->dom_family == PF_ROUTE))
 544                 so->so_fibnum = td->td_proc->p_fibnum;
 545         else
 546                 so->so_fibnum = 0;
 547         so->so_proto = prp;
 548 #ifdef MAC
 549         mac_socket_create(cred, so);
 550 #endif
 551         knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
 552             so_rdknl_assert_lock);
 553         knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
 554             so_wrknl_assert_lock);
 555         if ((prp->pr_flags & PR_SOCKBUF) == 0) {
 556                 so->so_snd.sb_mtx = &so->so_snd_mtx;
 557                 so->so_rcv.sb_mtx = &so->so_rcv_mtx;
 558         }
 559         /*
 560          * Auto-sizing of socket buffers is managed by the protocols and
 561          * the appropriate flags must be set in the pru_attach function.
 562          */
 563         CURVNET_SET(so->so_vnet);
 564         error = prp->pr_attach(so, proto, td);
 565         CURVNET_RESTORE();
 566         if (error) {
 567                 sodealloc(so);
 568                 return (error);
 569         }
 570         soref(so);
 571         *aso = so;
 572         return (0);
 573 }
 574
 575 #ifdef REGRESSION
 576 static int regression_sonewconn_earlytest = 1;
 577 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
 578     &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
 579 #endif
 580
 581 static int sooverprio = LOG_DEBUG;
 582 SYSCTL_INT(_kern_ipc, OID_AUTO, sooverprio, CTLFLAG_RW,
 583     &sooverprio, 0, "Log priority for listen socket overflows: 0..7 or -1 to disable");
 584
 585 static struct timeval overinterval = { 60, 0 };
 586 SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW,
 587     &overinterval,
 588     "Delay in seconds between warnings for listen socket overflows");
 589
 590 /*
 591  * When an attempt at a new connection is noted on a socket which supports
 592  * accept(2), the protocol has two options:
 593  * 1) Call legacy sonewconn() function, which would call protocol attach
 594  *    method, same as used for socket(2).
 595  * 2) Call solisten_clone(), do attach that is specific to a cloned connection,
 596  *    and then call solisten_enqueue().
 597  *
 598  * Note: the ref count on the socket is 0 on return.
 599  */
 600 struct socket *
 601 solisten_clone(struct socket *head)
 602 {
 603         struct sbuf descrsb;
 604         struct socket *so;
 605         int len, overcount;
 606         u_int qlen;
 607         const char localprefix[] = "local:";
 608         char descrbuf[SUNPATHLEN + sizeof(localprefix)];
 609 #if defined(INET6)
 610         char addrbuf[INET6_ADDRSTRLEN];
 611 #elif defined(INET)
 612         char addrbuf[INET_ADDRSTRLEN];
 613 #endif
 614         bool dolog, over;
 615
 616         SOLISTEN_LOCK(head);
 617         over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
 618 #ifdef REGRESSION
 619         if (regression_sonewconn_earlytest && over) {
 620 #else
 621         if (over) {
 622 #endif
 623                 head->sol_overcount++;
 624                 dolog = (sooverprio >= 0) &&
 625                         !!ratecheck(&head->sol_lastover, &overinterval);
 626
 627                 /*
 628                  * If we're going to log, copy the overflow count and queue
 629                  * length from the listen socket before dropping the lock.
 630                  * Also, reset the overflow count.
 631                  */
 632                 if (dolog) {
 633                         overcount = head->sol_overcount;
 634                         head->sol_overcount = 0;
 635                         qlen = head->sol_qlen;
 636                 }
 637                 SOLISTEN_UNLOCK(head);
 638
 639                 if (dolog) {
 640                         /*
 641                          * Try to print something descriptive about the
 642                          * socket for the error message.
 643                          */
 644                         sbuf_new(&descrsb, descrbuf, sizeof(descrbuf),
 645                             SBUF_FIXEDLEN);
 646                         switch (head->so_proto->pr_domain->dom_family) {
 647 #if defined(INET) || defined(INET6)
 648 #ifdef INET
 649                         case AF_INET:
 650 #endif
 651 #ifdef INET6
 652                         case AF_INET6:
 653                                 if (head->so_proto->pr_domain->dom_family ==
 654                                     AF_INET6 ||
 655                                     (sotoinpcb(head)->inp_inc.inc_flags &
 656                                     INC_ISIPV6)) {
 657                                         ip6_sprintf(addrbuf,
 658                                             &sotoinpcb(head)->inp_inc.inc6_laddr);
 659                                         sbuf_printf(&descrsb, "[%s]", addrbuf);
 660                                 } else
 661 #endif
 662                                 {
 663 #ifdef INET
 664                                         inet_ntoa_r(
 665                                             sotoinpcb(head)->inp_inc.inc_laddr,
 666                                             addrbuf);
 667                                         sbuf_cat(&descrsb, addrbuf);
 668 #endif
 669                                 }
 670                                 sbuf_printf(&descrsb, ":%hu (proto %u)",
 671                                     ntohs(sotoinpcb(head)->inp_inc.inc_lport),
 672                                     head->so_proto->pr_protocol);
 673                                 break;
 674 #endif /* INET || INET6 */
 675                         case AF_UNIX:
 676                                 sbuf_cat(&descrsb, localprefix);
 677                                 if (sotounpcb(head)->unp_addr != NULL)
 678                                         len =
 679                                             sotounpcb(head)->unp_addr->sun_len -
 680                                             offsetof(struct sockaddr_un,
 681                                             sun_path);
 682                                 else
 683                                         len = 0;
 684                                 if (len > 0)
 685                                         sbuf_bcat(&descrsb,
 686                                             sotounpcb(head)->unp_addr->sun_path,
 687                                             len);
 688                                 else
 689                                         sbuf_cat(&descrsb, "(unknown)");
 690                                 break;
 691                         }
 692
 693                         /*
 694                          * If we can't print something more specific, at least
 695                          * print the domain name.
 696                          */
 697                         if (sbuf_finish(&descrsb) != 0 ||
 698                             sbuf_len(&descrsb) <= 0) {
 699                                 sbuf_clear(&descrsb);
 700                                 sbuf_cat(&descrsb,
 701                                     head->so_proto->pr_domain->dom_name ?:
 702                                     "unknown");
 703                                 sbuf_finish(&descrsb);
 704                         }
 705                         KASSERT(sbuf_len(&descrsb) > 0,
 706                             ("%s: sbuf creation failed", __func__));
 707                         /*
 708                          * Preserve the historic listen queue overflow log
 709                          * message, that starts with "sonewconn:".  It has
 710                          * been known to sysadmins for years and also test
 711                          * sys/kern/sonewconn_overflow checks for it.
 712                          */
 713                         if (head->so_cred == 0) {
 714                                 log(LOG_PRI(sooverprio),
 715                                     "sonewconn: pcb %p (%s): "
 716                                     "Listen queue overflow: %i already in "
 717                                     "queue awaiting acceptance (%d "
 718                                     "occurrences)\n", head->so_pcb,
 719                                     sbuf_data(&descrsb),
 720                                 qlen, overcount);
 721                         } else {
 722                                 log(LOG_PRI(sooverprio),
 723                                     "sonewconn: pcb %p (%s): "
 724                                     "Listen queue overflow: "
 725                                     "%i already in queue awaiting acceptance "
 726                                     "(%d occurrences), euid %d, rgid %d, jail %s\n",
 727                                     head->so_pcb, sbuf_data(&descrsb), qlen,
 728                                     overcount, head->so_cred->cr_uid,
 729                                     head->so_cred->cr_rgid,
 730                                     head->so_cred->cr_prison ?
 731                                         head->so_cred->cr_prison->pr_name :
 732                                         "not_jailed");
 733                         }
 734                         sbuf_delete(&descrsb);
 735
 736                         overcount = 0;
 737                 }
 738
 739                 return (NULL);
 740         }
 741         SOLISTEN_UNLOCK(head);
 742         VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
 743             __func__, head));
 744         so = soalloc(head->so_vnet);
 745         if (so == NULL) {
 746                 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
 747                     "limit reached or out of memory\n",
 748                     __func__, head->so_pcb);
 749                 return (NULL);
 750         }
 751         so->so_listen = head;
 752         so->so_type = head->so_type;
 753         so->so_options = head->so_options & ~SO_ACCEPTCONN;
 754         so->so_linger = head->so_linger;
 755         so->so_state = head->so_state;
 756         so->so_fibnum = head->so_fibnum;
 757         so->so_proto = head->so_proto;
 758         so->so_cred = crhold(head->so_cred);
 759 #ifdef MAC
 760         mac_socket_newconn(head, so);
 761 #endif
 762         knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
 763             so_rdknl_assert_lock);
 764         knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
 765             so_wrknl_assert_lock);
 766         VNET_SO_ASSERT(head);
 767         if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
 768                 sodealloc(so);
 769                 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
 770                     __func__, head->so_pcb);
 771                 return (NULL);
 772         }
 773         so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
 774         so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
 775         so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
 776         so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
 777         so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE;
 778         so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE;
 779         if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
 780                 so->so_snd.sb_mtx = &so->so_snd_mtx;
 781                 so->so_rcv.sb_mtx = &so->so_rcv_mtx;
 782         }
 783
 784         return (so);
 785 }
 786
 787 /* Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. */
 788 struct socket *
 789 sonewconn(struct socket *head, int connstatus)
 790 {
 791         struct socket *so;
 792
 793         if ((so = solisten_clone(head)) == NULL)
 794                 return (NULL);
 795
 796         if (so->so_proto->pr_attach(so, 0, NULL) != 0) {
 797                 sodealloc(so);
 798                 log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n",
 799                     __func__, head->so_pcb);
 800                 return (NULL);
 801         }
 802
 803         (void)solisten_enqueue(so, connstatus);
 804
 805         return (so);
 806 }
 807
 808 /*
 809  * Enqueue socket cloned by solisten_clone() to the listen queue of the
 810  * listener it has been cloned from.
 811  *
 812  * Return 'true' if socket landed on complete queue, otherwise 'false'.
 813  */
 814 bool
 815 solisten_enqueue(struct socket *so, int connstatus)
 816 {
 817         struct socket *head = so->so_listen;
 818
 819         MPASS(refcount_load(&so->so_count) == 0);
 820         refcount_init(&so->so_count, 1);
 821
 822         SOLISTEN_LOCK(head);
 823         if (head->sol_accept_filter != NULL)
 824                 connstatus = 0;
 825         so->so_state |= connstatus;
 826         soref(head); /* A socket on (in)complete queue refs head. */
 827         if (connstatus) {
 828                 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
 829                 so->so_qstate = SQ_COMP;
 830                 head->sol_qlen++;
 831                 solisten_wakeup(head);  /* unlocks */
 832                 return (true);
 833         } else {
 834                 /*
 835                  * Keep removing sockets from the head until there's room for
 836                  * us to insert on the tail.  In pre-locking revisions, this
 837                  * was a simple if(), but as we could be racing with other
 838                  * threads and soabort() requires dropping locks, we must
 839                  * loop waiting for the condition to be true.
 840                  */
 841                 while (head->sol_incqlen > head->sol_qlimit) {
 842                         struct socket *sp;
 843
 844                         sp = TAILQ_FIRST(&head->sol_incomp);
 845                         TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
 846                         head->sol_incqlen--;
 847                         SOCK_LOCK(sp);
 848                         sp->so_qstate = SQ_NONE;
 849                         sp->so_listen = NULL;
 850                         SOCK_UNLOCK(sp);
 851                         sorele_locked(head);    /* does SOLISTEN_UNLOCK, head stays */
 852                         soabort(sp);
 853                         SOLISTEN_LOCK(head);
 854                 }
 855                 TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
 856                 so->so_qstate = SQ_INCOMP;
 857                 head->sol_incqlen++;
 858                 SOLISTEN_UNLOCK(head);
 859                 return (false);
 860         }
 861 }
 862
 863 #if defined(SCTP) || defined(SCTP_SUPPORT)
 864 /*
 865  * Socket part of sctp_peeloff().  Detach a new socket from an
 866  * association.  The new socket is returned with a reference.
 867  *
 868  * XXXGL: reduce copy-paste with solisten_clone().
 869  */
 870 struct socket *
 871 sopeeloff(struct socket *head)
 872 {
 873         struct socket *so;
 874
 875         VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
 876             __func__, __LINE__, head));
 877         so = soalloc(head->so_vnet);
 878         if (so == NULL) {
 879                 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
 880                     "limit reached or out of memory\n",
 881                     __func__, head->so_pcb);
 882                 return (NULL);
 883         }
 884         so->so_type = head->so_type;
 885         so->so_options = head->so_options;
 886         so->so_linger = head->so_linger;
 887         so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
 888         so->so_fibnum = head->so_fibnum;
 889         so->so_proto = head->so_proto;
 890         so->so_cred = crhold(head->so_cred);
 891 #ifdef MAC
 892         mac_socket_newconn(head, so);
 893 #endif
 894         knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
 895             so_rdknl_assert_lock);
 896         knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
 897             so_wrknl_assert_lock);
 898         VNET_SO_ASSERT(head);
 899         if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
 900                 sodealloc(so);
 901                 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
 902                     __func__, head->so_pcb);
 903                 return (NULL);
 904         }
 905         if ((*so->so_proto->pr_attach)(so, 0, NULL)) {
 906                 sodealloc(so);
 907                 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
 908                     __func__, head->so_pcb);
 909                 return (NULL);
 910         }
 911         so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
 912         so->so_snd.sb_lowat = head->so_snd.sb_lowat;
 913         so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
 914         so->so_snd.sb_timeo = head->so_snd.sb_timeo;
 915         so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
 916         so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
 917
 918         soref(so);
 919
 920         return (so);
 921 }
 922 #endif  /* SCTP */
 923
 924 int
 925 sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
 926 {
 927         int error;
 928
 929         CURVNET_SET(so->so_vnet);
 930         error = so->so_proto->pr_bind(so, nam, td);
 931         CURVNET_RESTORE();
 932         return (error);
 933 }
 934
 935 int
 936 sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
 937 {
 938         int error;
 939
 940         CURVNET_SET(so->so_vnet);
 941         error = so->so_proto->pr_bindat(fd, so, nam, td);
 942         CURVNET_RESTORE();
 943         return (error);
 944 }
 945
 946 /*
 947  * solisten() transitions a socket from a non-listening state to a listening
 948  * state, but can also be used to update the listen queue depth on an
 949  * existing listen socket.  The protocol will call back into the sockets
 950  * layer using solisten_proto_check() and solisten_proto() to check and set
 951  * socket-layer listen state.  Call backs are used so that the protocol can
 952  * acquire both protocol and socket layer locks in whatever order is required
 953  * by the protocol.
 954  *
 955  * Protocol implementors are advised to hold the socket lock across the
 956  * socket-layer test and set to avoid races at the socket layer.
 957  */
 958 int
 959 solisten(struct socket *so, int backlog, struct thread *td)
 960 {
 961         int error;
 962
 963         CURVNET_SET(so->so_vnet);
 964         error = so->so_proto->pr_listen(so, backlog, td);
 965         CURVNET_RESTORE();
 966         return (error);
 967 }
 968
 969 /*
 970  * Prepare for a call to solisten_proto().  Acquire all socket buffer locks in
 971  * order to interlock with socket I/O.
 972  */
 973 int
 974 solisten_proto_check(struct socket *so)
 975 {
 976         SOCK_LOCK_ASSERT(so);
 977
 978         if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
 979             SS_ISDISCONNECTING)) != 0)
 980                 return (EINVAL);
 981
 982         /*
 983          * Sleeping is not permitted here, so simply fail if userspace is
 984          * attempting to transmit or receive on the socket.  This kind of
 985          * transient failure is not ideal, but it should occur only if userspace
 986          * is misusing the socket interfaces.
 987          */
 988         if (!sx_try_xlock(&so->so_snd_sx))
 989                 return (EAGAIN);
 990         if (!sx_try_xlock(&so->so_rcv_sx)) {
 991                 sx_xunlock(&so->so_snd_sx);
 992                 return (EAGAIN);
 993         }
 994         mtx_lock(&so->so_snd_mtx);
 995         mtx_lock(&so->so_rcv_mtx);
 996
 997         /* Interlock with soo_aio_queue() and KTLS. */
 998         if (!SOLISTENING(so)) {
 999                 bool ktls;
1000
1001 #ifdef KERN_TLS
1002                 ktls = so->so_snd.sb_tls_info != NULL ||
1003                     so->so_rcv.sb_tls_info != NULL;
1004 #else
1005                 ktls = false;
1006 #endif
1007                 if (ktls ||
1008                     (so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 ||
1009                     (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0) {
1010                         solisten_proto_abort(so);
1011                         return (EINVAL);
1012                 }
1013         }
1014
1015         return (0);
1016 }
1017
1018 /*
1019  * Undo the setup done by solisten_proto_check().
1020  */
1021 void
1022 solisten_proto_abort(struct socket *so)
1023 {
1024         mtx_unlock(&so->so_snd_mtx);
1025         mtx_unlock(&so->so_rcv_mtx);
1026         sx_xunlock(&so->so_snd_sx);
1027         sx_xunlock(&so->so_rcv_sx);
1028 }
1029
1030 void
1031 solisten_proto(struct socket *so, int backlog)
1032 {
1033         int sbrcv_lowat, sbsnd_lowat;
1034         u_int sbrcv_hiwat, sbsnd_hiwat;
1035         short sbrcv_flags, sbsnd_flags;
1036         sbintime_t sbrcv_timeo, sbsnd_timeo;
1037
1038         SOCK_LOCK_ASSERT(so);
1039         KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
1040             SS_ISDISCONNECTING)) == 0,
1041             ("%s: bad socket state %p", __func__, so));
1042
1043         if (SOLISTENING(so))
1044                 goto listening;
1045
1046         /*
1047          * Change this socket to listening state.
1048          */
1049         sbrcv_lowat = so->so_rcv.sb_lowat;
1050         sbsnd_lowat = so->so_snd.sb_lowat;
1051         sbrcv_hiwat = so->so_rcv.sb_hiwat;
1052         sbsnd_hiwat = so->so_snd.sb_hiwat;
1053         sbrcv_flags = so->so_rcv.sb_flags;
1054         sbsnd_flags = so->so_snd.sb_flags;
1055         sbrcv_timeo = so->so_rcv.sb_timeo;
1056         sbsnd_timeo = so->so_snd.sb_timeo;
1057
1058         sbdestroy(so, SO_SND);
1059         sbdestroy(so, SO_RCV);
1060
1061 #ifdef INVARIANTS
1062         bzero(&so->so_rcv,
1063             sizeof(struct socket) - offsetof(struct socket, so_rcv));
1064 #endif
1065
1066         so->sol_sbrcv_lowat = sbrcv_lowat;
1067         so->sol_sbsnd_lowat = sbsnd_lowat;
1068         so->sol_sbrcv_hiwat = sbrcv_hiwat;
1069         so->sol_sbsnd_hiwat = sbsnd_hiwat;
1070         so->sol_sbrcv_flags = sbrcv_flags;
1071         so->sol_sbsnd_flags = sbsnd_flags;
1072         so->sol_sbrcv_timeo = sbrcv_timeo;
1073         so->sol_sbsnd_timeo = sbsnd_timeo;
1074
1075         so->sol_qlen = so->sol_incqlen = 0;
1076         TAILQ_INIT(&so->sol_incomp);
1077         TAILQ_INIT(&so->sol_comp);
1078
1079         so->sol_accept_filter = NULL;
1080         so->sol_accept_filter_arg = NULL;
1081         so->sol_accept_filter_str = NULL;
1082
1083         so->sol_upcall = NULL;
1084         so->sol_upcallarg = NULL;
1085
1086         so->so_options |= SO_ACCEPTCONN;
1087
1088 listening:
1089         if (backlog < 0 || backlog > somaxconn)
1090                 backlog = somaxconn;
1091         so->sol_qlimit = backlog;
1092
1093         mtx_unlock(&so->so_snd_mtx);
1094         mtx_unlock(&so->so_rcv_mtx);
1095         sx_xunlock(&so->so_snd_sx);
1096         sx_xunlock(&so->so_rcv_sx);
1097 }
1098
1099 /*
1100  * Wakeup listeners/subsystems once we have a complete connection.
1101  * Enters with lock, returns unlocked.
1102  */
1103 void
1104 solisten_wakeup(struct socket *sol)
1105 {
1106
1107         if (sol->sol_upcall != NULL)
1108                 (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
1109         else {
1110                 selwakeuppri(&sol->so_rdsel, PSOCK);
1111                 KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
1112         }
1113         SOLISTEN_UNLOCK(sol);
1114         wakeup_one(&sol->sol_comp);
1115         if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL)
1116                 pgsigio(&sol->so_sigio, SIGIO, 0);
1117 }
1118
1119 /*
1120  * Return single connection off a listening socket queue.  Main consumer of
1121  * the function is kern_accept4().  Some modules, that do their own accept
1122  * management also use the function.  The socket reference held by the
1123  * listen queue is handed to the caller.
1124  *
1125  * Listening socket must be locked on entry and is returned unlocked on
1126  * return.
1127  * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT.
1128  */
1129 int
1130 solisten_dequeue(struct socket *head, struct socket **ret, int flags)
1131 {
1132         struct socket *so;
1133         int error;
1134
1135         SOLISTEN_LOCK_ASSERT(head);
1136
1137         while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
1138             head->so_error == 0) {
1139                 error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH,
1140                     "accept", 0);
1141                 if (error != 0) {
1142                         SOLISTEN_UNLOCK(head);
1143                         return (error);
1144                 }
1145         }
1146         if (head->so_error) {
1147                 error = head->so_error;
1148                 head->so_error = 0;
1149         } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp))
1150                 error = EWOULDBLOCK;
1151         else
1152                 error = 0;
1153         if (error) {
1154                 SOLISTEN_UNLOCK(head);
1155                 return (error);
1156         }
1157         so = TAILQ_FIRST(&head->sol_comp);
1158         SOCK_LOCK(so);
1159         KASSERT(so->so_qstate == SQ_COMP,
1160             ("%s: so %p not SQ_COMP", __func__, so));
1161         head->sol_qlen--;
1162         so->so_qstate = SQ_NONE;
1163         so->so_listen = NULL;
1164         TAILQ_REMOVE(&head->sol_comp, so, so_list);
1165         if (flags & ACCEPT4_INHERIT)
1166                 so->so_state |= (head->so_state & SS_NBIO);
1167         else
1168                 so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
1169         SOCK_UNLOCK(so);
1170         sorele_locked(head);
1171
1172         *ret = so;
1173         return (0);
1174 }
1175
1176 /*
1177  * Free socket upon release of the very last reference.
1178  */
1179 static void
1180 sofree(struct socket *so)
1181 {
1182         struct protosw *pr = so->so_proto;
1183
1184         SOCK_LOCK_ASSERT(so);
1185         KASSERT(refcount_load(&so->so_count) == 0,
1186             ("%s: so %p has references", __func__, so));
1187         KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE,
1188             ("%s: so %p is on listen queue", __func__, so));
1189
1190         SOCK_UNLOCK(so);
1191
1192         if (so->so_dtor != NULL)
1193                 so->so_dtor(so);
1194
1195         VNET_SO_ASSERT(so);
1196         if ((pr->pr_flags & PR_RIGHTS) && !SOLISTENING(so)) {
1197                 MPASS(pr->pr_domain->dom_dispose != NULL);
1198                 (*pr->pr_domain->dom_dispose)(so);
1199         }
1200         if (pr->pr_detach != NULL)
1201                 pr->pr_detach(so);
1202
1203         /*
1204          * From this point on, we assume that no other references to this
1205          * socket exist anywhere else in the stack.  Therefore, no locks need
1206          * to be acquired or held.
1207          */
1208         if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) {
1209                 sbdestroy(so, SO_SND);
1210                 sbdestroy(so, SO_RCV);
1211         }
1212         seldrain(&so->so_rdsel);
1213         seldrain(&so->so_wrsel);
1214         knlist_destroy(&so->so_rdsel.si_note);
1215         knlist_destroy(&so->so_wrsel.si_note);
1216         sodealloc(so);
1217 }
1218
1219 /*
1220  * Release a reference on a socket while holding the socket lock.
1221  * Unlocks the socket lock before returning.
1222  */
1223 void
1224 sorele_locked(struct socket *so)
1225 {
1226         SOCK_LOCK_ASSERT(so);
1227         if (refcount_release(&so->so_count))
1228                 sofree(so);
1229         else
1230                 SOCK_UNLOCK(so);
1231 }
1232
/*
 * Close a socket on last file table reference removal.  Initiate disconnect
 * if connected.  Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be called
 * prior to the ref count reaching zero.  The actual socket structure will
 * not be freed until the ref count reaches zero.
 *
 * For a listening socket, every connection still sitting on the incomplete
 * or complete queue is detached from the listener and aborted.
 *
 * Returns 0, or the error from sodisconnect() (ENOTCONN is reported as 0),
 * or the non-zero tsleep() result that ended the SO_LINGER wait.
 */
int
soclose(struct socket *so)
{
        struct accept_queue lqueue;
        int error = 0;
        bool listening, last __diagused;

        CURVNET_SET(so->so_vnet);
        funsetown(&so->so_sigio);
        if (so->so_state & SS_ISCONNECTED) {
                /* Start a disconnect unless one is already in progress. */
                if ((so->so_state & SS_ISDISCONNECTING) == 0) {
                        error = sodisconnect(so);
                        if (error) {
                                /* Already disconnected is not a failure. */
                                if (error == ENOTCONN)
                                        error = 0;
                                goto drop;
                        }
                }

                /*
                 * SO_LINGER with a non-zero linger value: wait for the
                 * connection to go away, except that a non-blocking socket
                 * that is already disconnecting does not wait.
                 */
                if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) {
                        if ((so->so_state & SS_ISDISCONNECTING) &&
                            (so->so_state & SS_NBIO))
                                goto drop;
                        /*
                         * Sleep interruptibly on so_timeo, so_linger * hz
                         * ticks at a time, until SS_ISCONNECTED clears or
                         * tsleep() returns non-zero; that value becomes the
                         * return value of soclose().
                         */
                        while (so->so_state & SS_ISCONNECTED) {
                                error = tsleep(&so->so_timeo,
                                    PSOCK | PCATCH, "soclos",
                                    so->so_linger * hz);
                                if (error)
                                        break;
                        }
                }
        }

drop:
        /* Give the protocol its close notification, if it wants one. */
        if (so->so_proto->pr_close != NULL)
                so->so_proto->pr_close(so);

        SOCK_LOCK(so);
        if ((listening = SOLISTENING(so))) {
                struct socket *sp;

                /*
                 * Move both listen queues onto the private list 'lqueue'
                 * (incomplete sockets first, then complete ones) and reset
                 * the queue counters while the listener is locked.
                 */
                TAILQ_INIT(&lqueue);
                TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list);
                TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list);

                so->sol_qlen = so->sol_incqlen = 0;

                /*
                 * Detach each queued socket from the listener and drop the
                 * reference it held on 'so'.  None of these can be the last
                 * reference: the one released by sorele_locked() below is
                 * still outstanding.
                 */
                TAILQ_FOREACH(sp, &lqueue, so_list) {
                        SOCK_LOCK(sp);
                        sp->so_qstate = SQ_NONE;
                        sp->so_listen = NULL;
                        SOCK_UNLOCK(sp);
                        last = refcount_release(&so->so_count);
                        KASSERT(!last, ("%s: released last reference for %p",
                            __func__, so));
                }
        }
        /* Drops this caller's reference and the socket lock; may free 'so'. */
        sorele_locked(so);
        if (listening) {
                struct socket *sp, *tsp;

                /*
                 * Abort the detached connections now that no socket lock is
                 * held.  The _SAFE iterator is needed because soabort()
                 * releases a reference on 'sp' and may free it.
                 */
                TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp)
                        soabort(sp);
        }
        CURVNET_RESTORE();
        return (error);
}
1308
1309 /*
1310  * soabort() is used to abruptly tear down a connection, such as when a
1311  * resource limit is reached (listen queue depth exceeded), or if a listen
1312  * socket is closed while there are sockets waiting to be accepted.
1313  *
1314  * This interface is tricky, because it is called on an unreferenced socket,
1315  * and must be called only by a thread that has actually removed the socket
1316  * from the listen queue it was on.  Likely this thread holds the last
1317  * reference on the socket and soabort() will proceed with sofree().  But
1318  * it might be not the last, as the sockets on the listen queues are seen
1319  * from the protocol side.
1320  *
1321  * This interface will call into the protocol code, so must not be called
1322  * with any socket locks held.  Protocols do call it while holding their own
1323  * recursible protocol mutexes, but this is something that should be subject
1324  * to review in the future.
1325  *
1326  * Usually socket should have a single reference left, but this is not a
1327  * requirement.  In the past, when we have had named references for file
1328  * descriptor and protocol, we asserted that none of them are being held.
1329  */
1330 void
1331 soabort(struct socket *so)
1332 {
1333
1334         VNET_SO_ASSERT(so);
1335
1336         if (so->so_proto->pr_abort != NULL)
1337                 so->so_proto->pr_abort(so);
1338         SOCK_LOCK(so);
1339         sorele_locked(so);
1340 }
1341
1342 int
1343 soaccept(struct socket *so, struct sockaddr **nam)
1344 {
1345         int error;
1346
1347         CURVNET_SET(so->so_vnet);
1348         error = so->so_proto->pr_accept(so, nam);
1349         CURVNET_RESTORE();
1350         return (error);
1351 }
1352
1353 int
1354 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
1355 {
1356
1357         return (soconnectat(AT_FDCWD, so, nam, td));
1358 }
1359
1360 int
1361 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
1362 {
1363         int error;
1364
1365         CURVNET_SET(so->so_vnet);
1366
1367         /*
1368          * If protocol is connection-based, can only connect once.
1369          * Otherwise, if connected, try to disconnect first.  This allows
1370          * user to disconnect by connecting to, e.g., a null address.
1371          *
1372          * Note, this check is racy and may need to be re-evaluated at the
1373          * protocol layer.
1374          */
1375         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1376             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1377             (error = sodisconnect(so)))) {
1378                 error = EISCONN;
1379         } else {
1380                 /*
1381                  * Prevent accumulated error from previous connection from
1382                  * biting us.
1383                  */
1384                 so->so_error = 0;
1385                 if (fd == AT_FDCWD) {
1386                         error = so->so_proto->pr_connect(so, nam, td);
1387                 } else {
1388                         error = so->so_proto->pr_connectat(fd, so, nam, td);
1389                 }
1390         }
1391         CURVNET_RESTORE();
1392
1393         return (error);
1394 }
1395
1396 int
1397 soconnect2(struct socket *so1, struct socket *so2)
1398 {
1399         int error;
1400
1401         CURVNET_SET(so1->so_vnet);
1402         error = so1->so_proto->pr_connect2(so1, so2);
1403         CURVNET_RESTORE();
1404         return (error);
1405 }
1406
1407 int
1408 sodisconnect(struct socket *so)
1409 {
1410         int error;
1411
1412         if ((so->so_state & SS_ISCONNECTED) == 0)
1413                 return (ENOTCONN);
1414         if (so->so_state & SS_ISDISCONNECTING)
1415                 return (EALREADY);
1416         VNET_SO_ASSERT(so);
1417         error = so->so_proto->pr_disconnect(so);
1418         return (error);
1419 }
1420
/*
 * Send a single datagram on a SOCK_DGRAM socket whose protocol is
 * PR_ATOMIC (both are asserted).
 *
 * The payload is described by 'uio' if it is non-NULL, otherwise by the mbuf
 * chain 'top'.  'addr' is the optional destination, 'control' an optional
 * control mbuf chain, 'flags' the MSG_* send flags, and 'td' the sending
 * thread (may be NULL; when set its message-send counter is incremented).
 *
 * Returns 0 or an errno value, among them: EINVAL for a negative length,
 * EPIPE when the send side is shut down, a pending so_error (which is
 * cleared), ENOTCONN/EDESTADDRREQ when no destination is available,
 * EMSGSIZE when the datagram does not fit in the send buffer space, EFAULT
 * when the copy-in fails, or whatever pr_send() returns.
 *
 * 'top' and 'control' are freed here on every path that does not reach
 * pr_send().
 */
int
sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
        long space;
        ssize_t resid;
        int clen = 0, error, dontroute;

        KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
        KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
            ("sosend_dgram: !PR_ATOMIC"));

        /* Payload length comes from the uio, or from the packet header. */
        if (uio != NULL)
                resid = uio->uio_resid;
        else
                resid = top->m_pkthdr.len;
        /*
         * In theory resid should be unsigned.  However, space must be
         * signed, as it might be less than 0 if we over-committed, and we
         * must use a signed comparison of space and resid.  On the other
         * hand, a negative resid causes us to loop sending 0-length
         * segments to the protocol.
         */
        if (resid < 0) {
                error = EINVAL;
                goto out;
        }

        /*
         * MSG_DONTROUTE only needs special handling when SO_DONTROUTE is not
         * already set on the socket; in that case the option is switched on
         * around the pr_send() call below.
         */
        dontroute =
            (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
        if (td != NULL)
                td->td_ru.ru_msgsnd++;
        /* Only the length of the first control mbuf is accounted for. */
        if (control != NULL)
                clen = control->m_len;

        /*
         * State checks are made under the send buffer lock; every failing
         * branch releases it before jumping to the common exit.
         */
        SOCKBUF_LOCK(&so->so_snd);
        if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
                SOCKBUF_UNLOCK(&so->so_snd);
                error = EPIPE;
                goto out;
        }
        if (so->so_error) {
                /* Report and consume the pending asynchronous error. */
                error = so->so_error;
                so->so_error = 0;
                SOCKBUF_UNLOCK(&so->so_snd);
                goto out;
        }
        if ((so->so_state & SS_ISCONNECTED) == 0) {
                /*
                 * `sendto' and `sendmsg' are allowed on a connection-based
                 * socket if it supports implied connect.  Return ENOTCONN if
                 * not connected and no address is supplied.
                 */
                if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
                    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
                        /*
                         * A confirming socket, or a send that carries only
                         * control data, is let through.
                         */
                        if ((so->so_state & SS_ISCONFIRMING) == 0 &&
                            !(resid == 0 && clen != 0)) {
                                SOCKBUF_UNLOCK(&so->so_snd);
                                error = ENOTCONN;
                                goto out;
                        }
                } else if (addr == NULL) {
                        if (so->so_proto->pr_flags & PR_CONNREQUIRED)
                                error = ENOTCONN;
                        else
                                error = EDESTADDRREQ;
                        SOCKBUF_UNLOCK(&so->so_snd);
                        goto out;
                }
        }

        /*
         * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
         * problem and need fixing.
         *
         * MSG_OOB sends get 1024 bytes of extra allowance, and the control
         * length is charged against the available space.
         */
        space = sbspace(&so->so_snd);
        if (flags & MSG_OOB)
                space += 1024;
        space -= clen;
        SOCKBUF_UNLOCK(&so->so_snd);
        /* A datagram is sent whole or not at all. */
        if (resid > space) {
                error = EMSGSIZE;
                goto out;
        }
        if (uio == NULL) {
                /* The caller supplied a ready-made chain; nothing to copy. */
                resid = 0;
                if (flags & MSG_EOR)
                        top->m_flags |= M_EOR;
        } else {
                /*
                 * Copy the data from userland into a mbuf chain.
                 * If no data is to be copied in, a single empty mbuf
                 * is returned.
                 */
                top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
                    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
                if (top == NULL) {
                        error = EFAULT; /* only possible error */
                        goto out;
                }
                /* Charge the bytes just copied in against the space. */
                space -= resid - uio->uio_resid;
                resid = uio->uio_resid;
        }
        KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
        /*
         * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
         * than with.
         */
        if (dontroute) {
                SOCK_LOCK(so);
                so->so_options |= SO_DONTROUTE;
                SOCK_UNLOCK(so);
        }
        /*
         * XXX all the SBS_CANTSENDMORE checks previously done could be out
         * of date.  We could have received a reset packet in an interrupt or
         * maybe we slept while doing page faults in uiomove() etc.  We could
         * probably recheck again inside the locking protection here, but
         * there are probably other places that this also happens.  We must
         * rethink this.
         */
        VNET_SO_ASSERT(so);
        /*
         * The flag argument is chosen in priority order: PRUS_OOB for
         * MSG_OOB, else PRUS_EOF, else PRUS_MORETOCOME, else 0.
         */
        error = so->so_proto->pr_send(so, (flags & MSG_OOB) ? PRUS_OOB :
        /*
         * If the user set MSG_EOF, the protocol understands this flag and
         * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
         */
            ((flags & MSG_EOF) &&
             (so->so_proto->pr_flags & PR_IMPLOPCL) &&
             (resid <= 0)) ?
                PRUS_EOF :
                /* If there is more to send set PRUS_MORETOCOME */
                (flags & MSG_MORETOCOME) ||
                (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
                top, addr, control, td);
        /* Undo the temporary SO_DONTROUTE set above. */
        if (dontroute) {
                SOCK_LOCK(so);
                so->so_options &= ~SO_DONTROUTE;
                SOCK_UNLOCK(so);
        }
        /*
         * The chains were handed to pr_send(); clear the pointers so the
         * common exit path below does not free them, whatever it returned.
         */
        clen = 0;
        control = NULL;
        top = NULL;
out:
        /* Release whatever was not passed on to the protocol. */
        if (top != NULL)
                m_freem(top);
        if (control != NULL)
                m_freem(control);
        return (error);
}
1571
1572 /*
1573  * Send on a socket.  If send must go all at once and message is larger than
1574  * send buffering, then hard error.  Lock against other senders.  If must go
1575  * all at once and not enough room now, then inform user that this would
1576  * block and do nothing.  Otherwise, if nonblocking, send as much as
1577  * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1578  * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1579  * in mbuf chain must be small enough to send all at once.
1580  *
1581  * Returns nonzero on error, timeout or signal; callers must check for short
1582  * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1583  * on return.
      *
      * Arguments:
      *   so       socket to send on; its send buffer (so_snd) and protocol
      *            switch (so_proto) are consulted.
      *   addr     destination address, handed unchanged to pr_send().  When
      *            the socket is not SS_ISCONNECTED and the protocol is not
      *            "PR_CONNREQUIRED without PR_IMPLOPCL", a NULL addr fails
      *            with ENOTCONN or EDESTADDRREQ.
      *   uio      data to copy into mbufs, or NULL when "top" is used.
      *   top      caller-built mbuf chain; dereferenced only when uio is NULL.
      *   control  optional control mbuf; only control->m_len is counted as
      *            its length.
      *   flags    MSG_* flags; this function examines MSG_EOR, MSG_DONTROUTE,
      *            MSG_OOB, MSG_EOF, MSG_MORETOCOME, MSG_NBIO and MSG_DONTWAIT.
      *   td       calling thread, may be NULL; if set, its ru_msgsnd counter
      *            is incremented.  Passed on to pr_send().
      *
      * Errors generated here: EINVAL (negative length, MSG_EOR on a
      * SOCK_STREAM socket, or an empty send on a TLS session that does not
      * permit empty frames), EPIPE (SBS_CANTSENDMORE), the pending so_error
      * (which is cleared), ENOTCONN, EDESTADDRREQ, EMSGSIZE, EWOULDBLOCK and
      * EFAULT.  Errors from SOCK_IO_SEND_LOCK(), sbwait() and pr_send() are
      * returned as-is.
1584  */
1585 int
1586 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1587     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1588 {
1589         long space;
1590         ssize_t resid;
1591         int clen = 0, error, dontroute;
             /*
              * The send is atomic when sosendallatonce(so) is true or when the
              * caller supplied the data as an mbuf chain.
              */
1592         int atomic = sosendallatonce(so) || top;
1593         int pr_send_flag;
1594 #ifdef KERN_TLS
1595         struct ktls_session *tls;
1596         int tls_enq_cnt, tls_send_flag;
1597         uint8_t tls_rtype;
1598
1599         tls = NULL;
1600         tls_rtype = TLS_RLTYPE_APP;
1601 #endif
             /*
              * Total number of bytes to send: taken from the uio when there is
              * one, otherwise from the packet header of "top" or, if it has no
              * packet header, from m_length().
              */
1602         if (uio != NULL)
1603                 resid = uio->uio_resid;
1604         else if ((top->m_flags & M_PKTHDR) != 0)
1605                 resid = top->m_pkthdr.len;
1606         else
1607                 resid = m_length(top, NULL);
1608         /*
1609          * In theory resid should be unsigned.  However, space must be
1610          * signed, as it might be less than 0 if we over-committed, and we
1611          * must use a signed comparison of space and resid.  On the other
1612          * hand, a negative resid causes us to loop sending 0-length
1613          * segments to the protocol.
1614          *
1615          * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1616          * type sockets since that's an error.
1617          */
1618         if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1619                 error = EINVAL;
1620                 goto out;
1621         }
1622
             /*
              * MSG_DONTROUTE is implemented by temporarily setting SO_DONTROUTE
              * on the socket around each pr_send() call.  This is only done
              * for PR_ATOMIC protocols and only when the option is not already
              * set on the socket.
              */
1623         dontroute =
1624             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1625             (so->so_proto->pr_flags & PR_ATOMIC);
1626         if (td != NULL)
1627                 td->td_ru.ru_msgsnd++;
1628         if (control != NULL)
1629                 clen = control->m_len;
1630
             /*
              * Serialize against other senders.  On failure the lock is not
              * held, so jump past the unlock at "release".
              */
1631         error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
1632         if (error)
1633                 goto out;
1634
1635 #ifdef KERN_TLS
1636         tls_send_flag = 0;
             /* Paired with the ktls_free() at "out". */
1637         tls = ktls_hold(so->so_snd.sb_tls_info);
1638         if (tls != NULL) {
                     /*
                      * In software mode the data is handed to the protocol with
                      * PRUS_NOTREADY and queued with ktls_enqueue() once
                      * pr_send() has succeeded (see below).
                      */
1639                 if (tls->mode == TCP_TLS_MODE_SW)
1640                         tls_send_flag = PRUS_NOTREADY;
1641
                     /*
                      * A TLS_SET_RECORD_TYPE control message selects the record
                      * type used for the next ktls_frame() call.  The control
                      * mbuf is consumed here instead of being passed to the
                      * protocol, and the send is forced to be atomic.
                      */
1642                 if (control != NULL) {
1643                         struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1644
1645                         if (clen >= sizeof(*cm) &&
1646                             cm->cmsg_type == TLS_SET_RECORD_TYPE) {
1647                                 tls_rtype = *((uint8_t *)CMSG_DATA(cm));
1648                                 clen = 0;
1649                                 m_freem(control);
1650                                 control = NULL;
1651                                 atomic = 1;
1652                         }
1653                 }
1654
1655                 if (resid == 0 && !ktls_permit_empty_frames(tls)) {
1656                         error = EINVAL;
1657                         goto release;
1658                 }
1659         }
1660 #endif
1661
             /*
              * Outer loop: (re)validate the socket state and compute the
              * available space under the send buffer lock.  It is re-entered
              * via "restart" after sleeping in sbwait().
              */
1662 restart:
1663         do {
1664                 SOCKBUF_LOCK(&so->so_snd);
1665                 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1666                         SOCKBUF_UNLOCK(&so->so_snd);
1667                         error = EPIPE;
1668                         goto release;
1669                 }
                     /* Report a pending socket error once and clear it. */
1670                 if (so->so_error) {
1671                         error = so->so_error;
1672                         so->so_error = 0;
1673                         SOCKBUF_UNLOCK(&so->so_snd);
1674                         goto release;
1675                 }
1676                 if ((so->so_state & SS_ISCONNECTED) == 0) {
1677                         /*
1678                          * `sendto' and `sendmsg' are allowed on a connection-
1679                          * based socket if it supports implied connect.
1680                          * Return ENOTCONN if not connected and no address is
1681                          * supplied.
1682                          */
1683                         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1684                             (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
                                     /*
                                      * A confirming socket, or a send that
                                      * carries only control data (no data
                                      * bytes), is let through.
                                      */
1685                                 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1686                                     !(resid == 0 && clen != 0)) {
1687                                         SOCKBUF_UNLOCK(&so->so_snd);
1688                                         error = ENOTCONN;
1689                                         goto release;
1690                                 }
1691                         } else if (addr == NULL) {
1692                                 SOCKBUF_UNLOCK(&so->so_snd);
1693                                 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1694                                         error = ENOTCONN;
1695                                 else
1696                                         error = EDESTADDRREQ;
1697                                 goto release;
1698                         }
1699                 }
1700                 space = sbspace(&so->so_snd);
                     /* Out-of-band sends get 1024 bytes of extra allowance. */
1701                 if (flags & MSG_OOB)
1702                         space += 1024;
                     /*
                      * An atomic message, or control data, larger than the
                      * high-water mark is rejected outright.
                      */
1703                 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1704                     clen > so->so_snd.sb_hiwat) {
1705                         SOCKBUF_UNLOCK(&so->so_snd);
1706                         error = EMSGSIZE;
1707                         goto release;
1708                 }
                     /*
                      * Not everything fits right now.  Wait (or fail with
                      * EWOULDBLOCK when non-blocking) if the send is atomic, if
                      * the free space is below the low-water mark, or if the
                      * control data alone does not fit.  Otherwise fall through
                      * and send as much as fits.
                      */
1709                 if (space < resid + clen &&
1710                     (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1711                         if ((so->so_state & SS_NBIO) ||
1712                             (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
1713                                 SOCKBUF_UNLOCK(&so->so_snd);
1714                                 error = EWOULDBLOCK;
1715                                 goto release;
1716                         }
1717                         error = sbwait(so, SO_SND);
1718                         SOCKBUF_UNLOCK(&so->so_snd);
1719                         if (error)
1720                                 goto release;
1721                         goto restart;
1722                 }
1723                 SOCKBUF_UNLOCK(&so->so_snd);
                     /* Set aside room for the control data. */
1724                 space -= clen;
                     /*
                      * Inner loop: build one mbuf chain per iteration and hand
                      * it to the protocol, for as long as data and space remain.
                      */
1725                 do {
1726                         if (uio == NULL) {
                                     /* The caller's chain goes out in one piece. */
1727                                 resid = 0;
1728                                 if (flags & MSG_EOR)
1729                                         top->m_flags |= M_EOR;
1730 #ifdef KERN_TLS
1731                                 if (tls != NULL) {
1732                                         ktls_frame(top, tls, &tls_enq_cnt,
1733                                             tls_rtype);
                                             /*
                                              * A caller-selected record type
                                              * applies to one ktls_frame() call
                                              * only.
                                              */
1734                                         tls_rtype = TLS_RLTYPE_APP;
1735                                 }
1736 #endif
1737                         } else {
1738                                 /*
1739                                  * Copy the data from userland into a mbuf
1740                                  * chain.  If resid is 0, which can happen
1741                                  * only if we have control to send, then
1742                                  * a single empty mbuf is returned.  This
1743                                  * is a workaround to prevent protocol send
1744                                  * methods from panicking.
1745                                  */
1746 #ifdef KERN_TLS
1747                                 if (tls != NULL) {
1748                                         top = m_uiotombuf(uio, M_WAITOK, space,
1749                                             tls->params.max_frame_len,
1750                                             M_EXTPG |
1751                                             ((flags & MSG_EOR) ? M_EOR : 0));
1752                                         if (top != NULL) {
1753                                                 ktls_frame(top, tls,
1754                                                     &tls_enq_cnt, tls_rtype);
1755                                         }
1756                                         tls_rtype = TLS_RLTYPE_APP;
1757                                 } else
1758 #endif
1759                                         top = m_uiotombuf(uio, M_WAITOK, space,
1760                                             (atomic ? max_hdr : 0),
1761                                             (atomic ? M_PKTHDR : 0) |
1762                                             ((flags & MSG_EOR) ? M_EOR : 0));
1763                                 if (top == NULL) {
1764                                         error = EFAULT; /* only possible error */
1765                                         goto release;
1766                                 }
                                     /*
                                      * Charge the bytes just copied against
                                      * space, then refresh resid from the uio.
                                      */
1767                                 space -= resid - uio->uio_resid;
1768                                 resid = uio->uio_resid;
1769                         }
1770                         if (dontroute) {
1771                                 SOCK_LOCK(so);
1772                                 so->so_options |= SO_DONTROUTE;
1773                                 SOCK_UNLOCK(so);
1774                         }
1775                         /*
1776                          * XXX all the SBS_CANTSENDMORE checks previously
1777                          * done could be out of date.  We could have received
1778                          * a reset packet in an interrupt or maybe we slept
1779                          * while doing page faults in uiomove() etc.  We
1780                          * could probably recheck again inside the locking
1781                          * protection here, but there are probably other
1782                          * places that this also happens.  We must rethink
1783                          * this.
1784                          */
1785                         VNET_SO_ASSERT(so);
1786
                             /*
                              * Chained conditional, first match wins: PRUS_OOB,
                              * then PRUS_EOF, then PRUS_MORETOCOME, else 0.
                              * "||" binds tighter than "?:", so the last test
                              * is (MSG_MORETOCOME || (resid > 0 && space > 0)).
                              */
1787                         pr_send_flag = (flags & MSG_OOB) ? PRUS_OOB :
1788                         /*
1789                          * If the user set MSG_EOF, the protocol understands
1790                          * this flag (PR_IMPLOPCL) and there is nothing left
1791                          * to send, then request PRUS_EOF.
1792                          */
1793                             ((flags & MSG_EOF) &&
1794                              (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1795                              (resid <= 0)) ?
1796                                 PRUS_EOF :
1797                         /* If there is more to send set PRUS_MORETOCOME. */
1798                             (flags & MSG_MORETOCOME) ||
1799                             (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
1800
1801 #ifdef KERN_TLS
1802                         pr_send_flag |= tls_send_flag;
1803 #endif
1804
1805                         error = so->so_proto->pr_send(so, pr_send_flag, top,
1806                             addr, control, td);
1807
                             /* Undo the temporary SO_DONTROUTE set above. */
1808                         if (dontroute) {
1809                                 SOCK_LOCK(so);
1810                                 so->so_options &= ~SO_DONTROUTE;
1811                                 SOCK_UNLOCK(so);
1812                         }
1813
1814 #ifdef KERN_TLS
                             /*
                              * Software TLS: on failure the chain is freed
                              * here; on success a socket reference is taken
                              * and the chain is queued with ktls_enqueue().
                              */
1815                         if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
1816                                 if (error != 0) {
1817                                         m_freem(top);
1818                                         top = NULL;
1819                                 } else {
1820                                         soref(so);
1821                                         ktls_enqueue(top, so, tls_enq_cnt);
1822                                 }
1823                         }
1824 #endif
                             /*
                              * Forget the local references so that the cleanup
                              * at "out" does not free them.
                              * NOTE(review): this presumes pr_send() has taken
                              * over top and control whether or not it failed
                              * -- confirm against the protocol send methods.
                              */
1825                         clen = 0;
1826                         control = NULL;
1827                         top = NULL;
1828                         if (error)
1829                                 goto release;
1830                 } while (resid && space > 0);
1831         } while (resid);
1832
             /* release: drop the I/O send lock taken above. */
1833 release:
1834         SOCK_IO_SEND_UNLOCK(so);
             /*
              * out: drop the TLS session reference and free whatever data or
              * control mbufs are still referenced locally.
              */
1835 out:
1836 #ifdef KERN_TLS
1837         if (tls != NULL)
1838                 ktls_free(tls);
1839 #endif
1840         if (top != NULL)
1841                 m_freem(top);
1842         if (control != NULL)
1843                 m_freem(control);
1844         return (error);
1845 }
1846
1847 /*
1848  * Send to a socket from a kernel thread.
1849  *
1850  * XXXGL: in almost all cases uio is NULL and the mbuf is supplied.
1851  * Exception is nfs/bootp_subr.c.  It is arguable that the VNET context needs
1852  * to be set at all.  This function should just boil down to a static inline
1853  * calling the protocol method.
1854  */
1855 int
1856 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1857     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1858 {
1859         int error;
1860
1861         CURVNET_SET(so->so_vnet);
1862         error = so->so_proto->pr_sosend(so, addr, uio,
1863             top, control, flags, td);
1864         CURVNET_RESTORE();
1865         return (error);
1866 }
1867
1868 /*
1869  * send(2), write(2) or aio_write(2) on a socket.
1870  */
1871 int
1872 sousrsend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1873     struct mbuf *control, int flags, struct proc *userproc)
1874 {
1875         struct thread *td;
1876         ssize_t len;
1877         int error;
1878
1879         td = uio->uio_td;
1880         len = uio->uio_resid;
1881         CURVNET_SET(so->so_vnet);
1882         error = so->so_proto->pr_sosend(so, addr, uio, NULL, control, flags,
1883             td);
1884         CURVNET_RESTORE();
1885         if (error != 0) {
1886                 /*
1887                  * Clear transient errors for stream protocols if they made
1888                  * some progress.  Make exclusion for aio(4) that would
1889                  * schedule a new write in case of EWOULDBLOCK and clear
1890                  * error itself.  See soaio_process_job().
1891                  */
1892                 if (uio->uio_resid != len &&
1893                     (so->so_proto->pr_flags & PR_ATOMIC) == 0 &&
1894                     userproc == NULL &&
1895                     (error == ERESTART || error == EINTR ||
1896                     error == EWOULDBLOCK))
1897                         error = 0;
1898                 /* Generation of SIGPIPE can be controlled per socket. */
1899                 if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0 &&
1900                     (flags & MSG_NOSIGNAL) == 0) {
1901                         if (userproc != NULL) {
1902                                 /* aio(4) job */
1903                                 PROC_LOCK(userproc);
1904                                 kern_psignal(userproc, SIGPIPE);
1905                                 PROC_UNLOCK(userproc);
1906                         } else {
1907                                 PROC_LOCK(td->td_proc);
1908                                 tdsignal(td, SIGPIPE);
1909                                 PROC_UNLOCK(td->td_proc);
1910                         }
1911                 }
1912         }
1913         return (error);
1914 }
1915
1916 /*
1917  * The part of soreceive() that implements reading non-inline out-of-band
1918  * data from a socket.  For more complete comments, see soreceive(), from
1919  * which this code originated.
1920  *
1921  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1922  * unable to return an mbuf chain to the caller.
1923  */
1924 static int
1925 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1926 {
1927         struct protosw *pr = so->so_proto;
1928         struct mbuf *m;
1929         int error;
1930
1931         KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1932         VNET_SO_ASSERT(so);
1933
1934         m = m_get(M_WAITOK, MT_DATA);
1935         error = pr->pr_rcvoob(so, m, flags & MSG_PEEK);
1936         if (error)
1937                 goto bad;
1938         do {
1939                 error = uiomove(mtod(m, void *),
1940                     (int) min(uio->uio_resid, m->m_len), uio);
1941                 m = m_free(m);
1942         } while (uio->uio_resid && error == 0 && m);
1943 bad:
1944         if (m != NULL)
1945                 m_freem(m);
1946         return (error);
1947 }
1948
1949 /*
1950  * Following replacement or removal of the first mbuf on the first mbuf chain
1951  * of a socket buffer, push necessary state changes back into the socket
1952  * buffer so that other consumers see the values consistently.  'nextrecord'
1953  * is the callers locally stored value of the original value of
1954  * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1955  * NOTE: 'nextrecord' may be NULL.
1956  */
1957 static __inline void
1958 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1959 {
1960
1961         SOCKBUF_LOCK_ASSERT(sb);
1962         /*
1963          * First, update for the new value of nextrecord.  If necessary, make
1964          * it the first record.
1965          */
1966         if (sb->sb_mb != NULL)
1967                 sb->sb_mb->m_nextpkt = nextrecord;
1968         else
1969                 sb->sb_mb = nextrecord;
1970
1971         /*
1972          * Now update any dependent socket buffer fields to reflect the new
1973          * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1974          * addition of a second clause that takes care of the case where
1975          * sb_mb has been updated, but remains the last record.
1976          */
1977         if (sb->sb_mb == NULL) {
1978                 sb->sb_mbtail = NULL;
1979                 sb->sb_lastrecord = NULL;
1980         } else if (sb->sb_mb->m_nextpkt == NULL)
1981                 sb->sb_lastrecord = sb->sb_mb;
1982 }
1983
1984 /*
1985  * Implement receive operations on a socket.  We depend on the way that
1986  * records are added to the sockbuf by sbappend.  In particular, each record
1987  * (mbufs linked through m_next) must begin with an address if the protocol
1988  * so specifies, followed by an optional mbuf or mbufs containing ancillary
1989  * data, and then zero or more mbufs of data.  In order to allow parallelism
1990  * between network receive and copying to user space, as well as avoid
1991  * sleeping with a mutex held, we release the socket buffer mutex during the
1992  * user space copy.  Although the sockbuf is locked, new data may still be
1993  * appended, and thus we must maintain consistency of the sockbuf during that
1994  * time.
1995  *
1996  * The caller may receive the data as a single mbuf chain by supplying an
1997  * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1998  * the count in uio_resid.
1999  */
2000 int
2001 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
2002     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2003 {
2004         struct mbuf *m, **mp;
2005         int flags, error, offset;
2006         ssize_t len;
2007         struct protosw *pr = so->so_proto;
2008         struct mbuf *nextrecord;
2009         int moff, type = 0;
2010         ssize_t orig_resid = uio->uio_resid;
2011         bool report_real_len = false;
2012
2013         mp = mp0;
2014         if (psa != NULL)
2015                 *psa = NULL;
2016         if (controlp != NULL)
2017                 *controlp = NULL;
2018         if (flagsp != NULL) {
2019                 report_real_len = *flagsp & MSG_TRUNC;
2020                 *flagsp &= ~MSG_TRUNC;
2021                 flags = *flagsp &~ MSG_EOR;
2022         } else
2023                 flags = 0;
2024         if (flags & MSG_OOB)
2025                 return (soreceive_rcvoob(so, uio, flags));
2026         if (mp != NULL)
2027                 *mp = NULL;
2028         if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
2029             && uio->uio_resid) {
2030                 VNET_SO_ASSERT(so);
2031                 pr->pr_rcvd(so, 0);
2032         }
2033
2034         error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
2035         if (error)
2036                 return (error);
2037
2038 restart:
2039         SOCKBUF_LOCK(&so->so_rcv);
2040         m = so->so_rcv.sb_mb;
2041         /*
2042          * If we have less data than requested, block awaiting more (subject
2043          * to any timeout) if:
2044          *   1. the current count is less than the low water mark, or
2045          *   2. MSG_DONTWAIT is not set
2046          */
2047         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
2048             sbavail(&so->so_rcv) < uio->uio_resid) &&
2049             sbavail(&so->so_rcv) < so->so_rcv.sb_lowat &&
2050             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
2051                 KASSERT(m != NULL || !sbavail(&so->so_rcv),
2052                     ("receive: m == %p sbavail == %u",
2053                     m, sbavail(&so->so_rcv)));
2054                 if (so->so_error || so->so_rerror) {
2055                         if (m != NULL)
2056                                 goto dontblock;
2057                         if (so->so_error)
2058                                 error = so->so_error;
2059                         else
2060                                 error = so->so_rerror;
2061                         if ((flags & MSG_PEEK) == 0) {
2062                                 if (so->so_error)
2063                                         so->so_error = 0;
2064                                 else
2065                                         so->so_rerror = 0;
2066                         }
2067                         SOCKBUF_UNLOCK(&so->so_rcv);
2068                         goto release;
2069                 }
2070                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2071                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2072                         if (m != NULL)
2073                                 goto dontblock;
2074 #ifdef KERN_TLS
2075                         else if (so->so_rcv.sb_tlsdcc == 0 &&
2076                             so->so_rcv.sb_tlscc == 0) {
2077 #else
2078                         else {
2079 #endif
2080                                 SOCKBUF_UNLOCK(&so->so_rcv);
2081                                 goto release;
2082                         }
2083                 }
2084                 for (; m != NULL; m = m->m_next)
2085                         if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
2086                                 m = so->so_rcv.sb_mb;
2087                                 goto dontblock;
2088                         }
2089                 if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED |
2090                     SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 &&
2091                     (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2092                         SOCKBUF_UNLOCK(&so->so_rcv);
2093                         error = ENOTCONN;
2094                         goto release;
2095                 }
2096                 if (uio->uio_resid == 0 && !report_real_len) {
2097                         SOCKBUF_UNLOCK(&so->so_rcv);
2098                         goto release;
2099                 }
2100                 if ((so->so_state & SS_NBIO) ||
2101                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2102                         SOCKBUF_UNLOCK(&so->so_rcv);
2103                         error = EWOULDBLOCK;
2104                         goto release;
2105                 }
2106                 SBLASTRECORDCHK(&so->so_rcv);
2107                 SBLASTMBUFCHK(&so->so_rcv);
2108                 error = sbwait(so, SO_RCV);
2109                 SOCKBUF_UNLOCK(&so->so_rcv);
2110                 if (error)
2111                         goto release;
2112                 goto restart;
2113         }
2114 dontblock:
2115         /*
2116          * From this point onward, we maintain 'nextrecord' as a cache of the
2117          * pointer to the next record in the socket buffer.  We must keep the
2118          * various socket buffer pointers and local stack versions of the
2119          * pointers in sync, pushing out modifications before dropping the
2120          * socket buffer mutex, and re-reading them when picking it up.
2121          *
2122          * Otherwise, we will race with the network stack appending new data
2123          * or records onto the socket buffer by using inconsistent/stale
2124          * versions of the field, possibly resulting in socket buffer
2125          * corruption.
2126          *
2127          * By holding the high-level sblock(), we prevent simultaneous
2128          * readers from pulling off the front of the socket buffer.
2129          */
2130         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2131         if (uio->uio_td)
2132                 uio->uio_td->td_ru.ru_msgrcv++;
2133         KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
2134         SBLASTRECORDCHK(&so->so_rcv);
2135         SBLASTMBUFCHK(&so->so_rcv);
2136         nextrecord = m->m_nextpkt;
2137         if (pr->pr_flags & PR_ADDR) {
2138                 KASSERT(m->m_type == MT_SONAME,
2139                     ("m->m_type == %d", m->m_type));
2140                 orig_resid = 0;
2141                 if (psa != NULL)
2142                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
2143                             M_NOWAIT);
2144                 if (flags & MSG_PEEK) {
2145                         m = m->m_next;
2146                 } else {
2147                         sbfree(&so->so_rcv, m);
2148                         so->so_rcv.sb_mb = m_free(m);
2149                         m = so->so_rcv.sb_mb;
2150                         sockbuf_pushsync(&so->so_rcv, nextrecord);
2151                 }
2152         }
2153
2154         /*
2155          * Process one or more MT_CONTROL mbufs present before any data mbufs
2156          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
2157          * just copy the data; if !MSG_PEEK, we call into the protocol to
2158          * perform externalization (or freeing if controlp == NULL).
2159          */
2160         if (m != NULL && m->m_type == MT_CONTROL) {
2161                 struct mbuf *cm = NULL, *cmn;
2162                 struct mbuf **cme = &cm;
2163 #ifdef KERN_TLS
2164                 struct cmsghdr *cmsg;
2165                 struct tls_get_record tgr;
2166
2167                 /*
2168                  * For MSG_TLSAPPDATA, check for an alert record.
2169                  * If found, return ENXIO without removing
2170                  * it from the receive queue.  This allows a subsequent
2171                  * call without MSG_TLSAPPDATA to receive it.
2172                  * Note that, for TLS, there should only be a single
2173                  * control mbuf with the TLS_GET_RECORD message in it.
2174                  */
2175                 if (flags & MSG_TLSAPPDATA) {
2176                         cmsg = mtod(m, struct cmsghdr *);
2177                         if (cmsg->cmsg_type == TLS_GET_RECORD &&
2178                             cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) {
2179                                 memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr));
2180                                 if (__predict_false(tgr.tls_type ==
2181                                     TLS_RLTYPE_ALERT)) {
2182                                         SOCKBUF_UNLOCK(&so->so_rcv);
2183                                         error = ENXIO;
2184                                         goto release;
2185                                 }
2186                         }
2187                 }
2188 #endif
2189
2190                 do {
2191                         if (flags & MSG_PEEK) {
2192                                 if (controlp != NULL) {
2193                                         *controlp = m_copym(m, 0, m->m_len,
2194                                             M_NOWAIT);
2195                                         controlp = &(*controlp)->m_next;
2196                                 }
2197                                 m = m->m_next;
2198                         } else {
2199                                 sbfree(&so->so_rcv, m);
2200                                 so->so_rcv.sb_mb = m->m_next;
2201                                 m->m_next = NULL;
2202                                 *cme = m;
2203                                 cme = &(*cme)->m_next;
2204                                 m = so->so_rcv.sb_mb;
2205                         }
2206                 } while (m != NULL && m->m_type == MT_CONTROL);
2207                 if ((flags & MSG_PEEK) == 0)
2208                         sockbuf_pushsync(&so->so_rcv, nextrecord);
2209                 while (cm != NULL) {
2210                         cmn = cm->m_next;
2211                         cm->m_next = NULL;
2212                         if (pr->pr_domain->dom_externalize != NULL) {
2213                                 SOCKBUF_UNLOCK(&so->so_rcv);
2214                                 VNET_SO_ASSERT(so);
2215                                 error = (*pr->pr_domain->dom_externalize)
2216                                     (cm, controlp, flags);
2217                                 SOCKBUF_LOCK(&so->so_rcv);
2218                         } else if (controlp != NULL)
2219                                 *controlp = cm;
2220                         else
2221                                 m_freem(cm);
2222                         if (controlp != NULL) {
2223                                 while (*controlp != NULL)
2224                                         controlp = &(*controlp)->m_next;
2225                         }
2226                         cm = cmn;
2227                 }
2228                 if (m != NULL)
2229                         nextrecord = so->so_rcv.sb_mb->m_nextpkt;
2230                 else
2231                         nextrecord = so->so_rcv.sb_mb;
2232                 orig_resid = 0;
2233         }
2234         if (m != NULL) {
2235                 if ((flags & MSG_PEEK) == 0) {
2236                         KASSERT(m->m_nextpkt == nextrecord,
2237                             ("soreceive: post-control, nextrecord !sync"));
2238                         if (nextrecord == NULL) {
2239                                 KASSERT(so->so_rcv.sb_mb == m,
2240                                     ("soreceive: post-control, sb_mb!=m"));
2241                                 KASSERT(so->so_rcv.sb_lastrecord == m,
2242                                     ("soreceive: post-control, lastrecord!=m"));
2243                         }
2244                 }
2245                 type = m->m_type;
2246                 if (type == MT_OOBDATA)
2247                         flags |= MSG_OOB;
2248         } else {
2249                 if ((flags & MSG_PEEK) == 0) {
2250                         KASSERT(so->so_rcv.sb_mb == nextrecord,
2251                             ("soreceive: sb_mb != nextrecord"));
2252                         if (so->so_rcv.sb_mb == NULL) {
2253                                 KASSERT(so->so_rcv.sb_lastrecord == NULL,
2254                                     ("soreceive: sb_lastercord != NULL"));
2255                         }
2256                 }
2257         }
2258         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2259         SBLASTRECORDCHK(&so->so_rcv);
2260         SBLASTMBUFCHK(&so->so_rcv);
2261
2262         /*
2263          * Now continue to read any data mbufs off of the head of the socket
2264          * buffer until the read request is satisfied.  Note that 'type' is
2265          * used to store the type of any mbuf reads that have happened so far
2266          * such that soreceive() can stop reading if the type changes, which
2267          * causes soreceive() to return only one of regular data and inline
2268          * out-of-band data in a single socket receive operation.
2269          */
2270         moff = 0;
2271         offset = 0;
2272         while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0
2273             && error == 0) {
2274                 /*
2275                  * If the type of mbuf has changed since the last mbuf
2276                  * examined ('type'), end the receive operation.
2277                  */
2278                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2279                 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
2280                         if (type != m->m_type)
2281                                 break;
2282                 } else if (type == MT_OOBDATA)
2283                         break;
2284                 else
2285                     KASSERT(m->m_type == MT_DATA,
2286                         ("m->m_type == %d", m->m_type));
2287                 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
2288                 len = uio->uio_resid;
2289                 if (so->so_oobmark && len > so->so_oobmark - offset)
2290                         len = so->so_oobmark - offset;
2291                 if (len > m->m_len - moff)
2292                         len = m->m_len - moff;
2293                 /*
2294                  * If mp is set, just pass back the mbufs.  Otherwise copy
2295                  * them out via the uio, then free.  Sockbuf must be
2296                  * consistent here (points to current mbuf, it points to next
2297                  * record) when we drop priority; we must note any additions
2298                  * to the sockbuf when we block interrupts again.
2299                  */
2300                 if (mp == NULL) {
2301                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2302                         SBLASTRECORDCHK(&so->so_rcv);
2303                         SBLASTMBUFCHK(&so->so_rcv);
2304                         SOCKBUF_UNLOCK(&so->so_rcv);
2305                         if ((m->m_flags & M_EXTPG) != 0)
2306                                 error = m_unmapped_uiomove(m, moff, uio,
2307                                     (int)len);
2308                         else
2309                                 error = uiomove(mtod(m, char *) + moff,
2310                                     (int)len, uio);
2311                         SOCKBUF_LOCK(&so->so_rcv);
2312                         if (error) {
2313                                 /*
2314                                  * The MT_SONAME mbuf has already been removed
2315                                  * from the record, so it is necessary to
2316                                  * remove the data mbufs, if any, to preserve
2317                                  * the invariant in the case of PR_ADDR that
2318                                  * requires MT_SONAME mbufs at the head of
2319                                  * each record.
2320                                  */
2321                                 if (pr->pr_flags & PR_ATOMIC &&
2322                                     ((flags & MSG_PEEK) == 0))
2323                                         (void)sbdroprecord_locked(&so->so_rcv);
2324                                 SOCKBUF_UNLOCK(&so->so_rcv);
2325                                 goto release;
2326                         }
2327                 } else
2328                         uio->uio_resid -= len;
2329                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2330                 if (len == m->m_len - moff) {
2331                         if (m->m_flags & M_EOR)
2332                                 flags |= MSG_EOR;
2333                         if (flags & MSG_PEEK) {
2334                                 m = m->m_next;
2335                                 moff = 0;
2336                         } else {
2337                                 nextrecord = m->m_nextpkt;
2338                                 sbfree(&so->so_rcv, m);
2339                                 if (mp != NULL) {
2340                                         m->m_nextpkt = NULL;
2341                                         *mp = m;
2342                                         mp = &m->m_next;
2343                                         so->so_rcv.sb_mb = m = m->m_next;
2344                                         *mp = NULL;
2345                                 } else {
2346                                         so->so_rcv.sb_mb = m_free(m);
2347                                         m = so->so_rcv.sb_mb;
2348                                 }
2349                                 sockbuf_pushsync(&so->so_rcv, nextrecord);
2350                                 SBLASTRECORDCHK(&so->so_rcv);
2351                                 SBLASTMBUFCHK(&so->so_rcv);
2352                         }
2353                 } else {
2354                         if (flags & MSG_PEEK)
2355                                 moff += len;
2356                         else {
2357                                 if (mp != NULL) {
2358                                         if (flags & MSG_DONTWAIT) {
2359                                                 *mp = m_copym(m, 0, len,
2360                                                     M_NOWAIT);
2361                                                 if (*mp == NULL) {
2362                                                         /*
2363                                                          * m_copym() couldn't
2364                                                          * allocate an mbuf.
2365                                                          * Adjust uio_resid back
2366                                                          * (it was adjusted
2367                                                          * down by len bytes,
2368                                                          * which we didn't end
2369                                                          * up "copying" over).
2370                                                          */
2371                                                         uio->uio_resid += len;
2372                                                         break;
2373                                                 }
2374                                         } else {
2375                                                 SOCKBUF_UNLOCK(&so->so_rcv);
2376                                                 *mp = m_copym(m, 0, len,
2377                                                     M_WAITOK);
2378                                                 SOCKBUF_LOCK(&so->so_rcv);
2379                                         }
2380                                 }
2381                                 sbcut_locked(&so->so_rcv, len);
2382                         }
2383                 }
2384                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2385                 if (so->so_oobmark) {
2386                         if ((flags & MSG_PEEK) == 0) {
2387                                 so->so_oobmark -= len;
2388                                 if (so->so_oobmark == 0) {
2389                                         so->so_rcv.sb_state |= SBS_RCVATMARK;
2390                                         break;
2391                                 }
2392                         } else {
2393                                 offset += len;
2394                                 if (offset == so->so_oobmark)
2395                                         break;
2396                         }
2397                 }
2398                 if (flags & MSG_EOR)
2399                         break;
2400                 /*
2401                  * If the MSG_WAITALL flag is set (for non-atomic socket), we
2402                  * must not quit until "uio->uio_resid == 0" or an error
2403                  * termination.  If a signal/timeout occurs, return with a
2404                  * short count but without error.  Keep sockbuf locked
2405                  * against other readers.
2406                  */
2407                 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
2408                     !sosendallatonce(so) && nextrecord == NULL) {
2409                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2410                         if (so->so_error || so->so_rerror ||
2411                             so->so_rcv.sb_state & SBS_CANTRCVMORE)
2412                                 break;
2413                         /*
2414                          * Notify the protocol that some data has been
2415                          * drained before blocking.
2416                          */
2417                         if (pr->pr_flags & PR_WANTRCVD) {
2418                                 SOCKBUF_UNLOCK(&so->so_rcv);
2419                                 VNET_SO_ASSERT(so);
2420                                 pr->pr_rcvd(so, flags);
2421                                 SOCKBUF_LOCK(&so->so_rcv);
2422                         }
2423                         SBLASTRECORDCHK(&so->so_rcv);
2424                         SBLASTMBUFCHK(&so->so_rcv);
2425                         /*
2426                          * We could receive some data while was notifying
2427                          * the protocol. Skip blocking in this case.
2428                          */
2429                         if (so->so_rcv.sb_mb == NULL) {
2430                                 error = sbwait(so, SO_RCV);
2431                                 if (error) {
2432                                         SOCKBUF_UNLOCK(&so->so_rcv);
2433                                         goto release;
2434                                 }
2435                         }
2436                         m = so->so_rcv.sb_mb;
2437                         if (m != NULL)
2438                                 nextrecord = m->m_nextpkt;
2439                 }
2440         }
2441
2442         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2443         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
2444                 if (report_real_len)
2445                         uio->uio_resid -= m_length(m, NULL) - moff;
2446                 flags |= MSG_TRUNC;
2447                 if ((flags & MSG_PEEK) == 0)
2448                         (void) sbdroprecord_locked(&so->so_rcv);
2449         }
2450         if ((flags & MSG_PEEK) == 0) {
2451                 if (m == NULL) {
2452                         /*
2453                          * First part is an inline SB_EMPTY_FIXUP().  Second
2454                          * part makes sure sb_lastrecord is up-to-date if
2455                          * there is still data in the socket buffer.
2456                          */
2457                         so->so_rcv.sb_mb = nextrecord;
2458                         if (so->so_rcv.sb_mb == NULL) {
2459                                 so->so_rcv.sb_mbtail = NULL;
2460                                 so->so_rcv.sb_lastrecord = NULL;
2461                         } else if (nextrecord->m_nextpkt == NULL)
2462                                 so->so_rcv.sb_lastrecord = nextrecord;
2463                 }
2464                 SBLASTRECORDCHK(&so->so_rcv);
2465                 SBLASTMBUFCHK(&so->so_rcv);
2466                 /*
2467                  * If soreceive() is being done from the socket callback,
2468                  * then don't need to generate ACK to peer to update window,
2469                  * since ACK will be generated on return to TCP.
2470                  */
2471                 if (!(flags & MSG_SOCALLBCK) &&
2472                     (pr->pr_flags & PR_WANTRCVD)) {
2473                         SOCKBUF_UNLOCK(&so->so_rcv);
2474                         VNET_SO_ASSERT(so);
2475                         pr->pr_rcvd(so, flags);
2476                         SOCKBUF_LOCK(&so->so_rcv);
2477                 }
2478         }
2479         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2480         if (orig_resid == uio->uio_resid && orig_resid &&
2481             (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
2482                 SOCKBUF_UNLOCK(&so->so_rcv);
2483                 goto restart;
2484         }
2485         SOCKBUF_UNLOCK(&so->so_rcv);
2486
2487         if (flagsp != NULL)
2488                 *flagsp |= flags;
2489 release:
2490         SOCK_IO_RECV_UNLOCK(so);
2491         return (error);
2492 }
2493
2494 /*
2495  * Optimized version of soreceive() for stream (TCP) sockets.
2496  */
2497 int
2498 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
2499     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2500 {
2501         int len = 0, error = 0, flags, oresid;
2502         struct sockbuf *sb;
2503         struct mbuf *m, *n = NULL;
2504
2505         /* We only do stream sockets. */
2506         if (so->so_type != SOCK_STREAM)
2507                 return (EINVAL);
2508         if (psa != NULL)
2509                 *psa = NULL;
2510         if (flagsp != NULL)
2511                 flags = *flagsp &~ MSG_EOR;
2512         else
2513                 flags = 0;
2514         if (controlp != NULL)
2515                 *controlp = NULL;
2516         if (flags & MSG_OOB)
2517                 return (soreceive_rcvoob(so, uio, flags));
2518         if (mp0 != NULL)
2519                 *mp0 = NULL;
2520
2521         sb = &so->so_rcv;
2522
2523 #ifdef KERN_TLS
2524         /*
2525          * KTLS store TLS records as records with a control message to
2526          * describe the framing.
2527          *
2528          * We check once here before acquiring locks to optimize the
2529          * common case.
2530          */
2531         if (sb->sb_tls_info != NULL)
2532                 return (soreceive_generic(so, psa, uio, mp0, controlp,
2533                     flagsp));
2534 #endif
2535
2536         /* Prevent other readers from entering the socket. */
2537         error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
2538         if (error)
2539                 return (error);
2540         SOCKBUF_LOCK(sb);
2541
2542 #ifdef KERN_TLS
2543         if (sb->sb_tls_info != NULL) {
2544                 SOCKBUF_UNLOCK(sb);
2545                 SOCK_IO_RECV_UNLOCK(so);
2546                 return (soreceive_generic(so, psa, uio, mp0, controlp,
2547                     flagsp));
2548         }
2549 #endif
2550
2551         /* Easy one, no space to copyout anything. */
2552         if (uio->uio_resid == 0) {
2553                 error = EINVAL;
2554                 goto out;
2555         }
2556         oresid = uio->uio_resid;
2557
2558         /* We will never ever get anything unless we are or were connected. */
2559         if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
2560                 error = ENOTCONN;
2561                 goto out;
2562         }
2563
2564 restart:
2565         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2566
2567         /* Abort if socket has reported problems. */
2568         if (so->so_error) {
2569                 if (sbavail(sb) > 0)
2570                         goto deliver;
2571                 if (oresid > uio->uio_resid)
2572                         goto out;
2573                 error = so->so_error;
2574                 if (!(flags & MSG_PEEK))
2575                         so->so_error = 0;
2576                 goto out;
2577         }
2578
2579         /* Door is closed.  Deliver what is left, if any. */
2580         if (sb->sb_state & SBS_CANTRCVMORE) {
2581                 if (sbavail(sb) > 0)
2582                         goto deliver;
2583                 else
2584                         goto out;
2585         }
2586
2587         /* Socket buffer is empty and we shall not block. */
2588         if (sbavail(sb) == 0 &&
2589             ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
2590                 error = EAGAIN;
2591                 goto out;
2592         }
2593
2594         /* Socket buffer got some data that we shall deliver now. */
2595         if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) &&
2596             ((so->so_state & SS_NBIO) ||
2597              (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
2598              sbavail(sb) >= sb->sb_lowat ||
2599              sbavail(sb) >= uio->uio_resid ||
2600              sbavail(sb) >= sb->sb_hiwat) ) {
2601                 goto deliver;
2602         }
2603
2604         /* On MSG_WAITALL we must wait until all data or error arrives. */
2605         if ((flags & MSG_WAITALL) &&
2606             (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat))
2607                 goto deliver;
2608
2609         /*
2610          * Wait and block until (more) data comes in.
2611          * NB: Drops the sockbuf lock during wait.
2612          */
2613         error = sbwait(so, SO_RCV);
2614         if (error)
2615                 goto out;
2616         goto restart;
2617
2618 deliver:
2619         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2620         KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__));
2621         KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
2622
2623         /* Statistics. */
2624         if (uio->uio_td)
2625                 uio->uio_td->td_ru.ru_msgrcv++;
2626
2627         /* Fill uio until full or current end of socket buffer is reached. */
2628         len = min(uio->uio_resid, sbavail(sb));
2629         if (mp0 != NULL) {
2630                 /* Dequeue as many mbufs as possible. */
2631                 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
2632                         if (*mp0 == NULL)
2633                                 *mp0 = sb->sb_mb;
2634                         else
2635                                 m_cat(*mp0, sb->sb_mb);
2636                         for (m = sb->sb_mb;
2637                              m != NULL && m->m_len <= len;
2638                              m = m->m_next) {
2639                                 KASSERT(!(m->m_flags & M_NOTAVAIL),
2640                                     ("%s: m %p not available", __func__, m));
2641                                 len -= m->m_len;
2642                                 uio->uio_resid -= m->m_len;
2643                                 sbfree(sb, m);
2644                                 n = m;
2645                         }
2646                         n->m_next = NULL;
2647                         sb->sb_mb = m;
2648                         sb->sb_lastrecord = sb->sb_mb;
2649                         if (sb->sb_mb == NULL)
2650                                 SB_EMPTY_FIXUP(sb);
2651                 }
2652                 /* Copy the remainder. */
2653                 if (len > 0) {
2654                         KASSERT(sb->sb_mb != NULL,
2655                             ("%s: len > 0 && sb->sb_mb empty", __func__));
2656
2657                         m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
2658                         if (m == NULL)
2659                                 len = 0;        /* Don't flush data from sockbuf. */
2660                         else
2661                                 uio->uio_resid -= len;
2662                         if (*mp0 != NULL)
2663                                 m_cat(*mp0, m);
2664                         else
2665                                 *mp0 = m;
2666                         if (*mp0 == NULL) {
2667                                 error = ENOBUFS;
2668                                 goto out;
2669                         }
2670                 }
2671         } else {
2672                 /* NB: Must unlock socket buffer as uiomove may sleep. */
2673                 SOCKBUF_UNLOCK(sb);
2674                 error = m_mbuftouio(uio, sb->sb_mb, len);
2675                 SOCKBUF_LOCK(sb);
2676                 if (error)
2677                         goto out;
2678         }
2679         SBLASTRECORDCHK(sb);
2680         SBLASTMBUFCHK(sb);
2681
2682         /*
2683          * Remove the delivered data from the socket buffer unless we
2684          * were only peeking.
2685          */
2686         if (!(flags & MSG_PEEK)) {
2687                 if (len > 0)
2688                         sbdrop_locked(sb, len);
2689
2690                 /* Notify protocol that we drained some data. */
2691                 if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2692                     (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2693                      !(flags & MSG_SOCALLBCK))) {
2694                         SOCKBUF_UNLOCK(sb);
2695                         VNET_SO_ASSERT(so);
2696                         so->so_proto->pr_rcvd(so, flags);
2697                         SOCKBUF_LOCK(sb);
2698                 }
2699         }
2700
2701         /*
2702          * For MSG_WAITALL we may have to loop again and wait for
2703          * more data to come in.
2704          */
2705         if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2706                 goto restart;
2707 out:
2708         SBLASTRECORDCHK(sb);
2709         SBLASTMBUFCHK(sb);
2710         SOCKBUF_UNLOCK(sb);
2711         SOCK_IO_RECV_UNLOCK(so);
2712         return (error);
2713 }
2714
2715 /*
2716  * Optimized version of soreceive() for simple datagram cases from userspace.
2717  * Unlike in the stream case, we're able to drop a datagram if copyout()
2718  * fails, and because we handle datagrams atomically, we don't need to use a
2719  * sleep lock to prevent I/O interlacing.
2720  */
2721 int
2722 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2723     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2724 {
2725         struct mbuf *m, *m2;
2726         int flags, error;
2727         ssize_t len;
2728         struct protosw *pr = so->so_proto;
2729         struct mbuf *nextrecord;
2730
2731         if (psa != NULL)
2732                 *psa = NULL;
2733         if (controlp != NULL)
2734                 *controlp = NULL;
2735         if (flagsp != NULL)
2736                 flags = *flagsp &~ MSG_EOR;
2737         else
2738                 flags = 0;
2739
2740         /*
2741          * For any complicated cases, fall back to the full
2742          * soreceive_generic().
2743          */
2744         if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC)))
2745                 return (soreceive_generic(so, psa, uio, mp0, controlp,
2746                     flagsp));
2747
2748         /*
2749          * Enforce restrictions on use.
2750          */
2751         KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2752             ("soreceive_dgram: wantrcvd"));
2753         KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2754         KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2755             ("soreceive_dgram: SBS_RCVATMARK"));
2756         KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
2757             ("soreceive_dgram: P_CONNREQUIRED"));
2758
2759         /*
2760          * Loop blocking while waiting for a datagram.
2761          */
2762         SOCKBUF_LOCK(&so->so_rcv);
2763         while ((m = so->so_rcv.sb_mb) == NULL) {
2764                 KASSERT(sbavail(&so->so_rcv) == 0,
2765                     ("soreceive_dgram: sb_mb NULL but sbavail %u",
2766                     sbavail(&so->so_rcv)));
2767                 if (so->so_error) {
2768                         error = so->so_error;
2769                         so->so_error = 0;
2770                         SOCKBUF_UNLOCK(&so->so_rcv);
2771                         return (error);
2772                 }
2773                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2774                     uio->uio_resid == 0) {
2775                         SOCKBUF_UNLOCK(&so->so_rcv);
2776                         return (0);
2777                 }
2778                 if ((so->so_state & SS_NBIO) ||
2779                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2780                         SOCKBUF_UNLOCK(&so->so_rcv);
2781                         return (EWOULDBLOCK);
2782                 }
2783                 SBLASTRECORDCHK(&so->so_rcv);
2784                 SBLASTMBUFCHK(&so->so_rcv);
2785                 error = sbwait(so, SO_RCV);
2786                 if (error) {
2787                         SOCKBUF_UNLOCK(&so->so_rcv);
2788                         return (error);
2789                 }
2790         }
2791         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2792
2793         if (uio->uio_td)
2794                 uio->uio_td->td_ru.ru_msgrcv++;
2795         SBLASTRECORDCHK(&so->so_rcv);
2796         SBLASTMBUFCHK(&so->so_rcv);
2797         nextrecord = m->m_nextpkt;
2798         if (nextrecord == NULL) {
2799                 KASSERT(so->so_rcv.sb_lastrecord == m,
2800                     ("soreceive_dgram: lastrecord != m"));
2801         }
2802
2803         KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2804             ("soreceive_dgram: m_nextpkt != nextrecord"));
2805
2806         /*
2807          * Pull 'm' and its chain off the front of the packet queue.
2808          */
2809         so->so_rcv.sb_mb = NULL;
2810         sockbuf_pushsync(&so->so_rcv, nextrecord);
2811
2812         /*
2813          * Walk 'm's chain and free that many bytes from the socket buffer.
2814          */
2815         for (m2 = m; m2 != NULL; m2 = m2->m_next)
2816                 sbfree(&so->so_rcv, m2);
2817
2818         /*
2819          * Do a few last checks before we let go of the lock.
2820          */
2821         SBLASTRECORDCHK(&so->so_rcv);
2822         SBLASTMBUFCHK(&so->so_rcv);
2823         SOCKBUF_UNLOCK(&so->so_rcv);
2824
2825         if (pr->pr_flags & PR_ADDR) {
2826                 KASSERT(m->m_type == MT_SONAME,
2827                     ("m->m_type == %d", m->m_type));
2828                 if (psa != NULL)
2829                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
2830                             M_NOWAIT);
2831                 m = m_free(m);
2832         }
2833         if (m == NULL) {
2834                 /* XXXRW: Can this happen? */
2835                 return (0);
2836         }
2837
2838         /*
2839          * Packet to copyout() is now in 'm' and it is disconnected from the
2840          * queue.
2841          *
2842          * Process one or more MT_CONTROL mbufs present before any data mbufs
2843          * in the first mbuf chain on the socket buffer.  We call into the
2844          * protocol to perform externalization (or freeing if controlp ==
2845          * NULL). In some cases there can be only MT_CONTROL mbufs without
2846          * MT_DATA mbufs.
2847          */
2848         if (m->m_type == MT_CONTROL) {
2849                 struct mbuf *cm = NULL, *cmn;
2850                 struct mbuf **cme = &cm;
2851
2852                 do {
2853                         m2 = m->m_next;
2854                         m->m_next = NULL;
2855                         *cme = m;
2856                         cme = &(*cme)->m_next;
2857                         m = m2;
2858                 } while (m != NULL && m->m_type == MT_CONTROL);
2859                 while (cm != NULL) {
2860                         cmn = cm->m_next;
2861                         cm->m_next = NULL;
2862                         if (pr->pr_domain->dom_externalize != NULL) {
2863                                 error = (*pr->pr_domain->dom_externalize)
2864                                     (cm, controlp, flags);
2865                         } else if (controlp != NULL)
2866                                 *controlp = cm;
2867                         else
2868                                 m_freem(cm);
2869                         if (controlp != NULL) {
2870                                 while (*controlp != NULL)
2871                                         controlp = &(*controlp)->m_next;
2872                         }
2873                         cm = cmn;
2874                 }
2875         }
2876         KASSERT(m == NULL || m->m_type == MT_DATA,
2877             ("soreceive_dgram: !data"));
2878         while (m != NULL && uio->uio_resid > 0) {
2879                 len = uio->uio_resid;
2880                 if (len > m->m_len)
2881                         len = m->m_len;
2882                 error = uiomove(mtod(m, char *), (int)len, uio);
2883                 if (error) {
2884                         m_freem(m);
2885                         return (error);
2886                 }
2887                 if (len == m->m_len)
2888                         m = m_free(m);
2889                 else {
2890                         m->m_data += len;
2891                         m->m_len -= len;
2892                 }
2893         }
2894         if (m != NULL) {
2895                 flags |= MSG_TRUNC;
2896                 m_freem(m);
2897         }
2898         if (flagsp != NULL)
2899                 *flagsp |= flags;
2900         return (0);
2901 }
2902
2903 int
2904 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2905     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2906 {
2907         int error;
2908
2909         CURVNET_SET(so->so_vnet);
2910         error = so->so_proto->pr_soreceive(so, psa, uio, mp0, controlp, flagsp);
2911         CURVNET_RESTORE();
2912         return (error);
2913 }
2914
2915 int
2916 soshutdown(struct socket *so, int how)
2917 {
2918         struct protosw *pr;
2919         int error, soerror_enotconn;
2920
2921         if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2922                 return (EINVAL);
2923
2924         soerror_enotconn = 0;
2925         SOCK_LOCK(so);
2926         if ((so->so_state &
2927             (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
2928                 /*
2929                  * POSIX mandates us to return ENOTCONN when shutdown(2) is
2930                  * invoked on a datagram sockets, however historically we would
2931                  * actually tear socket down. This is known to be leveraged by
2932                  * some applications to unblock process waiting in recvXXX(2)
2933                  * by other process that it shares that socket with. Try to meet
2934                  * both backward-compatibility and POSIX requirements by forcing
2935                  * ENOTCONN but still asking protocol to perform pru_shutdown().
2936                  */
2937                 if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) {
2938                         SOCK_UNLOCK(so);
2939                         return (ENOTCONN);
2940                 }
2941                 soerror_enotconn = 1;
2942         }
2943
2944         if (SOLISTENING(so)) {
2945                 if (how != SHUT_WR) {
2946                         so->so_error = ECONNABORTED;
2947                         solisten_wakeup(so);    /* unlocks so */
2948                 } else {
2949                         SOCK_UNLOCK(so);
2950                 }
2951                 goto done;
2952         }
2953         SOCK_UNLOCK(so);
2954
2955         CURVNET_SET(so->so_vnet);
2956         pr = so->so_proto;
2957         if (pr->pr_flush != NULL)
2958                 pr->pr_flush(so, how);
2959         if (how != SHUT_WR)
2960                 sorflush(so);
2961         if (how != SHUT_RD) {
2962                 error = pr->pr_shutdown(so);
2963                 wakeup(&so->so_timeo);
2964                 CURVNET_RESTORE();
2965                 return ((error == 0 && soerror_enotconn) ? ENOTCONN : error);
2966         }
2967         wakeup(&so->so_timeo);
2968         CURVNET_RESTORE();
2969
2970 done:
2971         return (soerror_enotconn ? ENOTCONN : 0);
2972 }
2973
2974 void
2975 sorflush(struct socket *so)
2976 {
2977         struct protosw *pr;
2978         int error;
2979
2980         VNET_SO_ASSERT(so);
2981
2982         /*
2983          * Dislodge threads currently blocked in receive and wait to acquire
2984          * a lock against other simultaneous readers before clearing the
2985          * socket buffer.  Don't let our acquire be interrupted by a signal
2986          * despite any existing socket disposition on interruptable waiting.
2987          */
2988         socantrcvmore(so);
2989
2990         error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR);
2991         if (error != 0) {
2992                 KASSERT(SOLISTENING(so),
2993                     ("%s: soiolock(%p) failed", __func__, so));
2994                 return;
2995         }
2996
2997         pr = so->so_proto;
2998         if (pr->pr_flags & PR_RIGHTS) {
2999                 MPASS(pr->pr_domain->dom_dispose != NULL);
3000                 (*pr->pr_domain->dom_dispose)(so);
3001         } else {
3002                 sbrelease(so, SO_RCV);
3003                 SOCK_IO_RECV_UNLOCK(so);
3004         }
3005
3006 }
3007
3008 /*
3009  * Wrapper for Socket established helper hook.
3010  * Parameters: socket, context of the hook point, hook id.
3011  */
3012 static int inline
3013 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id)
3014 {
3015         struct socket_hhook_data hhook_data = {
3016                 .so = so,
3017                 .hctx = hctx,
3018                 .m = NULL,
3019                 .status = 0
3020         };
3021
3022         CURVNET_SET(so->so_vnet);
3023         HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd);
3024         CURVNET_RESTORE();
3025
3026         /* Ugly but needed, since hhooks return void for now */
3027         return (hhook_data.status);
3028 }
3029
3030 /*
3031  * Perhaps this routine, and sooptcopyout(), below, ought to come in an
3032  * additional variant to handle the case where the option value needs to be
3033  * some kind of integer, but not a specific size.  In addition to their use
3034  * here, these functions are also called by the protocol-level pr_ctloutput()
3035  * routines.
3036  */
3037 int
3038 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
3039 {
3040         size_t  valsize;
3041
3042         /*
3043          * If the user gives us more than we wanted, we ignore it, but if we
3044          * don't get the minimum length the caller wants, we return EINVAL.
3045          * On success, sopt->sopt_valsize is set to however much we actually
3046          * retrieved.
3047          */
3048         if ((valsize = sopt->sopt_valsize) < minlen)
3049                 return EINVAL;
3050         if (valsize > len)
3051                 sopt->sopt_valsize = valsize = len;
3052
3053         if (sopt->sopt_td != NULL)
3054                 return (copyin(sopt->sopt_val, buf, valsize));
3055
3056         bcopy(sopt->sopt_val, buf, valsize);
3057         return (0);
3058 }
3059
3060 /*
3061  * Kernel version of setsockopt(2).
3062  *
3063  * XXX: optlen is size_t, not socklen_t
3064  */
3065 int
3066 so_setsockopt(struct socket *so, int level, int optname, void *optval,
3067     size_t optlen)
3068 {
3069         struct sockopt sopt;
3070
3071         sopt.sopt_level = level;
3072         sopt.sopt_name = optname;
3073         sopt.sopt_dir = SOPT_SET;
3074         sopt.sopt_val = optval;
3075         sopt.sopt_valsize = optlen;
3076         sopt.sopt_td = NULL;
3077         return (sosetopt(so, &sopt));
3078 }
3079
/*
 * Set the socket option described by sopt on socket so.
 *
 * Requests whose level is not SOL_SOCKET go straight to the protocol's
 * pr_ctloutput() routine, or fail with ENOPROTOOPT if the protocol has none.
 * SOL_SOCKET options are handled by the switch below; when one of them
 * succeeds the protocol's pr_ctloutput() is also invoked with the same
 * request, and its return value is ignored.
 *
 * Returns 0 on success or an errno value.
 */
int
sosetopt(struct socket *so, struct sockopt *sopt)
{
        int     error, optval;
        struct  linger l;
        struct  timeval tv;
        sbintime_t val, *valp;
        uint32_t val32;
#ifdef MAC
        struct mac extmac;
#endif

        CURVNET_SET(so->so_vnet);
        error = 0;
        if (sopt->sopt_level != SOL_SOCKET) {
                if (so->so_proto->pr_ctloutput != NULL)
                        error = (*so->so_proto->pr_ctloutput)(so, sopt);
                else
                        error = ENOPROTOOPT;
        } else {
                switch (sopt->sopt_name) {
                case SO_ACCEPTFILTER:
                        error = accept_filt_setopt(so, sopt);
                        if (error)
                                goto bad;
                        break;

                case SO_LINGER:
                        error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
                        if (error)
                                goto bad;
                        /*
                         * Linger times that are negative, above USHRT_MAX,
                         * or above INT_MAX / hz are rejected with EDOM.
                         */
                        if (l.l_linger < 0 ||
                            l.l_linger > USHRT_MAX ||
                            l.l_linger > (INT_MAX / hz)) {
                                error = EDOM;
                                goto bad;
                        }
                        SOCK_LOCK(so);
                        so->so_linger = l.l_linger;
                        if (l.l_onoff)
                                so->so_options |= SO_LINGER;
                        else
                                so->so_options &= ~SO_LINGER;
                        SOCK_UNLOCK(so);
                        break;

                /*
                 * Boolean options: the option name itself is used as the
                 * bit that is set or cleared in so_options.
                 */
                case SO_DEBUG:
                case SO_KEEPALIVE:
                case SO_DONTROUTE:
                case SO_USELOOPBACK:
                case SO_BROADCAST:
                case SO_REUSEADDR:
                case SO_REUSEPORT:
                case SO_REUSEPORT_LB:
                case SO_OOBINLINE:
                case SO_TIMESTAMP:
                case SO_BINTIME:
                case SO_NOSIGPIPE:
                case SO_NO_DDP:
                case SO_NO_OFFLOAD:
                case SO_RERROR:
                        error = sooptcopyin(sopt, &optval, sizeof optval,
                            sizeof optval);
                        if (error)
                                goto bad;
                        SOCK_LOCK(so);
                        if (optval)
                                so->so_options |= sopt->sopt_name;
                        else
                                so->so_options &= ~sopt->sopt_name;
                        SOCK_UNLOCK(so);
                        break;

                case SO_SETFIB:
                        error = sooptcopyin(sopt, &optval, sizeof optval,
                            sizeof optval);
                        if (error)
                                goto bad;

                        if (optval < 0 || optval >= rt_numfibs) {
                                error = EINVAL;
                                goto bad;
                        }
                        /*
                         * Only PF_INET, PF_INET6 and PF_ROUTE sockets keep
                         * the requested FIB; every other family is forced
                         * to FIB 0.
                         */
                        if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
                           (so->so_proto->pr_domain->dom_family == PF_INET6) ||
                           (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
                                so->so_fibnum = optval;
                        else
                                so->so_fibnum = 0;
                        break;

                case SO_USER_COOKIE:
                        error = sooptcopyin(sopt, &val32, sizeof val32,
                            sizeof val32);
                        if (error)
                                goto bad;
                        so->so_user_cookie = val32;
                        break;

                /* Buffer sizing and low-water marks belong to the protocol. */
                case SO_SNDBUF:
                case SO_RCVBUF:
                case SO_SNDLOWAT:
                case SO_RCVLOWAT:
                        error = so->so_proto->pr_setsbopt(so, sopt);
                        if (error)
                                goto bad;
                        break;

                /*
                 * Timeouts arrive as a struct timeval (struct timeval32 for
                 * ILP32 processes under COMPAT_FREEBSD32).  In the compat
                 * path tv32 is copied into tv before error is examined; the
                 * error check that follows runs before tv is used.
                 */
                case SO_SNDTIMEO:
                case SO_RCVTIMEO:
#ifdef COMPAT_FREEBSD32
                        if (SV_CURPROC_FLAG(SV_ILP32)) {
                                struct timeval32 tv32;

                                error = sooptcopyin(sopt, &tv32, sizeof tv32,
                                    sizeof tv32);
                                CP(tv32, tv, tv_sec);
                                CP(tv32, tv, tv_usec);
                        } else
#endif
                                error = sooptcopyin(sopt, &tv, sizeof tv,
                                    sizeof tv);
                        if (error)
                                goto bad;
                        if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
                            tv.tv_usec >= 1000000) {
                                error = EDOM;
                                goto bad;
                        }
                        /* Seconds beyond INT32_MAX are stored as SBT_MAX. */
                        if (tv.tv_sec > INT32_MAX)
                                val = SBT_MAX;
                        else
                                val = tvtosbt(tv);
                        SOCK_LOCK(so);
                        /*
                         * Listening sockets keep their timeouts in the
                         * sol_sb{snd,rcv}_timeo fields rather than in the
                         * socket buffers.
                         */
                        valp = sopt->sopt_name == SO_SNDTIMEO ?
                            (SOLISTENING(so) ? &so->sol_sbsnd_timeo :
                            &so->so_snd.sb_timeo) :
                            (SOLISTENING(so) ? &so->sol_sbrcv_timeo :
                            &so->so_rcv.sb_timeo);
                        *valp = val;
                        SOCK_UNLOCK(so);
                        break;

                case SO_LABEL:
#ifdef MAC
                        error = sooptcopyin(sopt, &extmac, sizeof extmac,
                            sizeof extmac);
                        if (error)
                                goto bad;
                        /*
                         * NOTE(review): sopt->sopt_td is dereferenced here
                         * without a NULL check, while so_setsockopt() builds
                         * requests with sopt_td == NULL.
                         */
                        error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
                            so, &extmac);
#else
                        error = EOPNOTSUPP;
#endif
                        break;

                case SO_TS_CLOCK:
                        error = sooptcopyin(sopt, &optval, sizeof optval,
                            sizeof optval);
                        if (error)
                                goto bad;
                        if (optval < 0 || optval > SO_TS_CLOCK_MAX) {
                                error = EINVAL;
                                goto bad;
                        }
                        so->so_ts_clock = optval;
                        break;

                case SO_MAX_PACING_RATE:
                        error = sooptcopyin(sopt, &val32, sizeof(val32),
                            sizeof(val32));
                        if (error)
                                goto bad;
                        so->so_max_pacing_rate = val32;
                        break;

                /*
                 * Unknown options are offered to the HHOOK_SOCKET_OPT helper
                 * hooks when any are registered; otherwise ENOPROTOOPT.
                 */
                default:
                        if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
                                error = hhook_run_socket(so, sopt,
                                    HHOOK_SOCKET_OPT);
                        else
                                error = ENOPROTOOPT;
                        break;
                }
                /*
                 * After a successful socket-level set, let the protocol see
                 * the request too; whatever it returns is discarded.
                 */
                if (error == 0 && so->so_proto->pr_ctloutput != NULL)
                        (void)(*so->so_proto->pr_ctloutput)(so, sopt);
        }
        /* Common exit: every path restores the vnet context. */
bad:
        CURVNET_RESTORE();
        return (error);
}
3271
3272 /*
3273  * Helper routine for getsockopt.
3274  */
3275 int
3276 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
3277 {
3278         int     error;
3279         size_t  valsize;
3280
3281         error = 0;
3282
3283         /*
3284          * Documented get behavior is that we always return a value, possibly
3285          * truncated to fit in the user's buffer.  Traditional behavior is
3286          * that we always tell the user precisely how much we copied, rather
3287          * than something useful like the total amount we had available for
3288          * her.  Note that this interface is not idempotent; the entire
3289          * answer must be generated ahead of time.
3290          */
3291         valsize = min(len, sopt->sopt_valsize);
3292         sopt->sopt_valsize = valsize;
3293         if (sopt->sopt_val != NULL) {
3294                 if (sopt->sopt_td != NULL)
3295                         error = copyout(buf, sopt->sopt_val, valsize);
3296                 else
3297                         bcopy(buf, sopt->sopt_val, valsize);
3298         }
3299         return (error);
3300 }
3301
/*
 * Retrieve the socket option described by sopt from socket so.
 *
 * Requests whose level is not SOL_SOCKET are answered by the protocol's
 * pr_ctloutput() routine (ENOPROTOOPT if it has none) and return directly.
 * SOL_SOCKET options are handled by the switch below; most of them compute
 * an int in optval and jump to the shared "integer" label, which copies it
 * out with sooptcopyout().
 *
 * Returns 0 on success or an errno value.
 */
int
sogetopt(struct socket *so, struct sockopt *sopt)
{
        int     error, optval;
        struct  linger l;
        struct  timeval tv;
#ifdef MAC
        struct mac extmac;
#endif

        CURVNET_SET(so->so_vnet);
        error = 0;
        if (sopt->sopt_level != SOL_SOCKET) {
                if (so->so_proto->pr_ctloutput != NULL)
                        error = (*so->so_proto->pr_ctloutput)(so, sopt);
                else
                        error = ENOPROTOOPT;
                CURVNET_RESTORE();
                return (error);
        } else {
                switch (sopt->sopt_name) {
                case SO_ACCEPTFILTER:
                        error = accept_filt_getopt(so, sopt);
                        break;

                case SO_LINGER:
                        SOCK_LOCK(so);
                        /* l_onoff carries the raw SO_LINGER bit, not 0/1. */
                        l.l_onoff = so->so_options & SO_LINGER;
                        l.l_linger = so->so_linger;
                        SOCK_UNLOCK(so);
                        error = sooptcopyout(sopt, &l, sizeof l);
                        break;

                /*
                 * Boolean options: the result is the option's bit masked out
                 * of so_options, i.e. zero or the option value itself.
                 */
                case SO_USELOOPBACK:
                case SO_DONTROUTE:
                case SO_DEBUG:
                case SO_KEEPALIVE:
                case SO_REUSEADDR:
                case SO_REUSEPORT:
                case SO_REUSEPORT_LB:
                case SO_BROADCAST:
                case SO_OOBINLINE:
                case SO_ACCEPTCONN:
                case SO_TIMESTAMP:
                case SO_BINTIME:
                case SO_NOSIGPIPE:
                case SO_NO_DDP:
                case SO_NO_OFFLOAD:
                case SO_RERROR:
                        optval = so->so_options & sopt->sopt_name;
                /* Shared tail for every option whose value is an int. */
integer:
                        error = sooptcopyout(sopt, &optval, sizeof optval);
                        break;

                case SO_DOMAIN:
                        optval = so->so_proto->pr_domain->dom_family;
                        goto integer;

                case SO_TYPE:
                        optval = so->so_type;
                        goto integer;

                case SO_PROTOCOL:
                        optval = so->so_proto->pr_protocol;
                        goto integer;

                /*
                 * Reading SO_ERROR consumes the error: so_error is reported
                 * and cleared if set, otherwise so_rerror is reported and
                 * cleared.
                 */
                case SO_ERROR:
                        SOCK_LOCK(so);
                        if (so->so_error) {
                                optval = so->so_error;
                                so->so_error = 0;
                        } else {
                                optval = so->so_rerror;
                                so->so_rerror = 0;
                        }
                        SOCK_UNLOCK(so);
                        goto integer;

                /*
                 * Buffer limits and timeouts live in the sol_* fields on
                 * listening sockets and in the socket buffers otherwise.
                 */
                case SO_SNDBUF:
                        optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat :
                            so->so_snd.sb_hiwat;
                        goto integer;

                case SO_RCVBUF:
                        optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat :
                            so->so_rcv.sb_hiwat;
                        goto integer;

                case SO_SNDLOWAT:
                        optval = SOLISTENING(so) ? so->sol_sbsnd_lowat :
                            so->so_snd.sb_lowat;
                        goto integer;

                case SO_RCVLOWAT:
                        optval = SOLISTENING(so) ? so->sol_sbrcv_lowat :
                            so->so_rcv.sb_lowat;
                        goto integer;

                case SO_SNDTIMEO:
                case SO_RCVTIMEO:
                        SOCK_LOCK(so);
                        tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
                            (SOLISTENING(so) ? so->sol_sbsnd_timeo :
                            so->so_snd.sb_timeo) :
                            (SOLISTENING(so) ? so->sol_sbrcv_timeo :
                            so->so_rcv.sb_timeo));
                        SOCK_UNLOCK(so);
                        /* ILP32 processes get the value as a timeval32. */
#ifdef COMPAT_FREEBSD32
                        if (SV_CURPROC_FLAG(SV_ILP32)) {
                                struct timeval32 tv32;

                                CP(tv, tv32, tv_sec);
                                CP(tv, tv32, tv_usec);
                                error = sooptcopyout(sopt, &tv32, sizeof tv32);
                        } else
#endif
                                error = sooptcopyout(sopt, &tv, sizeof tv);
                        break;

                /*
                 * The MAC label options first copy the requester's struct
                 * mac in, have the MAC framework act on it, and then copy
                 * the structure back out.
                 */
                case SO_LABEL:
#ifdef MAC
                        error = sooptcopyin(sopt, &extmac, sizeof(extmac),
                            sizeof(extmac));
                        if (error)
                                goto bad;
                        error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
                            so, &extmac);
                        if (error)
                                goto bad;
                        error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
                        error = EOPNOTSUPP;
#endif
                        break;

                case SO_PEERLABEL:
#ifdef MAC
                        error = sooptcopyin(sopt, &extmac, sizeof(extmac),
                            sizeof(extmac));
                        if (error)
                                goto bad;
                        error = mac_getsockopt_peerlabel(
                            sopt->sopt_td->td_ucred, so, &extmac);
                        if (error)
                                goto bad;
                        error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
                        error = EOPNOTSUPP;
#endif
                        break;

                /* Listen queue statistics read as 0 on non-listening sockets. */
                case SO_LISTENQLIMIT:
                        optval = SOLISTENING(so) ? so->sol_qlimit : 0;
                        goto integer;

                case SO_LISTENQLEN:
                        optval = SOLISTENING(so) ? so->sol_qlen : 0;
                        goto integer;

                case SO_LISTENINCQLEN:
                        optval = SOLISTENING(so) ? so->sol_incqlen : 0;
                        goto integer;

                case SO_TS_CLOCK:
                        optval = so->so_ts_clock;
                        goto integer;

                case SO_MAX_PACING_RATE:
                        optval = so->so_max_pacing_rate;
                        goto integer;

                /*
                 * Unknown options are offered to the HHOOK_SOCKET_OPT helper
                 * hooks when any are registered; otherwise ENOPROTOOPT.
                 */
                default:
                        if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
                                error = hhook_run_socket(so, sopt,
                                    HHOOK_SOCKET_OPT);
                        else
                                error = ENOPROTOOPT;
                        break;
                }
        }
        /* Only the MAC cases jump to "bad", so the label is MAC-only too. */
#ifdef MAC
bad:
#endif
        CURVNET_RESTORE();
        return (error);
}
3488
3489 int
3490 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
3491 {
3492         struct mbuf *m, *m_prev;
3493         int sopt_size = sopt->sopt_valsize;
3494
3495         MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
3496         if (m == NULL)
3497                 return ENOBUFS;
3498         if (sopt_size > MLEN) {
3499                 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
3500                 if ((m->m_flags & M_EXT) == 0) {
3501                         m_free(m);
3502                         return ENOBUFS;
3503                 }
3504                 m->m_len = min(MCLBYTES, sopt_size);
3505         } else {
3506                 m->m_len = min(MLEN, sopt_size);
3507         }
3508         sopt_size -= m->m_len;
3509         *mp = m;
3510         m_prev = m;
3511
3512         while (sopt_size) {
3513                 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
3514                 if (m == NULL) {
3515                         m_freem(*mp);
3516                         return ENOBUFS;
3517                 }
3518                 if (sopt_size > MLEN) {
3519                         MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
3520                             M_NOWAIT);
3521                         if ((m->m_flags & M_EXT) == 0) {
3522                                 m_freem(m);
3523                                 m_freem(*mp);
3524                                 return ENOBUFS;
3525                         }
3526                         m->m_len = min(MCLBYTES, sopt_size);
3527                 } else {
3528                         m->m_len = min(MLEN, sopt_size);
3529                 }
3530                 sopt_size -= m->m_len;
3531                 m_prev->m_next = m;
3532                 m_prev = m;
3533         }
3534         return (0);
3535 }
3536
3537 int
3538 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
3539 {
3540         struct mbuf *m0 = m;
3541
3542         if (sopt->sopt_val == NULL)
3543                 return (0);
3544         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3545                 if (sopt->sopt_td != NULL) {
3546                         int error;
3547
3548                         error = copyin(sopt->sopt_val, mtod(m, char *),
3549                             m->m_len);
3550                         if (error != 0) {
3551                                 m_freem(m0);
3552                                 return(error);
3553                         }
3554                 } else
3555                         bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
3556                 sopt->sopt_valsize -= m->m_len;
3557                 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
3558                 m = m->m_next;
3559         }
3560         if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
3561                 panic("ip6_sooptmcopyin");
3562         return (0);
3563 }
3564
3565 int
3566 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
3567 {
3568         struct mbuf *m0 = m;
3569         size_t valsize = 0;
3570
3571         if (sopt->sopt_val == NULL)
3572                 return (0);
3573         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3574                 if (sopt->sopt_td != NULL) {
3575                         int error;
3576
3577                         error = copyout(mtod(m, char *), sopt->sopt_val,
3578                             m->m_len);
3579                         if (error != 0) {
3580                                 m_freem(m0);
3581                                 return(error);
3582                         }
3583                 } else
3584                         bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
3585                 sopt->sopt_valsize -= m->m_len;
3586                 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
3587                 valsize += m->m_len;
3588                 m = m->m_next;
3589         }
3590         if (m != NULL) {
3591                 /* enough soopt buffer should be given from user-land */
3592                 m_freem(m0);
3593                 return(EINVAL);
3594         }
3595         sopt->sopt_valsize = valsize;
3596         return (0);
3597 }
3598
3599 /*
3600  * sohasoutofband(): protocol notifies socket layer of the arrival of new
3601  * out-of-band data, which will then notify socket consumers.
3602  */
3603 void
3604 sohasoutofband(struct socket *so)
3605 {
3606
3607         if (so->so_sigio != NULL)
3608                 pgsigio(&so->so_sigio, SIGURG, 0);
3609         selwakeuppri(&so->so_rdsel, PSOCK);
3610 }
3611
3612 int
3613 sopoll(struct socket *so, int events, struct ucred *active_cred,
3614     struct thread *td)
3615 {
3616
3617         /*
3618          * We do not need to set or assert curvnet as long as everyone uses
3619          * sopoll_generic().
3620          */
3621         return (so->so_proto->pr_sopoll(so, events, active_cred, td));
3622 }
3623
3624 int
3625 sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
3626     struct thread *td)
3627 {
3628         int revents;
3629
3630         SOCK_LOCK(so);
3631         if (SOLISTENING(so)) {
3632                 if (!(events & (POLLIN | POLLRDNORM)))
3633                         revents = 0;
3634                 else if (!TAILQ_EMPTY(&so->sol_comp))
3635                         revents = events & (POLLIN | POLLRDNORM);
3636                 else if ((events & POLLINIGNEOF) == 0 && so->so_error)
3637                         revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP;
3638                 else {
3639                         selrecord(td, &so->so_rdsel);
3640                         revents = 0;
3641                 }
3642         } else {
3643                 revents = 0;
3644                 SOCK_SENDBUF_LOCK(so);
3645                 SOCK_RECVBUF_LOCK(so);
3646                 if (events & (POLLIN | POLLRDNORM))
3647                         if (soreadabledata(so))
3648                                 revents |= events & (POLLIN | POLLRDNORM);
3649                 if (events & (POLLOUT | POLLWRNORM))
3650                         if (sowriteable(so))
3651                                 revents |= events & (POLLOUT | POLLWRNORM);
3652                 if (events & (POLLPRI | POLLRDBAND))
3653                         if (so->so_oobmark ||
3654                             (so->so_rcv.sb_state & SBS_RCVATMARK))
3655                                 revents |= events & (POLLPRI | POLLRDBAND);
3656                 if ((events & POLLINIGNEOF) == 0) {
3657                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3658                                 revents |= events & (POLLIN | POLLRDNORM);
3659                                 if (so->so_snd.sb_state & SBS_CANTSENDMORE)
3660                                         revents |= POLLHUP;
3661                         }
3662                 }
3663                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
3664                         revents |= events & POLLRDHUP;
3665                 if (revents == 0) {
3666                         if (events &
3667                             (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) {
3668                                 selrecord(td, &so->so_rdsel);
3669                                 so->so_rcv.sb_flags |= SB_SEL;
3670                         }
3671                         if (events & (POLLOUT | POLLWRNORM)) {
3672                                 selrecord(td, &so->so_wrsel);
3673                                 so->so_snd.sb_flags |= SB_SEL;
3674                         }
3675                 }
3676                 SOCK_RECVBUF_UNLOCK(so);
3677                 SOCK_SENDBUF_UNLOCK(so);
3678         }
3679         SOCK_UNLOCK(so);
3680         return (revents);
3681 }
3682
3683 int
3684 soo_kqfilter(struct file *fp, struct knote *kn)
3685 {
3686         struct socket *so = kn->kn_fp->f_data;
3687         struct sockbuf *sb;
3688         sb_which which;
3689         struct knlist *knl;
3690
3691         switch (kn->kn_filter) {
3692         case EVFILT_READ:
3693                 kn->kn_fop = &soread_filtops;
3694                 knl = &so->so_rdsel.si_note;
3695                 sb = &so->so_rcv;
3696                 which = SO_RCV;
3697                 break;
3698         case EVFILT_WRITE:
3699                 kn->kn_fop = &sowrite_filtops;
3700                 knl = &so->so_wrsel.si_note;
3701                 sb = &so->so_snd;
3702                 which = SO_SND;
3703                 break;
3704         case EVFILT_EMPTY:
3705                 kn->kn_fop = &soempty_filtops;
3706                 knl = &so->so_wrsel.si_note;
3707                 sb = &so->so_snd;
3708                 which = SO_SND;
3709                 break;
3710         default:
3711                 return (EINVAL);
3712         }
3713
3714         SOCK_LOCK(so);
3715         if (SOLISTENING(so)) {
3716                 knlist_add(knl, kn, 1);
3717         } else {
3718                 SOCK_BUF_LOCK(so, which);
3719                 knlist_add(knl, kn, 1);
3720                 sb->sb_flags |= SB_KNOTE;
3721                 SOCK_BUF_UNLOCK(so, which);
3722         }
3723         SOCK_UNLOCK(so);
3724         return (0);
3725 }
3726
3727 static void
3728 filt_sordetach(struct knote *kn)
3729 {
3730         struct socket *so = kn->kn_fp->f_data;
3731
3732         so_rdknl_lock(so);
3733         knlist_remove(&so->so_rdsel.si_note, kn, 1);
3734         if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note))
3735                 so->so_rcv.sb_flags &= ~SB_KNOTE;
3736         so_rdknl_unlock(so);
3737 }
3738
3739 /*ARGSUSED*/
3740 static int
3741 filt_soread(struct knote *kn, long hint)
3742 {
3743         struct socket *so;
3744
3745         so = kn->kn_fp->f_data;
3746
3747         if (SOLISTENING(so)) {
3748                 SOCK_LOCK_ASSERT(so);
3749                 kn->kn_data = so->sol_qlen;
3750                 if (so->so_error) {
3751                         kn->kn_flags |= EV_EOF;
3752                         kn->kn_fflags = so->so_error;
3753                         return (1);
3754                 }
3755                 return (!TAILQ_EMPTY(&so->sol_comp));
3756         }
3757
3758         SOCK_RECVBUF_LOCK_ASSERT(so);
3759
3760         kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
3761         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3762                 kn->kn_flags |= EV_EOF;
3763                 kn->kn_fflags = so->so_error;
3764                 return (1);
3765         } else if (so->so_error || so->so_rerror)
3766                 return (1);
3767
3768         if (kn->kn_sfflags & NOTE_LOWAT) {
3769                 if (kn->kn_data >= kn->kn_sdata)
3770                         return (1);
3771         } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat)
3772                 return (1);
3773
3774         /* This hook returning non-zero indicates an event, not error */
3775         return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD));
3776 }
3777
3778 static void
3779 filt_sowdetach(struct knote *kn)
3780 {
3781         struct socket *so = kn->kn_fp->f_data;
3782
3783         so_wrknl_lock(so);
3784         knlist_remove(&so->so_wrsel.si_note, kn, 1);
3785         if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note))
3786                 so->so_snd.sb_flags &= ~SB_KNOTE;
3787         so_wrknl_unlock(so);
3788 }
3789
3790 /*ARGSUSED*/
3791 static int
3792 filt_sowrite(struct knote *kn, long hint)
3793 {
3794         struct socket *so;
3795
3796         so = kn->kn_fp->f_data;
3797
3798         if (SOLISTENING(so))
3799                 return (0);
3800
3801         SOCK_SENDBUF_LOCK_ASSERT(so);
3802         kn->kn_data = sbspace(&so->so_snd);
3803
3804         hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE);
3805
3806         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3807                 kn->kn_flags |= EV_EOF;
3808                 kn->kn_fflags = so->so_error;
3809                 return (1);
3810         } else if (so->so_error)        /* temporary udp error */
3811                 return (1);
3812         else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3813             (so->so_proto->pr_flags & PR_CONNREQUIRED))
3814                 return (0);
3815         else if (kn->kn_sfflags & NOTE_LOWAT)
3816                 return (kn->kn_data >= kn->kn_sdata);
3817         else
3818                 return (kn->kn_data >= so->so_snd.sb_lowat);
3819 }
3820
3821 static int
3822 filt_soempty(struct knote *kn, long hint)
3823 {
3824         struct socket *so;
3825
3826         so = kn->kn_fp->f_data;
3827
3828         if (SOLISTENING(so))
3829                 return (1);
3830
3831         SOCK_SENDBUF_LOCK_ASSERT(so);
3832         kn->kn_data = sbused(&so->so_snd);
3833
3834         if (kn->kn_data == 0)
3835                 return (1);
3836         else
3837                 return (0);
3838 }
3839
3840 int
3841 socheckuid(struct socket *so, uid_t uid)
3842 {
3843
3844         if (so == NULL)
3845                 return (EPERM);
3846         if (so->so_cred->cr_uid != uid)
3847                 return (EPERM);
3848         return (0);
3849 }
3850
3851 /*
3852  * These functions are used by protocols to notify the socket layer (and its
3853  * consumers) of state changes in the sockets driven by protocol-side events.
3854  */
3855
3856 /*
3857  * Procedures to manipulate state flags of socket and do appropriate wakeups.
3858  *
3859  * Normal sequence from the active (originating) side is that
3860  * soisconnecting() is called during processing of connect() call, resulting
3861  * in an eventual call to soisconnected() if/when the connection is
3862  * established.  When the connection is torn down soisdisconnecting() is
3863  * called during processing of disconnect() call, and soisdisconnected() is
3864  * called when the connection to the peer is totally severed.  The semantics
3865  * of these routines are such that connectionless protocols can call
3866  * soisconnected() and soisdisconnected() only, bypassing the in-progress
3867  * calls when setting up a ``connection'' takes no time.
3868  *
3869  * From the passive side, a socket is created with two queues of sockets:
3870  * so_incomp for connections in progress and so_comp for connections already
3871  * made and awaiting user acceptance.  As a protocol is preparing incoming
3872  * connections, it creates a socket structure queued on so_incomp by calling
3873  * sonewconn().  When the connection is established, soisconnected() is
3874  * called, and transfers the socket structure to so_comp, making it available
3875  * to accept().
3876  *
3877  * If a socket is closed with sockets on either so_incomp or so_comp, these
3878  * sockets are dropped.
3879  *
3880  * If higher-level protocols are implemented in the kernel, the wakeups done
3881  * here will sometimes cause software-interrupt process scheduling.
3882  */
3883 void
3884 soisconnecting(struct socket *so)
3885 {
3886
3887         SOCK_LOCK(so);
3888         so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3889         so->so_state |= SS_ISCONNECTING;
3890         SOCK_UNLOCK(so);
3891 }
3892
/*
 * Mark a socket as connected and notify interested parties.
 *
 * The connecting/disconnecting/confirming state bits are cleared and
 * SS_ISCONNECTED is set under the socket lock.  What happens next depends on
 * whether the socket sits on a listener's incomplete queue:
 *
 *  - so_qstate == SQ_INCOMP and SO_ACCEPTFILTER not set: the socket is moved
 *    from the listener's sol_incomp queue to sol_comp and the listener is
 *    woken with solisten_wakeup().
 *  - so_qstate == SQ_INCOMP and SO_ACCEPTFILTER set: the listener's accept
 *    filter callback is installed as the receive upcall and invoked once; if
 *    it answers SU_ISCONNECTED the upcall is removed again and the socket is
 *    promoted as in the first case.
 *  - otherwise: sleepers on so_timeo are woken and both sorwakeup() and
 *    sowwakeup() are called.
 */
void
soisconnected(struct socket *so)
{
        bool last __diagused;

        SOCK_LOCK(so);
        so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
        so->so_state |= SS_ISCONNECTED;

        if (so->so_qstate == SQ_INCOMP) {
                struct socket *head = so->so_listen;
                int ret;

                KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so));
                /*
                 * Promoting a socket from incomplete queue to complete, we
                 * need to go through reverse order of locking.  We first do
                 * trylock, and if that doesn't succeed, we go the hard way
                 * leaving a reference and rechecking consistency after proper
                 * locking.
                 */
                if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
                        /* Keep head alive while the socket lock is dropped. */
                        soref(head);
                        SOCK_UNLOCK(so);
                        SOLISTEN_LOCK(head);
                        SOCK_LOCK(so);
                        if (__predict_false(head != so->so_listen)) {
                                /*
                                 * The socket went off the listen queue; we
                                 * most likely lost a race against close(2)
                                 * of the listening socket.  The socket is
                                 * about to be soabort()ed.
                                 */
                                SOCK_UNLOCK(so);
                                sorele_locked(head);
                                return;
                        }
                        /*
                         * Give back the reference taken above.  It is
                         * asserted not to be the last one.
                         */
                        last = refcount_release(&head->so_count);
                        KASSERT(!last, ("%s: released last reference for %p",
                            __func__, head));
                }
                /* From here on both the listener and the socket are locked. */
again:
                if ((so->so_options & SO_ACCEPTFILTER) == 0) {
                        TAILQ_REMOVE(&head->sol_incomp, so, so_list);
                        head->sol_incqlen--;
                        TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
                        head->sol_qlen++;
                        so->so_qstate = SQ_COMP;
                        SOCK_UNLOCK(so);
                        solisten_wakeup(head);  /* unlocks */
                } else {
                        SOCK_RECVBUF_LOCK(so);
                        soupcall_set(so, SO_RCV,
                            head->sol_accept_filter->accf_callback,
                            head->sol_accept_filter_arg);
                        /*
                         * Cleared before the callback runs, so a jump back
                         * to "again" takes the promotion branch above.
                         */
                        so->so_options &= ~SO_ACCEPTFILTER;
                        ret = head->sol_accept_filter->accf_callback(so,
                            head->sol_accept_filter_arg, M_NOWAIT);
                        if (ret == SU_ISCONNECTED) {
                                /* Filter is satisfied already: promote now. */
                                soupcall_clear(so, SO_RCV);
                                SOCK_RECVBUF_UNLOCK(so);
                                goto again;
                        }
                        /* Otherwise the upcall stays installed. */
                        SOCK_RECVBUF_UNLOCK(so);
                        SOCK_UNLOCK(so);
                        SOLISTEN_UNLOCK(head);
                }
                return;
        }
        SOCK_UNLOCK(so);
        wakeup(&so->so_timeo);
        sorwakeup(so);
        sowwakeup(so);
}
3966
/*
 * Record that a disconnect is in progress on so.
 *
 * Under the socket lock SS_ISCONNECTING is cleared and SS_ISDISCONNECTING is
 * set.  For a non-listening socket socantrcvmore_locked() and
 * socantsendmore_locked() are then called with the respective buffer lock
 * held.  Finally threads sleeping on so_timeo are woken.
 */
void
soisdisconnecting(struct socket *so)
{

        SOCK_LOCK(so);
        so->so_state &= ~SS_ISCONNECTING;
        so->so_state |= SS_ISDISCONNECTING;

        if (!SOLISTENING(so)) {
                /*
                 * NOTE(review): neither buffer lock is released explicitly
                 * in this function; the *_locked() helpers presumably drop
                 * the lock they are entered with -- confirm against their
                 * definitions.
                 */
                SOCK_RECVBUF_LOCK(so);
                socantrcvmore_locked(so);
                SOCK_SENDBUF_LOCK(so);
                socantsendmore_locked(so);
        }
        SOCK_UNLOCK(so);
        wakeup(&so->so_timeo);
}
3984
/*
 * Record that so is fully disconnected.
 *
 * SS_ISDISCONNECTED is set and the connecting/connected/disconnecting bits
 * are cleared.  For a non-listening socket the socket lock is dropped first,
 * then socantrcvmore_locked() is called, all data in the send buffer is
 * dropped and socantsendmore_locked() is called.  Finally threads sleeping
 * on so_timeo are woken.
 */
void
soisdisconnected(struct socket *so)
{

        SOCK_LOCK(so);

        /*
         * There is at least one reader of so_state that does not
         * acquire socket lock, namely soreceive_generic().  Ensure
         * that it never sees all flags that track connection status
         * cleared, by ordering the update with a barrier semantic of
         * our release thread fence.
         */
        so->so_state |= SS_ISDISCONNECTED;
        atomic_thread_fence_rel();
        so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);

        if (!SOLISTENING(so)) {
                /*
                 * Unlike soisdisconnecting(), the socket lock is released
                 * before the buffer locks are taken.
                 *
                 * NOTE(review): neither buffer lock is released explicitly
                 * in this function; the *_locked() helpers presumably drop
                 * the lock they are entered with -- confirm against their
                 * definitions.
                 */
                SOCK_UNLOCK(so);
                SOCK_RECVBUF_LOCK(so);
                socantrcvmore_locked(so);
                SOCK_SENDBUF_LOCK(so);
                /* Discard everything still queued for sending. */
                sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
                socantsendmore_locked(so);
        } else
                SOCK_UNLOCK(so);
        wakeup(&so->so_timeo);
}
4013
4014 int
4015 soiolock(struct socket *so, struct sx *sx, int flags)
4016 {
4017         int error;
4018
4019         KASSERT((flags & SBL_VALID) == flags,
4020             ("soiolock: invalid flags %#x", flags));
4021
4022         if ((flags & SBL_WAIT) != 0) {
4023                 if ((flags & SBL_NOINTR) != 0) {
4024                         sx_xlock(sx);
4025                 } else {
4026                         error = sx_xlock_sig(sx);
4027                         if (error != 0)
4028                                 return (error);
4029                 }
4030         } else if (!sx_try_xlock(sx)) {
4031                 return (EWOULDBLOCK);
4032         }
4033
4034         if (__predict_false(SOLISTENING(so))) {
4035                 sx_xunlock(sx);
4036                 return (ENOTCONN);
4037         }
4038         return (0);
4039 }
4040
/*
 * Release an exclusively held socket I/O sx lock; counterpart of soiolock().
 */
void
soiounlock(struct sx *sx)
{
        sx_xunlock(sx);
}
4046
4047 /*
4048  * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
4049  */
4050 struct sockaddr *
4051 sodupsockaddr(const struct sockaddr *sa, int mflags)
4052 {
4053         struct sockaddr *sa2;
4054
4055         sa2 = malloc(sa->sa_len, M_SONAME, mflags);
4056         if (sa2)
4057                 bcopy(sa, sa2, sa->sa_len);
4058         return sa2;
4059 }
4060
/*
 * Register a per-socket destructor: func is stored in so_dtor, replacing
 * whatever was stored there before.  The socket lock is asserted via
 * SOCK_LOCK_ASSERT().
 */
void
sodtor_set(struct socket *so, so_dtor_t *func)
{

        SOCK_LOCK_ASSERT(so);
        so->so_dtor = func;
}
4071
/*
 * Register a per-socket buffer upcall.
 *
 * func and arg are stored as sb_upcall/sb_upcallarg of the receive buffer
 * (which == SO_RCV) or the send buffer (which == SO_SND) and SB_UPCALL is
 * set in that buffer's flags.  The socket must not be listening (KASSERT),
 * and the lock of the selected buffer is asserted via
 * SOCK_BUF_LOCK_ASSERT().
 */
void
soupcall_set(struct socket *so, sb_which which, so_upcall_t func, void *arg)
{
        struct sockbuf *sb;

        KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));

        /*
         * NOTE(review): there is no default case, so sb stays unset for any
         * other value of which; presumably sb_which has exactly these two
         * enumerators -- confirm against its declaration.
         */
        switch (which) {
        case SO_RCV:
                sb = &so->so_rcv;
                break;
        case SO_SND:
                sb = &so->so_snd;
                break;
        }
        SOCK_BUF_LOCK_ASSERT(so, which);
        sb->sb_upcall = func;
        sb->sb_upcallarg = arg;
        sb->sb_flags |= SB_UPCALL;
}
4095
/*
 * Remove the upcall from the receive (SO_RCV) or send (SO_SND) buffer of a
 * socket: sb_upcall and sb_upcallarg are reset to NULL and SB_UPCALL is
 * cleared.  The socket must not be listening and an upcall must currently
 * be registered (both KASSERTed); the lock of the selected buffer is
 * asserted via SOCK_BUF_LOCK_ASSERT().
 */
void
soupcall_clear(struct socket *so, sb_which which)
{
        struct sockbuf *sb;

        KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));

        /*
         * NOTE(review): there is no default case, so sb stays unset for any
         * other value of which; presumably sb_which has exactly these two
         * enumerators -- confirm against its declaration.
         */
        switch (which) {
        case SO_RCV:
                sb = &so->so_rcv;
                break;
        case SO_SND:
                sb = &so->so_snd;
                break;
        }
        SOCK_BUF_LOCK_ASSERT(so, which);
        KASSERT(sb->sb_upcall != NULL,
            ("%s: so %p no upcall to clear", __func__, so));
        sb->sb_upcall = NULL;
        sb->sb_upcallarg = NULL;
        sb->sb_flags &= ~SB_UPCALL;
}
4118
/*
 * Register an upcall on a listening socket: func and arg are stored in
 * sol_upcall and sol_upcallarg.  The listen lock is asserted via
 * SOLISTEN_LOCK_ASSERT().
 */
void
solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg)
{

        SOLISTEN_LOCK_ASSERT(so);
        so->sol_upcall = func;
        so->sol_upcallarg = arg;
}
4127
/*
 * Lock callback for the read-side knlist; arg is the socket.
 *
 * A listening socket is protected by the listen lock, any other socket by
 * its receive buffer lock.  The socket may become a listener while we wait
 * for the receive buffer lock, so the state is checked again once the lock
 * is held and the whole procedure is repeated if it changed.
 */
static void
so_rdknl_lock(void *arg)
{
        struct socket *so = arg;

        for (;;) {
                if (SOLISTENING(so)) {
                        SOLISTEN_LOCK(so);
                        return;
                }
                SOCK_RECVBUF_LOCK(so);
                if (__predict_false(SOLISTENING(so))) {
                        /* Turned into a listener meanwhile: start over. */
                        SOCK_RECVBUF_UNLOCK(so);
                        continue;
                }
                return;
        }
}
4144
/*
 * Unlock callback for the read-side knlist; arg is the socket.  Releases
 * the listen lock of a listening socket, the receive buffer lock otherwise.
 */
static void
so_rdknl_unlock(void *arg)
{
        struct socket *so = arg;

        if (SOLISTENING(so)) {
                SOLISTEN_UNLOCK(so);
                return;
        }
        SOCK_RECVBUF_UNLOCK(so);
}
4155
4156 static void
4157 so_rdknl_assert_lock(void *arg, int what)
4158 {
4159         struct socket *so = arg;
4160
4161         if (what == LA_LOCKED) {
4162                 if (SOLISTENING(so))
4163                         SOLISTEN_LOCK_ASSERT(so);
4164                 else
4165                         SOCK_RECVBUF_LOCK_ASSERT(so);
4166         } else {
4167                 if (SOLISTENING(so))
4168                         SOLISTEN_UNLOCK_ASSERT(so);
4169                 else
4170                         SOCK_RECVBUF_UNLOCK_ASSERT(so);
4171         }
4172 }
4173
/*
 * Lock callback for the write-side knlist; arg is the socket.
 *
 * A listening socket is protected by the listen lock, any other socket by
 * its send buffer lock.  The socket may become a listener while we wait
 * for the send buffer lock, so the state is checked again once the lock is
 * held and the whole procedure is repeated if it changed.
 */
static void
so_wrknl_lock(void *arg)
{
        struct socket *so = arg;

        for (;;) {
                if (SOLISTENING(so)) {
                        SOLISTEN_LOCK(so);
                        return;
                }
                SOCK_SENDBUF_LOCK(so);
                if (__predict_false(SOLISTENING(so))) {
                        /* Turned into a listener meanwhile: start over. */
                        SOCK_SENDBUF_UNLOCK(so);
                        continue;
                }
                return;
        }
}
4190
/*
 * Unlock callback for the write-side knlist; arg is the socket.  Releases
 * the listen lock of a listening socket, the send buffer lock otherwise.
 */
static void
so_wrknl_unlock(void *arg)
{
        struct socket *so = arg;

        if (SOLISTENING(so)) {
                SOLISTEN_UNLOCK(so);
                return;
        }
        SOCK_SENDBUF_UNLOCK(so);
}
4201
4202 static void
4203 so_wrknl_assert_lock(void *arg, int what)
4204 {
4205         struct socket *so = arg;
4206
4207         if (what == LA_LOCKED) {
4208                 if (SOLISTENING(so))
4209                         SOLISTEN_LOCK_ASSERT(so);
4210                 else
4211                         SOCK_SENDBUF_LOCK_ASSERT(so);
4212         } else {
4213                 if (SOLISTENING(so))
4214                         SOLISTEN_UNLOCK_ASSERT(so);
4215                 else
4216                         SOCK_SENDBUF_UNLOCK_ASSERT(so);
4217         }
4218 }
4219
4220 /*
4221  * Create an external-format (``xsocket'') structure using the information in
4222  * the kernel-format socket structure pointed to by so.  This is done to
4223  * reduce the spew of irrelevant information over this interface, to isolate
4224  * user code from changes in the kernel structure, and potentially to provide
4225  * information-hiding if we decide that some of this information should be
4226  * hidden from users.
4227  */
4228 void
4229 sotoxsocket(struct socket *so, struct xsocket *xso)
4230 {
4231
4232         bzero(xso, sizeof(*xso));
4233         xso->xso_len = sizeof *xso;
4234         xso->xso_so = (uintptr_t)so;
4235         xso->so_type = so->so_type;
4236         xso->so_options = so->so_options;
4237         xso->so_linger = so->so_linger;
4238         xso->so_state = so->so_state;
4239         xso->so_pcb = (uintptr_t)so->so_pcb;
4240         xso->xso_protocol = so->so_proto->pr_protocol;
4241         xso->xso_family = so->so_proto->pr_domain->dom_family;
4242         xso->so_timeo = so->so_timeo;
4243         xso->so_error = so->so_error;
4244         xso->so_uid = so->so_cred->cr_uid;
4245         xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
4246         if (SOLISTENING(so)) {
4247                 xso->so_qlen = so->sol_qlen;
4248                 xso->so_incqlen = so->sol_incqlen;
4249                 xso->so_qlimit = so->sol_qlimit;
4250                 xso->so_oobmark = 0;
4251         } else {
4252                 xso->so_state |= so->so_qstate;
4253                 xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0;
4254                 xso->so_oobmark = so->so_oobmark;
4255                 sbtoxsockbuf(&so->so_snd, &xso->so_snd);
4256                 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
4257         }
4258 }
4259
/*
 * Accessor: return a pointer to the socket's receive buffer (so_rcv).
 */
struct sockbuf *
so_sockbuf_rcv(struct socket *so)
{

        return (&so->so_rcv);
}
4266
/*
 * Accessor: return a pointer to the socket's send buffer (so_snd).
 */
struct sockbuf *
so_sockbuf_snd(struct socket *so)
{

        return (&so->so_snd);
}
4273
/*
 * Accessor: return the current value of so_state.  No lock is taken or
 * asserted here.
 */
int
so_state_get(const struct socket *so)
{

        return (so->so_state);
}
4280
/*
 * Accessor: overwrite so_state with val (the whole field is replaced, not
 * individual bits).  No lock is taken or asserted here.
 */
void
so_state_set(struct socket *so, int val)
{

        so->so_state = val;
}
4287
/*
 * Accessor: return the current value of so_options.  No lock is taken or
 * asserted here.
 */
int
so_options_get(const struct socket *so)
{

        return (so->so_options);
}
4294
/*
 * Accessor: overwrite so_options with val (the whole field is replaced, not
 * individual bits).  No lock is taken or asserted here.
 */
void
so_options_set(struct socket *so, int val)
{

        so->so_options = val;
}
4301
/*
 * Accessor: return the current value of so_error.  The value is only read;
 * it is not cleared.
 */
int
so_error_get(const struct socket *so)
{

        return (so->so_error);
}
4308
/*
 * Accessor: store val in so_error.  No wakeup or notification is issued
 * here.
 */
void
so_error_set(struct socket *so, int val)
{

        so->so_error = val;
}
4315
/*
 * Accessor: return the current value of so_linger.
 */
int
so_linger_get(const struct socket *so)
{

        return (so->so_linger);
}
4322
/*
 * Accessor: store val in so_linger.
 *
 * val is asserted (KASSERT) to be non-negative, no larger than USHRT_MAX
 * and no larger than INT_MAX / hz.
 * NOTE(review): the last bound presumably keeps a later val * hz
 * computation within the range of int -- confirm at the use site.
 */
void
so_linger_set(struct socket *so, int val)
{

        KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz),
            ("%s: val %d out of range", __func__, val));

        so->so_linger = val;
}
4332
/*
 * Accessor: return the protocol switch entry (so_proto) of the socket.
 */
struct protosw *
so_protosw_get(const struct socket *so)
{

        return (so->so_proto);
}
4339
/*
 * Accessor: replace the protocol switch entry (so_proto) of the socket with
 * val.  No lock is taken or asserted here.
 */
void
so_protosw_set(struct socket *so, struct protosw *val)
{

        so->so_proto = val;
}
4346
/*
 * Function wrapper that simply calls sorwakeup() on the socket.
 */
void
so_sorwakeup(struct socket *so)
{

        sorwakeup(so);
}
4353
/*
 * Function wrapper that simply calls sowwakeup() on the socket.
 */
void
so_sowwakeup(struct socket *so)
{

        sowwakeup(so);
}
4360
/*
 * Function wrapper that simply calls sorwakeup_locked() on the socket.
 * NOTE(review): going by the name, the caller is expected to hold the
 * receive buffer lock -- confirm against sorwakeup_locked().
 */
void
so_sorwakeup_locked(struct socket *so)
{

        sorwakeup_locked(so);
}
4367
/*
 * Function wrapper that simply calls sowwakeup_locked() on the socket.
 * NOTE(review): going by the name, the caller is expected to hold the send
 * buffer lock -- confirm against sowwakeup_locked().
 */
void
so_sowwakeup_locked(struct socket *so)
{

        sowwakeup_locked(so);
}
4374
/*
 * Function wrapper around SOCK_LOCK(): acquire the socket lock.
 */
void
so_lock(struct socket *so)
{

        SOCK_LOCK(so);
}
4381
/*
 * Function wrapper around SOCK_UNLOCK(): release the socket lock.
 */
void
so_unlock(struct socket *so)
{

        SOCK_UNLOCK(so);
}