sys/kern/uipc_socket.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1982, 1986, 1988, 1990, 1993
   5  *      The Regents of the University of California.
   6  * Copyright (c) 2004 The FreeBSD Foundation
   7  * Copyright (c) 2004-2008 Robert N. M. Watson
   8  * All rights reserved.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
  35  */
  36
  37 /*
  38  * Comments on the socket life cycle:
  39  *
  40  * soalloc() sets of socket layer state for a socket, called only by
  41  * socreate() and sonewconn().  Socket layer private.
  42  *
  43  * sodealloc() tears down socket layer state for a socket, called only by
  44  * sofree() and sonewconn().  Socket layer private.
  45  *
  46  * pru_attach() associates protocol layer state with an allocated socket;
  47  * called only once, may fail, aborting socket allocation.  This is called
  48  * from socreate() and sonewconn().  Socket layer private.
  49  *
  50  * pru_detach() disassociates protocol layer state from an attached socket,
  51  * and will be called exactly once for sockets in which pru_attach() has
  52  * been successfully called.  If pru_attach() returned an error,
  53  * pru_detach() will not be called.  Socket layer private.
  54  *
  55  * pru_abort() and pru_close() notify the protocol layer that the last
  56  * consumer of a socket is starting to tear down the socket, and that the
  57  * protocol should terminate the connection.  Historically, pru_abort() also
  58  * detached protocol state from the socket state, but this is no longer the
  59  * case.
  60  *
  61  * socreate() creates a socket and attaches protocol state.  This is a public
  62  * interface that may be used by socket layer consumers to create new
  63  * sockets.
  64  *
  65  * sonewconn() creates a socket and attaches protocol state.  This is a
  66  * public interface  that may be used by protocols to create new sockets when
  67  * a new connection is received and will be available for accept() on a
  68  * listen socket.
  69  *
  70  * soclose() destroys a socket after possibly waiting for it to disconnect.
  71  * This is a public interface that socket consumers should use to close and
  72  * release a socket when done with it.
  73  *
  74  * soabort() destroys a socket without waiting for it to disconnect (used
  75  * only for incoming connections that are already partially or fully
  76  * connected).  This is used internally by the socket layer when clearing
  77  * listen socket queues (due to overflow or close on the listen socket), but
  78  * is also a public interface protocols may use to abort connections in
  79  * their incomplete listen queues should they no longer be required.  Sockets
  80  * placed in completed connection listen queues should not be aborted for
  81  * reasons described in the comment above the soclose() implementation.  This
  82  * is not a general purpose close routine, and except in the specific
  83  * circumstances described here, should not be used.
  84  *
  85  * sofree() will free a socket and its protocol state if all references on
  86  * the socket have been released, and is the public interface to attempt to
  87  * free a socket when a reference is removed.  This is a socket layer private
  88  * interface.
  89  *
  90  * NOTE: In addition to socreate() and soclose(), which provide a single
  91  * socket reference to the consumer to be managed as required, there are two
  92  * calls to explicitly manage socket references, soref(), and sorele().
  93  * Currently, these are generally required only when transitioning a socket
  94  * from a listen queue to a file descriptor, in order to prevent garbage
  95  * collection of the socket at an untimely moment.  For a number of reasons,
  96  * these interfaces are not preferred, and should be avoided.
  97  *
  98  * NOTE: With regard to VNETs the general rule is that callers do not set
  99  * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
 100  * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
 101  * and sorflush(), which are usually called from a pre-set VNET context.
 102  * sopoll() currently does not need a VNET context to be set.
 103  */
 104
 105 #include <sys/cdefs.h>
 106 __FBSDID("$FreeBSD$");
 107
 108 #include "opt_inet.h"
 109 #include "opt_inet6.h"
 110 #include "opt_kern_tls.h"
 111 #include "opt_sctp.h"
 112
 113 #include <sys/param.h>
 114 #include <sys/systm.h>
 115 #include <sys/fcntl.h>
 116 #include <sys/limits.h>
 117 #include <sys/lock.h>
 118 #include <sys/mac.h>
 119 #include <sys/malloc.h>
 120 #include <sys/mbuf.h>
 121 #include <sys/mutex.h>
 122 #include <sys/domain.h>
 123 #include <sys/file.h>                   /* for struct knote */
 124 #include <sys/hhook.h>
 125 #include <sys/kernel.h>
 126 #include <sys/khelp.h>
 127 #include <sys/ktls.h>
 128 #include <sys/event.h>
 129 #include <sys/eventhandler.h>
 130 #include <sys/poll.h>
 131 #include <sys/proc.h>
 132 #include <sys/protosw.h>
 133 #include <sys/socket.h>
 134 #include <sys/socketvar.h>
 135 #include <sys/resourcevar.h>
 136 #include <net/route.h>
 137 #include <sys/signalvar.h>
 138 #include <sys/stat.h>
 139 #include <sys/sx.h>
 140 #include <sys/sysctl.h>
 141 #include <sys/taskqueue.h>
 142 #include <sys/uio.h>
 143 #include <sys/jail.h>
 144 #include <sys/syslog.h>
 145 #include <netinet/in.h>
 146 #include <netinet/tcp.h>
 147
 148 #include <net/vnet.h>
 149
 150 #include <security/mac/mac_framework.h>
 151
 152 #include <vm/uma.h>
 153
 154 #ifdef COMPAT_FREEBSD32
 155 #include <sys/mount.h>
 156 #include <sys/sysent.h>
 157 #include <compat/freebsd32/freebsd32.h>
 158 #endif
 159
 160 static int      soreceive_rcvoob(struct socket *so, struct uio *uio,
 161                     int flags);
 162 static void     so_rdknl_lock(void *);
 163 static void     so_rdknl_unlock(void *);
 164 static void     so_rdknl_assert_locked(void *);
 165 static void     so_rdknl_assert_unlocked(void *);
 166 static void     so_wrknl_lock(void *);
 167 static void     so_wrknl_unlock(void *);
 168 static void     so_wrknl_assert_locked(void *);
 169 static void     so_wrknl_assert_unlocked(void *);
 170
 171 static void     filt_sordetach(struct knote *kn);
 172 static int      filt_soread(struct knote *kn, long hint);
 173 static void     filt_sowdetach(struct knote *kn);
 174 static int      filt_sowrite(struct knote *kn, long hint);
 175 static int      filt_soempty(struct knote *kn, long hint);
 176 static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
 177 fo_kqfilter_t   soo_kqfilter;
 178
 179 static struct filterops soread_filtops = {
 180         .f_isfd = 1,
 181         .f_detach = filt_sordetach,
 182         .f_event = filt_soread,
 183 };
 184 static struct filterops sowrite_filtops = {
 185         .f_isfd = 1,
 186         .f_detach = filt_sowdetach,
 187         .f_event = filt_sowrite,
 188 };
 189 static struct filterops soempty_filtops = {
 190         .f_isfd = 1,
 191         .f_detach = filt_sowdetach,
 192         .f_event = filt_soempty,
 193 };
 194
 195 so_gen_t        so_gencnt;      /* generation count for sockets */
 196
 197 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
 198 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 199
 200 #define VNET_SO_ASSERT(so)                                              \
 201         VNET_ASSERT(curvnet != NULL,                                    \
 202             ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
 203
 204 VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]);
 205 #define V_socket_hhh            VNET(socket_hhh)
 206
 207 /*
 208  * Limit on the number of connections in the listen queue waiting
 209  * for accept(2).
 210  * NB: The original sysctl somaxconn is still available but hidden
 211  * to prevent confusion about the actual purpose of this number.
 212  */
 213 static u_int somaxconn = SOMAXCONN;
 214
 215 static int
 216 sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
 217 {
 218         int error;
 219         int val;
 220
 221         val = somaxconn;
 222         error = sysctl_handle_int(oidp, &val, 0, req);
 223         if (error || !req->newptr )
 224                 return (error);
 225
 226         /*
 227          * The purpose of the UINT_MAX / 3 limit, is so that the formula
 228          *   3 * so_qlimit / 2
 229          * below, will not overflow.
 230          */
 231
 232         if (val < 1 || val > UINT_MAX / 3)
 233                 return (EINVAL);
 234
 235         somaxconn = val;
 236         return (0);
 237 }
 238 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW,
 239     0, sizeof(int), sysctl_somaxconn, "I",
 240     "Maximum listen socket pending connection accept queue size");
 241 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
 242     CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP,
 243     0, sizeof(int), sysctl_somaxconn, "I",
 244     "Maximum listen socket pending connection accept queue size (compat)");
 245
 246 static int numopensockets;
 247 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
 248     &numopensockets, 0, "Number of open sockets");
 249
 250 /*
 251  * accept_mtx locks down per-socket fields relating to accept queues.  See
 252  * socketvar.h for an annotation of the protected fields of struct socket.
 253  */
 254 struct mtx accept_mtx;
 255 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
 256
 257 /*
 258  * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 259  * so_gencnt field.
 260  */
 261 static struct mtx so_global_mtx;
 262 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
 263
 264 /*
 265  * General IPC sysctl name space, used by sockets and a variety of other IPC
 266  * types.
 267  */
 268 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
 269
 270 /*
 271  * Initialize the socket subsystem and set up the socket
 272  * memory allocator.
 273  */
 274 static uma_zone_t socket_zone;
 275 int     maxsockets;
 276
 277 static void
 278 socket_zone_change(void *tag)
 279 {
 280
 281         maxsockets = uma_zone_set_max(socket_zone, maxsockets);
 282 }
 283
 284 static void
 285 socket_hhook_register(int subtype)
 286 {
 287
 288         if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype,
 289             &V_socket_hhh[subtype],
 290             HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 291                 printf("%s: WARNING: unable to register hook\n", __func__);
 292 }
 293
 294 static void
 295 socket_hhook_deregister(int subtype)
 296 {
 297
 298         if (hhook_head_deregister(V_socket_hhh[subtype]) != 0)
 299                 printf("%s: WARNING: unable to deregister hook\n", __func__);
 300 }
 301
 302 static void
 303 socket_init(void *tag)
 304 {
 305
 306         socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
 307             NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 308         maxsockets = uma_zone_set_max(socket_zone, maxsockets);
 309         uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
 310         EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
 311             EVENTHANDLER_PRI_FIRST);
 312 }
 313 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
 314
 315 static void
 316 socket_vnet_init(const void *unused __unused)
 317 {
 318         int i;
 319
 320         /* We expect a contiguous range */
 321         for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
 322                 socket_hhook_register(i);
 323 }
 324 VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
 325     socket_vnet_init, NULL);
 326
 327 static void
 328 socket_vnet_uninit(const void *unused __unused)
 329 {
 330         int i;
 331
 332         for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
 333                 socket_hhook_deregister(i);
 334 }
 335 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
 336     socket_vnet_uninit, NULL);
 337
 338 /*
 339  * Initialise maxsockets.  This SYSINIT must be run after
 340  * tunable_mbinit().
 341  */
 342 static void
 343 init_maxsockets(void *ignored)
 344 {
 345
 346         TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
 347         maxsockets = imax(maxsockets, maxfiles);
 348 }
 349 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
 350
 351 /*
 352  * Sysctl to get and set the maximum global sockets limit.  Notify protocols
 353  * of the change so that they can update their dependent limits as required.
 354  */
 355 static int
 356 sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
 357 {
 358         int error, newmaxsockets;
 359
 360         newmaxsockets = maxsockets;
 361         error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
 362         if (error == 0 && req->newptr) {
 363                 if (newmaxsockets > maxsockets &&
 364                     newmaxsockets <= maxfiles) {
 365                         maxsockets = newmaxsockets;
 366                         EVENTHANDLER_INVOKE(maxsockets_change);
 367                 } else
 368                         error = EINVAL;
 369         }
 370         return (error);
 371 }
 372 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
 373     &maxsockets, 0, sysctl_maxsockets, "IU",
 374     "Maximum number of sockets available");
 375
 376 /*
 377  * Socket operation routines.  These routines are called by the routines in
 378  * sys_socket.c or from a system process, and implement the semantics of
 379  * socket operations by switching out to the protocol specific routines.
 380  */
 381
 382 /*
 383  * Get a socket structure from our zone, and initialize it.  Note that it
 384  * would probably be better to allocate socket and PCB at the same time, but
 385  * I'm not convinced that all the protocols can be easily modified to do
 386  * this.
 387  *
 388  * soalloc() returns a socket with a ref count of 0.
 389  */
 390 static struct socket *
 391 soalloc(struct vnet *vnet)
 392 {
 393         struct socket *so;
 394
 395         so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
 396         if (so == NULL)
 397                 return (NULL);
 398 #ifdef MAC
 399         if (mac_socket_init(so, M_NOWAIT) != 0) {
 400                 uma_zfree(socket_zone, so);
 401                 return (NULL);
 402         }
 403 #endif
 404         if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) {
 405                 uma_zfree(socket_zone, so);
 406                 return (NULL);
 407         }
 408
 409         /*
 410          * The socket locking protocol allows to lock 2 sockets at a time,
 411          * however, the first one must be a listening socket.  WITNESS lacks
 412          * a feature to change class of an existing lock, so we use DUPOK.
 413          */
 414         mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
 415         SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
 416         SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
 417         so->so_rcv.sb_sel = &so->so_rdsel;
 418         so->so_snd.sb_sel = &so->so_wrsel;
 419         sx_init(&so->so_snd.sb_sx, "so_snd_sx");
 420         sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
 421         TAILQ_INIT(&so->so_snd.sb_aiojobq);
 422         TAILQ_INIT(&so->so_rcv.sb_aiojobq);
 423         TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so);
 424         TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so);
 425 #ifdef VIMAGE
 426         VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
 427             __func__, __LINE__, so));
 428         so->so_vnet = vnet;
 429 #endif
 430         /* We shouldn't need the so_global_mtx */
 431         if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) {
 432                 /* Do we need more comprehensive error returns? */
 433                 uma_zfree(socket_zone, so);
 434                 return (NULL);
 435         }
 436         mtx_lock(&so_global_mtx);
 437         so->so_gencnt = ++so_gencnt;
 438         ++numopensockets;
 439 #ifdef VIMAGE
 440         vnet->vnet_sockcnt++;
 441 #endif
 442         mtx_unlock(&so_global_mtx);
 443
 444         return (so);
 445 }
 446
 447 /*
 448  * Free the storage associated with a socket at the socket layer, tear down
 449  * locks, labels, etc.  All protocol state is assumed already to have been
 450  * torn down (and possibly never set up) by the caller.
 451  */
 452 static void
 453 sodealloc(struct socket *so)
 454 {
 455
 456         KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
 457         KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
 458
 459         mtx_lock(&so_global_mtx);
 460         so->so_gencnt = ++so_gencnt;
 461         --numopensockets;       /* Could be below, but faster here. */
 462 #ifdef VIMAGE
 463         VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
 464             __func__, __LINE__, so));
 465         so->so_vnet->vnet_sockcnt--;
 466 #endif
 467         mtx_unlock(&so_global_mtx);
 468 #ifdef MAC
 469         mac_socket_destroy(so);
 470 #endif
 471         hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE);
 472
 473         crfree(so->so_cred);
 474         khelp_destroy_osd(&so->osd);
 475         if (SOLISTENING(so)) {
 476                 if (so->sol_accept_filter != NULL)
 477                         accept_filt_setopt(so, NULL);
 478         } else {
 479                 if (so->so_rcv.sb_hiwat)
 480                         (void)chgsbsize(so->so_cred->cr_uidinfo,
 481                             &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
 482                 if (so->so_snd.sb_hiwat)
 483                         (void)chgsbsize(so->so_cred->cr_uidinfo,
 484                             &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
 485                 sx_destroy(&so->so_snd.sb_sx);
 486                 sx_destroy(&so->so_rcv.sb_sx);
 487                 SOCKBUF_LOCK_DESTROY(&so->so_snd);
 488                 SOCKBUF_LOCK_DESTROY(&so->so_rcv);
 489         }
 490         mtx_destroy(&so->so_lock);
 491         uma_zfree(socket_zone, so);
 492 }
 493
 494 /*
 495  * socreate returns a socket with a ref count of 1.  The socket should be
 496  * closed with soclose().
 497  */
 498 int
 499 socreate(int dom, struct socket **aso, int type, int proto,
 500     struct ucred *cred, struct thread *td)
 501 {
 502         struct protosw *prp;
 503         struct socket *so;
 504         int error;
 505
 506         if (proto)
 507                 prp = pffindproto(dom, proto, type);
 508         else
 509                 prp = pffindtype(dom, type);
 510
 511         if (prp == NULL) {
 512                 /* No support for domain. */
 513                 if (pffinddomain(dom) == NULL)
 514                         return (EAFNOSUPPORT);
 515                 /* No support for socket type. */
 516                 if (proto == 0 && type != 0)
 517                         return (EPROTOTYPE);
 518                 return (EPROTONOSUPPORT);
 519         }
 520         if (prp->pr_usrreqs->pru_attach == NULL ||
 521             prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
 522                 return (EPROTONOSUPPORT);
 523
 524         if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
 525                 return (EPROTONOSUPPORT);
 526
 527         if (prp->pr_type != type)
 528                 return (EPROTOTYPE);
 529         so = soalloc(CRED_TO_VNET(cred));
 530         if (so == NULL)
 531                 return (ENOBUFS);
 532
 533         so->so_type = type;
 534         so->so_cred = crhold(cred);
 535         if ((prp->pr_domain->dom_family == PF_INET) ||
 536             (prp->pr_domain->dom_family == PF_INET6) ||
 537             (prp->pr_domain->dom_family == PF_ROUTE))
 538                 so->so_fibnum = td->td_proc->p_fibnum;
 539         else
 540                 so->so_fibnum = 0;
 541         so->so_proto = prp;
 542 #ifdef MAC
 543         mac_socket_create(cred, so);
 544 #endif
 545         knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
 546             so_rdknl_assert_locked, so_rdknl_assert_unlocked);
 547         knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
 548             so_wrknl_assert_locked, so_wrknl_assert_unlocked);
 549         /*
 550          * Auto-sizing of socket buffers is managed by the protocols and
 551          * the appropriate flags must be set in the pru_attach function.
 552          */
 553         CURVNET_SET(so->so_vnet);
 554         error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
 555         CURVNET_RESTORE();
 556         if (error) {
 557                 sodealloc(so);
 558                 return (error);
 559         }
 560         soref(so);
 561         *aso = so;
 562         return (0);
 563 }
 564
 565 #ifdef REGRESSION
 566 static int regression_sonewconn_earlytest = 1;
 567 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
 568     &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
 569 #endif
 570
 571 /*
 572  * When an attempt at a new connection is noted on a socket which accepts
 573  * connections, sonewconn is called.  If the connection is possible (subject
 574  * to space constraints, etc.) then we allocate a new structure, properly
 575  * linked into the data structure of the original socket, and return this.
 576  * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
 577  *
 578  * Note: the ref count on the socket is 0 on return.
 579  */
 580 struct socket *
 581 sonewconn(struct socket *head, int connstatus)
 582 {
 583         static struct timeval lastover;
 584         static struct timeval overinterval = { 60, 0 };
 585         static int overcount;
 586
 587         struct socket *so;
 588         u_int over;
 589
 590         SOLISTEN_LOCK(head);
 591         over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
 592         SOLISTEN_UNLOCK(head);
 593 #ifdef REGRESSION
 594         if (regression_sonewconn_earlytest && over) {
 595 #else
 596         if (over) {
 597 #endif
 598                 overcount++;
 599
 600                 if (ratecheck(&lastover, &overinterval)) {
 601                         log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
 602                             "%i already in queue awaiting acceptance "
 603                             "(%d occurrences)\n",
 604                             __func__, head->so_pcb, head->sol_qlen, overcount);
 605
 606                         overcount = 0;
 607                 }
 608
 609                 return (NULL);
 610         }
 611         VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
 612             __func__, head));
 613         so = soalloc(head->so_vnet);
 614         if (so == NULL) {
 615                 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
 616                     "limit reached or out of memory\n",
 617                     __func__, head->so_pcb);
 618                 return (NULL);
 619         }
 620         so->so_listen = head;
 621         so->so_type = head->so_type;
 622         so->so_linger = head->so_linger;
 623         so->so_state = head->so_state | SS_NOFDREF;
 624         so->so_fibnum = head->so_fibnum;
 625         so->so_proto = head->so_proto;
 626         so->so_cred = crhold(head->so_cred);
 627 #ifdef MAC
 628         mac_socket_newconn(head, so);
 629 #endif
 630         knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
 631             so_rdknl_assert_locked, so_rdknl_assert_unlocked);
 632         knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
 633             so_wrknl_assert_locked, so_wrknl_assert_unlocked);
 634         VNET_SO_ASSERT(head);
 635         if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
 636                 sodealloc(so);
 637                 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
 638                     __func__, head->so_pcb);
 639                 return (NULL);
 640         }
 641         if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
 642                 sodealloc(so);
 643                 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
 644                     __func__, head->so_pcb);
 645                 return (NULL);
 646         }
 647         so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
 648         so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
 649         so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
 650         so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
 651         so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE;
 652         so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE;
 653
 654         SOLISTEN_LOCK(head);
 655         if (head->sol_accept_filter != NULL)
 656                 connstatus = 0;
 657         so->so_state |= connstatus;
 658         so->so_options = head->so_options & ~SO_ACCEPTCONN;
 659         soref(head); /* A socket on (in)complete queue refs head. */
 660         if (connstatus) {
 661                 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
 662                 so->so_qstate = SQ_COMP;
 663                 head->sol_qlen++;
 664                 solisten_wakeup(head);  /* unlocks */
 665         } else {
 666                 /*
 667                  * Keep removing sockets from the head until there's room for
 668                  * us to insert on the tail.  In pre-locking revisions, this
 669                  * was a simple if(), but as we could be racing with other
 670                  * threads and soabort() requires dropping locks, we must
 671                  * loop waiting for the condition to be true.
 672                  */
 673                 while (head->sol_incqlen > head->sol_qlimit) {
 674                         struct socket *sp;
 675
 676                         sp = TAILQ_FIRST(&head->sol_incomp);
 677                         TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
 678                         head->sol_incqlen--;
 679                         SOCK_LOCK(sp);
 680                         sp->so_qstate = SQ_NONE;
 681                         sp->so_listen = NULL;
 682                         SOCK_UNLOCK(sp);
 683                         sorele(head);   /* does SOLISTEN_UNLOCK, head stays */
 684                         soabort(sp);
 685                         SOLISTEN_LOCK(head);
 686                 }
 687                 TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
 688                 so->so_qstate = SQ_INCOMP;
 689                 head->sol_incqlen++;
 690                 SOLISTEN_UNLOCK(head);
 691         }
 692         return (so);
 693 }
 694
 695 #ifdef SCTP
 696 /*
 697  * Socket part of sctp_peeloff().  Detach a new socket from an
 698  * association.  The new socket is returned with a reference.
 699  */
 700 struct socket *
 701 sopeeloff(struct socket *head)
 702 {
 703         struct socket *so;
 704
 705         VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
 706             __func__, __LINE__, head));
 707         so = soalloc(head->so_vnet);
 708         if (so == NULL) {
 709                 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
 710                     "limit reached or out of memory\n",
 711                     __func__, head->so_pcb);
 712                 return (NULL);
 713         }
 714         so->so_type = head->so_type;
 715         so->so_options = head->so_options;
 716         so->so_linger = head->so_linger;
 717         so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
 718         so->so_fibnum = head->so_fibnum;
 719         so->so_proto = head->so_proto;
 720         so->so_cred = crhold(head->so_cred);
 721 #ifdef MAC
 722         mac_socket_newconn(head, so);
 723 #endif
 724         knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
 725             so_rdknl_assert_locked, so_rdknl_assert_unlocked);
 726         knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
 727             so_wrknl_assert_locked, so_wrknl_assert_unlocked);
 728         VNET_SO_ASSERT(head);
 729         if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
 730                 sodealloc(so);
 731                 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
 732                     __func__, head->so_pcb);
 733                 return (NULL);
 734         }
 735         if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
 736                 sodealloc(so);
 737                 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
 738                     __func__, head->so_pcb);
 739                 return (NULL);
 740         }
 741         so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
 742         so->so_snd.sb_lowat = head->so_snd.sb_lowat;
 743         so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
 744         so->so_snd.sb_timeo = head->so_snd.sb_timeo;
 745         so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
 746         so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
 747
 748         soref(so);
 749
 750         return (so);
 751 }
 752 #endif  /* SCTP */
 753
 754 int
 755 sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
 756 {
 757         int error;
 758
 759         CURVNET_SET(so->so_vnet);
 760         error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
 761         CURVNET_RESTORE();
 762         return (error);
 763 }
 764
 765 int
 766 sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
 767 {
 768         int error;
 769
 770         CURVNET_SET(so->so_vnet);
 771         error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td);
 772         CURVNET_RESTORE();
 773         return (error);
 774 }
 775
 776 /*
 777  * solisten() transitions a socket from a non-listening state to a listening
 778  * state, but can also be used to update the listen queue depth on an
 779  * existing listen socket.  The protocol will call back into the sockets
 780  * layer using solisten_proto_check() and solisten_proto() to check and set
 781  * socket-layer listen state.  Call backs are used so that the protocol can
 782  * acquire both protocol and socket layer locks in whatever order is required
 783  * by the protocol.
 784  *
 785  * Protocol implementors are advised to hold the socket lock across the
 786  * socket-layer test and set to avoid races at the socket layer.
 787  */
 788 int
 789 solisten(struct socket *so, int backlog, struct thread *td)
 790 {
 791         int error;
 792
 793         CURVNET_SET(so->so_vnet);
 794         error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
 795         CURVNET_RESTORE();
 796         return (error);
 797 }
 798
 799 int
 800 solisten_proto_check(struct socket *so)
 801 {
 802
 803         SOCK_LOCK_ASSERT(so);
 804
 805         if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
 806             SS_ISDISCONNECTING))
 807                 return (EINVAL);
 808         return (0);
 809 }
 810
 811 void
 812 solisten_proto(struct socket *so, int backlog)
 813 {
 814         int sbrcv_lowat, sbsnd_lowat;
 815         u_int sbrcv_hiwat, sbsnd_hiwat;
 816         short sbrcv_flags, sbsnd_flags;
 817         sbintime_t sbrcv_timeo, sbsnd_timeo;
 818
 819         SOCK_LOCK_ASSERT(so);
 820
 821         if (SOLISTENING(so))
 822                 goto listening;
 823
 824         /*
 825          * Change this socket to listening state.
 826          */
 827         sbrcv_lowat = so->so_rcv.sb_lowat;
 828         sbsnd_lowat = so->so_snd.sb_lowat;
 829         sbrcv_hiwat = so->so_rcv.sb_hiwat;
 830         sbsnd_hiwat = so->so_snd.sb_hiwat;
 831         sbrcv_flags = so->so_rcv.sb_flags;
 832         sbsnd_flags = so->so_snd.sb_flags;
 833         sbrcv_timeo = so->so_rcv.sb_timeo;
 834         sbsnd_timeo = so->so_snd.sb_timeo;
 835
 836         sbdestroy(&so->so_snd, so);
 837         sbdestroy(&so->so_rcv, so);
 838         sx_destroy(&so->so_snd.sb_sx);
 839         sx_destroy(&so->so_rcv.sb_sx);
 840         SOCKBUF_LOCK_DESTROY(&so->so_snd);
 841         SOCKBUF_LOCK_DESTROY(&so->so_rcv);
 842
 843 #ifdef INVARIANTS
 844         bzero(&so->so_rcv,
 845             sizeof(struct socket) - offsetof(struct socket, so_rcv));
 846 #endif
 847
 848         so->sol_sbrcv_lowat = sbrcv_lowat;
 849         so->sol_sbsnd_lowat = sbsnd_lowat;
 850         so->sol_sbrcv_hiwat = sbrcv_hiwat;
 851         so->sol_sbsnd_hiwat = sbsnd_hiwat;
 852         so->sol_sbrcv_flags = sbrcv_flags;
 853         so->sol_sbsnd_flags = sbsnd_flags;
 854         so->sol_sbrcv_timeo = sbrcv_timeo;
 855         so->sol_sbsnd_timeo = sbsnd_timeo;
 856
 857         so->sol_qlen = so->sol_incqlen = 0;
 858         TAILQ_INIT(&so->sol_incomp);
 859         TAILQ_INIT(&so->sol_comp);
 860
 861         so->sol_accept_filter = NULL;
 862         so->sol_accept_filter_arg = NULL;
 863         so->sol_accept_filter_str = NULL;
 864
 865         so->sol_upcall = NULL;
 866         so->sol_upcallarg = NULL;
 867
 868         so->so_options |= SO_ACCEPTCONN;
 869
 870 listening:
 871         if (backlog < 0 || backlog > somaxconn)
 872                 backlog = somaxconn;
 873         so->sol_qlimit = backlog;
 874 }
 875
 876 /*
 877  * Wakeup listeners/subsystems once we have a complete connection.
 878  * Enters with lock, returns unlocked.
 879  */
 880 void
 881 solisten_wakeup(struct socket *sol)
 882 {
 883
 884         if (sol->sol_upcall != NULL)
 885                 (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
 886         else {
 887                 selwakeuppri(&sol->so_rdsel, PSOCK);
 888                 KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
 889         }
 890         SOLISTEN_UNLOCK(sol);
 891         wakeup_one(&sol->sol_comp);
 892         if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL)
 893                 pgsigio(&sol->so_sigio, SIGIO, 0);
 894 }
 895
 896 /*
 897  * Return single connection off a listening socket queue.  Main consumer of
 898  * the function is kern_accept4().  Some modules, that do their own accept
 899  * management also use the function.
 900  *
 901  * Listening socket must be locked on entry and is returned unlocked on
 902  * return.
 903  * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT.
 904  */
 905 int
 906 solisten_dequeue(struct socket *head, struct socket **ret, int flags)
 907 {
 908         struct socket *so;
 909         int error;
 910
 911         SOLISTEN_LOCK_ASSERT(head);
 912
 913         while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
 914             head->so_error == 0) {
 915                 error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH,
 916                     "accept", 0);
 917                 if (error != 0) {
 918                         SOLISTEN_UNLOCK(head);
 919                         return (error);
 920                 }
 921         }
 922         if (head->so_error) {
 923                 error = head->so_error;
 924                 head->so_error = 0;
 925         } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp))
 926                 error = EWOULDBLOCK;
 927         else
 928                 error = 0;
 929         if (error) {
 930                 SOLISTEN_UNLOCK(head);
 931                 return (error);
 932         }
 933         so = TAILQ_FIRST(&head->sol_comp);
 934         SOCK_LOCK(so);
 935         KASSERT(so->so_qstate == SQ_COMP,
 936             ("%s: so %p not SQ_COMP", __func__, so));
 937         soref(so);
 938         head->sol_qlen--;
 939         so->so_qstate = SQ_NONE;
 940         so->so_listen = NULL;
 941         TAILQ_REMOVE(&head->sol_comp, so, so_list);
 942         if (flags & ACCEPT4_INHERIT)
 943                 so->so_state |= (head->so_state & SS_NBIO);
 944         else
 945                 so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
 946         SOCK_UNLOCK(so);
 947         sorele(head);
 948
 949         *ret = so;
 950         return (0);
 951 }
 952
 953 /*
 954  * Evaluate the reference count and named references on a socket; if no
 955  * references remain, free it.  This should be called whenever a reference is
 956  * released, such as in sorele(), but also when named reference flags are
 957  * cleared in socket or protocol code.
 958  *
 959  * sofree() will free the socket if:
 960  *
 961  * - There are no outstanding file descriptor references or related consumers
 962  *   (so_count == 0).
 963  *
 964  * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 965  *
 966  * - The protocol does not have an outstanding strong reference on the socket
 967  *   (SS_PROTOREF).
 968  *
 969  * - The socket is not in a completed connection queue, so a process has been
 970  *   notified that it is present.  If it is removed, the user process may
 971  *   block in accept() despite select() saying the socket was ready.
 972  */
 973 void
 974 sofree(struct socket *so)
 975 {
 976         struct protosw *pr = so->so_proto;
 977
 978         SOCK_LOCK_ASSERT(so);
 979
 980         if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
 981             (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) {
 982                 SOCK_UNLOCK(so);
 983                 return;
 984         }
 985
 986         if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) {
 987                 struct socket *sol;
 988
 989                 sol = so->so_listen;
 990                 KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so));
 991
 992                 /*
 993                  * To solve race between close of a listening socket and
 994                  * a socket on its incomplete queue, we need to lock both.
 995                  * The order is first listening socket, then regular.
 996                  * Since we don't have SS_NOFDREF neither SS_PROTOREF, this
 997                  * function and the listening socket are the only pointers
 998                  * to so.  To preserve so and sol, we reference both and then
 999                  * relock.
1000                  * After relock the socket may not move to so_comp since it
1001                  * doesn't have PCB already, but it may be removed from
1002                  * so_incomp. If that happens, we share responsiblity on
1003                  * freeing the socket, but soclose() has already removed
1004                  * it from queue.
1005                  */
1006                 soref(sol);
1007                 soref(so);
1008                 SOCK_UNLOCK(so);
1009                 SOLISTEN_LOCK(sol);
1010                 SOCK_LOCK(so);
1011                 if (so->so_qstate == SQ_INCOMP) {
1012                         KASSERT(so->so_listen == sol,
1013                             ("%s: so %p migrated out of sol %p",
1014                             __func__, so, sol));
1015                         TAILQ_REMOVE(&sol->sol_incomp, so, so_list);
1016                         sol->sol_incqlen--;
1017                         /* This is guarenteed not to be the last. */
1018                         refcount_release(&sol->so_count);
1019                         so->so_qstate = SQ_NONE;
1020                         so->so_listen = NULL;
1021                 } else
1022                         KASSERT(so->so_listen == NULL,
1023                             ("%s: so %p not on (in)comp with so_listen",
1024                             __func__, so));
1025                 sorele(sol);
1026                 KASSERT(so->so_count == 1,
1027                     ("%s: so %p count %u", __func__, so, so->so_count));
1028                 so->so_count = 0;
1029         }
1030         if (SOLISTENING(so))
1031                 so->so_error = ECONNABORTED;
1032         SOCK_UNLOCK(so);
1033
1034         if (so->so_dtor != NULL)
1035                 so->so_dtor(so);
1036
1037         VNET_SO_ASSERT(so);
1038         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1039                 (*pr->pr_domain->dom_dispose)(so);
1040         if (pr->pr_usrreqs->pru_detach != NULL)
1041                 (*pr->pr_usrreqs->pru_detach)(so);
1042
1043         /*
1044          * From this point on, we assume that no other references to this
1045          * socket exist anywhere else in the stack.  Therefore, no locks need
1046          * to be acquired or held.
1047          *
1048          * We used to do a lot of socket buffer and socket locking here, as
1049          * well as invoke sorflush() and perform wakeups.  The direct call to
1050          * dom_dispose() and sbdestroy() are an inlining of what was
1051          * necessary from sorflush().
1052          *
1053          * Notice that the socket buffer and kqueue state are torn down
1054          * before calling pru_detach.  This means that protocols shold not
1055          * assume they can perform socket wakeups, etc, in their detach code.
1056          */
1057         if (!SOLISTENING(so)) {
1058                 sbdestroy(&so->so_snd, so);
1059                 sbdestroy(&so->so_rcv, so);
1060         }
1061         seldrain(&so->so_rdsel);
1062         seldrain(&so->so_wrsel);
1063         knlist_destroy(&so->so_rdsel.si_note);
1064         knlist_destroy(&so->so_wrsel.si_note);
1065         sodealloc(so);
1066 }
1067
1068 /*
1069  * Close a socket on last file table reference removal.  Initiate disconnect
1070  * if connected.  Free socket when disconnect complete.
1071  *
1072  * This function will sorele() the socket.  Note that soclose() may be called
1073  * prior to the ref count reaching zero.  The actual socket structure will
1074  * not be freed until the ref count reaches zero.
1075  */
1076 int
1077 soclose(struct socket *so)
1078 {
1079         struct accept_queue lqueue;
1080         bool listening;
1081         int error = 0;
1082
1083         KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
1084
1085         CURVNET_SET(so->so_vnet);
1086         funsetown(&so->so_sigio);
1087         if (so->so_state & SS_ISCONNECTED) {
1088                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1089                         error = sodisconnect(so);
1090                         if (error) {
1091                                 if (error == ENOTCONN)
1092                                         error = 0;
1093                                 goto drop;
1094                         }
1095                 }
1096                 if (so->so_options & SO_LINGER) {
1097                         if ((so->so_state & SS_ISDISCONNECTING) &&
1098                             (so->so_state & SS_NBIO))
1099                                 goto drop;
1100                         while (so->so_state & SS_ISCONNECTED) {
1101                                 error = tsleep(&so->so_timeo,
1102                                     PSOCK | PCATCH, "soclos",
1103                                     so->so_linger * hz);
1104                                 if (error)
1105                                         break;
1106                         }
1107                 }
1108         }
1109
1110 drop:
1111         if (so->so_proto->pr_usrreqs->pru_close != NULL)
1112                 (*so->so_proto->pr_usrreqs->pru_close)(so);
1113
1114         SOCK_LOCK(so);
1115         if ((listening = (so->so_options & SO_ACCEPTCONN))) {
1116                 struct socket *sp;
1117
1118                 TAILQ_INIT(&lqueue);
1119                 TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list);
1120                 TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list);
1121
1122                 so->sol_qlen = so->sol_incqlen = 0;
1123
1124                 TAILQ_FOREACH(sp, &lqueue, so_list) {
1125                         SOCK_LOCK(sp);
1126                         sp->so_qstate = SQ_NONE;
1127                         sp->so_listen = NULL;
1128                         SOCK_UNLOCK(sp);
1129                         /* Guaranteed not to be the last. */
1130                         refcount_release(&so->so_count);
1131                 }
1132         }
1133         KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
1134         so->so_state |= SS_NOFDREF;
1135         sorele(so);
1136         if (listening) {
1137                 struct socket *sp, *tsp;
1138
1139                 TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) {
1140                         SOCK_LOCK(sp);
1141                         if (sp->so_count == 0) {
1142                                 SOCK_UNLOCK(sp);
1143                                 soabort(sp);
1144                         } else
1145                                 /* sp is now in sofree() */
1146                                 SOCK_UNLOCK(sp);
1147                 }
1148         }
1149         CURVNET_RESTORE();
1150         return (error);
1151 }
1152
1153 /*
1154  * soabort() is used to abruptly tear down a connection, such as when a
1155  * resource limit is reached (listen queue depth exceeded), or if a listen
1156  * socket is closed while there are sockets waiting to be accepted.
1157  *
1158  * This interface is tricky, because it is called on an unreferenced socket,
1159  * and must be called only by a thread that has actually removed the socket
1160  * from the listen queue it was on, or races with other threads are risked.
1161  *
1162  * This interface will call into the protocol code, so must not be called
1163  * with any socket locks held.  Protocols do call it while holding their own
1164  * recursible protocol mutexes, but this is something that should be subject
1165  * to review in the future.
1166  */
1167 void
1168 soabort(struct socket *so)
1169 {
1170
1171         /*
1172          * In as much as is possible, assert that no references to this
1173          * socket are held.  This is not quite the same as asserting that the
1174          * current thread is responsible for arranging for no references, but
1175          * is as close as we can get for now.
1176          */
1177         KASSERT(so->so_count == 0, ("soabort: so_count"));
1178         KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
1179         KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
1180         VNET_SO_ASSERT(so);
1181
1182         if (so->so_proto->pr_usrreqs->pru_abort != NULL)
1183                 (*so->so_proto->pr_usrreqs->pru_abort)(so);
1184         SOCK_LOCK(so);
1185         sofree(so);
1186 }
1187
1188 int
1189 soaccept(struct socket *so, struct sockaddr **nam)
1190 {
1191         int error;
1192
1193         SOCK_LOCK(so);
1194         KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
1195         so->so_state &= ~SS_NOFDREF;
1196         SOCK_UNLOCK(so);
1197
1198         CURVNET_SET(so->so_vnet);
1199         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1200         CURVNET_RESTORE();
1201         return (error);
1202 }
1203
1204 int
1205 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
1206 {
1207
1208         return (soconnectat(AT_FDCWD, so, nam, td));
1209 }
1210
1211 int
1212 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
1213 {
1214         int error;
1215
1216         if (so->so_options & SO_ACCEPTCONN)
1217                 return (EOPNOTSUPP);
1218
1219         CURVNET_SET(so->so_vnet);
1220         /*
1221          * If protocol is connection-based, can only connect once.
1222          * Otherwise, if connected, try to disconnect first.  This allows
1223          * user to disconnect by connecting to, e.g., a null address.
1224          */
1225         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1226             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1227             (error = sodisconnect(so)))) {
1228                 error = EISCONN;
1229         } else {
1230                 /*
1231                  * Prevent accumulated error from previous connection from
1232                  * biting us.
1233                  */
1234                 so->so_error = 0;
1235                 if (fd == AT_FDCWD) {
1236                         error = (*so->so_proto->pr_usrreqs->pru_connect)(so,
1237                             nam, td);
1238                 } else {
1239                         error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd,
1240                             so, nam, td);
1241                 }
1242         }
1243         CURVNET_RESTORE();
1244
1245         return (error);
1246 }
1247
1248 int
1249 soconnect2(struct socket *so1, struct socket *so2)
1250 {
1251         int error;
1252
1253         CURVNET_SET(so1->so_vnet);
1254         error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1255         CURVNET_RESTORE();
1256         return (error);
1257 }
1258
1259 int
1260 sodisconnect(struct socket *so)
1261 {
1262         int error;
1263
1264         if ((so->so_state & SS_ISCONNECTED) == 0)
1265                 return (ENOTCONN);
1266         if (so->so_state & SS_ISDISCONNECTING)
1267                 return (EALREADY);
1268         VNET_SO_ASSERT(so);
1269         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1270         return (error);
1271 }
1272
1273 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1274
1275 int
1276 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
1277     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1278 {
1279         long space;
1280         ssize_t resid;
1281         int clen = 0, error, dontroute;
1282
1283         KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
1284         KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
1285             ("sosend_dgram: !PR_ATOMIC"));
1286
1287         if (uio != NULL)
1288                 resid = uio->uio_resid;
1289         else
1290                 resid = top->m_pkthdr.len;
1291         /*
1292          * In theory resid should be unsigned.  However, space must be
1293          * signed, as it might be less than 0 if we over-committed, and we
1294          * must use a signed comparison of space and resid.  On the other
1295          * hand, a negative resid causes us to loop sending 0-length
1296          * segments to the protocol.
1297          */
1298         if (resid < 0) {
1299                 error = EINVAL;
1300                 goto out;
1301         }
1302
1303         dontroute =
1304             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
1305         if (td != NULL)
1306                 td->td_ru.ru_msgsnd++;
1307         if (control != NULL)
1308                 clen = control->m_len;
1309
1310         SOCKBUF_LOCK(&so->so_snd);
1311         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1312                 SOCKBUF_UNLOCK(&so->so_snd);
1313                 error = EPIPE;
1314                 goto out;
1315         }
1316         if (so->so_error) {
1317                 error = so->so_error;
1318                 so->so_error = 0;
1319                 SOCKBUF_UNLOCK(&so->so_snd);
1320                 goto out;
1321         }
1322         if ((so->so_state & SS_ISCONNECTED) == 0) {
1323                 /*
1324                  * `sendto' and `sendmsg' is allowed on a connection-based
1325                  * socket if it supports implied connect.  Return ENOTCONN if
1326                  * not connected and no address is supplied.
1327                  */
1328                 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1329                     (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1330                         if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1331                             !(resid == 0 && clen != 0)) {
1332                                 SOCKBUF_UNLOCK(&so->so_snd);
1333                                 error = ENOTCONN;
1334                                 goto out;
1335                         }
1336                 } else if (addr == NULL) {
1337                         if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1338                                 error = ENOTCONN;
1339                         else
1340                                 error = EDESTADDRREQ;
1341                         SOCKBUF_UNLOCK(&so->so_snd);
1342                         goto out;
1343                 }
1344         }
1345
1346         /*
1347          * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
1348          * problem and need fixing.
1349          */
1350         space = sbspace(&so->so_snd);
1351         if (flags & MSG_OOB)
1352                 space += 1024;
1353         space -= clen;
1354         SOCKBUF_UNLOCK(&so->so_snd);
1355         if (resid > space) {
1356                 error = EMSGSIZE;
1357                 goto out;
1358         }
1359         if (uio == NULL) {
1360                 resid = 0;
1361                 if (flags & MSG_EOR)
1362                         top->m_flags |= M_EOR;
1363         } else {
1364                 /*
1365                  * Copy the data from userland into a mbuf chain.
1366                  * If no data is to be copied in, a single empty mbuf
1367                  * is returned.
1368                  */
1369                 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1370                     (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1371                 if (top == NULL) {
1372                         error = EFAULT; /* only possible error */
1373                         goto out;
1374                 }
1375                 space -= resid - uio->uio_resid;
1376                 resid = uio->uio_resid;
1377         }
1378         KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1379         /*
1380          * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1381          * than with.
1382          */
1383         if (dontroute) {
1384                 SOCK_LOCK(so);
1385                 so->so_options |= SO_DONTROUTE;
1386                 SOCK_UNLOCK(so);
1387         }
1388         /*
1389          * XXX all the SBS_CANTSENDMORE checks previously done could be out
1390          * of date.  We could have received a reset packet in an interrupt or
1391          * maybe we slept while doing page faults in uiomove() etc.  We could
1392          * probably recheck again inside the locking protection here, but
1393          * there are probably other places that this also happens.  We must
1394          * rethink this.
1395          */
1396         VNET_SO_ASSERT(so);
1397         error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1398             (flags & MSG_OOB) ? PRUS_OOB :
1399         /*
1400          * If the user set MSG_EOF, the protocol understands this flag and
1401          * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
1402          */
1403             ((flags & MSG_EOF) &&
1404              (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1405              (resid <= 0)) ?
1406                 PRUS_EOF :
1407                 /* If there is more to send set PRUS_MORETOCOME */
1408                 (flags & MSG_MORETOCOME) ||
1409                 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1410                 top, addr, control, td);
1411         if (dontroute) {
1412                 SOCK_LOCK(so);
1413                 so->so_options &= ~SO_DONTROUTE;
1414                 SOCK_UNLOCK(so);
1415         }
1416         clen = 0;
1417         control = NULL;
1418         top = NULL;
1419 out:
1420         if (top != NULL)
1421                 m_freem(top);
1422         if (control != NULL)
1423                 m_freem(control);
1424         return (error);
1425 }
1426
1427 /*
1428  * Send on a socket.  If send must go all at once and message is larger than
1429  * send buffering, then hard error.  Lock against other senders.  If must go
1430  * all at once and not enough room now, then inform user that this would
1431  * block and do nothing.  Otherwise, if nonblocking, send as much as
1432  * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1433  * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1434  * in mbuf chain must be small enough to send all at once.
1435  *
1436  * Returns nonzero on error, timeout or signal; callers must check for short
1437  * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1438  * on return.
1439  */
1440 int
1441 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1442     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1443 {
1444         long space;
1445         ssize_t resid;
1446         int clen = 0, error, dontroute;
1447         int atomic = sosendallatonce(so) || top;
1448         int pru_flag;
1449 #ifdef KERN_TLS
1450         struct ktls_session *tls;
1451         int tls_enq_cnt, tls_pruflag;
1452         uint8_t tls_rtype;
1453
1454         tls = NULL;
1455         tls_rtype = TLS_RLTYPE_APP;
1456 #endif
1457         if (uio != NULL)
1458                 resid = uio->uio_resid;
1459         else
1460                 resid = top->m_pkthdr.len;
1461         /*
1462          * In theory resid should be unsigned.  However, space must be
1463          * signed, as it might be less than 0 if we over-committed, and we
1464          * must use a signed comparison of space and resid.  On the other
1465          * hand, a negative resid causes us to loop sending 0-length
1466          * segments to the protocol.
1467          *
1468          * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1469          * type sockets since that's an error.
1470          */
1471         if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1472                 error = EINVAL;
1473                 goto out;
1474         }
1475
1476         dontroute =
1477             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1478             (so->so_proto->pr_flags & PR_ATOMIC);
1479         if (td != NULL)
1480                 td->td_ru.ru_msgsnd++;
1481         if (control != NULL)
1482                 clen = control->m_len;
1483
1484         error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1485         if (error)
1486                 goto out;
1487
1488 #ifdef KERN_TLS
1489         tls_pruflag = 0;
1490         tls = ktls_hold(so->so_snd.sb_tls_info);
1491         if (tls != NULL) {
1492                 if (tls->sw_encrypt != NULL)
1493                         tls_pruflag = PRUS_NOTREADY;
1494
1495                 if (control != NULL) {
1496                         struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1497
1498                         if (clen >= sizeof(*cm) &&
1499                             cm->cmsg_type == TLS_SET_RECORD_TYPE) {
1500                                 tls_rtype = *((uint8_t *)CMSG_DATA(cm));
1501                                 clen = 0;
1502                                 m_freem(control);
1503                                 control = NULL;
1504                                 atomic = 1;
1505                         }
1506                 }
1507         }
1508 #endif
1509
1510 restart:
1511         do {
1512                 SOCKBUF_LOCK(&so->so_snd);
1513                 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1514                         SOCKBUF_UNLOCK(&so->so_snd);
1515                         error = EPIPE;
1516                         goto release;
1517                 }
1518                 if (so->so_error) {
1519                         error = so->so_error;
1520                         so->so_error = 0;
1521                         SOCKBUF_UNLOCK(&so->so_snd);
1522                         goto release;
1523                 }
1524                 if ((so->so_state & SS_ISCONNECTED) == 0) {
1525                         /*
1526                          * `sendto' and `sendmsg' is allowed on a connection-
1527                          * based socket if it supports implied connect.
1528                          * Return ENOTCONN if not connected and no address is
1529                          * supplied.
1530                          */
1531                         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1532                             (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1533                                 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1534                                     !(resid == 0 && clen != 0)) {
1535                                         SOCKBUF_UNLOCK(&so->so_snd);
1536                                         error = ENOTCONN;
1537                                         goto release;
1538                                 }
1539                         } else if (addr == NULL) {
1540                                 SOCKBUF_UNLOCK(&so->so_snd);
1541                                 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1542                                         error = ENOTCONN;
1543                                 else
1544                                         error = EDESTADDRREQ;
1545                                 goto release;
1546                         }
1547                 }
1548                 space = sbspace(&so->so_snd);
1549                 if (flags & MSG_OOB)
1550                         space += 1024;
1551                 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1552                     clen > so->so_snd.sb_hiwat) {
1553                         SOCKBUF_UNLOCK(&so->so_snd);
1554                         error = EMSGSIZE;
1555                         goto release;
1556                 }
1557                 if (space < resid + clen &&
1558                     (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1559                         if ((so->so_state & SS_NBIO) ||
1560                             (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
1561                                 SOCKBUF_UNLOCK(&so->so_snd);
1562                                 error = EWOULDBLOCK;
1563                                 goto release;
1564                         }
1565                         error = sbwait(&so->so_snd);
1566                         SOCKBUF_UNLOCK(&so->so_snd);
1567                         if (error)
1568                                 goto release;
1569                         goto restart;
1570                 }
1571                 SOCKBUF_UNLOCK(&so->so_snd);
1572                 space -= clen;
1573                 do {
1574                         if (uio == NULL) {
1575                                 resid = 0;
1576                                 if (flags & MSG_EOR)
1577                                         top->m_flags |= M_EOR;
1578                         } else {
1579                                 /*
1580                                  * Copy the data from userland into a mbuf
1581                                  * chain.  If resid is 0, which can happen
1582                                  * only if we have control to send, then
1583                                  * a single empty mbuf is returned.  This
1584                                  * is a workaround to prevent protocol send
1585                                  * methods to panic.
1586                                  */
1587 #ifdef KERN_TLS
1588                                 if (tls != NULL) {
1589                                         top = m_uiotombuf(uio, M_WAITOK, space,
1590                                             tls->params.max_frame_len,
1591                                             M_NOMAP |
1592                                             ((flags & MSG_EOR) ? M_EOR : 0));
1593                                         if (top != NULL) {
1594                                                 error = ktls_frame(top, tls,
1595                                                     &tls_enq_cnt, tls_rtype);
1596                                                 if (error) {
1597                                                         m_freem(top);
1598                                                         goto release;
1599                                                 }
1600                                         }
1601                                         tls_rtype = TLS_RLTYPE_APP;
1602                                 } else
1603 #endif
1604                                         top = m_uiotombuf(uio, M_WAITOK, space,
1605                                             (atomic ? max_hdr : 0),
1606                                             (atomic ? M_PKTHDR : 0) |
1607                                             ((flags & MSG_EOR) ? M_EOR : 0));
1608                                 if (top == NULL) {
1609                                         error = EFAULT; /* only possible error */
1610                                         goto release;
1611                                 }
1612                                 space -= resid - uio->uio_resid;
1613                                 resid = uio->uio_resid;
1614                         }
1615                         if (dontroute) {
1616                                 SOCK_LOCK(so);
1617                                 so->so_options |= SO_DONTROUTE;
1618                                 SOCK_UNLOCK(so);
1619                         }
1620                         /*
1621                          * XXX all the SBS_CANTSENDMORE checks previously
1622                          * done could be out of date.  We could have received
1623                          * a reset packet in an interrupt or maybe we slept
1624                          * while doing page faults in uiomove() etc.  We
1625                          * could probably recheck again inside the locking
1626                          * protection here, but there are probably other
1627                          * places that this also happens.  We must rethink
1628                          * this.
1629                          */
1630                         VNET_SO_ASSERT(so);
1631
1632                         pru_flag = (flags & MSG_OOB) ? PRUS_OOB :
1633                         /*
1634                          * If the user set MSG_EOF, the protocol understands
1635                          * this flag and nothing left to send then use
1636                          * PRU_SEND_EOF instead of PRU_SEND.
1637                          */
1638                             ((flags & MSG_EOF) &&
1639                              (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1640                              (resid <= 0)) ?
1641                                 PRUS_EOF :
1642                         /* If there is more to send set PRUS_MORETOCOME. */
1643                             (flags & MSG_MORETOCOME) ||
1644                             (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
1645
1646 #ifdef KERN_TLS
1647                         pru_flag |= tls_pruflag;
1648 #endif
1649
1650                         error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1651                             pru_flag, top, addr, control, td);
1652
1653                         if (dontroute) {
1654                                 SOCK_LOCK(so);
1655                                 so->so_options &= ~SO_DONTROUTE;
1656                                 SOCK_UNLOCK(so);
1657                         }
1658
1659 #ifdef KERN_TLS
1660                         if (tls != NULL && tls->sw_encrypt != NULL) {
1661                                 /*
1662                                  * Note that error is intentionally
1663                                  * ignored.
1664                                  *
1665                                  * Like sendfile(), we rely on the
1666                                  * completion routine (pru_ready())
1667                                  * to free the mbufs in the event that
1668                                  * pru_send() encountered an error and
1669                                  * did not append them to the sockbuf.
1670                                  */
1671                                 soref(so);
1672                                 ktls_enqueue(top, so, tls_enq_cnt);
1673                         }
1674 #endif
1675                         clen = 0;
1676                         control = NULL;
1677                         top = NULL;
1678                         if (error)
1679                                 goto release;
1680                 } while (resid && space > 0);
1681         } while (resid);
1682
1683 release:
1684         sbunlock(&so->so_snd);
1685 out:
1686 #ifdef KERN_TLS
1687         if (tls != NULL)
1688                 ktls_free(tls);
1689 #endif
1690         if (top != NULL)
1691                 m_freem(top);
1692         if (control != NULL)
1693                 m_freem(control);
1694         return (error);
1695 }
1696
1697 int
1698 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1699     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1700 {
1701         int error;
1702
1703         CURVNET_SET(so->so_vnet);
1704         if (!SOLISTENING(so))
1705                 error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio,
1706                     top, control, flags, td);
1707         else {
1708                 m_freem(top);
1709                 m_freem(control);
1710                 error = ENOTCONN;
1711         }
1712         CURVNET_RESTORE();
1713         return (error);
1714 }
1715
1716 /*
1717  * The part of soreceive() that implements reading non-inline out-of-band
1718  * data from a socket.  For more complete comments, see soreceive(), from
1719  * which this code originated.
1720  *
1721  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1722  * unable to return an mbuf chain to the caller.
1723  */
1724 static int
1725 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1726 {
1727         struct protosw *pr = so->so_proto;
1728         struct mbuf *m;
1729         int error;
1730
1731         KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1732         VNET_SO_ASSERT(so);
1733
1734         m = m_get(M_WAITOK, MT_DATA);
1735         error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1736         if (error)
1737                 goto bad;
1738         do {
1739                 error = uiomove(mtod(m, void *),
1740                     (int) min(uio->uio_resid, m->m_len), uio);
1741                 m = m_free(m);
1742         } while (uio->uio_resid && error == 0 && m);
1743 bad:
1744         if (m != NULL)
1745                 m_freem(m);
1746         return (error);
1747 }
1748
1749 /*
1750  * Following replacement or removal of the first mbuf on the first mbuf chain
1751  * of a socket buffer, push necessary state changes back into the socket
1752  * buffer so that other consumers see the values consistently.  'nextrecord'
1753  * is the callers locally stored value of the original value of
1754  * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1755  * NOTE: 'nextrecord' may be NULL.
1756  */
1757 static __inline void
1758 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1759 {
1760
1761         SOCKBUF_LOCK_ASSERT(sb);
1762         /*
1763          * First, update for the new value of nextrecord.  If necessary, make
1764          * it the first record.
1765          */
1766         if (sb->sb_mb != NULL)
1767                 sb->sb_mb->m_nextpkt = nextrecord;
1768         else
1769                 sb->sb_mb = nextrecord;
1770
1771         /*
1772          * Now update any dependent socket buffer fields to reflect the new
1773          * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1774          * addition of a second clause that takes care of the case where
1775          * sb_mb has been updated, but remains the last record.
1776          */
1777         if (sb->sb_mb == NULL) {
1778                 sb->sb_mbtail = NULL;
1779                 sb->sb_lastrecord = NULL;
1780         } else if (sb->sb_mb->m_nextpkt == NULL)
1781                 sb->sb_lastrecord = sb->sb_mb;
1782 }
1783
1784 /*
1785  * Implement receive operations on a socket.  We depend on the way that
1786  * records are added to the sockbuf by sbappend.  In particular, each record
1787  * (mbufs linked through m_next) must begin with an address if the protocol
1788  * so specifies, followed by an optional mbuf or mbufs containing ancillary
1789  * data, and then zero or more mbufs of data.  In order to allow parallelism
1790  * between network receive and copying to user space, as well as avoid
1791  * sleeping with a mutex held, we release the socket buffer mutex during the
1792  * user space copy.  Although the sockbuf is locked, new data may still be
1793  * appended, and thus we must maintain consistency of the sockbuf during that
1794  * time.
1795  *
1796  * The caller may receive the data as a single mbuf chain by supplying an
1797  * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1798  * the count in uio_resid.
1799  */
1800 int
1801 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1802     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1803 {
1804         struct mbuf *m, **mp;
1805         int flags, error, offset;
1806         ssize_t len;
1807         struct protosw *pr = so->so_proto;
1808         struct mbuf *nextrecord;
1809         int moff, type = 0;
1810         ssize_t orig_resid = uio->uio_resid;
1811
1812         mp = mp0;
1813         if (psa != NULL)
1814                 *psa = NULL;
1815         if (controlp != NULL)
1816                 *controlp = NULL;
1817         if (flagsp != NULL)
1818                 flags = *flagsp &~ MSG_EOR;
1819         else
1820                 flags = 0;
1821         if (flags & MSG_OOB)
1822                 return (soreceive_rcvoob(so, uio, flags));
1823         if (mp != NULL)
1824                 *mp = NULL;
1825         if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1826             && uio->uio_resid) {
1827                 VNET_SO_ASSERT(so);
1828                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1829         }
1830
1831         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1832         if (error)
1833                 return (error);
1834
1835 restart:
1836         SOCKBUF_LOCK(&so->so_rcv);
1837         m = so->so_rcv.sb_mb;
1838         /*
1839          * If we have less data than requested, block awaiting more (subject
1840          * to any timeout) if:
1841          *   1. the current count is less than the low water mark, or
1842          *   2. MSG_DONTWAIT is not set
1843          */
1844         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1845             sbavail(&so->so_rcv) < uio->uio_resid) &&
1846             sbavail(&so->so_rcv) < so->so_rcv.sb_lowat &&
1847             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1848                 KASSERT(m != NULL || !sbavail(&so->so_rcv),
1849                     ("receive: m == %p sbavail == %u",
1850                     m, sbavail(&so->so_rcv)));
1851                 if (so->so_error) {
1852                         if (m != NULL)
1853                                 goto dontblock;
1854                         error = so->so_error;
1855                         if ((flags & MSG_PEEK) == 0)
1856                                 so->so_error = 0;
1857                         SOCKBUF_UNLOCK(&so->so_rcv);
1858                         goto release;
1859                 }
1860                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1861                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1862                         if (m == NULL) {
1863                                 SOCKBUF_UNLOCK(&so->so_rcv);
1864                                 goto release;
1865                         } else
1866                                 goto dontblock;
1867                 }
1868                 for (; m != NULL; m = m->m_next)
1869                         if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1870                                 m = so->so_rcv.sb_mb;
1871                                 goto dontblock;
1872                         }
1873                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1874                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1875                         SOCKBUF_UNLOCK(&so->so_rcv);
1876                         error = ENOTCONN;
1877                         goto release;
1878                 }
1879                 if (uio->uio_resid == 0) {
1880                         SOCKBUF_UNLOCK(&so->so_rcv);
1881                         goto release;
1882                 }
1883                 if ((so->so_state & SS_NBIO) ||
1884                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1885                         SOCKBUF_UNLOCK(&so->so_rcv);
1886                         error = EWOULDBLOCK;
1887                         goto release;
1888                 }
1889                 SBLASTRECORDCHK(&so->so_rcv);
1890                 SBLASTMBUFCHK(&so->so_rcv);
1891                 error = sbwait(&so->so_rcv);
1892                 SOCKBUF_UNLOCK(&so->so_rcv);
1893                 if (error)
1894                         goto release;
1895                 goto restart;
1896         }
1897 dontblock:
1898         /*
1899          * From this point onward, we maintain 'nextrecord' as a cache of the
1900          * pointer to the next record in the socket buffer.  We must keep the
1901          * various socket buffer pointers and local stack versions of the
1902          * pointers in sync, pushing out modifications before dropping the
1903          * socket buffer mutex, and re-reading them when picking it up.
1904          *
1905          * Otherwise, we will race with the network stack appending new data
1906          * or records onto the socket buffer by using inconsistent/stale
1907          * versions of the field, possibly resulting in socket buffer
1908          * corruption.
1909          *
1910          * By holding the high-level sblock(), we prevent simultaneous
1911          * readers from pulling off the front of the socket buffer.
1912          */
1913         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1914         if (uio->uio_td)
1915                 uio->uio_td->td_ru.ru_msgrcv++;
1916         KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1917         SBLASTRECORDCHK(&so->so_rcv);
1918         SBLASTMBUFCHK(&so->so_rcv);
1919         nextrecord = m->m_nextpkt;
1920         if (pr->pr_flags & PR_ADDR) {
1921                 KASSERT(m->m_type == MT_SONAME,
1922                     ("m->m_type == %d", m->m_type));
1923                 orig_resid = 0;
1924                 if (psa != NULL)
1925                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1926                             M_NOWAIT);
1927                 if (flags & MSG_PEEK) {
1928                         m = m->m_next;
1929                 } else {
1930                         sbfree(&so->so_rcv, m);
1931                         so->so_rcv.sb_mb = m_free(m);
1932                         m = so->so_rcv.sb_mb;
1933                         sockbuf_pushsync(&so->so_rcv, nextrecord);
1934                 }
1935         }
1936
1937         /*
1938          * Process one or more MT_CONTROL mbufs present before any data mbufs
1939          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1940          * just copy the data; if !MSG_PEEK, we call into the protocol to
1941          * perform externalization (or freeing if controlp == NULL).
1942          */
1943         if (m != NULL && m->m_type == MT_CONTROL) {
1944                 struct mbuf *cm = NULL, *cmn;
1945                 struct mbuf **cme = &cm;
1946
1947                 do {
1948                         if (flags & MSG_PEEK) {
1949                                 if (controlp != NULL) {
1950                                         *controlp = m_copym(m, 0, m->m_len,
1951                                             M_NOWAIT);
1952                                         controlp = &(*controlp)->m_next;
1953                                 }
1954                                 m = m->m_next;
1955                         } else {
1956                                 sbfree(&so->so_rcv, m);
1957                                 so->so_rcv.sb_mb = m->m_next;
1958                                 m->m_next = NULL;
1959                                 *cme = m;
1960                                 cme = &(*cme)->m_next;
1961                                 m = so->so_rcv.sb_mb;
1962                         }
1963                 } while (m != NULL && m->m_type == MT_CONTROL);
1964                 if ((flags & MSG_PEEK) == 0)
1965                         sockbuf_pushsync(&so->so_rcv, nextrecord);
1966                 while (cm != NULL) {
1967                         cmn = cm->m_next;
1968                         cm->m_next = NULL;
1969                         if (pr->pr_domain->dom_externalize != NULL) {
1970                                 SOCKBUF_UNLOCK(&so->so_rcv);
1971                                 VNET_SO_ASSERT(so);
1972                                 error = (*pr->pr_domain->dom_externalize)
1973                                     (cm, controlp, flags);
1974                                 SOCKBUF_LOCK(&so->so_rcv);
1975                         } else if (controlp != NULL)
1976                                 *controlp = cm;
1977                         else
1978                                 m_freem(cm);
1979                         if (controlp != NULL) {
1980                                 orig_resid = 0;
1981                                 while (*controlp != NULL)
1982                                         controlp = &(*controlp)->m_next;
1983                         }
1984                         cm = cmn;
1985                 }
1986                 if (m != NULL)
1987                         nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1988                 else
1989                         nextrecord = so->so_rcv.sb_mb;
1990                 orig_resid = 0;
1991         }
1992         if (m != NULL) {
1993                 if ((flags & MSG_PEEK) == 0) {
1994                         KASSERT(m->m_nextpkt == nextrecord,
1995                             ("soreceive: post-control, nextrecord !sync"));
1996                         if (nextrecord == NULL) {
1997                                 KASSERT(so->so_rcv.sb_mb == m,
1998                                     ("soreceive: post-control, sb_mb!=m"));
1999                                 KASSERT(so->so_rcv.sb_lastrecord == m,
2000                                     ("soreceive: post-control, lastrecord!=m"));
2001                         }
2002                 }
2003                 type = m->m_type;
2004                 if (type == MT_OOBDATA)
2005                         flags |= MSG_OOB;
2006         } else {
2007                 if ((flags & MSG_PEEK) == 0) {
2008                         KASSERT(so->so_rcv.sb_mb == nextrecord,
2009                             ("soreceive: sb_mb != nextrecord"));
2010                         if (so->so_rcv.sb_mb == NULL) {
2011                                 KASSERT(so->so_rcv.sb_lastrecord == NULL,
2012                                     ("soreceive: sb_lastercord != NULL"));
2013                         }
2014                 }
2015         }
2016         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2017         SBLASTRECORDCHK(&so->so_rcv);
2018         SBLASTMBUFCHK(&so->so_rcv);
2019
2020         /*
2021          * Now continue to read any data mbufs off of the head of the socket
2022          * buffer until the read request is satisfied.  Note that 'type' is
2023          * used to store the type of any mbuf reads that have happened so far
2024          * such that soreceive() can stop reading if the type changes, which
2025          * causes soreceive() to return only one of regular data and inline
2026          * out-of-band data in a single socket receive operation.
2027          */
2028         moff = 0;
2029         offset = 0;
2030         while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0
2031             && error == 0) {
2032                 /*
2033                  * If the type of mbuf has changed since the last mbuf
2034                  * examined ('type'), end the receive operation.
2035                  */
2036                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2037                 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
2038                         if (type != m->m_type)
2039                                 break;
2040                 } else if (type == MT_OOBDATA)
2041                         break;
2042                 else
2043                     KASSERT(m->m_type == MT_DATA,
2044                         ("m->m_type == %d", m->m_type));
2045                 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
2046                 len = uio->uio_resid;
2047                 if (so->so_oobmark && len > so->so_oobmark - offset)
2048                         len = so->so_oobmark - offset;
2049                 if (len > m->m_len - moff)
2050                         len = m->m_len - moff;
2051                 /*
2052                  * If mp is set, just pass back the mbufs.  Otherwise copy
2053                  * them out via the uio, then free.  Sockbuf must be
2054                  * consistent here (points to current mbuf, it points to next
2055                  * record) when we drop priority; we must note any additions
2056                  * to the sockbuf when we block interrupts again.
2057                  */
2058                 if (mp == NULL) {
2059                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2060                         SBLASTRECORDCHK(&so->so_rcv);
2061                         SBLASTMBUFCHK(&so->so_rcv);
2062                         SOCKBUF_UNLOCK(&so->so_rcv);
2063                         if ((m->m_flags & M_NOMAP) != 0)
2064                                 error = m_unmappedtouio(m, moff, uio, (int)len);
2065                         else
2066                                 error = uiomove(mtod(m, char *) + moff,
2067                                     (int)len, uio);
2068                         SOCKBUF_LOCK(&so->so_rcv);
2069                         if (error) {
2070                                 /*
2071                                  * The MT_SONAME mbuf has already been removed
2072                                  * from the record, so it is necessary to
2073                                  * remove the data mbufs, if any, to preserve
2074                                  * the invariant in the case of PR_ADDR that
2075                                  * requires MT_SONAME mbufs at the head of
2076                                  * each record.
2077                                  */
2078                                 if (pr->pr_flags & PR_ATOMIC &&
2079                                     ((flags & MSG_PEEK) == 0))
2080                                         (void)sbdroprecord_locked(&so->so_rcv);
2081                                 SOCKBUF_UNLOCK(&so->so_rcv);
2082                                 goto release;
2083                         }
2084                 } else
2085                         uio->uio_resid -= len;
2086                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2087                 if (len == m->m_len - moff) {
2088                         if (m->m_flags & M_EOR)
2089                                 flags |= MSG_EOR;
2090                         if (flags & MSG_PEEK) {
2091                                 m = m->m_next;
2092                                 moff = 0;
2093                         } else {
2094                                 nextrecord = m->m_nextpkt;
2095                                 sbfree(&so->so_rcv, m);
2096                                 if (mp != NULL) {
2097                                         m->m_nextpkt = NULL;
2098                                         *mp = m;
2099                                         mp = &m->m_next;
2100                                         so->so_rcv.sb_mb = m = m->m_next;
2101                                         *mp = NULL;
2102                                 } else {
2103                                         so->so_rcv.sb_mb = m_free(m);
2104                                         m = so->so_rcv.sb_mb;
2105                                 }
2106                                 sockbuf_pushsync(&so->so_rcv, nextrecord);
2107                                 SBLASTRECORDCHK(&so->so_rcv);
2108                                 SBLASTMBUFCHK(&so->so_rcv);
2109                         }
2110                 } else {
2111                         if (flags & MSG_PEEK)
2112                                 moff += len;
2113                         else {
2114                                 if (mp != NULL) {
2115                                         if (flags & MSG_DONTWAIT) {
2116                                                 *mp = m_copym(m, 0, len,
2117                                                     M_NOWAIT);
2118                                                 if (*mp == NULL) {
2119                                                         /*
2120                                                          * m_copym() couldn't
2121                                                          * allocate an mbuf.
2122                                                          * Adjust uio_resid back
2123                                                          * (it was adjusted
2124                                                          * down by len bytes,
2125                                                          * which we didn't end
2126                                                          * up "copying" over).
2127                                                          */
2128                                                         uio->uio_resid += len;
2129                                                         break;
2130                                                 }
2131                                         } else {
2132                                                 SOCKBUF_UNLOCK(&so->so_rcv);
2133                                                 *mp = m_copym(m, 0, len,
2134                                                     M_WAITOK);
2135                                                 SOCKBUF_LOCK(&so->so_rcv);
2136                                         }
2137                                 }
2138                                 sbcut_locked(&so->so_rcv, len);
2139                         }
2140                 }
2141                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2142                 if (so->so_oobmark) {
2143                         if ((flags & MSG_PEEK) == 0) {
2144                                 so->so_oobmark -= len;
2145                                 if (so->so_oobmark == 0) {
2146                                         so->so_rcv.sb_state |= SBS_RCVATMARK;
2147                                         break;
2148                                 }
2149                         } else {
2150                                 offset += len;
2151                                 if (offset == so->so_oobmark)
2152                                         break;
2153                         }
2154                 }
2155                 if (flags & MSG_EOR)
2156                         break;
2157                 /*
2158                  * If the MSG_WAITALL flag is set (for non-atomic socket), we
2159                  * must not quit until "uio->uio_resid == 0" or an error
2160                  * termination.  If a signal/timeout occurs, return with a
2161                  * short count but without error.  Keep sockbuf locked
2162                  * against other readers.
2163                  */
2164                 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
2165                     !sosendallatonce(so) && nextrecord == NULL) {
2166                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2167                         if (so->so_error ||
2168                             so->so_rcv.sb_state & SBS_CANTRCVMORE)
2169                                 break;
2170                         /*
2171                          * Notify the protocol that some data has been
2172                          * drained before blocking.
2173                          */
2174                         if (pr->pr_flags & PR_WANTRCVD) {
2175                                 SOCKBUF_UNLOCK(&so->so_rcv);
2176                                 VNET_SO_ASSERT(so);
2177                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2178                                 SOCKBUF_LOCK(&so->so_rcv);
2179                         }
2180                         SBLASTRECORDCHK(&so->so_rcv);
2181                         SBLASTMBUFCHK(&so->so_rcv);
2182                         /*
2183                          * We could receive some data while was notifying
2184                          * the protocol. Skip blocking in this case.
2185                          */
2186                         if (so->so_rcv.sb_mb == NULL) {
2187                                 error = sbwait(&so->so_rcv);
2188                                 if (error) {
2189                                         SOCKBUF_UNLOCK(&so->so_rcv);
2190                                         goto release;
2191                                 }
2192                         }
2193                         m = so->so_rcv.sb_mb;
2194                         if (m != NULL)
2195                                 nextrecord = m->m_nextpkt;
2196                 }
2197         }
2198
2199         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2200         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
2201                 flags |= MSG_TRUNC;
2202                 if ((flags & MSG_PEEK) == 0)
2203                         (void) sbdroprecord_locked(&so->so_rcv);
2204         }
2205         if ((flags & MSG_PEEK) == 0) {
2206                 if (m == NULL) {
2207                         /*
2208                          * First part is an inline SB_EMPTY_FIXUP().  Second
2209                          * part makes sure sb_lastrecord is up-to-date if
2210                          * there is still data in the socket buffer.
2211                          */
2212                         so->so_rcv.sb_mb = nextrecord;
2213                         if (so->so_rcv.sb_mb == NULL) {
2214                                 so->so_rcv.sb_mbtail = NULL;
2215                                 so->so_rcv.sb_lastrecord = NULL;
2216                         } else if (nextrecord->m_nextpkt == NULL)
2217                                 so->so_rcv.sb_lastrecord = nextrecord;
2218                 }
2219                 SBLASTRECORDCHK(&so->so_rcv);
2220                 SBLASTMBUFCHK(&so->so_rcv);
2221                 /*
2222                  * If soreceive() is being done from the socket callback,
2223                  * then don't need to generate ACK to peer to update window,
2224                  * since ACK will be generated on return to TCP.
2225                  */
2226                 if (!(flags & MSG_SOCALLBCK) &&
2227                     (pr->pr_flags & PR_WANTRCVD)) {
2228                         SOCKBUF_UNLOCK(&so->so_rcv);
2229                         VNET_SO_ASSERT(so);
2230                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2231                         SOCKBUF_LOCK(&so->so_rcv);
2232                 }
2233         }
2234         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2235         if (orig_resid == uio->uio_resid && orig_resid &&
2236             (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
2237                 SOCKBUF_UNLOCK(&so->so_rcv);
2238                 goto restart;
2239         }
2240         SOCKBUF_UNLOCK(&so->so_rcv);
2241
2242         if (flagsp != NULL)
2243                 *flagsp |= flags;
2244 release:
2245         sbunlock(&so->so_rcv);
2246         return (error);
2247 }
2248
2249 /*
2250  * Optimized version of soreceive() for stream (TCP) sockets.
2251  */
2252 int
2253 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
2254     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2255 {
2256         int len = 0, error = 0, flags, oresid;
2257         struct sockbuf *sb;
2258         struct mbuf *m, *n = NULL;
2259
2260         /* We only do stream sockets. */
2261         if (so->so_type != SOCK_STREAM)
2262                 return (EINVAL);
2263         if (psa != NULL)
2264                 *psa = NULL;
2265         if (flagsp != NULL)
2266                 flags = *flagsp &~ MSG_EOR;
2267         else
2268                 flags = 0;
2269         if (controlp != NULL)
2270                 *controlp = NULL;
2271         if (flags & MSG_OOB)
2272                 return (soreceive_rcvoob(so, uio, flags));
2273         if (mp0 != NULL)
2274                 *mp0 = NULL;
2275
2276         sb = &so->so_rcv;
2277
2278         /* Prevent other readers from entering the socket. */
2279         error = sblock(sb, SBLOCKWAIT(flags));
2280         if (error)
2281                 return (error);
2282         SOCKBUF_LOCK(sb);
2283
2284         /* Easy one, no space to copyout anything. */
2285         if (uio->uio_resid == 0) {
2286                 error = EINVAL;
2287                 goto out;
2288         }
2289         oresid = uio->uio_resid;
2290
2291         /* We will never ever get anything unless we are or were connected. */
2292         if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
2293                 error = ENOTCONN;
2294                 goto out;
2295         }
2296
2297 restart:
2298         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2299
2300         /* Abort if socket has reported problems. */
2301         if (so->so_error) {
2302                 if (sbavail(sb) > 0)
2303                         goto deliver;
2304                 if (oresid > uio->uio_resid)
2305                         goto out;
2306                 error = so->so_error;
2307                 if (!(flags & MSG_PEEK))
2308                         so->so_error = 0;
2309                 goto out;
2310         }
2311
2312         /* Door is closed.  Deliver what is left, if any. */
2313         if (sb->sb_state & SBS_CANTRCVMORE) {
2314                 if (sbavail(sb) > 0)
2315                         goto deliver;
2316                 else
2317                         goto out;
2318         }
2319
2320         /* Socket buffer is empty and we shall not block. */
2321         if (sbavail(sb) == 0 &&
2322             ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
2323                 error = EAGAIN;
2324                 goto out;
2325         }
2326
2327         /* Socket buffer got some data that we shall deliver now. */
2328         if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) &&
2329             ((so->so_state & SS_NBIO) ||
2330              (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
2331              sbavail(sb) >= sb->sb_lowat ||
2332              sbavail(sb) >= uio->uio_resid ||
2333              sbavail(sb) >= sb->sb_hiwat) ) {
2334                 goto deliver;
2335         }
2336
2337         /* On MSG_WAITALL we must wait until all data or error arrives. */
2338         if ((flags & MSG_WAITALL) &&
2339             (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat))
2340                 goto deliver;
2341
2342         /*
2343          * Wait and block until (more) data comes in.
2344          * NB: Drops the sockbuf lock during wait.
2345          */
2346         error = sbwait(sb);
2347         if (error)
2348                 goto out;
2349         goto restart;
2350
2351 deliver:
2352         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2353         KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__));
2354         KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
2355
2356         /* Statistics. */
2357         if (uio->uio_td)
2358                 uio->uio_td->td_ru.ru_msgrcv++;
2359
2360         /* Fill uio until full or current end of socket buffer is reached. */
2361         len = min(uio->uio_resid, sbavail(sb));
2362         if (mp0 != NULL) {
2363                 /* Dequeue as many mbufs as possible. */
2364                 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
2365                         if (*mp0 == NULL)
2366                                 *mp0 = sb->sb_mb;
2367                         else
2368                                 m_cat(*mp0, sb->sb_mb);
2369                         for (m = sb->sb_mb;
2370                              m != NULL && m->m_len <= len;
2371                              m = m->m_next) {
2372                                 KASSERT(!(m->m_flags & M_NOTAVAIL),
2373                                     ("%s: m %p not available", __func__, m));
2374                                 len -= m->m_len;
2375                                 uio->uio_resid -= m->m_len;
2376                                 sbfree(sb, m);
2377                                 n = m;
2378                         }
2379                         n->m_next = NULL;
2380                         sb->sb_mb = m;
2381                         sb->sb_lastrecord = sb->sb_mb;
2382                         if (sb->sb_mb == NULL)
2383                                 SB_EMPTY_FIXUP(sb);
2384                 }
2385                 /* Copy the remainder. */
2386                 if (len > 0) {
2387                         KASSERT(sb->sb_mb != NULL,
2388                             ("%s: len > 0 && sb->sb_mb empty", __func__));
2389
2390                         m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
2391                         if (m == NULL)
2392                                 len = 0;        /* Don't flush data from sockbuf. */
2393                         else
2394                                 uio->uio_resid -= len;
2395                         if (*mp0 != NULL)
2396                                 m_cat(*mp0, m);
2397                         else
2398                                 *mp0 = m;
2399                         if (*mp0 == NULL) {
2400                                 error = ENOBUFS;
2401                                 goto out;
2402                         }
2403                 }
2404         } else {
2405                 /* NB: Must unlock socket buffer as uiomove may sleep. */
2406                 SOCKBUF_UNLOCK(sb);
2407                 error = m_mbuftouio(uio, sb->sb_mb, len);
2408                 SOCKBUF_LOCK(sb);
2409                 if (error)
2410                         goto out;
2411         }
2412         SBLASTRECORDCHK(sb);
2413         SBLASTMBUFCHK(sb);
2414
2415         /*
2416          * Remove the delivered data from the socket buffer unless we
2417          * were only peeking.
2418          */
2419         if (!(flags & MSG_PEEK)) {
2420                 if (len > 0)
2421                         sbdrop_locked(sb, len);
2422
2423                 /* Notify protocol that we drained some data. */
2424                 if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2425                     (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2426                      !(flags & MSG_SOCALLBCK))) {
2427                         SOCKBUF_UNLOCK(sb);
2428                         VNET_SO_ASSERT(so);
2429                         (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
2430                         SOCKBUF_LOCK(sb);
2431                 }
2432         }
2433
2434         /*
2435          * For MSG_WAITALL we may have to loop again and wait for
2436          * more data to come in.
2437          */
2438         if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2439                 goto restart;
2440 out:
2441         SOCKBUF_LOCK_ASSERT(sb);
2442         SBLASTRECORDCHK(sb);
2443         SBLASTMBUFCHK(sb);
2444         SOCKBUF_UNLOCK(sb);
2445         sbunlock(sb);
2446         return (error);
2447 }
2448
2449 /*
2450  * Optimized version of soreceive() for simple datagram cases from userspace.
2451  * Unlike in the stream case, we're able to drop a datagram if copyout()
2452  * fails, and because we handle datagrams atomically, we don't need to use a
2453  * sleep lock to prevent I/O interlacing.
2454  */
2455 int
2456 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2457     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2458 {
2459         struct mbuf *m, *m2;
2460         int flags, error;
2461         ssize_t len;
2462         struct protosw *pr = so->so_proto;
2463         struct mbuf *nextrecord;
2464
2465         if (psa != NULL)
2466                 *psa = NULL;
2467         if (controlp != NULL)
2468                 *controlp = NULL;
2469         if (flagsp != NULL)
2470                 flags = *flagsp &~ MSG_EOR;
2471         else
2472                 flags = 0;
2473
2474         /*
2475          * For any complicated cases, fall back to the full
2476          * soreceive_generic().
2477          */
2478         if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
2479                 return (soreceive_generic(so, psa, uio, mp0, controlp,
2480                     flagsp));
2481
2482         /*
2483          * Enforce restrictions on use.
2484          */
2485         KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2486             ("soreceive_dgram: wantrcvd"));
2487         KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2488         KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2489             ("soreceive_dgram: SBS_RCVATMARK"));
2490         KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
2491             ("soreceive_dgram: P_CONNREQUIRED"));
2492
2493         /*
2494          * Loop blocking while waiting for a datagram.
2495          */
2496         SOCKBUF_LOCK(&so->so_rcv);
2497         while ((m = so->so_rcv.sb_mb) == NULL) {
2498                 KASSERT(sbavail(&so->so_rcv) == 0,
2499                     ("soreceive_dgram: sb_mb NULL but sbavail %u",
2500                     sbavail(&so->so_rcv)));
2501                 if (so->so_error) {
2502                         error = so->so_error;
2503                         so->so_error = 0;
2504                         SOCKBUF_UNLOCK(&so->so_rcv);
2505                         return (error);
2506                 }
2507                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2508                     uio->uio_resid == 0) {
2509                         SOCKBUF_UNLOCK(&so->so_rcv);
2510                         return (0);
2511                 }
2512                 if ((so->so_state & SS_NBIO) ||
2513                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2514                         SOCKBUF_UNLOCK(&so->so_rcv);
2515                         return (EWOULDBLOCK);
2516                 }
2517                 SBLASTRECORDCHK(&so->so_rcv);
2518                 SBLASTMBUFCHK(&so->so_rcv);
2519                 error = sbwait(&so->so_rcv);
2520                 if (error) {
2521                         SOCKBUF_UNLOCK(&so->so_rcv);
2522                         return (error);
2523                 }
2524         }
2525         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2526
2527         if (uio->uio_td)
2528                 uio->uio_td->td_ru.ru_msgrcv++;
2529         SBLASTRECORDCHK(&so->so_rcv);
2530         SBLASTMBUFCHK(&so->so_rcv);
2531         nextrecord = m->m_nextpkt;
2532         if (nextrecord == NULL) {
2533                 KASSERT(so->so_rcv.sb_lastrecord == m,
2534                     ("soreceive_dgram: lastrecord != m"));
2535         }
2536
2537         KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2538             ("soreceive_dgram: m_nextpkt != nextrecord"));
2539
2540         /*
2541          * Pull 'm' and its chain off the front of the packet queue.
2542          */
2543         so->so_rcv.sb_mb = NULL;
2544         sockbuf_pushsync(&so->so_rcv, nextrecord);
2545
2546         /*
2547          * Walk 'm's chain and free that many bytes from the socket buffer.
2548          */
2549         for (m2 = m; m2 != NULL; m2 = m2->m_next)
2550                 sbfree(&so->so_rcv, m2);
2551
2552         /*
2553          * Do a few last checks before we let go of the lock.
2554          */
2555         SBLASTRECORDCHK(&so->so_rcv);
2556         SBLASTMBUFCHK(&so->so_rcv);
2557         SOCKBUF_UNLOCK(&so->so_rcv);
2558
2559         if (pr->pr_flags & PR_ADDR) {
2560                 KASSERT(m->m_type == MT_SONAME,
2561                     ("m->m_type == %d", m->m_type));
2562                 if (psa != NULL)
2563                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
2564                             M_NOWAIT);
2565                 m = m_free(m);
2566         }
2567         if (m == NULL) {
2568                 /* XXXRW: Can this happen? */
2569                 return (0);
2570         }
2571
2572         /*
2573          * Packet to copyout() is now in 'm' and it is disconnected from the
2574          * queue.
2575          *
2576          * Process one or more MT_CONTROL mbufs present before any data mbufs
2577          * in the first mbuf chain on the socket buffer.  We call into the
2578          * protocol to perform externalization (or freeing if controlp ==
2579          * NULL). In some cases there can be only MT_CONTROL mbufs without
2580          * MT_DATA mbufs.
2581          */
2582         if (m->m_type == MT_CONTROL) {
2583                 struct mbuf *cm = NULL, *cmn;
2584                 struct mbuf **cme = &cm;
2585
2586                 do {
2587                         m2 = m->m_next;
2588                         m->m_next = NULL;
2589                         *cme = m;
2590                         cme = &(*cme)->m_next;
2591                         m = m2;
2592                 } while (m != NULL && m->m_type == MT_CONTROL);
2593                 while (cm != NULL) {
2594                         cmn = cm->m_next;
2595                         cm->m_next = NULL;
2596                         if (pr->pr_domain->dom_externalize != NULL) {
2597                                 error = (*pr->pr_domain->dom_externalize)
2598                                     (cm, controlp, flags);
2599                         } else if (controlp != NULL)
2600                                 *controlp = cm;
2601                         else
2602                                 m_freem(cm);
2603                         if (controlp != NULL) {
2604                                 while (*controlp != NULL)
2605                                         controlp = &(*controlp)->m_next;
2606                         }
2607                         cm = cmn;
2608                 }
2609         }
2610         KASSERT(m == NULL || m->m_type == MT_DATA,
2611             ("soreceive_dgram: !data"));
2612         while (m != NULL && uio->uio_resid > 0) {
2613                 len = uio->uio_resid;
2614                 if (len > m->m_len)
2615                         len = m->m_len;
2616                 error = uiomove(mtod(m, char *), (int)len, uio);
2617                 if (error) {
2618                         m_freem(m);
2619                         return (error);
2620                 }
2621                 if (len == m->m_len)
2622                         m = m_free(m);
2623                 else {
2624                         m->m_data += len;
2625                         m->m_len -= len;
2626                 }
2627         }
2628         if (m != NULL) {
2629                 flags |= MSG_TRUNC;
2630                 m_freem(m);
2631         }
2632         if (flagsp != NULL)
2633                 *flagsp |= flags;
2634         return (0);
2635 }
2636
2637 int
2638 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2639     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2640 {
2641         int error;
2642
2643         CURVNET_SET(so->so_vnet);
2644         if (!SOLISTENING(so))
2645                 error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio,
2646                     mp0, controlp, flagsp));
2647         else
2648                 error = ENOTCONN;
2649         CURVNET_RESTORE();
2650         return (error);
2651 }
2652
2653 int
2654 soshutdown(struct socket *so, int how)
2655 {
2656         struct protosw *pr = so->so_proto;
2657         int error, soerror_enotconn;
2658
2659         if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2660                 return (EINVAL);
2661
2662         soerror_enotconn = 0;
2663         if ((so->so_state &
2664             (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
2665                 /*
2666                  * POSIX mandates us to return ENOTCONN when shutdown(2) is
2667                  * invoked on a datagram sockets, however historically we would
2668                  * actually tear socket down. This is known to be leveraged by
2669                  * some applications to unblock process waiting in recvXXX(2)
2670                  * by other process that it shares that socket with. Try to meet
2671                  * both backward-compatibility and POSIX requirements by forcing
2672                  * ENOTCONN but still asking protocol to perform pru_shutdown().
2673                  */
2674                 if (so->so_type != SOCK_DGRAM && !SOLISTENING(so))
2675                         return (ENOTCONN);
2676                 soerror_enotconn = 1;
2677         }
2678
2679         if (SOLISTENING(so)) {
2680                 if (how != SHUT_WR) {
2681                         SOLISTEN_LOCK(so);
2682                         so->so_error = ECONNABORTED;
2683                         solisten_wakeup(so);    /* unlocks so */
2684                 }
2685                 goto done;
2686         }
2687
2688         CURVNET_SET(so->so_vnet);
2689         if (pr->pr_usrreqs->pru_flush != NULL)
2690                 (*pr->pr_usrreqs->pru_flush)(so, how);
2691         if (how != SHUT_WR)
2692                 sorflush(so);
2693         if (how != SHUT_RD) {
2694                 error = (*pr->pr_usrreqs->pru_shutdown)(so);
2695                 wakeup(&so->so_timeo);
2696                 CURVNET_RESTORE();
2697                 return ((error == 0 && soerror_enotconn) ? ENOTCONN : error);
2698         }
2699         wakeup(&so->so_timeo);
2700         CURVNET_RESTORE();
2701
2702 done:
2703         return (soerror_enotconn ? ENOTCONN : 0);
2704 }
2705
2706 void
2707 sorflush(struct socket *so)
2708 {
2709         struct sockbuf *sb = &so->so_rcv;
2710         struct protosw *pr = so->so_proto;
2711         struct socket aso;
2712
2713         VNET_SO_ASSERT(so);
2714
2715         /*
2716          * In order to avoid calling dom_dispose with the socket buffer mutex
2717          * held, and in order to generally avoid holding the lock for a long
2718          * time, we make a copy of the socket buffer and clear the original
2719          * (except locks, state).  The new socket buffer copy won't have
2720          * initialized locks so we can only call routines that won't use or
2721          * assert those locks.
2722          *
2723          * Dislodge threads currently blocked in receive and wait to acquire
2724          * a lock against other simultaneous readers before clearing the
2725          * socket buffer.  Don't let our acquire be interrupted by a signal
2726          * despite any existing socket disposition on interruptable waiting.
2727          */
2728         socantrcvmore(so);
2729         (void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2730
2731         /*
2732          * Invalidate/clear most of the sockbuf structure, but leave selinfo
2733          * and mutex data unchanged.
2734          */
2735         SOCKBUF_LOCK(sb);
2736         bzero(&aso, sizeof(aso));
2737         aso.so_pcb = so->so_pcb;
2738         bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero,
2739             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2740         bzero(&sb->sb_startzero,
2741             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2742         SOCKBUF_UNLOCK(sb);
2743         sbunlock(sb);
2744
2745         /*
2746          * Dispose of special rights and flush the copied socket.  Don't call
2747          * any unsafe routines (that rely on locks being initialized) on aso.
2748          */
2749         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2750                 (*pr->pr_domain->dom_dispose)(&aso);
2751         sbrelease_internal(&aso.so_rcv, so);
2752 }
2753
2754 /*
2755  * Wrapper for Socket established helper hook.
2756  * Parameters: socket, context of the hook point, hook id.
2757  */
2758 static int inline
2759 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id)
2760 {
2761         struct socket_hhook_data hhook_data = {
2762                 .so = so,
2763                 .hctx = hctx,
2764                 .m = NULL,
2765                 .status = 0
2766         };
2767
2768         CURVNET_SET(so->so_vnet);
2769         HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd);
2770         CURVNET_RESTORE();
2771
2772         /* Ugly but needed, since hhooks return void for now */
2773         return (hhook_data.status);
2774 }
2775
2776 /*
2777  * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2778  * additional variant to handle the case where the option value needs to be
2779  * some kind of integer, but not a specific size.  In addition to their use
2780  * here, these functions are also called by the protocol-level pr_ctloutput()
2781  * routines.
2782  */
2783 int
2784 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2785 {
2786         size_t  valsize;
2787
2788         /*
2789          * If the user gives us more than we wanted, we ignore it, but if we
2790          * don't get the minimum length the caller wants, we return EINVAL.
2791          * On success, sopt->sopt_valsize is set to however much we actually
2792          * retrieved.
2793          */
2794         if ((valsize = sopt->sopt_valsize) < minlen)
2795                 return EINVAL;
2796         if (valsize > len)
2797                 sopt->sopt_valsize = valsize = len;
2798
2799         if (sopt->sopt_td != NULL)
2800                 return (copyin(sopt->sopt_val, buf, valsize));
2801
2802         bcopy(sopt->sopt_val, buf, valsize);
2803         return (0);
2804 }
2805
2806 /*
2807  * Kernel version of setsockopt(2).
2808  *
2809  * XXX: optlen is size_t, not socklen_t
2810  */
2811 int
2812 so_setsockopt(struct socket *so, int level, int optname, void *optval,
2813     size_t optlen)
2814 {
2815         struct sockopt sopt;
2816
2817         sopt.sopt_level = level;
2818         sopt.sopt_name = optname;
2819         sopt.sopt_dir = SOPT_SET;
2820         sopt.sopt_val = optval;
2821         sopt.sopt_valsize = optlen;
2822         sopt.sopt_td = NULL;
2823         return (sosetopt(so, &sopt));
2824 }
2825
2826 int
2827 sosetopt(struct socket *so, struct sockopt *sopt)
2828 {
2829         int     error, optval;
2830         struct  linger l;
2831         struct  timeval tv;
2832         sbintime_t val;
2833         uint32_t val32;
2834 #ifdef MAC
2835         struct mac extmac;
2836 #endif
2837
2838         CURVNET_SET(so->so_vnet);
2839         error = 0;
2840         if (sopt->sopt_level != SOL_SOCKET) {
2841                 if (so->so_proto->pr_ctloutput != NULL)
2842                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
2843                 else
2844                         error = ENOPROTOOPT;
2845         } else {
2846                 switch (sopt->sopt_name) {
2847                 case SO_ACCEPTFILTER:
2848                         error = accept_filt_setopt(so, sopt);
2849                         if (error)
2850                                 goto bad;
2851                         break;
2852
2853                 case SO_LINGER:
2854                         error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2855                         if (error)
2856                                 goto bad;
2857                         if (l.l_linger < 0 ||
2858                             l.l_linger > USHRT_MAX ||
2859                             l.l_linger > (INT_MAX / hz)) {
2860                                 error = EDOM;
2861                                 goto bad;
2862                         }
2863                         SOCK_LOCK(so);
2864                         so->so_linger = l.l_linger;
2865                         if (l.l_onoff)
2866                                 so->so_options |= SO_LINGER;
2867                         else
2868                                 so->so_options &= ~SO_LINGER;
2869                         SOCK_UNLOCK(so);
2870                         break;
2871
2872                 case SO_DEBUG:
2873                 case SO_KEEPALIVE:
2874                 case SO_DONTROUTE:
2875                 case SO_USELOOPBACK:
2876                 case SO_BROADCAST:
2877                 case SO_REUSEADDR:
2878                 case SO_REUSEPORT:
2879                 case SO_REUSEPORT_LB:
2880                 case SO_OOBINLINE:
2881                 case SO_TIMESTAMP:
2882                 case SO_BINTIME:
2883                 case SO_NOSIGPIPE:
2884                 case SO_NO_DDP:
2885                 case SO_NO_OFFLOAD:
2886                         error = sooptcopyin(sopt, &optval, sizeof optval,
2887                             sizeof optval);
2888                         if (error)
2889                                 goto bad;
2890                         SOCK_LOCK(so);
2891                         if (optval)
2892                                 so->so_options |= sopt->sopt_name;
2893                         else
2894                                 so->so_options &= ~sopt->sopt_name;
2895                         SOCK_UNLOCK(so);
2896                         break;
2897
2898                 case SO_SETFIB:
2899                         error = sooptcopyin(sopt, &optval, sizeof optval,
2900                             sizeof optval);
2901                         if (error)
2902                                 goto bad;
2903
2904                         if (optval < 0 || optval >= rt_numfibs) {
2905                                 error = EINVAL;
2906                                 goto bad;
2907                         }
2908                         if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
2909                            (so->so_proto->pr_domain->dom_family == PF_INET6) ||
2910                            (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
2911                                 so->so_fibnum = optval;
2912                         else
2913                                 so->so_fibnum = 0;
2914                         break;
2915
2916                 case SO_USER_COOKIE:
2917                         error = sooptcopyin(sopt, &val32, sizeof val32,
2918                             sizeof val32);
2919                         if (error)
2920                                 goto bad;
2921                         so->so_user_cookie = val32;
2922                         break;
2923
2924                 case SO_SNDBUF:
2925                 case SO_RCVBUF:
2926                 case SO_SNDLOWAT:
2927                 case SO_RCVLOWAT:
2928                         error = sooptcopyin(sopt, &optval, sizeof optval,
2929                             sizeof optval);
2930                         if (error)
2931                                 goto bad;
2932
2933                         /*
2934                          * Values < 1 make no sense for any of these options,
2935                          * so disallow them.
2936                          */
2937                         if (optval < 1) {
2938                                 error = EINVAL;
2939                                 goto bad;
2940                         }
2941
2942                         error = sbsetopt(so, sopt->sopt_name, optval);
2943                         break;
2944
2945                 case SO_SNDTIMEO:
2946                 case SO_RCVTIMEO:
2947 #ifdef COMPAT_FREEBSD32
2948                         if (SV_CURPROC_FLAG(SV_ILP32)) {
2949                                 struct timeval32 tv32;
2950
2951                                 error = sooptcopyin(sopt, &tv32, sizeof tv32,
2952                                     sizeof tv32);
2953                                 CP(tv32, tv, tv_sec);
2954                                 CP(tv32, tv, tv_usec);
2955                         } else
2956 #endif
2957                                 error = sooptcopyin(sopt, &tv, sizeof tv,
2958                                     sizeof tv);
2959                         if (error)
2960                                 goto bad;
2961                         if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
2962                             tv.tv_usec >= 1000000) {
2963                                 error = EDOM;
2964                                 goto bad;
2965                         }
2966                         if (tv.tv_sec > INT32_MAX)
2967                                 val = SBT_MAX;
2968                         else
2969                                 val = tvtosbt(tv);
2970                         switch (sopt->sopt_name) {
2971                         case SO_SNDTIMEO:
2972                                 so->so_snd.sb_timeo = val;
2973                                 break;
2974                         case SO_RCVTIMEO:
2975                                 so->so_rcv.sb_timeo = val;
2976                                 break;
2977                         }
2978                         break;
2979
2980                 case SO_LABEL:
2981 #ifdef MAC
2982                         error = sooptcopyin(sopt, &extmac, sizeof extmac,
2983                             sizeof extmac);
2984                         if (error)
2985                                 goto bad;
2986                         error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2987                             so, &extmac);
2988 #else
2989                         error = EOPNOTSUPP;
2990 #endif
2991                         break;
2992
2993                 case SO_TS_CLOCK:
2994                         error = sooptcopyin(sopt, &optval, sizeof optval,
2995                             sizeof optval);
2996                         if (error)
2997                                 goto bad;
2998                         if (optval < 0 || optval > SO_TS_CLOCK_MAX) {
2999                                 error = EINVAL;
3000                                 goto bad;
3001                         }
3002                         so->so_ts_clock = optval;
3003                         break;
3004
3005                 case SO_MAX_PACING_RATE:
3006                         error = sooptcopyin(sopt, &val32, sizeof(val32),
3007                             sizeof(val32));
3008                         if (error)
3009                                 goto bad;
3010                         so->so_max_pacing_rate = val32;
3011                         break;
3012
3013                 default:
3014                         if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
3015                                 error = hhook_run_socket(so, sopt,
3016                                     HHOOK_SOCKET_OPT);
3017                         else
3018                                 error = ENOPROTOOPT;
3019                         break;
3020                 }
3021                 if (error == 0 && so->so_proto->pr_ctloutput != NULL)
3022                         (void)(*so->so_proto->pr_ctloutput)(so, sopt);
3023         }
3024 bad:
3025         CURVNET_RESTORE();
3026         return (error);
3027 }
3028
3029 /*
3030  * Helper routine for getsockopt.
3031  */
3032 int
3033 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
3034 {
3035         int     error;
3036         size_t  valsize;
3037
3038         error = 0;
3039
3040         /*
3041          * Documented get behavior is that we always return a value, possibly
3042          * truncated to fit in the user's buffer.  Traditional behavior is
3043          * that we always tell the user precisely how much we copied, rather
3044          * than something useful like the total amount we had available for
3045          * her.  Note that this interface is not idempotent; the entire
3046          * answer must be generated ahead of time.
3047          */
3048         valsize = min(len, sopt->sopt_valsize);
3049         sopt->sopt_valsize = valsize;
3050         if (sopt->sopt_val != NULL) {
3051                 if (sopt->sopt_td != NULL)
3052                         error = copyout(buf, sopt->sopt_val, valsize);
3053                 else
3054                         bcopy(buf, sopt->sopt_val, valsize);
3055         }
3056         return (error);
3057 }
3058
3059 int
3060 sogetopt(struct socket *so, struct sockopt *sopt)
3061 {
3062         int     error, optval;
3063         struct  linger l;
3064         struct  timeval tv;
3065 #ifdef MAC
3066         struct mac extmac;
3067 #endif
3068
3069         CURVNET_SET(so->so_vnet);
3070         error = 0;
3071         if (sopt->sopt_level != SOL_SOCKET) {
3072                 if (so->so_proto->pr_ctloutput != NULL)
3073                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
3074                 else
3075                         error = ENOPROTOOPT;
3076                 CURVNET_RESTORE();
3077                 return (error);
3078         } else {
3079                 switch (sopt->sopt_name) {
3080                 case SO_ACCEPTFILTER:
3081                         error = accept_filt_getopt(so, sopt);
3082                         break;
3083
3084                 case SO_LINGER:
3085                         SOCK_LOCK(so);
3086                         l.l_onoff = so->so_options & SO_LINGER;
3087                         l.l_linger = so->so_linger;
3088                         SOCK_UNLOCK(so);
3089                         error = sooptcopyout(sopt, &l, sizeof l);
3090                         break;
3091
3092                 case SO_USELOOPBACK:
3093                 case SO_DONTROUTE:
3094                 case SO_DEBUG:
3095                 case SO_KEEPALIVE:
3096                 case SO_REUSEADDR:
3097                 case SO_REUSEPORT:
3098                 case SO_REUSEPORT_LB:
3099                 case SO_BROADCAST:
3100                 case SO_OOBINLINE:
3101                 case SO_ACCEPTCONN:
3102                 case SO_TIMESTAMP:
3103                 case SO_BINTIME:
3104                 case SO_NOSIGPIPE:
3105                         optval = so->so_options & sopt->sopt_name;
3106 integer:
3107                         error = sooptcopyout(sopt, &optval, sizeof optval);
3108                         break;
3109
3110                 case SO_DOMAIN:
3111                         optval = so->so_proto->pr_domain->dom_family;
3112                         goto integer;
3113
3114                 case SO_TYPE:
3115                         optval = so->so_type;
3116                         goto integer;
3117
3118                 case SO_PROTOCOL:
3119                         optval = so->so_proto->pr_protocol;
3120                         goto integer;
3121
3122                 case SO_ERROR:
3123                         SOCK_LOCK(so);
3124                         optval = so->so_error;
3125                         so->so_error = 0;
3126                         SOCK_UNLOCK(so);
3127                         goto integer;
3128
3129                 case SO_SNDBUF:
3130                         optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat :
3131                             so->so_snd.sb_hiwat;
3132                         goto integer;
3133
3134                 case SO_RCVBUF:
3135                         optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat :
3136                             so->so_rcv.sb_hiwat;
3137                         goto integer;
3138
3139                 case SO_SNDLOWAT:
3140                         optval = SOLISTENING(so) ? so->sol_sbsnd_lowat :
3141                             so->so_snd.sb_lowat;
3142                         goto integer;
3143
3144                 case SO_RCVLOWAT:
3145                         optval = SOLISTENING(so) ? so->sol_sbrcv_lowat :
3146                             so->so_rcv.sb_lowat;
3147                         goto integer;
3148
3149                 case SO_SNDTIMEO:
3150                 case SO_RCVTIMEO:
3151                         tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
3152                             so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
3153 #ifdef COMPAT_FREEBSD32
3154                         if (SV_CURPROC_FLAG(SV_ILP32)) {
3155                                 struct timeval32 tv32;
3156
3157                                 CP(tv, tv32, tv_sec);
3158                                 CP(tv, tv32, tv_usec);
3159                                 error = sooptcopyout(sopt, &tv32, sizeof tv32);
3160                         } else
3161 #endif
3162                                 error = sooptcopyout(sopt, &tv, sizeof tv);
3163                         break;
3164
3165                 case SO_LABEL:
3166 #ifdef MAC
3167                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
3168                             sizeof(extmac));
3169                         if (error)
3170                                 goto bad;
3171                         error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
3172                             so, &extmac);
3173                         if (error)
3174                                 goto bad;
3175                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
3176 #else
3177                         error = EOPNOTSUPP;
3178 #endif
3179                         break;
3180
3181                 case SO_PEERLABEL:
3182 #ifdef MAC
3183                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
3184                             sizeof(extmac));
3185                         if (error)
3186                                 goto bad;
3187                         error = mac_getsockopt_peerlabel(
3188                             sopt->sopt_td->td_ucred, so, &extmac);
3189                         if (error)
3190                                 goto bad;
3191                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
3192 #else
3193                         error = EOPNOTSUPP;
3194 #endif
3195                         break;
3196
3197                 case SO_LISTENQLIMIT:
3198                         optval = SOLISTENING(so) ? so->sol_qlimit : 0;
3199                         goto integer;
3200
3201                 case SO_LISTENQLEN:
3202                         optval = SOLISTENING(so) ? so->sol_qlen : 0;
3203                         goto integer;
3204
3205                 case SO_LISTENINCQLEN:
3206                         optval = SOLISTENING(so) ? so->sol_incqlen : 0;
3207                         goto integer;
3208
3209                 case SO_TS_CLOCK:
3210                         optval = so->so_ts_clock;
3211                         goto integer;
3212
3213                 case SO_MAX_PACING_RATE:
3214                         optval = so->so_max_pacing_rate;
3215                         goto integer;
3216
3217                 default:
3218                         if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
3219                                 error = hhook_run_socket(so, sopt,
3220                                     HHOOK_SOCKET_OPT);
3221                         else
3222                                 error = ENOPROTOOPT;
3223                         break;
3224                 }
3225         }
3226 #ifdef MAC
3227 bad:
3228 #endif
3229         CURVNET_RESTORE();
3230         return (error);
3231 }
3232
3233 int
3234 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
3235 {
3236         struct mbuf *m, *m_prev;
3237         int sopt_size = sopt->sopt_valsize;
3238
3239         MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
3240         if (m == NULL)
3241                 return ENOBUFS;
3242         if (sopt_size > MLEN) {
3243                 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
3244                 if ((m->m_flags & M_EXT) == 0) {
3245                         m_free(m);
3246                         return ENOBUFS;
3247                 }
3248                 m->m_len = min(MCLBYTES, sopt_size);
3249         } else {
3250                 m->m_len = min(MLEN, sopt_size);
3251         }
3252         sopt_size -= m->m_len;
3253         *mp = m;
3254         m_prev = m;
3255
3256         while (sopt_size) {
3257                 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
3258                 if (m == NULL) {
3259                         m_freem(*mp);
3260                         return ENOBUFS;
3261                 }
3262                 if (sopt_size > MLEN) {
3263                         MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
3264                             M_NOWAIT);
3265                         if ((m->m_flags & M_EXT) == 0) {
3266                                 m_freem(m);
3267                                 m_freem(*mp);
3268                                 return ENOBUFS;
3269                         }
3270                         m->m_len = min(MCLBYTES, sopt_size);
3271                 } else {
3272                         m->m_len = min(MLEN, sopt_size);
3273                 }
3274                 sopt_size -= m->m_len;
3275                 m_prev->m_next = m;
3276                 m_prev = m;
3277         }
3278         return (0);
3279 }
3280
3281 int
3282 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
3283 {
3284         struct mbuf *m0 = m;
3285
3286         if (sopt->sopt_val == NULL)
3287                 return (0);
3288         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3289                 if (sopt->sopt_td != NULL) {
3290                         int error;
3291
3292                         error = copyin(sopt->sopt_val, mtod(m, char *),
3293                             m->m_len);
3294                         if (error != 0) {
3295                                 m_freem(m0);
3296                                 return(error);
3297                         }
3298                 } else
3299                         bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
3300                 sopt->sopt_valsize -= m->m_len;
3301                 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
3302                 m = m->m_next;
3303         }
3304         if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
3305                 panic("ip6_sooptmcopyin");
3306         return (0);
3307 }
3308
3309 int
3310 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
3311 {
3312         struct mbuf *m0 = m;
3313         size_t valsize = 0;
3314
3315         if (sopt->sopt_val == NULL)
3316                 return (0);
3317         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3318                 if (sopt->sopt_td != NULL) {
3319                         int error;
3320
3321                         error = copyout(mtod(m, char *), sopt->sopt_val,
3322                             m->m_len);
3323                         if (error != 0) {
3324                                 m_freem(m0);
3325                                 return(error);
3326                         }
3327                 } else
3328                         bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
3329                 sopt->sopt_valsize -= m->m_len;
3330                 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
3331                 valsize += m->m_len;
3332                 m = m->m_next;
3333         }
3334         if (m != NULL) {
3335                 /* enough soopt buffer should be given from user-land */
3336                 m_freem(m0);
3337                 return(EINVAL);
3338         }
3339         sopt->sopt_valsize = valsize;
3340         return (0);
3341 }
3342
3343 /*
3344  * sohasoutofband(): protocol notifies socket layer of the arrival of new
3345  * out-of-band data, which will then notify socket consumers.
3346  */
3347 void
3348 sohasoutofband(struct socket *so)
3349 {
3350
3351         if (so->so_sigio != NULL)
3352                 pgsigio(&so->so_sigio, SIGURG, 0);
3353         selwakeuppri(&so->so_rdsel, PSOCK);
3354 }
3355
3356 int
3357 sopoll(struct socket *so, int events, struct ucred *active_cred,
3358     struct thread *td)
3359 {
3360
3361         /*
3362          * We do not need to set or assert curvnet as long as everyone uses
3363          * sopoll_generic().
3364          */
3365         return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
3366             td));
3367 }
3368
3369 int
3370 sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
3371     struct thread *td)
3372 {
3373         int revents;
3374
3375         SOCK_LOCK(so);
3376         if (SOLISTENING(so)) {
3377                 if (!(events & (POLLIN | POLLRDNORM)))
3378                         revents = 0;
3379                 else if (!TAILQ_EMPTY(&so->sol_comp))
3380                         revents = events & (POLLIN | POLLRDNORM);
3381                 else if ((events & POLLINIGNEOF) == 0 && so->so_error)
3382                         revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP;
3383                 else {
3384                         selrecord(td, &so->so_rdsel);
3385                         revents = 0;
3386                 }
3387         } else {
3388                 revents = 0;
3389                 SOCKBUF_LOCK(&so->so_snd);
3390                 SOCKBUF_LOCK(&so->so_rcv);
3391                 if (events & (POLLIN | POLLRDNORM))
3392                         if (soreadabledata(so))
3393                                 revents |= events & (POLLIN | POLLRDNORM);
3394                 if (events & (POLLOUT | POLLWRNORM))
3395                         if (sowriteable(so))
3396                                 revents |= events & (POLLOUT | POLLWRNORM);
3397                 if (events & (POLLPRI | POLLRDBAND))
3398                         if (so->so_oobmark ||
3399                             (so->so_rcv.sb_state & SBS_RCVATMARK))
3400                                 revents |= events & (POLLPRI | POLLRDBAND);
3401                 if ((events & POLLINIGNEOF) == 0) {
3402                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3403                                 revents |= events & (POLLIN | POLLRDNORM);
3404                                 if (so->so_snd.sb_state & SBS_CANTSENDMORE)
3405                                         revents |= POLLHUP;
3406                         }
3407                 }
3408                 if (revents == 0) {
3409                         if (events &
3410                             (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
3411                                 selrecord(td, &so->so_rdsel);
3412                                 so->so_rcv.sb_flags |= SB_SEL;
3413                         }
3414                         if (events & (POLLOUT | POLLWRNORM)) {
3415                                 selrecord(td, &so->so_wrsel);
3416                                 so->so_snd.sb_flags |= SB_SEL;
3417                         }
3418                 }
3419                 SOCKBUF_UNLOCK(&so->so_rcv);
3420                 SOCKBUF_UNLOCK(&so->so_snd);
3421         }
3422         SOCK_UNLOCK(so);
3423         return (revents);
3424 }
3425
3426 int
3427 soo_kqfilter(struct file *fp, struct knote *kn)
3428 {
3429         struct socket *so = kn->kn_fp->f_data;
3430         struct sockbuf *sb;
3431         struct knlist *knl;
3432
3433         switch (kn->kn_filter) {
3434         case EVFILT_READ:
3435                 kn->kn_fop = &soread_filtops;
3436                 knl = &so->so_rdsel.si_note;
3437                 sb = &so->so_rcv;
3438                 break;
3439         case EVFILT_WRITE:
3440                 kn->kn_fop = &sowrite_filtops;
3441                 knl = &so->so_wrsel.si_note;
3442                 sb = &so->so_snd;
3443                 break;
3444         case EVFILT_EMPTY:
3445                 kn->kn_fop = &soempty_filtops;
3446                 knl = &so->so_wrsel.si_note;
3447                 sb = &so->so_snd;
3448                 break;
3449         default:
3450                 return (EINVAL);
3451         }
3452
3453         SOCK_LOCK(so);
3454         if (SOLISTENING(so)) {
3455                 knlist_add(knl, kn, 1);
3456         } else {
3457                 SOCKBUF_LOCK(sb);
3458                 knlist_add(knl, kn, 1);
3459                 sb->sb_flags |= SB_KNOTE;
3460                 SOCKBUF_UNLOCK(sb);
3461         }
3462         SOCK_UNLOCK(so);
3463         return (0);
3464 }
3465
3466 /*
3467  * Some routines that return EOPNOTSUPP for entry points that are not
3468  * supported by a protocol.  Fill in as needed.
3469  */
3470 int
3471 pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
3472 {
3473
3474         return EOPNOTSUPP;
3475 }
3476
3477 int
3478 pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job)
3479 {
3480
3481         return EOPNOTSUPP;
3482 }
3483
3484 int
3485 pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
3486 {
3487
3488         return EOPNOTSUPP;
3489 }
3490
3491 int
3492 pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3493 {
3494
3495         return EOPNOTSUPP;
3496 }
3497
3498 int
3499 pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
3500     struct thread *td)
3501 {
3502
3503         return EOPNOTSUPP;
3504 }
3505
3506 int
3507 pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3508 {
3509
3510         return EOPNOTSUPP;
3511 }
3512
3513 int
3514 pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
3515     struct thread *td)
3516 {
3517
3518         return EOPNOTSUPP;
3519 }
3520
3521 int
3522 pru_connect2_notsupp(struct socket *so1, struct socket *so2)
3523 {
3524
3525         return EOPNOTSUPP;
3526 }
3527
3528 int
3529 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
3530     struct ifnet *ifp, struct thread *td)
3531 {
3532
3533         return EOPNOTSUPP;
3534 }
3535
3536 int
3537 pru_disconnect_notsupp(struct socket *so)
3538 {
3539
3540         return EOPNOTSUPP;
3541 }
3542
3543 int
3544 pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
3545 {
3546
3547         return EOPNOTSUPP;
3548 }
3549
3550 int
3551 pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
3552 {
3553
3554         return EOPNOTSUPP;
3555 }
3556
3557 int
3558 pru_rcvd_notsupp(struct socket *so, int flags)
3559 {
3560
3561         return EOPNOTSUPP;
3562 }
3563
3564 int
3565 pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
3566 {
3567
3568         return EOPNOTSUPP;
3569 }
3570
3571 int
3572 pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
3573     struct sockaddr *addr, struct mbuf *control, struct thread *td)
3574 {
3575
3576         return EOPNOTSUPP;
3577 }
3578
3579 int
3580 pru_ready_notsupp(struct socket *so, struct mbuf *m, int count)
3581 {
3582
3583         return (EOPNOTSUPP);
3584 }
3585
3586 /*
3587  * This isn't really a ``null'' operation, but it's the default one and
3588  * doesn't do anything destructive.
3589  */
3590 int
3591 pru_sense_null(struct socket *so, struct stat *sb)
3592 {
3593
3594         sb->st_blksize = so->so_snd.sb_hiwat;
3595         return 0;
3596 }
3597
3598 int
3599 pru_shutdown_notsupp(struct socket *so)
3600 {
3601
3602         return EOPNOTSUPP;
3603 }
3604
3605 int
3606 pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
3607 {
3608
3609         return EOPNOTSUPP;
3610 }
3611
3612 int
3613 pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
3614     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
3615 {
3616
3617         return EOPNOTSUPP;
3618 }
3619
3620 int
3621 pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
3622     struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3623 {
3624
3625         return EOPNOTSUPP;
3626 }
3627
3628 int
3629 pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
3630     struct thread *td)
3631 {
3632
3633         return EOPNOTSUPP;
3634 }
3635
3636 static void
3637 filt_sordetach(struct knote *kn)
3638 {
3639         struct socket *so = kn->kn_fp->f_data;
3640
3641         so_rdknl_lock(so);
3642         knlist_remove(&so->so_rdsel.si_note, kn, 1);
3643         if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note))
3644                 so->so_rcv.sb_flags &= ~SB_KNOTE;
3645         so_rdknl_unlock(so);
3646 }
3647
3648 /*ARGSUSED*/
3649 static int
3650 filt_soread(struct knote *kn, long hint)
3651 {
3652         struct socket *so;
3653
3654         so = kn->kn_fp->f_data;
3655
3656         if (SOLISTENING(so)) {
3657                 SOCK_LOCK_ASSERT(so);
3658                 kn->kn_data = so->sol_qlen;
3659                 if (so->so_error) {
3660                         kn->kn_flags |= EV_EOF;
3661                         kn->kn_fflags = so->so_error;
3662                         return (1);
3663                 }
3664                 return (!TAILQ_EMPTY(&so->sol_comp));
3665         }
3666
3667         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3668
3669         kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
3670         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3671                 kn->kn_flags |= EV_EOF;
3672                 kn->kn_fflags = so->so_error;
3673                 return (1);
3674         } else if (so->so_error)        /* temporary udp error */
3675                 return (1);
3676
3677         if (kn->kn_sfflags & NOTE_LOWAT) {
3678                 if (kn->kn_data >= kn->kn_sdata)
3679                         return (1);
3680         } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat)
3681                 return (1);
3682
3683         /* This hook returning non-zero indicates an event, not error */
3684         return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD));
3685 }
3686
3687 static void
3688 filt_sowdetach(struct knote *kn)
3689 {
3690         struct socket *so = kn->kn_fp->f_data;
3691
3692         so_wrknl_lock(so);
3693         knlist_remove(&so->so_wrsel.si_note, kn, 1);
3694         if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note))
3695                 so->so_snd.sb_flags &= ~SB_KNOTE;
3696         so_wrknl_unlock(so);
3697 }
3698
3699 /*ARGSUSED*/
3700 static int
3701 filt_sowrite(struct knote *kn, long hint)
3702 {
3703         struct socket *so;
3704
3705         so = kn->kn_fp->f_data;
3706
3707         if (SOLISTENING(so))
3708                 return (0);
3709
3710         SOCKBUF_LOCK_ASSERT(&so->so_snd);
3711         kn->kn_data = sbspace(&so->so_snd);
3712
3713         hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE);
3714
3715         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3716                 kn->kn_flags |= EV_EOF;
3717                 kn->kn_fflags = so->so_error;
3718                 return (1);
3719         } else if (so->so_error)        /* temporary udp error */
3720                 return (1);
3721         else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3722             (so->so_proto->pr_flags & PR_CONNREQUIRED))
3723                 return (0);
3724         else if (kn->kn_sfflags & NOTE_LOWAT)
3725                 return (kn->kn_data >= kn->kn_sdata);
3726         else
3727                 return (kn->kn_data >= so->so_snd.sb_lowat);
3728 }
3729
3730 static int
3731 filt_soempty(struct knote *kn, long hint)
3732 {
3733         struct socket *so;
3734
3735         so = kn->kn_fp->f_data;
3736
3737         if (SOLISTENING(so))
3738                 return (1);
3739
3740         SOCKBUF_LOCK_ASSERT(&so->so_snd);
3741         kn->kn_data = sbused(&so->so_snd);
3742
3743         if (kn->kn_data == 0)
3744                 return (1);
3745         else
3746                 return (0);
3747 }
3748
3749 int
3750 socheckuid(struct socket *so, uid_t uid)
3751 {
3752
3753         if (so == NULL)
3754                 return (EPERM);
3755         if (so->so_cred->cr_uid != uid)
3756                 return (EPERM);
3757         return (0);
3758 }
3759
3760 /*
3761  * These functions are used by protocols to notify the socket layer (and its
3762  * consumers) of state changes in the sockets driven by protocol-side events.
3763  */
3764
3765 /*
3766  * Procedures to manipulate state flags of socket and do appropriate wakeups.
3767  *
3768  * Normal sequence from the active (originating) side is that
3769  * soisconnecting() is called during processing of connect() call, resulting
3770  * in an eventual call to soisconnected() if/when the connection is
3771  * established.  When the connection is torn down soisdisconnecting() is
3772  * called during processing of disconnect() call, and soisdisconnected() is
3773  * called when the connection to the peer is totally severed.  The semantics
3774  * of these routines are such that connectionless protocols can call
3775  * soisconnected() and soisdisconnected() only, bypassing the in-progress
3776  * calls when setting up a ``connection'' takes no time.
3777  *
3778  * From the passive side, a socket is created with two queues of sockets:
3779  * so_incomp for connections in progress and so_comp for connections already
3780  * made and awaiting user acceptance.  As a protocol is preparing incoming
3781  * connections, it creates a socket structure queued on so_incomp by calling
3782  * sonewconn().  When the connection is established, soisconnected() is
3783  * called, and transfers the socket structure to so_comp, making it available
3784  * to accept().
3785  *
3786  * If a socket is closed with sockets on either so_incomp or so_comp, these
3787  * sockets are dropped.
3788  *
3789  * If higher-level protocols are implemented in the kernel, the wakeups done
3790  * here will sometimes cause software-interrupt process scheduling.
3791  */
3792 void
3793 soisconnecting(struct socket *so)
3794 {
3795
3796         SOCK_LOCK(so);
3797         so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3798         so->so_state |= SS_ISCONNECTING;
3799         SOCK_UNLOCK(so);
3800 }
3801
3802 void
3803 soisconnected(struct socket *so)
3804 {
3805
3806         SOCK_LOCK(so);
3807         so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3808         so->so_state |= SS_ISCONNECTED;
3809
3810         if (so->so_qstate == SQ_INCOMP) {
3811                 struct socket *head = so->so_listen;
3812                 int ret;
3813
3814                 KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so));
3815                 /*
3816                  * Promoting a socket from incomplete queue to complete, we
3817                  * need to go through reverse order of locking.  We first do
3818                  * trylock, and if that doesn't succeed, we go the hard way
3819                  * leaving a reference and rechecking consistency after proper
3820                  * locking.
3821                  */
3822                 if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
3823                         soref(head);
3824                         SOCK_UNLOCK(so);
3825                         SOLISTEN_LOCK(head);
3826                         SOCK_LOCK(so);
3827                         if (__predict_false(head != so->so_listen)) {
3828                                 /*
3829                                  * The socket went off the listen queue,
3830                                  * should be lost race to close(2) of sol.
3831                                  * The socket is about to soabort().
3832                                  */
3833                                 SOCK_UNLOCK(so);
3834                                 sorele(head);
3835                                 return;
3836                         }
3837                         /* Not the last one, as so holds a ref. */
3838                         refcount_release(&head->so_count);
3839                 }
3840 again:
3841                 if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3842                         TAILQ_REMOVE(&head->sol_incomp, so, so_list);
3843                         head->sol_incqlen--;
3844                         TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
3845                         head->sol_qlen++;
3846                         so->so_qstate = SQ_COMP;
3847                         SOCK_UNLOCK(so);
3848                         solisten_wakeup(head);  /* unlocks */
3849                 } else {
3850                         SOCKBUF_LOCK(&so->so_rcv);
3851                         soupcall_set(so, SO_RCV,
3852                             head->sol_accept_filter->accf_callback,
3853                             head->sol_accept_filter_arg);
3854                         so->so_options &= ~SO_ACCEPTFILTER;
3855                         ret = head->sol_accept_filter->accf_callback(so,
3856                             head->sol_accept_filter_arg, M_NOWAIT);
3857                         if (ret == SU_ISCONNECTED) {
3858                                 soupcall_clear(so, SO_RCV);
3859                                 SOCKBUF_UNLOCK(&so->so_rcv);
3860                                 goto again;
3861                         }
3862                         SOCKBUF_UNLOCK(&so->so_rcv);
3863                         SOCK_UNLOCK(so);
3864                         SOLISTEN_UNLOCK(head);
3865                 }
3866                 return;
3867         }
3868         SOCK_UNLOCK(so);
3869         wakeup(&so->so_timeo);
3870         sorwakeup(so);
3871         sowwakeup(so);
3872 }
3873
3874 void
3875 soisdisconnecting(struct socket *so)
3876 {
3877
3878         SOCK_LOCK(so);
3879         so->so_state &= ~SS_ISCONNECTING;
3880         so->so_state |= SS_ISDISCONNECTING;
3881
3882         if (!SOLISTENING(so)) {
3883                 SOCKBUF_LOCK(&so->so_rcv);
3884                 socantrcvmore_locked(so);
3885                 SOCKBUF_LOCK(&so->so_snd);
3886                 socantsendmore_locked(so);
3887         }
3888         SOCK_UNLOCK(so);
3889         wakeup(&so->so_timeo);
3890 }
3891
3892 void
3893 soisdisconnected(struct socket *so)
3894 {
3895
3896         SOCK_LOCK(so);
3897         so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3898         so->so_state |= SS_ISDISCONNECTED;
3899
3900         if (!SOLISTENING(so)) {
3901                 SOCK_UNLOCK(so);
3902                 SOCKBUF_LOCK(&so->so_rcv);
3903                 socantrcvmore_locked(so);
3904                 SOCKBUF_LOCK(&so->so_snd);
3905                 sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
3906                 socantsendmore_locked(so);
3907         } else
3908                 SOCK_UNLOCK(so);
3909         wakeup(&so->so_timeo);
3910 }
3911
3912 /*
3913  * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3914  */
3915 struct sockaddr *
3916 sodupsockaddr(const struct sockaddr *sa, int mflags)
3917 {
3918         struct sockaddr *sa2;
3919
3920         sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3921         if (sa2)
3922                 bcopy(sa, sa2, sa->sa_len);
3923         return sa2;
3924 }
3925
3926 /*
3927  * Register per-socket destructor.
3928  */
3929 void
3930 sodtor_set(struct socket *so, so_dtor_t *func)
3931 {
3932
3933         SOCK_LOCK_ASSERT(so);
3934         so->so_dtor = func;
3935 }
3936
3937 /*
3938  * Register per-socket buffer upcalls.
3939  */
3940 void
3941 soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg)
3942 {
3943         struct sockbuf *sb;
3944
3945         KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
3946
3947         switch (which) {
3948         case SO_RCV:
3949                 sb = &so->so_rcv;
3950                 break;
3951         case SO_SND:
3952                 sb = &so->so_snd;
3953                 break;
3954         default:
3955                 panic("soupcall_set: bad which");
3956         }
3957         SOCKBUF_LOCK_ASSERT(sb);
3958         sb->sb_upcall = func;
3959         sb->sb_upcallarg = arg;
3960         sb->sb_flags |= SB_UPCALL;
3961 }
3962
3963 void
3964 soupcall_clear(struct socket *so, int which)
3965 {
3966         struct sockbuf *sb;
3967
3968         KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
3969
3970         switch (which) {
3971         case SO_RCV:
3972                 sb = &so->so_rcv;
3973                 break;
3974         case SO_SND:
3975                 sb = &so->so_snd;
3976                 break;
3977         default:
3978                 panic("soupcall_clear: bad which");
3979         }
3980         SOCKBUF_LOCK_ASSERT(sb);
3981         KASSERT(sb->sb_upcall != NULL,
3982             ("%s: so %p no upcall to clear", __func__, so));
3983         sb->sb_upcall = NULL;
3984         sb->sb_upcallarg = NULL;
3985         sb->sb_flags &= ~SB_UPCALL;
3986 }
3987
3988 void
3989 solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg)
3990 {
3991
3992         SOLISTEN_LOCK_ASSERT(so);
3993         so->sol_upcall = func;
3994         so->sol_upcallarg = arg;
3995 }
3996
3997 static void
3998 so_rdknl_lock(void *arg)
3999 {
4000         struct socket *so = arg;
4001
4002         if (SOLISTENING(so))
4003                 SOCK_LOCK(so);
4004         else
4005                 SOCKBUF_LOCK(&so->so_rcv);
4006 }
4007
4008 static void
4009 so_rdknl_unlock(void *arg)
4010 {
4011         struct socket *so = arg;
4012
4013         if (SOLISTENING(so))
4014                 SOCK_UNLOCK(so);
4015         else
4016                 SOCKBUF_UNLOCK(&so->so_rcv);
4017 }
4018
4019 static void
4020 so_rdknl_assert_locked(void *arg)
4021 {
4022         struct socket *so = arg;
4023
4024         if (SOLISTENING(so))
4025                 SOCK_LOCK_ASSERT(so);
4026         else
4027                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
4028 }
4029
4030 static void
4031 so_rdknl_assert_unlocked(void *arg)
4032 {
4033         struct socket *so = arg;
4034
4035         if (SOLISTENING(so))
4036                 SOCK_UNLOCK_ASSERT(so);
4037         else
4038                 SOCKBUF_UNLOCK_ASSERT(&so->so_rcv);
4039 }
4040
4041 static void
4042 so_wrknl_lock(void *arg)
4043 {
4044         struct socket *so = arg;
4045
4046         if (SOLISTENING(so))
4047                 SOCK_LOCK(so);
4048         else
4049                 SOCKBUF_LOCK(&so->so_snd);
4050 }
4051
4052 static void
4053 so_wrknl_unlock(void *arg)
4054 {
4055         struct socket *so = arg;
4056
4057         if (SOLISTENING(so))
4058                 SOCK_UNLOCK(so);
4059         else
4060                 SOCKBUF_UNLOCK(&so->so_snd);
4061 }
4062
4063 static void
4064 so_wrknl_assert_locked(void *arg)
4065 {
4066         struct socket *so = arg;
4067
4068         if (SOLISTENING(so))
4069                 SOCK_LOCK_ASSERT(so);
4070         else
4071                 SOCKBUF_LOCK_ASSERT(&so->so_snd);
4072 }
4073
4074 static void
4075 so_wrknl_assert_unlocked(void *arg)
4076 {
4077         struct socket *so = arg;
4078
4079         if (SOLISTENING(so))
4080                 SOCK_UNLOCK_ASSERT(so);
4081         else
4082                 SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
4083 }
4084
4085 /*
4086  * Create an external-format (``xsocket'') structure using the information in
4087  * the kernel-format socket structure pointed to by so.  This is done to
4088  * reduce the spew of irrelevant information over this interface, to isolate
4089  * user code from changes in the kernel structure, and potentially to provide
4090  * information-hiding if we decide that some of this information should be
4091  * hidden from users.
4092  */
4093 void
4094 sotoxsocket(struct socket *so, struct xsocket *xso)
4095 {
4096
4097         bzero(xso, sizeof(*xso));
4098         xso->xso_len = sizeof *xso;
4099         xso->xso_so = (uintptr_t)so;
4100         xso->so_type = so->so_type;
4101         xso->so_options = so->so_options;
4102         xso->so_linger = so->so_linger;
4103         xso->so_state = so->so_state;
4104         xso->so_pcb = (uintptr_t)so->so_pcb;
4105         xso->xso_protocol = so->so_proto->pr_protocol;
4106         xso->xso_family = so->so_proto->pr_domain->dom_family;
4107         xso->so_timeo = so->so_timeo;
4108         xso->so_error = so->so_error;
4109         xso->so_uid = so->so_cred->cr_uid;
4110         xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
4111         if (SOLISTENING(so)) {
4112                 xso->so_qlen = so->sol_qlen;
4113                 xso->so_incqlen = so->sol_incqlen;
4114                 xso->so_qlimit = so->sol_qlimit;
4115                 xso->so_oobmark = 0;
4116         } else {
4117                 xso->so_state |= so->so_qstate;
4118                 xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0;
4119                 xso->so_oobmark = so->so_oobmark;
4120                 sbtoxsockbuf(&so->so_snd, &xso->so_snd);
4121                 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
4122         }
4123 }
4124
4125 struct sockbuf *
4126 so_sockbuf_rcv(struct socket *so)
4127 {
4128
4129         return (&so->so_rcv);
4130 }
4131
4132 struct sockbuf *
4133 so_sockbuf_snd(struct socket *so)
4134 {
4135
4136         return (&so->so_snd);
4137 }
4138
4139 int
4140 so_state_get(const struct socket *so)
4141 {
4142
4143         return (so->so_state);
4144 }
4145
4146 void
4147 so_state_set(struct socket *so, int val)
4148 {
4149
4150         so->so_state = val;
4151 }
4152
4153 int
4154 so_options_get(const struct socket *so)
4155 {
4156
4157         return (so->so_options);
4158 }
4159
4160 void
4161 so_options_set(struct socket *so, int val)
4162 {
4163
4164         so->so_options = val;
4165 }
4166
4167 int
4168 so_error_get(const struct socket *so)
4169 {
4170
4171         return (so->so_error);
4172 }
4173
4174 void
4175 so_error_set(struct socket *so, int val)
4176 {
4177
4178         so->so_error = val;
4179 }
4180
4181 int
4182 so_linger_get(const struct socket *so)
4183 {
4184
4185         return (so->so_linger);
4186 }
4187
4188 void
4189 so_linger_set(struct socket *so, int val)
4190 {
4191
4192         KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz),
4193             ("%s: val %d out of range", __func__, val));
4194
4195         so->so_linger = val;
4196 }
4197
4198 struct protosw *
4199 so_protosw_get(const struct socket *so)
4200 {
4201
4202         return (so->so_proto);
4203 }
4204
4205 void
4206 so_protosw_set(struct socket *so, struct protosw *val)
4207 {
4208
4209         so->so_proto = val;
4210 }
4211
4212 void
4213 so_sorwakeup(struct socket *so)
4214 {
4215
4216         sorwakeup(so);
4217 }
4218
4219 void
4220 so_sowwakeup(struct socket *so)
4221 {
4222
4223         sowwakeup(so);
4224 }
4225
4226 void
4227 so_sorwakeup_locked(struct socket *so)
4228 {
4229
4230         sorwakeup_locked(so);
4231 }
4232
4233 void
4234 so_sowwakeup_locked(struct socket *so)
4235 {
4236
4237         sowwakeup_locked(so);
4238 }
4239
4240 void
4241 so_lock(struct socket *so)
4242 {
4243
4244         SOCK_LOCK(so);
4245 }
4246
4247 void
4248 so_unlock(struct socket *so)
4249 {
4250
4251         SOCK_UNLOCK(so);
4252 }