sys/kern/uipc_socket.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
   3  *      The Regents of the University of California.
   4  * Copyright (c) 2004 The FreeBSD Foundation
   5  * Copyright (c) 2004-2008 Robert N. M. Watson
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  * 4. Neither the name of the University nor the names of its contributors
  17  *    may be used to endorse or promote products derived from this software
  18  *    without specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30  * SUCH DAMAGE.
  31  *
  32  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
  33  */
  34
  35 /*
  36  * Comments on the socket life cycle:
  37  *
   38  * soalloc() sets up socket layer state for a socket, called only by
  39  * socreate() and sonewconn().  Socket layer private.
  40  *
  41  * sodealloc() tears down socket layer state for a socket, called only by
   42  * sofree(), socreate() and sonewconn().  Socket layer private.
  43  *
  44  * pru_attach() associates protocol layer state with an allocated socket;
  45  * called only once, may fail, aborting socket allocation.  This is called
  46  * from socreate() and sonewconn().  Socket layer private.
  47  *
  48  * pru_detach() disassociates protocol layer state from an attached socket,
  49  * and will be called exactly once for sockets in which pru_attach() has
  50  * been successfully called.  If pru_attach() returned an error,
  51  * pru_detach() will not be called.  Socket layer private.
  52  *
  53  * pru_abort() and pru_close() notify the protocol layer that the last
  54  * consumer of a socket is starting to tear down the socket, and that the
  55  * protocol should terminate the connection.  Historically, pru_abort() also
  56  * detached protocol state from the socket state, but this is no longer the
  57  * case.
  58  *
  59  * socreate() creates a socket and attaches protocol state.  This is a public
  60  * interface that may be used by socket layer consumers to create new
  61  * sockets.
  62  *
  63  * sonewconn() creates a socket and attaches protocol state.  This is a
   64  * public interface that may be used by protocols to create new sockets when
  65  * a new connection is received and will be available for accept() on a
  66  * listen socket.
  67  *
  68  * soclose() destroys a socket after possibly waiting for it to disconnect.
  69  * This is a public interface that socket consumers should use to close and
  70  * release a socket when done with it.
  71  *
  72  * soabort() destroys a socket without waiting for it to disconnect (used
  73  * only for incoming connections that are already partially or fully
  74  * connected).  This is used internally by the socket layer when clearing
  75  * listen socket queues (due to overflow or close on the listen socket), but
  76  * is also a public interface protocols may use to abort connections in
  77  * their incomplete listen queues should they no longer be required.  Sockets
  78  * placed in completed connection listen queues should not be aborted for
  79  * reasons described in the comment above the soclose() implementation.  This
  80  * is not a general purpose close routine, and except in the specific
  81  * circumstances described here, should not be used.
  82  *
  83  * sofree() will free a socket and its protocol state if all references on
  84  * the socket have been released, and is the public interface to attempt to
  85  * free a socket when a reference is removed.  This is a socket layer private
  86  * interface.
  87  *
  88  * NOTE: In addition to socreate() and soclose(), which provide a single
  89  * socket reference to the consumer to be managed as required, there are two
  90  * calls to explicitly manage socket references, soref(), and sorele().
  91  * Currently, these are generally required only when transitioning a socket
  92  * from a listen queue to a file descriptor, in order to prevent garbage
  93  * collection of the socket at an untimely moment.  For a number of reasons,
  94  * these interfaces are not preferred, and should be avoided.
  95  *
  96  * NOTE: With regard to VNETs the general rule is that callers do not set
  97  * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
  98  * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
  99  * and sorflush(), which are usually called from a pre-set VNET context.
 100  * sopoll() currently does not need a VNET context to be set.
 101  */
 102
 103 #include <sys/cdefs.h>
 104 __FBSDID("$FreeBSD$");
 105
 106 #include "opt_inet.h"
 107 #include "opt_inet6.h"
 108 #include "opt_compat.h"
 109
 110 #include <sys/param.h>
 111 #include <sys/systm.h>
 112 #include <sys/fcntl.h>
 113 #include <sys/limits.h>
 114 #include <sys/lock.h>
 115 #include <sys/mac.h>
 116 #include <sys/malloc.h>
 117 #include <sys/mbuf.h>
 118 #include <sys/mutex.h>
 119 #include <sys/domain.h>
 120 #include <sys/file.h>                   /* for struct knote */
 121 #include <sys/kernel.h>
 122 #include <sys/event.h>
 123 #include <sys/eventhandler.h>
 124 #include <sys/poll.h>
 125 #include <sys/proc.h>
 126 #include <sys/protosw.h>
 127 #include <sys/socket.h>
 128 #include <sys/socketvar.h>
 129 #include <sys/resourcevar.h>
 130 #include <net/route.h>
 131 #include <sys/signalvar.h>
 132 #include <sys/stat.h>
 133 #include <sys/sx.h>
 134 #include <sys/sysctl.h>
 135 #include <sys/uio.h>
 136 #include <sys/jail.h>
 137 #include <sys/syslog.h>
 138 #include <netinet/in.h>
 139
 140 #include <net/vnet.h>
 141
 142 #include <security/mac/mac_framework.h>
 143
 144 #include <vm/uma.h>
 145
 146 #ifdef COMPAT_FREEBSD32
 147 #include <sys/mount.h>
 148 #include <sys/sysent.h>
 149 #include <compat/freebsd32/freebsd32.h>
 150 #endif
 151
/* Forward declarations of file-local helpers referenced before definition. */
static int      soreceive_rcvoob(struct socket *so, struct uio *uio,
                    int flags);

/* Knote filter callbacks wired into the filterops tables below. */
static void     filt_sordetach(struct knote *kn);
static int      filt_soread(struct knote *kn, long hint);
static void     filt_sowdetach(struct knote *kn);
static int      filt_sowrite(struct knote *kn, long hint);
static int      filt_solisten(struct knote *kn, long hint);

/*
 * Filter operation tables for socket knotes.  All three are marked as
 * descriptor-backed (f_isfd = 1).  The listen and read tables share
 * filt_sordetach() as their detach routine; the write table uses
 * filt_sowdetach().
 */
static struct filterops solisten_filtops = {
        .f_isfd = 1,
        .f_detach = filt_sordetach,
        .f_event = filt_solisten,
};
static struct filterops soread_filtops = {
        .f_isfd = 1,
        .f_detach = filt_sordetach,
        .f_event = filt_soread,
};
static struct filterops sowrite_filtops = {
        .f_isfd = 1,
        .f_detach = filt_sowdetach,
        .f_event = filt_sowrite,
};

so_gen_t        so_gencnt;      /* generation count for sockets */

/* malloc(9) types declared by this file. */
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

/*
 * Assert that a VNET context (curvnet) is set.  The socket pointer is only
 * used to make the assertion message more useful.
 */
#define VNET_SO_ASSERT(so)                                              \
        VNET_ASSERT(curvnet != NULL,                                    \
            ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
 185
 186 /*
 187  * Limit on the number of connections in the listen queue waiting
 188  * for accept(2).
 189  * NB: The orginal sysctl somaxconn is still available but hidden
 190  * to prevent confusion about the actual purpose of this number.
 191  */
 192 static int somaxconn = SOMAXCONN;
 193
 194 static int
 195 sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
 196 {
 197         int error;
 198         int val;
 199
 200         val = somaxconn;
 201         error = sysctl_handle_int(oidp, &val, 0, req);
 202         if (error || !req->newptr )
 203                 return (error);
 204
 205         if (val < 1 || val > USHRT_MAX)
 206                 return (EINVAL);
 207
 208         somaxconn = val;
 209         return (0);
 210 }
 211 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW,
 212     0, sizeof(int), sysctl_somaxconn, "I",
 213     "Maximum listen socket pending connection accept queue size");
 214 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
 215     CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP,
 216     0, sizeof(int), sysctl_somaxconn, "I",
 217     "Maximum listen socket pending connection accept queue size (compat)");
 218
/*
 * Number of sockets currently allocated: incremented in soalloc() and
 * decremented in sodealloc(), both under so_global_mtx.  Exported
 * read-only as kern.ipc.numopensockets.
 */
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.  soalloc() and sodealloc() also adjust the per-vnet
 * vnet_sockcnt counter while holding it.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * General IPC sysctl name space, used by sockets and a variety of other IPC
 * types.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
 242
 243 /*
 244  * Initialize the socket subsystem and set up the socket
 245  * memory allocator.
 246  */
 247 static uma_zone_t socket_zone;
 248 int     maxsockets;
 249
 250 static void
 251 socket_zone_change(void *tag)
 252 {
 253
 254         maxsockets = uma_zone_set_max(socket_zone, maxsockets);
 255 }
 256
 257 static void
 258 socket_init(void *tag)
 259 {
 260
 261         socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
 262             NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 263         maxsockets = uma_zone_set_max(socket_zone, maxsockets);
 264         uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
 265         EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
 266             EVENTHANDLER_PRI_FIRST);
 267 }
 268 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
 269
 270 /*
 271  * Initialise maxsockets.  This SYSINIT must be run after
 272  * tunable_mbinit().
 273  */
 274 static void
 275 init_maxsockets(void *ignored)
 276 {
 277
 278         TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
 279         maxsockets = imax(maxsockets, maxfiles);
 280 }
 281 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
 282
 283 /*
 284  * Sysctl to get and set the maximum global sockets limit.  Notify protocols
 285  * of the change so that they can update their dependent limits as required.
 286  */
 287 static int
 288 sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
 289 {
 290         int error, newmaxsockets;
 291
 292         newmaxsockets = maxsockets;
 293         error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
 294         if (error == 0 && req->newptr) {
 295                 if (newmaxsockets > maxsockets &&
 296                     newmaxsockets <= maxfiles) {
 297                         maxsockets = newmaxsockets;
 298                         EVENTHANDLER_INVOKE(maxsockets_change);
 299                 } else
 300                         error = EINVAL;
 301         }
 302         return (error);
 303 }
 304 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
 305     &maxsockets, 0, sysctl_maxsockets, "IU",
 306     "Maximum number of sockets avaliable");
 307
 308 /*
 309  * Socket operation routines.  These routines are called by the routines in
 310  * sys_socket.c or from a system process, and implement the semantics of
 311  * socket operations by switching out to the protocol specific routines.
 312  */
 313
 314 /*
 315  * Get a socket structure from our zone, and initialize it.  Note that it
 316  * would probably be better to allocate socket and PCB at the same time, but
 317  * I'm not convinced that all the protocols can be easily modified to do
 318  * this.
 319  *
 320  * soalloc() returns a socket with a ref count of 0.
 321  */
 322 static struct socket *
 323 soalloc(struct vnet *vnet)
 324 {
 325         struct socket *so;
 326
 327         so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
 328         if (so == NULL)
 329                 return (NULL);
 330 #ifdef MAC
 331         if (mac_socket_init(so, M_NOWAIT) != 0) {
 332                 uma_zfree(socket_zone, so);
 333                 return (NULL);
 334         }
 335 #endif
 336         SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
 337         SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
 338         sx_init(&so->so_snd.sb_sx, "so_snd_sx");
 339         sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
 340         TAILQ_INIT(&so->so_aiojobq);
 341         mtx_lock(&so_global_mtx);
 342         so->so_gencnt = ++so_gencnt;
 343         ++numopensockets;
 344 #ifdef VIMAGE
 345         VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
 346             __func__, __LINE__, so));
 347         vnet->vnet_sockcnt++;
 348         so->so_vnet = vnet;
 349 #endif
 350         mtx_unlock(&so_global_mtx);
 351         return (so);
 352 }
 353
 354 /*
 355  * Free the storage associated with a socket at the socket layer, tear down
 356  * locks, labels, etc.  All protocol state is assumed already to have been
 357  * torn down (and possibly never set up) by the caller.
 358  */
 359 static void
 360 sodealloc(struct socket *so)
 361 {
 362
 363         KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
 364         KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
 365
 366         mtx_lock(&so_global_mtx);
 367         so->so_gencnt = ++so_gencnt;
 368         --numopensockets;       /* Could be below, but faster here. */
 369 #ifdef VIMAGE
 370         VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
 371             __func__, __LINE__, so));
 372         so->so_vnet->vnet_sockcnt--;
 373 #endif
 374         mtx_unlock(&so_global_mtx);
 375         if (so->so_rcv.sb_hiwat)
 376                 (void)chgsbsize(so->so_cred->cr_uidinfo,
 377                     &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
 378         if (so->so_snd.sb_hiwat)
 379                 (void)chgsbsize(so->so_cred->cr_uidinfo,
 380                     &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
 381         /* remove acccept filter if one is present. */
 382         if (so->so_accf != NULL)
 383                 do_setopt_accept_filter(so, NULL);
 384 #ifdef MAC
 385         mac_socket_destroy(so);
 386 #endif
 387         crfree(so->so_cred);
 388         sx_destroy(&so->so_snd.sb_sx);
 389         sx_destroy(&so->so_rcv.sb_sx);
 390         SOCKBUF_LOCK_DESTROY(&so->so_snd);
 391         SOCKBUF_LOCK_DESTROY(&so->so_rcv);
 392         uma_zfree(socket_zone, so);
 393 }
 394
 395 /*
 396  * socreate returns a socket with a ref count of 1.  The socket should be
 397  * closed with soclose().
 398  */
 399 int
 400 socreate(int dom, struct socket **aso, int type, int proto,
 401     struct ucred *cred, struct thread *td)
 402 {
 403         struct protosw *prp;
 404         struct socket *so;
 405         int error;
 406
 407         if (proto)
 408                 prp = pffindproto(dom, proto, type);
 409         else
 410                 prp = pffindtype(dom, type);
 411
 412         if (prp == NULL) {
 413                 /* No support for domain. */
 414                 if (pffinddomain(dom) == NULL)
 415                         return (EAFNOSUPPORT);
 416                 /* No support for socket type. */
 417                 if (proto == 0 && type != 0)
 418                         return (EPROTOTYPE);
 419                 return (EPROTONOSUPPORT);
 420         }
 421         if (prp->pr_usrreqs->pru_attach == NULL ||
 422             prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
 423                 return (EPROTONOSUPPORT);
 424
 425         if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
 426                 return (EPROTONOSUPPORT);
 427
 428         if (prp->pr_type != type)
 429                 return (EPROTOTYPE);
 430         so = soalloc(CRED_TO_VNET(cred));
 431         if (so == NULL)
 432                 return (ENOBUFS);
 433
 434         TAILQ_INIT(&so->so_incomp);
 435         TAILQ_INIT(&so->so_comp);
 436         so->so_type = type;
 437         so->so_cred = crhold(cred);
 438         if ((prp->pr_domain->dom_family == PF_INET) ||
 439             (prp->pr_domain->dom_family == PF_INET6) ||
 440             (prp->pr_domain->dom_family == PF_ROUTE))
 441                 so->so_fibnum = td->td_proc->p_fibnum;
 442         else
 443                 so->so_fibnum = 0;
 444         so->so_proto = prp;
 445 #ifdef MAC
 446         mac_socket_create(cred, so);
 447 #endif
 448         knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
 449         knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
 450         so->so_count = 1;
 451         /*
 452          * Auto-sizing of socket buffers is managed by the protocols and
 453          * the appropriate flags must be set in the pru_attach function.
 454          */
 455         CURVNET_SET(so->so_vnet);
 456         error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
 457         CURVNET_RESTORE();
 458         if (error) {
 459                 KASSERT(so->so_count == 1, ("socreate: so_count %d",
 460                     so->so_count));
 461                 so->so_count = 0;
 462                 sodealloc(so);
 463                 return (error);
 464         }
 465         *aso = so;
 466         return (0);
 467 }
 468
#ifdef REGRESSION
/*
 * Regression-test knob, compiled only with REGRESSION.  While nonzero (the
 * default) sonewconn() applies its early listen queue overflow check; when
 * set to 0 that early check is bypassed.
 */
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif
 474
 475 /*
 476  * When an attempt at a new connection is noted on a socket which accepts
 477  * connections, sonewconn is called.  If the connection is possible (subject
 478  * to space constraints, etc.) then we allocate a new structure, propoerly
 479  * linked into the data structure of the original socket, and return this.
 480  * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
 481  *
 482  * Note: the ref count on the socket is 0 on return.
 483  */
 484 struct socket *
 485 sonewconn(struct socket *head, int connstatus)
 486 {
 487         static struct timeval lastover;
 488         static struct timeval overinterval = { 60, 0 };
 489         static int overcount;
 490
 491         struct socket *so;
 492         int over;
 493
 494         ACCEPT_LOCK();
 495         over = (head->so_qlen > 3 * head->so_qlimit / 2);
 496         ACCEPT_UNLOCK();
 497 #ifdef REGRESSION
 498         if (regression_sonewconn_earlytest && over) {
 499 #else
 500         if (over) {
 501 #endif
 502                 overcount++;
 503
 504                 if (ratecheck(&lastover, &overinterval)) {
 505                         log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
 506                             "%i already in queue awaiting acceptance "
 507                             "(%d occurrences)\n",
 508                             __func__, head->so_pcb, head->so_qlen, overcount);
 509
 510                         overcount = 0;
 511                 }
 512
 513                 return (NULL);
 514         }
 515         VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
 516             __func__, __LINE__, head));
 517         so = soalloc(head->so_vnet);
 518         if (so == NULL) {
 519                 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
 520                     "limit reached or out of memory\n",
 521                     __func__, head->so_pcb);
 522                 return (NULL);
 523         }
 524         if ((head->so_options & SO_ACCEPTFILTER) != 0)
 525                 connstatus = 0;
 526         so->so_head = head;
 527         so->so_type = head->so_type;
 528         so->so_options = head->so_options &~ SO_ACCEPTCONN;
 529         so->so_linger = head->so_linger;
 530         so->so_state = head->so_state | SS_NOFDREF;
 531         so->so_fibnum = head->so_fibnum;
 532         so->so_proto = head->so_proto;
 533         so->so_cred = crhold(head->so_cred);
 534 #ifdef MAC
 535         mac_socket_newconn(head, so);
 536 #endif
 537         knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
 538         knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
 539         VNET_SO_ASSERT(head);
 540         if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
 541                 sodealloc(so);
 542                 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
 543                     __func__, head->so_pcb);
 544                 return (NULL);
 545         }
 546         if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
 547                 sodealloc(so);
 548                 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
 549                     __func__, head->so_pcb);
 550                 return (NULL);
 551         }
 552         so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
 553         so->so_snd.sb_lowat = head->so_snd.sb_lowat;
 554         so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
 555         so->so_snd.sb_timeo = head->so_snd.sb_timeo;
 556         so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
 557         so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
 558         so->so_state |= connstatus;
 559         ACCEPT_LOCK();
 560         /*
 561          * The accept socket may be tearing down but we just
 562          * won a race on the ACCEPT_LOCK.
 563          * However, if sctp_peeloff() is called on a 1-to-many
 564          * style socket, the SO_ACCEPTCONN doesn't need to be set.
 565          */
 566         if (!(head->so_options & SO_ACCEPTCONN) &&
 567             ((head->so_proto->pr_protocol != IPPROTO_SCTP) ||
 568              (head->so_type != SOCK_SEQPACKET))) {
 569                 SOCK_LOCK(so);
 570                 so->so_head = NULL;
 571                 sofree(so);             /* NB: returns ACCEPT_UNLOCK'ed. */
 572                 return (NULL);
 573         }
 574         if (connstatus) {
 575                 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
 576                 so->so_qstate |= SQ_COMP;
 577                 head->so_qlen++;
 578         } else {
 579                 /*
 580                  * Keep removing sockets from the head until there's room for
 581                  * us to insert on the tail.  In pre-locking revisions, this
 582                  * was a simple if(), but as we could be racing with other
 583                  * threads and soabort() requires dropping locks, we must
 584                  * loop waiting for the condition to be true.
 585                  */
 586                 while (head->so_incqlen > head->so_qlimit) {
 587                         struct socket *sp;
 588                         sp = TAILQ_FIRST(&head->so_incomp);
 589                         TAILQ_REMOVE(&head->so_incomp, sp, so_list);
 590                         head->so_incqlen--;
 591                         sp->so_qstate &= ~SQ_INCOMP;
 592                         sp->so_head = NULL;
 593                         ACCEPT_UNLOCK();
 594                         soabort(sp);
 595                         ACCEPT_LOCK();
 596                 }
 597                 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
 598                 so->so_qstate |= SQ_INCOMP;
 599                 head->so_incqlen++;
 600         }
 601         ACCEPT_UNLOCK();
 602         if (connstatus) {
 603                 sorwakeup(head);
 604                 wakeup_one(&head->so_timeo);
 605         }
 606         return (so);
 607 }
 608
 609 int
 610 sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
 611 {
 612         int error;
 613
 614         CURVNET_SET(so->so_vnet);
 615         error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
 616         CURVNET_RESTORE();
 617         return (error);
 618 }
 619
 620 int
 621 sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
 622 {
 623         int error;
 624
 625         CURVNET_SET(so->so_vnet);
 626         error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td);
 627         CURVNET_RESTORE();
 628         return (error);
 629 }
 630
 631 /*
 632  * solisten() transitions a socket from a non-listening state to a listening
 633  * state, but can also be used to update the listen queue depth on an
 634  * existing listen socket.  The protocol will call back into the sockets
 635  * layer using solisten_proto_check() and solisten_proto() to check and set
 636  * socket-layer listen state.  Call backs are used so that the protocol can
 637  * acquire both protocol and socket layer locks in whatever order is required
 638  * by the protocol.
 639  *
 640  * Protocol implementors are advised to hold the socket lock across the
 641  * socket-layer test and set to avoid races at the socket layer.
 642  */
 643 int
 644 solisten(struct socket *so, int backlog, struct thread *td)
 645 {
 646         int error;
 647
 648         CURVNET_SET(so->so_vnet);
 649         error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
 650         CURVNET_RESTORE();
 651         return (error);
 652 }
 653
 654 int
 655 solisten_proto_check(struct socket *so)
 656 {
 657
 658         SOCK_LOCK_ASSERT(so);
 659
 660         if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
 661             SS_ISDISCONNECTING))
 662                 return (EINVAL);
 663         return (0);
 664 }
 665
 666 void
 667 solisten_proto(struct socket *so, int backlog)
 668 {
 669
 670         SOCK_LOCK_ASSERT(so);
 671
 672         if (backlog < 0 || backlog > somaxconn)
 673                 backlog = somaxconn;
 674         so->so_qlimit = backlog;
 675         so->so_options |= SO_ACCEPTCONN;
 676 }
 677
 678 /*
 679  * Evaluate the reference count and named references on a socket; if no
 680  * references remain, free it.  This should be called whenever a reference is
 681  * released, such as in sorele(), but also when named reference flags are
 682  * cleared in socket or protocol code.
 683  *
 684  * sofree() will free the socket if:
 685  *
 686  * - There are no outstanding file descriptor references or related consumers
 687  *   (so_count == 0).
 688  *
 689  * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 690  *
 691  * - The protocol does not have an outstanding strong reference on the socket
 692  *   (SS_PROTOREF).
 693  *
 694  * - The socket is not in a completed connection queue, so a process has been
 695  *   notified that it is present.  If it is removed, the user process may
 696  *   block in accept() despite select() saying the socket was ready.
 697  */
 698 void
 699 sofree(struct socket *so)
 700 {
 701         struct protosw *pr = so->so_proto;
 702         struct socket *head;
 703
 704         ACCEPT_LOCK_ASSERT();
 705         SOCK_LOCK_ASSERT(so);
 706
 707         if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
 708             (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
 709                 SOCK_UNLOCK(so);
 710                 ACCEPT_UNLOCK();
 711                 return;
 712         }
 713
 714         head = so->so_head;
 715         if (head != NULL) {
 716                 KASSERT((so->so_qstate & SQ_COMP) != 0 ||
 717                     (so->so_qstate & SQ_INCOMP) != 0,
 718                     ("sofree: so_head != NULL, but neither SQ_COMP nor "
 719                     "SQ_INCOMP"));
 720                 KASSERT((so->so_qstate & SQ_COMP) == 0 ||
 721                     (so->so_qstate & SQ_INCOMP) == 0,
 722                     ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
 723                 TAILQ_REMOVE(&head->so_incomp, so, so_list);
 724                 head->so_incqlen--;
 725                 so->so_qstate &= ~SQ_INCOMP;
 726                 so->so_head = NULL;
 727         }
 728         KASSERT((so->so_qstate & SQ_COMP) == 0 &&
 729             (so->so_qstate & SQ_INCOMP) == 0,
 730             ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
 731             so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
 732         if (so->so_options & SO_ACCEPTCONN) {
 733                 KASSERT((TAILQ_EMPTY(&so->so_comp)),
 734                     ("sofree: so_comp populated"));
 735                 KASSERT((TAILQ_EMPTY(&so->so_incomp)),
 736                     ("sofree: so_incomp populated"));
 737         }
 738         SOCK_UNLOCK(so);
 739         ACCEPT_UNLOCK();
 740
 741         VNET_SO_ASSERT(so);
 742         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
 743                 (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
 744         if (pr->pr_usrreqs->pru_detach != NULL)
 745                 (*pr->pr_usrreqs->pru_detach)(so);
 746
 747         /*
 748          * From this point on, we assume that no other references to this
 749          * socket exist anywhere else in the stack.  Therefore, no locks need
 750          * to be acquired or held.
 751          *
 752          * We used to do a lot of socket buffer and socket locking here, as
 753          * well as invoke sorflush() and perform wakeups.  The direct call to
 754          * dom_dispose() and sbrelease_internal() are an inlining of what was
 755          * necessary from sorflush().
 756          *
 757          * Notice that the socket buffer and kqueue state are torn down
 758          * before calling pru_detach.  This means that protocols shold not
 759          * assume they can perform socket wakeups, etc, in their detach code.
 760          */
 761         sbdestroy(&so->so_snd, so);
 762         sbdestroy(&so->so_rcv, so);
 763         seldrain(&so->so_snd.sb_sel);
 764         seldrain(&so->so_rcv.sb_sel);
 765         knlist_destroy(&so->so_rcv.sb_sel.si_note);
 766         knlist_destroy(&so->so_snd.sb_sel.si_note);
 767         sodealloc(so);
 768 }
 769
 770 /*
 771  * Close a socket on last file table reference removal.  Initiate disconnect
 772  * if connected.  Free socket when disconnect complete.
 773  *
 774  * This function will sorele() the socket.  Note that soclose() may be called
 775  * prior to the ref count reaching zero.  The actual socket structure will
 776  * not be freed until the ref count reaches zero.
 777  */
 778 int
 779 soclose(struct socket *so)
 780 {
 781         int error = 0;
 782
 783         KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
 784
 785         CURVNET_SET(so->so_vnet);
 786         funsetown(&so->so_sigio);
 787         if (so->so_state & SS_ISCONNECTED) {
 788                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
 789                         error = sodisconnect(so);
 790                         if (error) {
 791                                 if (error == ENOTCONN)
 792                                         error = 0;
 793                                 goto drop;
 794                         }
 795                 }
 796                 if (so->so_options & SO_LINGER) {
 797                         if ((so->so_state & SS_ISDISCONNECTING) &&
 798                             (so->so_state & SS_NBIO))
 799                                 goto drop;
 800                         while (so->so_state & SS_ISCONNECTED) {
 801                                 error = tsleep(&so->so_timeo,
 802                                     PSOCK | PCATCH, "soclos",
 803                                     so->so_linger * hz);
 804                                 if (error)
 805                                         break;
 806                         }
 807                 }
 808         }
 809
 810 drop:
 811         if (so->so_proto->pr_usrreqs->pru_close != NULL)
 812                 (*so->so_proto->pr_usrreqs->pru_close)(so);
 813         ACCEPT_LOCK();
 814         if (so->so_options & SO_ACCEPTCONN) {
 815                 struct socket *sp;
 816                 /*
 817                  * Prevent new additions to the accept queues due
 818                  * to ACCEPT_LOCK races while we are draining them.
 819                  */
 820                 so->so_options &= ~SO_ACCEPTCONN;
 821                 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
 822                         TAILQ_REMOVE(&so->so_incomp, sp, so_list);
 823                         so->so_incqlen--;
 824                         sp->so_qstate &= ~SQ_INCOMP;
 825                         sp->so_head = NULL;
 826                         ACCEPT_UNLOCK();
 827                         soabort(sp);
 828                         ACCEPT_LOCK();
 829                 }
 830                 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
 831                         TAILQ_REMOVE(&so->so_comp, sp, so_list);
 832                         so->so_qlen--;
 833                         sp->so_qstate &= ~SQ_COMP;
 834                         sp->so_head = NULL;
 835                         ACCEPT_UNLOCK();
 836                         soabort(sp);
 837                         ACCEPT_LOCK();
 838                 }
 839                 KASSERT((TAILQ_EMPTY(&so->so_comp)),
 840                     ("%s: so_comp populated", __func__));
 841                 KASSERT((TAILQ_EMPTY(&so->so_incomp)),
 842                     ("%s: so_incomp populated", __func__));
 843         }
 844         SOCK_LOCK(so);
 845         KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
 846         so->so_state |= SS_NOFDREF;
 847         sorele(so);                     /* NB: Returns with ACCEPT_UNLOCK(). */
 848         CURVNET_RESTORE();
 849         return (error);
 850 }
 851
 852 /*
 853  * soabort() is used to abruptly tear down a connection, such as when a
 854  * resource limit is reached (listen queue depth exceeded), or if a listen
 855  * socket is closed while there are sockets waiting to be accepted.
 856  *
 857  * This interface is tricky, because it is called on an unreferenced socket,
 858  * and must be called only by a thread that has actually removed the socket
 859  * from the listen queue it was on, or races with other threads are risked.
 860  *
 861  * This interface will call into the protocol code, so must not be called
 862  * with any socket locks held.  Protocols do call it while holding their own
 863  * recursible protocol mutexes, but this is something that should be subject
 864  * to review in the future.
 865  */
 866 void
 867 soabort(struct socket *so)
 868 {
 869
 870         /*
 871          * In as much as is possible, assert that no references to this
 872          * socket are held.  This is not quite the same as asserting that the
 873          * current thread is responsible for arranging for no references, but
 874          * is as close as we can get for now.
 875          */
 876         KASSERT(so->so_count == 0, ("soabort: so_count"));
 877         KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
 878         KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
 879         KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP"));
 880         KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
 881         VNET_SO_ASSERT(so);
 882
 883         if (so->so_proto->pr_usrreqs->pru_abort != NULL)
 884                 (*so->so_proto->pr_usrreqs->pru_abort)(so);
 885         ACCEPT_LOCK();
 886         SOCK_LOCK(so);
 887         sofree(so);
 888 }
 889
 890 int
 891 soaccept(struct socket *so, struct sockaddr **nam)
 892 {
 893         int error;
 894
 895         SOCK_LOCK(so);
 896         KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
 897         so->so_state &= ~SS_NOFDREF;
 898         SOCK_UNLOCK(so);
 899
 900         CURVNET_SET(so->so_vnet);
 901         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
 902         CURVNET_RESTORE();
 903         return (error);
 904 }
 905
 906 int
 907 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
 908 {
 909
 910         return (soconnectat(AT_FDCWD, so, nam, td));
 911 }
 912
 913 int
 914 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
 915 {
 916         int error;
 917
 918         if (so->so_options & SO_ACCEPTCONN)
 919                 return (EOPNOTSUPP);
 920
 921         CURVNET_SET(so->so_vnet);
 922         /*
 923          * If protocol is connection-based, can only connect once.
 924          * Otherwise, if connected, try to disconnect first.  This allows
 925          * user to disconnect by connecting to, e.g., a null address.
 926          */
 927         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
 928             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
 929             (error = sodisconnect(so)))) {
 930                 error = EISCONN;
 931         } else {
 932                 /*
 933                  * Prevent accumulated error from previous connection from
 934                  * biting us.
 935                  */
 936                 so->so_error = 0;
 937                 if (fd == AT_FDCWD) {
 938                         error = (*so->so_proto->pr_usrreqs->pru_connect)(so,
 939                             nam, td);
 940                 } else {
 941                         error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd,
 942                             so, nam, td);
 943                 }
 944         }
 945         CURVNET_RESTORE();
 946
 947         return (error);
 948 }
 949
 950 int
 951 soconnect2(struct socket *so1, struct socket *so2)
 952 {
 953         int error;
 954
 955         CURVNET_SET(so1->so_vnet);
 956         error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
 957         CURVNET_RESTORE();
 958         return (error);
 959 }
 960
 961 int
 962 sodisconnect(struct socket *so)
 963 {
 964         int error;
 965
 966         if ((so->so_state & SS_ISCONNECTED) == 0)
 967                 return (ENOTCONN);
 968         if (so->so_state & SS_ISDISCONNECTING)
 969                 return (EALREADY);
 970         VNET_SO_ASSERT(so);
 971         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
 972         return (error);
 973 }
 974
 975 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
 976
 977 int
 978 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
 979     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
 980 {
 981         long space;
 982         ssize_t resid;
 983         int clen = 0, error, dontroute;
 984
 985         KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
 986         KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
 987             ("sosend_dgram: !PR_ATOMIC"));
 988
 989         if (uio != NULL)
 990                 resid = uio->uio_resid;
 991         else
 992                 resid = top->m_pkthdr.len;
 993         /*
 994          * In theory resid should be unsigned.  However, space must be
 995          * signed, as it might be less than 0 if we over-committed, and we
 996          * must use a signed comparison of space and resid.  On the other
 997          * hand, a negative resid causes us to loop sending 0-length
 998          * segments to the protocol.
 999          */
1000         if (resid < 0) {
1001                 error = EINVAL;
1002                 goto out;
1003         }
1004
1005         dontroute =
1006             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
1007         if (td != NULL)
1008                 td->td_ru.ru_msgsnd++;
1009         if (control != NULL)
1010                 clen = control->m_len;
1011
1012         SOCKBUF_LOCK(&so->so_snd);
1013         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1014                 SOCKBUF_UNLOCK(&so->so_snd);
1015                 error = EPIPE;
1016                 goto out;
1017         }
1018         if (so->so_error) {
1019                 error = so->so_error;
1020                 so->so_error = 0;
1021                 SOCKBUF_UNLOCK(&so->so_snd);
1022                 goto out;
1023         }
1024         if ((so->so_state & SS_ISCONNECTED) == 0) {
1025                 /*
1026                  * `sendto' and `sendmsg' is allowed on a connection-based
1027                  * socket if it supports implied connect.  Return ENOTCONN if
1028                  * not connected and no address is supplied.
1029                  */
1030                 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1031                     (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1032                         if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1033                             !(resid == 0 && clen != 0)) {
1034                                 SOCKBUF_UNLOCK(&so->so_snd);
1035                                 error = ENOTCONN;
1036                                 goto out;
1037                         }
1038                 } else if (addr == NULL) {
1039                         if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1040                                 error = ENOTCONN;
1041                         else
1042                                 error = EDESTADDRREQ;
1043                         SOCKBUF_UNLOCK(&so->so_snd);
1044                         goto out;
1045                 }
1046         }
1047
1048         /*
1049          * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
1050          * problem and need fixing.
1051          */
1052         space = sbspace(&so->so_snd);
1053         if (flags & MSG_OOB)
1054                 space += 1024;
1055         space -= clen;
1056         SOCKBUF_UNLOCK(&so->so_snd);
1057         if (resid > space) {
1058                 error = EMSGSIZE;
1059                 goto out;
1060         }
1061         if (uio == NULL) {
1062                 resid = 0;
1063                 if (flags & MSG_EOR)
1064                         top->m_flags |= M_EOR;
1065         } else {
1066                 /*
1067                  * Copy the data from userland into a mbuf chain.
1068                  * If no data is to be copied in, a single empty mbuf
1069                  * is returned.
1070                  */
1071                 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1072                     (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1073                 if (top == NULL) {
1074                         error = EFAULT; /* only possible error */
1075                         goto out;
1076                 }
1077                 space -= resid - uio->uio_resid;
1078                 resid = uio->uio_resid;
1079         }
1080         KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1081         /*
1082          * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1083          * than with.
1084          */
1085         if (dontroute) {
1086                 SOCK_LOCK(so);
1087                 so->so_options |= SO_DONTROUTE;
1088                 SOCK_UNLOCK(so);
1089         }
1090         /*
1091          * XXX all the SBS_CANTSENDMORE checks previously done could be out
1092          * of date.  We could have recieved a reset packet in an interrupt or
1093          * maybe we slept while doing page faults in uiomove() etc.  We could
1094          * probably recheck again inside the locking protection here, but
1095          * there are probably other places that this also happens.  We must
1096          * rethink this.
1097          */
1098         VNET_SO_ASSERT(so);
1099         error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1100             (flags & MSG_OOB) ? PRUS_OOB :
1101         /*
1102          * If the user set MSG_EOF, the protocol understands this flag and
1103          * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
1104          */
1105             ((flags & MSG_EOF) &&
1106              (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1107              (resid <= 0)) ?
1108                 PRUS_EOF :
1109                 /* If there is more to send set PRUS_MORETOCOME */
1110                 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1111                 top, addr, control, td);
1112         if (dontroute) {
1113                 SOCK_LOCK(so);
1114                 so->so_options &= ~SO_DONTROUTE;
1115                 SOCK_UNLOCK(so);
1116         }
1117         clen = 0;
1118         control = NULL;
1119         top = NULL;
1120 out:
1121         if (top != NULL)
1122                 m_freem(top);
1123         if (control != NULL)
1124                 m_freem(control);
1125         return (error);
1126 }
1127
1128 /*
1129  * Send on a socket.  If send must go all at once and message is larger than
1130  * send buffering, then hard error.  Lock against other senders.  If must go
1131  * all at once and not enough room now, then inform user that this would
1132  * block and do nothing.  Otherwise, if nonblocking, send as much as
1133  * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1134  * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1135  * in mbuf chain must be small enough to send all at once.
1136  *
1137  * Returns nonzero on error, timeout or signal; callers must check for short
1138  * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1139  * on return.
1140  */
1141 int
1142 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1143     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1144 {
1145         long space;
1146         ssize_t resid;
1147         int clen = 0, error, dontroute;
1148         int atomic = sosendallatonce(so) || top;
1149
1150         if (uio != NULL)
1151                 resid = uio->uio_resid;
1152         else
1153                 resid = top->m_pkthdr.len;
1154         /*
1155          * In theory resid should be unsigned.  However, space must be
1156          * signed, as it might be less than 0 if we over-committed, and we
1157          * must use a signed comparison of space and resid.  On the other
1158          * hand, a negative resid causes us to loop sending 0-length
1159          * segments to the protocol.
1160          *
1161          * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1162          * type sockets since that's an error.
1163          */
1164         if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1165                 error = EINVAL;
1166                 goto out;
1167         }
1168
1169         dontroute =
1170             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1171             (so->so_proto->pr_flags & PR_ATOMIC);
1172         if (td != NULL)
1173                 td->td_ru.ru_msgsnd++;
1174         if (control != NULL)
1175                 clen = control->m_len;
1176
1177         error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1178         if (error)
1179                 goto out;
1180
1181 restart:
1182         do {
1183                 SOCKBUF_LOCK(&so->so_snd);
1184                 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1185                         SOCKBUF_UNLOCK(&so->so_snd);
1186                         error = EPIPE;
1187                         goto release;
1188                 }
1189                 if (so->so_error) {
1190                         error = so->so_error;
1191                         so->so_error = 0;
1192                         SOCKBUF_UNLOCK(&so->so_snd);
1193                         goto release;
1194                 }
1195                 if ((so->so_state & SS_ISCONNECTED) == 0) {
1196                         /*
1197                          * `sendto' and `sendmsg' is allowed on a connection-
1198                          * based socket if it supports implied connect.
1199                          * Return ENOTCONN if not connected and no address is
1200                          * supplied.
1201                          */
1202                         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1203                             (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1204                                 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1205                                     !(resid == 0 && clen != 0)) {
1206                                         SOCKBUF_UNLOCK(&so->so_snd);
1207                                         error = ENOTCONN;
1208                                         goto release;
1209                                 }
1210                         } else if (addr == NULL) {
1211                                 SOCKBUF_UNLOCK(&so->so_snd);
1212                                 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1213                                         error = ENOTCONN;
1214                                 else
1215                                         error = EDESTADDRREQ;
1216                                 goto release;
1217                         }
1218                 }
1219                 space = sbspace(&so->so_snd);
1220                 if (flags & MSG_OOB)
1221                         space += 1024;
1222                 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1223                     clen > so->so_snd.sb_hiwat) {
1224                         SOCKBUF_UNLOCK(&so->so_snd);
1225                         error = EMSGSIZE;
1226                         goto release;
1227                 }
1228                 if (space < resid + clen &&
1229                     (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1230                         if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
1231                                 SOCKBUF_UNLOCK(&so->so_snd);
1232                                 error = EWOULDBLOCK;
1233                                 goto release;
1234                         }
1235                         error = sbwait(&so->so_snd);
1236                         SOCKBUF_UNLOCK(&so->so_snd);
1237                         if (error)
1238                                 goto release;
1239                         goto restart;
1240                 }
1241                 SOCKBUF_UNLOCK(&so->so_snd);
1242                 space -= clen;
1243                 do {
1244                         if (uio == NULL) {
1245                                 resid = 0;
1246                                 if (flags & MSG_EOR)
1247                                         top->m_flags |= M_EOR;
1248                         } else {
1249                                 /*
1250                                  * Copy the data from userland into a mbuf
1251                                  * chain.  If no data is to be copied in,
1252                                  * a single empty mbuf is returned.
1253                                  */
1254                                 top = m_uiotombuf(uio, M_WAITOK, space,
1255                                     (atomic ? max_hdr : 0),
1256                                     (atomic ? M_PKTHDR : 0) |
1257                                     ((flags & MSG_EOR) ? M_EOR : 0));
1258                                 if (top == NULL) {
1259                                         error = EFAULT; /* only possible error */
1260                                         goto release;
1261                                 }
1262                                 space -= resid - uio->uio_resid;
1263                                 resid = uio->uio_resid;
1264                         }
1265                         if (dontroute) {
1266                                 SOCK_LOCK(so);
1267                                 so->so_options |= SO_DONTROUTE;
1268                                 SOCK_UNLOCK(so);
1269                         }
1270                         /*
1271                          * XXX all the SBS_CANTSENDMORE checks previously
1272                          * done could be out of date.  We could have recieved
1273                          * a reset packet in an interrupt or maybe we slept
1274                          * while doing page faults in uiomove() etc.  We
1275                          * could probably recheck again inside the locking
1276                          * protection here, but there are probably other
1277                          * places that this also happens.  We must rethink
1278                          * this.
1279                          */
1280                         VNET_SO_ASSERT(so);
1281                         error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1282                             (flags & MSG_OOB) ? PRUS_OOB :
1283                         /*
1284                          * If the user set MSG_EOF, the protocol understands
1285                          * this flag and nothing left to send then use
1286                          * PRU_SEND_EOF instead of PRU_SEND.
1287                          */
1288                             ((flags & MSG_EOF) &&
1289                              (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1290                              (resid <= 0)) ?
1291                                 PRUS_EOF :
1292                         /* If there is more to send set PRUS_MORETOCOME. */
1293                             (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1294                             top, addr, control, td);
1295                         if (dontroute) {
1296                                 SOCK_LOCK(so);
1297                                 so->so_options &= ~SO_DONTROUTE;
1298                                 SOCK_UNLOCK(so);
1299                         }
1300                         clen = 0;
1301                         control = NULL;
1302                         top = NULL;
1303                         if (error)
1304                                 goto release;
1305                 } while (resid && space > 0);
1306         } while (resid);
1307
1308 release:
1309         sbunlock(&so->so_snd);
1310 out:
1311         if (top != NULL)
1312                 m_freem(top);
1313         if (control != NULL)
1314                 m_freem(control);
1315         return (error);
1316 }
1317
1318 int
1319 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1320     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1321 {
1322         int error;
1323
1324         CURVNET_SET(so->so_vnet);
1325         error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1326             control, flags, td);
1327         CURVNET_RESTORE();
1328         return (error);
1329 }
1330
1331 /*
1332  * The part of soreceive() that implements reading non-inline out-of-band
1333  * data from a socket.  For more complete comments, see soreceive(), from
1334  * which this code originated.
1335  *
1336  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1337  * unable to return an mbuf chain to the caller.
1338  */
1339 static int
1340 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1341 {
1342         struct protosw *pr = so->so_proto;
1343         struct mbuf *m;
1344         int error;
1345
1346         KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1347         VNET_SO_ASSERT(so);
1348
1349         m = m_get(M_WAITOK, MT_DATA);
1350         error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1351         if (error)
1352                 goto bad;
1353         do {
1354                 error = uiomove(mtod(m, void *),
1355                     (int) min(uio->uio_resid, m->m_len), uio);
1356                 m = m_free(m);
1357         } while (uio->uio_resid && error == 0 && m);
1358 bad:
1359         if (m != NULL)
1360                 m_freem(m);
1361         return (error);
1362 }
1363
1364 /*
1365  * Following replacement or removal of the first mbuf on the first mbuf chain
1366  * of a socket buffer, push necessary state changes back into the socket
1367  * buffer so that other consumers see the values consistently.  'nextrecord'
1368  * is the callers locally stored value of the original value of
1369  * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1370  * NOTE: 'nextrecord' may be NULL.
1371  */
1372 static __inline void
1373 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1374 {
1375
1376         SOCKBUF_LOCK_ASSERT(sb);
1377         /*
1378          * First, update for the new value of nextrecord.  If necessary, make
1379          * it the first record.
1380          */
1381         if (sb->sb_mb != NULL)
1382                 sb->sb_mb->m_nextpkt = nextrecord;
1383         else
1384                 sb->sb_mb = nextrecord;
1385
1386         /*
1387          * Now update any dependent socket buffer fields to reflect the new
1388          * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1389          * addition of a second clause that takes care of the case where
1390          * sb_mb has been updated, but remains the last record.
1391          */
1392         if (sb->sb_mb == NULL) {
1393                 sb->sb_mbtail = NULL;
1394                 sb->sb_lastrecord = NULL;
1395         } else if (sb->sb_mb->m_nextpkt == NULL)
1396                 sb->sb_lastrecord = sb->sb_mb;
1397 }
1398
1399 /*
1400  * Implement receive operations on a socket.  We depend on the way that
1401  * records are added to the sockbuf by sbappend.  In particular, each record
1402  * (mbufs linked through m_next) must begin with an address if the protocol
1403  * so specifies, followed by an optional mbuf or mbufs containing ancillary
1404  * data, and then zero or more mbufs of data.  In order to allow parallelism
1405  * between network receive and copying to user space, as well as avoid
1406  * sleeping with a mutex held, we release the socket buffer mutex during the
1407  * user space copy.  Although the sockbuf is locked, new data may still be
1408  * appended, and thus we must maintain consistency of the sockbuf during that
1409  * time.
1410  *
1411  * The caller may receive the data as a single mbuf chain by supplying an
1412  * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1413  * the count in uio_resid.
1414  */
1415 int
1416 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1417     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1418 {
1419         struct mbuf *m, **mp;
1420         int flags, error, offset;
1421         ssize_t len;
1422         struct protosw *pr = so->so_proto;
1423         struct mbuf *nextrecord;
1424         int moff, type = 0;
1425         ssize_t orig_resid = uio->uio_resid;
1426
1427         mp = mp0;
1428         if (psa != NULL)
1429                 *psa = NULL;
1430         if (controlp != NULL)
1431                 *controlp = NULL;
1432         if (flagsp != NULL)
1433                 flags = *flagsp &~ MSG_EOR;
1434         else
1435                 flags = 0;
1436         if (flags & MSG_OOB)
1437                 return (soreceive_rcvoob(so, uio, flags));
1438         if (mp != NULL)
1439                 *mp = NULL;
1440         if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1441             && uio->uio_resid) {
1442                 VNET_SO_ASSERT(so);
1443                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1444         }
1445
1446         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1447         if (error)
1448                 return (error);
1449
1450 restart:
1451         SOCKBUF_LOCK(&so->so_rcv);
1452         m = so->so_rcv.sb_mb;
1453         /*
1454          * If we have less data than requested, block awaiting more (subject
1455          * to any timeout) if:
1456          *   1. the current count is less than the low water mark, or
1457          *   2. MSG_DONTWAIT is not set
1458          */
1459         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1460             so->so_rcv.sb_cc < uio->uio_resid) &&
1461             so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
1462             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1463                 KASSERT(m != NULL || !so->so_rcv.sb_cc,
1464                     ("receive: m == %p so->so_rcv.sb_cc == %u",
1465                     m, so->so_rcv.sb_cc));
1466                 if (so->so_error) {
1467                         if (m != NULL)
1468                                 goto dontblock;
1469                         error = so->so_error;
1470                         if ((flags & MSG_PEEK) == 0)
1471                                 so->so_error = 0;
1472                         SOCKBUF_UNLOCK(&so->so_rcv);
1473                         goto release;
1474                 }
1475                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1476                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1477                         if (m == NULL) {
1478                                 SOCKBUF_UNLOCK(&so->so_rcv);
1479                                 goto release;
1480                         } else
1481                                 goto dontblock;
1482                 }
1483                 for (; m != NULL; m = m->m_next)
1484                         if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1485                                 m = so->so_rcv.sb_mb;
1486                                 goto dontblock;
1487                         }
1488                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1489                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1490                         SOCKBUF_UNLOCK(&so->so_rcv);
1491                         error = ENOTCONN;
1492                         goto release;
1493                 }
1494                 if (uio->uio_resid == 0) {
1495                         SOCKBUF_UNLOCK(&so->so_rcv);
1496                         goto release;
1497                 }
1498                 if ((so->so_state & SS_NBIO) ||
1499                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1500                         SOCKBUF_UNLOCK(&so->so_rcv);
1501                         error = EWOULDBLOCK;
1502                         goto release;
1503                 }
1504                 SBLASTRECORDCHK(&so->so_rcv);
1505                 SBLASTMBUFCHK(&so->so_rcv);
1506                 error = sbwait(&so->so_rcv);
1507                 SOCKBUF_UNLOCK(&so->so_rcv);
1508                 if (error)
1509                         goto release;
1510                 goto restart;
1511         }
1512 dontblock:
1513         /*
1514          * From this point onward, we maintain 'nextrecord' as a cache of the
1515          * pointer to the next record in the socket buffer.  We must keep the
1516          * various socket buffer pointers and local stack versions of the
1517          * pointers in sync, pushing out modifications before dropping the
1518          * socket buffer mutex, and re-reading them when picking it up.
1519          *
1520          * Otherwise, we will race with the network stack appending new data
1521          * or records onto the socket buffer by using inconsistent/stale
1522          * versions of the field, possibly resulting in socket buffer
1523          * corruption.
1524          *
1525          * By holding the high-level sblock(), we prevent simultaneous
1526          * readers from pulling off the front of the socket buffer.
1527          */
1528         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1529         if (uio->uio_td)
1530                 uio->uio_td->td_ru.ru_msgrcv++;
1531         KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1532         SBLASTRECORDCHK(&so->so_rcv);
1533         SBLASTMBUFCHK(&so->so_rcv);
1534         nextrecord = m->m_nextpkt;
1535         if (pr->pr_flags & PR_ADDR) {
1536                 KASSERT(m->m_type == MT_SONAME,
1537                     ("m->m_type == %d", m->m_type));
1538                 orig_resid = 0;
1539                 if (psa != NULL)
1540                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1541                             M_NOWAIT);
1542                 if (flags & MSG_PEEK) {
1543                         m = m->m_next;
1544                 } else {
1545                         sbfree(&so->so_rcv, m);
1546                         so->so_rcv.sb_mb = m_free(m);
1547                         m = so->so_rcv.sb_mb;
1548                         sockbuf_pushsync(&so->so_rcv, nextrecord);
1549                 }
1550         }
1551
1552         /*
1553          * Process one or more MT_CONTROL mbufs present before any data mbufs
1554          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1555          * just copy the data; if !MSG_PEEK, we call into the protocol to
1556          * perform externalization (or freeing if controlp == NULL).
1557          */
1558         if (m != NULL && m->m_type == MT_CONTROL) {
1559                 struct mbuf *cm = NULL, *cmn;
1560                 struct mbuf **cme = &cm;
1561
1562                 do {
1563                         if (flags & MSG_PEEK) {
1564                                 if (controlp != NULL) {
1565                                         *controlp = m_copy(m, 0, m->m_len);
1566                                         controlp = &(*controlp)->m_next;
1567                                 }
1568                                 m = m->m_next;
1569                         } else {
1570                                 sbfree(&so->so_rcv, m);
1571                                 so->so_rcv.sb_mb = m->m_next;
1572                                 m->m_next = NULL;
1573                                 *cme = m;
1574                                 cme = &(*cme)->m_next;
1575                                 m = so->so_rcv.sb_mb;
1576                         }
1577                 } while (m != NULL && m->m_type == MT_CONTROL);
1578                 if ((flags & MSG_PEEK) == 0)
1579                         sockbuf_pushsync(&so->so_rcv, nextrecord);
1580                 while (cm != NULL) {
1581                         cmn = cm->m_next;
1582                         cm->m_next = NULL;
1583                         if (pr->pr_domain->dom_externalize != NULL) {
1584                                 SOCKBUF_UNLOCK(&so->so_rcv);
1585                                 VNET_SO_ASSERT(so);
1586                                 error = (*pr->pr_domain->dom_externalize)
1587                                     (cm, controlp, flags);
1588                                 SOCKBUF_LOCK(&so->so_rcv);
1589                         } else if (controlp != NULL)
1590                                 *controlp = cm;
1591                         else
1592                                 m_freem(cm);
1593                         if (controlp != NULL) {
1594                                 orig_resid = 0;
1595                                 while (*controlp != NULL)
1596                                         controlp = &(*controlp)->m_next;
1597                         }
1598                         cm = cmn;
1599                 }
1600                 if (m != NULL)
1601                         nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1602                 else
1603                         nextrecord = so->so_rcv.sb_mb;
1604                 orig_resid = 0;
1605         }
1606         if (m != NULL) {
1607                 if ((flags & MSG_PEEK) == 0) {
1608                         KASSERT(m->m_nextpkt == nextrecord,
1609                             ("soreceive: post-control, nextrecord !sync"));
1610                         if (nextrecord == NULL) {
1611                                 KASSERT(so->so_rcv.sb_mb == m,
1612                                     ("soreceive: post-control, sb_mb!=m"));
1613                                 KASSERT(so->so_rcv.sb_lastrecord == m,
1614                                     ("soreceive: post-control, lastrecord!=m"));
1615                         }
1616                 }
1617                 type = m->m_type;
1618                 if (type == MT_OOBDATA)
1619                         flags |= MSG_OOB;
1620         } else {
1621                 if ((flags & MSG_PEEK) == 0) {
1622                         KASSERT(so->so_rcv.sb_mb == nextrecord,
1623                             ("soreceive: sb_mb != nextrecord"));
1624                         if (so->so_rcv.sb_mb == NULL) {
1625                                 KASSERT(so->so_rcv.sb_lastrecord == NULL,
1626                                     ("soreceive: sb_lastercord != NULL"));
1627                         }
1628                 }
1629         }
1630         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1631         SBLASTRECORDCHK(&so->so_rcv);
1632         SBLASTMBUFCHK(&so->so_rcv);
1633
1634         /*
1635          * Now continue to read any data mbufs off of the head of the socket
1636          * buffer until the read request is satisfied.  Note that 'type' is
1637          * used to store the type of any mbuf reads that have happened so far
1638          * such that soreceive() can stop reading if the type changes, which
1639          * causes soreceive() to return only one of regular data and inline
1640          * out-of-band data in a single socket receive operation.
1641          */
1642         moff = 0;
1643         offset = 0;
1644         while (m != NULL && uio->uio_resid > 0 && error == 0) {
1645                 /*
1646                  * If the type of mbuf has changed since the last mbuf
1647                  * examined ('type'), end the receive operation.
1648                  */
1649                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1650                 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
1651                         if (type != m->m_type)
1652                                 break;
1653                 } else if (type == MT_OOBDATA)
1654                         break;
1655                 else
1656                     KASSERT(m->m_type == MT_DATA,
1657                         ("m->m_type == %d", m->m_type));
1658                 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1659                 len = uio->uio_resid;
1660                 if (so->so_oobmark && len > so->so_oobmark - offset)
1661                         len = so->so_oobmark - offset;
1662                 if (len > m->m_len - moff)
1663                         len = m->m_len - moff;
1664                 /*
1665                  * If mp is set, just pass back the mbufs.  Otherwise copy
1666                  * them out via the uio, then free.  Sockbuf must be
1667                  * consistent here (points to current mbuf, it points to next
1668                  * record) when we drop priority; we must note any additions
1669                  * to the sockbuf when we block interrupts again.
1670                  */
1671                 if (mp == NULL) {
1672                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1673                         SBLASTRECORDCHK(&so->so_rcv);
1674                         SBLASTMBUFCHK(&so->so_rcv);
1675                         SOCKBUF_UNLOCK(&so->so_rcv);
1676                         error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1677                         SOCKBUF_LOCK(&so->so_rcv);
1678                         if (error) {
1679                                 /*
1680                                  * The MT_SONAME mbuf has already been removed
1681                                  * from the record, so it is necessary to
1682                                  * remove the data mbufs, if any, to preserve
1683                                  * the invariant in the case of PR_ADDR that
1684                                  * requires MT_SONAME mbufs at the head of
1685                                  * each record.
1686                                  */
1687                                 if (m && pr->pr_flags & PR_ATOMIC &&
1688                                     ((flags & MSG_PEEK) == 0))
1689                                         (void)sbdroprecord_locked(&so->so_rcv);
1690                                 SOCKBUF_UNLOCK(&so->so_rcv);
1691                                 goto release;
1692                         }
1693                 } else
1694                         uio->uio_resid -= len;
1695                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1696                 if (len == m->m_len - moff) {
1697                         if (m->m_flags & M_EOR)
1698                                 flags |= MSG_EOR;
1699                         if (flags & MSG_PEEK) {
1700                                 m = m->m_next;
1701                                 moff = 0;
1702                         } else {
1703                                 nextrecord = m->m_nextpkt;
1704                                 sbfree(&so->so_rcv, m);
1705                                 if (mp != NULL) {
1706                                         m->m_nextpkt = NULL;
1707                                         *mp = m;
1708                                         mp = &m->m_next;
1709                                         so->so_rcv.sb_mb = m = m->m_next;
1710                                         *mp = NULL;
1711                                 } else {
1712                                         so->so_rcv.sb_mb = m_free(m);
1713                                         m = so->so_rcv.sb_mb;
1714                                 }
1715                                 sockbuf_pushsync(&so->so_rcv, nextrecord);
1716                                 SBLASTRECORDCHK(&so->so_rcv);
1717                                 SBLASTMBUFCHK(&so->so_rcv);
1718                         }
1719                 } else {
1720                         if (flags & MSG_PEEK)
1721                                 moff += len;
1722                         else {
1723                                 if (mp != NULL) {
1724                                         if (flags & MSG_DONTWAIT) {
1725                                                 *mp = m_copym(m, 0, len,
1726                                                     M_NOWAIT);
1727                                                 if (*mp == NULL) {
1728                                                         /*
1729                                                          * m_copym() couldn't
1730                                                          * allocate an mbuf.
1731                                                          * Adjust uio_resid back
1732                                                          * (it was adjusted
1733                                                          * down by len bytes,
1734                                                          * which we didn't end
1735                                                          * up "copying" over).
1736                                                          */
1737                                                         uio->uio_resid += len;
1738                                                         break;
1739                                                 }
1740                                         } else {
1741                                                 SOCKBUF_UNLOCK(&so->so_rcv);
1742                                                 *mp = m_copym(m, 0, len,
1743                                                     M_WAITOK);
1744                                                 SOCKBUF_LOCK(&so->so_rcv);
1745                                         }
1746                                 }
1747                                 m->m_data += len;
1748                                 m->m_len -= len;
1749                                 so->so_rcv.sb_cc -= len;
1750                         }
1751                 }
1752                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1753                 if (so->so_oobmark) {
1754                         if ((flags & MSG_PEEK) == 0) {
1755                                 so->so_oobmark -= len;
1756                                 if (so->so_oobmark == 0) {
1757                                         so->so_rcv.sb_state |= SBS_RCVATMARK;
1758                                         break;
1759                                 }
1760                         } else {
1761                                 offset += len;
1762                                 if (offset == so->so_oobmark)
1763                                         break;
1764                         }
1765                 }
1766                 if (flags & MSG_EOR)
1767                         break;
1768                 /*
1769                  * If the MSG_WAITALL flag is set (for non-atomic socket), we
1770                  * must not quit until "uio->uio_resid == 0" or an error
1771                  * termination.  If a signal/timeout occurs, return with a
1772                  * short count but without error.  Keep sockbuf locked
1773                  * against other readers.
1774                  */
1775                 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1776                     !sosendallatonce(so) && nextrecord == NULL) {
1777                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1778                         if (so->so_error ||
1779                             so->so_rcv.sb_state & SBS_CANTRCVMORE)
1780                                 break;
1781                         /*
1782                          * Notify the protocol that some data has been
1783                          * drained before blocking.
1784                          */
1785                         if (pr->pr_flags & PR_WANTRCVD) {
1786                                 SOCKBUF_UNLOCK(&so->so_rcv);
1787                                 VNET_SO_ASSERT(so);
1788                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1789                                 SOCKBUF_LOCK(&so->so_rcv);
1790                         }
1791                         SBLASTRECORDCHK(&so->so_rcv);
1792                         SBLASTMBUFCHK(&so->so_rcv);
1793                         /*
1794                          * We could receive some data while we were notifying
1795                          * the protocol. Skip blocking in this case.
1796                          */
1797                         if (so->so_rcv.sb_mb == NULL) {
1798                                 error = sbwait(&so->so_rcv);
1799                                 if (error) {
1800                                         SOCKBUF_UNLOCK(&so->so_rcv);
1801                                         goto release;
1802                                 }
1803                         }
1804                         m = so->so_rcv.sb_mb;
1805                         if (m != NULL)
1806                                 nextrecord = m->m_nextpkt;
1807                 }
1808         }
1809
1810         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1811         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1812                 flags |= MSG_TRUNC;
1813                 if ((flags & MSG_PEEK) == 0)
1814                         (void) sbdroprecord_locked(&so->so_rcv);
1815         }
1816         if ((flags & MSG_PEEK) == 0) {
1817                 if (m == NULL) {
1818                         /*
1819                          * First part is an inline SB_EMPTY_FIXUP().  Second
1820                          * part makes sure sb_lastrecord is up-to-date if
1821                          * there is still data in the socket buffer.
1822                          */
1823                         so->so_rcv.sb_mb = nextrecord;
1824                         if (so->so_rcv.sb_mb == NULL) {
1825                                 so->so_rcv.sb_mbtail = NULL;
1826                                 so->so_rcv.sb_lastrecord = NULL;
1827                         } else if (nextrecord->m_nextpkt == NULL)
1828                                 so->so_rcv.sb_lastrecord = nextrecord;
1829                 }
1830                 SBLASTRECORDCHK(&so->so_rcv);
1831                 SBLASTMBUFCHK(&so->so_rcv);
1832                 /*
1833                  * If soreceive() is being done from the socket callback,
1834                  * then don't need to generate ACK to peer to update window,
1835                  * since ACK will be generated on return to TCP.
1836                  */
1837                 if (!(flags & MSG_SOCALLBCK) &&
1838                     (pr->pr_flags & PR_WANTRCVD)) {
1839                         SOCKBUF_UNLOCK(&so->so_rcv);
1840                         VNET_SO_ASSERT(so);
1841                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1842                         SOCKBUF_LOCK(&so->so_rcv);
1843                 }
1844         }
1845         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1846         if (orig_resid == uio->uio_resid && orig_resid &&
1847             (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1848                 SOCKBUF_UNLOCK(&so->so_rcv);
1849                 goto restart;
1850         }
1851         SOCKBUF_UNLOCK(&so->so_rcv);
1852
1853         if (flagsp != NULL)
1854                 *flagsp |= flags;
1855 release:
1856         sbunlock(&so->so_rcv);
1857         return (error);
1858 }
1859
1860 /*
1861  * Optimized version of soreceive() for stream (TCP) sockets.
1862  * XXXAO: (MSG_WAITALL | MSG_PEEK) isn't properly handled.
1863  */
1864 int
1865 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
1866     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1867 {
1868         int len = 0, error = 0, flags, oresid;
1869         struct sockbuf *sb;
1870         struct mbuf *m, *n = NULL;
1871
1872         /* We only do stream sockets. */
1873         if (so->so_type != SOCK_STREAM)
1874                 return (EINVAL);
1875         if (psa != NULL)
1876                 *psa = NULL;
1877         if (controlp != NULL)
1878                 return (EINVAL);
1879         if (flagsp != NULL)
1880                 flags = *flagsp &~ MSG_EOR;
1881         else
1882                 flags = 0;
1883         if (flags & MSG_OOB)
1884                 return (soreceive_rcvoob(so, uio, flags));
1885         if (mp0 != NULL)
1886                 *mp0 = NULL;
1887
1888         sb = &so->so_rcv;
1889
1890         /* Prevent other readers from entering the socket. */
1891         error = sblock(sb, SBLOCKWAIT(flags));
1892         if (error)
1893                 goto out;
1894         SOCKBUF_LOCK(sb);
1895
1896         /* Easy one, no space to copyout anything. */
1897         if (uio->uio_resid == 0) {
1898                 error = EINVAL;
1899                 goto out;
1900         }
1901         oresid = uio->uio_resid;
1902
1903         /* We will never ever get anything unless we are or were connected. */
1904         if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1905                 error = ENOTCONN;
1906                 goto out;
1907         }
1908
1909 restart:
1910         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1911
1912         /* Abort if socket has reported problems. */
1913         if (so->so_error) {
1914                 if (sb->sb_cc > 0)
1915                         goto deliver;
1916                 if (oresid > uio->uio_resid)
1917                         goto out;
1918                 error = so->so_error;
1919                 if (!(flags & MSG_PEEK))
1920                         so->so_error = 0;
1921                 goto out;
1922         }
1923
1924         /* Door is closed.  Deliver what is left, if any. */
1925         if (sb->sb_state & SBS_CANTRCVMORE) {
1926                 if (sb->sb_cc > 0)
1927                         goto deliver;
1928                 else
1929                         goto out;
1930         }
1931
1932         /* Socket buffer is empty and we shall not block. */
1933         if (sb->sb_cc == 0 &&
1934             ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
1935                 error = EAGAIN;
1936                 goto out;
1937         }
1938
1939         /* Socket buffer got some data that we shall deliver now. */
1940         if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
1941             ((sb->sb_flags & SS_NBIO) ||
1942              (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
1943              sb->sb_cc >= sb->sb_lowat ||
1944              sb->sb_cc >= uio->uio_resid ||
1945              sb->sb_cc >= sb->sb_hiwat) ) {
1946                 goto deliver;
1947         }
1948
1949         /* On MSG_WAITALL we must wait until all data or error arrives. */
1950         if ((flags & MSG_WAITALL) &&
1951             (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat))
1952                 goto deliver;
1953
1954         /*
1955          * Wait and block until (more) data comes in.
1956          * NB: Drops the sockbuf lock during wait.
1957          */
1958         error = sbwait(sb);
1959         if (error)
1960                 goto out;
1961         goto restart;
1962
1963 deliver:
1964         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1965         KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
1966         KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
1967
1968         /* Statistics. */
1969         if (uio->uio_td)
1970                 uio->uio_td->td_ru.ru_msgrcv++;
1971
1972         /* Fill uio until full or current end of socket buffer is reached. */
1973         len = min(uio->uio_resid, sb->sb_cc);
1974         if (mp0 != NULL) {
1975                 /* Dequeue as many mbufs as possible. */
1976                 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
1977                         if (*mp0 == NULL)
1978                                 *mp0 = sb->sb_mb;
1979                         else
1980                                 m_cat(*mp0, sb->sb_mb);
1981                         for (m = sb->sb_mb;
1982                              m != NULL && m->m_len <= len;
1983                              m = m->m_next) {
1984                                 len -= m->m_len;
1985                                 uio->uio_resid -= m->m_len;
1986                                 sbfree(sb, m);
1987                                 n = m;
1988                         }
1989                         n->m_next = NULL;
1990                         sb->sb_mb = m;
1991                         sb->sb_lastrecord = sb->sb_mb;
1992                         if (sb->sb_mb == NULL)
1993                                 SB_EMPTY_FIXUP(sb);
1994                 }
1995                 /* Copy the remainder. */
1996                 if (len > 0) {
1997                         KASSERT(sb->sb_mb != NULL,
1998                             ("%s: len > 0 && sb->sb_mb empty", __func__));
1999
2000                         m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
2001                         if (m == NULL)
2002                                 len = 0;        /* Don't flush data from sockbuf. */
2003                         else
2004                                 uio->uio_resid -= len;
2005                         if (*mp0 != NULL)
2006                                 m_cat(*mp0, m);
2007                         else
2008                                 *mp0 = m;
2009                         if (*mp0 == NULL) {
2010                                 error = ENOBUFS;
2011                                 goto out;
2012                         }
2013                 }
2014         } else {
2015                 /* NB: Must unlock socket buffer as uiomove may sleep. */
2016                 SOCKBUF_UNLOCK(sb);
2017                 error = m_mbuftouio(uio, sb->sb_mb, len);
2018                 SOCKBUF_LOCK(sb);
2019                 if (error)
2020                         goto out;
2021         }
2022         SBLASTRECORDCHK(sb);
2023         SBLASTMBUFCHK(sb);
2024
2025         /*
2026          * Remove the delivered data from the socket buffer unless we
2027          * were only peeking.
2028          */
2029         if (!(flags & MSG_PEEK)) {
2030                 if (len > 0)
2031                         sbdrop_locked(sb, len);
2032
2033                 /* Notify protocol that we drained some data. */
2034                 if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2035                     (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2036                      !(flags & MSG_SOCALLBCK))) {
2037                         SOCKBUF_UNLOCK(sb);
2038                         VNET_SO_ASSERT(so);
2039                         (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
2040                         SOCKBUF_LOCK(sb);
2041                 }
2042         }
2043
2044         /*
2045          * For MSG_WAITALL we may have to loop again and wait for
2046          * more data to come in.
2047          */
2048         if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2049                 goto restart;
2050 out:
2051         SOCKBUF_LOCK_ASSERT(sb);
2052         SBLASTRECORDCHK(sb);
2053         SBLASTMBUFCHK(sb);
2054         SOCKBUF_UNLOCK(sb);
2055         sbunlock(sb);
2056         return (error);
2057 }
2058
2059 /*
2060  * Optimized version of soreceive() for simple datagram cases from userspace.
2061  * Unlike in the stream case, we're able to drop a datagram if copyout()
2062  * fails, and because we handle datagrams atomically, we don't need to use a
2063  * sleep lock to prevent I/O interlacing.
2064  */
2065 int
2066 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2067     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2068 {
2069         struct mbuf *m, *m2;
2070         int flags, error;
2071         ssize_t len;
2072         struct protosw *pr = so->so_proto;
2073         struct mbuf *nextrecord;
2074
2075         if (psa != NULL)
2076                 *psa = NULL;
2077         if (controlp != NULL)
2078                 *controlp = NULL;
2079         if (flagsp != NULL)
2080                 flags = *flagsp &~ MSG_EOR;
2081         else
2082                 flags = 0;
2083
2084         /*
2085          * For any complicated cases, fall back to the full
2086          * soreceive_generic().
2087          */
2088         if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
2089                 return (soreceive_generic(so, psa, uio, mp0, controlp,
2090                     flagsp));
2091
2092         /*
2093          * Enforce restrictions on use.
2094          */
2095         KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2096             ("soreceive_dgram: wantrcvd"));
2097         KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2098         KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2099             ("soreceive_dgram: SBS_RCVATMARK"));
2100         KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
2101             ("soreceive_dgram: P_CONNREQUIRED"));
2102
2103         /*
2104          * Loop blocking while waiting for a datagram.
2105          */
2106         SOCKBUF_LOCK(&so->so_rcv);
2107         while ((m = so->so_rcv.sb_mb) == NULL) {
2108                 KASSERT(so->so_rcv.sb_cc == 0,
2109                     ("soreceive_dgram: sb_mb NULL but sb_cc %u",
2110                     so->so_rcv.sb_cc));
2111                 if (so->so_error) {
2112                         error = so->so_error;
2113                         so->so_error = 0;
2114                         SOCKBUF_UNLOCK(&so->so_rcv);
2115                         return (error);
2116                 }
2117                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2118                     uio->uio_resid == 0) {
2119                         SOCKBUF_UNLOCK(&so->so_rcv);
2120                         return (0);
2121                 }
2122                 if ((so->so_state & SS_NBIO) ||
2123                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2124                         SOCKBUF_UNLOCK(&so->so_rcv);
2125                         return (EWOULDBLOCK);
2126                 }
2127                 SBLASTRECORDCHK(&so->so_rcv);
2128                 SBLASTMBUFCHK(&so->so_rcv);
2129                 error = sbwait(&so->so_rcv);
2130                 if (error) {
2131                         SOCKBUF_UNLOCK(&so->so_rcv);
2132                         return (error);
2133                 }
2134         }
2135         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2136
2137         if (uio->uio_td)
2138                 uio->uio_td->td_ru.ru_msgrcv++;
2139         SBLASTRECORDCHK(&so->so_rcv);
2140         SBLASTMBUFCHK(&so->so_rcv);
2141         nextrecord = m->m_nextpkt;
2142         if (nextrecord == NULL) {
2143                 KASSERT(so->so_rcv.sb_lastrecord == m,
2144                     ("soreceive_dgram: lastrecord != m"));
2145         }
2146
2147         KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2148             ("soreceive_dgram: m_nextpkt != nextrecord"));
2149
2150         /*
2151          * Pull 'm' and its chain off the front of the packet queue.
2152          */
2153         so->so_rcv.sb_mb = NULL;
2154         sockbuf_pushsync(&so->so_rcv, nextrecord);
2155
2156         /*
2157          * Walk 'm's chain and free that many bytes from the socket buffer.
2158          */
2159         for (m2 = m; m2 != NULL; m2 = m2->m_next)
2160                 sbfree(&so->so_rcv, m2);
2161
2162         /*
2163          * Do a few last checks before we let go of the lock.
2164          */
2165         SBLASTRECORDCHK(&so->so_rcv);
2166         SBLASTMBUFCHK(&so->so_rcv);
2167         SOCKBUF_UNLOCK(&so->so_rcv);
2168
2169         if (pr->pr_flags & PR_ADDR) {
2170                 KASSERT(m->m_type == MT_SONAME,
2171                     ("m->m_type == %d", m->m_type));
2172                 if (psa != NULL)
2173                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
2174                             M_NOWAIT);
2175                 m = m_free(m);
2176         }
2177         if (m == NULL) {
2178                 /* XXXRW: Can this happen? */
2179                 return (0);
2180         }
2181
2182         /*
2183          * Packet to copyout() is now in 'm' and it is disconnected from the
2184          * queue.
2185          *
2186          * Process one or more MT_CONTROL mbufs present before any data mbufs
2187          * in the first mbuf chain on the socket buffer.  We call into the
2188          * protocol to perform externalization (or freeing if controlp ==
2189          * NULL).
2190          */
2191         if (m->m_type == MT_CONTROL) {
2192                 struct mbuf *cm = NULL, *cmn;
2193                 struct mbuf **cme = &cm;
2194
2195                 do {
2196                         m2 = m->m_next;
2197                         m->m_next = NULL;
2198                         *cme = m;
2199                         cme = &(*cme)->m_next;
2200                         m = m2;
2201                 } while (m != NULL && m->m_type == MT_CONTROL);
2202                 while (cm != NULL) {
2203                         cmn = cm->m_next;
2204                         cm->m_next = NULL;
2205                         if (pr->pr_domain->dom_externalize != NULL) {
2206                                 error = (*pr->pr_domain->dom_externalize)
2207                                     (cm, controlp, flags);
2208                         } else if (controlp != NULL)
2209                                 *controlp = cm;
2210                         else
2211                                 m_freem(cm);
2212                         if (controlp != NULL) {
2213                                 while (*controlp != NULL)
2214                                         controlp = &(*controlp)->m_next;
2215                         }
2216                         cm = cmn;
2217                 }
2218         }
2219         KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
2220
2221         while (m != NULL && uio->uio_resid > 0) {
2222                 len = uio->uio_resid;
2223                 if (len > m->m_len)
2224                         len = m->m_len;
2225                 error = uiomove(mtod(m, char *), (int)len, uio);
2226                 if (error) {
2227                         m_freem(m);
2228                         return (error);
2229                 }
2230                 if (len == m->m_len)
2231                         m = m_free(m);
2232                 else {
2233                         m->m_data += len;
2234                         m->m_len -= len;
2235                 }
2236         }
2237         if (m != NULL)
2238                 flags |= MSG_TRUNC;
2239         m_freem(m);
2240         if (flagsp != NULL)
2241                 *flagsp |= flags;
2242         return (0);
2243 }
2244
2245 int
2246 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2247     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2248 {
2249         int error;
2250
2251         CURVNET_SET(so->so_vnet);
2252         error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
2253             controlp, flagsp));
2254         CURVNET_RESTORE();
2255         return (error);
2256 }
2257
2258 int
2259 soshutdown(struct socket *so, int how)
2260 {
2261         struct protosw *pr = so->so_proto;
2262         int error;
2263
2264         if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2265                 return (EINVAL);
2266
2267         CURVNET_SET(so->so_vnet);
2268         if (pr->pr_usrreqs->pru_flush != NULL)
2269                 (*pr->pr_usrreqs->pru_flush)(so, how);
2270         if (how != SHUT_WR)
2271                 sorflush(so);
2272         if (how != SHUT_RD) {
2273                 error = (*pr->pr_usrreqs->pru_shutdown)(so);
2274                 wakeup(&so->so_timeo);
2275                 CURVNET_RESTORE();
2276                 return (error);
2277         }
2278         wakeup(&so->so_timeo);
2279         CURVNET_RESTORE();
2280         return (0);
2281 }
2282
2283 void
2284 sorflush(struct socket *so)
2285 {
2286         struct sockbuf *sb = &so->so_rcv;
2287         struct protosw *pr = so->so_proto;
2288         struct sockbuf asb;
2289
2290         VNET_SO_ASSERT(so);
2291
2292         /*
2293          * In order to avoid calling dom_dispose with the socket buffer mutex
2294          * held, and in order to generally avoid holding the lock for a long
2295          * time, we make a copy of the socket buffer and clear the original
2296          * (except locks, state).  The new socket buffer copy won't have
2297          * initialized locks so we can only call routines that won't use or
2298          * assert those locks.
2299          *
2300          * Dislodge threads currently blocked in receive and wait to acquire
2301          * a lock against other simultaneous readers before clearing the
2302          * socket buffer.  Don't let our acquire be interrupted by a signal
2303          * despite any existing socket disposition on interruptable waiting.
2304          */
2305         socantrcvmore(so);
2306         (void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2307
2308         /*
2309          * Invalidate/clear most of the sockbuf structure, but leave selinfo
2310          * and mutex data unchanged.
2311          */
2312         SOCKBUF_LOCK(sb);
2313         bzero(&asb, offsetof(struct sockbuf, sb_startzero));
2314         bcopy(&sb->sb_startzero, &asb.sb_startzero,
2315             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2316         bzero(&sb->sb_startzero,
2317             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2318         SOCKBUF_UNLOCK(sb);
2319         sbunlock(sb);
2320
2321         /*
2322          * Dispose of special rights and flush the socket buffer.  Don't call
2323          * any unsafe routines (that rely on locks being initialized) on asb.
2324          */
2325         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2326                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
2327         sbrelease_internal(&asb, so);
2328 }
2329
2330 /*
2331  * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2332  * additional variant to handle the case where the option value needs to be
2333  * some kind of integer, but not a specific size.  In addition to their use
2334  * here, these functions are also called by the protocol-level pr_ctloutput()
2335  * routines.
2336  */
2337 int
2338 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2339 {
2340         size_t  valsize;
2341
2342         /*
2343          * If the user gives us more than we wanted, we ignore it, but if we
2344          * don't get the minimum length the caller wants, we return EINVAL.
2345          * On success, sopt->sopt_valsize is set to however much we actually
2346          * retrieved.
2347          */
2348         if ((valsize = sopt->sopt_valsize) < minlen)
2349                 return EINVAL;
2350         if (valsize > len)
2351                 sopt->sopt_valsize = valsize = len;
2352
2353         if (sopt->sopt_td != NULL)
2354                 return (copyin(sopt->sopt_val, buf, valsize));
2355
2356         bcopy(sopt->sopt_val, buf, valsize);
2357         return (0);
2358 }
2359
2360 /*
2361  * Kernel version of setsockopt(2).
2362  *
2363  * XXX: optlen is size_t, not socklen_t
2364  */
2365 int
2366 so_setsockopt(struct socket *so, int level, int optname, void *optval,
2367     size_t optlen)
2368 {
2369         struct sockopt sopt;
2370
2371         sopt.sopt_level = level;
2372         sopt.sopt_name = optname;
2373         sopt.sopt_dir = SOPT_SET;
2374         sopt.sopt_val = optval;
2375         sopt.sopt_valsize = optlen;
2376         sopt.sopt_td = NULL;
2377         return (sosetopt(so, &sopt));
2378 }
2379
2380 int
2381 sosetopt(struct socket *so, struct sockopt *sopt)
2382 {
2383         int     error, optval;
2384         struct  linger l;
2385         struct  timeval tv;
2386         sbintime_t val;
2387         uint32_t val32;
2388 #ifdef MAC
2389         struct mac extmac;
2390 #endif
2391
2392         CURVNET_SET(so->so_vnet);
2393         error = 0;
2394         if (sopt->sopt_level != SOL_SOCKET) {
2395                 if (so->so_proto->pr_ctloutput != NULL) {
2396                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
2397                         CURVNET_RESTORE();
2398                         return (error);
2399                 }
2400                 error = ENOPROTOOPT;
2401         } else {
2402                 switch (sopt->sopt_name) {
2403                 case SO_ACCEPTFILTER:
2404                         error = do_setopt_accept_filter(so, sopt);
2405                         if (error)
2406                                 goto bad;
2407                         break;
2408
2409                 case SO_LINGER:
2410                         error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2411                         if (error)
2412                                 goto bad;
2413
2414                         SOCK_LOCK(so);
2415                         so->so_linger = l.l_linger;
2416                         if (l.l_onoff)
2417                                 so->so_options |= SO_LINGER;
2418                         else
2419                                 so->so_options &= ~SO_LINGER;
2420                         SOCK_UNLOCK(so);
2421                         break;
2422
2423                 case SO_DEBUG:
2424                 case SO_KEEPALIVE:
2425                 case SO_DONTROUTE:
2426                 case SO_USELOOPBACK:
2427                 case SO_BROADCAST:
2428                 case SO_REUSEADDR:
2429                 case SO_REUSEPORT:
2430                 case SO_OOBINLINE:
2431                 case SO_TIMESTAMP:
2432                 case SO_BINTIME:
2433                 case SO_NOSIGPIPE:
2434                 case SO_NO_DDP:
2435                 case SO_NO_OFFLOAD:
2436                         error = sooptcopyin(sopt, &optval, sizeof optval,
2437                             sizeof optval);
2438                         if (error)
2439                                 goto bad;
2440                         SOCK_LOCK(so);
2441                         if (optval)
2442                                 so->so_options |= sopt->sopt_name;
2443                         else
2444                                 so->so_options &= ~sopt->sopt_name;
2445                         SOCK_UNLOCK(so);
2446                         break;
2447
2448                 case SO_SETFIB:
2449                         error = sooptcopyin(sopt, &optval, sizeof optval,
2450                             sizeof optval);
2451                         if (error)
2452                                 goto bad;
2453
2454                         if (optval < 0 || optval >= rt_numfibs) {
2455                                 error = EINVAL;
2456                                 goto bad;
2457                         }
2458                         if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
2459                            (so->so_proto->pr_domain->dom_family == PF_INET6) ||
2460                            (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
2461                                 so->so_fibnum = optval;
2462                         else
2463                                 so->so_fibnum = 0;
2464                         break;
2465
2466                 case SO_USER_COOKIE:
2467                         error = sooptcopyin(sopt, &val32, sizeof val32,
2468                             sizeof val32);
2469                         if (error)
2470                                 goto bad;
2471                         so->so_user_cookie = val32;
2472                         break;
2473
2474                 case SO_SNDBUF:
2475                 case SO_RCVBUF:
2476                 case SO_SNDLOWAT:
2477                 case SO_RCVLOWAT:
2478                         error = sooptcopyin(sopt, &optval, sizeof optval,
2479                             sizeof optval);
2480                         if (error)
2481                                 goto bad;
2482
2483                         /*
2484                          * Values < 1 make no sense for any of these options,
2485                          * so disallow them.
2486                          */
2487                         if (optval < 1) {
2488                                 error = EINVAL;
2489                                 goto bad;
2490                         }
2491
2492                         switch (sopt->sopt_name) {
2493                         case SO_SNDBUF:
2494                         case SO_RCVBUF:
2495                                 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2496                                     &so->so_snd : &so->so_rcv, (u_long)optval,
2497                                     so, curthread) == 0) {
2498                                         error = ENOBUFS;
2499                                         goto bad;
2500                                 }
2501                                 (sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2502                                     &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2503                                 break;
2504
2505                         /*
2506                          * Make sure the low-water is never greater than the
2507                          * high-water.
2508                          */
2509                         case SO_SNDLOWAT:
2510                                 SOCKBUF_LOCK(&so->so_snd);
2511                                 so->so_snd.sb_lowat =
2512                                     (optval > so->so_snd.sb_hiwat) ?
2513                                     so->so_snd.sb_hiwat : optval;
2514                                 SOCKBUF_UNLOCK(&so->so_snd);
2515                                 break;
2516                         case SO_RCVLOWAT:
2517                                 SOCKBUF_LOCK(&so->so_rcv);
2518                                 so->so_rcv.sb_lowat =
2519                                     (optval > so->so_rcv.sb_hiwat) ?
2520                                     so->so_rcv.sb_hiwat : optval;
2521                                 SOCKBUF_UNLOCK(&so->so_rcv);
2522                                 break;
2523                         }
2524                         break;
2525
2526                 case SO_SNDTIMEO:
2527                 case SO_RCVTIMEO:
2528 #ifdef COMPAT_FREEBSD32
2529                         if (SV_CURPROC_FLAG(SV_ILP32)) {
2530                                 struct timeval32 tv32;
2531
2532                                 error = sooptcopyin(sopt, &tv32, sizeof tv32,
2533                                     sizeof tv32);
2534                                 CP(tv32, tv, tv_sec);
2535                                 CP(tv32, tv, tv_usec);
2536                         } else
2537 #endif
2538                                 error = sooptcopyin(sopt, &tv, sizeof tv,
2539                                     sizeof tv);
2540                         if (error)
2541                                 goto bad;
2542                         if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
2543                             tv.tv_usec >= 1000000) {
2544                                 error = EDOM;
2545                                 goto bad;
2546                         }
2547                         if (tv.tv_sec > INT32_MAX)
2548                                 val = SBT_MAX;
2549                         else
2550                                 val = tvtosbt(tv);
2551                         switch (sopt->sopt_name) {
2552                         case SO_SNDTIMEO:
2553                                 so->so_snd.sb_timeo = val;
2554                                 break;
2555                         case SO_RCVTIMEO:
2556                                 so->so_rcv.sb_timeo = val;
2557                                 break;
2558                         }
2559                         break;
2560
2561                 case SO_LABEL:
2562 #ifdef MAC
2563                         error = sooptcopyin(sopt, &extmac, sizeof extmac,
2564                             sizeof extmac);
2565                         if (error)
2566                                 goto bad;
2567                         error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2568                             so, &extmac);
2569 #else
2570                         error = EOPNOTSUPP;
2571 #endif
2572                         break;
2573
2574                 default:
2575                         error = ENOPROTOOPT;
2576                         break;
2577                 }
2578                 if (error == 0 && so->so_proto->pr_ctloutput != NULL)
2579                         (void)(*so->so_proto->pr_ctloutput)(so, sopt);
2580         }
2581 bad:
2582         CURVNET_RESTORE();
2583         return (error);
2584 }
2585
2586 /*
2587  * Helper routine for getsockopt.
2588  */
2589 int
2590 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2591 {
2592         int     error;
2593         size_t  valsize;
2594
2595         error = 0;
2596
2597         /*
2598          * Documented get behavior is that we always return a value, possibly
2599          * truncated to fit in the user's buffer.  Traditional behavior is
2600          * that we always tell the user precisely how much we copied, rather
2601          * than something useful like the total amount we had available for
2602          * her.  Note that this interface is not idempotent; the entire
2603          * answer must generated ahead of time.
2604          */
2605         valsize = min(len, sopt->sopt_valsize);
2606         sopt->sopt_valsize = valsize;
2607         if (sopt->sopt_val != NULL) {
2608                 if (sopt->sopt_td != NULL)
2609                         error = copyout(buf, sopt->sopt_val, valsize);
2610                 else
2611                         bcopy(buf, sopt->sopt_val, valsize);
2612         }
2613         return (error);
2614 }
2615
2616 int
2617 sogetopt(struct socket *so, struct sockopt *sopt)
2618 {
2619         int     error, optval;
2620         struct  linger l;
2621         struct  timeval tv;
2622 #ifdef MAC
2623         struct mac extmac;
2624 #endif
2625
2626         CURVNET_SET(so->so_vnet);
2627         error = 0;
2628         if (sopt->sopt_level != SOL_SOCKET) {
2629                 if (so->so_proto->pr_ctloutput != NULL)
2630                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
2631                 else
2632                         error = ENOPROTOOPT;
2633                 CURVNET_RESTORE();
2634                 return (error);
2635         } else {
2636                 switch (sopt->sopt_name) {
2637                 case SO_ACCEPTFILTER:
2638                         error = do_getopt_accept_filter(so, sopt);
2639                         break;
2640
2641                 case SO_LINGER:
2642                         SOCK_LOCK(so);
2643                         l.l_onoff = so->so_options & SO_LINGER;
2644                         l.l_linger = so->so_linger;
2645                         SOCK_UNLOCK(so);
2646                         error = sooptcopyout(sopt, &l, sizeof l);
2647                         break;
2648
2649                 case SO_USELOOPBACK:
2650                 case SO_DONTROUTE:
2651                 case SO_DEBUG:
2652                 case SO_KEEPALIVE:
2653                 case SO_REUSEADDR:
2654                 case SO_REUSEPORT:
2655                 case SO_BROADCAST:
2656                 case SO_OOBINLINE:
2657                 case SO_ACCEPTCONN:
2658                 case SO_TIMESTAMP:
2659                 case SO_BINTIME:
2660                 case SO_NOSIGPIPE:
2661                         optval = so->so_options & sopt->sopt_name;
2662 integer:
2663                         error = sooptcopyout(sopt, &optval, sizeof optval);
2664                         break;
2665
2666                 case SO_TYPE:
2667                         optval = so->so_type;
2668                         goto integer;
2669
2670                 case SO_PROTOCOL:
2671                         optval = so->so_proto->pr_protocol;
2672                         goto integer;
2673
2674                 case SO_ERROR:
2675                         SOCK_LOCK(so);
2676                         optval = so->so_error;
2677                         so->so_error = 0;
2678                         SOCK_UNLOCK(so);
2679                         goto integer;
2680
2681                 case SO_SNDBUF:
2682                         optval = so->so_snd.sb_hiwat;
2683                         goto integer;
2684
2685                 case SO_RCVBUF:
2686                         optval = so->so_rcv.sb_hiwat;
2687                         goto integer;
2688
2689                 case SO_SNDLOWAT:
2690                         optval = so->so_snd.sb_lowat;
2691                         goto integer;
2692
2693                 case SO_RCVLOWAT:
2694                         optval = so->so_rcv.sb_lowat;
2695                         goto integer;
2696
2697                 case SO_SNDTIMEO:
2698                 case SO_RCVTIMEO:
2699                         tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
2700                             so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2701 #ifdef COMPAT_FREEBSD32
2702                         if (SV_CURPROC_FLAG(SV_ILP32)) {
2703                                 struct timeval32 tv32;
2704
2705                                 CP(tv, tv32, tv_sec);
2706                                 CP(tv, tv32, tv_usec);
2707                                 error = sooptcopyout(sopt, &tv32, sizeof tv32);
2708                         } else
2709 #endif
2710                                 error = sooptcopyout(sopt, &tv, sizeof tv);
2711                         break;
2712
2713                 case SO_LABEL:
2714 #ifdef MAC
2715                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2716                             sizeof(extmac));
2717                         if (error)
2718                                 goto bad;
2719                         error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2720                             so, &extmac);
2721                         if (error)
2722                                 goto bad;
2723                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
2724 #else
2725                         error = EOPNOTSUPP;
2726 #endif
2727                         break;
2728
2729                 case SO_PEERLABEL:
2730 #ifdef MAC
2731                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2732                             sizeof(extmac));
2733                         if (error)
2734                                 goto bad;
2735                         error = mac_getsockopt_peerlabel(
2736                             sopt->sopt_td->td_ucred, so, &extmac);
2737                         if (error)
2738                                 goto bad;
2739                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
2740 #else
2741                         error = EOPNOTSUPP;
2742 #endif
2743                         break;
2744
2745                 case SO_LISTENQLIMIT:
2746                         optval = so->so_qlimit;
2747                         goto integer;
2748
2749                 case SO_LISTENQLEN:
2750                         optval = so->so_qlen;
2751                         goto integer;
2752
2753                 case SO_LISTENINCQLEN:
2754                         optval = so->so_incqlen;
2755                         goto integer;
2756
2757                 default:
2758                         error = ENOPROTOOPT;
2759                         break;
2760                 }
2761         }
2762 #ifdef MAC
2763 bad:
2764 #endif
2765         CURVNET_RESTORE();
2766         return (error);
2767 }
2768
2769 int
2770 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2771 {
2772         struct mbuf *m, *m_prev;
2773         int sopt_size = sopt->sopt_valsize;
2774
2775         MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
2776         if (m == NULL)
2777                 return ENOBUFS;
2778         if (sopt_size > MLEN) {
2779                 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
2780                 if ((m->m_flags & M_EXT) == 0) {
2781                         m_free(m);
2782                         return ENOBUFS;
2783                 }
2784                 m->m_len = min(MCLBYTES, sopt_size);
2785         } else {
2786                 m->m_len = min(MLEN, sopt_size);
2787         }
2788         sopt_size -= m->m_len;
2789         *mp = m;
2790         m_prev = m;
2791
2792         while (sopt_size) {
2793                 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
2794                 if (m == NULL) {
2795                         m_freem(*mp);
2796                         return ENOBUFS;
2797                 }
2798                 if (sopt_size > MLEN) {
2799                         MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
2800                             M_NOWAIT);
2801                         if ((m->m_flags & M_EXT) == 0) {
2802                                 m_freem(m);
2803                                 m_freem(*mp);
2804                                 return ENOBUFS;
2805                         }
2806                         m->m_len = min(MCLBYTES, sopt_size);
2807                 } else {
2808                         m->m_len = min(MLEN, sopt_size);
2809                 }
2810                 sopt_size -= m->m_len;
2811                 m_prev->m_next = m;
2812                 m_prev = m;
2813         }
2814         return (0);
2815 }
2816
2817 int
2818 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2819 {
2820         struct mbuf *m0 = m;
2821
2822         if (sopt->sopt_val == NULL)
2823                 return (0);
2824         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2825                 if (sopt->sopt_td != NULL) {
2826                         int error;
2827
2828                         error = copyin(sopt->sopt_val, mtod(m, char *),
2829                             m->m_len);
2830                         if (error != 0) {
2831                                 m_freem(m0);
2832                                 return(error);
2833                         }
2834                 } else
2835                         bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2836                 sopt->sopt_valsize -= m->m_len;
2837                 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2838                 m = m->m_next;
2839         }
2840         if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
2841                 panic("ip6_sooptmcopyin");
2842         return (0);
2843 }
2844
2845 int
2846 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2847 {
2848         struct mbuf *m0 = m;
2849         size_t valsize = 0;
2850
2851         if (sopt->sopt_val == NULL)
2852                 return (0);
2853         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2854                 if (sopt->sopt_td != NULL) {
2855                         int error;
2856
2857                         error = copyout(mtod(m, char *), sopt->sopt_val,
2858                             m->m_len);
2859                         if (error != 0) {
2860                                 m_freem(m0);
2861                                 return(error);
2862                         }
2863                 } else
2864                         bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2865                 sopt->sopt_valsize -= m->m_len;
2866                 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2867                 valsize += m->m_len;
2868                 m = m->m_next;
2869         }
2870         if (m != NULL) {
2871                 /* enough soopt buffer should be given from user-land */
2872                 m_freem(m0);
2873                 return(EINVAL);
2874         }
2875         sopt->sopt_valsize = valsize;
2876         return (0);
2877 }
2878
2879 /*
2880  * sohasoutofband(): protocol notifies socket layer of the arrival of new
2881  * out-of-band data, which will then notify socket consumers.
2882  */
2883 void
2884 sohasoutofband(struct socket *so)
2885 {
2886
2887         if (so->so_sigio != NULL)
2888                 pgsigio(&so->so_sigio, SIGURG, 0);
2889         selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2890 }
2891
2892 int
2893 sopoll(struct socket *so, int events, struct ucred *active_cred,
2894     struct thread *td)
2895 {
2896
2897         /*
2898          * We do not need to set or assert curvnet as long as everyone uses
2899          * sopoll_generic().
2900          */
2901         return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2902             td));
2903 }
2904
2905 int
2906 sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2907     struct thread *td)
2908 {
2909         int revents = 0;
2910
2911         SOCKBUF_LOCK(&so->so_snd);
2912         SOCKBUF_LOCK(&so->so_rcv);
2913         if (events & (POLLIN | POLLRDNORM))
2914                 if (soreadabledata(so))
2915                         revents |= events & (POLLIN | POLLRDNORM);
2916
2917         if (events & (POLLOUT | POLLWRNORM))
2918                 if (sowriteable(so))
2919                         revents |= events & (POLLOUT | POLLWRNORM);
2920
2921         if (events & (POLLPRI | POLLRDBAND))
2922                 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2923                         revents |= events & (POLLPRI | POLLRDBAND);
2924
2925         if ((events & POLLINIGNEOF) == 0) {
2926                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2927                         revents |= events & (POLLIN | POLLRDNORM);
2928                         if (so->so_snd.sb_state & SBS_CANTSENDMORE)
2929                                 revents |= POLLHUP;
2930                 }
2931         }
2932
2933         if (revents == 0) {
2934                 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
2935                         selrecord(td, &so->so_rcv.sb_sel);
2936                         so->so_rcv.sb_flags |= SB_SEL;
2937                 }
2938
2939                 if (events & (POLLOUT | POLLWRNORM)) {
2940                         selrecord(td, &so->so_snd.sb_sel);
2941                         so->so_snd.sb_flags |= SB_SEL;
2942                 }
2943         }
2944
2945         SOCKBUF_UNLOCK(&so->so_rcv);
2946         SOCKBUF_UNLOCK(&so->so_snd);
2947         return (revents);
2948 }
2949
2950 int
2951 soo_kqfilter(struct file *fp, struct knote *kn)
2952 {
2953         struct socket *so = kn->kn_fp->f_data;
2954         struct sockbuf *sb;
2955
2956         switch (kn->kn_filter) {
2957         case EVFILT_READ:
2958                 if (so->so_options & SO_ACCEPTCONN)
2959                         kn->kn_fop = &solisten_filtops;
2960                 else
2961                         kn->kn_fop = &soread_filtops;
2962                 sb = &so->so_rcv;
2963                 break;
2964         case EVFILT_WRITE:
2965                 kn->kn_fop = &sowrite_filtops;
2966                 sb = &so->so_snd;
2967                 break;
2968         default:
2969                 return (EINVAL);
2970         }
2971
2972         SOCKBUF_LOCK(sb);
2973         knlist_add(&sb->sb_sel.si_note, kn, 1);
2974         sb->sb_flags |= SB_KNOTE;
2975         SOCKBUF_UNLOCK(sb);
2976         return (0);
2977 }
2978
/*
 * Some routines that return EOPNOTSUPP for entry points that are not
 * supported by a protocol.  Fill in as needed.
 */

/*
 * Accept entry point for protocols that do not support it: both arguments
 * are ignored and EOPNOTSUPP is always returned.
 */
int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{

	return (EOPNOTSUPP);
}
2989
/*
 * Attach entry point for protocols that do not support it: all arguments
 * are ignored and EOPNOTSUPP is always returned.
 */
int
pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
{

	return (EOPNOTSUPP);
}
2996
/*
 * Bind entry point for protocols that do not support it: all arguments are
 * ignored and EOPNOTSUPP is always returned.
 */
int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (EOPNOTSUPP);
}
3003
/*
 * Bindat entry point for protocols that do not support it: all arguments
 * are ignored and EOPNOTSUPP is always returned.
 */
int
pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
    struct thread *td)
{

	return (EOPNOTSUPP);
}
3011
/*
 * Connect entry point for protocols that do not support it: all arguments
 * are ignored and EOPNOTSUPP is always returned.
 */
int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (EOPNOTSUPP);
}
3018
/*
 * Connectat entry point for protocols that do not support it: all
 * arguments are ignored and EOPNOTSUPP is always returned.
 */
int
pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
    struct thread *td)
{

	return (EOPNOTSUPP);
}
3026
/*
 * Connect2 entry point for protocols that do not support it: both sockets
 * are ignored and EOPNOTSUPP is always returned.
 */
int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{

	return (EOPNOTSUPP);
}
3033
3034 int
3035 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
3036     struct ifnet *ifp, struct thread *td)
3037 {
3038
3039         return EOPNOTSUPP;
3040 }
3041
/*
 * Stub for protocols without disconnect support: ignores its argument and
 * always returns EOPNOTSUPP.
 */
int
pru_disconnect_notsupp(struct socket *so)
{

        return (EOPNOTSUPP);
}
3048
/*
 * Stub for protocols without listen support: ignores its arguments and
 * always returns EOPNOTSUPP.
 */
int
pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
{

        return (EOPNOTSUPP);
}
3055
/*
 * Stub for protocols without peeraddr support: ignores its arguments and
 * always returns EOPNOTSUPP.
 */
int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{

        return (EOPNOTSUPP);
}
3062
/*
 * Stub for protocols without rcvd support: ignores its arguments and
 * always returns EOPNOTSUPP.
 */
int
pru_rcvd_notsupp(struct socket *so, int flags)
{

        return (EOPNOTSUPP);
}
3069
/*
 * Stub for protocols without rcvoob support: ignores its arguments and
 * always returns EOPNOTSUPP.
 */
int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{

        return (EOPNOTSUPP);
}
3076
/*
 * Stub for protocols without send support: ignores its arguments and
 * always returns EOPNOTSUPP.
 */
int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *addr, struct mbuf *control, struct thread *td)
{

        return (EOPNOTSUPP);
}
3084
3085 /*
3086  * This isn't really a ``null'' operation, but it's the default one and
3087  * doesn't do anything destructive.
3088  */
3089 int
3090 pru_sense_null(struct socket *so, struct stat *sb)
3091 {
3092
3093         sb->st_blksize = so->so_snd.sb_hiwat;
3094         return 0;
3095 }
3096
/*
 * Stub for protocols without shutdown support: ignores its argument and
 * always returns EOPNOTSUPP.
 */
int
pru_shutdown_notsupp(struct socket *so)
{

        return (EOPNOTSUPP);
}
3103
/*
 * Stub for protocols without sockaddr support: ignores its arguments and
 * always returns EOPNOTSUPP.
 */
int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{

        return (EOPNOTSUPP);
}
3110
/*
 * Stub for protocols without sosend support: ignores its arguments and
 * always returns EOPNOTSUPP.
 */
int
pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{

        return (EOPNOTSUPP);
}
3118
/*
 * Stub for protocols without soreceive support: ignores its arguments and
 * always returns EOPNOTSUPP.
 */
int
pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{

        return (EOPNOTSUPP);
}
3126
/*
 * Stub for protocols without sopoll support: ignores its arguments and
 * always returns EOPNOTSUPP.
 */
int
pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
    struct thread *td)
{

        return (EOPNOTSUPP);
}
3134
3135 static void
3136 filt_sordetach(struct knote *kn)
3137 {
3138         struct socket *so = kn->kn_fp->f_data;
3139
3140         SOCKBUF_LOCK(&so->so_rcv);
3141         knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
3142         if (knlist_empty(&so->so_rcv.sb_sel.si_note))
3143                 so->so_rcv.sb_flags &= ~SB_KNOTE;
3144         SOCKBUF_UNLOCK(&so->so_rcv);
3145 }
3146
3147 /*ARGSUSED*/
3148 static int
3149 filt_soread(struct knote *kn, long hint)
3150 {
3151         struct socket *so;
3152
3153         so = kn->kn_fp->f_data;
3154         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3155
3156         kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3157         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3158                 kn->kn_flags |= EV_EOF;
3159                 kn->kn_fflags = so->so_error;
3160                 return (1);
3161         } else if (so->so_error)        /* temporary udp error */
3162                 return (1);
3163         else if (kn->kn_sfflags & NOTE_LOWAT)
3164                 return (kn->kn_data >= kn->kn_sdata);
3165         else
3166                 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
3167 }
3168
3169 static void
3170 filt_sowdetach(struct knote *kn)
3171 {
3172         struct socket *so = kn->kn_fp->f_data;
3173
3174         SOCKBUF_LOCK(&so->so_snd);
3175         knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
3176         if (knlist_empty(&so->so_snd.sb_sel.si_note))
3177                 so->so_snd.sb_flags &= ~SB_KNOTE;
3178         SOCKBUF_UNLOCK(&so->so_snd);
3179 }
3180
3181 /*ARGSUSED*/
3182 static int
3183 filt_sowrite(struct knote *kn, long hint)
3184 {
3185         struct socket *so;
3186
3187         so = kn->kn_fp->f_data;
3188         SOCKBUF_LOCK_ASSERT(&so->so_snd);
3189         kn->kn_data = sbspace(&so->so_snd);
3190         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3191                 kn->kn_flags |= EV_EOF;
3192                 kn->kn_fflags = so->so_error;
3193                 return (1);
3194         } else if (so->so_error)        /* temporary udp error */
3195                 return (1);
3196         else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3197             (so->so_proto->pr_flags & PR_CONNREQUIRED))
3198                 return (0);
3199         else if (kn->kn_sfflags & NOTE_LOWAT)
3200                 return (kn->kn_data >= kn->kn_sdata);
3201         else
3202                 return (kn->kn_data >= so->so_snd.sb_lowat);
3203 }
3204
3205 /*ARGSUSED*/
3206 static int
3207 filt_solisten(struct knote *kn, long hint)
3208 {
3209         struct socket *so = kn->kn_fp->f_data;
3210
3211         kn->kn_data = so->so_qlen;
3212         return (!TAILQ_EMPTY(&so->so_comp));
3213 }
3214
3215 int
3216 socheckuid(struct socket *so, uid_t uid)
3217 {
3218
3219         if (so == NULL)
3220                 return (EPERM);
3221         if (so->so_cred->cr_uid != uid)
3222                 return (EPERM);
3223         return (0);
3224 }
3225
3226 /*
3227  * These functions are used by protocols to notify the socket layer (and its
3228  * consumers) of state changes in the sockets driven by protocol-side events.
3229  */
3230
3231 /*
3232  * Procedures to manipulate state flags of socket and do appropriate wakeups.
3233  *
3234  * Normal sequence from the active (originating) side is that
3235  * soisconnecting() is called during processing of connect() call, resulting
3236  * in an eventual call to soisconnected() if/when the connection is
3237  * established.  When the connection is torn down soisdisconnecting() is
3238  * called during processing of disconnect() call, and soisdisconnected() is
3239  * called when the connection to the peer is totally severed.  The semantics
3240  * of these routines are such that connectionless protocols can call
3241  * soisconnected() and soisdisconnected() only, bypassing the in-progress
3242  * calls when setting up a ``connection'' takes no time.
3243  *
3244  * From the passive side, a socket is created with two queues of sockets:
3245  * so_incomp for connections in progress and so_comp for connections already
3246  * made and awaiting user acceptance.  As a protocol is preparing incoming
3247  * connections, it creates a socket structure queued on so_incomp by calling
3248  * sonewconn().  When the connection is established, soisconnected() is
3249  * called, and transfers the socket structure to so_comp, making it available
3250  * to accept().
3251  *
3252  * If a socket is closed with sockets on either so_incomp or so_comp, these
3253  * sockets are dropped.
3254  *
3255  * If higher-level protocols are implemented in the kernel, the wakeups done
3256  * here will sometimes cause software-interrupt process scheduling.
3257  */
3258 void
3259 soisconnecting(struct socket *so)
3260 {
3261
3262         SOCK_LOCK(so);
3263         so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3264         so->so_state |= SS_ISCONNECTING;
3265         SOCK_UNLOCK(so);
3266 }
3267
3268 void
3269 soisconnected(struct socket *so)
3270 {
3271         struct socket *head;
3272         int ret;
3273
3274 restart:
3275         ACCEPT_LOCK();
3276         SOCK_LOCK(so);
3277         so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3278         so->so_state |= SS_ISCONNECTED;
3279         head = so->so_head;
3280         if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
3281                 if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3282                         SOCK_UNLOCK(so);
3283                         TAILQ_REMOVE(&head->so_incomp, so, so_list);
3284                         head->so_incqlen--;
3285                         so->so_qstate &= ~SQ_INCOMP;
3286                         TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
3287                         head->so_qlen++;
3288                         so->so_qstate |= SQ_COMP;
3289                         ACCEPT_UNLOCK();
3290                         sorwakeup(head);
3291                         wakeup_one(&head->so_timeo);
3292                 } else {
3293                         ACCEPT_UNLOCK();
3294                         soupcall_set(so, SO_RCV,
3295                             head->so_accf->so_accept_filter->accf_callback,
3296                             head->so_accf->so_accept_filter_arg);
3297                         so->so_options &= ~SO_ACCEPTFILTER;
3298                         ret = head->so_accf->so_accept_filter->accf_callback(so,
3299                             head->so_accf->so_accept_filter_arg, M_NOWAIT);
3300                         if (ret == SU_ISCONNECTED)
3301                                 soupcall_clear(so, SO_RCV);
3302                         SOCK_UNLOCK(so);
3303                         if (ret == SU_ISCONNECTED)
3304                                 goto restart;
3305                 }
3306                 return;
3307         }
3308         SOCK_UNLOCK(so);
3309         ACCEPT_UNLOCK();
3310         wakeup(&so->so_timeo);
3311         sorwakeup(so);
3312         sowwakeup(so);
3313 }
3314
3315 void
3316 soisdisconnecting(struct socket *so)
3317 {
3318
3319         /*
3320          * Note: This code assumes that SOCK_LOCK(so) and
3321          * SOCKBUF_LOCK(&so->so_rcv) are the same.
3322          */
3323         SOCKBUF_LOCK(&so->so_rcv);
3324         so->so_state &= ~SS_ISCONNECTING;
3325         so->so_state |= SS_ISDISCONNECTING;
3326         so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3327         sorwakeup_locked(so);
3328         SOCKBUF_LOCK(&so->so_snd);
3329         so->so_snd.sb_state |= SBS_CANTSENDMORE;
3330         sowwakeup_locked(so);
3331         wakeup(&so->so_timeo);
3332 }
3333
3334 void
3335 soisdisconnected(struct socket *so)
3336 {
3337
3338         /*
3339          * Note: This code assumes that SOCK_LOCK(so) and
3340          * SOCKBUF_LOCK(&so->so_rcv) are the same.
3341          */
3342         SOCKBUF_LOCK(&so->so_rcv);
3343         so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3344         so->so_state |= SS_ISDISCONNECTED;
3345         so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3346         sorwakeup_locked(so);
3347         SOCKBUF_LOCK(&so->so_snd);
3348         so->so_snd.sb_state |= SBS_CANTSENDMORE;
3349         sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
3350         sowwakeup_locked(so);
3351         wakeup(&so->so_timeo);
3352 }
3353
3354 /*
3355  * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3356  */
3357 struct sockaddr *
3358 sodupsockaddr(const struct sockaddr *sa, int mflags)
3359 {
3360         struct sockaddr *sa2;
3361
3362         sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3363         if (sa2)
3364                 bcopy(sa, sa2, sa->sa_len);
3365         return sa2;
3366 }
3367
3368 /*
3369  * Register per-socket buffer upcalls.
3370  */
3371 void
3372 soupcall_set(struct socket *so, int which,
3373     int (*func)(struct socket *, void *, int), void *arg)
3374 {
3375         struct sockbuf *sb;
3376
3377         switch (which) {
3378         case SO_RCV:
3379                 sb = &so->so_rcv;
3380                 break;
3381         case SO_SND:
3382                 sb = &so->so_snd;
3383                 break;
3384         default:
3385                 panic("soupcall_set: bad which");
3386         }
3387         SOCKBUF_LOCK_ASSERT(sb);
3388 #if 0
3389         /* XXX: accf_http actually wants to do this on purpose. */
3390         KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
3391 #endif
3392         sb->sb_upcall = func;
3393         sb->sb_upcallarg = arg;
3394         sb->sb_flags |= SB_UPCALL;
3395 }
3396
3397 void
3398 soupcall_clear(struct socket *so, int which)
3399 {
3400         struct sockbuf *sb;
3401
3402         switch (which) {
3403         case SO_RCV:
3404                 sb = &so->so_rcv;
3405                 break;
3406         case SO_SND:
3407                 sb = &so->so_snd;
3408                 break;
3409         default:
3410                 panic("soupcall_clear: bad which");
3411         }
3412         SOCKBUF_LOCK_ASSERT(sb);
3413         KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
3414         sb->sb_upcall = NULL;
3415         sb->sb_upcallarg = NULL;
3416         sb->sb_flags &= ~SB_UPCALL;
3417 }
3418
3419 /*
3420  * Create an external-format (``xsocket'') structure using the information in
3421  * the kernel-format socket structure pointed to by so.  This is done to
3422  * reduce the spew of irrelevant information over this interface, to isolate
3423  * user code from changes in the kernel structure, and potentially to provide
3424  * information-hiding if we decide that some of this information should be
3425  * hidden from users.
3426  */
3427 void
3428 sotoxsocket(struct socket *so, struct xsocket *xso)
3429 {
3430
3431         xso->xso_len = sizeof *xso;
3432         xso->xso_so = so;
3433         xso->so_type = so->so_type;
3434         xso->so_options = so->so_options;
3435         xso->so_linger = so->so_linger;
3436         xso->so_state = so->so_state;
3437         xso->so_pcb = so->so_pcb;
3438         xso->xso_protocol = so->so_proto->pr_protocol;
3439         xso->xso_family = so->so_proto->pr_domain->dom_family;
3440         xso->so_qlen = so->so_qlen;
3441         xso->so_incqlen = so->so_incqlen;
3442         xso->so_qlimit = so->so_qlimit;
3443         xso->so_timeo = so->so_timeo;
3444         xso->so_error = so->so_error;
3445         xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
3446         xso->so_oobmark = so->so_oobmark;
3447         sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3448         sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3449         xso->so_uid = so->so_cred->cr_uid;
3450 }
3451
3452
3453 /*
3454  * Socket accessor functions to provide external consumers with
3455  * a safe interface to socket state
3456  *
3457  */
3458
3459 void
3460 so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *),
3461     void *arg)
3462 {
3463
3464         TAILQ_FOREACH(so, &so->so_comp, so_list)
3465                 func(so, arg);
3466 }
3467
3468 struct sockbuf *
3469 so_sockbuf_rcv(struct socket *so)
3470 {
3471
3472         return (&so->so_rcv);
3473 }
3474
3475 struct sockbuf *
3476 so_sockbuf_snd(struct socket *so)
3477 {
3478
3479         return (&so->so_snd);
3480 }
3481
3482 int
3483 so_state_get(const struct socket *so)
3484 {
3485
3486         return (so->so_state);
3487 }
3488
3489 void
3490 so_state_set(struct socket *so, int val)
3491 {
3492
3493         so->so_state = val;
3494 }
3495
3496 int
3497 so_options_get(const struct socket *so)
3498 {
3499
3500         return (so->so_options);
3501 }
3502
3503 void
3504 so_options_set(struct socket *so, int val)
3505 {
3506
3507         so->so_options = val;
3508 }
3509
3510 int
3511 so_error_get(const struct socket *so)
3512 {
3513
3514         return (so->so_error);
3515 }
3516
3517 void
3518 so_error_set(struct socket *so, int val)
3519 {
3520
3521         so->so_error = val;
3522 }
3523
3524 int
3525 so_linger_get(const struct socket *so)
3526 {
3527
3528         return (so->so_linger);
3529 }
3530
3531 void
3532 so_linger_set(struct socket *so, int val)
3533 {
3534
3535         so->so_linger = val;
3536 }
3537
3538 struct protosw *
3539 so_protosw_get(const struct socket *so)
3540 {
3541
3542         return (so->so_proto);
3543 }
3544
3545 void
3546 so_protosw_set(struct socket *so, struct protosw *val)
3547 {
3548
3549         so->so_proto = val;
3550 }
3551
/*
 * Function wrapper around sorwakeup() for external consumers.
 */
void
so_sorwakeup(struct socket *so)
{

        sorwakeup(so);
}
3558
/*
 * Function wrapper around sowwakeup() for external consumers.
 */
void
so_sowwakeup(struct socket *so)
{

        sowwakeup(so);
}
3565
/*
 * Function wrapper around sorwakeup_locked() for external consumers.
 */
void
so_sorwakeup_locked(struct socket *so)
{

        sorwakeup_locked(so);
}
3572
/*
 * Function wrapper around sowwakeup_locked() for external consumers.
 */
void
so_sowwakeup_locked(struct socket *so)
{

        sowwakeup_locked(so);
}
3579
/*
 * Function wrapper around SOCK_LOCK() for external consumers.
 */
void
so_lock(struct socket *so)
{

        SOCK_LOCK(so);
}
3586
/*
 * Function wrapper around SOCK_UNLOCK() for external consumers.
 */
void
so_unlock(struct socket *so)
{

        SOCK_UNLOCK(so);
}