sys/kern/uipc_socket.c

   1 /*-
   2  * Copyright (c) 2004 The FreeBSD Foundation
   3  * Copyright (c) 2004-2005 Robert N. M. Watson
   4  * Copyright (c) 1982, 1986, 1988, 1990, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 4. Neither the name of the University nor the names of its contributors
  16  *    may be used to endorse or promote products derived from this software
  17  *    without specific prior written permission.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  29  * SUCH DAMAGE.
  30  *
  31  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
  32  */
  33
  34 #include <sys/cdefs.h>
  35 __FBSDID("$FreeBSD$");
  36
  37 #include "opt_inet.h"
  38 #include "opt_mac.h"
  39 #include "opt_zero.h"
  40 #include "opt_compat.h"
  41
  42 #include <sys/param.h>
  43 #include <sys/systm.h>
  44 #include <sys/fcntl.h>
  45 #include <sys/limits.h>
  46 #include <sys/lock.h>
  47 #include <sys/mac.h>
  48 #include <sys/malloc.h>
  49 #include <sys/mbuf.h>
  50 #include <sys/mutex.h>
  51 #include <sys/domain.h>
  52 #include <sys/file.h>                   /* for struct knote */
  53 #include <sys/kernel.h>
  54 #include <sys/event.h>
  55 #include <sys/poll.h>
  56 #include <sys/proc.h>
  57 #include <sys/protosw.h>
  58 #include <sys/socket.h>
  59 #include <sys/socketvar.h>
  60 #include <sys/resourcevar.h>
  61 #include <sys/signalvar.h>
  62 #include <sys/sysctl.h>
  63 #include <sys/uio.h>
  64 #include <sys/jail.h>
  65
  66 #include <vm/uma.h>
  67
  68 #ifdef COMPAT_IA32
  69 #include <sys/mount.h>
  70 #include <compat/freebsd32/freebsd32.h>
  71
  72 extern struct sysentvec ia32_freebsd_sysvec;
  73 #endif
  74
  75 static int      soreceive_rcvoob(struct socket *so, struct uio *uio,
  76                     int flags);
  77
  78 static void     filt_sordetach(struct knote *kn);
  79 static int      filt_soread(struct knote *kn, long hint);
  80 static void     filt_sowdetach(struct knote *kn);
  81 static int      filt_sowrite(struct knote *kn, long hint);
  82 static int      filt_solisten(struct knote *kn, long hint);
  83
  84 static struct filterops solisten_filtops =
  85         { 1, NULL, filt_sordetach, filt_solisten };
  86 static struct filterops soread_filtops =
  87         { 1, NULL, filt_sordetach, filt_soread };
  88 static struct filterops sowrite_filtops =
  89         { 1, NULL, filt_sowdetach, filt_sowrite };
  90
  91 uma_zone_t socket_zone;
  92 so_gen_t        so_gencnt;      /* generation count for sockets */
  93
  94 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
  95 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
  96
  97 SYSCTL_DECL(_kern_ipc);
  98
  99 static int somaxconn = SOMAXCONN;
 100 static int somaxconn_sysctl(SYSCTL_HANDLER_ARGS);
 101 /* XXX: we dont have SYSCTL_USHORT */
 102 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
 103     0, sizeof(int), somaxconn_sysctl, "I", "Maximum pending socket connection "
 104     "queue size");
 105 static int numopensockets;
 106 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
 107     &numopensockets, 0, "Number of open sockets");
 108 #ifdef ZERO_COPY_SOCKETS
 109 /* These aren't static because they're used in other files. */
 110 int so_zero_copy_send = 1;
 111 int so_zero_copy_receive = 1;
 112 SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
 113     "Zero copy controls");
 114 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
 115     &so_zero_copy_receive, 0, "Enable zero copy receive");
 116 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
 117     &so_zero_copy_send, 0, "Enable zero copy send");
 118 #endif /* ZERO_COPY_SOCKETS */
 119
 120 /*
 121  * accept_mtx locks down per-socket fields relating to accept queues.  See
 122  * socketvar.h for an annotation of the protected fields of struct socket.
 123  */
 124 struct mtx accept_mtx;
 125 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
 126
 127 /*
 128  * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 129  * so_gencnt field.
 130  */
 131 static struct mtx so_global_mtx;
 132 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
 133
 134 /*
 135  * Socket operation routines.
 136  * These routines are called by the routines in
 137  * sys_socket.c or from a system process, and
 138  * implement the semantics of socket operations by
 139  * switching out to the protocol specific routines.
 140  */
 141
 142 /*
 143  * Get a socket structure from our zone, and initialize it.
 144  * Note that it would probably be better to allocate socket
 145  * and PCB at the same time, but I'm not convinced that all
 146  * the protocols can be easily modified to do this.
 147  *
 148  * soalloc() returns a socket with a ref count of 0.
 149  */
 150 struct socket *
 151 soalloc(void)
 152 {
 153         struct socket *so;
 154
 155         so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
 156         if (so == NULL)
 157                 return (NULL);
 158 #ifdef MAC
 159         if (mac_init_socket(so, M_NOWAIT) != 0) {
 160                 uma_zfree(socket_zone, so);
 161                 return (NULL);
 162         }
 163 #endif
 164         SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
 165         SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
 166         TAILQ_INIT(&so->so_aiojobq);
 167         mtx_lock(&so_global_mtx);
 168         so->so_gencnt = ++so_gencnt;
 169         ++numopensockets;
 170         mtx_unlock(&so_global_mtx);
 171         return (so);
 172 }
 173
 174 /*
 175  * socreate returns a socket with a ref count of 1.  The socket should be
 176  * closed with soclose().
 177  */
 178 int
 179 socreate(dom, aso, type, proto, cred, td)
 180         int dom;
 181         struct socket **aso;
 182         int type;
 183         int proto;
 184         struct ucred *cred;
 185         struct thread *td;
 186 {
 187         struct protosw *prp;
 188         struct socket *so;
 189         int error;
 190
 191         if (proto)
 192                 prp = pffindproto(dom, proto, type);
 193         else
 194                 prp = pffindtype(dom, type);
 195
 196         if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
 197             prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
 198                 return (EPROTONOSUPPORT);
 199
 200         if (jailed(cred) && jail_socket_unixiproute_only &&
 201             prp->pr_domain->dom_family != PF_LOCAL &&
 202             prp->pr_domain->dom_family != PF_INET &&
 203             prp->pr_domain->dom_family != PF_ROUTE) {
 204                 return (EPROTONOSUPPORT);
 205         }
 206
 207         if (prp->pr_type != type)
 208                 return (EPROTOTYPE);
 209         so = soalloc();
 210         if (so == NULL)
 211                 return (ENOBUFS);
 212
 213         TAILQ_INIT(&so->so_incomp);
 214         TAILQ_INIT(&so->so_comp);
 215         so->so_type = type;
 216         so->so_cred = crhold(cred);
 217         so->so_proto = prp;
 218 #ifdef MAC
 219         mac_create_socket(cred, so);
 220 #endif
 221         knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
 222             NULL, NULL, NULL);
 223         knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
 224             NULL, NULL, NULL);
 225         so->so_count = 1;
 226         error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
 227         if (error) {
 228                 ACCEPT_LOCK();
 229                 SOCK_LOCK(so);
 230                 so->so_state |= SS_NOFDREF;
 231                 sorele(so);
 232                 return (error);
 233         }
 234         *aso = so;
 235         return (0);
 236 }
 237
 238 int
 239 sobind(so, nam, td)
 240         struct socket *so;
 241         struct sockaddr *nam;
 242         struct thread *td;
 243 {
 244
 245         return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
 246 }
 247
 248 void
 249 sodealloc(struct socket *so)
 250 {
 251
 252         KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
 253         mtx_lock(&so_global_mtx);
 254         so->so_gencnt = ++so_gencnt;
 255         mtx_unlock(&so_global_mtx);
 256         if (so->so_rcv.sb_hiwat)
 257                 (void)chgsbsize(so->so_cred->cr_uidinfo,
 258                     &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
 259         if (so->so_snd.sb_hiwat)
 260                 (void)chgsbsize(so->so_cred->cr_uidinfo,
 261                     &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
 262 #ifdef INET
 263         /* remove acccept filter if one is present. */
 264         if (so->so_accf != NULL)
 265                 do_setopt_accept_filter(so, NULL);
 266 #endif
 267 #ifdef MAC
 268         mac_destroy_socket(so);
 269 #endif
 270         crfree(so->so_cred);
 271         SOCKBUF_LOCK_DESTROY(&so->so_snd);
 272         SOCKBUF_LOCK_DESTROY(&so->so_rcv);
 273         uma_zfree(socket_zone, so);
 274         mtx_lock(&so_global_mtx);
 275         --numopensockets;
 276         mtx_unlock(&so_global_mtx);
 277 }
 278
 279 /*
 280  * solisten() transitions a socket from a non-listening state to a listening
 281  * state, but can also be used to update the listen queue depth on an
 282  * existing listen socket.  The protocol will call back into the sockets
 283  * layer using solisten_proto_check() and solisten_proto() to check and set
 284  * socket-layer listen state.  Call backs are used so that the protocol can
 285  * acquire both protocol and socket layer locks in whatever order is required
 286  * by the protocol.
 287  *
 288  * Protocol implementors are advised to hold the socket lock across the
 289  * socket-layer test and set to avoid races at the socket layer.
 290  */
 291 int
 292 solisten(so, backlog, td)
 293         struct socket *so;
 294         int backlog;
 295         struct thread *td;
 296 {
 297         int error;
 298
 299         error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
 300         if (error)
 301                 return (error);
 302
 303         /*
 304          * XXXRW: The following state adjustment should occur in
 305          * solisten_proto(), but we don't currently pass the backlog request
 306          * to the protocol via pru_listen().
 307          */
 308         if (backlog < 0 || backlog > somaxconn)
 309                 backlog = somaxconn;
 310         so->so_qlimit = backlog;
 311         return (0);
 312 }
 313
 314 int
 315 solisten_proto_check(so)
 316         struct socket *so;
 317 {
 318
 319         SOCK_LOCK_ASSERT(so);
 320
 321         if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
 322             SS_ISDISCONNECTING))
 323                 return (EINVAL);
 324         return (0);
 325 }
 326
 327 void
 328 solisten_proto(so)
 329         struct socket *so;
 330 {
 331
 332         SOCK_LOCK_ASSERT(so);
 333
 334         so->so_options |= SO_ACCEPTCONN;
 335 }
 336
 337 /*
 338  * Attempt to free a socket.  This should really be sotryfree().
 339  *
 340  * We free the socket if the protocol is no longer interested in the socket,
 341  * there's no file descriptor reference, and the refcount is 0.  While the
 342  * calling macro sotryfree() tests the refcount, sofree() has to test it
 343  * again as it's possible to race with an accept()ing thread if the socket is
 344  * in an listen queue of a listen socket, as being in the listen queue
 345  * doesn't elevate the reference count.  sofree() acquires the accept mutex
 346  * early for this test in order to avoid that race.
 347  */
 348 void
 349 sofree(so)
 350         struct socket *so;
 351 {
 352         struct socket *head;
 353
 354         ACCEPT_LOCK_ASSERT();
 355         SOCK_LOCK_ASSERT(so);
 356
 357         if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0 ||
 358             so->so_count != 0) {
 359                 SOCK_UNLOCK(so);
 360                 ACCEPT_UNLOCK();
 361                 return;
 362         }
 363
 364         head = so->so_head;
 365         if (head != NULL) {
 366                 KASSERT((so->so_qstate & SQ_COMP) != 0 ||
 367                     (so->so_qstate & SQ_INCOMP) != 0,
 368                     ("sofree: so_head != NULL, but neither SQ_COMP nor "
 369                     "SQ_INCOMP"));
 370                 KASSERT((so->so_qstate & SQ_COMP) == 0 ||
 371                     (so->so_qstate & SQ_INCOMP) == 0,
 372                     ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
 373                 /*
 374                  * accept(2) is responsible draining the completed
 375                  * connection queue and freeing those sockets, so
 376                  * we just return here if this socket is currently
 377                  * on the completed connection queue.  Otherwise,
 378                  * accept(2) may hang after select(2) has indicating
 379                  * that a listening socket was ready.  If it's an
 380                  * incomplete connection, we remove it from the queue
 381                  * and free it; otherwise, it won't be released until
 382                  * the listening socket is closed.
 383                  */
 384                 if ((so->so_qstate & SQ_COMP) != 0) {
 385                         SOCK_UNLOCK(so);
 386                         ACCEPT_UNLOCK();
 387                         return;
 388                 }
 389                 TAILQ_REMOVE(&head->so_incomp, so, so_list);
 390                 head->so_incqlen--;
 391                 so->so_qstate &= ~SQ_INCOMP;
 392                 so->so_head = NULL;
 393         }
 394         KASSERT((so->so_qstate & SQ_COMP) == 0 &&
 395             (so->so_qstate & SQ_INCOMP) == 0,
 396             ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
 397             so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
 398         SOCK_UNLOCK(so);
 399         ACCEPT_UNLOCK();
 400         SOCKBUF_LOCK(&so->so_snd);
 401         so->so_snd.sb_flags |= SB_NOINTR;
 402         (void)sblock(&so->so_snd, M_WAITOK);
 403         /*
 404          * socantsendmore_locked() drops the socket buffer mutex so that it
 405          * can safely perform wakeups.  Re-acquire the mutex before
 406          * continuing.
 407          */
 408         socantsendmore_locked(so);
 409         SOCKBUF_LOCK(&so->so_snd);
 410         sbunlock(&so->so_snd);
 411         sbrelease_locked(&so->so_snd, so);
 412         SOCKBUF_UNLOCK(&so->so_snd);
 413         sorflush(so);
 414         knlist_destroy(&so->so_rcv.sb_sel.si_note);
 415         knlist_destroy(&so->so_snd.sb_sel.si_note);
 416         sodealloc(so);
 417 }
 418
 419 /*
 420  * Close a socket on last file table reference removal.
 421  * Initiate disconnect if connected.
 422  * Free socket when disconnect complete.
 423  *
 424  * This function will sorele() the socket.  Note that soclose() may be
 425  * called prior to the ref count reaching zero.  The actual socket
 426  * structure will not be freed until the ref count reaches zero.
 427  */
 428 int
 429 soclose(so)
 430         struct socket *so;
 431 {
 432         int error = 0;
 433
 434         KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
 435
 436         funsetown(&so->so_sigio);
 437         if (so->so_pcb == NULL)
 438                 goto discard;
 439         if (so->so_state & SS_ISCONNECTED) {
 440                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
 441                         error = sodisconnect(so);
 442                         if (error)
 443                                 goto drop;
 444                 }
 445                 if (so->so_options & SO_LINGER) {
 446                         if ((so->so_state & SS_ISDISCONNECTING) &&
 447                             (so->so_state & SS_NBIO))
 448                                 goto drop;
 449                         while (so->so_state & SS_ISCONNECTED) {
 450                                 error = tsleep(&so->so_timeo,
 451                                     PSOCK | PCATCH, "soclos", so->so_linger * hz);
 452                                 if (error)
 453                                         break;
 454                         }
 455                 }
 456         }
 457 drop:
 458         if (so->so_pcb != NULL) {
 459                 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
 460                 if (error == 0)
 461                         error = error2;
 462         }
 463         if (so->so_options & SO_ACCEPTCONN) {
 464                 struct socket *sp;
 465                 ACCEPT_LOCK();
 466                 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
 467                         TAILQ_REMOVE(&so->so_incomp, sp, so_list);
 468                         so->so_incqlen--;
 469                         sp->so_qstate &= ~SQ_INCOMP;
 470                         sp->so_head = NULL;
 471                         ACCEPT_UNLOCK();
 472                         (void) soabort(sp);
 473                         ACCEPT_LOCK();
 474                 }
 475                 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
 476                         TAILQ_REMOVE(&so->so_comp, sp, so_list);
 477                         so->so_qlen--;
 478                         sp->so_qstate &= ~SQ_COMP;
 479                         sp->so_head = NULL;
 480                         ACCEPT_UNLOCK();
 481                         (void) soabort(sp);
 482                         ACCEPT_LOCK();
 483                 }
 484                 ACCEPT_UNLOCK();
 485         }
 486 discard:
 487         ACCEPT_LOCK();
 488         SOCK_LOCK(so);
 489         KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
 490         so->so_state |= SS_NOFDREF;
 491         sorele(so);
 492         return (error);
 493 }
 494
 495 /*
 496  * soabort() must not be called with any socket locks held, as it calls
 497  * into the protocol, which will call back into the socket code causing
 498  * it to acquire additional socket locks that may cause recursion or lock
 499  * order reversals.
 500  */
 501 int
 502 soabort(so)
 503         struct socket *so;
 504 {
 505         int error;
 506
 507         error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
 508         if (error) {
 509                 ACCEPT_LOCK();
 510                 SOCK_LOCK(so);
 511                 sotryfree(so);  /* note: does not decrement the ref count */
 512                 return error;
 513         }
 514         return (0);
 515 }
 516
 517 int
 518 soaccept(so, nam)
 519         struct socket *so;
 520         struct sockaddr **nam;
 521 {
 522         int error;
 523
 524         SOCK_LOCK(so);
 525         KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
 526         so->so_state &= ~SS_NOFDREF;
 527         SOCK_UNLOCK(so);
 528         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
 529         return (error);
 530 }
 531
 532 int
 533 soconnect(so, nam, td)
 534         struct socket *so;
 535         struct sockaddr *nam;
 536         struct thread *td;
 537 {
 538         int error;
 539
 540         if (so->so_options & SO_ACCEPTCONN)
 541                 return (EOPNOTSUPP);
 542         /*
 543          * If protocol is connection-based, can only connect once.
 544          * Otherwise, if connected, try to disconnect first.
 545          * This allows user to disconnect by connecting to, e.g.,
 546          * a null address.
 547          */
 548         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
 549             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
 550             (error = sodisconnect(so)))) {
 551                 error = EISCONN;
 552         } else {
 553                 /*
 554                  * Prevent accumulated error from previous connection
 555                  * from biting us.
 556                  */
 557                 so->so_error = 0;
 558                 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
 559         }
 560
 561         return (error);
 562 }
 563
 564 int
 565 soconnect2(so1, so2)
 566         struct socket *so1;
 567         struct socket *so2;
 568 {
 569
 570         return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
 571 }
 572
 573 int
 574 sodisconnect(so)
 575         struct socket *so;
 576 {
 577         int error;
 578
 579         if ((so->so_state & SS_ISCONNECTED) == 0)
 580                 return (ENOTCONN);
 581         if (so->so_state & SS_ISDISCONNECTING)
 582                 return (EALREADY);
 583         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
 584         return (error);
 585 }
 586
 587 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
 588 /*
 589  * Send on a socket.
 590  * If send must go all at once and message is larger than
 591  * send buffering, then hard error.
 592  * Lock against other senders.
 593  * If must go all at once and not enough room now, then
 594  * inform user that this would block and do nothing.
 595  * Otherwise, if nonblocking, send as much as possible.
 596  * The data to be sent is described by "uio" if nonzero,
 597  * otherwise by the mbuf chain "top" (which must be null
 598  * if uio is not).  Data provided in mbuf chain must be small
 599  * enough to send all at once.
 600  *
 601  * Returns nonzero on error, timeout or signal; callers
 602  * must check for short counts if EINTR/ERESTART are returned.
 603  * Data and control buffers are freed on return.
 604  */
 605
 606 #ifdef ZERO_COPY_SOCKETS
 607 struct so_zerocopy_stats{
 608         int size_ok;
 609         int align_ok;
 610         int found_ifp;
 611 };
 612 struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
 613 #include <netinet/in.h>
 614 #include <net/route.h>
 615 #include <netinet/in_pcb.h>
 616 #include <vm/vm.h>
 617 #include <vm/vm_page.h>
 618 #include <vm/vm_object.h>
 619 #endif /*ZERO_COPY_SOCKETS*/
 620
 621 int
 622 sosend(so, addr, uio, top, control, flags, td)
 623         struct socket *so;
 624         struct sockaddr *addr;
 625         struct uio *uio;
 626         struct mbuf *top;
 627         struct mbuf *control;
 628         int flags;
 629         struct thread *td;
 630 {
 631         struct mbuf **mp;
 632         struct mbuf *m;
 633         long space, len = 0, resid;
 634         int clen = 0, error, dontroute;
 635         int atomic = sosendallatonce(so) || top;
 636 #ifdef ZERO_COPY_SOCKETS
 637         int cow_send;
 638 #endif /* ZERO_COPY_SOCKETS */
 639
 640         if (uio != NULL)
 641                 resid = uio->uio_resid;
 642         else
 643                 resid = top->m_pkthdr.len;
 644         /*
 645          * In theory resid should be unsigned.
 646          * However, space must be signed, as it might be less than 0
 647          * if we over-committed, and we must use a signed comparison
 648          * of space and resid.  On the other hand, a negative resid
 649          * causes us to loop sending 0-length segments to the protocol.
 650          *
 651          * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
 652          * type sockets since that's an error.
 653          */
 654         if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
 655                 error = EINVAL;
 656                 goto out;
 657         }
 658
 659         dontroute =
 660             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
 661             (so->so_proto->pr_flags & PR_ATOMIC);
 662         if (td != NULL)
 663                 td->td_proc->p_stats->p_ru.ru_msgsnd++;
 664         if (control != NULL)
 665                 clen = control->m_len;
 666 #define snderr(errno)   { error = (errno); goto release; }
 667
 668         SOCKBUF_LOCK(&so->so_snd);
 669 restart:
 670         SOCKBUF_LOCK_ASSERT(&so->so_snd);
 671         error = sblock(&so->so_snd, SBLOCKWAIT(flags));
 672         if (error)
 673                 goto out_locked;
 674         do {
 675                 SOCKBUF_LOCK_ASSERT(&so->so_snd);
 676                 if (so->so_snd.sb_state & SBS_CANTSENDMORE)
 677                         snderr(EPIPE);
 678                 if (so->so_error) {
 679                         error = so->so_error;
 680                         so->so_error = 0;
 681                         goto release;
 682                 }
 683                 if ((so->so_state & SS_ISCONNECTED) == 0) {
 684                         /*
 685                          * `sendto' and `sendmsg' is allowed on a connection-
 686                          * based socket if it supports implied connect.
 687                          * Return ENOTCONN if not connected and no address is
 688                          * supplied.
 689                          */
 690                         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
 691                             (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
 692                                 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
 693                                     !(resid == 0 && clen != 0))
 694                                         snderr(ENOTCONN);
 695                         } else if (addr == NULL)
 696                             snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
 697                                    ENOTCONN : EDESTADDRREQ);
 698                 }
 699                 space = sbspace(&so->so_snd);
 700                 if (flags & MSG_OOB)
 701                         space += 1024;
 702                 if ((atomic && resid > so->so_snd.sb_hiwat) ||
 703                     clen > so->so_snd.sb_hiwat)
 704                         snderr(EMSGSIZE);
 705                 if (space < resid + clen &&
 706                     (atomic || space < so->so_snd.sb_lowat || space < clen)) {
 707                         if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
 708                                 snderr(EWOULDBLOCK);
 709                         sbunlock(&so->so_snd);
 710                         error = sbwait(&so->so_snd);
 711                         if (error)
 712                                 goto out_locked;
 713                         goto restart;
 714                 }
 715                 SOCKBUF_UNLOCK(&so->so_snd);
 716                 mp = &top;
 717                 space -= clen;
 718                 do {
 719                     if (uio == NULL) {
 720                         /*
 721                          * Data is prepackaged in "top".
 722                          */
 723                         resid = 0;
 724                         if (flags & MSG_EOR)
 725                                 top->m_flags |= M_EOR;
 726                     } else do {
 727 #ifdef ZERO_COPY_SOCKETS
 728                         cow_send = 0;
 729 #endif /* ZERO_COPY_SOCKETS */
 730                         if (resid >= MINCLSIZE) {
 731 #ifdef ZERO_COPY_SOCKETS
 732                                 if (top == NULL) {
 733                                         MGETHDR(m, M_TRYWAIT, MT_DATA);
 734                                         if (m == NULL) {
 735                                                 error = ENOBUFS;
 736                                                 SOCKBUF_LOCK(&so->so_snd);
 737                                                 goto release;
 738                                         }
 739                                         m->m_pkthdr.len = 0;
 740                                         m->m_pkthdr.rcvif = NULL;
 741                                 } else {
 742                                         MGET(m, M_TRYWAIT, MT_DATA);
 743                                         if (m == NULL) {
 744                                                 error = ENOBUFS;
 745                                                 SOCKBUF_LOCK(&so->so_snd);
 746                                                 goto release;
 747                                         }
 748                                 }
 749                                 if (so_zero_copy_send &&
 750                                     resid>=PAGE_SIZE &&
 751                                     space>=PAGE_SIZE &&
 752                                     uio->uio_iov->iov_len>=PAGE_SIZE) {
 753                                         so_zerocp_stats.size_ok++;
 754                                         so_zerocp_stats.align_ok++;
 755                                         cow_send = socow_setup(m, uio);
 756                                         len = cow_send;
 757                                 }
 758                                 if (!cow_send) {
 759                                         MCLGET(m, M_TRYWAIT);
 760                                         if ((m->m_flags & M_EXT) == 0) {
 761                                                 m_free(m);
 762                                                 m = NULL;
 763                                         } else {
 764                                                 len = min(min(MCLBYTES, resid), space);
 765                                         }
 766                                 }
 767 #else /* ZERO_COPY_SOCKETS */
 768                                 if (top == NULL) {
 769                                         m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
 770                                         m->m_pkthdr.len = 0;
 771                                         m->m_pkthdr.rcvif = NULL;
 772                                 } else
 773                                         m = m_getcl(M_TRYWAIT, MT_DATA, 0);
 774                                 len = min(min(MCLBYTES, resid), space);
 775 #endif /* ZERO_COPY_SOCKETS */
 776                         } else {
 777                                 if (top == NULL) {
 778                                         m = m_gethdr(M_TRYWAIT, MT_DATA);
 779                                         m->m_pkthdr.len = 0;
 780                                         m->m_pkthdr.rcvif = NULL;
 781
 782                                         len = min(min(MHLEN, resid), space);
 783                                         /*
 784                                          * For datagram protocols, leave room
 785                                          * for protocol headers in first mbuf.
 786                                          */
 787                                         if (atomic && m && len < MHLEN)
 788                                                 MH_ALIGN(m, len);
 789                                 } else {
 790                                         m = m_get(M_TRYWAIT, MT_DATA);
 791                                         len = min(min(MLEN, resid), space);
 792                                 }
 793                         }
 794                         if (m == NULL) {
 795                                 error = ENOBUFS;
 796                                 SOCKBUF_LOCK(&so->so_snd);
 797                                 goto release;
 798                         }
 799
 800                         space -= len;
 801 #ifdef ZERO_COPY_SOCKETS
 802                         if (cow_send)
 803                                 error = 0;
 804                         else
 805 #endif /* ZERO_COPY_SOCKETS */
 806                         error = uiomove(mtod(m, void *), (int)len, uio);
 807                         resid = uio->uio_resid;
 808                         m->m_len = len;
 809                         *mp = m;
 810                         top->m_pkthdr.len += len;
 811                         if (error) {
 812                                 SOCKBUF_LOCK(&so->so_snd);
 813                                 goto release;
 814                         }
 815                         mp = &m->m_next;
 816                         if (resid <= 0) {
 817                                 if (flags & MSG_EOR)
 818                                         top->m_flags |= M_EOR;
 819                                 break;
 820                         }
 821                     } while (space > 0 && atomic);
 822                     if (dontroute) {
 823                             SOCK_LOCK(so);
 824                             so->so_options |= SO_DONTROUTE;
 825                             SOCK_UNLOCK(so);
 826                     }
 827                     /*
 828                      * XXX all the SBS_CANTSENDMORE checks previously
 829                      * done could be out of date.  We could have received
 830                      * a reset packet in an interrupt or maybe we slept
 831                      * while doing page faults in uiomove() etc. We could
 832                      * probably recheck again inside the locking protection
 833                      * here, but there are probably other places that this
 834                      * also happens.  We must rethink this.
 835                      */
 836                     error = (*so->so_proto->pr_usrreqs->pru_send)(so,
 837                         (flags & MSG_OOB) ? PRUS_OOB :
 838                         /*
 839                          * If the user set MSG_EOF, the protocol
 840                          * understands this flag and nothing left to
 841                          * send then use PRU_SEND_EOF instead of PRU_SEND.
 842                          */
 843                         ((flags & MSG_EOF) &&
 844                          (so->so_proto->pr_flags & PR_IMPLOPCL) &&
 845                          (resid <= 0)) ?
 846                                 PRUS_EOF :
 847                         /* If there is more to send set PRUS_MORETOCOME */
 848                         (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
 849                         top, addr, control, td);
 850                     if (dontroute) {
 851                             SOCK_LOCK(so);
 852                             so->so_options &= ~SO_DONTROUTE;
 853                             SOCK_UNLOCK(so);
 854                     }
 855                     clen = 0;
 856                     control = NULL;
 857                     top = NULL;
 858                     mp = &top;
 859                     if (error) {
 860                         SOCKBUF_LOCK(&so->so_snd);
 861                         goto release;
 862                     }
 863                 } while (resid && space > 0);
 864                 SOCKBUF_LOCK(&so->so_snd);
 865         } while (resid);
 866
 867 release:
 868         SOCKBUF_LOCK_ASSERT(&so->so_snd);
 869         sbunlock(&so->so_snd);
 870 out_locked:
 871         SOCKBUF_LOCK_ASSERT(&so->so_snd);
 872         SOCKBUF_UNLOCK(&so->so_snd);
 873 out:
 874         if (top != NULL)
 875                 m_freem(top);
 876         if (control != NULL)
 877                 m_freem(control);
 878         return (error);
 879 }
 880
 881 /*
 882  * The part of soreceive() that implements reading non-inline out-of-band
 883  * data from a socket.  For more complete comments, see soreceive(), from
 884  * which this code originated.
 885  *
 886  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 887  * unable to return an mbuf chain to the caller.
 888  */
 889 static int
 890 soreceive_rcvoob(so, uio, flags)
 891         struct socket *so;
 892         struct uio *uio;
 893         int flags;
 894 {
 895         struct protosw *pr = so->so_proto;
 896         struct mbuf *m;
 897         int error;
 898
 899         KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
 900
 901         m = m_get(M_TRYWAIT, MT_DATA);
 902         if (m == NULL)
 903                 return (ENOBUFS);
 904         error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
 905         if (error)
 906                 goto bad;
 907         do {
 908 #ifdef ZERO_COPY_SOCKETS
 909                 if (so_zero_copy_receive) {
 910                         int disposable;
 911
 912                         if ((m->m_flags & M_EXT)
 913                          && (m->m_ext.ext_type == EXT_DISPOSABLE))
 914                                 disposable = 1;
 915                         else
 916                                 disposable = 0;
 917
 918                         error = uiomoveco(mtod(m, void *),
 919                                           min(uio->uio_resid, m->m_len),
 920                                           uio, disposable);
 921                 } else
 922 #endif /* ZERO_COPY_SOCKETS */
 923                 error = uiomove(mtod(m, void *),
 924                     (int) min(uio->uio_resid, m->m_len), uio);
 925                 m = m_free(m);
 926         } while (uio->uio_resid && error == 0 && m);
 927 bad:
 928         if (m != NULL)
 929                 m_freem(m);
 930         return (error);
 931 }
 932
 933 /*
 934  * Following replacement or removal of the first mbuf on the first mbuf chain
 935  * of a socket buffer, push necessary state changes back into the socket
 936  * buffer so that other consumers see the values consistently.  'nextrecord'
 937  * is the callers locally stored value of the original value of
 938  * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 939  * NOTE: 'nextrecord' may be NULL.
 940  */
 941 static __inline void
 942 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
 943 {
 944
 945         SOCKBUF_LOCK_ASSERT(sb);
 946         /*
 947          * First, update for the new value of nextrecord.  If necessary, make
 948          * it the first record.
 949          */
 950         if (sb->sb_mb != NULL)
 951                 sb->sb_mb->m_nextpkt = nextrecord;
 952         else
 953                 sb->sb_mb = nextrecord;
 954
 955         /*
 956          * Now update any dependent socket buffer fields to reflect the new
 957          * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
 958          * addition of a second clause that takes care of the case where
 959          * sb_mb has been updated, but remains the last record.
 960          */
 961         if (sb->sb_mb == NULL) {
 962                 sb->sb_mbtail = NULL;
 963                 sb->sb_lastrecord = NULL;
 964         } else if (sb->sb_mb->m_nextpkt == NULL)
 965                 sb->sb_lastrecord = sb->sb_mb;
 966 }
 967
 968
 969 /*
 970  * Implement receive operations on a socket.
 971  * We depend on the way that records are added to the sockbuf
 972  * by sbappend*.  In particular, each record (mbufs linked through m_next)
 973  * must begin with an address if the protocol so specifies,
 974  * followed by an optional mbuf or mbufs containing ancillary data,
 975  * and then zero or more mbufs of data.
 976  * In order to avoid blocking network processing for the entire time here,
 977  * we release the socket buffer mutex while doing the actual copy to user space.
 978  * Although the sockbuf is locked, new data may still be appended,
 979  * and thus we must maintain consistency of the sockbuf during that time.
 980  *
 981  * The caller may receive the data as a single mbuf chain by supplying
 982  * an mbuf **mp0 for use in returning the chain.  The uio is then used
 983  * only for the count in uio_resid.
      *
      * Arguments, as used by the code below:
      *   so       - socket whose receive buffer (so_rcv) is read.
      *   psa      - if non-NULL, set to NULL on entry; for PR_ADDR protocols
      *              it receives a sodupsockaddr() copy of the record's address.
      *   uio      - target of the copy-out; uio_resid is the requested count.
      *   mp0      - if non-NULL, data mbufs are returned through it instead of
      *              being copied out via the uio.
      *   controlp - if non-NULL, set to NULL on entry and then receives any
      *              control mbufs that precede the data.
      *   flagsp   - if non-NULL, supplies the MSG_* flags (MSG_EOR is masked
      *              off on input); the resulting flags are OR'd back in unless
      *              the function leaves through the release/out labels early.
      *
      * Returns 0 or an errno value.
 984  */
 985 int
 986 soreceive(so, psa, uio, mp0, controlp, flagsp)
 987         struct socket *so;
 988         struct sockaddr **psa;
 989         struct uio *uio;
 990         struct mbuf **mp0;
 991         struct mbuf **controlp;
 992         int *flagsp;
 993 {
 994         struct mbuf *m, **mp;
 995         int flags, len, error, offset;
 996         struct protosw *pr = so->so_proto;
 997         struct mbuf *nextrecord;
 998         int moff, type = 0;
 999         int orig_resid = uio->uio_resid;
1000
1001         mp = mp0;
1002         if (psa != NULL)
1003                 *psa = NULL;
1004         if (controlp != NULL)
1005                 *controlp = NULL;
1006         if (flagsp != NULL)
1007                 flags = *flagsp &~ MSG_EOR;
1008         else
1009                 flags = 0;
             /* Out-of-band requests are serviced entirely by the helper. */
1010         if (flags & MSG_OOB)
1011                 return (soreceive_rcvoob(so, uio, flags));
1012         if (mp != NULL)
1013                 *mp = NULL;
1014         if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1015             && uio->uio_resid)
1016                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1017
1018         SOCKBUF_LOCK(&so->so_rcv);
1019 restart:
1020         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1021         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1022         if (error)
1023                 goto out;
1024
1025         m = so->so_rcv.sb_mb;
1026         /*
1027          * If we have less data than requested, block awaiting more
1028          * (subject to any timeout) if:
1029          *   1. the current count is less than the low water mark, or
1030          *   2. MSG_WAITALL is set, and it is possible to do the entire
1031          *      receive operation at once if we block (resid <= hiwat).
1032          *   3. MSG_DONTWAIT is not set
1033          * If MSG_WAITALL is set but resid is larger than the receive buffer,
1034          * we have to do the receive in sections, and thus risk returning
1035          * a short count if a timeout or signal occurs after we start.
1036          */
1037         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1038             so->so_rcv.sb_cc < uio->uio_resid) &&
1039             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1040             ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1041             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1042                 KASSERT(m != NULL || !so->so_rcv.sb_cc,
1043                     ("receive: m == %p so->so_rcv.sb_cc == %u",
1044                     m, so->so_rcv.sb_cc));
                     /*
                      * A pending socket error is reported only when no data is
                      * queued; it is cleared unless the caller is peeking.
                      */
1045                 if (so->so_error) {
1046                         if (m != NULL)
1047                                 goto dontblock;
1048                         error = so->so_error;
1049                         if ((flags & MSG_PEEK) == 0)
1050                                 so->so_error = 0;
1051                         goto release;
1052                 }
1053                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1054                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1055                         if (m)
1056                                 goto dontblock;
1057                         else
1058                                 goto release;
1059                 }
                     /*
                      * Do not wait if the queued chain already contains
                      * out-of-band data or an end-of-record mark.
                      */
1060                 for (; m != NULL; m = m->m_next)
1061                         if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1062                                 m = so->so_rcv.sb_mb;
1063                                 goto dontblock;
1064                         }
1065                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1066                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1067                         error = ENOTCONN;
1068                         goto release;
1069                 }
1070                 if (uio->uio_resid == 0)
1071                         goto release;
1072                 if ((so->so_state & SS_NBIO) ||
1073                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1074                         error = EWOULDBLOCK;
1075                         goto release;
1076                 }
1077                 SBLASTRECORDCHK(&so->so_rcv);
1078                 SBLASTMBUFCHK(&so->so_rcv);
                     /* Give up the sblock() across sbwait(), then start over. */
1079                 sbunlock(&so->so_rcv);
1080                 error = sbwait(&so->so_rcv);
1081                 if (error)
1082                         goto out;
1083                 goto restart;
1084         }
1085 dontblock:
1086         /*
1087          * From this point onward, we maintain 'nextrecord' as a cache of the
1088          * pointer to the next record in the socket buffer.  We must keep the
1089          * various socket buffer pointers and local stack versions of the
1090          * pointers in sync, pushing out modifications before dropping the
1091          * socket buffer mutex, and re-reading them when picking it up.
1092          *
1093          * Otherwise, we will race with the network stack appending new data
1094          * or records onto the socket buffer by using inconsistent/stale
1095          * versions of the field, possibly resulting in socket buffer
1096          * corruption.
1097          *
1098          * By holding the high-level sblock(), we prevent simultaneous
1099          * readers from pulling off the front of the socket buffer.
1100          */
1101         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1102         if (uio->uio_td)
1103                 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
1104         KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1105         SBLASTRECORDCHK(&so->so_rcv);
1106         SBLASTMBUFCHK(&so->so_rcv);
1107         nextrecord = m->m_nextpkt;
1108         if (pr->pr_flags & PR_ADDR) {
1109                 KASSERT(m->m_type == MT_SONAME,
1110                     ("m->m_type == %d", m->m_type));
                     /*
                      * A zero orig_resid makes the restart test near the end of
                      * this function false.
                      */
1111                 orig_resid = 0;
1112                 if (psa != NULL)
1113                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1114                             M_NOWAIT);
1115                 if (flags & MSG_PEEK) {
1116                         m = m->m_next;
1117                 } else {
1118                         sbfree(&so->so_rcv, m);
1119                         so->so_rcv.sb_mb = m_free(m);
1120                         m = so->so_rcv.sb_mb;
1121                         sockbuf_pushsync(&so->so_rcv, nextrecord);
1122                 }
1123         }
1124
1125         /*
1126          * Process one or more MT_CONTROL mbufs present before any data mbufs
1127          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1128          * just copy the data; if !MSG_PEEK, we call into the protocol to
1129          * perform externalization (or freeing if controlp == NULL).
1130          */
1131         if (m != NULL && m->m_type == MT_CONTROL) {
1132                 struct mbuf *cm = NULL, *cmn;
1133                 struct mbuf **cme = &cm;
1134
1135                 do {
1136                         if (flags & MSG_PEEK) {
1137                                 if (controlp != NULL) {
1138                                         *controlp = m_copy(m, 0, m->m_len);
1139                                         controlp = &(*controlp)->m_next;
1140                                 }
1141                                 m = m->m_next;
1142                         } else {
1143                                 sbfree(&so->so_rcv, m);
1144                                 so->so_rcv.sb_mb = m->m_next;
1145                                 m->m_next = NULL;
1146                                 *cme = m;
1147                                 cme = &(*cme)->m_next;
1148                                 m = so->so_rcv.sb_mb;
1149                         }
1150                 } while (m != NULL && m->m_type == MT_CONTROL);
1151                 if ((flags & MSG_PEEK) == 0)
1152                         sockbuf_pushsync(&so->so_rcv, nextrecord);
1153                 while (cm != NULL) {
1154                         cmn = cm->m_next;
1155                         cm->m_next = NULL;
1156                         if (pr->pr_domain->dom_externalize != NULL) {
                                     /* The hook is called without the sockbuf mutex. */
1157                                 SOCKBUF_UNLOCK(&so->so_rcv);
1158                                 error = (*pr->pr_domain->dom_externalize)
1159                                     (cm, controlp);
1160                                 SOCKBUF_LOCK(&so->so_rcv);
1161                         } else if (controlp != NULL)
1162                                 *controlp = cm;
1163                         else
1164                                 m_freem(cm);
1165                         if (controlp != NULL) {
1166                                 orig_resid = 0;
1167                                 while (*controlp != NULL)
1168                                         controlp = &(*controlp)->m_next;
1169                         }
1170                         cm = cmn;
1171                 }
1172                 if (m != NULL)
1173                         nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1174                 else
1175                         nextrecord = so->so_rcv.sb_mb;
1176                 orig_resid = 0;
1177         }
1178         if (m != NULL) {
1179                 if ((flags & MSG_PEEK) == 0) {
1180                         KASSERT(m->m_nextpkt == nextrecord,
1181                             ("soreceive: post-control, nextrecord !sync"));
1182                         if (nextrecord == NULL) {
1183                                 KASSERT(so->so_rcv.sb_mb == m,
1184                                     ("soreceive: post-control, sb_mb!=m"));
1185                                 KASSERT(so->so_rcv.sb_lastrecord == m,
1186                                     ("soreceive: post-control, lastrecord!=m"));
1187                         }
1188                 }
1189                 type = m->m_type;
1190                 if (type == MT_OOBDATA)
1191                         flags |= MSG_OOB;
1192         } else {
1193                 if ((flags & MSG_PEEK) == 0) {
1194                         KASSERT(so->so_rcv.sb_mb == nextrecord,
1195                             ("soreceive: sb_mb != nextrecord"));
1196                         if (so->so_rcv.sb_mb == NULL) {
1197                                 KASSERT(so->so_rcv.sb_lastrecord == NULL,
1198                                     ("soreceive: sb_lastercord != NULL"));
1199                         }
1200                 }
1201         }
1202         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1203         SBLASTRECORDCHK(&so->so_rcv);
1204         SBLASTMBUFCHK(&so->so_rcv);
1205
1206         /*
1207          * Now continue to read any data mbufs off of the head of the socket
1208          * buffer until the read request is satisfied.  Note that 'type' is
1209          * used to store the type of any mbuf reads that have happened so far
1210          * such that soreceive() can stop reading if the type changes, which
1211          * causes soreceive() to return only one of regular data and inline
1212          * out-of-band data in a single socket receive operation.
1213          */
1214         moff = 0;
1215         offset = 0;
1216         while (m != NULL && uio->uio_resid > 0 && error == 0) {
1217                 /*
1218                  * If the type of mbuf has changed since the last mbuf
1219                  * examined ('type'), end the receive operation.
1220                  */
1221                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1222                 if (m->m_type == MT_OOBDATA) {
1223                         if (type != MT_OOBDATA)
1224                                 break;
1225                 } else if (type == MT_OOBDATA)
1226                         break;
1227                 else
1228                     KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
1229                         ("m->m_type == %d", m->m_type));
1230                 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1231                 len = uio->uio_resid;
1232                 if (so->so_oobmark && len > so->so_oobmark - offset)
1233                         len = so->so_oobmark - offset;
1234                 if (len > m->m_len - moff)
1235                         len = m->m_len - moff;
1236                 /*
1237                  * If mp is set, just pass back the mbufs.
1238                  * Otherwise copy them out via the uio, then free.
1239                  * Sockbuf must be consistent here (points to current mbuf,
1240                  * it points to next record) when we drop the sockbuf mutex;
1241                  * we must note any additions to the sockbuf when we
1242                  * take the mutex again.
1243                  */
1244                 if (mp == NULL) {
1245                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1246                         SBLASTRECORDCHK(&so->so_rcv);
1247                         SBLASTMBUFCHK(&so->so_rcv);
1248                         SOCKBUF_UNLOCK(&so->so_rcv);
1249 #ifdef ZERO_COPY_SOCKETS
1250                         if (so_zero_copy_receive) {
1251                                 int disposable;
1252
1253                                 if ((m->m_flags & M_EXT)
1254                                  && (m->m_ext.ext_type == EXT_DISPOSABLE))
1255                                         disposable = 1;
1256                                 else
1257                                         disposable = 0;
1258
1259                                 error = uiomoveco(mtod(m, char *) + moff,
1260                                                   (int)len, uio,
1261                                                   disposable);
1262                         } else
1263 #endif /* ZERO_COPY_SOCKETS */
1264                         error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1265                         SOCKBUF_LOCK(&so->so_rcv);
1266                         if (error) {
1267                                 /*
1268                                  * If any part of the record has been removed
1269                                  * (such as the MT_SONAME mbuf, which will
1270                                  * happen when PR_ADDR, and thus also
1271                                  * PR_ATOMIC, is set), then drop the entire
1272                                  * record to maintain the atomicity of the
1273                                  * receive operation.
1274                                  */
1275                                 if (m && pr->pr_flags & PR_ATOMIC &&
1276                                     ((flags & MSG_PEEK) == 0))
1277                                         (void)sbdroprecord_locked(&so->so_rcv);
1278                                 goto release;
1279                         }
1280                 } else
                             /* Mbufs are handed back: only adjust the count. */
1281                         uio->uio_resid -= len;
1282                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1283                 if (len == m->m_len - moff) {
1284                         if (m->m_flags & M_EOR)
1285                                 flags |= MSG_EOR;
1286                         if (flags & MSG_PEEK) {
1287                                 m = m->m_next;
1288                                 moff = 0;
1289                         } else {
1290                                 nextrecord = m->m_nextpkt;
1291                                 sbfree(&so->so_rcv, m);
1292                                 if (mp != NULL) {
1293                                         *mp = m;
1294                                         mp = &m->m_next;
1295                                         so->so_rcv.sb_mb = m = m->m_next;
1296                                         *mp = NULL;
1297                                 } else {
1298                                         so->so_rcv.sb_mb = m_free(m);
1299                                         m = so->so_rcv.sb_mb;
1300                                 }
1301                                 if (m != NULL) {
1302                                         m->m_nextpkt = nextrecord;
1303                                         if (nextrecord == NULL)
1304                                                 so->so_rcv.sb_lastrecord = m;
1305                                 } else {
1306                                         so->so_rcv.sb_mb = nextrecord;
1307                                         SB_EMPTY_FIXUP(&so->so_rcv);
1308                                 }
1309                                 SBLASTRECORDCHK(&so->so_rcv);
1310                                 SBLASTMBUFCHK(&so->so_rcv);
1311                         }
1312                 } else {
1313                         if (flags & MSG_PEEK)
1314                                 moff += len;
1315                         else {
1316                                 if (mp != NULL) {
1317                                         int copy_flag;
1318
1319                                         if (flags & MSG_DONTWAIT)
1320                                                 copy_flag = M_DONTWAIT;
1321                                         else
1322                                                 copy_flag = M_TRYWAIT;
1323                                         if (copy_flag == M_TRYWAIT)
1324                                                 SOCKBUF_UNLOCK(&so->so_rcv);
1325                                         *mp = m_copym(m, 0, len, copy_flag);
1326                                         if (copy_flag == M_TRYWAIT)
1327                                                 SOCKBUF_LOCK(&so->so_rcv);
1328                                         if (*mp == NULL) {
1329                                                 /*
1330                                                  * m_copym() couldn't allocate an mbuf.
1331                                                  * Adjust uio_resid back (it was adjusted
1332                                                  * down by len bytes, which we didn't end
1333                                                  * up "copying" over).
1334                                                  */
1335                                                 uio->uio_resid += len;
1336                                                 break;
1337                                         }
1338                                 }
1339                                 m->m_data += len;
1340                                 m->m_len -= len;
1341                                 so->so_rcv.sb_cc -= len;
1342                         }
1343                 }
1344                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1345                 if (so->so_oobmark) {
1346                         if ((flags & MSG_PEEK) == 0) {
1347                                 so->so_oobmark -= len;
1348                                 if (so->so_oobmark == 0) {
1349                                         so->so_rcv.sb_state |= SBS_RCVATMARK;
1350                                         break;
1351                                 }
1352                         } else {
1353                                 offset += len;
1354                                 if (offset == so->so_oobmark)
1355                                         break;
1356                         }
1357                 }
1358                 if (flags & MSG_EOR)
1359                         break;
1360                 /*
1361                  * If the MSG_WAITALL flag is set (for non-atomic socket),
1362                  * we must not quit until "uio->uio_resid == 0" or an error
1363                  * termination.  If a signal/timeout occurs, return
1364                  * with a short count but without error.
1365                  * Keep sockbuf locked against other readers.
1366                  */
1367                 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1368                     !sosendallatonce(so) && nextrecord == NULL) {
1369                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1370                         if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1371                                 break;
1372                         /*
1373                          * Notify the protocol that some data has been
1374                          * drained before blocking.
1375                          */
1376                         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
1377                                 SOCKBUF_UNLOCK(&so->so_rcv);
1378                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1379                                 SOCKBUF_LOCK(&so->so_rcv);
1380                         }
1381                         SBLASTRECORDCHK(&so->so_rcv);
1382                         SBLASTMBUFCHK(&so->so_rcv);
1383                         error = sbwait(&so->so_rcv);
1384                         if (error)
1385                                 goto release;
1386                         m = so->so_rcv.sb_mb;
1387                         if (m != NULL)
1388                                 nextrecord = m->m_nextpkt;
1389                 }
1390         }
1391
1392         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
             /*
              * Atomic protocols: if part of the record is still unread, report
              * MSG_TRUNC and, unless peeking, discard the rest of the record.
              */
1393         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1394                 flags |= MSG_TRUNC;
1395                 if ((flags & MSG_PEEK) == 0)
1396                         (void) sbdroprecord_locked(&so->so_rcv);
1397         }
1398         if ((flags & MSG_PEEK) == 0) {
1399                 if (m == NULL) {
1400                         /*
1401                          * First part is an inline SB_EMPTY_FIXUP().  Second
1402                          * part makes sure sb_lastrecord is up-to-date if
1403                          * there is still data in the socket buffer.
1404                          */
1405                         so->so_rcv.sb_mb = nextrecord;
1406                         if (so->so_rcv.sb_mb == NULL) {
1407                                 so->so_rcv.sb_mbtail = NULL;
1408                                 so->so_rcv.sb_lastrecord = NULL;
1409                         } else if (nextrecord->m_nextpkt == NULL)
1410                                 so->so_rcv.sb_lastrecord = nextrecord;
1411                 }
1412                 SBLASTRECORDCHK(&so->so_rcv);
1413                 SBLASTMBUFCHK(&so->so_rcv);
1414                 /*
1415                  * If soreceive() is being done from the socket callback, then
1416                  * don't need to generate ACK to peer to update window, since
1417                  * ACK will be generated on return to TCP.
1418                  */
1419                 if (!(flags & MSG_SOCALLBCK) &&
1420                     (pr->pr_flags & PR_WANTRCVD) && so->so_pcb) {
1421                         SOCKBUF_UNLOCK(&so->so_rcv);
1422                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1423                         SOCKBUF_LOCK(&so->so_rcv);
1424                 }
1425         }
1426         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
             /*
              * The residual count is unchanged (and non-zero), no end of record
              * was seen and the receive side is not shut down: release the
              * sblock() and try again from the top.
              */
1427         if (orig_resid == uio->uio_resid && orig_resid &&
1428             (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1429                 sbunlock(&so->so_rcv);
1430                 goto restart;
1431         }
1432
1433         if (flagsp != NULL)
1434                 *flagsp |= flags;
1435 release:
1436         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1437         sbunlock(&so->so_rcv);
1438 out:
1439         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1440         SOCKBUF_UNLOCK(&so->so_rcv);
1441         return (error);
1442 }
1443
1444 int
1445 soshutdown(so, how)
1446         struct socket *so;
1447         int how;
1448 {
1449         struct protosw *pr = so->so_proto;
1450
1451         if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1452                 return (EINVAL);
1453
1454         if (how != SHUT_WR)
1455                 sorflush(so);
1456         if (how != SHUT_RD)
1457                 return ((*pr->pr_usrreqs->pru_shutdown)(so));
1458         return (0);
1459 }
1460
1461 void
1462 sorflush(so)
1463         struct socket *so;
1464 {
1465         struct sockbuf *sb = &so->so_rcv;
1466         struct protosw *pr = so->so_proto;
1467         struct sockbuf asb;
1468
1469         /*
1470          * XXXRW: This is quite ugly.  Previously, this code made a copy of
1471          * the socket buffer, then zero'd the original to clear the buffer
1472          * fields.  However, with mutexes in the socket buffer, this causes
1473          * problems.  We only clear the zeroable bits of the original;
1474          * however, we have to initialize and destroy the mutex in the copy
1475          * so that dom_dispose() and sbrelease() can lock t as needed.
1476          */
1477         SOCKBUF_LOCK(sb);
1478         sb->sb_flags |= SB_NOINTR;
1479         (void) sblock(sb, M_WAITOK);
1480         /*
1481          * socantrcvmore_locked() drops the socket buffer mutex so that it
1482          * can safely perform wakeups.  Re-acquire the mutex before
1483          * continuing.
1484          */
1485         socantrcvmore_locked(so);
1486         SOCKBUF_LOCK(sb);
1487         sbunlock(sb);
1488         /*
1489          * Invalidate/clear most of the sockbuf structure, but leave
1490          * selinfo and mutex data unchanged.
1491          */
1492         bzero(&asb, offsetof(struct sockbuf, sb_startzero));
1493         bcopy(&sb->sb_startzero, &asb.sb_startzero,
1494             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1495         bzero(&sb->sb_startzero,
1496             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1497         SOCKBUF_UNLOCK(sb);
1498
1499         SOCKBUF_LOCK_INIT(&asb, "so_rcv");
1500         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1501                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
1502         sbrelease(&asb, so);
1503         SOCKBUF_LOCK_DESTROY(&asb);
1504 }
1505
1506 /*
1507  * Perhaps this routine, and sooptcopyout(), below, ought to come in
1508  * an additional variant to handle the case where the option value needs
1509  * to be some kind of integer, but not a specific size.
1510  * In addition to their use here, these functions are also called by the
1511  * protocol-level pr_ctloutput() routines.
1512  */
1513 int
1514 sooptcopyin(sopt, buf, len, minlen)
1515         struct  sockopt *sopt;
1516         void    *buf;
1517         size_t  len;
1518         size_t  minlen;
1519 {
1520         size_t  valsize;
1521
1522         /*
1523          * If the user gives us more than we wanted, we ignore it,
1524          * but if we don't get the minimum length the caller
1525          * wants, we return EINVAL.  On success, sopt->sopt_valsize
1526          * is set to however much we actually retrieved.
1527          */
1528         if ((valsize = sopt->sopt_valsize) < minlen)
1529                 return EINVAL;
1530         if (valsize > len)
1531                 sopt->sopt_valsize = valsize = len;
1532
1533         if (sopt->sopt_td != NULL)
1534                 return (copyin(sopt->sopt_val, buf, valsize));
1535
1536         bcopy(sopt->sopt_val, buf, valsize);
1537         return 0;
1538 }
1539
1540 /*
1541  * Kernel version of setsockopt(2)/
1542  * XXX: optlen is size_t, not socklen_t
1543  */
1544 int
1545 so_setsockopt(struct socket *so, int level, int optname, void *optval,
1546     size_t optlen)
1547 {
1548         struct sockopt sopt;
1549
1550         sopt.sopt_level = level;
1551         sopt.sopt_name = optname;
1552         sopt.sopt_dir = SOPT_SET;
1553         sopt.sopt_val = optval;
1554         sopt.sopt_valsize = optlen;
1555         sopt.sopt_td = NULL;
1556         return (sosetopt(so, &sopt));
1557 }
1558
1559 int
1560 sosetopt(so, sopt)
1561         struct socket *so;
1562         struct sockopt *sopt;
1563 {
1564         int     error, optval;
1565         struct  linger l;
1566         struct  timeval tv;
1567         u_long  val;
1568 #ifdef MAC
1569         struct mac extmac;
1570 #endif
1571
1572         error = 0;
1573         if (sopt->sopt_level != SOL_SOCKET) {
1574                 if (so->so_proto && so->so_proto->pr_ctloutput)
1575                         return ((*so->so_proto->pr_ctloutput)
1576                                   (so, sopt));
1577                 error = ENOPROTOOPT;
1578         } else {
1579                 switch (sopt->sopt_name) {
1580 #ifdef INET
1581                 case SO_ACCEPTFILTER:
1582                         error = do_setopt_accept_filter(so, sopt);
1583                         if (error)
1584                                 goto bad;
1585                         break;
1586 #endif
1587                 case SO_LINGER:
1588                         error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
1589                         if (error)
1590                                 goto bad;
1591
1592                         SOCK_LOCK(so);
1593                         so->so_linger = l.l_linger;
1594                         if (l.l_onoff)
1595                                 so->so_options |= SO_LINGER;
1596                         else
1597                                 so->so_options &= ~SO_LINGER;
1598                         SOCK_UNLOCK(so);
1599                         break;
1600
1601                 case SO_DEBUG:
1602                 case SO_KEEPALIVE:
1603                 case SO_DONTROUTE:
1604                 case SO_USELOOPBACK:
1605                 case SO_BROADCAST:
1606                 case SO_REUSEADDR:
1607                 case SO_REUSEPORT:
1608                 case SO_OOBINLINE:
1609                 case SO_TIMESTAMP:
1610                 case SO_BINTIME:
1611                 case SO_NOSIGPIPE:
1612                         error = sooptcopyin(sopt, &optval, sizeof optval,
1613                                             sizeof optval);
1614                         if (error)
1615                                 goto bad;
1616                         SOCK_LOCK(so);
1617                         if (optval)
1618                                 so->so_options |= sopt->sopt_name;
1619                         else
1620                                 so->so_options &= ~sopt->sopt_name;
1621                         SOCK_UNLOCK(so);
1622                         break;
1623
1624                 case SO_SNDBUF:
1625                 case SO_RCVBUF:
1626                 case SO_SNDLOWAT:
1627                 case SO_RCVLOWAT:
1628                         error = sooptcopyin(sopt, &optval, sizeof optval,
1629                                             sizeof optval);
1630                         if (error)
1631                                 goto bad;
1632
1633                         /*
1634                          * Values < 1 make no sense for any of these
1635                          * options, so disallow them.
1636                          */
1637                         if (optval < 1) {
1638                                 error = EINVAL;
1639                                 goto bad;
1640                         }
1641
1642                         switch (sopt->sopt_name) {
1643                         case SO_SNDBUF:
1644                         case SO_RCVBUF:
1645                                 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
1646                                     &so->so_snd : &so->so_rcv, (u_long)optval,
1647                                     so, curthread) == 0) {
1648                                         error = ENOBUFS;
1649                                         goto bad;
1650                                 }
1651                                 break;
1652
1653                         /*
1654                          * Make sure the low-water is never greater than
1655                          * the high-water.
1656                          */
1657                         case SO_SNDLOWAT:
1658                                 SOCKBUF_LOCK(&so->so_snd);
1659                                 so->so_snd.sb_lowat =
1660                                     (optval > so->so_snd.sb_hiwat) ?
1661                                     so->so_snd.sb_hiwat : optval;
1662                                 SOCKBUF_UNLOCK(&so->so_snd);
1663                                 break;
1664                         case SO_RCVLOWAT:
1665                                 SOCKBUF_LOCK(&so->so_rcv);
1666                                 so->so_rcv.sb_lowat =
1667                                     (optval > so->so_rcv.sb_hiwat) ?
1668                                     so->so_rcv.sb_hiwat : optval;
1669                                 SOCKBUF_UNLOCK(&so->so_rcv);
1670                                 break;
1671                         }
1672                         break;
1673
1674                 case SO_SNDTIMEO:
1675                 case SO_RCVTIMEO:
1676 #ifdef COMPAT_IA32
1677                         if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
1678                                 struct timeval32 tv32;
1679
1680                                 error = sooptcopyin(sopt, &tv32, sizeof tv32,
1681                                     sizeof tv32);
1682                                 CP(tv32, tv, tv_sec);
1683                                 CP(tv32, tv, tv_usec);
1684                         } else
1685 #endif
1686                                 error = sooptcopyin(sopt, &tv, sizeof tv,
1687                                     sizeof tv);
1688                         if (error)
1689                                 goto bad;
1690
1691                         /* assert(hz > 0); */
1692                         if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
1693                             tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
1694                                 error = EDOM;
1695                                 goto bad;
1696                         }
1697                         /* assert(tick > 0); */
1698                         /* assert(ULONG_MAX - INT_MAX >= 1000000); */
1699                         val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
1700                         if (val > INT_MAX) {
1701                                 error = EDOM;
1702                                 goto bad;
1703                         }
1704                         if (val == 0 && tv.tv_usec != 0)
1705                                 val = 1;
1706
1707                         switch (sopt->sopt_name) {
1708                         case SO_SNDTIMEO:
1709                                 so->so_snd.sb_timeo = val;
1710                                 break;
1711                         case SO_RCVTIMEO:
1712                                 so->so_rcv.sb_timeo = val;
1713                                 break;
1714                         }
1715                         break;
1716
1717                 case SO_LABEL:
1718 #ifdef MAC
1719                         error = sooptcopyin(sopt, &extmac, sizeof extmac,
1720                             sizeof extmac);
1721                         if (error)
1722                                 goto bad;
1723                         error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
1724                             so, &extmac);
1725 #else
1726                         error = EOPNOTSUPP;
1727 #endif
1728                         break;
1729
1730                 default:
1731                         error = ENOPROTOOPT;
1732                         break;
1733                 }
1734                 if (error == 0 && so->so_proto != NULL &&
1735                     so->so_proto->pr_ctloutput != NULL) {
1736                         (void) ((*so->so_proto->pr_ctloutput)
1737                                   (so, sopt));
1738                 }
1739         }
1740 bad:
1741         return (error);
1742 }
1743
1744 /* Helper routine for getsockopt */
1745 int
1746 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
1747 {
1748         int     error;
1749         size_t  valsize;
1750
1751         error = 0;
1752
1753         /*
1754          * Documented get behavior is that we always return a value,
1755          * possibly truncated to fit in the user's buffer.
1756          * Traditional behavior is that we always tell the user
1757          * precisely how much we copied, rather than something useful
1758          * like the total amount we had available for her.
1759          * Note that this interface is not idempotent; the entire answer must
1760          * generated ahead of time.
1761          */
1762         valsize = min(len, sopt->sopt_valsize);
1763         sopt->sopt_valsize = valsize;
1764         if (sopt->sopt_val != NULL) {
1765                 if (sopt->sopt_td != NULL)
1766                         error = copyout(buf, sopt->sopt_val, valsize);
1767                 else
1768                         bcopy(buf, sopt->sopt_val, valsize);
1769         }
1770         return error;
1771 }
1772
1773 int
1774 sogetopt(so, sopt)
1775         struct socket *so;
1776         struct sockopt *sopt;
1777 {
1778         int     error, optval;
1779         struct  linger l;
1780         struct  timeval tv;
1781 #ifdef MAC
1782         struct mac extmac;
1783 #endif
1784
1785         error = 0;
1786         if (sopt->sopt_level != SOL_SOCKET) {
1787                 if (so->so_proto && so->so_proto->pr_ctloutput) {
1788                         return ((*so->so_proto->pr_ctloutput)
1789                                   (so, sopt));
1790                 } else
1791                         return (ENOPROTOOPT);
1792         } else {
1793                 switch (sopt->sopt_name) {
1794 #ifdef INET
1795                 case SO_ACCEPTFILTER:
1796                         error = do_getopt_accept_filter(so, sopt);
1797                         break;
1798 #endif
1799                 case SO_LINGER:
1800                         SOCK_LOCK(so);
1801                         l.l_onoff = so->so_options & SO_LINGER;
1802                         l.l_linger = so->so_linger;
1803                         SOCK_UNLOCK(so);
1804                         error = sooptcopyout(sopt, &l, sizeof l);
1805                         break;
1806
1807                 case SO_USELOOPBACK:
1808                 case SO_DONTROUTE:
1809                 case SO_DEBUG:
1810                 case SO_KEEPALIVE:
1811                 case SO_REUSEADDR:
1812                 case SO_REUSEPORT:
1813                 case SO_BROADCAST:
1814                 case SO_OOBINLINE:
1815                 case SO_ACCEPTCONN:
1816                 case SO_TIMESTAMP:
1817                 case SO_BINTIME:
1818                 case SO_NOSIGPIPE:
1819                         optval = so->so_options & sopt->sopt_name;
1820 integer:
1821                         error = sooptcopyout(sopt, &optval, sizeof optval);
1822                         break;
1823
1824                 case SO_TYPE:
1825                         optval = so->so_type;
1826                         goto integer;
1827
1828                 case SO_ERROR:
1829                         optval = so->so_error;
1830                         so->so_error = 0;
1831                         goto integer;
1832
1833                 case SO_SNDBUF:
1834                         optval = so->so_snd.sb_hiwat;
1835                         goto integer;
1836
1837                 case SO_RCVBUF:
1838                         optval = so->so_rcv.sb_hiwat;
1839                         goto integer;
1840
1841                 case SO_SNDLOWAT:
1842                         optval = so->so_snd.sb_lowat;
1843                         goto integer;
1844
1845                 case SO_RCVLOWAT:
1846                         optval = so->so_rcv.sb_lowat;
1847                         goto integer;
1848
1849                 case SO_SNDTIMEO:
1850                 case SO_RCVTIMEO:
1851                         optval = (sopt->sopt_name == SO_SNDTIMEO ?
1852                                   so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
1853
1854                         tv.tv_sec = optval / hz;
1855                         tv.tv_usec = (optval % hz) * tick;
1856 #ifdef COMPAT_IA32
1857                         if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
1858                                 struct timeval32 tv32;
1859
1860                                 CP(tv, tv32, tv_sec);
1861                                 CP(tv, tv32, tv_usec);
1862                                 error = sooptcopyout(sopt, &tv32, sizeof tv32);
1863                         } else
1864 #endif
1865                                 error = sooptcopyout(sopt, &tv, sizeof tv);
1866                         break;
1867
1868                 case SO_LABEL:
1869 #ifdef MAC
1870                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
1871                             sizeof(extmac));
1872                         if (error)
1873                                 return (error);
1874                         error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
1875                             so, &extmac);
1876                         if (error)
1877                                 return (error);
1878                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
1879 #else
1880                         error = EOPNOTSUPP;
1881 #endif
1882                         break;
1883
1884                 case SO_PEERLABEL:
1885 #ifdef MAC
1886                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
1887                             sizeof(extmac));
1888                         if (error)
1889                                 return (error);
1890                         error = mac_getsockopt_peerlabel(
1891                             sopt->sopt_td->td_ucred, so, &extmac);
1892                         if (error)
1893                                 return (error);
1894                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
1895 #else
1896                         error = EOPNOTSUPP;
1897 #endif
1898                         break;
1899
1900                 case SO_LISTENQLIMIT:
1901                         optval = so->so_qlimit;
1902                         goto integer;
1903
1904                 case SO_LISTENQLEN:
1905                         optval = so->so_qlen;
1906                         goto integer;
1907
1908                 case SO_LISTENINCQLEN:
1909                         optval = so->so_incqlen;
1910                         goto integer;
1911
1912                 default:
1913                         error = ENOPROTOOPT;
1914                         break;
1915                 }
1916                 return (error);
1917         }
1918 }
1919
1920 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
1921 int
1922 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
1923 {
1924         struct mbuf *m, *m_prev;
1925         int sopt_size = sopt->sopt_valsize;
1926
1927         MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
1928         if (m == NULL)
1929                 return ENOBUFS;
1930         if (sopt_size > MLEN) {
1931                 MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
1932                 if ((m->m_flags & M_EXT) == 0) {
1933                         m_free(m);
1934                         return ENOBUFS;
1935                 }
1936                 m->m_len = min(MCLBYTES, sopt_size);
1937         } else {
1938                 m->m_len = min(MLEN, sopt_size);
1939         }
1940         sopt_size -= m->m_len;
1941         *mp = m;
1942         m_prev = m;
1943
1944         while (sopt_size) {
1945                 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
1946                 if (m == NULL) {
1947                         m_freem(*mp);
1948                         return ENOBUFS;
1949                 }
1950                 if (sopt_size > MLEN) {
1951                         MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
1952                             M_DONTWAIT);
1953                         if ((m->m_flags & M_EXT) == 0) {
1954                                 m_freem(m);
1955                                 m_freem(*mp);
1956                                 return ENOBUFS;
1957                         }
1958                         m->m_len = min(MCLBYTES, sopt_size);
1959                 } else {
1960                         m->m_len = min(MLEN, sopt_size);
1961                 }
1962                 sopt_size -= m->m_len;
1963                 m_prev->m_next = m;
1964                 m_prev = m;
1965         }
1966         return 0;
1967 }
1968
1969 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
1970 int
1971 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
1972 {
1973         struct mbuf *m0 = m;
1974
1975         if (sopt->sopt_val == NULL)
1976                 return 0;
1977         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
1978                 if (sopt->sopt_td != NULL) {
1979                         int error;
1980
1981                         error = copyin(sopt->sopt_val, mtod(m, char *),
1982                                        m->m_len);
1983                         if (error != 0) {
1984                                 m_freem(m0);
1985                                 return(error);
1986                         }
1987                 } else
1988                         bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
1989                 sopt->sopt_valsize -= m->m_len;
1990                 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
1991                 m = m->m_next;
1992         }
1993         if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
1994                 panic("ip6_sooptmcopyin");
1995         return 0;
1996 }
1997
1998 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
1999 int
2000 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2001 {
2002         struct mbuf *m0 = m;
2003         size_t valsize = 0;
2004
2005         if (sopt->sopt_val == NULL)
2006                 return 0;
2007         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2008                 if (sopt->sopt_td != NULL) {
2009                         int error;
2010
2011                         error = copyout(mtod(m, char *), sopt->sopt_val,
2012                                        m->m_len);
2013                         if (error != 0) {
2014                                 m_freem(m0);
2015                                 return(error);
2016                         }
2017                 } else
2018                         bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2019                sopt->sopt_valsize -= m->m_len;
2020                sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2021                valsize += m->m_len;
2022                m = m->m_next;
2023         }
2024         if (m != NULL) {
2025                 /* enough soopt buffer should be given from user-land */
2026                 m_freem(m0);
2027                 return(EINVAL);
2028         }
2029         sopt->sopt_valsize = valsize;
2030         return 0;
2031 }
2032
2033 void
2034 sohasoutofband(so)
2035         struct socket *so;
2036 {
2037         if (so->so_sigio != NULL)
2038                 pgsigio(&so->so_sigio, SIGURG, 0);
2039         selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2040 }
2041
2042 int
2043 sopoll(struct socket *so, int events, struct ucred *active_cred,
2044     struct thread *td)
2045 {
2046         int revents = 0;
2047
2048         SOCKBUF_LOCK(&so->so_snd);
2049         SOCKBUF_LOCK(&so->so_rcv);
2050         if (events & (POLLIN | POLLRDNORM))
2051                 if (soreadable(so))
2052                         revents |= events & (POLLIN | POLLRDNORM);
2053
2054         if (events & POLLINIGNEOF)
2055                 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2056                     !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2057                         revents |= POLLINIGNEOF;
2058
2059         if (events & (POLLOUT | POLLWRNORM))
2060                 if (sowriteable(so))
2061                         revents |= events & (POLLOUT | POLLWRNORM);
2062
2063         if (events & (POLLPRI | POLLRDBAND))
2064                 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2065                         revents |= events & (POLLPRI | POLLRDBAND);
2066
2067         if (revents == 0) {
2068                 if (events &
2069                     (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2070                      POLLRDBAND)) {
2071                         selrecord(td, &so->so_rcv.sb_sel);
2072                         so->so_rcv.sb_flags |= SB_SEL;
2073                 }
2074
2075                 if (events & (POLLOUT | POLLWRNORM)) {
2076                         selrecord(td, &so->so_snd.sb_sel);
2077                         so->so_snd.sb_flags |= SB_SEL;
2078                 }
2079         }
2080
2081         SOCKBUF_UNLOCK(&so->so_rcv);
2082         SOCKBUF_UNLOCK(&so->so_snd);
2083         return (revents);
2084 }
2085
2086 int
2087 soo_kqfilter(struct file *fp, struct knote *kn)
2088 {
2089         struct socket *so = kn->kn_fp->f_data;
2090         struct sockbuf *sb;
2091
2092         switch (kn->kn_filter) {
2093         case EVFILT_READ:
2094                 if (so->so_options & SO_ACCEPTCONN)
2095                         kn->kn_fop = &solisten_filtops;
2096                 else
2097                         kn->kn_fop = &soread_filtops;
2098                 sb = &so->so_rcv;
2099                 break;
2100         case EVFILT_WRITE:
2101                 kn->kn_fop = &sowrite_filtops;
2102                 sb = &so->so_snd;
2103                 break;
2104         default:
2105                 return (EINVAL);
2106         }
2107
2108         SOCKBUF_LOCK(sb);
2109         knlist_add(&sb->sb_sel.si_note, kn, 1);
2110         sb->sb_flags |= SB_KNOTE;
2111         SOCKBUF_UNLOCK(sb);
2112         return (0);
2113 }
2114
2115 static void
2116 filt_sordetach(struct knote *kn)
2117 {
2118         struct socket *so = kn->kn_fp->f_data;
2119
2120         SOCKBUF_LOCK(&so->so_rcv);
2121         knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
2122         if (knlist_empty(&so->so_rcv.sb_sel.si_note))
2123                 so->so_rcv.sb_flags &= ~SB_KNOTE;
2124         SOCKBUF_UNLOCK(&so->so_rcv);
2125 }
2126
2127 /*ARGSUSED*/
2128 static int
2129 filt_soread(struct knote *kn, long hint)
2130 {
2131         struct socket *so;
2132
2133         so = kn->kn_fp->f_data;
2134         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2135
2136         kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
2137         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2138                 kn->kn_flags |= EV_EOF;
2139                 kn->kn_fflags = so->so_error;
2140                 return (1);
2141         } else if (so->so_error)        /* temporary udp error */
2142                 return (1);
2143         else if (kn->kn_sfflags & NOTE_LOWAT)
2144                 return (kn->kn_data >= kn->kn_sdata);
2145         else
2146                 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
2147 }
2148
2149 static void
2150 filt_sowdetach(struct knote *kn)
2151 {
2152         struct socket *so = kn->kn_fp->f_data;
2153
2154         SOCKBUF_LOCK(&so->so_snd);
2155         knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
2156         if (knlist_empty(&so->so_snd.sb_sel.si_note))
2157                 so->so_snd.sb_flags &= ~SB_KNOTE;
2158         SOCKBUF_UNLOCK(&so->so_snd);
2159 }
2160
2161 /*ARGSUSED*/
2162 static int
2163 filt_sowrite(struct knote *kn, long hint)
2164 {
2165         struct socket *so;
2166
2167         so = kn->kn_fp->f_data;
2168         SOCKBUF_LOCK_ASSERT(&so->so_snd);
2169         kn->kn_data = sbspace(&so->so_snd);
2170         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2171                 kn->kn_flags |= EV_EOF;
2172                 kn->kn_fflags = so->so_error;
2173                 return (1);
2174         } else if (so->so_error)        /* temporary udp error */
2175                 return (1);
2176         else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2177             (so->so_proto->pr_flags & PR_CONNREQUIRED))
2178                 return (0);
2179         else if (kn->kn_sfflags & NOTE_LOWAT)
2180                 return (kn->kn_data >= kn->kn_sdata);
2181         else
2182                 return (kn->kn_data >= so->so_snd.sb_lowat);
2183 }
2184
2185 /*ARGSUSED*/
2186 static int
2187 filt_solisten(struct knote *kn, long hint)
2188 {
2189         struct socket *so = kn->kn_fp->f_data;
2190
2191         kn->kn_data = so->so_qlen;
2192         return (! TAILQ_EMPTY(&so->so_comp));
2193 }
2194
2195 int
2196 socheckuid(struct socket *so, uid_t uid)
2197 {
2198
2199         if (so == NULL)
2200                 return (EPERM);
2201         if (so->so_cred->cr_uid != uid)
2202                 return (EPERM);
2203         return (0);
2204 }
2205
2206 static int
2207 somaxconn_sysctl(SYSCTL_HANDLER_ARGS)
2208 {
2209         int error;
2210         int val;
2211
2212         val = somaxconn;
2213         error = sysctl_handle_int(oidp, &val, sizeof(int), req);
2214         if (error || !req->newptr )
2215                 return (error);
2216
2217         if (val < 1 || val > USHRT_MAX)
2218                 return (EINVAL);
2219
2220         somaxconn = val;
2221         return (0);
2222 }