sys/kern/uipc_syscalls.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1989, 1990, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * sendfile(2) and related extensions:
   6  * Copyright (c) 1998, David Greenman. All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  * 4. Neither the name of the University nor the names of its contributors
  17  *    may be used to endorse or promote products derived from this software
  18  *    without specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30  * SUCH DAMAGE.
  31  *
  32  *      @(#)uipc_syscalls.c     8.4 (Berkeley) 2/21/94
  33  */
  34
  35 #include <sys/cdefs.h>
  36 __FBSDID("$FreeBSD$");
  37
  38 #include "opt_capsicum.h"
  39 #include "opt_inet.h"
  40 #include "opt_inet6.h"
  41 #include "opt_sctp.h"
  42 #include "opt_compat.h"
  43 #include "opt_ktrace.h"
  44
  45 #include <sys/param.h>
  46 #include <sys/systm.h>
  47 #include <sys/capsicum.h>
  48 #include <sys/condvar.h>
  49 #include <sys/kernel.h>
  50 #include <sys/lock.h>
  51 #include <sys/mutex.h>
  52 #include <sys/sysproto.h>
  53 #include <sys/malloc.h>
  54 #include <sys/filedesc.h>
  55 #include <sys/event.h>
  56 #include <sys/proc.h>
  57 #include <sys/fcntl.h>
  58 #include <sys/file.h>
  59 #include <sys/filio.h>
  60 #include <sys/jail.h>
  61 #include <sys/mman.h>
  62 #include <sys/mount.h>
  63 #include <sys/mbuf.h>
  64 #include <sys/protosw.h>
  65 #include <sys/rwlock.h>
  66 #include <sys/sf_buf.h>
  67 #include <sys/sf_sync.h>
  68 #include <sys/sf_base.h>
  69 #include <sys/sysent.h>
  70 #include <sys/socket.h>
  71 #include <sys/socketvar.h>
  72 #include <sys/signalvar.h>
  73 #include <sys/syscallsubr.h>
  74 #include <sys/sysctl.h>
  75 #include <sys/uio.h>
  76 #include <sys/vnode.h>
  77 #ifdef KTRACE
  78 #include <sys/ktrace.h>
  79 #endif
  80 #ifdef COMPAT_FREEBSD32
  81 #include <compat/freebsd32/freebsd32_util.h>
  82 #endif
  83
  84 #include <net/vnet.h>
  85
  86 #include <security/audit/audit.h>
  87 #include <security/mac/mac_framework.h>
  88
  89 #include <vm/vm.h>
  90 #include <vm/vm_param.h>
  91 #include <vm/vm_object.h>
  92 #include <vm/vm_page.h>
  93 #include <vm/vm_pager.h>
  94 #include <vm/vm_kern.h>
  95 #include <vm/vm_extern.h>
  96 #include <vm/uma.h>
  97
  98 #if defined(INET) || defined(INET6)
  99 #ifdef SCTP
 100 #include <netinet/sctp.h>
 101 #include <netinet/sctp_peeloff.h>
 102 #endif /* SCTP */
 103 #endif /* INET || INET6 */
 104
  105 /*
  106  * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
  107  * and SOCK_NONBLOCK.
  108  */
      /* New socket inherits SS_NBIO and the SIGIO owner from the listener. */
  109 #define ACCEPT4_INHERIT 0x1
      /* accept1() rewrites the returned address header as struct osockaddr. */
  110 #define ACCEPT4_COMPAT  0x2
 111
  112 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
  113 static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
  114
      /* Forward declarations of file-local helpers behind the syscall entries. */
  115 static int accept1(struct thread *td, int s, struct sockaddr *uname,
  116                    socklen_t *anamelen, int flags);
  117 static int do_sendfile(struct thread *td, struct sendfile_args *uap,
  118                    int compat);
  119 static int getsockname1(struct thread *td, struct getsockname_args *uap,
  120                         int compat);
  121 static int getpeername1(struct thread *td, struct getpeername_args *uap,
  122                         int compat);
  123
      /*
       * One counter per uint64_t-sized slot of struct sfstat; the counters
       * are allocated in sfstat_init() and exported by sfstat_sysctl().
       */
  124 counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
  125
      /* Callbacks wired into sendfile_filtops (the EVFILT_SENDFILE filter). */
  126 static int      filt_sfsync_attach(struct knote *kn);
  127 static void     filt_sfsync_detach(struct knote *kn);
  128 static int      filt_sfsync(struct knote *kn, long hint);
 129
  130 /*
  131  * sendfile(2)-related variables and associated sysctls
  132  */
  133 static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0,
  134     "sendfile(2) tunables");
      /* Read-ahead setting, in MAXBSIZE blocks; defaults to 1, writable. */
  135 static int sfreadahead = 1;
  136 SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW,
  137     &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks");
  138
      /*
       * SFSYNC_DPRINTF(fmt, ...) prints via printf() only when the kernel is
       * built with SFSYNC_DEBUG and the sf_sync_debug knob is non-zero.
       * Without SFSYNC_DEBUG it expands to nothing, so its arguments are
       * never evaluated.
       */
  139 #ifdef  SFSYNC_DEBUG
  140 static int sf_sync_debug = 0;
  141 SYSCTL_INT(_debug, OID_AUTO, sf_sync_debug, CTLFLAG_RW,
  142     &sf_sync_debug, 0, "Output debugging during sf_sync lifecycle");
  143 #define SFSYNC_DPRINTF(s, ...)                          \
  144                 do {                                    \
  145                         if (sf_sync_debug)              \
  146                                 printf((s), ##__VA_ARGS__); \
  147                 } while (0)
  148 #else
  149 #define SFSYNC_DPRINTF(c, ...)
  150 #endif
 151
  152 static uma_zone_t       zone_sfsync;    /* created in sf_sync_init() */
  153
      /*
       * kqueue filter operations for EVFILT_SENDFILE; registered with
       * kqueue_add_filteropts() in sf_sync_init().  f_isfd is 0.
       */
  154 static struct filterops sendfile_filtops = {
  155         .f_isfd = 0,
  156         .f_attach = filt_sfsync_attach,
  157         .f_detach = filt_sfsync_detach,
  158         .f_event = filt_sfsync,
  159 };
 160
 161 static void
 162 sfstat_init(const void *unused)
 163 {
 164
 165         COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
 166             M_WAITOK);
 167 }
 168 SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
 169
 170 static void
 171 sf_sync_init(const void *unused)
 172 {
 173
 174         zone_sfsync = uma_zcreate("sendfile_sync", sizeof(struct sendfile_sync),
 175             NULL, NULL,
 176             NULL, NULL,
 177             UMA_ALIGN_CACHE,
 178             0);
 179         kqueue_add_filteropts(EVFILT_SENDFILE, &sendfile_filtops);
 180 }
 181 SYSINIT(sf_sync, SI_SUB_MBUF, SI_ORDER_FIRST, sf_sync_init, NULL);
 182
 183 static int
 184 sfstat_sysctl(SYSCTL_HANDLER_ARGS)
 185 {
 186         struct sfstat s;
 187
 188         COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
 189         if (req->newptr)
 190                 COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
 191         return (SYSCTL_OUT(req, &s, sizeof(s)));
 192 }
 193 SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
 194     NULL, 0, sfstat_sysctl, "I", "sendfile statistics");
 195
 196 /*
 197  * Convert a user file descriptor to a kernel file entry and check if required
 198  * capability rights are present.
 199  * A reference on the file entry is held upon returning.
 200  */
 201 static int
 202 getsock_cap(struct filedesc *fdp, int fd, cap_rights_t *rightsp,
 203     struct file **fpp, u_int *fflagp)
 204 {
 205         struct file *fp;
 206         int error;
 207
 208         error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
 209         if (error != 0)
 210                 return (error);
 211         if (fp->f_type != DTYPE_SOCKET) {
 212                 fdrop(fp, curthread);
 213                 return (ENOTSOCK);
 214         }
 215         if (fflagp != NULL)
 216                 *fflagp = fp->f_flag;
 217         *fpp = fp;
 218         return (0);
 219 }
 220
  221 /*
  222  * System call interface to the socket abstraction.
       *
       * Kernels built with COMPAT_43 define COMPAT_OLDSOCK, which enables the
       * old-style code paths guarded by it in this file (e.g. oaccept()).
  223  */
  224 #if defined(COMPAT_43)
  225 #define COMPAT_OLDSOCK
  226 #endif
 227
 228 int
 229 sys_socket(td, uap)
 230         struct thread *td;
 231         struct socket_args /* {
 232                 int     domain;
 233                 int     type;
 234                 int     protocol;
 235         } */ *uap;
 236 {
 237         struct socket *so;
 238         struct file *fp;
 239         int fd, error, type, oflag, fflag;
 240
 241         AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
 242
 243         type = uap->type;
 244         oflag = 0;
 245         fflag = 0;
 246         if ((type & SOCK_CLOEXEC) != 0) {
 247                 type &= ~SOCK_CLOEXEC;
 248                 oflag |= O_CLOEXEC;
 249         }
 250         if ((type & SOCK_NONBLOCK) != 0) {
 251                 type &= ~SOCK_NONBLOCK;
 252                 fflag |= FNONBLOCK;
 253         }
 254
 255 #ifdef MAC
 256         error = mac_socket_check_create(td->td_ucred, uap->domain, type,
 257             uap->protocol);
 258         if (error != 0)
 259                 return (error);
 260 #endif
 261         error = falloc(td, &fp, &fd, oflag);
 262         if (error != 0)
 263                 return (error);
 264         /* An extra reference on `fp' has been held for us by falloc(). */
 265         error = socreate(uap->domain, &so, type, uap->protocol,
 266             td->td_ucred, td);
 267         if (error != 0) {
 268                 fdclose(td->td_proc->p_fd, fp, fd, td);
 269         } else {
 270                 finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
 271                 if ((fflag & FNONBLOCK) != 0)
 272                         (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
 273                 td->td_retval[0] = fd;
 274         }
 275         fdrop(fp, td);
 276         return (error);
 277 }
 278
 279 /* ARGSUSED */
 280 int
 281 sys_bind(td, uap)
 282         struct thread *td;
 283         struct bind_args /* {
 284                 int     s;
 285                 caddr_t name;
 286                 int     namelen;
 287         } */ *uap;
 288 {
 289         struct sockaddr *sa;
 290         int error;
 291
 292         error = getsockaddr(&sa, uap->name, uap->namelen);
 293         if (error == 0) {
 294                 error = kern_bind(td, uap->s, sa);
 295                 free(sa, M_SONAME);
 296         }
 297         return (error);
 298 }
 299
 300 static int
 301 kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
 302 {
 303         struct socket *so;
 304         struct file *fp;
 305         cap_rights_t rights;
 306         int error;
 307
 308         AUDIT_ARG_FD(fd);
 309         AUDIT_ARG_SOCKADDR(td, dirfd, sa);
 310         error = getsock_cap(td->td_proc->p_fd, fd,
 311             cap_rights_init(&rights, CAP_BIND), &fp, NULL);
 312         if (error != 0)
 313                 return (error);
 314         so = fp->f_data;
 315 #ifdef KTRACE
 316         if (KTRPOINT(td, KTR_STRUCT))
 317                 ktrsockaddr(sa);
 318 #endif
 319 #ifdef MAC
 320         error = mac_socket_check_bind(td->td_ucred, so, sa);
 321         if (error == 0) {
 322 #endif
 323                 if (dirfd == AT_FDCWD)
 324                         error = sobind(so, sa, td);
 325                 else
 326                         error = sobindat(dirfd, so, sa, td);
 327 #ifdef MAC
 328         }
 329 #endif
 330         fdrop(fp, td);
 331         return (error);
 332 }
 333
 334 int
 335 kern_bind(struct thread *td, int fd, struct sockaddr *sa)
 336 {
 337
 338         return (kern_bindat(td, AT_FDCWD, fd, sa));
 339 }
 340
 341 /* ARGSUSED */
 342 int
 343 sys_bindat(td, uap)
 344         struct thread *td;
 345         struct bindat_args /* {
 346                 int     fd;
 347                 int     s;
 348                 caddr_t name;
 349                 int     namelen;
 350         } */ *uap;
 351 {
 352         struct sockaddr *sa;
 353         int error;
 354
 355         error = getsockaddr(&sa, uap->name, uap->namelen);
 356         if (error == 0) {
 357                 error = kern_bindat(td, uap->fd, uap->s, sa);
 358                 free(sa, M_SONAME);
 359         }
 360         return (error);
 361 }
 362
 363 /* ARGSUSED */
 364 int
 365 sys_listen(td, uap)
 366         struct thread *td;
 367         struct listen_args /* {
 368                 int     s;
 369                 int     backlog;
 370         } */ *uap;
 371 {
 372         struct socket *so;
 373         struct file *fp;
 374         cap_rights_t rights;
 375         int error;
 376
 377         AUDIT_ARG_FD(uap->s);
 378         error = getsock_cap(td->td_proc->p_fd, uap->s,
 379             cap_rights_init(&rights, CAP_LISTEN), &fp, NULL);
 380         if (error == 0) {
 381                 so = fp->f_data;
 382 #ifdef MAC
 383                 error = mac_socket_check_listen(td->td_ucred, so);
 384                 if (error == 0)
 385 #endif
 386                         error = solisten(so, uap->backlog, td);
 387                 fdrop(fp, td);
 388         }
 389         return(error);
 390 }
 391
 392 /*
 393  * accept1()
 394  */
 395 static int
 396 accept1(td, s, uname, anamelen, flags)
 397         struct thread *td;
 398         int s;
 399         struct sockaddr *uname;
 400         socklen_t *anamelen;
 401         int flags;
 402 {
 403         struct sockaddr *name;
 404         socklen_t namelen;
 405         struct file *fp;
 406         int error;
 407
 408         if (uname == NULL)
 409                 return (kern_accept4(td, s, NULL, NULL, flags, NULL));
 410
 411         error = copyin(anamelen, &namelen, sizeof (namelen));
 412         if (error != 0)
 413                 return (error);
 414
 415         error = kern_accept4(td, s, &name, &namelen, flags, &fp);
 416
 417         if (error != 0)
 418                 return (error);
 419
 420         if (error == 0 && uname != NULL) {
 421 #ifdef COMPAT_OLDSOCK
 422                 if (flags & ACCEPT4_COMPAT)
 423                         ((struct osockaddr *)name)->sa_family =
 424                             name->sa_family;
 425 #endif
 426                 error = copyout(name, uname, namelen);
 427         }
 428         if (error == 0)
 429                 error = copyout(&namelen, anamelen,
 430                     sizeof(namelen));
 431         if (error != 0)
 432                 fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
 433         fdrop(fp, td);
 434         free(name, M_SONAME);
 435         return (error);
 436 }
 437
 438 int
 439 kern_accept(struct thread *td, int s, struct sockaddr **name,
 440     socklen_t *namelen, struct file **fp)
 441 {
 442         return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
 443 }
  444
      /*
       * Accept one completed connection from listening socket descriptor `s'.
       *
       *   name     if not NULL, *name is cleared on entry and on success
       *            receives the address returned by soaccept() (it stays NULL
       *            when soaccept() produced none).
       *   namelen  only used when name is not NULL: clamped down to the
       *            address's sa_len, or set to 0 when there is no address.
       *   flags    SOCK_CLOEXEC makes the new descriptor close-on-exec.  With
       *            ACCEPT4_INHERIT the new socket takes SS_NBIO and the SIGIO
       *            owner from the listener; otherwise SOCK_NONBLOCK alone
       *            decides non-blocking mode and FASYNC is cleared.
       *   fp       if not NULL, receives a referenced file for the new socket
       *            on success and NULL on failure.
       *
       * The new descriptor number is stored in td->td_retval[0].  Returns 0
       * or an errno value: EINVAL if the socket is not listening, EWOULDBLOCK
       * if it is non-blocking with an empty completed queue, a pending
       * so_error (ECONNABORTED once the listener cannot receive any more), or
       * whatever msleep()/falloc()/soaccept()/the MAC check reported.
       */
  445 int
  446 kern_accept4(struct thread *td, int s, struct sockaddr **name,
  447     socklen_t *namelen, int flags, struct file **fp)
  448 {
  449         struct filedesc *fdp;
  450         struct file *headfp, *nfp = NULL;
  451         struct sockaddr *sa = NULL;
  452         struct socket *head, *so;
  453         cap_rights_t rights;
  454         u_int fflag;
  455         pid_t pgid;
  456         int error, fd, tmp;
  457
  458         if (name != NULL)
  459                 *name = NULL;
  460
  461         AUDIT_ARG_FD(s);
  462         fdp = td->td_proc->p_fd;
              /* fflag receives the listener's file flags for later inheritance. */
  463         error = getsock_cap(fdp, s, cap_rights_init(&rights, CAP_ACCEPT),
  464             &headfp, &fflag);
  465         if (error != 0)
  466                 return (error);
  467         head = headfp->f_data;
  468         if ((head->so_options & SO_ACCEPTCONN) == 0) {
  469                 error = EINVAL;
  470                 goto done;
  471         }
  472 #ifdef MAC
  473         error = mac_socket_check_accept(td->td_ucred, head);
  474         if (error != 0)
  475                 goto done;
  476 #endif
              /* Reserve the descriptor before touching the accept queue. */
  477         error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
  478         if (error != 0)
  479                 goto done;
  480         ACCEPT_LOCK();
  481         if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
  482                 ACCEPT_UNLOCK();
  483                 error = EWOULDBLOCK;
  484                 goto noconnection;
  485         }
              /* Sleep (interruptibly) until a connection completes or an error is posted. */
  486         while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
  487                 if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
  488                         head->so_error = ECONNABORTED;
  489                         break;
  490                 }
  491                 error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
  492                     "accept", 0);
  493                 if (error != 0) {
  494                         ACCEPT_UNLOCK();
  495                         goto noconnection;
  496                 }
  497         }
              /* Report and consume a pending error on the listener. */
  498         if (head->so_error) {
  499                 error = head->so_error;
  500                 head->so_error = 0;
  501                 ACCEPT_UNLOCK();
  502                 goto noconnection;
  503         }
  504         so = TAILQ_FIRST(&head->so_comp);
  505         KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
  506         KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
  507
  508         /*
  509          * Before changing the flags on the socket, we have to bump the
  510          * reference count.  Otherwise, if the protocol calls sofree(),
  511          * the socket will be released due to a zero refcount.
  512          */
  513         SOCK_LOCK(so);                  /* soref() and so_state update */
  514         soref(so);                      /* file descriptor reference */
  515
  516         TAILQ_REMOVE(&head->so_comp, so, so_list);
  517         head->so_qlen--;
  518         if (flags & ACCEPT4_INHERIT)
  519                 so->so_state |= (head->so_state & SS_NBIO);
  520         else
  521                 so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
  522         so->so_qstate &= ~SQ_COMP;
  523         so->so_head = NULL;
  524
  525         SOCK_UNLOCK(so);
  526         ACCEPT_UNLOCK();
  527
  528         /* An extra reference on `nfp' has been held for us by falloc(). */
  529         td->td_retval[0] = fd;
  530
  531         /* connection has been removed from the listen queue */
  532         KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
  533
  534         if (flags & ACCEPT4_INHERIT) {
  535                 pgid = fgetown(&head->so_sigio);
  536                 if (pgid != 0)
  537                         fsetown(pgid, &so->so_sigio);
  538         } else {
  539                 fflag &= ~(FNONBLOCK | FASYNC);
  540                 if (flags & SOCK_NONBLOCK)
  541                         fflag |= FNONBLOCK;
  542         }
  543
  544         finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
  545         /* Sync socket nonblocking/async state with file flags */
  546         tmp = fflag & FNONBLOCK;
  547         (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
  548         tmp = fflag & FASYNC;
  549         (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
  550         sa = 0;
  551         error = soaccept(so, &sa);
  552         if (error != 0)
  553                 goto noconnection;
              /* No address produced: report a zero length and skip the free below. */
  554         if (sa == NULL) {
  555                 if (name)
  556                         *namelen = 0;
  557                 goto done;
  558         }
  559         AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
  560         if (name) {
  561                 /* check sa_len before it is destroyed */
  562                 if (*namelen > sa->sa_len)
  563                         *namelen = sa->sa_len;
  564 #ifdef KTRACE
  565                 if (KTRPOINT(td, KTR_STRUCT))
  566                         ktrsockaddr(sa);
  567 #endif
                      /* Hand the address to the caller; clearing sa skips the free below. */
  568                 *name = sa;
  569                 sa = NULL;
  570         }
  571 noconnection:
  572         free(sa, M_SONAME);
  573
  574         /*
  575          * close the new descriptor, assuming someone hasn't ripped it
  576          * out from under us.
  577          */
  578         if (error != 0)
  579                 fdclose(fdp, nfp, fd, td);
  580
  581         /*
  582          * Release explicitly held references before returning.  We return
  583          * a reference on nfp to the caller on success if they request it.
  584          */
  585 done:
  586         if (fp != NULL) {
  587                 if (error == 0) {
  588                         *fp = nfp;
  589                         nfp = NULL;
  590                 } else
  591                         *fp = NULL;
  592         }
  593         if (nfp != NULL)
  594                 fdrop(nfp, td);
  595         fdrop(headfp, td);
  596         return (error);
  597 }
 598
 599 int
 600 sys_accept(td, uap)
 601         struct thread *td;
 602         struct accept_args *uap;
 603 {
 604
 605         return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
 606 }
 607
 608 int
 609 sys_accept4(td, uap)
 610         struct thread *td;
 611         struct accept4_args *uap;
 612 {
 613
 614         if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
 615                 return (EINVAL);
 616
 617         return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
 618 }
 619
 620 #ifdef COMPAT_OLDSOCK
 621 int
 622 oaccept(td, uap)
 623         struct thread *td;
 624         struct accept_args *uap;
 625 {
 626
 627         return (accept1(td, uap->s, uap->name, uap->anamelen,
 628             ACCEPT4_INHERIT | ACCEPT4_COMPAT));
 629 }
 630 #endif /* COMPAT_OLDSOCK */
 631
 632 /* ARGSUSED */
 633 int
 634 sys_connect(td, uap)
 635         struct thread *td;
 636         struct connect_args /* {
 637                 int     s;
 638                 caddr_t name;
 639                 int     namelen;
 640         } */ *uap;
 641 {
 642         struct sockaddr *sa;
 643         int error;
 644
 645         error = getsockaddr(&sa, uap->name, uap->namelen);
 646         if (error == 0) {
 647                 error = kern_connect(td, uap->s, sa);
 648                 free(sa, M_SONAME);
 649         }
 650         return (error);
 651 }
  652
      /*
       * Connect socket descriptor `fd' to kernel-space address `sa'.
       *
       * The descriptor must carry CAP_CONNECT.  With dirfd == AT_FDCWD the
       * plain soconnect() is used, otherwise soconnectat() is called with
       * `dirfd'.  `sa' is not freed here.
       *
       * Returns 0 or an errno value.  EALREADY is returned if a connect is
       * already in progress on the socket, EINPROGRESS if the socket is
       * non-blocking and the connection has not completed yet.  Otherwise the
       * thread sleeps until the connecting state clears or so_error is set;
       * ERESTART from the sleep is reported as EINTR.
       */
  653 static int
  654 kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
  655 {
  656         struct socket *so;
  657         struct file *fp;
  658         cap_rights_t rights;
  659         int error, interrupted = 0;
  660
  661         AUDIT_ARG_FD(fd);
  662         AUDIT_ARG_SOCKADDR(td, dirfd, sa);
  663         error = getsock_cap(td->td_proc->p_fd, fd,
  664             cap_rights_init(&rights, CAP_CONNECT), &fp, NULL);
  665         if (error != 0)
  666                 return (error);
  667         so = fp->f_data;
  668         if (so->so_state & SS_ISCONNECTING) {
  669                 error = EALREADY;
  670                 goto done1;
  671         }
  672 #ifdef KTRACE
  673         if (KTRPOINT(td, KTR_STRUCT))
  674                 ktrsockaddr(sa);
  675 #endif
  676 #ifdef MAC
  677         error = mac_socket_check_connect(td->td_ucred, so, sa);
  678         if (error != 0)
  679                 goto bad;
  680 #endif
  681         if (dirfd == AT_FDCWD)
  682                 error = soconnect(so, sa, td);
  683         else
  684                 error = soconnectat(dirfd, so, sa, td);
  685         if (error != 0)
  686                 goto bad;
              /* Non-blocking socket still connecting: leave SS_ISCONNECTING set. */
  687         if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
  688                 error = EINPROGRESS;
  689                 goto done1;
  690         }
  691         SOCK_LOCK(so);
  692         while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
  693                 error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
  694                     "connec", 0);
  695                 if (error != 0) {
                              /* Remember a signal-interrupted sleep for the cleanup below. */
  696                         if (error == EINTR || error == ERESTART)
  697                                 interrupted = 1;
  698                         break;
  699                 }
  700         }
              /* The sleep itself succeeded: report and consume the socket's error. */
  701         if (error == 0) {
  702                 error = so->so_error;
  703                 so->so_error = 0;
  704         }
  705         SOCK_UNLOCK(so);
  706 bad:
              /* An interrupted connect keeps SS_ISCONNECTING set. */
  707         if (!interrupted)
  708                 so->so_state &= ~SS_ISCONNECTING;
  709         if (error == ERESTART)
  710                 error = EINTR;
  711 done1:
  712         fdrop(fp, td);
  713         return (error);
  714 }
 715
 716 int
 717 kern_connect(struct thread *td, int fd, struct sockaddr *sa)
 718 {
 719
 720         return (kern_connectat(td, AT_FDCWD, fd, sa));
 721 }
 722
 723 /* ARGSUSED */
 724 int
 725 sys_connectat(td, uap)
 726         struct thread *td;
 727         struct connectat_args /* {
 728                 int     fd;
 729                 int     s;
 730                 caddr_t name;
 731                 int     namelen;
 732         } */ *uap;
 733 {
 734         struct sockaddr *sa;
 735         int error;
 736
 737         error = getsockaddr(&sa, uap->name, uap->namelen);
 738         if (error == 0) {
 739                 error = kern_connectat(td, uap->fd, uap->s, sa);
 740                 free(sa, M_SONAME);
 741         }
 742         return (error);
 743 }
 744
 745 int
 746 kern_socketpair(struct thread *td, int domain, int type, int protocol,
 747     int *rsv)
 748 {
 749         struct filedesc *fdp = td->td_proc->p_fd;
 750         struct file *fp1, *fp2;
 751         struct socket *so1, *so2;
 752         int fd, error, oflag, fflag;
 753
 754         AUDIT_ARG_SOCKET(domain, type, protocol);
 755
 756         oflag = 0;
 757         fflag = 0;
 758         if ((type & SOCK_CLOEXEC) != 0) {
 759                 type &= ~SOCK_CLOEXEC;
 760                 oflag |= O_CLOEXEC;
 761         }
 762         if ((type & SOCK_NONBLOCK) != 0) {
 763                 type &= ~SOCK_NONBLOCK;
 764                 fflag |= FNONBLOCK;
 765         }
 766 #ifdef MAC
 767         /* We might want to have a separate check for socket pairs. */
 768         error = mac_socket_check_create(td->td_ucred, domain, type,
 769             protocol);
 770         if (error != 0)
 771                 return (error);
 772 #endif
 773         error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
 774         if (error != 0)
 775                 return (error);
 776         error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
 777         if (error != 0)
 778                 goto free1;
 779         /* On success extra reference to `fp1' and 'fp2' is set by falloc. */
 780         error = falloc(td, &fp1, &fd, oflag);
 781         if (error != 0)
 782                 goto free2;
 783         rsv[0] = fd;
 784         fp1->f_data = so1;      /* so1 already has ref count */
 785         error = falloc(td, &fp2, &fd, oflag);
 786         if (error != 0)
 787                 goto free3;
 788         fp2->f_data = so2;      /* so2 already has ref count */
 789         rsv[1] = fd;
 790         error = soconnect2(so1, so2);
 791         if (error != 0)
 792                 goto free4;
 793         if (type == SOCK_DGRAM) {
 794                 /*
 795                  * Datagram socket connection is asymmetric.
 796                  */
 797                  error = soconnect2(so2, so1);
 798                  if (error != 0)
 799                         goto free4;
 800         }
 801         finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
 802             &socketops);
 803         finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
 804             &socketops);
 805         if ((fflag & FNONBLOCK) != 0) {
 806                 (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
 807                 (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
 808         }
 809         fdrop(fp1, td);
 810         fdrop(fp2, td);
 811         return (0);
 812 free4:
 813         fdclose(fdp, fp2, rsv[1], td);
 814         fdrop(fp2, td);
 815 free3:
 816         fdclose(fdp, fp1, rsv[0], td);
 817         fdrop(fp1, td);
 818 free2:
 819         if (so2 != NULL)
 820                 (void)soclose(so2);
 821 free1:
 822         if (so1 != NULL)
 823                 (void)soclose(so1);
 824         return (error);
 825 }
 826
 827 int
 828 sys_socketpair(struct thread *td, struct socketpair_args *uap)
 829 {
 830         int error, sv[2];
 831
 832         error = kern_socketpair(td, uap->domain, uap->type,
 833             uap->protocol, sv);
 834         if (error != 0)
 835                 return (error);
 836         error = copyout(sv, uap->rsv, 2 * sizeof(int));
 837         if (error != 0) {
 838                 (void)kern_close(td, sv[0]);
 839                 (void)kern_close(td, sv[1]);
 840         }
 841         return (error);
 842 }
 843
 844 static int
 845 sendit(td, s, mp, flags)
 846         struct thread *td;
 847         int s;
 848         struct msghdr *mp;
 849         int flags;
 850 {
 851         struct mbuf *control;
 852         struct sockaddr *to;
 853         int error;
 854
 855 #ifdef CAPABILITY_MODE
 856         if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
 857                 return (ECAPMODE);
 858 #endif
 859
 860         if (mp->msg_name != NULL) {
 861                 error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
 862                 if (error != 0) {
 863                         to = NULL;
 864                         goto bad;
 865                 }
 866                 mp->msg_name = to;
 867         } else {
 868                 to = NULL;
 869         }
 870
 871         if (mp->msg_control) {
 872                 if (mp->msg_controllen < sizeof(struct cmsghdr)
 873 #ifdef COMPAT_OLDSOCK
 874                     && mp->msg_flags != MSG_COMPAT
 875 #endif
 876                 ) {
 877                         error = EINVAL;
 878                         goto bad;
 879                 }
 880                 error = sockargs(&control, mp->msg_control,
 881                     mp->msg_controllen, MT_CONTROL);
 882                 if (error != 0)
 883                         goto bad;
 884 #ifdef COMPAT_OLDSOCK
 885                 if (mp->msg_flags == MSG_COMPAT) {
 886                         struct cmsghdr *cm;
 887
 888                         M_PREPEND(control, sizeof(*cm), M_WAITOK);
 889                         cm = mtod(control, struct cmsghdr *);
 890                         cm->cmsg_len = control->m_len;
 891                         cm->cmsg_level = SOL_SOCKET;
 892                         cm->cmsg_type = SCM_RIGHTS;
 893                 }
 894 #endif
 895         } else {
 896                 control = NULL;
 897         }
 898
 899         error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
 900
 901 bad:
 902         free(to, M_SONAME);
 903         return (error);
 904 }
 905
 906 int
 907 kern_sendit(td, s, mp, flags, control, segflg)
 908         struct thread *td;
 909         int s;
 910         struct msghdr *mp;
 911         int flags;
 912         struct mbuf *control;
 913         enum uio_seg segflg;
 914 {
 915         struct file *fp;
 916         struct uio auio;
 917         struct iovec *iov;
 918         struct socket *so;
 919         cap_rights_t rights;
 920 #ifdef KTRACE
 921         struct uio *ktruio = NULL;
 922 #endif
 923         ssize_t len;
 924         int i, error;
 925
 926         AUDIT_ARG_FD(s);
 927         cap_rights_init(&rights, CAP_SEND);
 928         if (mp->msg_name != NULL) {
 929                 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
 930                 cap_rights_set(&rights, CAP_CONNECT);
 931         }
 932         error = getsock_cap(td->td_proc->p_fd, s, &rights, &fp, NULL);
 933         if (error != 0)
 934                 return (error);
 935         so = (struct socket *)fp->f_data;
 936
 937 #ifdef KTRACE
 938         if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
 939                 ktrsockaddr(mp->msg_name);
 940 #endif
 941 #ifdef MAC
 942         if (mp->msg_name != NULL) {
 943                 error = mac_socket_check_connect(td->td_ucred, so,
 944                     mp->msg_name);
 945                 if (error != 0)
 946                         goto bad;
 947         }
 948         error = mac_socket_check_send(td->td_ucred, so);
 949         if (error != 0)
 950                 goto bad;
 951 #endif
 952
 953         auio.uio_iov = mp->msg_iov;
 954         auio.uio_iovcnt = mp->msg_iovlen;
 955         auio.uio_segflg = segflg;
 956         auio.uio_rw = UIO_WRITE;
 957         auio.uio_td = td;
 958         auio.uio_offset = 0;                    /* XXX */
 959         auio.uio_resid = 0;
 960         iov = mp->msg_iov;
 961         for (i = 0; i < mp->msg_iovlen; i++, iov++) {
 962                 if ((auio.uio_resid += iov->iov_len) < 0) {
 963                         error = EINVAL;
 964                         goto bad;
 965                 }
 966         }
 967 #ifdef KTRACE
 968         if (KTRPOINT(td, KTR_GENIO))
 969                 ktruio = cloneuio(&auio);
 970 #endif
 971         len = auio.uio_resid;
 972         error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
 973         if (error != 0) {
 974                 if (auio.uio_resid != len && (error == ERESTART ||
 975                     error == EINTR || error == EWOULDBLOCK))
 976                         error = 0;
 977                 /* Generation of SIGPIPE can be controlled per socket */
 978                 if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
 979                     !(flags & MSG_NOSIGNAL)) {
 980                         PROC_LOCK(td->td_proc);
 981                         tdsignal(td, SIGPIPE);
 982                         PROC_UNLOCK(td->td_proc);
 983                 }
 984         }
 985         if (error == 0)
 986                 td->td_retval[0] = len - auio.uio_resid;
 987 #ifdef KTRACE
 988         if (ktruio != NULL) {
 989                 ktruio->uio_resid = td->td_retval[0];
 990                 ktrgenio(s, UIO_WRITE, ktruio, error);
 991         }
 992 #endif
 993 bad:
 994         fdrop(fp, td);
 995         return (error);
 996 }
 997
 998 int
 999 sys_sendto(td, uap)
1000         struct thread *td;
1001         struct sendto_args /* {
1002                 int     s;
1003                 caddr_t buf;
1004                 size_t  len;
1005                 int     flags;
1006                 caddr_t to;
1007                 int     tolen;
1008         } */ *uap;
1009 {
1010         struct msghdr msg;
1011         struct iovec aiov;
1012
1013         msg.msg_name = uap->to;
1014         msg.msg_namelen = uap->tolen;
1015         msg.msg_iov = &aiov;
1016         msg.msg_iovlen = 1;
1017         msg.msg_control = 0;
1018 #ifdef COMPAT_OLDSOCK
1019         msg.msg_flags = 0;
1020 #endif
1021         aiov.iov_base = uap->buf;
1022         aiov.iov_len = uap->len;
1023         return (sendit(td, uap->s, &msg, uap->flags));
1024 }
1025
1026 #ifdef COMPAT_OLDSOCK
1027 int
1028 osend(td, uap)
1029         struct thread *td;
1030         struct osend_args /* {
1031                 int     s;
1032                 caddr_t buf;
1033                 int     len;
1034                 int     flags;
1035         } */ *uap;
1036 {
1037         struct msghdr msg;
1038         struct iovec aiov;
1039
1040         msg.msg_name = 0;
1041         msg.msg_namelen = 0;
1042         msg.msg_iov = &aiov;
1043         msg.msg_iovlen = 1;
1044         aiov.iov_base = uap->buf;
1045         aiov.iov_len = uap->len;
1046         msg.msg_control = 0;
1047         msg.msg_flags = 0;
1048         return (sendit(td, uap->s, &msg, uap->flags));
1049 }
1050
1051 int
1052 osendmsg(td, uap)
1053         struct thread *td;
1054         struct osendmsg_args /* {
1055                 int     s;
1056                 caddr_t msg;
1057                 int     flags;
1058         } */ *uap;
1059 {
1060         struct msghdr msg;
1061         struct iovec *iov;
1062         int error;
1063
1064         error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1065         if (error != 0)
1066                 return (error);
1067         error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1068         if (error != 0)
1069                 return (error);
1070         msg.msg_iov = iov;
1071         msg.msg_flags = MSG_COMPAT;
1072         error = sendit(td, uap->s, &msg, uap->flags);
1073         free(iov, M_IOV);
1074         return (error);
1075 }
1076 #endif
1077
1078 int
1079 sys_sendmsg(td, uap)
1080         struct thread *td;
1081         struct sendmsg_args /* {
1082                 int     s;
1083                 caddr_t msg;
1084                 int     flags;
1085         } */ *uap;
1086 {
1087         struct msghdr msg;
1088         struct iovec *iov;
1089         int error;
1090
1091         error = copyin(uap->msg, &msg, sizeof (msg));
1092         if (error != 0)
1093                 return (error);
1094         error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1095         if (error != 0)
1096                 return (error);
1097         msg.msg_iov = iov;
1098 #ifdef COMPAT_OLDSOCK
1099         msg.msg_flags = 0;
1100 #endif
1101         error = sendit(td, uap->s, &msg, uap->flags);
1102         free(iov, M_IOV);
1103         return (error);
1104 }
1105
1106 int
1107 kern_recvit(td, s, mp, fromseg, controlp)
1108         struct thread *td;
1109         int s;
1110         struct msghdr *mp;
1111         enum uio_seg fromseg;
1112         struct mbuf **controlp;
1113 {
1114         struct uio auio;
1115         struct iovec *iov;
1116         struct mbuf *m, *control = NULL;
1117         caddr_t ctlbuf;
1118         struct file *fp;
1119         struct socket *so;
1120         struct sockaddr *fromsa = NULL;
1121         cap_rights_t rights;
1122 #ifdef KTRACE
1123         struct uio *ktruio = NULL;
1124 #endif
1125         ssize_t len;
1126         int error, i;
1127
1128         if (controlp != NULL)
1129                 *controlp = NULL;
1130
1131         AUDIT_ARG_FD(s);
1132         error = getsock_cap(td->td_proc->p_fd, s,
1133             cap_rights_init(&rights, CAP_RECV), &fp, NULL);
1134         if (error != 0)
1135                 return (error);
1136         so = fp->f_data;
1137
1138 #ifdef MAC
1139         error = mac_socket_check_receive(td->td_ucred, so);
1140         if (error != 0) {
1141                 fdrop(fp, td);
1142                 return (error);
1143         }
1144 #endif
1145
1146         auio.uio_iov = mp->msg_iov;
1147         auio.uio_iovcnt = mp->msg_iovlen;
1148         auio.uio_segflg = UIO_USERSPACE;
1149         auio.uio_rw = UIO_READ;
1150         auio.uio_td = td;
1151         auio.uio_offset = 0;                    /* XXX */
1152         auio.uio_resid = 0;
1153         iov = mp->msg_iov;
1154         for (i = 0; i < mp->msg_iovlen; i++, iov++) {
1155                 if ((auio.uio_resid += iov->iov_len) < 0) {
1156                         fdrop(fp, td);
1157                         return (EINVAL);
1158                 }
1159         }
1160 #ifdef KTRACE
1161         if (KTRPOINT(td, KTR_GENIO))
1162                 ktruio = cloneuio(&auio);
1163 #endif
1164         len = auio.uio_resid;
1165         error = soreceive(so, &fromsa, &auio, NULL,
1166             (mp->msg_control || controlp) ? &control : NULL,
1167             &mp->msg_flags);
1168         if (error != 0) {
1169                 if (auio.uio_resid != len && (error == ERESTART ||
1170                     error == EINTR || error == EWOULDBLOCK))
1171                         error = 0;
1172         }
1173         if (fromsa != NULL)
1174                 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
1175 #ifdef KTRACE
1176         if (ktruio != NULL) {
1177                 ktruio->uio_resid = len - auio.uio_resid;
1178                 ktrgenio(s, UIO_READ, ktruio, error);
1179         }
1180 #endif
1181         if (error != 0)
1182                 goto out;
1183         td->td_retval[0] = len - auio.uio_resid;
1184         if (mp->msg_name) {
1185                 len = mp->msg_namelen;
1186                 if (len <= 0 || fromsa == NULL)
1187                         len = 0;
1188                 else {
1189                         /* save sa_len before it is destroyed by MSG_COMPAT */
1190                         len = MIN(len, fromsa->sa_len);
1191 #ifdef COMPAT_OLDSOCK
1192                         if (mp->msg_flags & MSG_COMPAT)
1193                                 ((struct osockaddr *)fromsa)->sa_family =
1194                                     fromsa->sa_family;
1195 #endif
1196                         if (fromseg == UIO_USERSPACE) {
1197                                 error = copyout(fromsa, mp->msg_name,
1198                                     (unsigned)len);
1199                                 if (error != 0)
1200                                         goto out;
1201                         } else
1202                                 bcopy(fromsa, mp->msg_name, len);
1203                 }
1204                 mp->msg_namelen = len;
1205         }
1206         if (mp->msg_control && controlp == NULL) {
1207 #ifdef COMPAT_OLDSOCK
1208                 /*
1209                  * We assume that old recvmsg calls won't receive access
1210                  * rights and other control info, esp. as control info
1211                  * is always optional and those options didn't exist in 4.3.
1212                  * If we receive rights, trim the cmsghdr; anything else
1213                  * is tossed.
1214                  */
1215                 if (control && mp->msg_flags & MSG_COMPAT) {
1216                         if (mtod(control, struct cmsghdr *)->cmsg_level !=
1217                             SOL_SOCKET ||
1218                             mtod(control, struct cmsghdr *)->cmsg_type !=
1219                             SCM_RIGHTS) {
1220                                 mp->msg_controllen = 0;
1221                                 goto out;
1222                         }
1223                         control->m_len -= sizeof (struct cmsghdr);
1224                         control->m_data += sizeof (struct cmsghdr);
1225                 }
1226 #endif
1227                 len = mp->msg_controllen;
1228                 m = control;
1229                 mp->msg_controllen = 0;
1230                 ctlbuf = mp->msg_control;
1231
1232                 while (m && len > 0) {
1233                         unsigned int tocopy;
1234
1235                         if (len >= m->m_len)
1236                                 tocopy = m->m_len;
1237                         else {
1238                                 mp->msg_flags |= MSG_CTRUNC;
1239                                 tocopy = len;
1240                         }
1241
1242                         if ((error = copyout(mtod(m, caddr_t),
1243                                         ctlbuf, tocopy)) != 0)
1244                                 goto out;
1245
1246                         ctlbuf += tocopy;
1247                         len -= tocopy;
1248                         m = m->m_next;
1249                 }
1250                 mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
1251         }
1252 out:
1253         fdrop(fp, td);
1254 #ifdef KTRACE
1255         if (fromsa && KTRPOINT(td, KTR_STRUCT))
1256                 ktrsockaddr(fromsa);
1257 #endif
1258         free(fromsa, M_SONAME);
1259
1260         if (error == 0 && controlp != NULL)
1261                 *controlp = control;
1262         else  if (control)
1263                 m_freem(control);
1264
1265         return (error);
1266 }
1267
1268 static int
1269 recvit(td, s, mp, namelenp)
1270         struct thread *td;
1271         int s;
1272         struct msghdr *mp;
1273         void *namelenp;
1274 {
1275         int error;
1276
1277         error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
1278         if (error != 0)
1279                 return (error);
1280         if (namelenp != NULL) {
1281                 error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
1282 #ifdef COMPAT_OLDSOCK
1283                 if (mp->msg_flags & MSG_COMPAT)
1284                         error = 0;      /* old recvfrom didn't check */
1285 #endif
1286         }
1287         return (error);
1288 }
1289
1290 int
1291 sys_recvfrom(td, uap)
1292         struct thread *td;
1293         struct recvfrom_args /* {
1294                 int     s;
1295                 caddr_t buf;
1296                 size_t  len;
1297                 int     flags;
1298                 struct sockaddr * __restrict    from;
1299                 socklen_t * __restrict fromlenaddr;
1300         } */ *uap;
1301 {
1302         struct msghdr msg;
1303         struct iovec aiov;
1304         int error;
1305
1306         if (uap->fromlenaddr) {
1307                 error = copyin(uap->fromlenaddr,
1308                     &msg.msg_namelen, sizeof (msg.msg_namelen));
1309                 if (error != 0)
1310                         goto done2;
1311         } else {
1312                 msg.msg_namelen = 0;
1313         }
1314         msg.msg_name = uap->from;
1315         msg.msg_iov = &aiov;
1316         msg.msg_iovlen = 1;
1317         aiov.iov_base = uap->buf;
1318         aiov.iov_len = uap->len;
1319         msg.msg_control = 0;
1320         msg.msg_flags = uap->flags;
1321         error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1322 done2:
1323         return (error);
1324 }
1325
1326 #ifdef COMPAT_OLDSOCK
1327 int
1328 orecvfrom(td, uap)
1329         struct thread *td;
1330         struct recvfrom_args *uap;
1331 {
1332
1333         uap->flags |= MSG_COMPAT;
1334         return (sys_recvfrom(td, uap));
1335 }
1336 #endif
1337
1338 #ifdef COMPAT_OLDSOCK
1339 int
1340 orecv(td, uap)
1341         struct thread *td;
1342         struct orecv_args /* {
1343                 int     s;
1344                 caddr_t buf;
1345                 int     len;
1346                 int     flags;
1347         } */ *uap;
1348 {
1349         struct msghdr msg;
1350         struct iovec aiov;
1351
1352         msg.msg_name = 0;
1353         msg.msg_namelen = 0;
1354         msg.msg_iov = &aiov;
1355         msg.msg_iovlen = 1;
1356         aiov.iov_base = uap->buf;
1357         aiov.iov_len = uap->len;
1358         msg.msg_control = 0;
1359         msg.msg_flags = uap->flags;
1360         return (recvit(td, uap->s, &msg, NULL));
1361 }
1362
1363 /*
1364  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1365  * overlays the new one, missing only the flags, and with the (old) access
1366  * rights where the control fields are now.
1367  */
1368 int
1369 orecvmsg(td, uap)
1370         struct thread *td;
1371         struct orecvmsg_args /* {
1372                 int     s;
1373                 struct  omsghdr *msg;
1374                 int     flags;
1375         } */ *uap;
1376 {
1377         struct msghdr msg;
1378         struct iovec *iov;
1379         int error;
1380
1381         error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1382         if (error != 0)
1383                 return (error);
1384         error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1385         if (error != 0)
1386                 return (error);
1387         msg.msg_flags = uap->flags | MSG_COMPAT;
1388         msg.msg_iov = iov;
1389         error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
1390         if (msg.msg_controllen && error == 0)
1391                 error = copyout(&msg.msg_controllen,
1392                     &uap->msg->msg_accrightslen, sizeof (int));
1393         free(iov, M_IOV);
1394         return (error);
1395 }
1396 #endif
1397
1398 int
1399 sys_recvmsg(td, uap)
1400         struct thread *td;
1401         struct recvmsg_args /* {
1402                 int     s;
1403                 struct  msghdr *msg;
1404                 int     flags;
1405         } */ *uap;
1406 {
1407         struct msghdr msg;
1408         struct iovec *uiov, *iov;
1409         int error;
1410
1411         error = copyin(uap->msg, &msg, sizeof (msg));
1412         if (error != 0)
1413                 return (error);
1414         error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1415         if (error != 0)
1416                 return (error);
1417         msg.msg_flags = uap->flags;
1418 #ifdef COMPAT_OLDSOCK
1419         msg.msg_flags &= ~MSG_COMPAT;
1420 #endif
1421         uiov = msg.msg_iov;
1422         msg.msg_iov = iov;
1423         error = recvit(td, uap->s, &msg, NULL);
1424         if (error == 0) {
1425                 msg.msg_iov = uiov;
1426                 error = copyout(&msg, uap->msg, sizeof(msg));
1427         }
1428         free(iov, M_IOV);
1429         return (error);
1430 }
1431
1432 /* ARGSUSED */
1433 int
1434 sys_shutdown(td, uap)
1435         struct thread *td;
1436         struct shutdown_args /* {
1437                 int     s;
1438                 int     how;
1439         } */ *uap;
1440 {
1441         struct socket *so;
1442         struct file *fp;
1443         cap_rights_t rights;
1444         int error;
1445
1446         AUDIT_ARG_FD(uap->s);
1447         error = getsock_cap(td->td_proc->p_fd, uap->s,
1448             cap_rights_init(&rights, CAP_SHUTDOWN), &fp, NULL);
1449         if (error == 0) {
1450                 so = fp->f_data;
1451                 error = soshutdown(so, uap->how);
1452                 fdrop(fp, td);
1453         }
1454         return (error);
1455 }
1456
1457 /* ARGSUSED */
1458 int
1459 sys_setsockopt(td, uap)
1460         struct thread *td;
1461         struct setsockopt_args /* {
1462                 int     s;
1463                 int     level;
1464                 int     name;
1465                 caddr_t val;
1466                 int     valsize;
1467         } */ *uap;
1468 {
1469
1470         return (kern_setsockopt(td, uap->s, uap->level, uap->name,
1471             uap->val, UIO_USERSPACE, uap->valsize));
1472 }
1473
1474 int
1475 kern_setsockopt(td, s, level, name, val, valseg, valsize)
1476         struct thread *td;
1477         int s;
1478         int level;
1479         int name;
1480         void *val;
1481         enum uio_seg valseg;
1482         socklen_t valsize;
1483 {
1484         struct socket *so;
1485         struct file *fp;
1486         struct sockopt sopt;
1487         cap_rights_t rights;
1488         int error;
1489
1490         if (val == NULL && valsize != 0)
1491                 return (EFAULT);
1492         if ((int)valsize < 0)
1493                 return (EINVAL);
1494
1495         sopt.sopt_dir = SOPT_SET;
1496         sopt.sopt_level = level;
1497         sopt.sopt_name = name;
1498         sopt.sopt_val = val;
1499         sopt.sopt_valsize = valsize;
1500         switch (valseg) {
1501         case UIO_USERSPACE:
1502                 sopt.sopt_td = td;
1503                 break;
1504         case UIO_SYSSPACE:
1505                 sopt.sopt_td = NULL;
1506                 break;
1507         default:
1508                 panic("kern_setsockopt called with bad valseg");
1509         }
1510
1511         AUDIT_ARG_FD(s);
1512         error = getsock_cap(td->td_proc->p_fd, s,
1513             cap_rights_init(&rights, CAP_SETSOCKOPT), &fp, NULL);
1514         if (error == 0) {
1515                 so = fp->f_data;
1516                 error = sosetopt(so, &sopt);
1517                 fdrop(fp, td);
1518         }
1519         return(error);
1520 }
1521
1522 /* ARGSUSED */
1523 int
1524 sys_getsockopt(td, uap)
1525         struct thread *td;
1526         struct getsockopt_args /* {
1527                 int     s;
1528                 int     level;
1529                 int     name;
1530                 void * __restrict       val;
1531                 socklen_t * __restrict avalsize;
1532         } */ *uap;
1533 {
1534         socklen_t valsize;
1535         int error;
1536
1537         if (uap->val) {
1538                 error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1539                 if (error != 0)
1540                         return (error);
1541         }
1542
1543         error = kern_getsockopt(td, uap->s, uap->level, uap->name,
1544             uap->val, UIO_USERSPACE, &valsize);
1545
1546         if (error == 0)
1547                 error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1548         return (error);
1549 }
1550
1551 /*
1552  * Kernel version of getsockopt.
1553  * optval can be a userland or userspace. optlen is always a kernel pointer.
1554  */
1555 int
1556 kern_getsockopt(td, s, level, name, val, valseg, valsize)
1557         struct thread *td;
1558         int s;
1559         int level;
1560         int name;
1561         void *val;
1562         enum uio_seg valseg;
1563         socklen_t *valsize;
1564 {
1565         struct socket *so;
1566         struct file *fp;
1567         struct sockopt sopt;
1568         cap_rights_t rights;
1569         int error;
1570
1571         if (val == NULL)
1572                 *valsize = 0;
1573         if ((int)*valsize < 0)
1574                 return (EINVAL);
1575
1576         sopt.sopt_dir = SOPT_GET;
1577         sopt.sopt_level = level;
1578         sopt.sopt_name = name;
1579         sopt.sopt_val = val;
1580         sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
1581         switch (valseg) {
1582         case UIO_USERSPACE:
1583                 sopt.sopt_td = td;
1584                 break;
1585         case UIO_SYSSPACE:
1586                 sopt.sopt_td = NULL;
1587                 break;
1588         default:
1589                 panic("kern_getsockopt called with bad valseg");
1590         }
1591
1592         AUDIT_ARG_FD(s);
1593         error = getsock_cap(td->td_proc->p_fd, s,
1594             cap_rights_init(&rights, CAP_GETSOCKOPT), &fp, NULL);
1595         if (error == 0) {
1596                 so = fp->f_data;
1597                 error = sogetopt(so, &sopt);
1598                 *valsize = sopt.sopt_valsize;
1599                 fdrop(fp, td);
1600         }
1601         return (error);
1602 }
1603
1604 /*
1605  * getsockname1() - Get socket name.
1606  */
1607 /* ARGSUSED */
1608 static int
1609 getsockname1(td, uap, compat)
1610         struct thread *td;
1611         struct getsockname_args /* {
1612                 int     fdes;
1613                 struct sockaddr * __restrict asa;
1614                 socklen_t * __restrict alen;
1615         } */ *uap;
1616         int compat;
1617 {
1618         struct sockaddr *sa;
1619         socklen_t len;
1620         int error;
1621
1622         error = copyin(uap->alen, &len, sizeof(len));
1623         if (error != 0)
1624                 return (error);
1625
1626         error = kern_getsockname(td, uap->fdes, &sa, &len);
1627         if (error != 0)
1628                 return (error);
1629
1630         if (len != 0) {
1631 #ifdef COMPAT_OLDSOCK
1632                 if (compat)
1633                         ((struct osockaddr *)sa)->sa_family = sa->sa_family;
1634 #endif
1635                 error = copyout(sa, uap->asa, (u_int)len);
1636         }
1637         free(sa, M_SONAME);
1638         if (error == 0)
1639                 error = copyout(&len, uap->alen, sizeof(len));
1640         return (error);
1641 }
1642
1643 int
1644 kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
1645     socklen_t *alen)
1646 {
1647         struct socket *so;
1648         struct file *fp;
1649         cap_rights_t rights;
1650         socklen_t len;
1651         int error;
1652
1653         AUDIT_ARG_FD(fd);
1654         error = getsock_cap(td->td_proc->p_fd, fd,
1655             cap_rights_init(&rights, CAP_GETSOCKNAME), &fp, NULL);
1656         if (error != 0)
1657                 return (error);
1658         so = fp->f_data;
1659         *sa = NULL;
1660         CURVNET_SET(so->so_vnet);
1661         error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
1662         CURVNET_RESTORE();
1663         if (error != 0)
1664                 goto bad;
1665         if (*sa == NULL)
1666                 len = 0;
1667         else
1668                 len = MIN(*alen, (*sa)->sa_len);
1669         *alen = len;
1670 #ifdef KTRACE
1671         if (KTRPOINT(td, KTR_STRUCT))
1672                 ktrsockaddr(*sa);
1673 #endif
1674 bad:
1675         fdrop(fp, td);
1676         if (error != 0 && *sa != NULL) {
1677                 free(*sa, M_SONAME);
1678                 *sa = NULL;
1679         }
1680         return (error);
1681 }
1682
/*
 * getsockname(2) entry point: the non-compat flavour of getsockname1().
 */
int
sys_getsockname(td, uap)
        struct thread *td;
        struct getsockname_args *uap;
{
        int error;

        error = getsockname1(td, uap, 0);
        return (error);
}
1691
1692 #ifdef COMPAT_OLDSOCK
/*
 * Old getsockname(2) entry point: the compat flavour of getsockname1().
 */
int
ogetsockname(td, uap)
        struct thread *td;
        struct getsockname_args *uap;
{
        int error;

        error = getsockname1(td, uap, 1);
        return (error);
}
1701 #endif /* COMPAT_OLDSOCK */
1702
1703 /*
1704  * getpeername1() - Get name of peer for connected socket.
1705  */
1706 /* ARGSUSED */
1707 static int
1708 getpeername1(td, uap, compat)
1709         struct thread *td;
1710         struct getpeername_args /* {
1711                 int     fdes;
1712                 struct sockaddr * __restrict    asa;
1713                 socklen_t * __restrict  alen;
1714         } */ *uap;
1715         int compat;
1716 {
1717         struct sockaddr *sa;
1718         socklen_t len;
1719         int error;
1720
1721         error = copyin(uap->alen, &len, sizeof (len));
1722         if (error != 0)
1723                 return (error);
1724
1725         error = kern_getpeername(td, uap->fdes, &sa, &len);
1726         if (error != 0)
1727                 return (error);
1728
1729         if (len != 0) {
1730 #ifdef COMPAT_OLDSOCK
1731                 if (compat)
1732                         ((struct osockaddr *)sa)->sa_family = sa->sa_family;
1733 #endif
1734                 error = copyout(sa, uap->asa, (u_int)len);
1735         }
1736         free(sa, M_SONAME);
1737         if (error == 0)
1738                 error = copyout(&len, uap->alen, sizeof(len));
1739         return (error);
1740 }
1741
1742 int
1743 kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
1744     socklen_t *alen)
1745 {
1746         struct socket *so;
1747         struct file *fp;
1748         cap_rights_t rights;
1749         socklen_t len;
1750         int error;
1751
1752         AUDIT_ARG_FD(fd);
1753         error = getsock_cap(td->td_proc->p_fd, fd,
1754             cap_rights_init(&rights, CAP_GETPEERNAME), &fp, NULL);
1755         if (error != 0)
1756                 return (error);
1757         so = fp->f_data;
1758         if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1759                 error = ENOTCONN;
1760                 goto done;
1761         }
1762         *sa = NULL;
1763         CURVNET_SET(so->so_vnet);
1764         error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
1765         CURVNET_RESTORE();
1766         if (error != 0)
1767                 goto bad;
1768         if (*sa == NULL)
1769                 len = 0;
1770         else
1771                 len = MIN(*alen, (*sa)->sa_len);
1772         *alen = len;
1773 #ifdef KTRACE
1774         if (KTRPOINT(td, KTR_STRUCT))
1775                 ktrsockaddr(*sa);
1776 #endif
1777 bad:
1778         if (error != 0 && *sa != NULL) {
1779                 free(*sa, M_SONAME);
1780                 *sa = NULL;
1781         }
1782 done:
1783         fdrop(fp, td);
1784         return (error);
1785 }
1786
/*
 * getpeername(2) entry point: the non-compat flavour of getpeername1().
 */
int
sys_getpeername(td, uap)
        struct thread *td;
        struct getpeername_args *uap;
{
        int error;

        error = getpeername1(td, uap, 0);
        return (error);
}
1795
1796 #ifdef COMPAT_OLDSOCK
/*
 * Old getpeername(2) entry point: the compat flavour of getpeername1().
 */
int
ogetpeername(td, uap)
        struct thread *td;
        struct ogetpeername_args *uap;
{
        struct getpeername_args *args;

        /* XXX uap should have type `getpeername_args *' to begin with. */
        args = (struct getpeername_args *)uap;
        return (getpeername1(td, args, 1));
}
1806 #endif /* COMPAT_OLDSOCK */
1807
1808 int
1809 sockargs(mp, buf, buflen, type)
1810         struct mbuf **mp;
1811         caddr_t buf;
1812         int buflen, type;
1813 {
1814         struct sockaddr *sa;
1815         struct mbuf *m;
1816         int error;
1817
1818         if (buflen > MLEN) {
1819 #ifdef COMPAT_OLDSOCK
1820                 if (type == MT_SONAME && buflen <= 112)
1821                         buflen = MLEN;          /* unix domain compat. hack */
1822                 else
1823 #endif
1824                         if (buflen > MCLBYTES)
1825                                 return (EINVAL);
1826         }
1827         m = m_get2(buflen, M_WAITOK, type, 0);
1828         m->m_len = buflen;
1829         error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1830         if (error != 0)
1831                 (void) m_free(m);
1832         else {
1833                 *mp = m;
1834                 if (type == MT_SONAME) {
1835                         sa = mtod(m, struct sockaddr *);
1836
1837 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1838                         if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1839                                 sa->sa_family = sa->sa_len;
1840 #endif
1841                         sa->sa_len = buflen;
1842                 }
1843         }
1844         return (error);
1845 }
1846
1847 int
1848 getsockaddr(namp, uaddr, len)
1849         struct sockaddr **namp;
1850         caddr_t uaddr;
1851         size_t len;
1852 {
1853         struct sockaddr *sa;
1854         int error;
1855
1856         if (len > SOCK_MAXADDRLEN)
1857                 return (ENAMETOOLONG);
1858         if (len < offsetof(struct sockaddr, sa_data[0]))
1859                 return (EINVAL);
1860         sa = malloc(len, M_SONAME, M_WAITOK);
1861         error = copyin(uaddr, sa, len);
1862         if (error != 0) {
1863                 free(sa, M_SONAME);
1864         } else {
1865 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1866                 if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1867                         sa->sa_family = sa->sa_len;
1868 #endif
1869                 sa->sa_len = len;
1870                 *namp = sa;
1871         }
1872         return (error);
1873 }
1874
1875 static int
1876 filt_sfsync_attach(struct knote *kn)
1877 {
1878         struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_sdata;
1879         struct knlist *knl = &sfs->klist;
1880
1881         SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);
1882
1883         /*
1884          * Validate that we actually received this via the kernel API.
1885          */
1886         if ((kn->kn_flags & EV_FLAG1) == 0)
1887                 return (EPERM);
1888
1889         kn->kn_ptr.p_v = sfs;
1890         kn->kn_flags &= ~EV_FLAG1;
1891
1892         knl->kl_lock(knl->kl_lockarg);
1893         /*
1894          * If we're in the "freeing" state,
1895          * don't allow the add.  That way we don't
1896          * end up racing with some other thread that
1897          * is trying to finish some setup.
1898          */
1899         if (sfs->state == SF_STATE_FREEING) {
1900                 knl->kl_unlock(knl->kl_lockarg);
1901                 return (EINVAL);
1902         }
1903         knlist_add(&sfs->klist, kn, 1);
1904         knl->kl_unlock(knl->kl_lockarg);
1905
1906         return (0);
1907 }
1908
1909 /*
1910  * Called when a knote is being detached.
1911  */
1912 static void
1913 filt_sfsync_detach(struct knote *kn)
1914 {
1915         struct knlist *knl;
1916         struct sendfile_sync *sfs;
1917         int do_free = 0;
1918
1919         sfs = kn->kn_ptr.p_v;
1920         knl = &sfs->klist;
1921
1922         SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);
1923
1924         knl->kl_lock(knl->kl_lockarg);
1925         if (!knlist_empty(knl))
1926                 knlist_remove(knl, kn, 1);
1927
1928         /*
1929          * If the list is empty _AND_ the refcount is 0
1930          * _AND_ we've finished the setup phase and now
1931          * we're in the running phase, we can free the
1932          * underlying sendfile_sync.
1933          *
1934          * But we shouldn't do it before finishing the
1935          * underlying divorce from the knote.
1936          *
1937          * So, we have the sfsync lock held; transition
1938          * it to "freeing", then unlock, then free
1939          * normally.
1940          */
1941         if (knlist_empty(knl)) {
1942                 if (sfs->state == SF_STATE_COMPLETED && sfs->count == 0) {
1943                         SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, "
1944                             "count==0, empty list: time to free!\n",
1945                             __func__,
1946                             (unsigned long long) curthread->td_tid,
1947                             sfs);
1948                         sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
1949                         do_free = 1;
1950                 }
1951         }
1952         knl->kl_unlock(knl->kl_lockarg);
1953
1954         /*
1955          * Only call free if we're the one who has transitioned things
1956          * to free.  Otherwise we could race with another thread that
1957          * is currently tearing things down.
1958          */
1959         if (do_free == 1) {
1960                 SFSYNC_DPRINTF("%s: (%llu) sfs=%p, %s:%d\n",
1961                     __func__,
1962                     (unsigned long long) curthread->td_tid,
1963                     sfs,
1964                     __FILE__,
1965                     __LINE__);
1966                 sf_sync_free(sfs);
1967         }
1968 }
1969
1970 static int
1971 filt_sfsync(struct knote *kn, long hint)
1972 {
1973         struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_ptr.p_v;
1974         int ret;
1975
1976         SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);
1977
1978         /*
1979          * XXX add a lock assertion here!
1980          */
1981         ret = (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED);
1982
1983         return (ret);
1984 }
1985
1986 /*
1987  * Add more references to a vm_page + sf_buf + sendfile_sync.
1988  */
1989 void
1990 sf_ext_ref(void *arg1, void *arg2)
1991 {
1992         struct sf_buf *sf = arg1;
1993         struct sendfile_sync *sfs = arg2;
1994         vm_page_t pg = sf_buf_page(sf);
1995
1996         /* XXXGL: there should be sf_buf_ref() */
1997         sf_buf_alloc(sf_buf_page(sf), SFB_NOWAIT);
1998
1999         vm_page_lock(pg);
2000         vm_page_wire(pg);
2001         vm_page_unlock(pg);
2002
2003         if (sfs != NULL) {
2004                 mtx_lock(&sfs->mtx);
2005                 KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
2006                 sfs->count++;
2007                 mtx_unlock(&sfs->mtx);
2008         }
2009 }
2010
2011 /*
2012  * Detach mapped page and release resources back to the system.
2013  */
2014 void
2015 sf_ext_free(void *arg1, void *arg2)
2016 {
2017         struct sf_buf *sf = arg1;
2018         struct sendfile_sync *sfs = arg2;
2019         vm_page_t pg = sf_buf_page(sf);
2020
2021         sf_buf_free(sf);
2022
2023         vm_page_lock(pg);
2024         vm_page_unwire(pg, PQ_INACTIVE);
2025         /*
2026          * Check for the object going away on us. This can
2027          * happen since we don't hold a reference to it.
2028          * If so, we're responsible for freeing the page.
2029          */
2030         if (pg->wire_count == 0 && pg->object == NULL)
2031                 vm_page_free(pg);
2032         vm_page_unlock(pg);
2033
2034         if (sfs != NULL)
2035                 sf_sync_deref(sfs);
2036 }
2037
2038 /*
2039  * Called to remove a reference to a sf_sync object.
2040  *
2041  * This is generally done during the mbuf free path to signify
2042  * that one of the mbufs in the transaction has been completed.
2043  *
2044  * If we're doing SF_SYNC and the refcount is zero then we'll wake
2045  * up any waiters.
2046  *
2047  * IF we're doing SF_KQUEUE and the refcount is zero then we'll
2048  * fire off the knote.
2049  */
2050 void
2051 sf_sync_deref(struct sendfile_sync *sfs)
2052 {
2053         int do_free = 0;
2054
2055         if (sfs == NULL)
2056                 return;
2057
2058         mtx_lock(&sfs->mtx);
2059         KASSERT(sfs->count> 0, ("Sendfile sync botchup count == 0"));
2060         sfs->count --;
2061
2062         /*
2063          * Only fire off the wakeup / kqueue notification if
2064          * we are in the running state.
2065          */
2066         if (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED) {
2067                 if (sfs->flags & SF_SYNC)
2068                         cv_signal(&sfs->cv);
2069
2070                 if (sfs->flags & SF_KQUEUE) {
2071                         SFSYNC_DPRINTF("%s: (%llu) sfs=%p: knote!\n",
2072                             __func__,
2073                             (unsigned long long) curthread->td_tid,
2074                             sfs);
2075                         KNOTE_LOCKED(&sfs->klist, 1);
2076                 }
2077
2078                 /*
2079                  * If we're not waiting around for a sync,
2080                  * check if the knote list is empty.
2081                  * If it is, we transition to free.
2082                  *
2083                  * XXX I think it's about time I added some state
2084                  * or flag that says whether we're supposed to be
2085                  * waiting around until we've done a signal.
2086                  *
2087                  * XXX Ie, the reason that I don't free it here
2088                  * is because the caller will free the last reference,
2089                  * not us.  That should be codified in some flag
2090                  * that indicates "self-free" rather than checking
2091                  * for SF_SYNC all the time.
2092                  */
2093                 if ((sfs->flags & SF_SYNC) == 0 && knlist_empty(&sfs->klist)) {
2094                         SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, "
2095                             "count==0, empty list: time to free!\n",
2096                             __func__,
2097                             (unsigned long long) curthread->td_tid,
2098                             sfs);
2099                         sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
2100                         do_free = 1;
2101                 }
2102
2103         }
2104         mtx_unlock(&sfs->mtx);
2105
2106         /*
2107          * Attempt to do a free here.
2108          *
2109          * We do this outside of the lock because it may destroy the
2110          * lock in question as it frees things.  We can optimise this
2111          * later.
2112          *
2113          * XXX yes, we should make it a requirement to hold the
2114          * lock across sf_sync_free().
2115          */
2116         if (do_free == 1) {
2117                 SFSYNC_DPRINTF("%s: (%llu) sfs=%p\n",
2118                     __func__,
2119                     (unsigned long long) curthread->td_tid,
2120                     sfs);
2121                 sf_sync_free(sfs);
2122         }
2123 }
2124
2125 /*
2126  * Allocate a sendfile_sync state structure.
2127  *
2128  * For now this only knows about the "sleep" sync, but later it will
2129  * grow various other personalities.
2130  */
2131 struct sendfile_sync *
2132 sf_sync_alloc(uint32_t flags)
2133 {
2134         struct sendfile_sync *sfs;
2135
2136         sfs = uma_zalloc(zone_sfsync, M_WAITOK | M_ZERO);
2137         mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
2138         cv_init(&sfs->cv, "sendfile");
2139         sfs->flags = flags;
2140         sfs->state = SF_STATE_SETUP;
2141         knlist_init_mtx(&sfs->klist, &sfs->mtx);
2142
2143         SFSYNC_DPRINTF("%s: sfs=%p, flags=0x%08x\n", __func__, sfs, sfs->flags);
2144
2145         return (sfs);
2146 }
2147
2148 /*
2149  * Take a reference to a sfsync instance.
2150  *
2151  * This has to map 1:1 to free calls coming in via sf_ext_free(),
2152  * so typically this will be referenced once for each mbuf allocated.
2153  */
2154 void
2155 sf_sync_ref(struct sendfile_sync *sfs)
2156 {
2157
2158         if (sfs == NULL)
2159                 return;
2160
2161         mtx_lock(&sfs->mtx);
2162         sfs->count++;
2163         mtx_unlock(&sfs->mtx);
2164 }
2165
2166 void
2167 sf_sync_syscall_wait(struct sendfile_sync *sfs)
2168 {
2169
2170         if (sfs == NULL)
2171                 return;
2172
2173         KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!",
2174             __func__,
2175             sfs));
2176
2177         /*
2178          * If we're not requested to wait during the syscall,
2179          * don't bother waiting.
2180          */
2181         if ((sfs->flags & SF_SYNC) == 0)
2182                 goto out;
2183
2184         /*
2185          * This is a bit suboptimal and confusing, so bear with me.
2186          *
2187          * Ideally sf_sync_syscall_wait() will wait until
2188          * all pending mbuf transmit operations are done.
2189          * This means that when sendfile becomes async, it'll
2190          * run in the background and will transition from
2191          * RUNNING to COMPLETED when it's finished acquiring
2192          * new things to send.  Then, when the mbufs finish
2193          * sending, COMPLETED + sfs->count == 0 is enough to
2194          * know that no further work is being done.
2195          *
2196          * So, we will sleep on both RUNNING and COMPLETED.
2197          * It's up to the (in progress) async sendfile loop
2198          * to transition the sf_sync from RUNNING to
2199          * COMPLETED so the wakeup above will actually
2200          * do the cv_signal() call.
2201          */
2202         if (sfs->state != SF_STATE_COMPLETED && sfs->state != SF_STATE_RUNNING)
2203                 goto out;
2204
2205         if (sfs->count != 0)
2206                 cv_wait(&sfs->cv, &sfs->mtx);
2207         KASSERT(sfs->count == 0, ("sendfile sync still busy"));
2208
2209 out:
2210         return;
2211 }
2212
2213 /*
2214  * Free an sf_sync if it's appropriate to.
2215  */
2216 void
2217 sf_sync_free(struct sendfile_sync *sfs)
2218 {
2219
2220         if (sfs == NULL)
2221                 return;
2222
2223         SFSYNC_DPRINTF("%s: (%lld) sfs=%p; called; state=%d, flags=0x%08x "
2224             "count=%d\n",
2225             __func__,
2226             (long long) curthread->td_tid,
2227             sfs,
2228             sfs->state,
2229             sfs->flags,
2230             sfs->count);
2231
2232         mtx_lock(&sfs->mtx);
2233
2234         /*
2235          * We keep the sf_sync around if the state is active,
2236          * we are doing kqueue notification and we have active
2237          * knotes.
2238          *
2239          * If the caller wants to free us right this second it
2240          * should transition this to the freeing state.
2241          *
2242          * So, complain loudly if they break this rule.
2243          */
2244         if (sfs->state != SF_STATE_FREEING) {
2245                 printf("%s: (%llu) sfs=%p; not freeing; let's wait!\n",
2246                     __func__,
2247                     (unsigned long long) curthread->td_tid,
2248                     sfs);
2249                 mtx_unlock(&sfs->mtx);
2250                 return;
2251         }
2252
2253         KASSERT(sfs->count == 0, ("sendfile sync still busy"));
2254         cv_destroy(&sfs->cv);
2255         /*
2256          * This doesn't call knlist_detach() on each knote; it just frees
2257          * the entire list.
2258          */
2259         knlist_delete(&sfs->klist, curthread, 1);
2260         mtx_destroy(&sfs->mtx);
2261         SFSYNC_DPRINTF("%s: (%llu) sfs=%p; freeing\n",
2262             __func__,
2263             (unsigned long long) curthread->td_tid,
2264             sfs);
2265         uma_zfree(zone_sfsync, sfs);
2266 }
2267
2268 /*
2269  * Setup a sf_sync to post a kqueue notification when things are complete.
2270  */
2271 int
2272 sf_sync_kqueue_setup(struct sendfile_sync *sfs, struct sf_hdtr_kq *sfkq)
2273 {
2274         struct kevent kev;
2275         int error;
2276
2277         sfs->flags |= SF_KQUEUE;
2278
2279         /* Check the flags are valid */
2280         if ((sfkq->kq_flags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0)
2281                 return (EINVAL);
2282
2283         SFSYNC_DPRINTF("%s: sfs=%p: kqfd=%d, flags=0x%08x, ident=%p, udata=%p\n",
2284             __func__,
2285             sfs,
2286             sfkq->kq_fd,
2287             sfkq->kq_flags,
2288             (void *) sfkq->kq_ident,
2289             (void *) sfkq->kq_udata);
2290
2291         /* Setup and register a knote on the given kqfd. */
2292         kev.ident = (uintptr_t) sfkq->kq_ident;
2293         kev.filter = EVFILT_SENDFILE;
2294         kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | sfkq->kq_flags;
2295         kev.data = (intptr_t) sfs;
2296         kev.udata = sfkq->kq_udata;
2297
2298         error = kqfd_register(sfkq->kq_fd, &kev, curthread, 1);
2299         if (error != 0) {
2300                 SFSYNC_DPRINTF("%s: returned %d\n", __func__, error);
2301         }
2302         return (error);
2303 }
2304
2305 void
2306 sf_sync_set_state(struct sendfile_sync *sfs, sendfile_sync_state_t state,
2307     int islocked)
2308 {
2309         sendfile_sync_state_t old_state;
2310
2311         if (! islocked)
2312                 mtx_lock(&sfs->mtx);
2313
2314         /*
2315          * Update our current state.
2316          */
2317         old_state = sfs->state;
2318         sfs->state = state;
2319         SFSYNC_DPRINTF("%s: (%llu) sfs=%p; going from %d to %d\n",
2320             __func__,
2321             (unsigned long long) curthread->td_tid,
2322             sfs,
2323             old_state,
2324             state);
2325
2326         /*
2327          * If we're transitioning from RUNNING to COMPLETED and the count is
2328          * zero, then post the knote.  The caller may have completed the
2329          * send before we updated the state to COMPLETED and we need to make
2330          * sure this is communicated.
2331          */
2332         if (old_state == SF_STATE_RUNNING
2333             && state == SF_STATE_COMPLETED
2334             && sfs->count == 0
2335             && sfs->flags & SF_KQUEUE) {
2336                 SFSYNC_DPRINTF("%s: (%llu) sfs=%p: triggering knote!\n",
2337                     __func__,
2338                     (unsigned long long) curthread->td_tid,
2339                     sfs);
2340                 KNOTE_LOCKED(&sfs->klist, 1);
2341         }
2342
2343         if (! islocked)
2344                 mtx_unlock(&sfs->mtx);
2345 }
2346
2347 /*
2348  * Set the retval/errno for the given transaction.
2349  *
2350  * This will eventually/ideally be used when the KNOTE is fired off
2351  * to signify the completion of this transaction.
2352  *
2353  * The sfsync lock should be held before entering this function.
2354  */
2355 void
2356 sf_sync_set_retval(struct sendfile_sync *sfs, off_t retval, int xerrno)
2357 {
2358
2359         KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!",
2360             __func__,
2361             sfs));
2362
2363         SFSYNC_DPRINTF("%s: (%llu) sfs=%p: errno=%d, retval=%jd\n",
2364             __func__,
2365             (unsigned long long) curthread->td_tid,
2366             sfs,
2367             xerrno,
2368             (intmax_t) retval);
2369
2370         sfs->retval = retval;
2371         sfs->xerrno = xerrno;
2372 }
2373
/*
 * sendfile(2)
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *       struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Transmit the file referred to by 'fd', beginning at 'offset', over the
 * socket 's'.  At most 'nbytes' bytes are sent; nbytes == 0 means "up to
 * EOF".  A header and/or trailer may be supplied through 'hdtr'.  When
 * 'sbytes' is given, the total number of bytes sent is stored there.
 *
 * This is the native entry point; it runs do_sendfile() with the
 * compat argument set to 0.
 */
int
sys_sendfile(struct thread *td, struct sendfile_args *uap)
{
        int error;

        error = do_sendfile(td, uap, 0);
        return (error);
}
2391
2392 int
2393 _do_sendfile(struct thread *td, int src_fd, int sock_fd, int flags,
2394     int compat, off_t offset, size_t nbytes, off_t *sbytes,
2395     struct uio *hdr_uio,
2396     struct uio *trl_uio, struct sf_hdtr_kq *hdtr_kq)
2397 {
2398         cap_rights_t rights;
2399         struct sendfile_sync *sfs = NULL;
2400         struct file *fp;
2401         int error;
2402         int do_kqueue = 0;
2403         int do_free = 0;
2404
2405         AUDIT_ARG_FD(src_fd);
2406
2407         if (hdtr_kq != NULL)
2408                 do_kqueue = 1;
2409
2410         /*
2411          * sendfile(2) can start at any offset within a file so we require
2412          * CAP_READ+CAP_SEEK = CAP_PREAD.
2413          */
2414         if ((error = fget_read(td, src_fd,
2415             cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) {
2416                 goto out;
2417         }
2418
2419         /*
2420          * IF SF_KQUEUE is set but we haven't copied in anything for
2421          * kqueue data, error out.
2422          */
2423         if (flags & SF_KQUEUE && do_kqueue == 0) {
2424                 SFSYNC_DPRINTF("%s: SF_KQUEUE but no KQUEUE data!\n", __func__);
2425                 goto out;
2426         }
2427
2428         /*
2429          * If we need to wait for completion, initialise the sfsync
2430          * state here.
2431          */
2432         if (flags & (SF_SYNC | SF_KQUEUE))
2433                 sfs = sf_sync_alloc(flags & (SF_SYNC | SF_KQUEUE));
2434
2435         if (flags & SF_KQUEUE) {
2436                 error = sf_sync_kqueue_setup(sfs, hdtr_kq);
2437                 if (error) {
2438                         SFSYNC_DPRINTF("%s: (%llu) error; sfs=%p\n",
2439                             __func__,
2440                             (unsigned long long) curthread->td_tid,
2441                             sfs);
2442                         sf_sync_set_state(sfs, SF_STATE_FREEING, 0);
2443                         sf_sync_free(sfs);
2444                         goto out;
2445                 }
2446         }
2447
2448         /*
2449          * Do the sendfile call.
2450          *
2451          * If this fails, it'll free the mbuf chain which will free up the
2452          * sendfile_sync references.
2453          */
2454         error = fo_sendfile(fp, sock_fd, hdr_uio, trl_uio, offset,
2455             nbytes, sbytes, flags, compat ? SFK_COMPAT : 0, sfs, td);
2456
2457         /*
2458          * If the sendfile call succeeded, transition the sf_sync state
2459          * to RUNNING, then COMPLETED.
2460          *
2461          * If the sendfile call failed, then the sendfile call may have
2462          * actually sent some data first - so we check to see whether
2463          * any data was sent.  If some data was queued (ie, count > 0)
2464          * then we can't call free; we have to wait until the partial
2465          * transaction completes before we continue along.
2466          *
2467          * This has the side effect of firing off the knote
2468          * if the refcount has hit zero by the time we get here.
2469          */
2470         if (sfs != NULL) {
2471                 mtx_lock(&sfs->mtx);
2472                 if (error == 0 || sfs->count > 0) {
2473                         /*
2474                          * When it's time to do async sendfile, the transition
2475                          * to RUNNING signifies that we're actually actively
2476                          * adding and completing mbufs.  When the last disk
2477                          * buffer is read (ie, when we're not doing any
2478                          * further read IO and all subsequent stuff is mbuf
2479                          * transmissions) we'll transition to COMPLETED
2480                          * and when the final mbuf is freed, the completion
2481                          * will be signaled.
2482                          */
2483                         sf_sync_set_state(sfs, SF_STATE_RUNNING, 1);
2484
2485                         /*
2486                          * Set the retval before we signal completed.
2487                          * If we do it the other way around then transitioning to
2488                          * COMPLETED may post the knote before you set the return
2489                          * status!
2490                          *
2491                          * XXX for now, errno is always 0, as we don't post
2492                          * knotes if sendfile failed.  Maybe that'll change later.
2493                          */
2494                         sf_sync_set_retval(sfs, *sbytes, error);
2495
2496                         /*
2497                          * And now transition to completed, which will kick off
2498                          * the knote if required.
2499                          */
2500                         sf_sync_set_state(sfs, SF_STATE_COMPLETED, 1);
2501                 } else {
2502                         /*
2503                          * Error isn't zero, sfs_count is zero, so we
2504                          * won't have some other thing to wake things up.
2505                          * Thus free.
2506                          */
2507                         sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
2508                         do_free = 1;
2509                 }
2510
2511                 /*
2512                  * Next - wait if appropriate.
2513                  */
2514                 sf_sync_syscall_wait(sfs);
2515
2516                 /*
2517                  * If we're not doing kqueue notifications, we can
2518                  * transition this immediately to the freeing state.
2519                  */
2520                 if ((sfs->flags & SF_KQUEUE) == 0) {
2521                         sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
2522                         do_free = 1;
2523                 }
2524
2525                 mtx_unlock(&sfs->mtx);
2526         }
2527
2528         /*
2529          * If do_free is set, free here.
2530          *
2531          * If we're doing no-kqueue notification and it's just sleep notification,
2532          * we also do free; it's the only chance we have.
2533          */
2534         if (sfs != NULL && do_free == 1) {
2535                 sf_sync_free(sfs);
2536         }
2537
2538         /*
2539          * XXX Should we wait until the send has completed before freeing the source
2540          * file handle? It's the previous behaviour, sure, but is it required?
2541          * We've wired down the page references after all.
2542          */
2543         fdrop(fp, td);
2544
2545 out:
2546         /* Return error */
2547         return (error);
2548 }
2549
2550
2551 static int
2552 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
2553 {
2554         struct sf_hdtr hdtr;
2555         struct sf_hdtr_kq hdtr_kq;
2556         struct uio *hdr_uio, *trl_uio;
2557         int error;
2558         off_t sbytes;
2559         int do_kqueue = 0;
2560
2561         /*
2562          * File offset must be positive.  If it goes beyond EOF
2563          * we send only the header/trailer and no payload data.
2564          */
2565         if (uap->offset < 0)
2566                 return (EINVAL);
2567
2568         hdr_uio = trl_uio = NULL;
2569
2570         if (uap->hdtr != NULL) {
2571                 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
2572                 if (error != 0)
2573                         goto out;
2574                 if (hdtr.headers != NULL) {
2575                         error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
2576                         if (error != 0)
2577                                 goto out;
2578                 }
2579                 if (hdtr.trailers != NULL) {
2580                         error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
2581                         if (error != 0)
2582                                 goto out;
2583                 }
2584
2585                 /*
2586                  * If SF_KQUEUE is set, then we need to also copy in
2587                  * the kqueue data after the normal hdtr set and set
2588                  * do_kqueue=1.
2589                  */
2590                 if (uap->flags & SF_KQUEUE) {
2591                         error = copyin(((char *) uap->hdtr) + sizeof(hdtr),
2592                             &hdtr_kq,
2593                             sizeof(hdtr_kq));
2594                         if (error != 0)
2595                                 goto out;
2596                         do_kqueue = 1;
2597                 }
2598         }
2599
2600         /* Call sendfile */
2601         error = _do_sendfile(td, uap->fd, uap->s, uap->flags, compat,
2602             uap->offset, uap->nbytes, &sbytes, hdr_uio, trl_uio, &hdtr_kq);
2603
2604         if (uap->sbytes != NULL) {
2605                 copyout(&sbytes, uap->sbytes, sizeof(off_t));
2606         }
2607 out:
2608         free(hdr_uio, M_IOV);
2609         free(trl_uio, M_IOV);
2610         return (error);
2611 }
2612
#ifdef COMPAT_FREEBSD4
/*
 * FreeBSD 4 compatible sendfile(2): repackage the old argument structure
 * into a struct sendfile_args and run do_sendfile() with compat set.
 */
int
freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
{
        struct sendfile_args native;
        int error;

        native.fd = uap->fd;
        native.s = uap->s;
        native.offset = uap->offset;
        native.nbytes = uap->nbytes;
        native.hdtr = uap->hdtr;
        native.sbytes = uap->sbytes;
        native.flags = uap->flags;

        error = do_sendfile(td, &native, 1);
        return (error);
}
#endif /* COMPAT_FREEBSD4 */
2630
/*
 * Obtain the page of `obj' that covers file offset `off', wired and
 * valid for `xfsize' bytes starting at the in-page offset.
 *
 *   obj     VM object to take the page from
 *   vp      backing vnode, or NULL when the object is not vnode-backed
 *           (the page is then read through the pager interface)
 *   nd      if non-zero, no I/O is started for a page that is not
 *           already valid; `nd' itself is returned as the error
 *   off     byte offset within the object
 *   xfsize  number of bytes that must be valid within the page
 *   bsize   block size, used to scale the read-ahead hint for vn_rdwr()
 *   td      calling thread (credentials for vn_rdwr())
 *   res     on success, receives the page
 *
 * Returns 0 with *res set, or an errno value; on failure the wiring
 * taken by vm_page_grab() is undone whenever a page is still at hand.
 * The object lock is taken and released internally.
 */
static int
sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd,
    off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res)
{
        vm_page_t m;
        vm_pindex_t pindex;
        ssize_t resid;
        int error, readahead, rv;

        pindex = OFF_TO_IDX(off);
        VM_OBJECT_WLOCK(obj);
        /*
         * For vnode-backed objects the grab is done with
         * VM_ALLOC_NOBUSY | VM_ALLOC_IGN_SBUSY; otherwise those flags
         * are omitted and the page is explicitly xunbusied below.
         */
        m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY |
            VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL);

        /*
         * Check if page is valid for what we need, otherwise initiate I/O.
         *
         * The non-zero nd argument prevents disk I/O, instead we
         * return the caller what he specified in nd.  In particular,
         * if we already turned some pages into mbufs, nd == EAGAIN
         * and the main function send them the pages before we come
         * here again and block.
         */
        if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) {
                if (vp == NULL)
                        vm_page_xunbusy(m);
                VM_OBJECT_WUNLOCK(obj);
                *res = m;
                return (0);
        } else if (nd != 0) {
                if (vp == NULL)
                        vm_page_xunbusy(m);
                error = nd;
                /* Jumps into the failure branch near the end. */
                goto free_page;
        }

        /*
         * Get the page from backing store.
         */
        error = 0;
        if (vp != NULL) {
                /* The object lock is dropped across the vnode read. */
                VM_OBJECT_WUNLOCK(obj);
                readahead = sfreadahead * MAXBSIZE;

                /*
                 * Use vn_rdwr() instead of the pager interface for
                 * the vnode, to allow the read-ahead.
                 *
                 * XXXMAC: Because we don't have fp->f_cred here, we
                 * pass in NOCRED.  This is probably wrong, but is
                 * consistent with our original implementation.
                 */
                error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off),
                    UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead /
                    bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td);
                SFSTAT_INC(sf_iocnt);
                VM_OBJECT_WLOCK(obj);
        } else {
                if (vm_pager_has_page(obj, pindex, NULL, NULL)) {
                        rv = vm_pager_get_pages(obj, &m, 1, 0);
                        SFSTAT_INC(sf_iocnt);
                        /* Look the page up again after the pager call. */
                        m = vm_page_lookup(obj, pindex);
                        if (m == NULL)
                                error = EIO;
                        else if (rv != VM_PAGER_OK) {
                                vm_page_lock(m);
                                vm_page_free(m);
                                vm_page_unlock(m);
                                m = NULL;
                                error = EIO;
                        }
                } else {
                        /* The pager has no copy: hand out a zeroed page. */
                        pmap_zero_page(m);
                        m->valid = VM_PAGE_BITS_ALL;
                        m->dirty = 0;
                }
                if (m != NULL)
                        vm_page_xunbusy(m);
        }
        if (error == 0) {
                *res = m;
        } else if (m != NULL) {
                /* Also the target of the "nd != 0" goto above. */
free_page:
                vm_page_lock(m);
                vm_page_unwire(m, PQ_INACTIVE);

                /*
                 * See if anyone else might know about this page.  If
                 * not and it is not valid, then free it.
                 */
                if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m))
                        vm_page_free(m);
                vm_page_unlock(m);
        }
        /* On success the page must be wired and valid for the request. */
        KASSERT(error != 0 || (m->wire_count > 0 &&
            vm_page_is_valid(m, off & PAGE_MASK, xfsize)),
            ("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off,
            xfsize));
        VM_OBJECT_WUNLOCK(obj);
        return (error);
}
2732
2733 static int
2734 sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
2735     struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
2736     int *bsize)
2737 {
2738         struct vattr va;
2739         vm_object_t obj;
2740         struct vnode *vp;
2741         struct shmfd *shmfd;
2742         int error;
2743
2744         vp = *vp_res = NULL;
2745         obj = NULL;
2746         shmfd = *shmfd_res = NULL;
2747         *bsize = 0;
2748
2749         /*
2750          * The file descriptor must be a regular file and have a
2751          * backing VM object.
2752          */
2753         if (fp->f_type == DTYPE_VNODE) {
2754                 vp = fp->f_vnode;
2755                 vn_lock(vp, LK_SHARED | LK_RETRY);
2756                 if (vp->v_type != VREG) {
2757                         error = EINVAL;
2758                         goto out;
2759                 }
2760                 *bsize = vp->v_mount->mnt_stat.f_iosize;
2761                 error = VOP_GETATTR(vp, &va, td->td_ucred);
2762                 if (error != 0)
2763                         goto out;
2764                 *obj_size = va.va_size;
2765                 obj = vp->v_object;
2766                 if (obj == NULL) {
2767                         error = EINVAL;
2768                         goto out;
2769                 }
2770         } else if (fp->f_type == DTYPE_SHM) {
2771                 shmfd = fp->f_data;
2772                 obj = shmfd->shm_object;
2773                 *obj_size = shmfd->shm_size;
2774         } else {
2775                 error = EINVAL;
2776                 goto out;
2777         }
2778
2779         VM_OBJECT_WLOCK(obj);
2780         if ((obj->flags & OBJ_DEAD) != 0) {
2781                 VM_OBJECT_WUNLOCK(obj);
2782                 error = EBADF;
2783                 goto out;
2784         }
2785
2786         /*
2787          * Temporarily increase the backing VM object's reference
2788          * count so that a forced reclamation of its vnode does not
2789          * immediately destroy it.
2790          */
2791         vm_object_reference_locked(obj);
2792         VM_OBJECT_WUNLOCK(obj);
2793         *obj_res = obj;
2794         *vp_res = vp;
2795         *shmfd_res = shmfd;
2796
2797 out:
2798         if (vp != NULL)
2799                 VOP_UNLOCK(vp, 0);
2800         return (error);
2801 }
2802
2803 static int
2804 kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
2805     struct socket **so)
2806 {
2807         cap_rights_t rights;
2808         int error;
2809
2810         *sock_fp = NULL;
2811         *so = NULL;
2812
2813         /*
2814          * The socket must be a stream socket and connected.
2815          */
2816         error = getsock_cap(td->td_proc->p_fd, s, cap_rights_init(&rights,
2817             CAP_SEND), sock_fp, NULL);
2818         if (error != 0)
2819                 return (error);
2820         *so = (*sock_fp)->f_data;
2821         if ((*so)->so_type != SOCK_STREAM)
2822                 return (EINVAL);
2823         if (((*so)->so_state & SS_ISCONNECTED) == 0)
2824                 return (ENOTCONN);
2825         return (0);
2826 }
2827
/*
 * Send the contents of a file (vnode or shared memory object) over a
 * connected stream socket, optionally preceded by headers and followed
 * by trailers.
 *
 *   fp       source file; resolved to a VM object by sendfile_getobj()
 *   sockfd   descriptor of the destination socket
 *   hdr_uio  optional headers; copied into an mbuf chain that is sent
 *            together with the first batch of file data
 *   trl_uio  optional trailers; written with kern_writev() after the
 *            file data
 *   offset   byte offset in the file at which to start
 *   nbytes   number of file bytes to send, 0 meaning "up to end of file"
 *   sent     if not NULL, receives the number of bytes sent (headers,
 *            file data and trailers), on success and on failure alike
 *   flags    SF_MNOWAIT and SF_NODISKIO are examined here
 *   kflags   SFK_COMPAT makes nbytes include the header length
 *   sfs      optional sendfile_sync; referenced once per queued sf_buf
 *   td       calling thread
 *
 * Returns 0 or an errno value; ERESTART is reported as EINTR.
 */
int
vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    int kflags, struct sendfile_sync *sfs, struct thread *td)
{
        struct file *sock_fp;
        struct vnode *vp;
        struct vm_object *obj;
        struct socket *so;
        struct mbuf *m;
        struct sf_buf *sf;
        struct vm_page *pg;
        struct shmfd *shmfd;
        struct vattr va;
        off_t off, xfsize, fsbytes, sbytes, rem, obj_size;
        int error, bsize, nd, hdrlen, mnw;

        pg = NULL;
        obj = NULL;
        so = NULL;
        m = NULL;
        fsbytes = sbytes = 0;
        hdrlen = mnw = 0;
        rem = nbytes;
        obj_size = 0;

        /*
         * On success this leaves a reference on obj that is dropped at
         * "out".  On failure nothing is held, hence the plain return.
         */
        error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
        if (error != 0)
                return (error);
        /* nbytes == 0: use the object size for the first space check. */
        if (rem == 0)
                rem = obj_size;

        /*
         * so may be set even when this fails (wrong type, not
         * connected); the cleanup at "out" keys off so to drop sock_fp.
         */
        error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so);
        if (error != 0)
                goto out;

        /*
         * Do not wait on memory allocations but return ENOMEM for
         * caller to retry later.
         * XXX: Experimental.
         */
        if (flags & SF_MNOWAIT)
                mnw = 1;

#ifdef MAC
        error = mac_socket_check_send(td->td_ucred, so);
        if (error != 0)
                goto out;
#endif

        /* If headers are specified copy them into mbufs. */
        if (hdr_uio != NULL) {
                hdr_uio->uio_td = td;
                hdr_uio->uio_rw = UIO_WRITE;
                if (hdr_uio->uio_resid > 0) {
                        /*
                         * In FBSD < 5.0 the nbytes to send also included
                         * the header.  If compat is specified subtract the
                         * header size from nbytes.
                         */
                        if (kflags & SFK_COMPAT) {
                                if (nbytes > hdr_uio->uio_resid)
                                        nbytes -= hdr_uio->uio_resid;
                                else
                                        nbytes = 0;
                        }
                        m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
                            0, 0, 0);
                        if (m == NULL) {
                                error = mnw ? EAGAIN : ENOBUFS;
                                goto out;
                        }
                        hdrlen = m_length(m, NULL);
                }
        }

        /*
         * Protect against multiple writers to the socket.
         *
         * XXXRW: Historically this has assumed non-interruptibility, so now
         * we implement that, but possibly shouldn't.
         */
        (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);

        /*
         * Loop through the pages of the file, starting with the requested
         * offset. Get a file page (do I/O if necessary), map the file page
         * into an sf_buf, attach an mbuf header to the sf_buf, and queue
         * it on the socket.
         * This is done in two loops.  The inner loop turns as many pages
         * as it can, up to available socket buffer space, without blocking
         * into mbufs to have it bulk delivered into the socket send buffer.
         * The outer loop checks the state and available space of the socket
         * and takes care of the overall progress.
         */
        for (off = offset; ; ) {
                struct mbuf *mtail;
                int loopbytes;
                int space;
                int done;

                /* Stop once the requested amount of file data has gone out. */
                if ((nbytes != 0 && nbytes == fsbytes) ||
                    (nbytes == 0 && obj_size == fsbytes))
                        break;

                mtail = NULL;
                loopbytes = 0;
                space = 0;
                done = 0;

                /*
                 * Check the socket state for ongoing connection,
                 * no errors and space in socket buffer.
                 * If space is low allow for the remainder of the
                 * file to be processed if it fits the socket buffer.
                 * Otherwise block in waiting for sufficient space
                 * to proceed, or if the socket is nonblocking, return
                 * to userland with EAGAIN while reporting how far
                 * we've come.
                 * We wait until the socket buffer has significant free
                 * space to do bulk sends.  This makes good use of file
                 * system read ahead and allows packet segmentation
                 * offloading hardware to take over lots of work.  If
                 * we were not careful here we would send off only one
                 * sfbuf at a time.
                 */
                SOCKBUF_LOCK(&so->so_snd);
                if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
                        so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
retry_space:
                if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
                        error = EPIPE;
                        SOCKBUF_UNLOCK(&so->so_snd);
                        goto done;
                } else if (so->so_error) {
                        /* Report the pending socket error once and clear it. */
                        error = so->so_error;
                        so->so_error = 0;
                        SOCKBUF_UNLOCK(&so->so_snd);
                        goto done;
                }
                space = sbspace(&so->so_snd);
                if (space < rem &&
                    (space <= 0 ||
                     space < so->so_snd.sb_lowat)) {
                        if (so->so_state & SS_NBIO) {
                                SOCKBUF_UNLOCK(&so->so_snd);
                                error = EAGAIN;
                                goto done;
                        }
                        /*
                         * sbwait drops the lock while sleeping.
                         * When we loop back to retry_space the
                         * state may have changed and we retest
                         * for it.
                         */
                        error = sbwait(&so->so_snd);
                        /*
                         * An error from sbwait usually indicates that we've
                         * been interrupted by a signal. If we've sent anything
                         * then return bytes sent, otherwise return the error.
                         */
                        if (error != 0) {
                                SOCKBUF_UNLOCK(&so->so_snd);
                                goto done;
                        }
                        goto retry_space;
                }
                SOCKBUF_UNLOCK(&so->so_snd);

                /*
                 * Reduce space in the socket buffer by the size of
                 * the header mbuf chain.
                 * hdrlen is set to 0 after the first loop.
                 */
                space -= hdrlen;

                /*
                 * For a vnode, hold it share-locked while building the
                 * chain and refresh the file size, which is re-read on
                 * every pass of the outer loop.
                 */
                if (vp != NULL) {
                        error = vn_lock(vp, LK_SHARED);
                        if (error != 0)
                                goto done;
                        error = VOP_GETATTR(vp, &va, td->td_ucred);
                        if (error != 0 || off >= va.va_size) {
                                VOP_UNLOCK(vp, 0);
                                goto done;
                        }
                        obj_size = va.va_size;
                }

                /*
                 * Loop and construct maximum sized mbuf chain to be bulk
                 * dumped into socket buffer.
                 */
                while (space > loopbytes) {
                        vm_offset_t pgoff;
                        struct mbuf *m0;

                        /*
                         * Calculate the amount to transfer.
                         * Not to exceed a page, the EOF,
                         * or the passed in nbytes.
                         */
                        pgoff = (vm_offset_t)(off & PAGE_MASK);
                        rem = obj_size - offset;
                        if (nbytes != 0)
                                rem = omin(rem, nbytes);
                        rem -= fsbytes + loopbytes;
                        xfsize = omin(PAGE_SIZE - pgoff, rem);
                        xfsize = omin(space - loopbytes, xfsize);
                        if (xfsize <= 0) {
                                done = 1;               /* all data sent */
                                break;
                        }

                        /*
                         * Attempt to look up the page.  Allocate
                         * if not found or wait and loop if busy.
                         *
                         * nd is handed to sendfile_readpage(): EAGAIN
                         * when a chain is already pending, EBUSY when
                         * the caller asked for SF_NODISKIO, 0 otherwise.
                         */
                        if (m != NULL)
                                nd = EAGAIN; /* send what we already got */
                        else if ((flags & SF_NODISKIO) != 0)
                                nd = EBUSY;
                        else
                                nd = 0;
                        error = sendfile_readpage(obj, vp, nd, off,
                            xfsize, bsize, td, &pg);
                        if (error != 0) {
                                if (error == EAGAIN)
                                        error = 0;      /* not a real error */
                                break;
                        }

                        /*
                         * Get a sendfile buf.  When allocating the
                         * first buffer for mbuf chain, we usually
                         * wait as long as necessary, but this wait
                         * can be interrupted.  For consequent
                         * buffers, do not sleep, since several
                         * threads might exhaust the buffers and then
                         * deadlock.
                         */
                        sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
                            SFB_CATCH);
                        if (sf == NULL) {
                                SFSTAT_INC(sf_allocfail);
                                /* Give back the wiring on the page we got. */
                                vm_page_lock(pg);
                                vm_page_unwire(pg, PQ_INACTIVE);
                                KASSERT(pg->object != NULL,
                                    ("%s: object disappeared", __func__));
                                vm_page_unlock(pg);
                                /* Only an error if nothing is pending to send. */
                                if (m == NULL)
                                        error = (mnw ? EAGAIN : EINTR);
                                break;
                        }

                        /*
                         * Get an mbuf and set it up as having
                         * external storage.
                         */
                        m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
                        if (m0 == NULL) {
                                error = (mnw ? EAGAIN : ENOBUFS);
                                sf_ext_free(sf, NULL);
                                break;
                        }
                        /*
                         * Attach EXT_SFBUF external storage.
                         */
                        m0->m_ext.ext_buf = (caddr_t )sf_buf_kva(sf);
                        m0->m_ext.ext_size = PAGE_SIZE;
                        m0->m_ext.ext_arg1 = sf;
                        m0->m_ext.ext_arg2 = sfs;
                        m0->m_ext.ext_type = EXT_SFBUF;
                        m0->m_ext.ext_flags = 0;
                        m0->m_flags |= (M_EXT|M_RDONLY);
                        m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
                        m0->m_len = xfsize;

                        /*
                         * Append to mbuf chain.  mtail caches the tail
                         * within this pass; on the first append after
                         * the headers the chain end has to be searched.
                         */
                        if (mtail != NULL)
                                mtail->m_next = m0;
                        else if (m != NULL)
                                m_last(m)->m_next = m0;
                        else
                                m = m0;
                        mtail = m0;

                        /* Keep track of bits processed. */
                        loopbytes += xfsize;
                        off += xfsize;

                        /*
                         * XXX eventually this should be a sfsync
                         * method call!
                         */
                        if (sfs != NULL)
                                sf_sync_ref(sfs);
                }

                if (vp != NULL)
                        VOP_UNLOCK(vp, 0);

                /* Add the buffer chain to the socket buffer. */
                if (m != NULL) {
                        int mlen, err;

                        mlen = m_length(m, NULL);
                        SOCKBUF_LOCK(&so->so_snd);
                        if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
                                error = EPIPE;
                                SOCKBUF_UNLOCK(&so->so_snd);
                                /* m is still ours here; it is freed at "out". */
                                goto done;
                        }
                        SOCKBUF_UNLOCK(&so->so_snd);
                        CURVNET_SET(so->so_vnet);
                        /* Avoid error aliasing. */
                        err = (*so->so_proto->pr_usrreqs->pru_send)
                                    (so, 0, m, NULL, NULL, td);
                        CURVNET_RESTORE();
                        if (err == 0) {
                                /*
                                 * We need two counters to get the
                                 * file offset and nbytes to send
                                 * right:
                                 * - sbytes contains the total amount
                                 *   of bytes sent, including headers.
                                 * - fsbytes contains the total amount
                                 *   of bytes sent from the file.
                                 */
                                sbytes += mlen;
                                fsbytes += mlen;
                                if (hdrlen) {
                                        fsbytes -= hdrlen;
                                        hdrlen = 0;
                                }
                        } else if (error == 0)
                                error = err;
                        m = NULL;       /* pru_send always consumes */
                }

                /* Quit outer loop on error or when we're done. */
                if (done)
                        break;
                if (error != 0)
                        goto done;
        }

        /*
         * Send trailers. Wimp out and use writev(2).
         *
         * The send buffer lock is released first, and "done" is skipped
         * so that it is not released a second time.
         */
        if (trl_uio != NULL) {
                sbunlock(&so->so_snd);
                error = kern_writev(td, sockfd, trl_uio);
                if (error == 0)
                        sbytes += td->td_retval[0];
                goto out;
        }

        /* "done" is entered with the sblock() lock held, "out" without it. */
done:
        sbunlock(&so->so_snd);
out:
        /*
         * If there was no error we have to clear td->td_retval[0]
         * because it may have been set by writev.
         */
        if (error == 0) {
                td->td_retval[0] = 0;
        }
        if (sent != NULL) {
                (*sent) = sbytes;
        }
        if (obj != NULL)
                vm_object_deallocate(obj);
        /* so is non-NULL exactly when sock_fp holds a file reference. */
        if (so)
                fdrop(sock_fp, td);
        /* Any chain that was built but never handed to pru_send. */
        if (m)
                m_freem(m);

        if (error == ERESTART)
                error = EINTR;

        return (error);
}
3210
3211 /*
3212  * SCTP syscalls.
3213  * Functionality only compiled in if SCTP is defined in the kernel Makefile,
3214  * otherwise all return EOPNOTSUPP.
3215  * XXX: We should make this loadable one day.
3216  */
3217 int
3218 sys_sctp_peeloff(td, uap)
3219         struct thread *td;
3220         struct sctp_peeloff_args /* {
3221                 int     sd;
3222                 caddr_t name;
3223         } */ *uap;
3224 {
3225 #if (defined(INET) || defined(INET6)) && defined(SCTP)
3226         struct file *nfp = NULL;
3227         struct socket *head, *so;
3228         cap_rights_t rights;
3229         u_int fflag;
3230         int error, fd;
3231
3232         AUDIT_ARG_FD(uap->sd);
3233         error = fgetsock(td, uap->sd, cap_rights_init(&rights, CAP_PEELOFF),
3234             &head, &fflag);
3235         if (error != 0)
3236                 goto done2;
3237         if (head->so_proto->pr_protocol != IPPROTO_SCTP) {
3238                 error = EOPNOTSUPP;
3239                 goto done;
3240         }
3241         error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
3242         if (error != 0)
3243                 goto done;
3244         /*
3245          * At this point we know we do have a assoc to pull
3246          * we proceed to get the fd setup. This may block
3247          * but that is ok.
3248          */
3249
3250         error = falloc(td, &nfp, &fd, 0);
3251         if (error != 0)
3252                 goto done;
3253         td->td_retval[0] = fd;
3254
3255         CURVNET_SET(head->so_vnet);
3256         so = sonewconn(head, SS_ISCONNECTED);
3257         if (so == NULL) {
3258                 error = ENOMEM;
3259                 goto noconnection;
3260         }
3261         /*
3262          * Before changing the flags on the socket, we have to bump the
3263          * reference count.  Otherwise, if the protocol calls sofree(),
3264          * the socket will be released due to a zero refcount.
3265          */
3266         SOCK_LOCK(so);
3267         soref(so);                      /* file descriptor reference */
3268         SOCK_UNLOCK(so);
3269
3270         ACCEPT_LOCK();
3271
3272         TAILQ_REMOVE(&head->so_comp, so, so_list);
3273         head->so_qlen--;
3274         so->so_state |= (head->so_state & SS_NBIO);
3275         so->so_state &= ~SS_NOFDREF;
3276         so->so_qstate &= ~SQ_COMP;
3277         so->so_head = NULL;
3278         ACCEPT_UNLOCK();
3279         finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
3280         error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
3281         if (error != 0)
3282                 goto noconnection;
3283         if (head->so_sigio != NULL)
3284                 fsetown(fgetown(&head->so_sigio), &so->so_sigio);
3285
3286 noconnection:
3287         /*
3288          * close the new descriptor, assuming someone hasn't ripped it
3289          * out from under us.
3290          */
3291         if (error != 0)
3292                 fdclose(td->td_proc->p_fd, nfp, fd, td);
3293
3294         /*
3295          * Release explicitly held references before returning.
3296          */
3297         CURVNET_RESTORE();
3298 done:
3299         if (nfp != NULL)
3300                 fdrop(nfp, td);
3301         fputsock(head);
3302 done2:
3303         return (error);
3304 #else  /* SCTP */
3305         return (EOPNOTSUPP);
3306 #endif /* SCTP */
3307 }
3308
3309 int
3310 sys_sctp_generic_sendmsg (td, uap)
3311         struct thread *td;
3312         struct sctp_generic_sendmsg_args /* {
3313                 int sd,
3314                 caddr_t msg,
3315                 int mlen,
3316                 caddr_t to,
3317                 __socklen_t tolen,
3318                 struct sctp_sndrcvinfo *sinfo,
3319                 int flags
3320         } */ *uap;
3321 {
3322 #if (defined(INET) || defined(INET6)) && defined(SCTP)
3323         struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
3324         struct socket *so;
3325         struct file *fp = NULL;
3326         struct sockaddr *to = NULL;
3327 #ifdef KTRACE
3328         struct uio *ktruio = NULL;
3329 #endif
3330         struct uio auio;
3331         struct iovec iov[1];
3332         cap_rights_t rights;
3333         int error = 0, len;
3334
3335         if (uap->sinfo != NULL) {
3336                 error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
3337                 if (error != 0)
3338                         return (error);
3339                 u_sinfo = &sinfo;
3340         }
3341
3342         cap_rights_init(&rights, CAP_SEND);
3343         if (uap->tolen != 0) {
3344                 error = getsockaddr(&to, uap->to, uap->tolen);
3345                 if (error != 0) {
3346                         to = NULL;
3347                         goto sctp_bad2;
3348                 }
3349                 cap_rights_set(&rights, CAP_CONNECT);
3350         }
3351
3352         AUDIT_ARG_FD(uap->sd);
3353         error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL);
3354         if (error != 0)
3355                 goto sctp_bad;
3356 #ifdef KTRACE
3357         if (to && (KTRPOINT(td, KTR_STRUCT)))
3358                 ktrsockaddr(to);
3359 #endif
3360
3361         iov[0].iov_base = uap->msg;
3362         iov[0].iov_len = uap->mlen;
3363
3364         so = (struct socket *)fp->f_data;
3365         if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
3366                 error = EOPNOTSUPP;
3367                 goto sctp_bad;
3368         }
3369 #ifdef MAC
3370         error = mac_socket_check_send(td->td_ucred, so);
3371         if (error != 0)
3372                 goto sctp_bad;
3373 #endif /* MAC */
3374
3375         auio.uio_iov =  iov;
3376         auio.uio_iovcnt = 1;
3377         auio.uio_segflg = UIO_USERSPACE;
3378         auio.uio_rw = UIO_WRITE;
3379         auio.uio_td = td;
3380         auio.uio_offset = 0;                    /* XXX */
3381         auio.uio_resid = 0;
3382         len = auio.uio_resid = uap->mlen;
3383         CURVNET_SET(so->so_vnet);
3384         error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL,
3385             (struct mbuf *)NULL, uap->flags, u_sinfo, td);
3386         CURVNET_RESTORE();
3387         if (error != 0) {
3388                 if (auio.uio_resid != len && (error == ERESTART ||
3389                     error == EINTR || error == EWOULDBLOCK))
3390                         error = 0;
3391                 /* Generation of SIGPIPE can be controlled per socket. */
3392                 if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
3393                     !(uap->flags & MSG_NOSIGNAL)) {
3394                         PROC_LOCK(td->td_proc);
3395                         tdsignal(td, SIGPIPE);
3396                         PROC_UNLOCK(td->td_proc);
3397                 }
3398         }
3399         if (error == 0)
3400                 td->td_retval[0] = len - auio.uio_resid;
3401 #ifdef KTRACE
3402         if (ktruio != NULL) {
3403                 ktruio->uio_resid = td->td_retval[0];
3404                 ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
3405         }
3406 #endif /* KTRACE */
3407 sctp_bad:
3408         if (fp != NULL)
3409                 fdrop(fp, td);
3410 sctp_bad2:
3411         free(to, M_SONAME);
3412         return (error);
3413 #else  /* SCTP */
3414         return (EOPNOTSUPP);
3415 #endif /* SCTP */
3416 }
3417
3418 int
3419 sys_sctp_generic_sendmsg_iov(td, uap)
3420         struct thread *td;
3421         struct sctp_generic_sendmsg_iov_args /* {
3422                 int sd,
3423                 struct iovec *iov,
3424                 int iovlen,
3425                 caddr_t to,
3426                 __socklen_t tolen,
3427                 struct sctp_sndrcvinfo *sinfo,
3428                 int flags
3429         } */ *uap;
3430 {
3431 #if (defined(INET) || defined(INET6)) && defined(SCTP)
3432         struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
3433         struct socket *so;
3434         struct file *fp = NULL;
3435         struct sockaddr *to = NULL;
3436 #ifdef KTRACE
3437         struct uio *ktruio = NULL;
3438 #endif
3439         struct uio auio;
3440         struct iovec *iov, *tiov;
3441         cap_rights_t rights;
3442         ssize_t len;
3443         int error, i;
3444
3445         if (uap->sinfo != NULL) {
3446                 error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
3447                 if (error != 0)
3448                         return (error);
3449                 u_sinfo = &sinfo;
3450         }
3451         cap_rights_init(&rights, CAP_SEND);
3452         if (uap->tolen != 0) {
3453                 error = getsockaddr(&to, uap->to, uap->tolen);
3454                 if (error != 0) {
3455                         to = NULL;
3456                         goto sctp_bad2;
3457                 }
3458                 cap_rights_set(&rights, CAP_CONNECT);
3459         }
3460
3461         AUDIT_ARG_FD(uap->sd);
3462         error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL);
3463         if (error != 0)
3464                 goto sctp_bad1;
3465
3466 #ifdef COMPAT_FREEBSD32
3467         if (SV_CURPROC_FLAG(SV_ILP32))
3468                 error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
3469                     uap->iovlen, &iov, EMSGSIZE);
3470         else
3471 #endif
3472                 error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
3473         if (error != 0)
3474                 goto sctp_bad1;
3475 #ifdef KTRACE
3476         if (to && (KTRPOINT(td, KTR_STRUCT)))
3477                 ktrsockaddr(to);
3478 #endif
3479
3480         so = (struct socket *)fp->f_data;
3481         if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
3482                 error = EOPNOTSUPP;
3483                 goto sctp_bad;
3484         }
3485 #ifdef MAC
3486         error = mac_socket_check_send(td->td_ucred, so);
3487         if (error != 0)
3488                 goto sctp_bad;
3489 #endif /* MAC */
3490
3491         auio.uio_iov = iov;
3492         auio.uio_iovcnt = uap->iovlen;
3493         auio.uio_segflg = UIO_USERSPACE;
3494         auio.uio_rw = UIO_WRITE;
3495         auio.uio_td = td;
3496         auio.uio_offset = 0;                    /* XXX */
3497         auio.uio_resid = 0;
3498         tiov = iov;
3499         for (i = 0; i <uap->iovlen; i++, tiov++) {
3500                 if ((auio.uio_resid += tiov->iov_len) < 0) {
3501                         error = EINVAL;
3502                         goto sctp_bad;
3503                 }
3504         }
3505         len = auio.uio_resid;
3506         CURVNET_SET(so->so_vnet);
3507         error = sctp_lower_sosend(so, to, &auio,
3508                     (struct mbuf *)NULL, (struct mbuf *)NULL,
3509                     uap->flags, u_sinfo, td);
3510         CURVNET_RESTORE();
3511         if (error != 0) {
3512                 if (auio.uio_resid != len && (error == ERESTART ||
3513                     error == EINTR || error == EWOULDBLOCK))
3514                         error = 0;
3515                 /* Generation of SIGPIPE can be controlled per socket */
3516                 if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
3517                     !(uap->flags & MSG_NOSIGNAL)) {
3518                         PROC_LOCK(td->td_proc);
3519                         tdsignal(td, SIGPIPE);
3520                         PROC_UNLOCK(td->td_proc);
3521                 }
3522         }
3523         if (error == 0)
3524                 td->td_retval[0] = len - auio.uio_resid;
3525 #ifdef KTRACE
3526         if (ktruio != NULL) {
3527                 ktruio->uio_resid = td->td_retval[0];
3528                 ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
3529         }
3530 #endif /* KTRACE */
3531 sctp_bad:
3532         free(iov, M_IOV);
3533 sctp_bad1:
3534         if (fp != NULL)
3535                 fdrop(fp, td);
3536 sctp_bad2:
3537         free(to, M_SONAME);
3538         return (error);
3539 #else  /* SCTP */
3540         return (EOPNOTSUPP);
3541 #endif /* SCTP */
3542 }
3543
3544 int
3545 sys_sctp_generic_recvmsg(td, uap)
3546         struct thread *td;
3547         struct sctp_generic_recvmsg_args /* {
3548                 int sd,
3549                 struct iovec *iov,
3550                 int iovlen,
3551                 struct sockaddr *from,
3552                 __socklen_t *fromlenaddr,
3553                 struct sctp_sndrcvinfo *sinfo,
3554                 int *msg_flags
3555         } */ *uap;
3556 {
3557 #if (defined(INET) || defined(INET6)) && defined(SCTP)
3558         uint8_t sockbufstore[256];
3559         struct uio auio;
3560         struct iovec *iov, *tiov;
3561         struct sctp_sndrcvinfo sinfo;
3562         struct socket *so;
3563         struct file *fp = NULL;
3564         struct sockaddr *fromsa;
3565         cap_rights_t rights;
3566 #ifdef KTRACE
3567         struct uio *ktruio = NULL;
3568 #endif
3569         ssize_t len;
3570         int error, fromlen, i, msg_flags;
3571
3572         AUDIT_ARG_FD(uap->sd);
3573         error = getsock_cap(td->td_proc->p_fd, uap->sd,
3574             cap_rights_init(&rights, CAP_RECV), &fp, NULL);
3575         if (error != 0)
3576                 return (error);
3577 #ifdef COMPAT_FREEBSD32
3578         if (SV_CURPROC_FLAG(SV_ILP32))
3579                 error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
3580                     uap->iovlen, &iov, EMSGSIZE);
3581         else
3582 #endif
3583                 error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
3584         if (error != 0)
3585                 goto out1;
3586
3587         so = fp->f_data;
3588         if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
3589                 error = EOPNOTSUPP;
3590                 goto out;
3591         }
3592 #ifdef MAC
3593         error = mac_socket_check_receive(td->td_ucred, so);
3594         if (error != 0)
3595                 goto out;
3596 #endif /* MAC */
3597
3598         if (uap->fromlenaddr != NULL) {
3599                 error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen));
3600                 if (error != 0)
3601                         goto out;
3602         } else {
3603                 fromlen = 0;
3604         }
3605         if (uap->msg_flags) {
3606                 error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
3607                 if (error != 0)
3608                         goto out;
3609         } else {
3610                 msg_flags = 0;
3611         }
3612         auio.uio_iov = iov;
3613         auio.uio_iovcnt = uap->iovlen;
3614         auio.uio_segflg = UIO_USERSPACE;
3615         auio.uio_rw = UIO_READ;
3616         auio.uio_td = td;
3617         auio.uio_offset = 0;                    /* XXX */
3618         auio.uio_resid = 0;
3619         tiov = iov;
3620         for (i = 0; i <uap->iovlen; i++, tiov++) {
3621                 if ((auio.uio_resid += tiov->iov_len) < 0) {
3622                         error = EINVAL;
3623                         goto out;
3624                 }
3625         }
3626         len = auio.uio_resid;
3627         fromsa = (struct sockaddr *)sockbufstore;
3628
3629 #ifdef KTRACE
3630         if (KTRPOINT(td, KTR_GENIO))
3631                 ktruio = cloneuio(&auio);
3632 #endif /* KTRACE */
3633         memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
3634         CURVNET_SET(so->so_vnet);
3635         error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
3636                     fromsa, fromlen, &msg_flags,
3637                     (struct sctp_sndrcvinfo *)&sinfo, 1);
3638         CURVNET_RESTORE();
3639         if (error != 0) {
3640                 if (auio.uio_resid != len && (error == ERESTART ||
3641                     error == EINTR || error == EWOULDBLOCK))
3642                         error = 0;
3643         } else {
3644                 if (uap->sinfo)
3645                         error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
3646         }
3647 #ifdef KTRACE
3648         if (ktruio != NULL) {
3649                 ktruio->uio_resid = len - auio.uio_resid;
3650                 ktrgenio(uap->sd, UIO_READ, ktruio, error);
3651         }
3652 #endif /* KTRACE */
3653         if (error != 0)
3654                 goto out;
3655         td->td_retval[0] = len - auio.uio_resid;
3656
3657         if (fromlen && uap->from) {
3658                 len = fromlen;
3659                 if (len <= 0 || fromsa == 0)
3660                         len = 0;
3661                 else {
3662                         len = MIN(len, fromsa->sa_len);
3663                         error = copyout(fromsa, uap->from, (size_t)len);
3664                         if (error != 0)
3665                                 goto out;
3666                 }
3667                 error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
3668                 if (error != 0)
3669                         goto out;
3670         }
3671 #ifdef KTRACE
3672         if (KTRPOINT(td, KTR_STRUCT))
3673                 ktrsockaddr(fromsa);
3674 #endif
3675         if (uap->msg_flags) {
3676                 error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
3677                 if (error != 0)
3678                         goto out;
3679         }
3680 out:
3681         free(iov, M_IOV);
3682 out1:
3683         if (fp != NULL)
3684                 fdrop(fp, td);
3685
3686         return (error);
3687 #else  /* SCTP */
3688         return (EOPNOTSUPP);
3689 #endif /* SCTP */
3690 }