sys/kern/uipc_syscalls.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1989, 1990, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * sendfile(2) and related extensions:
   6  * Copyright (c) 1998, David Greenman. All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  * 4. Neither the name of the University nor the names of its contributors
  17  *    may be used to endorse or promote products derived from this software
  18  *    without specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30  * SUCH DAMAGE.
  31  *
  32  *      @(#)uipc_syscalls.c     8.4 (Berkeley) 2/21/94
  33  */
  34
  35 #include <sys/cdefs.h>
  36 __FBSDID("$FreeBSD$");
  37
  38 #include "opt_capsicum.h"
  39 #include "opt_inet.h"
  40 #include "opt_inet6.h"
  41 #include "opt_sctp.h"
  42 #include "opt_compat.h"
  43 #include "opt_ktrace.h"
  44
  45 #include <sys/param.h>
  46 #include <sys/systm.h>
  47 #include <sys/capsicum.h>
  48 #include <sys/condvar.h>
  49 #include <sys/kernel.h>
  50 #include <sys/lock.h>
  51 #include <sys/mutex.h>
  52 #include <sys/sysproto.h>
  53 #include <sys/malloc.h>
  54 #include <sys/filedesc.h>
  55 #include <sys/event.h>
  56 #include <sys/proc.h>
  57 #include <sys/fcntl.h>
  58 #include <sys/file.h>
  59 #include <sys/filio.h>
  60 #include <sys/jail.h>
  61 #include <sys/mman.h>
  62 #include <sys/mount.h>
  63 #include <sys/mbuf.h>
  64 #include <sys/protosw.h>
  65 #include <sys/rwlock.h>
  66 #include <sys/sf_buf.h>
  67 #include <sys/sf_sync.h>
  68 #include <sys/sf_base.h>
  69 #include <sys/sysent.h>
  70 #include <sys/socket.h>
  71 #include <sys/socketvar.h>
  72 #include <sys/signalvar.h>
  73 #include <sys/syscallsubr.h>
  74 #include <sys/sysctl.h>
  75 #include <sys/uio.h>
  76 #include <sys/vnode.h>
  77 #ifdef KTRACE
  78 #include <sys/ktrace.h>
  79 #endif
  80 #ifdef COMPAT_FREEBSD32
  81 #include <compat/freebsd32/freebsd32_util.h>
  82 #endif
  83
  84 #include <net/vnet.h>
  85
  86 #include <security/audit/audit.h>
  87 #include <security/mac/mac_framework.h>
  88
  89 #include <vm/vm.h>
  90 #include <vm/vm_param.h>
  91 #include <vm/vm_object.h>
  92 #include <vm/vm_page.h>
  93 #include <vm/vm_pager.h>
  94 #include <vm/vm_kern.h>
  95 #include <vm/vm_extern.h>
  96 #include <vm/uma.h>
  97
  98 #if defined(INET) || defined(INET6)
  99 #ifdef SCTP
 100 #include <netinet/sctp.h>
 101 #include <netinet/sctp_peeloff.h>
 102 #endif /* SCTP */
 103 #endif /* INET || INET6 */
 104
 105 /*
 106  * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
 107  * and SOCK_NONBLOCK.
 108  */
 109 #define ACCEPT4_INHERIT 0x1
 110 #define ACCEPT4_COMPAT  0x2
 111
 112 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
 113 static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
 114
 115 static int accept1(struct thread *td, int s, struct sockaddr *uname,
 116                    socklen_t *anamelen, int flags);
 117 static int do_sendfile(struct thread *td, struct sendfile_args *uap,
 118                    int compat);
 119 static int getsockname1(struct thread *td, struct getsockname_args *uap,
 120                         int compat);
 121 static int getpeername1(struct thread *td, struct getpeername_args *uap,
 122                         int compat);
 123
 124 counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
 125
 126 static int      filt_sfsync_attach(struct knote *kn);
 127 static void     filt_sfsync_detach(struct knote *kn);
 128 static int      filt_sfsync(struct knote *kn, long hint);
 129
 130 /*
 131  * sendfile(2)-related variables and associated sysctls
 132  */
 133 static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0,
 134     "sendfile(2) tunables");
 135 static int sfreadahead = 1;
 136 SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW,
 137     &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks");
 138
 139 #ifdef  SFSYNC_DEBUG
 140 static int sf_sync_debug = 0;
 141 SYSCTL_INT(_debug, OID_AUTO, sf_sync_debug, CTLFLAG_RW,
 142     &sf_sync_debug, 0, "Output debugging during sf_sync lifecycle");
 143 #define SFSYNC_DPRINTF(s, ...)                          \
 144                 do {                                    \
 145                         if (sf_sync_debug)              \
 146                                 printf((s), ##__VA_ARGS__); \
 147                 } while (0)
 148 #else
 149 #define SFSYNC_DPRINTF(c, ...)
 150 #endif
 151
 152 static uma_zone_t       zone_sfsync;
 153
 154 static struct filterops sendfile_filtops = {
 155         .f_isfd = 0,
 156         .f_attach = filt_sfsync_attach,
 157         .f_detach = filt_sfsync_detach,
 158         .f_event = filt_sfsync,
 159 };
 160
 161 static void
 162 sfstat_init(const void *unused)
 163 {
 164
 165         COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
 166             M_WAITOK);
 167 }
 168 SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
 169
 170 static void
 171 sf_sync_init(const void *unused)
 172 {
 173
 174         zone_sfsync = uma_zcreate("sendfile_sync", sizeof(struct sendfile_sync),
 175             NULL, NULL,
 176             NULL, NULL,
 177             UMA_ALIGN_CACHE,
 178             0);
 179         kqueue_add_filteropts(EVFILT_SENDFILE, &sendfile_filtops);
 180 }
 181 SYSINIT(sf_sync, SI_SUB_MBUF, SI_ORDER_FIRST, sf_sync_init, NULL);
 182
 183 static int
 184 sfstat_sysctl(SYSCTL_HANDLER_ARGS)
 185 {
 186         struct sfstat s;
 187
 188         COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
 189         if (req->newptr)
 190                 COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
 191         return (SYSCTL_OUT(req, &s, sizeof(s)));
 192 }
 193 SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
 194     NULL, 0, sfstat_sysctl, "I", "sendfile statistics");
 195
 196 /*
 197  * Convert a user file descriptor to a kernel file entry and check if required
 198  * capability rights are present.
 199  * A reference on the file entry is held upon returning.
 200  */
 201 static int
 202 getsock_cap(struct filedesc *fdp, int fd, cap_rights_t *rightsp,
 203     struct file **fpp, u_int *fflagp)
 204 {
 205         struct file *fp;
 206         int error;
 207
 208         error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
 209         if (error != 0)
 210                 return (error);
 211         if (fp->f_type != DTYPE_SOCKET) {
 212                 fdrop(fp, curthread);
 213                 return (ENOTSOCK);
 214         }
 215         if (fflagp != NULL)
 216                 *fflagp = fp->f_flag;
 217         *fpp = fp;
 218         return (0);
 219 }
 220
 221 /*
 222  * System call interface to the socket abstraction.
 223  */
 224 #if defined(COMPAT_43)
 225 #define COMPAT_OLDSOCK
 226 #endif
 227
 228 int
 229 sys_socket(td, uap)
 230         struct thread *td;
 231         struct socket_args /* {
 232                 int     domain;
 233                 int     type;
 234                 int     protocol;
 235         } */ *uap;
 236 {
 237         struct socket *so;
 238         struct file *fp;
 239         int fd, error, type, oflag, fflag;
 240
 241         AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
 242
 243         type = uap->type;
 244         oflag = 0;
 245         fflag = 0;
 246         if ((type & SOCK_CLOEXEC) != 0) {
 247                 type &= ~SOCK_CLOEXEC;
 248                 oflag |= O_CLOEXEC;
 249         }
 250         if ((type & SOCK_NONBLOCK) != 0) {
 251                 type &= ~SOCK_NONBLOCK;
 252                 fflag |= FNONBLOCK;
 253         }
 254
 255 #ifdef MAC
 256         error = mac_socket_check_create(td->td_ucred, uap->domain, type,
 257             uap->protocol);
 258         if (error != 0)
 259                 return (error);
 260 #endif
 261         error = falloc(td, &fp, &fd, oflag);
 262         if (error != 0)
 263                 return (error);
 264         /* An extra reference on `fp' has been held for us by falloc(). */
 265         error = socreate(uap->domain, &so, type, uap->protocol,
 266             td->td_ucred, td);
 267         if (error != 0) {
 268                 fdclose(td->td_proc->p_fd, fp, fd, td);
 269         } else {
 270                 finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
 271                 if ((fflag & FNONBLOCK) != 0)
 272                         (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
 273                 td->td_retval[0] = fd;
 274         }
 275         fdrop(fp, td);
 276         return (error);
 277 }
 278
 279 /* ARGSUSED */
 280 int
 281 sys_bind(td, uap)
 282         struct thread *td;
 283         struct bind_args /* {
 284                 int     s;
 285                 caddr_t name;
 286                 int     namelen;
 287         } */ *uap;
 288 {
 289         struct sockaddr *sa;
 290         int error;
 291
 292         error = getsockaddr(&sa, uap->name, uap->namelen);
 293         if (error == 0) {
 294                 error = kern_bind(td, uap->s, sa);
 295                 free(sa, M_SONAME);
 296         }
 297         return (error);
 298 }
 299
 300 static int
 301 kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
 302 {
 303         struct socket *so;
 304         struct file *fp;
 305         cap_rights_t rights;
 306         int error;
 307
 308         AUDIT_ARG_FD(fd);
 309         AUDIT_ARG_SOCKADDR(td, dirfd, sa);
 310         error = getsock_cap(td->td_proc->p_fd, fd,
 311             cap_rights_init(&rights, CAP_BIND), &fp, NULL);
 312         if (error != 0)
 313                 return (error);
 314         so = fp->f_data;
 315 #ifdef KTRACE
 316         if (KTRPOINT(td, KTR_STRUCT))
 317                 ktrsockaddr(sa);
 318 #endif
 319 #ifdef MAC
 320         error = mac_socket_check_bind(td->td_ucred, so, sa);
 321         if (error == 0) {
 322 #endif
 323                 if (dirfd == AT_FDCWD)
 324                         error = sobind(so, sa, td);
 325                 else
 326                         error = sobindat(dirfd, so, sa, td);
 327 #ifdef MAC
 328         }
 329 #endif
 330         fdrop(fp, td);
 331         return (error);
 332 }
 333
 334 int
 335 kern_bind(struct thread *td, int fd, struct sockaddr *sa)
 336 {
 337
 338         return (kern_bindat(td, AT_FDCWD, fd, sa));
 339 }
 340
 341 /* ARGSUSED */
 342 int
 343 sys_bindat(td, uap)
 344         struct thread *td;
 345         struct bindat_args /* {
 346                 int     fd;
 347                 int     s;
 348                 caddr_t name;
 349                 int     namelen;
 350         } */ *uap;
 351 {
 352         struct sockaddr *sa;
 353         int error;
 354
 355         error = getsockaddr(&sa, uap->name, uap->namelen);
 356         if (error == 0) {
 357                 error = kern_bindat(td, uap->fd, uap->s, sa);
 358                 free(sa, M_SONAME);
 359         }
 360         return (error);
 361 }
 362
 363 /* ARGSUSED */
 364 int
 365 sys_listen(td, uap)
 366         struct thread *td;
 367         struct listen_args /* {
 368                 int     s;
 369                 int     backlog;
 370         } */ *uap;
 371 {
 372         struct socket *so;
 373         struct file *fp;
 374         cap_rights_t rights;
 375         int error;
 376
 377         AUDIT_ARG_FD(uap->s);
 378         error = getsock_cap(td->td_proc->p_fd, uap->s,
 379             cap_rights_init(&rights, CAP_LISTEN), &fp, NULL);
 380         if (error == 0) {
 381                 so = fp->f_data;
 382 #ifdef MAC
 383                 error = mac_socket_check_listen(td->td_ucred, so);
 384                 if (error == 0)
 385 #endif
 386                         error = solisten(so, uap->backlog, td);
 387                 fdrop(fp, td);
 388         }
 389         return(error);
 390 }
 391
 392 /*
 393  * accept1()
 394  */
 395 static int
 396 accept1(td, s, uname, anamelen, flags)
 397         struct thread *td;
 398         int s;
 399         struct sockaddr *uname;
 400         socklen_t *anamelen;
 401         int flags;
 402 {
 403         struct sockaddr *name;
 404         socklen_t namelen;
 405         struct file *fp;
 406         int error;
 407
 408         if (uname == NULL)
 409                 return (kern_accept4(td, s, NULL, NULL, flags, NULL));
 410
 411         error = copyin(anamelen, &namelen, sizeof (namelen));
 412         if (error != 0)
 413                 return (error);
 414
 415         error = kern_accept4(td, s, &name, &namelen, flags, &fp);
 416
 417         if (error != 0)
 418                 return (error);
 419
 420         if (error == 0 && uname != NULL) {
 421 #ifdef COMPAT_OLDSOCK
 422                 if (flags & ACCEPT4_COMPAT)
 423                         ((struct osockaddr *)name)->sa_family =
 424                             name->sa_family;
 425 #endif
 426                 error = copyout(name, uname, namelen);
 427         }
 428         if (error == 0)
 429                 error = copyout(&namelen, anamelen,
 430                     sizeof(namelen));
 431         if (error != 0)
 432                 fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
 433         fdrop(fp, td);
 434         free(name, M_SONAME);
 435         return (error);
 436 }
 437
 438 int
 439 kern_accept(struct thread *td, int s, struct sockaddr **name,
 440     socklen_t *namelen, struct file **fp)
 441 {
 442         return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
 443 }
 444
 445 int
 446 kern_accept4(struct thread *td, int s, struct sockaddr **name,
 447     socklen_t *namelen, int flags, struct file **fp)
 448 {
 449         struct filedesc *fdp;
 450         struct file *headfp, *nfp = NULL;
 451         struct sockaddr *sa = NULL;
 452         struct socket *head, *so;
 453         cap_rights_t rights;
 454         u_int fflag;
 455         pid_t pgid;
 456         int error, fd, tmp;
 457
 458         if (name != NULL)
 459                 *name = NULL;
 460
 461         AUDIT_ARG_FD(s);
 462         fdp = td->td_proc->p_fd;
 463         error = getsock_cap(fdp, s, cap_rights_init(&rights, CAP_ACCEPT),
 464             &headfp, &fflag);
 465         if (error != 0)
 466                 return (error);
 467         head = headfp->f_data;
 468         if ((head->so_options & SO_ACCEPTCONN) == 0) {
 469                 error = EINVAL;
 470                 goto done;
 471         }
 472 #ifdef MAC
 473         error = mac_socket_check_accept(td->td_ucred, head);
 474         if (error != 0)
 475                 goto done;
 476 #endif
 477         error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
 478         if (error != 0)
 479                 goto done;
 480         ACCEPT_LOCK();
 481         if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
 482                 ACCEPT_UNLOCK();
 483                 error = EWOULDBLOCK;
 484                 goto noconnection;
 485         }
 486         while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
 487                 if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
 488                         head->so_error = ECONNABORTED;
 489                         break;
 490                 }
 491                 error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
 492                     "accept", 0);
 493                 if (error != 0) {
 494                         ACCEPT_UNLOCK();
 495                         goto noconnection;
 496                 }
 497         }
 498         if (head->so_error) {
 499                 error = head->so_error;
 500                 head->so_error = 0;
 501                 ACCEPT_UNLOCK();
 502                 goto noconnection;
 503         }
 504         so = TAILQ_FIRST(&head->so_comp);
 505         KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
 506         KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
 507
 508         /*
 509          * Before changing the flags on the socket, we have to bump the
 510          * reference count.  Otherwise, if the protocol calls sofree(),
 511          * the socket will be released due to a zero refcount.
 512          */
 513         SOCK_LOCK(so);                  /* soref() and so_state update */
 514         soref(so);                      /* file descriptor reference */
 515
 516         TAILQ_REMOVE(&head->so_comp, so, so_list);
 517         head->so_qlen--;
 518         if (flags & ACCEPT4_INHERIT)
 519                 so->so_state |= (head->so_state & SS_NBIO);
 520         else
 521                 so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
 522         so->so_qstate &= ~SQ_COMP;
 523         so->so_head = NULL;
 524
 525         SOCK_UNLOCK(so);
 526         ACCEPT_UNLOCK();
 527
 528         /* An extra reference on `nfp' has been held for us by falloc(). */
 529         td->td_retval[0] = fd;
 530
 531         /* connection has been removed from the listen queue */
 532         KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
 533
 534         if (flags & ACCEPT4_INHERIT) {
 535                 pgid = fgetown(&head->so_sigio);
 536                 if (pgid != 0)
 537                         fsetown(pgid, &so->so_sigio);
 538         } else {
 539                 fflag &= ~(FNONBLOCK | FASYNC);
 540                 if (flags & SOCK_NONBLOCK)
 541                         fflag |= FNONBLOCK;
 542         }
 543
 544         finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
 545         /* Sync socket nonblocking/async state with file flags */
 546         tmp = fflag & FNONBLOCK;
 547         (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
 548         tmp = fflag & FASYNC;
 549         (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
 550         sa = 0;
 551         error = soaccept(so, &sa);
 552         if (error != 0)
 553                 goto noconnection;
 554         if (sa == NULL) {
 555                 if (name)
 556                         *namelen = 0;
 557                 goto done;
 558         }
 559         AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
 560         if (name) {
 561                 /* check sa_len before it is destroyed */
 562                 if (*namelen > sa->sa_len)
 563                         *namelen = sa->sa_len;
 564 #ifdef KTRACE
 565                 if (KTRPOINT(td, KTR_STRUCT))
 566                         ktrsockaddr(sa);
 567 #endif
 568                 *name = sa;
 569                 sa = NULL;
 570         }
 571 noconnection:
 572         free(sa, M_SONAME);
 573
 574         /*
 575          * close the new descriptor, assuming someone hasn't ripped it
 576          * out from under us.
 577          */
 578         if (error != 0)
 579                 fdclose(fdp, nfp, fd, td);
 580
 581         /*
 582          * Release explicitly held references before returning.  We return
 583          * a reference on nfp to the caller on success if they request it.
 584          */
 585 done:
 586         if (fp != NULL) {
 587                 if (error == 0) {
 588                         *fp = nfp;
 589                         nfp = NULL;
 590                 } else
 591                         *fp = NULL;
 592         }
 593         if (nfp != NULL)
 594                 fdrop(nfp, td);
 595         fdrop(headfp, td);
 596         return (error);
 597 }
 598
 599 int
 600 sys_accept(td, uap)
 601         struct thread *td;
 602         struct accept_args *uap;
 603 {
 604
 605         return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
 606 }
 607
 608 int
 609 sys_accept4(td, uap)
 610         struct thread *td;
 611         struct accept4_args *uap;
 612 {
 613
 614         if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
 615                 return (EINVAL);
 616
 617         return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
 618 }
 619
 620 #ifdef COMPAT_OLDSOCK
 621 int
 622 oaccept(td, uap)
 623         struct thread *td;
 624         struct accept_args *uap;
 625 {
 626
 627         return (accept1(td, uap->s, uap->name, uap->anamelen,
 628             ACCEPT4_INHERIT | ACCEPT4_COMPAT));
 629 }
 630 #endif /* COMPAT_OLDSOCK */
 631
 632 /* ARGSUSED */
 633 int
 634 sys_connect(td, uap)
 635         struct thread *td;
 636         struct connect_args /* {
 637                 int     s;
 638                 caddr_t name;
 639                 int     namelen;
 640         } */ *uap;
 641 {
 642         struct sockaddr *sa;
 643         int error;
 644
 645         error = getsockaddr(&sa, uap->name, uap->namelen);
 646         if (error == 0) {
 647                 error = kern_connect(td, uap->s, sa);
 648                 free(sa, M_SONAME);
 649         }
 650         return (error);
 651 }
 652
 653 static int
 654 kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
 655 {
 656         struct socket *so;
 657         struct file *fp;
 658         cap_rights_t rights;
 659         int error, interrupted = 0;
 660
 661         AUDIT_ARG_FD(fd);
 662         AUDIT_ARG_SOCKADDR(td, dirfd, sa);
 663         error = getsock_cap(td->td_proc->p_fd, fd,
 664             cap_rights_init(&rights, CAP_CONNECT), &fp, NULL);
 665         if (error != 0)
 666                 return (error);
 667         so = fp->f_data;
 668         if (so->so_state & SS_ISCONNECTING) {
 669                 error = EALREADY;
 670                 goto done1;
 671         }
 672 #ifdef KTRACE
 673         if (KTRPOINT(td, KTR_STRUCT))
 674                 ktrsockaddr(sa);
 675 #endif
 676 #ifdef MAC
 677         error = mac_socket_check_connect(td->td_ucred, so, sa);
 678         if (error != 0)
 679                 goto bad;
 680 #endif
 681         if (dirfd == AT_FDCWD)
 682                 error = soconnect(so, sa, td);
 683         else
 684                 error = soconnectat(dirfd, so, sa, td);
 685         if (error != 0)
 686                 goto bad;
 687         if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
 688                 error = EINPROGRESS;
 689                 goto done1;
 690         }
 691         SOCK_LOCK(so);
 692         while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
 693                 error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
 694                     "connec", 0);
 695                 if (error != 0) {
 696                         if (error == EINTR || error == ERESTART)
 697                                 interrupted = 1;
 698                         break;
 699                 }
 700         }
 701         if (error == 0) {
 702                 error = so->so_error;
 703                 so->so_error = 0;
 704         }
 705         SOCK_UNLOCK(so);
 706 bad:
 707         if (!interrupted)
 708                 so->so_state &= ~SS_ISCONNECTING;
 709         if (error == ERESTART)
 710                 error = EINTR;
 711 done1:
 712         fdrop(fp, td);
 713         return (error);
 714 }
 715
 716 int
 717 kern_connect(struct thread *td, int fd, struct sockaddr *sa)
 718 {
 719
 720         return (kern_connectat(td, AT_FDCWD, fd, sa));
 721 }
 722
 723 /* ARGSUSED */
 724 int
 725 sys_connectat(td, uap)
 726         struct thread *td;
 727         struct connectat_args /* {
 728                 int     fd;
 729                 int     s;
 730                 caddr_t name;
 731                 int     namelen;
 732         } */ *uap;
 733 {
 734         struct sockaddr *sa;
 735         int error;
 736
 737         error = getsockaddr(&sa, uap->name, uap->namelen);
 738         if (error == 0) {
 739                 error = kern_connectat(td, uap->fd, uap->s, sa);
 740                 free(sa, M_SONAME);
 741         }
 742         return (error);
 743 }
 744
 745 int
 746 kern_socketpair(struct thread *td, int domain, int type, int protocol,
 747     int *rsv)
 748 {
 749         struct filedesc *fdp = td->td_proc->p_fd;
 750         struct file *fp1, *fp2;
 751         struct socket *so1, *so2;
 752         int fd, error, oflag, fflag;
 753
 754         AUDIT_ARG_SOCKET(domain, type, protocol);
 755
 756         oflag = 0;
 757         fflag = 0;
 758         if ((type & SOCK_CLOEXEC) != 0) {
 759                 type &= ~SOCK_CLOEXEC;
 760                 oflag |= O_CLOEXEC;
 761         }
 762         if ((type & SOCK_NONBLOCK) != 0) {
 763                 type &= ~SOCK_NONBLOCK;
 764                 fflag |= FNONBLOCK;
 765         }
 766 #ifdef MAC
 767         /* We might want to have a separate check for socket pairs. */
 768         error = mac_socket_check_create(td->td_ucred, domain, type,
 769             protocol);
 770         if (error != 0)
 771                 return (error);
 772 #endif
 773         error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
 774         if (error != 0)
 775                 return (error);
 776         error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
 777         if (error != 0)
 778                 goto free1;
 779         /* On success extra reference to `fp1' and 'fp2' is set by falloc. */
 780         error = falloc(td, &fp1, &fd, oflag);
 781         if (error != 0)
 782                 goto free2;
 783         rsv[0] = fd;
 784         fp1->f_data = so1;      /* so1 already has ref count */
 785         error = falloc(td, &fp2, &fd, oflag);
 786         if (error != 0)
 787                 goto free3;
 788         fp2->f_data = so2;      /* so2 already has ref count */
 789         rsv[1] = fd;
 790         error = soconnect2(so1, so2);
 791         if (error != 0)
 792                 goto free4;
 793         if (type == SOCK_DGRAM) {
 794                 /*
 795                  * Datagram socket connection is asymmetric.
 796                  */
 797                  error = soconnect2(so2, so1);
 798                  if (error != 0)
 799                         goto free4;
 800         }
 801         finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
 802             &socketops);
 803         finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
 804             &socketops);
 805         if ((fflag & FNONBLOCK) != 0) {
 806                 (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
 807                 (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
 808         }
 809         fdrop(fp1, td);
 810         fdrop(fp2, td);
 811         return (0);
 812 free4:
 813         fdclose(fdp, fp2, rsv[1], td);
 814         fdrop(fp2, td);
 815 free3:
 816         fdclose(fdp, fp1, rsv[0], td);
 817         fdrop(fp1, td);
 818 free2:
 819         if (so2 != NULL)
 820                 (void)soclose(so2);
 821 free1:
 822         if (so1 != NULL)
 823                 (void)soclose(so1);
 824         return (error);
 825 }
 826
 827 int
 828 sys_socketpair(struct thread *td, struct socketpair_args *uap)
 829 {
 830         int error, sv[2];
 831
 832         error = kern_socketpair(td, uap->domain, uap->type,
 833             uap->protocol, sv);
 834         if (error != 0)
 835                 return (error);
 836         error = copyout(sv, uap->rsv, 2 * sizeof(int));
 837         if (error != 0) {
 838                 (void)kern_close(td, sv[0]);
 839                 (void)kern_close(td, sv[1]);
 840         }
 841         return (error);
 842 }
 843
 844 static int
 845 sendit(td, s, mp, flags)
 846         struct thread *td;
 847         int s;
 848         struct msghdr *mp;
 849         int flags;
 850 {
 851         struct mbuf *control;
 852         struct sockaddr *to;
 853         int error;
 854
 855 #ifdef CAPABILITY_MODE
 856         if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
 857                 return (ECAPMODE);
 858 #endif
 859
 860         if (mp->msg_name != NULL) {
 861                 error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
 862                 if (error != 0) {
 863                         to = NULL;
 864                         goto bad;
 865                 }
 866                 mp->msg_name = to;
 867         } else {
 868                 to = NULL;
 869         }
 870
 871         if (mp->msg_control) {
 872                 if (mp->msg_controllen < sizeof(struct cmsghdr)
 873 #ifdef COMPAT_OLDSOCK
 874                     && mp->msg_flags != MSG_COMPAT
 875 #endif
 876                 ) {
 877                         error = EINVAL;
 878                         goto bad;
 879                 }
 880                 error = sockargs(&control, mp->msg_control,
 881                     mp->msg_controllen, MT_CONTROL);
 882                 if (error != 0)
 883                         goto bad;
 884 #ifdef COMPAT_OLDSOCK
 885                 if (mp->msg_flags == MSG_COMPAT) {
 886                         struct cmsghdr *cm;
 887
 888                         M_PREPEND(control, sizeof(*cm), M_WAITOK);
 889                         cm = mtod(control, struct cmsghdr *);
 890                         cm->cmsg_len = control->m_len;
 891                         cm->cmsg_level = SOL_SOCKET;
 892                         cm->cmsg_type = SCM_RIGHTS;
 893                 }
 894 #endif
 895         } else {
 896                 control = NULL;
 897         }
 898
 899         error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
 900
 901 bad:
 902         free(to, M_SONAME);
 903         return (error);
 904 }
 905
 906 int
 907 kern_sendit(td, s, mp, flags, control, segflg)
 908         struct thread *td;
 909         int s;
 910         struct msghdr *mp;
 911         int flags;
 912         struct mbuf *control;
 913         enum uio_seg segflg;
 914 {
 915         struct file *fp;
 916         struct uio auio;
 917         struct iovec *iov;
 918         struct socket *so;
 919         cap_rights_t rights;
 920 #ifdef KTRACE
 921         struct uio *ktruio = NULL;
 922 #endif
 923         ssize_t len;
 924         int i, error;
 925
 926         AUDIT_ARG_FD(s);
 927         cap_rights_init(&rights, CAP_SEND);
 928         if (mp->msg_name != NULL) {
 929                 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
 930                 cap_rights_set(&rights, CAP_CONNECT);
 931         }
 932         error = getsock_cap(td->td_proc->p_fd, s, &rights, &fp, NULL);
 933         if (error != 0)
 934                 return (error);
 935         so = (struct socket *)fp->f_data;
 936
 937 #ifdef KTRACE
 938         if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
 939                 ktrsockaddr(mp->msg_name);
 940 #endif
 941 #ifdef MAC
 942         if (mp->msg_name != NULL) {
 943                 error = mac_socket_check_connect(td->td_ucred, so,
 944                     mp->msg_name);
 945                 if (error != 0)
 946                         goto bad;
 947         }
 948         error = mac_socket_check_send(td->td_ucred, so);
 949         if (error != 0)
 950                 goto bad;
 951 #endif
 952
 953         auio.uio_iov = mp->msg_iov;
 954         auio.uio_iovcnt = mp->msg_iovlen;
 955         auio.uio_segflg = segflg;
 956         auio.uio_rw = UIO_WRITE;
 957         auio.uio_td = td;
 958         auio.uio_offset = 0;                    /* XXX */
 959         auio.uio_resid = 0;
 960         iov = mp->msg_iov;
 961         for (i = 0; i < mp->msg_iovlen; i++, iov++) {
 962                 if ((auio.uio_resid += iov->iov_len) < 0) {
 963                         error = EINVAL;
 964                         goto bad;
 965                 }
 966         }
 967 #ifdef KTRACE
 968         if (KTRPOINT(td, KTR_GENIO))
 969                 ktruio = cloneuio(&auio);
 970 #endif
 971         len = auio.uio_resid;
 972         error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
 973         if (error != 0) {
 974                 if (auio.uio_resid != len && (error == ERESTART ||
 975                     error == EINTR || error == EWOULDBLOCK))
 976                         error = 0;
 977                 /* Generation of SIGPIPE can be controlled per socket */
 978                 if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
 979                     !(flags & MSG_NOSIGNAL)) {
 980                         PROC_LOCK(td->td_proc);
 981                         tdsignal(td, SIGPIPE);
 982                         PROC_UNLOCK(td->td_proc);
 983                 }
 984         }
 985         if (error == 0)
 986                 td->td_retval[0] = len - auio.uio_resid;
 987 #ifdef KTRACE
 988         if (ktruio != NULL) {
 989                 ktruio->uio_resid = td->td_retval[0];
 990                 ktrgenio(s, UIO_WRITE, ktruio, error);
 991         }
 992 #endif
 993 bad:
 994         fdrop(fp, td);
 995         return (error);
 996 }
 997
 998 int
 999 sys_sendto(td, uap)
1000         struct thread *td;
1001         struct sendto_args /* {
1002                 int     s;
1003                 caddr_t buf;
1004                 size_t  len;
1005                 int     flags;
1006                 caddr_t to;
1007                 int     tolen;
1008         } */ *uap;
1009 {
1010         struct msghdr msg;
1011         struct iovec aiov;
1012
1013         msg.msg_name = uap->to;
1014         msg.msg_namelen = uap->tolen;
1015         msg.msg_iov = &aiov;
1016         msg.msg_iovlen = 1;
1017         msg.msg_control = 0;
1018 #ifdef COMPAT_OLDSOCK
1019         msg.msg_flags = 0;
1020 #endif
1021         aiov.iov_base = uap->buf;
1022         aiov.iov_len = uap->len;
1023         return (sendit(td, uap->s, &msg, uap->flags));
1024 }
1025
1026 #ifdef COMPAT_OLDSOCK
1027 int
1028 osend(td, uap)
1029         struct thread *td;
1030         struct osend_args /* {
1031                 int     s;
1032                 caddr_t buf;
1033                 int     len;
1034                 int     flags;
1035         } */ *uap;
1036 {
1037         struct msghdr msg;
1038         struct iovec aiov;
1039
1040         msg.msg_name = 0;
1041         msg.msg_namelen = 0;
1042         msg.msg_iov = &aiov;
1043         msg.msg_iovlen = 1;
1044         aiov.iov_base = uap->buf;
1045         aiov.iov_len = uap->len;
1046         msg.msg_control = 0;
1047         msg.msg_flags = 0;
1048         return (sendit(td, uap->s, &msg, uap->flags));
1049 }
1050
1051 int
1052 osendmsg(td, uap)
1053         struct thread *td;
1054         struct osendmsg_args /* {
1055                 int     s;
1056                 caddr_t msg;
1057                 int     flags;
1058         } */ *uap;
1059 {
1060         struct msghdr msg;
1061         struct iovec *iov;
1062         int error;
1063
1064         error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1065         if (error != 0)
1066                 return (error);
1067         error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1068         if (error != 0)
1069                 return (error);
1070         msg.msg_iov = iov;
1071         msg.msg_flags = MSG_COMPAT;
1072         error = sendit(td, uap->s, &msg, uap->flags);
1073         free(iov, M_IOV);
1074         return (error);
1075 }
1076 #endif
1077
1078 int
1079 sys_sendmsg(td, uap)
1080         struct thread *td;
1081         struct sendmsg_args /* {
1082                 int     s;
1083                 caddr_t msg;
1084                 int     flags;
1085         } */ *uap;
1086 {
1087         struct msghdr msg;
1088         struct iovec *iov;
1089         int error;
1090
1091         error = copyin(uap->msg, &msg, sizeof (msg));
1092         if (error != 0)
1093                 return (error);
1094         error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1095         if (error != 0)
1096                 return (error);
1097         msg.msg_iov = iov;
1098 #ifdef COMPAT_OLDSOCK
1099         msg.msg_flags = 0;
1100 #endif
1101         error = sendit(td, uap->s, &msg, uap->flags);
1102         free(iov, M_IOV);
1103         return (error);
1104 }
1105
/*
 * Receive a message on the socket behind descriptor s.
 *
 * td       - calling thread; on success td_retval[0] is set to the number
 *            of data bytes received.
 * s        - socket descriptor; CAP_RECV is required.
 * mp       - message header.  msg_iov/msg_iovlen describe the (userspace)
 *            data buffers and msg_flags is passed to soreceive() by
 *            reference.  If msg_name is set, up to msg_namelen bytes of
 *            the source address are stored there and msg_namelen is
 *            updated.  If msg_control is set and controlp is NULL, control
 *            data is copied out there and msg_controllen is updated.
 * fromseg  - address space of mp->msg_name only: UIO_USERSPACE uses
 *            copyout(), anything else uses bcopy().
 * controlp - if non-NULL, receives the control mbuf chain on success
 *            (it is set to NULL on entry); otherwise the chain is freed
 *            before returning.
 *
 * Returns 0 or an errno value.  A transfer that moved some data before
 * failing with ERESTART, EINTR or EWOULDBLOCK is reported as a success.
 */
int
kern_recvit(td, s, mp, fromseg, controlp)
        struct thread *td;
        int s;
        struct msghdr *mp;
        enum uio_seg fromseg;
        struct mbuf **controlp;
{
        struct uio auio;
        struct iovec *iov;
        struct mbuf *m, *control = NULL;
        caddr_t ctlbuf;
        struct file *fp;
        struct socket *so;
        struct sockaddr *fromsa = NULL;
        cap_rights_t rights;
#ifdef KTRACE
        struct uio *ktruio = NULL;
#endif
        ssize_t len;
        int error, i;

        if (controlp != NULL)
                *controlp = NULL;

        AUDIT_ARG_FD(s);
        error = getsock_cap(td->td_proc->p_fd, s,
            cap_rights_init(&rights, CAP_RECV), &fp, NULL);
        if (error != 0)
                return (error);
        so = fp->f_data;

#ifdef MAC
        error = mac_socket_check_receive(td->td_ucred, so);
        if (error != 0) {
                fdrop(fp, td);
                return (error);
        }
#endif

        /* The data buffers are always treated as userspace addresses. */
        auio.uio_iov = mp->msg_iov;
        auio.uio_iovcnt = mp->msg_iovlen;
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_rw = UIO_READ;
        auio.uio_td = td;
        auio.uio_offset = 0;                    /* XXX */
        auio.uio_resid = 0;
        iov = mp->msg_iov;
        for (i = 0; i < mp->msg_iovlen; i++, iov++) {
                /* Reject iovec arrays whose total length goes negative. */
                if ((auio.uio_resid += iov->iov_len) < 0) {
                        fdrop(fp, td);
                        return (EINVAL);
                }
        }
#ifdef KTRACE
        if (KTRPOINT(td, KTR_GENIO))
                ktruio = cloneuio(&auio);
#endif
        len = auio.uio_resid;
        /* Control data is only requested when somebody will consume it. */
        error = soreceive(so, &fromsa, &auio, NULL,
            (mp->msg_control || controlp) ? &control : NULL,
            &mp->msg_flags);
        if (error != 0) {
                /* An interrupted receive that moved data is a short read. */
                if (auio.uio_resid != len && (error == ERESTART ||
                    error == EINTR || error == EWOULDBLOCK))
                        error = 0;
        }
        if (fromsa != NULL)
                AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
#ifdef KTRACE
        if (ktruio != NULL) {
                ktruio->uio_resid = len - auio.uio_resid;
                ktrgenio(s, UIO_READ, ktruio, error);
        }
#endif
        if (error != 0)
                goto out;
        td->td_retval[0] = len - auio.uio_resid;
        /* From here on len is reused for the address and control lengths. */
        if (mp->msg_name) {
                len = mp->msg_namelen;
                if (len <= 0 || fromsa == NULL)
                        len = 0;
                else {
                        /* save sa_len before it is destroyed by MSG_COMPAT */
                        len = MIN(len, fromsa->sa_len);
#ifdef COMPAT_OLDSOCK
                        if (mp->msg_flags & MSG_COMPAT)
                                ((struct osockaddr *)fromsa)->sa_family =
                                    fromsa->sa_family;
#endif
                        if (fromseg == UIO_USERSPACE) {
                                error = copyout(fromsa, mp->msg_name,
                                    (unsigned)len);
                                if (error != 0)
                                        goto out;
                        } else
                                bcopy(fromsa, mp->msg_name, len);
                }
                mp->msg_namelen = len;
        }
        if (mp->msg_control && controlp == NULL) {
#ifdef COMPAT_OLDSOCK
                /*
                 * We assume that old recvmsg calls won't receive access
                 * rights and other control info, esp. as control info
                 * is always optional and those options didn't exist in 4.3.
                 * If we receive rights, trim the cmsghdr; anything else
                 * is tossed.
                 */
                if (control && mp->msg_flags & MSG_COMPAT) {
                        if (mtod(control, struct cmsghdr *)->cmsg_level !=
                            SOL_SOCKET ||
                            mtod(control, struct cmsghdr *)->cmsg_type !=
                            SCM_RIGHTS) {
                                mp->msg_controllen = 0;
                                goto out;
                        }
                        control->m_len -= sizeof (struct cmsghdr);
                        control->m_data += sizeof (struct cmsghdr);
                }
#endif
                len = mp->msg_controllen;
                m = control;
                mp->msg_controllen = 0;
                ctlbuf = mp->msg_control;

                /*
                 * Copy out the control chain one mbuf at a time until the
                 * chain ends or the caller's buffer is full.  An mbuf that
                 * does not fit entirely is cut short and MSG_CTRUNC is set.
                 */
                while (m && len > 0) {
                        unsigned int tocopy;

                        if (len >= m->m_len)
                                tocopy = m->m_len;
                        else {
                                mp->msg_flags |= MSG_CTRUNC;
                                tocopy = len;
                        }

                        if ((error = copyout(mtod(m, caddr_t),
                                        ctlbuf, tocopy)) != 0)
                                goto out;

                        ctlbuf += tocopy;
                        len -= tocopy;
                        m = m->m_next;
                }
                /* Report how many control bytes were actually copied out. */
                mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
        }
out:
        fdrop(fp, td);
#ifdef KTRACE
        if (fromsa && KTRPOINT(td, KTR_STRUCT))
                ktrsockaddr(fromsa);
#endif
        free(fromsa, M_SONAME);

        /* Hand the control chain to the caller on success, else free it. */
        if (error == 0 && controlp != NULL)
                *controlp = control;
        else  if (control)
                m_freem(control);

        return (error);
}
1267
1268 static int
1269 recvit(td, s, mp, namelenp)
1270         struct thread *td;
1271         int s;
1272         struct msghdr *mp;
1273         void *namelenp;
1274 {
1275         int error;
1276
1277         error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
1278         if (error != 0)
1279                 return (error);
1280         if (namelenp != NULL) {
1281                 error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
1282 #ifdef COMPAT_OLDSOCK
1283                 if (mp->msg_flags & MSG_COMPAT)
1284                         error = 0;      /* old recvfrom didn't check */
1285 #endif
1286         }
1287         return (error);
1288 }
1289
1290 int
1291 sys_recvfrom(td, uap)
1292         struct thread *td;
1293         struct recvfrom_args /* {
1294                 int     s;
1295                 caddr_t buf;
1296                 size_t  len;
1297                 int     flags;
1298                 struct sockaddr * __restrict    from;
1299                 socklen_t * __restrict fromlenaddr;
1300         } */ *uap;
1301 {
1302         struct msghdr msg;
1303         struct iovec aiov;
1304         int error;
1305
1306         if (uap->fromlenaddr) {
1307                 error = copyin(uap->fromlenaddr,
1308                     &msg.msg_namelen, sizeof (msg.msg_namelen));
1309                 if (error != 0)
1310                         goto done2;
1311         } else {
1312                 msg.msg_namelen = 0;
1313         }
1314         msg.msg_name = uap->from;
1315         msg.msg_iov = &aiov;
1316         msg.msg_iovlen = 1;
1317         aiov.iov_base = uap->buf;
1318         aiov.iov_len = uap->len;
1319         msg.msg_control = 0;
1320         msg.msg_flags = uap->flags;
1321         error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1322 done2:
1323         return (error);
1324 }
1325
1326 #ifdef COMPAT_OLDSOCK
1327 int
1328 orecvfrom(td, uap)
1329         struct thread *td;
1330         struct recvfrom_args *uap;
1331 {
1332
1333         uap->flags |= MSG_COMPAT;
1334         return (sys_recvfrom(td, uap));
1335 }
1336 #endif
1337
1338 #ifdef COMPAT_OLDSOCK
1339 int
1340 orecv(td, uap)
1341         struct thread *td;
1342         struct orecv_args /* {
1343                 int     s;
1344                 caddr_t buf;
1345                 int     len;
1346                 int     flags;
1347         } */ *uap;
1348 {
1349         struct msghdr msg;
1350         struct iovec aiov;
1351
1352         msg.msg_name = 0;
1353         msg.msg_namelen = 0;
1354         msg.msg_iov = &aiov;
1355         msg.msg_iovlen = 1;
1356         aiov.iov_base = uap->buf;
1357         aiov.iov_len = uap->len;
1358         msg.msg_control = 0;
1359         msg.msg_flags = uap->flags;
1360         return (recvit(td, uap->s, &msg, NULL));
1361 }
1362
1363 /*
1364  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1365  * overlays the new one, missing only the flags, and with the (old) access
1366  * rights where the control fields are now.
1367  */
1368 int
1369 orecvmsg(td, uap)
1370         struct thread *td;
1371         struct orecvmsg_args /* {
1372                 int     s;
1373                 struct  omsghdr *msg;
1374                 int     flags;
1375         } */ *uap;
1376 {
1377         struct msghdr msg;
1378         struct iovec *iov;
1379         int error;
1380
1381         error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1382         if (error != 0)
1383                 return (error);
1384         error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1385         if (error != 0)
1386                 return (error);
1387         msg.msg_flags = uap->flags | MSG_COMPAT;
1388         msg.msg_iov = iov;
1389         error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
1390         if (msg.msg_controllen && error == 0)
1391                 error = copyout(&msg.msg_controllen,
1392                     &uap->msg->msg_accrightslen, sizeof (int));
1393         free(iov, M_IOV);
1394         return (error);
1395 }
1396 #endif
1397
1398 int
1399 sys_recvmsg(td, uap)
1400         struct thread *td;
1401         struct recvmsg_args /* {
1402                 int     s;
1403                 struct  msghdr *msg;
1404                 int     flags;
1405         } */ *uap;
1406 {
1407         struct msghdr msg;
1408         struct iovec *uiov, *iov;
1409         int error;
1410
1411         error = copyin(uap->msg, &msg, sizeof (msg));
1412         if (error != 0)
1413                 return (error);
1414         error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1415         if (error != 0)
1416                 return (error);
1417         msg.msg_flags = uap->flags;
1418 #ifdef COMPAT_OLDSOCK
1419         msg.msg_flags &= ~MSG_COMPAT;
1420 #endif
1421         uiov = msg.msg_iov;
1422         msg.msg_iov = iov;
1423         error = recvit(td, uap->s, &msg, NULL);
1424         if (error == 0) {
1425                 msg.msg_iov = uiov;
1426                 error = copyout(&msg, uap->msg, sizeof(msg));
1427         }
1428         free(iov, M_IOV);
1429         return (error);
1430 }
1431
1432 /* ARGSUSED */
1433 int
1434 sys_shutdown(td, uap)
1435         struct thread *td;
1436         struct shutdown_args /* {
1437                 int     s;
1438                 int     how;
1439         } */ *uap;
1440 {
1441         struct socket *so;
1442         struct file *fp;
1443         cap_rights_t rights;
1444         int error;
1445
1446         AUDIT_ARG_FD(uap->s);
1447         error = getsock_cap(td->td_proc->p_fd, uap->s,
1448             cap_rights_init(&rights, CAP_SHUTDOWN), &fp, NULL);
1449         if (error == 0) {
1450                 so = fp->f_data;
1451                 error = soshutdown(so, uap->how);
1452                 fdrop(fp, td);
1453         }
1454         return (error);
1455 }
1456
1457 /* ARGSUSED */
1458 int
1459 sys_setsockopt(td, uap)
1460         struct thread *td;
1461         struct setsockopt_args /* {
1462                 int     s;
1463                 int     level;
1464                 int     name;
1465                 caddr_t val;
1466                 int     valsize;
1467         } */ *uap;
1468 {
1469
1470         return (kern_setsockopt(td, uap->s, uap->level, uap->name,
1471             uap->val, UIO_USERSPACE, uap->valsize));
1472 }
1473
1474 int
1475 kern_setsockopt(td, s, level, name, val, valseg, valsize)
1476         struct thread *td;
1477         int s;
1478         int level;
1479         int name;
1480         void *val;
1481         enum uio_seg valseg;
1482         socklen_t valsize;
1483 {
1484         struct socket *so;
1485         struct file *fp;
1486         struct sockopt sopt;
1487         cap_rights_t rights;
1488         int error;
1489
1490         if (val == NULL && valsize != 0)
1491                 return (EFAULT);
1492         if ((int)valsize < 0)
1493                 return (EINVAL);
1494
1495         sopt.sopt_dir = SOPT_SET;
1496         sopt.sopt_level = level;
1497         sopt.sopt_name = name;
1498         sopt.sopt_val = val;
1499         sopt.sopt_valsize = valsize;
1500         switch (valseg) {
1501         case UIO_USERSPACE:
1502                 sopt.sopt_td = td;
1503                 break;
1504         case UIO_SYSSPACE:
1505                 sopt.sopt_td = NULL;
1506                 break;
1507         default:
1508                 panic("kern_setsockopt called with bad valseg");
1509         }
1510
1511         AUDIT_ARG_FD(s);
1512         error = getsock_cap(td->td_proc->p_fd, s,
1513             cap_rights_init(&rights, CAP_SETSOCKOPT), &fp, NULL);
1514         if (error == 0) {
1515                 so = fp->f_data;
1516                 error = sosetopt(so, &sopt);
1517                 fdrop(fp, td);
1518         }
1519         return(error);
1520 }
1521
1522 /* ARGSUSED */
1523 int
1524 sys_getsockopt(td, uap)
1525         struct thread *td;
1526         struct getsockopt_args /* {
1527                 int     s;
1528                 int     level;
1529                 int     name;
1530                 void * __restrict       val;
1531                 socklen_t * __restrict avalsize;
1532         } */ *uap;
1533 {
1534         socklen_t valsize;
1535         int error;
1536
1537         if (uap->val) {
1538                 error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1539                 if (error != 0)
1540                         return (error);
1541         }
1542
1543         error = kern_getsockopt(td, uap->s, uap->level, uap->name,
1544             uap->val, UIO_USERSPACE, &valsize);
1545
1546         if (error == 0)
1547                 error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1548         return (error);
1549 }
1550
1551 /*
1552  * Kernel version of getsockopt.
1553  * optval can be a userland or userspace. optlen is always a kernel pointer.
1554  */
1555 int
1556 kern_getsockopt(td, s, level, name, val, valseg, valsize)
1557         struct thread *td;
1558         int s;
1559         int level;
1560         int name;
1561         void *val;
1562         enum uio_seg valseg;
1563         socklen_t *valsize;
1564 {
1565         struct socket *so;
1566         struct file *fp;
1567         struct sockopt sopt;
1568         cap_rights_t rights;
1569         int error;
1570
1571         if (val == NULL)
1572                 *valsize = 0;
1573         if ((int)*valsize < 0)
1574                 return (EINVAL);
1575
1576         sopt.sopt_dir = SOPT_GET;
1577         sopt.sopt_level = level;
1578         sopt.sopt_name = name;
1579         sopt.sopt_val = val;
1580         sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
1581         switch (valseg) {
1582         case UIO_USERSPACE:
1583                 sopt.sopt_td = td;
1584                 break;
1585         case UIO_SYSSPACE:
1586                 sopt.sopt_td = NULL;
1587                 break;
1588         default:
1589                 panic("kern_getsockopt called with bad valseg");
1590         }
1591
1592         AUDIT_ARG_FD(s);
1593         error = getsock_cap(td->td_proc->p_fd, s,
1594             cap_rights_init(&rights, CAP_GETSOCKOPT), &fp, NULL);
1595         if (error == 0) {
1596                 so = fp->f_data;
1597                 error = sogetopt(so, &sopt);
1598                 *valsize = sopt.sopt_valsize;
1599                 fdrop(fp, td);
1600         }
1601         return (error);
1602 }
1603
1604 /*
1605  * getsockname1() - Get socket name.
1606  */
1607 /* ARGSUSED */
1608 static int
1609 getsockname1(td, uap, compat)
1610         struct thread *td;
1611         struct getsockname_args /* {
1612                 int     fdes;
1613                 struct sockaddr * __restrict asa;
1614                 socklen_t * __restrict alen;
1615         } */ *uap;
1616         int compat;
1617 {
1618         struct sockaddr *sa;
1619         socklen_t len;
1620         int error;
1621
1622         error = copyin(uap->alen, &len, sizeof(len));
1623         if (error != 0)
1624                 return (error);
1625
1626         error = kern_getsockname(td, uap->fdes, &sa, &len);
1627         if (error != 0)
1628                 return (error);
1629
1630         if (len != 0) {
1631 #ifdef COMPAT_OLDSOCK
1632                 if (compat)
1633                         ((struct osockaddr *)sa)->sa_family = sa->sa_family;
1634 #endif
1635                 error = copyout(sa, uap->asa, (u_int)len);
1636         }
1637         free(sa, M_SONAME);
1638         if (error == 0)
1639                 error = copyout(&len, uap->alen, sizeof(len));
1640         return (error);
1641 }
1642
1643 int
1644 kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
1645     socklen_t *alen)
1646 {
1647         struct socket *so;
1648         struct file *fp;
1649         cap_rights_t rights;
1650         socklen_t len;
1651         int error;
1652
1653         AUDIT_ARG_FD(fd);
1654         error = getsock_cap(td->td_proc->p_fd, fd,
1655             cap_rights_init(&rights, CAP_GETSOCKNAME), &fp, NULL);
1656         if (error != 0)
1657                 return (error);
1658         so = fp->f_data;
1659         *sa = NULL;
1660         CURVNET_SET(so->so_vnet);
1661         error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
1662         CURVNET_RESTORE();
1663         if (error != 0)
1664                 goto bad;
1665         if (*sa == NULL)
1666                 len = 0;
1667         else
1668                 len = MIN(*alen, (*sa)->sa_len);
1669         *alen = len;
1670 #ifdef KTRACE
1671         if (KTRPOINT(td, KTR_STRUCT))
1672                 ktrsockaddr(*sa);
1673 #endif
1674 bad:
1675         fdrop(fp, td);
1676         if (error != 0 && *sa != NULL) {
1677                 free(*sa, M_SONAME);
1678                 *sa = NULL;
1679         }
1680         return (error);
1681 }
1682
/*
 * getsockname(2): current-ABI entry point; runs getsockname1() with the
 * compat flag clear.
 */
int
sys_getsockname(struct thread *td, struct getsockname_args *uap)
{
        int error;

        error = getsockname1(td, uap, 0);
        return (error);
}
1691
1692 #ifdef COMPAT_OLDSOCK
/*
 * Old getsockname(2) entry point; runs getsockname1() with the compat
 * flag set.
 */
int
ogetsockname(struct thread *td, struct getsockname_args *uap)
{
        int error;

        error = getsockname1(td, uap, 1);
        return (error);
}
1701 #endif /* COMPAT_OLDSOCK */
1702
1703 /*
1704  * getpeername1() - Get name of peer for connected socket.
1705  */
1706 /* ARGSUSED */
1707 static int
1708 getpeername1(td, uap, compat)
1709         struct thread *td;
1710         struct getpeername_args /* {
1711                 int     fdes;
1712                 struct sockaddr * __restrict    asa;
1713                 socklen_t * __restrict  alen;
1714         } */ *uap;
1715         int compat;
1716 {
1717         struct sockaddr *sa;
1718         socklen_t len;
1719         int error;
1720
1721         error = copyin(uap->alen, &len, sizeof (len));
1722         if (error != 0)
1723                 return (error);
1724
1725         error = kern_getpeername(td, uap->fdes, &sa, &len);
1726         if (error != 0)
1727                 return (error);
1728
1729         if (len != 0) {
1730 #ifdef COMPAT_OLDSOCK
1731                 if (compat)
1732                         ((struct osockaddr *)sa)->sa_family = sa->sa_family;
1733 #endif
1734                 error = copyout(sa, uap->asa, (u_int)len);
1735         }
1736         free(sa, M_SONAME);
1737         if (error == 0)
1738                 error = copyout(&len, uap->alen, sizeof(len));
1739         return (error);
1740 }
1741
1742 int
1743 kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
1744     socklen_t *alen)
1745 {
1746         struct socket *so;
1747         struct file *fp;
1748         cap_rights_t rights;
1749         socklen_t len;
1750         int error;
1751
1752         AUDIT_ARG_FD(fd);
1753         error = getsock_cap(td->td_proc->p_fd, fd,
1754             cap_rights_init(&rights, CAP_GETPEERNAME), &fp, NULL);
1755         if (error != 0)
1756                 return (error);
1757         so = fp->f_data;
1758         if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1759                 error = ENOTCONN;
1760                 goto done;
1761         }
1762         *sa = NULL;
1763         CURVNET_SET(so->so_vnet);
1764         error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
1765         CURVNET_RESTORE();
1766         if (error != 0)
1767                 goto bad;
1768         if (*sa == NULL)
1769                 len = 0;
1770         else
1771                 len = MIN(*alen, (*sa)->sa_len);
1772         *alen = len;
1773 #ifdef KTRACE
1774         if (KTRPOINT(td, KTR_STRUCT))
1775                 ktrsockaddr(*sa);
1776 #endif
1777 bad:
1778         if (error != 0 && *sa != NULL) {
1779                 free(*sa, M_SONAME);
1780                 *sa = NULL;
1781         }
1782 done:
1783         fdrop(fp, td);
1784         return (error);
1785 }
1786
/*
 * getpeername(2): current-ABI entry point; runs getpeername1() with the
 * compat flag clear.
 */
int
sys_getpeername(struct thread *td, struct getpeername_args *uap)
{
        int error;

        error = getpeername1(td, uap, 0);
        return (error);
}
1795
1796 #ifdef COMPAT_OLDSOCK
/*
 * Old getpeername(2) entry point; runs getpeername1() with the compat
 * flag set.
 */
int
ogetpeername(struct thread *td, struct ogetpeername_args *uap)
{
        struct getpeername_args *args;

        /* XXX uap should have type `getpeername_args *' to begin with. */
        args = (struct getpeername_args *)uap;
        return (getpeername1(td, args, 1));
}
1806 #endif /* COMPAT_OLDSOCK */
1807
1808 int
1809 sockargs(mp, buf, buflen, type)
1810         struct mbuf **mp;
1811         caddr_t buf;
1812         int buflen, type;
1813 {
1814         struct sockaddr *sa;
1815         struct mbuf *m;
1816         int error;
1817
1818         if (buflen > MLEN) {
1819 #ifdef COMPAT_OLDSOCK
1820                 if (type == MT_SONAME && buflen <= 112)
1821                         buflen = MLEN;          /* unix domain compat. hack */
1822                 else
1823 #endif
1824                         if (buflen > MCLBYTES)
1825                                 return (EINVAL);
1826         }
1827         m = m_get2(buflen, M_WAITOK, type, 0);
1828         m->m_len = buflen;
1829         error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1830         if (error != 0)
1831                 (void) m_free(m);
1832         else {
1833                 *mp = m;
1834                 if (type == MT_SONAME) {
1835                         sa = mtod(m, struct sockaddr *);
1836
1837 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1838                         if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1839                                 sa->sa_family = sa->sa_len;
1840 #endif
1841                         sa->sa_len = buflen;
1842                 }
1843         }
1844         return (error);
1845 }
1846
1847 int
1848 getsockaddr(namp, uaddr, len)
1849         struct sockaddr **namp;
1850         caddr_t uaddr;
1851         size_t len;
1852 {
1853         struct sockaddr *sa;
1854         int error;
1855
1856         if (len > SOCK_MAXADDRLEN)
1857                 return (ENAMETOOLONG);
1858         if (len < offsetof(struct sockaddr, sa_data[0]))
1859                 return (EINVAL);
1860         sa = malloc(len, M_SONAME, M_WAITOK);
1861         error = copyin(uaddr, sa, len);
1862         if (error != 0) {
1863                 free(sa, M_SONAME);
1864         } else {
1865 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1866                 if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1867                         sa->sa_family = sa->sa_len;
1868 #endif
1869                 sa->sa_len = len;
1870                 *namp = sa;
1871         }
1872         return (error);
1873 }
1874
1875 static int
1876 filt_sfsync_attach(struct knote *kn)
1877 {
1878         struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_sdata;
1879         struct knlist *knl = &sfs->klist;
1880
1881         SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);
1882
1883         /*
1884          * Validate that we actually received this via the kernel API.
1885          */
1886         if ((kn->kn_flags & EV_FLAG1) == 0)
1887                 return (EPERM);
1888
1889         kn->kn_ptr.p_v = sfs;
1890         kn->kn_flags &= ~EV_FLAG1;
1891
1892         knl->kl_lock(knl->kl_lockarg);
1893         /*
1894          * If we're in the "freeing" state,
1895          * don't allow the add.  That way we don't
1896          * end up racing with some other thread that
1897          * is trying to finish some setup.
1898          */
1899         if (sfs->state == SF_STATE_FREEING) {
1900                 knl->kl_unlock(knl->kl_lockarg);
1901                 return (EINVAL);
1902         }
1903         knlist_add(&sfs->klist, kn, 1);
1904         knl->kl_unlock(knl->kl_lockarg);
1905
1906         return (0);
1907 }
1908
1909 /*
1910  * Called when a knote is being detached.
1911  */
1912 static void
1913 filt_sfsync_detach(struct knote *kn)
1914 {
1915         struct knlist *knl;
1916         struct sendfile_sync *sfs;
1917         int do_free = 0;
1918
1919         sfs = kn->kn_ptr.p_v;
1920         knl = &sfs->klist;
1921
1922         SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);
1923
1924         knl->kl_lock(knl->kl_lockarg);
1925         if (!knlist_empty(knl))
1926                 knlist_remove(knl, kn, 1);
1927
1928         /*
1929          * If the list is empty _AND_ the refcount is 0
1930          * _AND_ we've finished the setup phase and now
1931          * we're in the running phase, we can free the
1932          * underlying sendfile_sync.
1933          *
1934          * But we shouldn't do it before finishing the
1935          * underlying divorce from the knote.
1936          *
1937          * So, we have the sfsync lock held; transition
1938          * it to "freeing", then unlock, then free
1939          * normally.
1940          */
1941         if (knlist_empty(knl)) {
1942                 if (sfs->state == SF_STATE_COMPLETED && sfs->count == 0) {
1943                         SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, "
1944                             "count==0, empty list: time to free!\n",
1945                             __func__,
1946                             (unsigned long long) curthread->td_tid,
1947                             sfs);
1948                         sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
1949                         do_free = 1;
1950                 }
1951         }
1952         knl->kl_unlock(knl->kl_lockarg);
1953
1954         /*
1955          * Only call free if we're the one who has transitioned things
1956          * to free.  Otherwise we could race with another thread that
1957          * is currently tearing things down.
1958          */
1959         if (do_free == 1) {
1960                 SFSYNC_DPRINTF("%s: (%llu) sfs=%p, %s:%d\n",
1961                     __func__,
1962                     (unsigned long long) curthread->td_tid,
1963                     sfs,
1964                     __FILE__,
1965                     __LINE__);
1966                 sf_sync_free(sfs);
1967         }
1968 }
1969
1970 static int
1971 filt_sfsync(struct knote *kn, long hint)
1972 {
1973         struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_ptr.p_v;
1974         int ret;
1975
1976         SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);
1977
1978         /*
1979          * XXX add a lock assertion here!
1980          */
1981         ret = (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED);
1982
1983         return (ret);
1984 }
1985
1986 /*
1987  * Add more references to a vm_page + sf_buf + sendfile_sync.
1988  */
1989 void
1990 sf_ext_ref(void *arg1, void *arg2)
1991 {
1992         struct sf_buf *sf = arg1;
1993         struct sendfile_sync *sfs = arg2;
1994         vm_page_t pg = sf_buf_page(sf);
1995
1996         sf_buf_ref(sf);
1997
1998         vm_page_lock(pg);
1999         vm_page_wire(pg);
2000         vm_page_unlock(pg);
2001
2002         if (sfs != NULL) {
2003                 mtx_lock(&sfs->mtx);
2004                 KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
2005                 sfs->count++;
2006                 mtx_unlock(&sfs->mtx);
2007         }
2008 }
2009
2010 /*
2011  * Detach mapped page and release resources back to the system.
2012  */
2013 void
2014 sf_ext_free(void *arg1, void *arg2)
2015 {
2016         struct sf_buf *sf = arg1;
2017         struct sendfile_sync *sfs = arg2;
2018         vm_page_t pg = sf_buf_page(sf);
2019
2020         sf_buf_free(sf);
2021
2022         vm_page_lock(pg);
2023         vm_page_unwire(pg, PQ_INACTIVE);
2024         /*
2025          * Check for the object going away on us. This can
2026          * happen since we don't hold a reference to it.
2027          * If so, we're responsible for freeing the page.
2028          */
2029         if (pg->wire_count == 0 && pg->object == NULL)
2030                 vm_page_free(pg);
2031         vm_page_unlock(pg);
2032
2033         if (sfs != NULL)
2034                 sf_sync_deref(sfs);
2035 }
2036
2037 /*
2038  * Called to remove a reference to a sf_sync object.
2039  *
2040  * This is generally done during the mbuf free path to signify
2041  * that one of the mbufs in the transaction has been completed.
2042  *
2043  * If we're doing SF_SYNC and the refcount is zero then we'll wake
2044  * up any waiters.
2045  *
2046  * IF we're doing SF_KQUEUE and the refcount is zero then we'll
2047  * fire off the knote.
2048  */
2049 void
2050 sf_sync_deref(struct sendfile_sync *sfs)
2051 {
2052         int do_free = 0;
2053
2054         if (sfs == NULL)
2055                 return;
2056
2057         mtx_lock(&sfs->mtx);
2058         KASSERT(sfs->count> 0, ("Sendfile sync botchup count == 0"));
2059         sfs->count --;
2060
2061         /*
2062          * Only fire off the wakeup / kqueue notification if
2063          * we are in the running state.
2064          */
2065         if (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED) {
2066                 if (sfs->flags & SF_SYNC)
2067                         cv_signal(&sfs->cv);
2068
2069                 if (sfs->flags & SF_KQUEUE) {
2070                         SFSYNC_DPRINTF("%s: (%llu) sfs=%p: knote!\n",
2071                             __func__,
2072                             (unsigned long long) curthread->td_tid,
2073                             sfs);
2074                         KNOTE_LOCKED(&sfs->klist, 1);
2075                 }
2076
2077                 /*
2078                  * If we're not waiting around for a sync,
2079                  * check if the knote list is empty.
2080                  * If it is, we transition to free.
2081                  *
2082                  * XXX I think it's about time I added some state
2083                  * or flag that says whether we're supposed to be
2084                  * waiting around until we've done a signal.
2085                  *
2086                  * XXX Ie, the reason that I don't free it here
2087                  * is because the caller will free the last reference,
2088                  * not us.  That should be codified in some flag
2089                  * that indicates "self-free" rather than checking
2090                  * for SF_SYNC all the time.
2091                  */
2092                 if ((sfs->flags & SF_SYNC) == 0 && knlist_empty(&sfs->klist)) {
2093                         SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, "
2094                             "count==0, empty list: time to free!\n",
2095                             __func__,
2096                             (unsigned long long) curthread->td_tid,
2097                             sfs);
2098                         sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
2099                         do_free = 1;
2100                 }
2101
2102         }
2103         mtx_unlock(&sfs->mtx);
2104
2105         /*
2106          * Attempt to do a free here.
2107          *
2108          * We do this outside of the lock because it may destroy the
2109          * lock in question as it frees things.  We can optimise this
2110          * later.
2111          *
2112          * XXX yes, we should make it a requirement to hold the
2113          * lock across sf_sync_free().
2114          */
2115         if (do_free == 1) {
2116                 SFSYNC_DPRINTF("%s: (%llu) sfs=%p\n",
2117                     __func__,
2118                     (unsigned long long) curthread->td_tid,
2119                     sfs);
2120                 sf_sync_free(sfs);
2121         }
2122 }
2123
2124 /*
2125  * Allocate a sendfile_sync state structure.
2126  *
2127  * For now this only knows about the "sleep" sync, but later it will
2128  * grow various other personalities.
2129  */
2130 struct sendfile_sync *
2131 sf_sync_alloc(uint32_t flags)
2132 {
2133         struct sendfile_sync *sfs;
2134
2135         sfs = uma_zalloc(zone_sfsync, M_WAITOK | M_ZERO);
2136         mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
2137         cv_init(&sfs->cv, "sendfile");
2138         sfs->flags = flags;
2139         sfs->state = SF_STATE_SETUP;
2140         knlist_init_mtx(&sfs->klist, &sfs->mtx);
2141
2142         SFSYNC_DPRINTF("%s: sfs=%p, flags=0x%08x\n", __func__, sfs, sfs->flags);
2143
2144         return (sfs);
2145 }
2146
2147 /*
2148  * Take a reference to a sfsync instance.
2149  *
2150  * This has to map 1:1 to free calls coming in via sf_ext_free(),
2151  * so typically this will be referenced once for each mbuf allocated.
2152  */
2153 void
2154 sf_sync_ref(struct sendfile_sync *sfs)
2155 {
2156
2157         if (sfs == NULL)
2158                 return;
2159
2160         mtx_lock(&sfs->mtx);
2161         sfs->count++;
2162         mtx_unlock(&sfs->mtx);
2163 }
2164
2165 void
2166 sf_sync_syscall_wait(struct sendfile_sync *sfs)
2167 {
2168
2169         if (sfs == NULL)
2170                 return;
2171
2172         KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!",
2173             __func__,
2174             sfs));
2175
2176         /*
2177          * If we're not requested to wait during the syscall,
2178          * don't bother waiting.
2179          */
2180         if ((sfs->flags & SF_SYNC) == 0)
2181                 goto out;
2182
2183         /*
2184          * This is a bit suboptimal and confusing, so bear with me.
2185          *
2186          * Ideally sf_sync_syscall_wait() will wait until
2187          * all pending mbuf transmit operations are done.
2188          * This means that when sendfile becomes async, it'll
2189          * run in the background and will transition from
2190          * RUNNING to COMPLETED when it's finished acquiring
2191          * new things to send.  Then, when the mbufs finish
2192          * sending, COMPLETED + sfs->count == 0 is enough to
2193          * know that no further work is being done.
2194          *
2195          * So, we will sleep on both RUNNING and COMPLETED.
2196          * It's up to the (in progress) async sendfile loop
2197          * to transition the sf_sync from RUNNING to
2198          * COMPLETED so the wakeup above will actually
2199          * do the cv_signal() call.
2200          */
2201         if (sfs->state != SF_STATE_COMPLETED && sfs->state != SF_STATE_RUNNING)
2202                 goto out;
2203
2204         if (sfs->count != 0)
2205                 cv_wait(&sfs->cv, &sfs->mtx);
2206         KASSERT(sfs->count == 0, ("sendfile sync still busy"));
2207
2208 out:
2209         return;
2210 }
2211
2212 /*
2213  * Free an sf_sync if it's appropriate to.
2214  */
2215 void
2216 sf_sync_free(struct sendfile_sync *sfs)
2217 {
2218
2219         if (sfs == NULL)
2220                 return;
2221
2222         SFSYNC_DPRINTF("%s: (%lld) sfs=%p; called; state=%d, flags=0x%08x "
2223             "count=%d\n",
2224             __func__,
2225             (long long) curthread->td_tid,
2226             sfs,
2227             sfs->state,
2228             sfs->flags,
2229             sfs->count);
2230
2231         mtx_lock(&sfs->mtx);
2232
2233         /*
2234          * We keep the sf_sync around if the state is active,
2235          * we are doing kqueue notification and we have active
2236          * knotes.
2237          *
2238          * If the caller wants to free us right this second it
2239          * should transition this to the freeing state.
2240          *
2241          * So, complain loudly if they break this rule.
2242          */
2243         if (sfs->state != SF_STATE_FREEING) {
2244                 printf("%s: (%llu) sfs=%p; not freeing; let's wait!\n",
2245                     __func__,
2246                     (unsigned long long) curthread->td_tid,
2247                     sfs);
2248                 mtx_unlock(&sfs->mtx);
2249                 return;
2250         }
2251
2252         KASSERT(sfs->count == 0, ("sendfile sync still busy"));
2253         cv_destroy(&sfs->cv);
2254         /*
2255          * This doesn't call knlist_detach() on each knote; it just frees
2256          * the entire list.
2257          */
2258         knlist_delete(&sfs->klist, curthread, 1);
2259         mtx_destroy(&sfs->mtx);
2260         SFSYNC_DPRINTF("%s: (%llu) sfs=%p; freeing\n",
2261             __func__,
2262             (unsigned long long) curthread->td_tid,
2263             sfs);
2264         uma_zfree(zone_sfsync, sfs);
2265 }
2266
2267 /*
2268  * Setup a sf_sync to post a kqueue notification when things are complete.
2269  */
2270 int
2271 sf_sync_kqueue_setup(struct sendfile_sync *sfs, struct sf_hdtr_kq *sfkq)
2272 {
2273         struct kevent kev;
2274         int error;
2275
2276         sfs->flags |= SF_KQUEUE;
2277
2278         /* Check the flags are valid */
2279         if ((sfkq->kq_flags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0)
2280                 return (EINVAL);
2281
2282         SFSYNC_DPRINTF("%s: sfs=%p: kqfd=%d, flags=0x%08x, ident=%p, udata=%p\n",
2283             __func__,
2284             sfs,
2285             sfkq->kq_fd,
2286             sfkq->kq_flags,
2287             (void *) sfkq->kq_ident,
2288             (void *) sfkq->kq_udata);
2289
2290         /* Setup and register a knote on the given kqfd. */
2291         kev.ident = (uintptr_t) sfkq->kq_ident;
2292         kev.filter = EVFILT_SENDFILE;
2293         kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | sfkq->kq_flags;
2294         kev.data = (intptr_t) sfs;
2295         kev.udata = sfkq->kq_udata;
2296
2297         error = kqfd_register(sfkq->kq_fd, &kev, curthread, 1);
2298         if (error != 0) {
2299                 SFSYNC_DPRINTF("%s: returned %d\n", __func__, error);
2300         }
2301         return (error);
2302 }
2303
2304 void
2305 sf_sync_set_state(struct sendfile_sync *sfs, sendfile_sync_state_t state,
2306     int islocked)
2307 {
2308         sendfile_sync_state_t old_state;
2309
2310         if (! islocked)
2311                 mtx_lock(&sfs->mtx);
2312
2313         /*
2314          * Update our current state.
2315          */
2316         old_state = sfs->state;
2317         sfs->state = state;
2318         SFSYNC_DPRINTF("%s: (%llu) sfs=%p; going from %d to %d\n",
2319             __func__,
2320             (unsigned long long) curthread->td_tid,
2321             sfs,
2322             old_state,
2323             state);
2324
2325         /*
2326          * If we're transitioning from RUNNING to COMPLETED and the count is
2327          * zero, then post the knote.  The caller may have completed the
2328          * send before we updated the state to COMPLETED and we need to make
2329          * sure this is communicated.
2330          */
2331         if (old_state == SF_STATE_RUNNING
2332             && state == SF_STATE_COMPLETED
2333             && sfs->count == 0
2334             && sfs->flags & SF_KQUEUE) {
2335                 SFSYNC_DPRINTF("%s: (%llu) sfs=%p: triggering knote!\n",
2336                     __func__,
2337                     (unsigned long long) curthread->td_tid,
2338                     sfs);
2339                 KNOTE_LOCKED(&sfs->klist, 1);
2340         }
2341
2342         if (! islocked)
2343                 mtx_unlock(&sfs->mtx);
2344 }
2345
2346 /*
2347  * Set the retval/errno for the given transaction.
2348  *
2349  * This will eventually/ideally be used when the KNOTE is fired off
2350  * to signify the completion of this transaction.
2351  *
2352  * The sfsync lock should be held before entering this function.
2353  */
2354 void
2355 sf_sync_set_retval(struct sendfile_sync *sfs, off_t retval, int xerrno)
2356 {
2357
2358         KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!",
2359             __func__,
2360             sfs));
2361
2362         SFSYNC_DPRINTF("%s: (%llu) sfs=%p: errno=%d, retval=%jd\n",
2363             __func__,
2364             (unsigned long long) curthread->td_tid,
2365             sfs,
2366             xerrno,
2367             (intmax_t) retval);
2368
2369         sfs->retval = retval;
2370         sfs->xerrno = xerrno;
2371 }
2372
/*
 * sendfile(2)
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *       struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send the file 'fd', starting at 'offset', to the socket 's'.  Only
 * 'nbytes' bytes are sent, or everything up to EOF when nbytes == 0.
 * A header and/or trailer may optionally be added to the socket output.
 * If 'sbytes' is given, the total number of bytes sent is stored there.
 */
int
sys_sendfile(struct thread *td, struct sendfile_args *uap)
{
        int error;

        /* compat == 0: this is not the FreeBSD 4 compatibility entry. */
        error = do_sendfile(td, uap, 0);
        return (error);
}
2390
2391 int
2392 _do_sendfile(struct thread *td, int src_fd, int sock_fd, int flags,
2393     int compat, off_t offset, size_t nbytes, off_t *sbytes,
2394     struct uio *hdr_uio,
2395     struct uio *trl_uio, struct sf_hdtr_kq *hdtr_kq)
2396 {
2397         cap_rights_t rights;
2398         struct sendfile_sync *sfs = NULL;
2399         struct file *fp;
2400         int error;
2401         int do_kqueue = 0;
2402         int do_free = 0;
2403
2404         AUDIT_ARG_FD(src_fd);
2405
2406         if (hdtr_kq != NULL)
2407                 do_kqueue = 1;
2408
2409         /*
2410          * sendfile(2) can start at any offset within a file so we require
2411          * CAP_READ+CAP_SEEK = CAP_PREAD.
2412          */
2413         if ((error = fget_read(td, src_fd,
2414             cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) {
2415                 goto out;
2416         }
2417
2418         /*
2419          * IF SF_KQUEUE is set but we haven't copied in anything for
2420          * kqueue data, error out.
2421          */
2422         if (flags & SF_KQUEUE && do_kqueue == 0) {
2423                 SFSYNC_DPRINTF("%s: SF_KQUEUE but no KQUEUE data!\n", __func__);
2424                 goto out;
2425         }
2426
2427         /*
2428          * If we need to wait for completion, initialise the sfsync
2429          * state here.
2430          */
2431         if (flags & (SF_SYNC | SF_KQUEUE))
2432                 sfs = sf_sync_alloc(flags & (SF_SYNC | SF_KQUEUE));
2433
2434         if (flags & SF_KQUEUE) {
2435                 error = sf_sync_kqueue_setup(sfs, hdtr_kq);
2436                 if (error) {
2437                         SFSYNC_DPRINTF("%s: (%llu) error; sfs=%p\n",
2438                             __func__,
2439                             (unsigned long long) curthread->td_tid,
2440                             sfs);
2441                         sf_sync_set_state(sfs, SF_STATE_FREEING, 0);
2442                         sf_sync_free(sfs);
2443                         goto out;
2444                 }
2445         }
2446
2447         /*
2448          * Do the sendfile call.
2449          *
2450          * If this fails, it'll free the mbuf chain which will free up the
2451          * sendfile_sync references.
2452          */
2453         error = fo_sendfile(fp, sock_fd, hdr_uio, trl_uio, offset,
2454             nbytes, sbytes, flags, compat ? SFK_COMPAT : 0, sfs, td);
2455
2456         /*
2457          * If the sendfile call succeeded, transition the sf_sync state
2458          * to RUNNING, then COMPLETED.
2459          *
2460          * If the sendfile call failed, then the sendfile call may have
2461          * actually sent some data first - so we check to see whether
2462          * any data was sent.  If some data was queued (ie, count > 0)
2463          * then we can't call free; we have to wait until the partial
2464          * transaction completes before we continue along.
2465          *
2466          * This has the side effect of firing off the knote
2467          * if the refcount has hit zero by the time we get here.
2468          */
2469         if (sfs != NULL) {
2470                 mtx_lock(&sfs->mtx);
2471                 if (error == 0 || sfs->count > 0) {
2472                         /*
2473                          * When it's time to do async sendfile, the transition
2474                          * to RUNNING signifies that we're actually actively
2475                          * adding and completing mbufs.  When the last disk
2476                          * buffer is read (ie, when we're not doing any
2477                          * further read IO and all subsequent stuff is mbuf
2478                          * transmissions) we'll transition to COMPLETED
2479                          * and when the final mbuf is freed, the completion
2480                          * will be signaled.
2481                          */
2482                         sf_sync_set_state(sfs, SF_STATE_RUNNING, 1);
2483
2484                         /*
2485                          * Set the retval before we signal completed.
2486                          * If we do it the other way around then transitioning to
2487                          * COMPLETED may post the knote before you set the return
2488                          * status!
2489                          *
2490                          * XXX for now, errno is always 0, as we don't post
2491                          * knotes if sendfile failed.  Maybe that'll change later.
2492                          */
2493                         sf_sync_set_retval(sfs, *sbytes, error);
2494
2495                         /*
2496                          * And now transition to completed, which will kick off
2497                          * the knote if required.
2498                          */
2499                         sf_sync_set_state(sfs, SF_STATE_COMPLETED, 1);
2500                 } else {
2501                         /*
2502                          * Error isn't zero, sfs_count is zero, so we
2503                          * won't have some other thing to wake things up.
2504                          * Thus free.
2505                          */
2506                         sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
2507                         do_free = 1;
2508                 }
2509
2510                 /*
2511                  * Next - wait if appropriate.
2512                  */
2513                 sf_sync_syscall_wait(sfs);
2514
2515                 /*
2516                  * If we're not doing kqueue notifications, we can
2517                  * transition this immediately to the freeing state.
2518                  */
2519                 if ((sfs->flags & SF_KQUEUE) == 0) {
2520                         sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
2521                         do_free = 1;
2522                 }
2523
2524                 mtx_unlock(&sfs->mtx);
2525         }
2526
2527         /*
2528          * If do_free is set, free here.
2529          *
2530          * If we're doing no-kqueue notification and it's just sleep notification,
2531          * we also do free; it's the only chance we have.
2532          */
2533         if (sfs != NULL && do_free == 1) {
2534                 sf_sync_free(sfs);
2535         }
2536
2537         /*
2538          * XXX Should we wait until the send has completed before freeing the source
2539          * file handle? It's the previous behaviour, sure, but is it required?
2540          * We've wired down the page references after all.
2541          */
2542         fdrop(fp, td);
2543
2544 out:
2545         /* Return error */
2546         return (error);
2547 }
2548
2549
2550 static int
2551 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
2552 {
2553         struct sf_hdtr hdtr;
2554         struct sf_hdtr_kq hdtr_kq;
2555         struct uio *hdr_uio, *trl_uio;
2556         int error;
2557         off_t sbytes;
2558         int do_kqueue = 0;
2559
2560         /*
2561          * File offset must be positive.  If it goes beyond EOF
2562          * we send only the header/trailer and no payload data.
2563          */
2564         if (uap->offset < 0)
2565                 return (EINVAL);
2566
2567         hdr_uio = trl_uio = NULL;
2568
2569         if (uap->hdtr != NULL) {
2570                 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
2571                 if (error != 0)
2572                         goto out;
2573                 if (hdtr.headers != NULL) {
2574                         error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
2575                         if (error != 0)
2576                                 goto out;
2577                 }
2578                 if (hdtr.trailers != NULL) {
2579                         error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
2580                         if (error != 0)
2581                                 goto out;
2582                 }
2583
2584                 /*
2585                  * If SF_KQUEUE is set, then we need to also copy in
2586                  * the kqueue data after the normal hdtr set and set
2587                  * do_kqueue=1.
2588                  */
2589                 if (uap->flags & SF_KQUEUE) {
2590                         error = copyin(((char *) uap->hdtr) + sizeof(hdtr),
2591                             &hdtr_kq,
2592                             sizeof(hdtr_kq));
2593                         if (error != 0)
2594                                 goto out;
2595                         do_kqueue = 1;
2596                 }
2597         }
2598
2599         /* Call sendfile */
2600         error = _do_sendfile(td, uap->fd, uap->s, uap->flags, compat,
2601             uap->offset, uap->nbytes, &sbytes, hdr_uio, trl_uio, &hdtr_kq);
2602
2603         if (uap->sbytes != NULL) {
2604                 copyout(&sbytes, uap->sbytes, sizeof(off_t));
2605         }
2606 out:
2607         free(hdr_uio, M_IOV);
2608         free(trl_uio, M_IOV);
2609         return (error);
2610 }
2611
#ifdef COMPAT_FREEBSD4
/*
 * FreeBSD 4 compatible sendfile(2) entry point.
 *
 * Repackages the arguments as a struct sendfile_args and runs the common
 * code with compat set to 1.
 */
int
freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
{
        struct sendfile_args native = {
                .fd = uap->fd,
                .s = uap->s,
                .offset = uap->offset,
                .nbytes = uap->nbytes,
                .hdtr = uap->hdtr,
                .sbytes = uap->sbytes,
                .flags = uap->flags,
        };

        return (do_sendfile(td, &native, 1));
}
#endif /* COMPAT_FREEBSD4 */
2629
2630 static int
2631 sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd,
2632     off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res)
2633 {
2634         vm_page_t m;
2635         vm_pindex_t pindex;
2636         ssize_t resid;
2637         int error, readahead, rv;
2638
2639         pindex = OFF_TO_IDX(off);
2640         VM_OBJECT_WLOCK(obj);
2641         m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY |
2642             VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL);
2643
2644         /*
2645          * Check if page is valid for what we need, otherwise initiate I/O.
2646          *
2647          * The non-zero nd argument prevents disk I/O, instead we
2648          * return the caller what he specified in nd.  In particular,
2649          * if we already turned some pages into mbufs, nd == EAGAIN
2650          * and the main function send them the pages before we come
2651          * here again and block.
2652          */
2653         if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) {
2654                 if (vp == NULL)
2655                         vm_page_xunbusy(m);
2656                 VM_OBJECT_WUNLOCK(obj);
2657                 *res = m;
2658                 return (0);
2659         } else if (nd != 0) {
2660                 if (vp == NULL)
2661                         vm_page_xunbusy(m);
2662                 error = nd;
2663                 goto free_page;
2664         }
2665
2666         /*
2667          * Get the page from backing store.
2668          */
2669         error = 0;
2670         if (vp != NULL) {
2671                 VM_OBJECT_WUNLOCK(obj);
2672                 readahead = sfreadahead * MAXBSIZE;
2673
2674                 /*
2675                  * Use vn_rdwr() instead of the pager interface for
2676                  * the vnode, to allow the read-ahead.
2677                  *
2678                  * XXXMAC: Because we don't have fp->f_cred here, we
2679                  * pass in NOCRED.  This is probably wrong, but is
2680                  * consistent with our original implementation.
2681                  */
2682                 error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off),
2683                     UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead /
2684                     bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td);
2685                 SFSTAT_INC(sf_iocnt);
2686                 VM_OBJECT_WLOCK(obj);
2687         } else {
2688                 if (vm_pager_has_page(obj, pindex, NULL, NULL)) {
2689                         rv = vm_pager_get_pages(obj, &m, 1, 0);
2690                         SFSTAT_INC(sf_iocnt);
2691                         m = vm_page_lookup(obj, pindex);
2692                         if (m == NULL)
2693                                 error = EIO;
2694                         else if (rv != VM_PAGER_OK) {
2695                                 vm_page_lock(m);
2696                                 vm_page_free(m);
2697                                 vm_page_unlock(m);
2698                                 m = NULL;
2699                                 error = EIO;
2700                         }
2701                 } else {
2702                         pmap_zero_page(m);
2703                         m->valid = VM_PAGE_BITS_ALL;
2704                         m->dirty = 0;
2705                 }
2706                 if (m != NULL)
2707                         vm_page_xunbusy(m);
2708         }
2709         if (error == 0) {
2710                 *res = m;
2711         } else if (m != NULL) {
2712 free_page:
2713                 vm_page_lock(m);
2714                 vm_page_unwire(m, PQ_INACTIVE);
2715
2716                 /*
2717                  * See if anyone else might know about this page.  If
2718                  * not and it is not valid, then free it.
2719                  */
2720                 if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m))
2721                         vm_page_free(m);
2722                 vm_page_unlock(m);
2723         }
2724         KASSERT(error != 0 || (m->wire_count > 0 &&
2725             vm_page_is_valid(m, off & PAGE_MASK, xfsize)),
2726             ("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off,
2727             xfsize));
2728         VM_OBJECT_WUNLOCK(obj);
2729         return (error);
2730 }
2731
/*
 * Resolve the file behind fp into the VM object that backs it.
 *
 * Two descriptor types are accepted: DTYPE_VNODE, which must refer to a
 * VREG vnode that has a v_object, and DTYPE_SHM.  Anything else yields
 * EINVAL.  For a vnode, *bsize is set to the mount's f_iosize and
 * *obj_size to the size reported by VOP_GETATTR(); for shared memory,
 * *bsize stays 0 and *obj_size is shm_size.
 *
 * On success 0 is returned, *obj_res holds the object with one extra
 * reference that the caller has to drop, and exactly one of *vp_res and
 * *shmfd_res is non-NULL.  On failure an errno value is returned, no
 * reference is held, *vp_res and *shmfd_res are NULL and *obj_res is left
 * untouched; *bsize and *obj_size may already have been written.
 * EBADF is returned when the object is marked OBJ_DEAD.
 */
2732 static int
2733 sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
2734     struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
2735     int *bsize)
2736 {
2737         struct vattr va;
2738         vm_object_t obj;
2739         struct vnode *vp;
2740         struct shmfd *shmfd;
2741         int error;
2742
2743         vp = *vp_res = NULL;
2744         obj = NULL;
2745         shmfd = *shmfd_res = NULL;
2746         *bsize = 0;
2747
2748         /*
2749          * The file descriptor must be a regular file and have a
2750          * backing VM object.
2751          */
2752         if (fp->f_type == DTYPE_VNODE) {
2753                 vp = fp->f_vnode;
2754                 vn_lock(vp, LK_SHARED | LK_RETRY);
2755                 if (vp->v_type != VREG) {
2756                         error = EINVAL;
2757                         goto out;
2758                 }
2759                 *bsize = vp->v_mount->mnt_stat.f_iosize;
2760                 error = VOP_GETATTR(vp, &va, td->td_ucred);
2761                 if (error != 0)
2762                         goto out;
2763                 *obj_size = va.va_size;
2764                 obj = vp->v_object;
2765                 if (obj == NULL) {
2766                         error = EINVAL;
2767                         goto out;
2768                 }
2769         } else if (fp->f_type == DTYPE_SHM) {
                     /*
                      * No call on this path produces a status, so set it
                      * explicitly; otherwise the success return below would
                      * hand back an uninitialized value.
                      */
                     error = 0;
2770                 shmfd = fp->f_data;
2771                 obj = shmfd->shm_object;
2772                 *obj_size = shmfd->shm_size;
2773         } else {
2774                 error = EINVAL;
2775                 goto out;
2776         }
2777
2778         VM_OBJECT_WLOCK(obj);
2779         if ((obj->flags & OBJ_DEAD) != 0) {
2780                 VM_OBJECT_WUNLOCK(obj);
2781                 error = EBADF;
2782                 goto out;
2783         }
2784
2785         /*
2786          * Temporarily increase the backing VM object's reference
2787          * count so that a forced reclamation of its vnode does not
2788          * immediately destroy it.
2789          */
2790         vm_object_reference_locked(obj);
2791         VM_OBJECT_WUNLOCK(obj);
2792         *obj_res = obj;
2793         *vp_res = vp;
2794         *shmfd_res = shmfd;
2795
2796 out:
             /* vp is only set on the vnode path, where it was locked above. */
2797         if (vp != NULL)
2798                 VOP_UNLOCK(vp, 0);
2799         return (error);
2800 }
2801
2802 static int
2803 kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
2804     struct socket **so)
2805 {
2806         cap_rights_t rights;
2807         int error;
2808
2809         *sock_fp = NULL;
2810         *so = NULL;
2811
2812         /*
2813          * The socket must be a stream socket and connected.
2814          */
2815         error = getsock_cap(td->td_proc->p_fd, s, cap_rights_init(&rights,
2816             CAP_SEND), sock_fp, NULL);
2817         if (error != 0)
2818                 return (error);
2819         *so = (*sock_fp)->f_data;
2820         if ((*so)->so_type != SOCK_STREAM)
2821                 return (EINVAL);
2822         if (((*so)->so_state & SS_ISCONNECTED) == 0)
2823                 return (ENOTCONN);
2824         return (0);
2825 }
2826
/*
 * Send file data from fp over the socket sockfd by wrapping the file's
 * pages in sf_buf-backed mbufs and handing them to the protocol's
 * pru_send method.
 *
 *   fp       file to read from; resolved by sendfile_getobj().
 *   sockfd   destination socket descriptor; resolved by
 *            kern_sendfile_getsock().
 *   hdr_uio  optional data queued ahead of the file contents, or NULL.
 *   trl_uio  optional data written after the file contents with
 *            kern_writev(), or NULL.
 *   offset   byte offset in the file at which to start.
 *   nbytes   number of file bytes to send; 0 places no explicit limit and
 *            the transfer runs to the end of the object.
 *   sent     if not NULL, receives the byte count accumulated in sbytes
 *            (header, file data and trailer), also on error.
 *   flags    SF_MNOWAIT and SF_NODISKIO are examined here.
 *   kflags   with SFK_COMPAT the header length is subtracted from nbytes.
 *   sfs      stored in each mbuf's ext_arg2; when not NULL it is
 *            referenced with sf_sync_ref() once per mbuf.
 *   td       calling thread.
 *
 * Returns 0 or an errno value; ERESTART is reported as EINTR.  The object
 * reference taken by sendfile_getobj() and the socket file reference are
 * dropped before returning.
 */
2827 int
2828 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
2829     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
2830     int kflags, struct sendfile_sync *sfs, struct thread *td)
2831 {
2832         struct file *sock_fp;
2833         struct vnode *vp;
2834         struct vm_object *obj;
2835         struct socket *so;
2836         struct mbuf *m;
2837         struct sf_buf *sf;
2838         struct vm_page *pg;
2839         struct shmfd *shmfd;
2840         struct vattr va;
2841         off_t off, xfsize, fsbytes, sbytes, rem, obj_size;
2842         int error, bsize, nd, hdrlen, mnw;
2843
2844         pg = NULL;
2845         obj = NULL;
2846         so = NULL;
2847         m = NULL;
2848         fsbytes = sbytes = 0;
2849         hdrlen = mnw = 0;
2850         rem = nbytes;
2851         obj_size = 0;
2852
2853         error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
2854         if (error != 0)
2855                 return (error);
             /*
              * rem is only an estimate at this point; it feeds the socket
              * buffer space check and is recomputed for every page in the
              * inner loop.
              */
2856         if (rem == 0)
2857                 rem = obj_size;
2858
             /*
              * Failures from here until sblock() below leave through "out".
              * Later ones use "done", which also releases the send buffer
              * lock.
              */
2859         error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so);
2860         if (error != 0)
2861                 goto out;
2862
2863         /*
2864          * Do not wait on memory allocations but return ENOMEM for
2865          * caller to retry later.
2866          * XXX: Experimental.
2867          */
2868         if (flags & SF_MNOWAIT)
2869                 mnw = 1;
2870
2871 #ifdef MAC
2872         error = mac_socket_check_send(td->td_ucred, so);
2873         if (error != 0)
2874                 goto out;
2875 #endif
2876
2877         /* If headers are specified copy them into mbufs. */
2878         if (hdr_uio != NULL) {
2879                 hdr_uio->uio_td = td;
2880                 hdr_uio->uio_rw = UIO_WRITE;
2881                 if (hdr_uio->uio_resid > 0) {
2882                         /*
2883                          * In FBSD < 5.0 the nbytes to send also included
2884                          * the header.  If compat is specified subtract the
2885                          * header size from nbytes.
2886                          */
2887                         if (kflags & SFK_COMPAT) {
2888                                 if (nbytes > hdr_uio->uio_resid)
2889                                         nbytes -= hdr_uio->uio_resid;
2890                                 else
2891                                         nbytes = 0;
2892                         }
2893                         m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
2894                             0, 0, 0);
2895                         if (m == NULL) {
2896                                 error = mnw ? EAGAIN : ENOBUFS;
2897                                 goto out;
2898                         }
2899                         hdrlen = m_length(m, NULL);
2900                 }
2901         }
2902
2903         /*
2904          * Protect against multiple writers to the socket.
2905          *
2906          * XXXRW: Historically this has assumed non-interruptibility, so now
2907          * we implement that, but possibly shouldn't.
2908          */
2909         (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
2910
2911         /*
2912          * Loop through the pages of the file, starting with the requested
2913          * offset. Get a file page (do I/O if necessary), map the file page
2914          * into an sf_buf, attach an mbuf header to the sf_buf, and queue
2915          * it on the socket.
2916          * This is done in two loops.  The inner loop turns as many pages
2917          * as it can, up to available socket buffer space, without blocking
2918          * into mbufs to have it bulk delivered into the socket send buffer.
2919          * The outer loop checks the state and available space of the socket
2920          * and takes care of the overall progress.
2921          */
2922         for (off = offset; ; ) {
2923                 struct mbuf *mtail;
2924                 int loopbytes;
2925                 int space;
2926                 int done;
2927
                     /* Stop once the requested amount of file data went out. */
2928                 if ((nbytes != 0 && nbytes == fsbytes) ||
2929                     (nbytes == 0 && obj_size == fsbytes))
2930                         break;
2931
2932                 mtail = NULL;
2933                 loopbytes = 0;
2934                 space = 0;
2935                 done = 0;
2936
2937                 /*
2938                  * Check the socket state for ongoing connection,
2939                  * no errors and space in socket buffer.
2940                  * If space is low allow for the remainder of the
2941                  * file to be processed if it fits the socket buffer.
2942                  * Otherwise block in waiting for sufficient space
2943                  * to proceed, or if the socket is nonblocking, return
2944                  * to userland with EAGAIN while reporting how far
2945                  * we've come.
2946                  * We wait until the socket buffer has significant free
2947                  * space to do bulk sends.  This makes good use of file
2948                  * system read ahead and allows packet segmentation
2949                  * offloading hardware to take over lots of work.  If
2950                  * we were not careful here we would send off only one
2951                  * sfbuf at a time.
2952                  */
2953                 SOCKBUF_LOCK(&so->so_snd);
2954                 if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
2955                         so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
2956 retry_space:
2957                 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2958                         error = EPIPE;
2959                         SOCKBUF_UNLOCK(&so->so_snd);
2960                         goto done;
2961                 } else if (so->so_error) {
2962                         error = so->so_error;
2963                         so->so_error = 0;
2964                         SOCKBUF_UNLOCK(&so->so_snd);
2965                         goto done;
2966                 }
2967                 space = sbspace(&so->so_snd);
2968                 if (space < rem &&
2969                     (space <= 0 ||
2970                      space < so->so_snd.sb_lowat)) {
2971                         if (so->so_state & SS_NBIO) {
2972                                 SOCKBUF_UNLOCK(&so->so_snd);
2973                                 error = EAGAIN;
2974                                 goto done;
2975                         }
2976                         /*
2977                          * sbwait drops the lock while sleeping.
2978                          * When we loop back to retry_space the
2979                          * state may have changed and we retest
2980                          * for it.
2981                          */
2982                         error = sbwait(&so->so_snd);
2983                         /*
2984                          * An error from sbwait usually indicates that we've
2985                          * been interrupted by a signal. If we've sent anything
2986                          * then return bytes sent, otherwise return the error.
2987                          */
2988                         if (error != 0) {
2989                                 SOCKBUF_UNLOCK(&so->so_snd);
2990                                 goto done;
2991                         }
2992                         goto retry_space;
2993                 }
2994                 SOCKBUF_UNLOCK(&so->so_snd);
2995
2996                 /*
2997                  * Reduce space in the socket buffer by the size of
2998                  * the header mbuf chain.
2999                  * hdrlen is set to 0 after the first loop.
3000                  */
3001                 space -= hdrlen;
3002
3003                 if (vp != NULL) {
3004                         error = vn_lock(vp, LK_SHARED);
3005                         if (error != 0)
3006                                 goto done;
3007                         error = VOP_GETATTR(vp, &va, td->td_ucred);
                             /*
                              * NOTE(review): when off has reached the file
                              * size this leaves through "done" with error
                              * still 0, which bypasses the trailer code
                              * after the loop.
                              */
3008                         if (error != 0 || off >= va.va_size) {
3009                                 VOP_UNLOCK(vp, 0);
3010                                 goto done;
3011                         }
3012                         obj_size = va.va_size;
3013                 }
3014
3015                 /*
3016                  * Loop and construct maximum sized mbuf chain to be bulk
3017                  * dumped into socket buffer.
3018                  */
3019                 while (space > loopbytes) {
3020                         vm_offset_t pgoff;
3021                         struct mbuf *m0;
3022
3023                         /*
3024                          * Calculate the amount to transfer.
3025                          * Not to exceed a page, the EOF,
3026                          * or the passed in nbytes.
3027                          */
3028                         pgoff = (vm_offset_t)(off & PAGE_MASK);
3029                         rem = obj_size - offset;
3030                         if (nbytes != 0)
3031                                 rem = omin(rem, nbytes);
3032                         rem -= fsbytes + loopbytes;
3033                         xfsize = omin(PAGE_SIZE - pgoff, rem);
3034                         xfsize = omin(space - loopbytes, xfsize);
3035                         if (xfsize <= 0) {
3036                                 done = 1;               /* all data sent */
3037                                 break;
3038                         }
3039
3040                         /*
3041                          * Attempt to look up the page.  Allocate
3042                          * if not found or wait and loop if busy.
3043                          */
3044                         if (m != NULL)
3045                                 nd = EAGAIN; /* send what we already got */
3046                         else if ((flags & SF_NODISKIO) != 0)
3047                                 nd = EBUSY;
3048                         else
3049                                 nd = 0;
3050                         error = sendfile_readpage(obj, vp, nd, off,
3051                             xfsize, bsize, td, &pg);
3052                         if (error != 0) {
3053                                 if (error == EAGAIN)
3054                                         error = 0;      /* not a real error */
3055                                 break;
3056                         }
3057
3058                         /*
3059                          * Get a sendfile buf.  When allocating the
3060                          * first buffer for mbuf chain, we usually
3061                          * wait as long as necessary, but this wait
3062                          * can be interrupted.  For consequent
3063                          * buffers, do not sleep, since several
3064                          * threads might exhaust the buffers and then
3065                          * deadlock.
3066                          */
3067                         sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
3068                             SFB_CATCH);
3069                         if (sf == NULL) {
3070                                 SFSTAT_INC(sf_allocfail);
3071                                 vm_page_lock(pg);
3072                                 vm_page_unwire(pg, PQ_INACTIVE);
3073                                 KASSERT(pg->object != NULL,
3074                                     ("%s: object disappeared", __func__));
3075                                 vm_page_unlock(pg);
                                     /*
                                      * Only fail the call when nothing is
                                      * queued yet; otherwise send what has
                                      * been collected so far.
                                      */
3076                                 if (m == NULL)
3077                                         error = (mnw ? EAGAIN : EINTR);
3078                                 break;
3079                         }
3080
3081                         /*
3082                          * Get an mbuf and set it up as having
3083                          * external storage.
3084                          */
3085                         m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
3086                         if (m0 == NULL) {
3087                                 error = (mnw ? EAGAIN : ENOBUFS);
3088                                 sf_ext_free(sf, NULL);
3089                                 break;
3090                         }
3091                         /*
3092                          * Attach EXT_SFBUF external storage.
3093                          */
3094                         m0->m_ext.ext_buf = (caddr_t )sf_buf_kva(sf);
3095                         m0->m_ext.ext_size = PAGE_SIZE;
3096                         m0->m_ext.ext_arg1 = sf;
3097                         m0->m_ext.ext_arg2 = sfs;
3098                         m0->m_ext.ext_type = EXT_SFBUF;
3099                         m0->m_ext.ext_flags = 0;
3100                         m0->m_flags |= (M_EXT|M_RDONLY);
3101                         m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
3102                         m0->m_len = xfsize;
3103
3104                         /* Append to mbuf chain. */
3105                         if (mtail != NULL)
3106                                 mtail->m_next = m0;
3107                         else if (m != NULL)
3108                                 m_last(m)->m_next = m0;
3109                         else
3110                                 m = m0;
3111                         mtail = m0;
3112
3113                         /* Keep track of bits processed. */
3114                         loopbytes += xfsize;
3115                         off += xfsize;
3116
3117                         /*
3118                          * XXX eventually this should be a sfsync
3119                          * method call!
3120                          */
3121                         if (sfs != NULL)
3122                                 sf_sync_ref(sfs);
3123                 }
3124
3125                 if (vp != NULL)
3126                         VOP_UNLOCK(vp, 0);
3127
3128                 /* Add the buffer chain to the socket buffer. */
3129                 if (m != NULL) {
3130                         int mlen, err;
3131
3132                         mlen = m_length(m, NULL);
3133                         SOCKBUF_LOCK(&so->so_snd);
3134                         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3135                                 error = EPIPE;
3136                                 SOCKBUF_UNLOCK(&so->so_snd);
3137                                 goto done;
3138                         }
3139                         SOCKBUF_UNLOCK(&so->so_snd);
3140                         CURVNET_SET(so->so_vnet);
3141                         /* Avoid error aliasing. */
3142                         err = (*so->so_proto->pr_usrreqs->pru_send)
3143                                     (so, 0, m, NULL, NULL, td);
3144                         CURVNET_RESTORE();
3145                         if (err == 0) {
3146                                 /*
3147                                  * We need two counters to get the
3148                                  * file offset and nbytes to send
3149                                  * right:
3150                                  * - sbytes contains the total amount
3151                                  *   of bytes sent, including headers.
3152                                  * - fsbytes contains the total amount
3153                                  *   of bytes sent from the file.
3154                                  */
3155                                 sbytes += mlen;
3156                                 fsbytes += mlen;
                                     /*
                                      * The first chain also carried the
                                      * header; take it back out of the
                                      * file byte count.
                                      */
3157                                 if (hdrlen) {
3158                                         fsbytes -= hdrlen;
3159                                         hdrlen = 0;
3160                                 }
3161                         } else if (error == 0)
                                     /* Keep an earlier page loop error if any. */
3162                                 error = err;
3163                         m = NULL;       /* pru_send always consumes */
3164                 }
3165
3166                 /* Quit outer loop on error or when we're done. */
3167                 if (done)
3168                         break;
3169                 if (error != 0)
3170                         goto done;
3171         }
3172
3173         /*
3174          * Send trailers. Wimp out and use writev(2).
3175          */
3176         if (trl_uio != NULL) {
                     /*
                      * The send buffer lock is released here, so leave
                      * through "out" to skip the sbunlock() at "done".
                      */
3177                 sbunlock(&so->so_snd);
3178                 error = kern_writev(td, sockfd, trl_uio);
3179                 if (error == 0)
3180                         sbytes += td->td_retval[0];
3181                 goto out;
3182         }
3183
3184 done:
3185         sbunlock(&so->so_snd);
3186 out:
3187         /*
3188          * If there was no error we have to clear td->td_retval[0]
3189          * because it may have been set by writev.
3190          */
3191         if (error == 0) {
3192                 td->td_retval[0] = 0;
3193         }
3194         if (sent != NULL) {
3195                 (*sent) = sbytes;
3196         }
3197         if (obj != NULL)
3198                 vm_object_deallocate(obj);
             /*
              * so is only set once kern_sendfile_getsock() obtained the
              * socket's file, so sock_fp is valid whenever so is.
              */
3199         if (so)
3200                 fdrop(sock_fp, td);
             /* A chain that never reached pru_send is still ours to free. */
3201         if (m)
3202                 m_freem(m);
3203
3204         if (error == ERESTART)
3205                 error = EINTR;
3206
3207         return (error);
3208 }
3209
3210 /*
3211  * SCTP syscalls.
3212  * Functionality is only compiled in if SCTP and at least one of INET or
3213  * INET6 are defined in the kernel Makefile; otherwise all return EOPNOTSUPP.
3214  * XXX: We should make this loadable one day.
3215  */
/*
 * sctp_peeloff system call.
 *
 * Checks that uap->sd is an SCTP socket and that sctp_can_peel_off()
 * accepts the association id uap->name, then allocates a descriptor,
 * creates a socket with sonewconn(), binds the two together and calls
 * sctp_do_peeloff() for that association (presumably moving it onto the
 * new socket; confirm against the SCTP code).
 *
 * On success 0 is returned and td->td_retval[0] holds the new descriptor.
 * On failure an errno value is returned; EOPNOTSUPP is used for a non-SCTP
 * socket and, unconditionally, when the kernel lacks SCTP or both INET and
 * INET6.
 */
3216 int
3217 sys_sctp_peeloff(td, uap)
3218         struct thread *td;
3219         struct sctp_peeloff_args /* {
3220                 int     sd;
3221                 caddr_t name;
3222         } */ *uap;
3223 {
3224 #if (defined(INET) || defined(INET6)) && defined(SCTP)
3225         struct file *nfp = NULL;
3226         struct socket *head, *so;
3227         cap_rights_t rights;
3228         u_int fflag;
3229         int error, fd;
3230
3231         AUDIT_ARG_FD(uap->sd);
             /* On failure no socket reference is held, hence done2. */
3232         error = fgetsock(td, uap->sd, cap_rights_init(&rights, CAP_PEELOFF),
3233             &head, &fflag);
3234         if (error != 0)
3235                 goto done2;
3236         if (head->so_proto->pr_protocol != IPPROTO_SCTP) {
3237                 error = EOPNOTSUPP;
3238                 goto done;
3239         }
3240         error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
3241         if (error != 0)
3242                 goto done;
3243         /*
3244          * At this point we know we do have a assoc to pull
3245          * we proceed to get the fd setup. This may block
3246          * but that is ok.
3247          */
3248
3249         error = falloc(td, &nfp, &fd, 0);
3250         if (error != 0)
3251                 goto done;
             /*
              * Publish the descriptor number now; if anything below fails
              * the descriptor is closed again at noconnection.
              */
3252         td->td_retval[0] = fd;
3253
3254         CURVNET_SET(head->so_vnet);
3255         so = sonewconn(head, SS_ISCONNECTED);
3256         if (so == NULL) {
3257                 error = ENOMEM;
3258                 goto noconnection;
3259         }
3260         /*
3261          * Before changing the flags on the socket, we have to bump the
3262          * reference count.  Otherwise, if the protocol calls sofree(),
3263          * the socket will be released due to a zero refcount.
3264          */
3265         SOCK_LOCK(so);
3266         soref(so);                      /* file descriptor reference */
3267         SOCK_UNLOCK(so);
3268
3269         ACCEPT_LOCK();
3270
             /*
              * Unlink so from head->so_comp, inherit head's SS_NBIO setting
              * and clear SS_NOFDREF now that a descriptor refers to the
              * socket.
              */
3271         TAILQ_REMOVE(&head->so_comp, so, so_list);
3272         head->so_qlen--;
3273         so->so_state |= (head->so_state & SS_NBIO);
3274         so->so_state &= ~SS_NOFDREF;
3275         so->so_qstate &= ~SQ_COMP;
3276         so->so_head = NULL;
3277         ACCEPT_UNLOCK();
3278         finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
3279         error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
3280         if (error != 0)
3281                 goto noconnection;
3282         if (head->so_sigio != NULL)
3283                 fsetown(fgetown(&head->so_sigio), &so->so_sigio);
3284
3285 noconnection:
3286         /*
3287          * close the new descriptor, assuming someone hasn't ripped it
3288          * out from under us.
3289          */
3290         if (error != 0)
3291                 fdclose(td->td_proc->p_fd, nfp, fd, td);
3292
3293         /*
3294          * Release explicitly held references before returning.
3295          */
3296         CURVNET_RESTORE();
3297 done:
             /* nfp is non-NULL only once falloc() has succeeded. */
3298         if (nfp != NULL)
3299                 fdrop(nfp, td);
3300         fputsock(head);
3301 done2:
3302         return (error);
3303 #else  /* SCTP */
3304         return (EOPNOTSUPP);
3305 #endif /* SCTP */
3306 }
3307
/*
 * sctp_generic_sendmsg system call: send the single user buffer uap->msg
 * of uap->mlen bytes on the SCTP socket uap->sd.
 *
 *   uap->to, uap->tolen  optional destination address; when tolen is not 0
 *                        the address is copied in and CAP_CONNECT is
 *                        required in addition to CAP_SEND.
 *   uap->sinfo           optional struct sctp_sndrcvinfo, copied in and
 *                        passed to sctp_lower_sosend().
 *   uap->flags           passed to sctp_lower_sosend(); MSG_NOSIGNAL
 *                        suppresses the SIGPIPE otherwise raised on EPIPE.
 *
 * On success 0 is returned and td->td_retval[0] is the number of bytes
 * consumed from the buffer.  A send that fails with ERESTART, EINTR or
 * EWOULDBLOCK after consuming some data is reported as success.
 * EOPNOTSUPP is returned for a non-SCTP socket and, unconditionally, when
 * the kernel lacks SCTP or both INET and INET6.
 */
3308 int
3309 sys_sctp_generic_sendmsg (td, uap)
3310         struct thread *td;
3311         struct sctp_generic_sendmsg_args /* {
3312                 int sd,
3313                 caddr_t msg,
3314                 int mlen,
3315                 caddr_t to,
3316                 __socklen_t tolen,
3317                 struct sctp_sndrcvinfo *sinfo,
3318                 int flags
3319         } */ *uap;
3320 {
3321 #if (defined(INET) || defined(INET6)) && defined(SCTP)
3322         struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
3323         struct socket *so;
3324         struct file *fp = NULL;
3325         struct sockaddr *to = NULL;
3326 #ifdef KTRACE
3327         struct uio *ktruio = NULL;
3328 #endif
3329         struct uio auio;
3330         struct iovec iov[1];
3331         cap_rights_t rights;
3332         int error = 0, len;
3333
3334         if (uap->sinfo != NULL) {
3335                 error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
3336                 if (error != 0)
3337                         return (error);
3338                 u_sinfo = &sinfo;
3339         }
3340
3341         cap_rights_init(&rights, CAP_SEND);
3342         if (uap->tolen != 0) {
3343                 error = getsockaddr(&to, uap->to, uap->tolen);
3344                 if (error != 0) {
3345                         to = NULL;
3346                         goto sctp_bad2;
3347                 }
3348                 cap_rights_set(&rights, CAP_CONNECT);
3349         }
3350
3351         AUDIT_ARG_FD(uap->sd);
3352         error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL);
3353         if (error != 0)
3354                 goto sctp_bad;
3355 #ifdef KTRACE
3356         if (to && (KTRPOINT(td, KTR_STRUCT)))
3357                 ktrsockaddr(to);
3358 #endif
3359
3360         iov[0].iov_base = uap->msg;
3361         iov[0].iov_len = uap->mlen;
3362
3363         so = (struct socket *)fp->f_data;
3364         if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
3365                 error = EOPNOTSUPP;
3366                 goto sctp_bad;
3367         }
3368 #ifdef MAC
3369         error = mac_socket_check_send(td->td_ucred, so);
3370         if (error != 0)
3371                 goto sctp_bad;
3372 #endif /* MAC */
3373
3374         auio.uio_iov =  iov;
3375         auio.uio_iovcnt = 1;
3376         auio.uio_segflg = UIO_USERSPACE;
3377         auio.uio_rw = UIO_WRITE;
3378         auio.uio_td = td;
3379         auio.uio_offset = 0;                    /* XXX */
3380         auio.uio_resid = 0;
3381         len = auio.uio_resid = uap->mlen;
     #ifdef KTRACE
             /*
              * Snapshot the uio before the send consumes it.  Without this
              * ktruio stays NULL and the ktrgenio() call below is never
              * reached.  No exit path lies between here and that call, so
              * the copy is always handed over to ktrgenio().
              */
             if (KTRPOINT(td, KTR_GENIO))
                     ktruio = cloneuio(&auio);
     #endif /* KTRACE */
3382         CURVNET_SET(so->so_vnet);
3383         error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL,
3384             (struct mbuf *)NULL, uap->flags, u_sinfo, td);
3385         CURVNET_RESTORE();
3386         if (error != 0) {
                     /* A partial transfer that was interrupted counts as success. */
3387                 if (auio.uio_resid != len && (error == ERESTART ||
3388                     error == EINTR || error == EWOULDBLOCK))
3389                         error = 0;
3390                 /* Generation of SIGPIPE can be controlled per socket. */
3391                 if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
3392                     !(uap->flags & MSG_NOSIGNAL)) {
3393                         PROC_LOCK(td->td_proc);
3394                         tdsignal(td, SIGPIPE);
3395                         PROC_UNLOCK(td->td_proc);
3396                 }
3397         }
3398         if (error == 0)
3399                 td->td_retval[0] = len - auio.uio_resid;
3400 #ifdef KTRACE
3401         if (ktruio != NULL) {
3402                 ktruio->uio_resid = td->td_retval[0];
3403                 ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
3404         }
3405 #endif /* KTRACE */
3406 sctp_bad:
3407         if (fp != NULL)
3408                 fdrop(fp, td);
3409 sctp_bad2:
3410         free(to, M_SONAME);
3411         return (error);
3412 #else  /* SCTP */
3413         return (EOPNOTSUPP);
3414 #endif /* SCTP */
3415 }
3416
/*
 * sctp_generic_sendmsg_iov system call: send the data described by the
 * user iovec array uap->iov (uap->iovlen entries) on the SCTP socket
 * uap->sd.
 *
 *   uap->to, uap->tolen  optional destination address; when tolen is not 0
 *                        the address is copied in and CAP_CONNECT is
 *                        required in addition to CAP_SEND.
 *   uap->sinfo           optional struct sctp_sndrcvinfo, copied in and
 *                        passed to sctp_lower_sosend().
 *   uap->flags           passed to sctp_lower_sosend(); MSG_NOSIGNAL
 *                        suppresses the SIGPIPE otherwise raised on EPIPE.
 *
 * On success 0 is returned and td->td_retval[0] is the number of bytes
 * consumed.  EINVAL is returned when the summed iovec lengths go negative.
 * A send that fails with ERESTART, EINTR or EWOULDBLOCK after consuming
 * some data is reported as success.  EOPNOTSUPP is returned for a non-SCTP
 * socket and, unconditionally, when the kernel lacks SCTP or both INET and
 * INET6.
 */
3417 int
3418 sys_sctp_generic_sendmsg_iov(td, uap)
3419         struct thread *td;
3420         struct sctp_generic_sendmsg_iov_args /* {
3421                 int sd,
3422                 struct iovec *iov,
3423                 int iovlen,
3424                 caddr_t to,
3425                 __socklen_t tolen,
3426                 struct sctp_sndrcvinfo *sinfo,
3427                 int flags
3428         } */ *uap;
3429 {
3430 #if (defined(INET) || defined(INET6)) && defined(SCTP)
3431         struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
3432         struct socket *so;
3433         struct file *fp = NULL;
3434         struct sockaddr *to = NULL;
3435 #ifdef KTRACE
3436         struct uio *ktruio = NULL;
3437 #endif
3438         struct uio auio;
3439         struct iovec *iov, *tiov;
3440         cap_rights_t rights;
3441         ssize_t len;
3442         int error, i;
3443
3444         if (uap->sinfo != NULL) {
3445                 error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
3446                 if (error != 0)
3447                         return (error);
3448                 u_sinfo = &sinfo;
3449         }
3450         cap_rights_init(&rights, CAP_SEND);
3451         if (uap->tolen != 0) {
3452                 error = getsockaddr(&to, uap->to, uap->tolen);
3453                 if (error != 0) {
3454                         to = NULL;
3455                         goto sctp_bad2;
3456                 }
3457                 cap_rights_set(&rights, CAP_CONNECT);
3458         }
3459
3460         AUDIT_ARG_FD(uap->sd);
3461         error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL);
3462         if (error != 0)
3463                 goto sctp_bad1;
3464
3465 #ifdef COMPAT_FREEBSD32
3466         if (SV_CURPROC_FLAG(SV_ILP32))
3467                 error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
3468                     uap->iovlen, &iov, EMSGSIZE);
3469         else
3470 #endif
3471                 error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
             /* iov was not allocated on failure, so skip its free(). */
3472         if (error != 0)
3473                 goto sctp_bad1;
3474 #ifdef KTRACE
3475         if (to && (KTRPOINT(td, KTR_STRUCT)))
3476                 ktrsockaddr(to);
3477 #endif
3478
3479         so = (struct socket *)fp->f_data;
3480         if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
3481                 error = EOPNOTSUPP;
3482                 goto sctp_bad;
3483         }
3484 #ifdef MAC
3485         error = mac_socket_check_send(td->td_ucred, so);
3486         if (error != 0)
3487                 goto sctp_bad;
3488 #endif /* MAC */
3489
3490         auio.uio_iov = iov;
3491         auio.uio_iovcnt = uap->iovlen;
3492         auio.uio_segflg = UIO_USERSPACE;
3493         auio.uio_rw = UIO_WRITE;
3494         auio.uio_td = td;
3495         auio.uio_offset = 0;                    /* XXX */
3496         auio.uio_resid = 0;
3497         tiov = iov;
             /* Sum the segment lengths, rejecting a total that wraps negative. */
3498         for (i = 0; i <uap->iovlen; i++, tiov++) {
3499                 if ((auio.uio_resid += tiov->iov_len) < 0) {
3500                         error = EINVAL;
3501                         goto sctp_bad;
3502                 }
3503         }
3504         len = auio.uio_resid;
     #ifdef KTRACE
             /*
              * Snapshot the uio before the send consumes it.  Without this
              * ktruio stays NULL and the ktrgenio() call below is never
              * reached.  The copy is taken after the last "goto sctp_bad"
              * so that it is always handed over to ktrgenio().
              */
             if (KTRPOINT(td, KTR_GENIO))
                     ktruio = cloneuio(&auio);
     #endif /* KTRACE */
3505         CURVNET_SET(so->so_vnet);
3506         error = sctp_lower_sosend(so, to, &auio,
3507                     (struct mbuf *)NULL, (struct mbuf *)NULL,
3508                     uap->flags, u_sinfo, td);
3509         CURVNET_RESTORE();
3510         if (error != 0) {
                     /* A partial transfer that was interrupted counts as success. */
3511                 if (auio.uio_resid != len && (error == ERESTART ||
3512                     error == EINTR || error == EWOULDBLOCK))
3513                         error = 0;
3514                 /* Generation of SIGPIPE can be controlled per socket */
3515                 if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
3516                     !(uap->flags & MSG_NOSIGNAL)) {
3517                         PROC_LOCK(td->td_proc);
3518                         tdsignal(td, SIGPIPE);
3519                         PROC_UNLOCK(td->td_proc);
3520                 }
3521         }
3522         if (error == 0)
3523                 td->td_retval[0] = len - auio.uio_resid;
3524 #ifdef KTRACE
3525         if (ktruio != NULL) {
3526                 ktruio->uio_resid = td->td_retval[0];
3527                 ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
3528         }
3529 #endif /* KTRACE */
3530 sctp_bad:
3531         free(iov, M_IOV);
3532 sctp_bad1:
3533         if (fp != NULL)
3534                 fdrop(fp, td);
3535 sctp_bad2:
3536         free(to, M_SONAME);
3537         return (error);
3538 #else  /* SCTP */
3539         return (EOPNOTSUPP);
3540 #endif /* SCTP */
3541 }
3542
3543 int
3544 sys_sctp_generic_recvmsg(td, uap)
3545         struct thread *td;
3546         struct sctp_generic_recvmsg_args /* {
3547                 int sd,
3548                 struct iovec *iov,
3549                 int iovlen,
3550                 struct sockaddr *from,
3551                 __socklen_t *fromlenaddr,
3552                 struct sctp_sndrcvinfo *sinfo,
3553                 int *msg_flags
3554         } */ *uap;
3555 {
3556 #if (defined(INET) || defined(INET6)) && defined(SCTP)
3557         uint8_t sockbufstore[256];
3558         struct uio auio;
3559         struct iovec *iov, *tiov;
3560         struct sctp_sndrcvinfo sinfo;
3561         struct socket *so;
3562         struct file *fp = NULL;
3563         struct sockaddr *fromsa;
3564         cap_rights_t rights;
3565 #ifdef KTRACE
3566         struct uio *ktruio = NULL;
3567 #endif
3568         ssize_t len;
3569         int error, fromlen, i, msg_flags;
3570
3571         AUDIT_ARG_FD(uap->sd);
3572         error = getsock_cap(td->td_proc->p_fd, uap->sd,
3573             cap_rights_init(&rights, CAP_RECV), &fp, NULL);
3574         if (error != 0)
3575                 return (error);
3576 #ifdef COMPAT_FREEBSD32
3577         if (SV_CURPROC_FLAG(SV_ILP32))
3578                 error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
3579                     uap->iovlen, &iov, EMSGSIZE);
3580         else
3581 #endif
3582                 error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
3583         if (error != 0)
3584                 goto out1;
3585
3586         so = fp->f_data;
3587         if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
3588                 error = EOPNOTSUPP;
3589                 goto out;
3590         }
3591 #ifdef MAC
3592         error = mac_socket_check_receive(td->td_ucred, so);
3593         if (error != 0)
3594                 goto out;
3595 #endif /* MAC */
3596
3597         if (uap->fromlenaddr != NULL) {
3598                 error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen));
3599                 if (error != 0)
3600                         goto out;
3601         } else {
3602                 fromlen = 0;
3603         }
3604         if (uap->msg_flags) {
3605                 error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
3606                 if (error != 0)
3607                         goto out;
3608         } else {
3609                 msg_flags = 0;
3610         }
3611         auio.uio_iov = iov;
3612         auio.uio_iovcnt = uap->iovlen;
3613         auio.uio_segflg = UIO_USERSPACE;
3614         auio.uio_rw = UIO_READ;
3615         auio.uio_td = td;
3616         auio.uio_offset = 0;                    /* XXX */
3617         auio.uio_resid = 0;
3618         tiov = iov;
3619         for (i = 0; i <uap->iovlen; i++, tiov++) {
3620                 if ((auio.uio_resid += tiov->iov_len) < 0) {
3621                         error = EINVAL;
3622                         goto out;
3623                 }
3624         }
3625         len = auio.uio_resid;
3626         fromsa = (struct sockaddr *)sockbufstore;
3627
3628 #ifdef KTRACE
3629         if (KTRPOINT(td, KTR_GENIO))
3630                 ktruio = cloneuio(&auio);
3631 #endif /* KTRACE */
3632         memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
3633         CURVNET_SET(so->so_vnet);
3634         error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
3635                     fromsa, fromlen, &msg_flags,
3636                     (struct sctp_sndrcvinfo *)&sinfo, 1);
3637         CURVNET_RESTORE();
3638         if (error != 0) {
3639                 if (auio.uio_resid != len && (error == ERESTART ||
3640                     error == EINTR || error == EWOULDBLOCK))
3641                         error = 0;
3642         } else {
3643                 if (uap->sinfo)
3644                         error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
3645         }
3646 #ifdef KTRACE
3647         if (ktruio != NULL) {
3648                 ktruio->uio_resid = len - auio.uio_resid;
3649                 ktrgenio(uap->sd, UIO_READ, ktruio, error);
3650         }
3651 #endif /* KTRACE */
3652         if (error != 0)
3653                 goto out;
3654         td->td_retval[0] = len - auio.uio_resid;
3655
3656         if (fromlen && uap->from) {
3657                 len = fromlen;
3658                 if (len <= 0 || fromsa == 0)
3659                         len = 0;
3660                 else {
3661                         len = MIN(len, fromsa->sa_len);
3662                         error = copyout(fromsa, uap->from, (size_t)len);
3663                         if (error != 0)
3664                                 goto out;
3665                 }
3666                 error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
3667                 if (error != 0)
3668                         goto out;
3669         }
3670 #ifdef KTRACE
3671         if (KTRPOINT(td, KTR_STRUCT))
3672                 ktrsockaddr(fromsa);
3673 #endif
3674         if (uap->msg_flags) {
3675                 error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
3676                 if (error != 0)
3677                         goto out;
3678         }
3679 out:
3680         free(iov, M_IOV);
3681 out1:
3682         if (fp != NULL)
3683                 fdrop(fp, td);
3684
3685         return (error);
3686 #else  /* SCTP */
3687         return (EOPNOTSUPP);
3688 #endif /* SCTP */
3689 }