sys/kern/vfs_syscalls.c

   1 /*-
   2  * Copyright (c) 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 4. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_syscalls.c      8.13 (Berkeley) 4/15/94
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_capsicum.h"
  41 #include "opt_compat.h"
  42 #include "opt_ktrace.h"
  43
  44 #include <sys/param.h>
  45 #include <sys/systm.h>
  46 #include <sys/bio.h>
  47 #include <sys/buf.h>
  48 #include <sys/capsicum.h>
  49 #include <sys/disk.h>
  50 #include <sys/sysent.h>
  51 #include <sys/malloc.h>
  52 #include <sys/mount.h>
  53 #include <sys/mutex.h>
  54 #include <sys/sysproto.h>
  55 #include <sys/namei.h>
  56 #include <sys/filedesc.h>
  57 #include <sys/kernel.h>
  58 #include <sys/fcntl.h>
  59 #include <sys/file.h>
  60 #include <sys/filio.h>
  61 #include <sys/limits.h>
  62 #include <sys/linker.h>
  63 #include <sys/rwlock.h>
  64 #include <sys/sdt.h>
  65 #include <sys/stat.h>
  66 #include <sys/sx.h>
  67 #include <sys/unistd.h>
  68 #include <sys/vnode.h>
  69 #include <sys/priv.h>
  70 #include <sys/proc.h>
  71 #include <sys/dirent.h>
  72 #include <sys/jail.h>
  73 #include <sys/syscallsubr.h>
  74 #include <sys/sysctl.h>
  75 #ifdef KTRACE
  76 #include <sys/ktrace.h>
  77 #endif
  78
  79 #include <machine/stdarg.h>
  80
  81 #include <security/audit/audit.h>
  82 #include <security/mac/mac_framework.h>
  83
  84 #include <vm/vm.h>
  85 #include <vm/vm_object.h>
  86 #include <vm/vm_page.h>
  87 #include <vm/uma.h>
  88
  89 #include <ufs/ufs/quota.h>
  90
  91 MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
  92
  93 SDT_PROVIDER_DEFINE(vfs);
  94 SDT_PROBE_DEFINE2(vfs, , stat, mode, "char *", "int");
  95 SDT_PROBE_DEFINE2(vfs, , stat, reg, "char *", "int");
  96
  97 static int kern_chflagsat(struct thread *td, int fd, const char *path,
  98     enum uio_seg pathseg, u_long flags, int atflag);
  99 static int setfflags(struct thread *td, struct vnode *, u_long);
 100 static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
 101 static int getutimens(const struct timespec *, enum uio_seg,
 102     struct timespec *, int *);
 103 static int setutimes(struct thread *td, struct vnode *,
 104     const struct timespec *, int, int);
 105 static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
 106     struct thread *td);
 107
 108 /*
 109  * Sync each mounted filesystem.
 110  */
 111 #ifndef _SYS_SYSPROTO_H_
 112 struct sync_args {
 113         int     dummy;
 114 };
 115 #endif
 116 /* ARGSUSED */
 117 int
 118 sys_sync(td, uap)
 119         struct thread *td;
 120         struct sync_args *uap;
 121 {
 122         struct mount *mp, *nmp;
 123         int save;
 124
 125         mtx_lock(&mountlist_mtx);
 126         for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 127                 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 128                         nmp = TAILQ_NEXT(mp, mnt_list);
 129                         continue;
 130                 }
 131                 if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
 132                     vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
 133                         save = curthread_pflags_set(TDP_SYNCIO);
 134                         vfs_msync(mp, MNT_NOWAIT);
 135                         VFS_SYNC(mp, MNT_NOWAIT);
 136                         curthread_pflags_restore(save);
 137                         vn_finished_write(mp);
 138                 }
 139                 mtx_lock(&mountlist_mtx);
 140                 nmp = TAILQ_NEXT(mp, mnt_list);
 141                 vfs_unbusy(mp);
 142         }
 143         mtx_unlock(&mountlist_mtx);
 144         return (0);
 145 }
 146
 147 /*
 148  * Change filesystem quotas.
 149  */
 150 #ifndef _SYS_SYSPROTO_H_
 151 struct quotactl_args {
 152         char *path;
 153         int cmd;
 154         int uid;
 155         caddr_t arg;
 156 };
 157 #endif
 158 int
 159 sys_quotactl(td, uap)
 160         struct thread *td;
 161         register struct quotactl_args /* {
 162                 char *path;
 163                 int cmd;
 164                 int uid;
 165                 caddr_t arg;
 166         } */ *uap;
 167 {
 168         struct mount *mp;
 169         struct nameidata nd;
 170         int error;
 171
 172         AUDIT_ARG_CMD(uap->cmd);
 173         AUDIT_ARG_UID(uap->uid);
 174         if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
 175                 return (EPERM);
 176         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
 177             uap->path, td);
 178         if ((error = namei(&nd)) != 0)
 179                 return (error);
 180         NDFREE(&nd, NDF_ONLY_PNBUF);
 181         mp = nd.ni_vp->v_mount;
 182         vfs_ref(mp);
 183         vput(nd.ni_vp);
 184         error = vfs_busy(mp, 0);
 185         vfs_rel(mp);
 186         if (error != 0)
 187                 return (error);
 188         error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
 189
 190         /*
 191          * Since quota on operation typically needs to open quota
 192          * file, the Q_QUOTAON handler needs to unbusy the mount point
 193          * before calling into namei.  Otherwise, unmount might be
 194          * started between two vfs_busy() invocations (first is our,
 195          * second is from mount point cross-walk code in lookup()),
 196          * causing deadlock.
 197          *
 198          * Require that Q_QUOTAON handles the vfs_busy() reference on
 199          * its own, always returning with ubusied mount point.
 200          */
 201         if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
 202                 vfs_unbusy(mp);
 203         return (error);
 204 }
 205
 206 /*
 207  * Used by statfs conversion routines to scale the block size up if
 208  * necessary so that all of the block counts are <= 'max_size'.  Note
 209  * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
 210  * value of 'n'.
 211  */
 212 void
 213 statfs_scale_blocks(struct statfs *sf, long max_size)
 214 {
 215         uint64_t count;
 216         int shift;
 217
 218         KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
 219
 220         /*
 221          * Attempt to scale the block counts to give a more accurate
 222          * overview to userland of the ratio of free space to used
 223          * space.  To do this, find the largest block count and compute
 224          * a divisor that lets it fit into a signed integer <= max_size.
 225          */
 226         if (sf->f_bavail < 0)
 227                 count = -sf->f_bavail;
 228         else
 229                 count = sf->f_bavail;
 230         count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
 231         if (count <= max_size)
 232                 return;
 233
 234         count >>= flsl(max_size);
 235         shift = 0;
 236         while (count > 0) {
 237                 shift++;
 238                 count >>=1;
 239         }
 240
 241         sf->f_bsize <<= shift;
 242         sf->f_blocks >>= shift;
 243         sf->f_bfree >>= shift;
 244         sf->f_bavail >>= shift;
 245 }
 246
 247 static int
 248 kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf)
 249 {
 250         struct statfs *sp;
 251         int error;
 252
 253         if (mp == NULL)
 254                 return (EBADF);
 255         error = vfs_busy(mp, 0);
 256         vfs_rel(mp);
 257         if (error != 0)
 258                 return (error);
 259 #ifdef MAC
 260         error = mac_mount_check_stat(td->td_ucred, mp);
 261         if (error != 0)
 262                 goto out;
 263 #endif
 264         /*
 265          * Set these in case the underlying filesystem fails to do so.
 266          */
 267         sp = &mp->mnt_stat;
 268         sp->f_version = STATFS_VERSION;
 269         sp->f_namemax = NAME_MAX;
 270         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 271         error = VFS_STATFS(mp, sp);
 272         if (error != 0)
 273                 goto out;
 274         *buf = *sp;
 275         if (priv_check(td, PRIV_VFS_GENERATION)) {
 276                 buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
 277                 prison_enforce_statfs(td->td_ucred, mp, buf);
 278         }
 279 out:
 280         vfs_unbusy(mp);
 281         return (error);
 282 }
 283
 284 /*
 285  * Get filesystem statistics.
 286  */
 287 #ifndef _SYS_SYSPROTO_H_
 288 struct statfs_args {
 289         char *path;
 290         struct statfs *buf;
 291 };
 292 #endif
 293 int
 294 sys_statfs(td, uap)
 295         struct thread *td;
 296         register struct statfs_args /* {
 297                 char *path;
 298                 struct statfs *buf;
 299         } */ *uap;
 300 {
 301         struct statfs sf;
 302         int error;
 303
 304         error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
 305         if (error == 0)
 306                 error = copyout(&sf, uap->buf, sizeof(sf));
 307         return (error);
 308 }
 309
 310 int
 311 kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
 312     struct statfs *buf)
 313 {
 314         struct mount *mp;
 315         struct nameidata nd;
 316         int error;
 317
 318         NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 319             pathseg, path, td);
 320         error = namei(&nd);
 321         if (error != 0)
 322                 return (error);
 323         mp = nd.ni_vp->v_mount;
 324         vfs_ref(mp);
 325         NDFREE(&nd, NDF_ONLY_PNBUF);
 326         vput(nd.ni_vp);
 327         return (kern_do_statfs(td, mp, buf));
 328 }
 329
 330 /*
 331  * Get filesystem statistics.
 332  */
 333 #ifndef _SYS_SYSPROTO_H_
 334 struct fstatfs_args {
 335         int fd;
 336         struct statfs *buf;
 337 };
 338 #endif
 339 int
 340 sys_fstatfs(td, uap)
 341         struct thread *td;
 342         register struct fstatfs_args /* {
 343                 int fd;
 344                 struct statfs *buf;
 345         } */ *uap;
 346 {
 347         struct statfs sf;
 348         int error;
 349
 350         error = kern_fstatfs(td, uap->fd, &sf);
 351         if (error == 0)
 352                 error = copyout(&sf, uap->buf, sizeof(sf));
 353         return (error);
 354 }
 355
 356 int
 357 kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
 358 {
 359         struct file *fp;
 360         struct mount *mp;
 361         struct vnode *vp;
 362         cap_rights_t rights;
 363         int error;
 364
 365         AUDIT_ARG_FD(fd);
 366         error = getvnode(td, fd, cap_rights_init(&rights, CAP_FSTATFS), &fp);
 367         if (error != 0)
 368                 return (error);
 369         vp = fp->f_vnode;
 370         vn_lock(vp, LK_SHARED | LK_RETRY);
 371 #ifdef AUDIT
 372         AUDIT_ARG_VNODE1(vp);
 373 #endif
 374         mp = vp->v_mount;
 375         if (mp != NULL)
 376                 vfs_ref(mp);
 377         VOP_UNLOCK(vp, 0);
 378         fdrop(fp, td);
 379         return (kern_do_statfs(td, mp, buf));
 380 }
 381
 382 /*
 383  * Get statistics on all filesystems.
 384  */
 385 #ifndef _SYS_SYSPROTO_H_
 386 struct getfsstat_args {
 387         struct statfs *buf;
 388         long bufsize;
 389         int flags;
 390 };
 391 #endif
 392 int
 393 sys_getfsstat(td, uap)
 394         struct thread *td;
 395         register struct getfsstat_args /* {
 396                 struct statfs *buf;
 397                 long bufsize;
 398                 int flags;
 399         } */ *uap;
 400 {
 401         size_t count;
 402         int error;
 403
 404         if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX)
 405                 return (EINVAL);
 406         error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count,
 407             UIO_USERSPACE, uap->flags);
 408         if (error == 0)
 409                 td->td_retval[0] = count;
 410         return (error);
 411 }
 412
/*
 * Gather statfs records for the mounted filesystems that 'td' is allowed
 * to see.
 *
 * 'bufsize' is the destination capacity in bytes; at most
 * bufsize / sizeof(struct statfs) records are stored.  '*countp' receives
 * the number of mounts counted by the walk, capped to that capacity when
 * a destination buffer is in use.
 *
 * If (bufsize == 0)
 *      Nothing is stored; only the count is produced.
 * If (bufsize > 0 && bufseg == UIO_USERSPACE)
 *      '*buf' is the destination and records are written with copyout().
 * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
 *      The caller is responsible for freeing memory which will be allocated
 *      (with type M_TEMP) in '*buf'.
 *
 * Returns 0, or the copyout() error when a write to '*buf' fails.
 */
int
kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
    size_t *countp, enum uio_seg bufseg, int flags)
{
        struct mount *mp, *nmp;
        struct statfs *sfsp, *sp, sb, *tofree;
        size_t count, maxcount;
        int error;

        /*
         * Everything from here on is recomputed when the walk has to be
         * started over.
         */
restart:
        maxcount = bufsize / sizeof(struct statfs);
        if (bufsize == 0) {
                sfsp = NULL;
                tofree = NULL;
        } else if (bufseg == UIO_USERSPACE) {
                sfsp = *buf;
                tofree = NULL;
        } else /* if (bufseg == UIO_SYSSPACE) */ {
                /*
                 * Count the mounts first so the allocation is no larger
                 * than the current list requires.  The list mutex is
                 * dropped before malloc(); the 'count < maxcount' test in
                 * the main loop is what bounds the writes.
                 */
                count = 0;
                mtx_lock(&mountlist_mtx);
                TAILQ_FOREACH(mp, &mountlist, mnt_list) {
                        count++;
                }
                mtx_unlock(&mountlist_mtx);
                if (maxcount > count)
                        maxcount = count;
                tofree = sfsp = *buf = malloc(maxcount * sizeof(struct statfs),
                    M_TEMP, M_WAITOK);
        }
        count = 0;
        mtx_lock(&mountlist_mtx);
        for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
                /* Skip mounts rejected by the prison check. */
                if (prison_canseemount(td->td_ucred, mp) != 0) {
                        nmp = TAILQ_NEXT(mp, mnt_list);
                        continue;
                }
#ifdef MAC
                /* Likewise for mounts rejected by the MAC policy. */
                if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
                        nmp = TAILQ_NEXT(mp, mnt_list);
                        continue;
                }
#endif
                if (flags == MNT_WAIT) {
                        if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) {
                                /*
                                 * If vfs_busy() failed, and MBF_NOWAIT
                                 * wasn't passed, then the mp is gone.
                                 * Furthermore, because of MBF_MNTLSTLOCK,
                                 * the mountlist_mtx was dropped.  We have
                                 * no other choice than to start over.
                                 */
                                mtx_unlock(&mountlist_mtx);
                                free(tofree, M_TEMP);
                                goto restart;
                        }
                } else {
                        /* Mounts that cannot be busied are skipped. */
                        if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
                                nmp = TAILQ_NEXT(mp, mnt_list);
                                continue;
                        }
                }
                /*
                 * 'mp' is busied from here on; every path that goes on to
                 * the next mount re-takes mountlist_mtx before reading
                 * the successor and unbusying 'mp'.
                 */
                if (sfsp && count < maxcount) {
                        sp = &mp->mnt_stat;
                        /*
                         * Set these in case the underlying filesystem
                         * fails to do so.
                         */
                        sp->f_version = STATFS_VERSION;
                        sp->f_namemax = NAME_MAX;
                        sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
                        /*
                         * If MNT_NOWAIT or MNT_LAZY is specified, do not
                         * refresh the fsstat cache.
                         */
                        if (flags != MNT_LAZY && flags != MNT_NOWAIT) {
                                error = VFS_STATFS(mp, sp);
                                if (error != 0) {
                                        /*
                                         * Leave this mount out: it is
                                         * neither stored nor counted.
                                         */
                                        mtx_lock(&mountlist_mtx);
                                        nmp = TAILQ_NEXT(mp, mnt_list);
                                        vfs_unbusy(mp);
                                        continue;
                                }
                        }
                        /*
                         * When the privilege check fails, report a private
                         * copy with the fsid cleared and with
                         * prison_enforce_statfs() applied, leaving the
                         * mount's own record untouched.
                         */
                        if (priv_check(td, PRIV_VFS_GENERATION)) {
                                bcopy(sp, &sb, sizeof(sb));
                                sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
                                prison_enforce_statfs(td->td_ucred, mp, &sb);
                                sp = &sb;
                        }
                        if (bufseg == UIO_SYSSPACE)
                                bcopy(sp, sfsp, sizeof(*sp));
                        else /* if (bufseg == UIO_USERSPACE) */ {
                                error = copyout(sp, sfsp, sizeof(*sp));
                                if (error != 0) {
                                        /*
                                         * 'tofree' is NULL on this path,
                                         * so only the busy hold needs to
                                         * be dropped.
                                         */
                                        vfs_unbusy(mp);
                                        return (error);
                                }
                        }
                        sfsp++;
                }
                count++;
                mtx_lock(&mountlist_mtx);
                nmp = TAILQ_NEXT(mp, mnt_list);
                vfs_unbusy(mp);
        }
        mtx_unlock(&mountlist_mtx);
        /* Never report more entries than were actually stored. */
        if (sfsp && count > maxcount)
                *countp = maxcount;
        else
                *countp = count;
        return (0);
}
 530
 531 #ifdef COMPAT_FREEBSD4
 532 /*
 533  * Get old format filesystem statistics.
 534  */
 535 static void cvtstatfs(struct statfs *, struct ostatfs *);
 536
 537 #ifndef _SYS_SYSPROTO_H_
 538 struct freebsd4_statfs_args {
 539         char *path;
 540         struct ostatfs *buf;
 541 };
 542 #endif
 543 int
 544 freebsd4_statfs(td, uap)
 545         struct thread *td;
 546         struct freebsd4_statfs_args /* {
 547                 char *path;
 548                 struct ostatfs *buf;
 549         } */ *uap;
 550 {
 551         struct ostatfs osb;
 552         struct statfs sf;
 553         int error;
 554
 555         error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
 556         if (error != 0)
 557                 return (error);
 558         cvtstatfs(&sf, &osb);
 559         return (copyout(&osb, uap->buf, sizeof(osb)));
 560 }
 561
 562 /*
 563  * Get filesystem statistics.
 564  */
 565 #ifndef _SYS_SYSPROTO_H_
 566 struct freebsd4_fstatfs_args {
 567         int fd;
 568         struct ostatfs *buf;
 569 };
 570 #endif
 571 int
 572 freebsd4_fstatfs(td, uap)
 573         struct thread *td;
 574         struct freebsd4_fstatfs_args /* {
 575                 int fd;
 576                 struct ostatfs *buf;
 577         } */ *uap;
 578 {
 579         struct ostatfs osb;
 580         struct statfs sf;
 581         int error;
 582
 583         error = kern_fstatfs(td, uap->fd, &sf);
 584         if (error != 0)
 585                 return (error);
 586         cvtstatfs(&sf, &osb);
 587         return (copyout(&osb, uap->buf, sizeof(osb)));
 588 }
 589
 590 /*
 591  * Get statistics on all filesystems.
 592  */
 593 #ifndef _SYS_SYSPROTO_H_
 594 struct freebsd4_getfsstat_args {
 595         struct ostatfs *buf;
 596         long bufsize;
 597         int flags;
 598 };
 599 #endif
 600 int
 601 freebsd4_getfsstat(td, uap)
 602         struct thread *td;
 603         register struct freebsd4_getfsstat_args /* {
 604                 struct ostatfs *buf;
 605                 long bufsize;
 606                 int flags;
 607         } */ *uap;
 608 {
 609         struct statfs *buf, *sp;
 610         struct ostatfs osb;
 611         size_t count, size;
 612         int error;
 613
 614         if (uap->bufsize < 0)
 615                 return (EINVAL);
 616         count = uap->bufsize / sizeof(struct ostatfs);
 617         if (count > SIZE_MAX / sizeof(struct statfs))
 618                 return (EINVAL);
 619         size = count * sizeof(struct statfs);
 620         error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE,
 621             uap->flags);
 622         td->td_retval[0] = count;
 623         if (size != 0) {
 624                 sp = buf;
 625                 while (count != 0 && error == 0) {
 626                         cvtstatfs(sp, &osb);
 627                         error = copyout(&osb, uap->buf, sizeof(osb));
 628                         sp++;
 629                         uap->buf++;
 630                         count--;
 631                 }
 632                 free(buf, M_TEMP);
 633         }
 634         return (error);
 635 }
 636
 637 /*
 638  * Implement fstatfs() for (NFS) file handles.
 639  */
 640 #ifndef _SYS_SYSPROTO_H_
 641 struct freebsd4_fhstatfs_args {
 642         struct fhandle *u_fhp;
 643         struct ostatfs *buf;
 644 };
 645 #endif
 646 int
 647 freebsd4_fhstatfs(td, uap)
 648         struct thread *td;
 649         struct freebsd4_fhstatfs_args /* {
 650                 struct fhandle *u_fhp;
 651                 struct ostatfs *buf;
 652         } */ *uap;
 653 {
 654         struct ostatfs osb;
 655         struct statfs sf;
 656         fhandle_t fh;
 657         int error;
 658
 659         error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 660         if (error != 0)
 661                 return (error);
 662         error = kern_fhstatfs(td, fh, &sf);
 663         if (error != 0)
 664                 return (error);
 665         cvtstatfs(&sf, &osb);
 666         return (copyout(&osb, uap->buf, sizeof(osb)));
 667 }
 668
 669 /*
 670  * Convert a new format statfs structure to an old format statfs structure.
 671  */
 672 static void
 673 cvtstatfs(nsp, osp)
 674         struct statfs *nsp;
 675         struct ostatfs *osp;
 676 {
 677
 678         statfs_scale_blocks(nsp, LONG_MAX);
 679         bzero(osp, sizeof(*osp));
 680         osp->f_bsize = nsp->f_bsize;
 681         osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
 682         osp->f_blocks = nsp->f_blocks;
 683         osp->f_bfree = nsp->f_bfree;
 684         osp->f_bavail = nsp->f_bavail;
 685         osp->f_files = MIN(nsp->f_files, LONG_MAX);
 686         osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
 687         osp->f_owner = nsp->f_owner;
 688         osp->f_type = nsp->f_type;
 689         osp->f_flags = nsp->f_flags;
 690         osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
 691         osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
 692         osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
 693         osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
 694         strlcpy(osp->f_fstypename, nsp->f_fstypename,
 695             MIN(MFSNAMELEN, OMFSNAMELEN));
 696         strlcpy(osp->f_mntonname, nsp->f_mntonname,
 697             MIN(MNAMELEN, OMNAMELEN));
 698         strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
 699             MIN(MNAMELEN, OMNAMELEN));
 700         osp->f_fsid = nsp->f_fsid;
 701 }
 702 #endif /* COMPAT_FREEBSD4 */
 703
 704 /*
 705  * Change current working directory to a given file descriptor.
 706  */
 707 #ifndef _SYS_SYSPROTO_H_
 708 struct fchdir_args {
 709         int     fd;
 710 };
 711 #endif
 712 int
 713 sys_fchdir(td, uap)
 714         struct thread *td;
 715         struct fchdir_args /* {
 716                 int fd;
 717         } */ *uap;
 718 {
 719         struct vnode *vp, *tdp;
 720         struct mount *mp;
 721         struct file *fp;
 722         cap_rights_t rights;
 723         int error;
 724
 725         AUDIT_ARG_FD(uap->fd);
 726         error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_FCHDIR),
 727             &fp);
 728         if (error != 0)
 729                 return (error);
 730         vp = fp->f_vnode;
 731         vrefact(vp);
 732         fdrop(fp, td);
 733         vn_lock(vp, LK_SHARED | LK_RETRY);
 734         AUDIT_ARG_VNODE1(vp);
 735         error = change_dir(vp, td);
 736         while (!error && (mp = vp->v_mountedhere) != NULL) {
 737                 if (vfs_busy(mp, 0))
 738                         continue;
 739                 error = VFS_ROOT(mp, LK_SHARED, &tdp);
 740                 vfs_unbusy(mp);
 741                 if (error != 0)
 742                         break;
 743                 vput(vp);
 744                 vp = tdp;
 745         }
 746         if (error != 0) {
 747                 vput(vp);
 748                 return (error);
 749         }
 750         VOP_UNLOCK(vp, 0);
 751         pwd_chdir(td, vp);
 752         return (0);
 753 }
 754
 755 /*
 756  * Change current working directory (``.'').
 757  */
 758 #ifndef _SYS_SYSPROTO_H_
 759 struct chdir_args {
 760         char    *path;
 761 };
 762 #endif
 763 int
 764 sys_chdir(td, uap)
 765         struct thread *td;
 766         struct chdir_args /* {
 767                 char *path;
 768         } */ *uap;
 769 {
 770
 771         return (kern_chdir(td, uap->path, UIO_USERSPACE));
 772 }
 773
 774 int
 775 kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
 776 {
 777         struct nameidata nd;
 778         int error;
 779
 780         NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 781             pathseg, path, td);
 782         if ((error = namei(&nd)) != 0)
 783                 return (error);
 784         if ((error = change_dir(nd.ni_vp, td)) != 0) {
 785                 vput(nd.ni_vp);
 786                 NDFREE(&nd, NDF_ONLY_PNBUF);
 787                 return (error);
 788         }
 789         VOP_UNLOCK(nd.ni_vp, 0);
 790         NDFREE(&nd, NDF_ONLY_PNBUF);
 791         pwd_chdir(td, nd.ni_vp);
 792         return (0);
 793 }
 794
 795 /*
 796  * Change notion of root (``/'') directory.
 797  */
 798 #ifndef _SYS_SYSPROTO_H_
 799 struct chroot_args {
 800         char    *path;
 801 };
 802 #endif
 803 int
 804 sys_chroot(td, uap)
 805         struct thread *td;
 806         struct chroot_args /* {
 807                 char *path;
 808         } */ *uap;
 809 {
 810         struct nameidata nd;
 811         int error;
 812
 813         error = priv_check(td, PRIV_VFS_CHROOT);
 814         if (error != 0)
 815                 return (error);
 816         NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 817             UIO_USERSPACE, uap->path, td);
 818         error = namei(&nd);
 819         if (error != 0)
 820                 goto error;
 821         error = change_dir(nd.ni_vp, td);
 822         if (error != 0)
 823                 goto e_vunlock;
 824 #ifdef MAC
 825         error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
 826         if (error != 0)
 827                 goto e_vunlock;
 828 #endif
 829         VOP_UNLOCK(nd.ni_vp, 0);
 830         error = pwd_chroot(td, nd.ni_vp);
 831         vrele(nd.ni_vp);
 832         NDFREE(&nd, NDF_ONLY_PNBUF);
 833         return (error);
 834 e_vunlock:
 835         vput(nd.ni_vp);
 836 error:
 837         NDFREE(&nd, NDF_ONLY_PNBUF);
 838         return (error);
 839 }
 840
 841 /*
 842  * Common routine for chroot and chdir.  Callers must provide a locked vnode
 843  * instance.
 844  */
 845 int
 846 change_dir(vp, td)
 847         struct vnode *vp;
 848         struct thread *td;
 849 {
 850 #ifdef MAC
 851         int error;
 852 #endif
 853
 854         ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
 855         if (vp->v_type != VDIR)
 856                 return (ENOTDIR);
 857 #ifdef MAC
 858         error = mac_vnode_check_chdir(td->td_ucred, vp);
 859         if (error != 0)
 860                 return (error);
 861 #endif
 862         return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
 863 }
 864
 865 static __inline void
 866 flags_to_rights(int flags, cap_rights_t *rightsp)
 867 {
 868
 869         if (flags & O_EXEC) {
 870                 cap_rights_set(rightsp, CAP_FEXECVE);
 871         } else {
 872                 switch ((flags & O_ACCMODE)) {
 873                 case O_RDONLY:
 874                         cap_rights_set(rightsp, CAP_READ);
 875                         break;
 876                 case O_RDWR:
 877                         cap_rights_set(rightsp, CAP_READ);
 878                         /* FALLTHROUGH */
 879                 case O_WRONLY:
 880                         cap_rights_set(rightsp, CAP_WRITE);
 881                         if (!(flags & (O_APPEND | O_TRUNC)))
 882                                 cap_rights_set(rightsp, CAP_SEEK);
 883                         break;
 884                 }
 885         }
 886
 887         if (flags & O_CREAT)
 888                 cap_rights_set(rightsp, CAP_CREATE);
 889
 890         if (flags & O_TRUNC)
 891                 cap_rights_set(rightsp, CAP_FTRUNCATE);
 892
 893         if (flags & (O_SYNC | O_FSYNC))
 894                 cap_rights_set(rightsp, CAP_FSYNC);
 895
 896         if (flags & (O_EXLOCK | O_SHLOCK))
 897                 cap_rights_set(rightsp, CAP_FLOCK);
 898 }
 899
 900 /*
 901  * Check permissions, allocate an open file structure, and call the device
 902  * open routine if any.
 903  */
 904 #ifndef _SYS_SYSPROTO_H_
 905 struct open_args {
 906         char    *path;
 907         int     flags;
 908         int     mode;
 909 };
 910 #endif
 911 int
 912 sys_open(td, uap)
 913         struct thread *td;
 914         register struct open_args /* {
 915                 char *path;
 916                 int flags;
 917                 int mode;
 918         } */ *uap;
 919 {
 920
 921         return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 922             uap->flags, uap->mode));
 923 }
 924
 925 #ifndef _SYS_SYSPROTO_H_
 926 struct openat_args {
 927         int     fd;
 928         char    *path;
 929         int     flag;
 930         int     mode;
 931 };
 932 #endif
 933 int
 934 sys_openat(struct thread *td, struct openat_args *uap)
 935 {
 936
 937         return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
 938             uap->mode));
 939 }
 940
/*
 * Common implementation behind the open-style entry points in this file
 * (sys_open(), sys_openat(), ocreat()).
 *
 * td:      calling thread.
 * fd:      directory descriptor handed to the name lookup (AT_FDCWD is
 *          what the non-"at" callers pass).
 * path:    pathname, located in the address space named by pathseg.
 * pathseg: UIO_USERSPACE or UIO_SYSSPACE.
 * flags:   open(2) flags (O_*); converted with FFLAGS() unless O_EXEC
 *          is set.
 * mode:    creation mode; masked with the process fd_cmask below.
 *
 * On success the new descriptor index is stored in td->td_retval[0] and
 * 0 is returned; otherwise an errno value is returned.
 */
int
kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
    int flags, int mode)
{
        struct proc *p = td->td_proc;
        struct filedesc *fdp = p->p_fd;
        struct file *fp;
        struct vnode *vp;
        struct nameidata nd;
        cap_rights_t rights;
        int cmode, error, indx;

        /* -1 means "no descriptor installed yet"; see the success label. */
        indx = -1;

        AUDIT_ARG_FFLAGS(flags);
        AUDIT_ARG_MODE(mode);
        /* XXX: audit dirfd */
        /* Rights required on the directory descriptor for this lookup. */
        cap_rights_init(&rights, CAP_LOOKUP);
        flags_to_rights(flags, &rights);
        /*
         * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
         * may be specified.
         */
        if (flags & O_EXEC) {
                if (flags & O_ACCMODE)
                        return (EINVAL);
        } else if ((flags & O_ACCMODE) == O_ACCMODE) {
                return (EINVAL);
        } else {
                flags = FFLAGS(flags);
        }

        /*
         * Allocate a file structure. The descriptor to reference it
         * is allocated and set by finstall() below.
         */
        error = falloc_noinstall(td, &fp);
        if (error != 0)
                return (error);
        /*
         * An extra reference on `fp' has been held for us by
         * falloc_noinstall().
         */
        /* Set the flags early so the finit in devfs can pick them up. */
        fp->f_flag = flags & FMASK;
        /*
         * Creation mode: clear the bits in the process file creation
         * mask, keep only ALLPERMS bits, and always drop S_ISTXT.
         */
        cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
        NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
            &rights, td);
        td->td_dupfd = -1;              /* XXX check for fdopen */
        error = vn_open(&nd, &flags, cmode, fp);
        if (error != 0) {
                /*
                 * If the vn_open replaced the method vector, something
                 * wondrous happened deep below and we just pass it up
                 * pretending we know what we do.
                 */
                if (error == ENXIO && fp->f_ops != &badfileops)
                        goto success;

                /*
                 * Handle special fdopen() case. bleh.
                 *
                 * Don't do this for relative (capability) lookups; we don't
                 * understand exactly what would happen, and we don't think
                 * that it ever should.
                 */
                if ((nd.ni_lcf & NI_LCF_STRICTRELATIVE) == 0 &&
                    (error == ENODEV || error == ENXIO) &&
                    td->td_dupfd >= 0) {
                        /* On success dupfdopen() fills in indx for us. */
                        error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
                            &indx);
                        if (error == 0)
                                goto success;
                }

                goto bad;
        }
        td->td_dupfd = 0;
        NDFREE(&nd, NDF_ONLY_PNBUF);
        vp = nd.ni_vp;

        /*
         * Store the vnode, for any f_type. Typically, the vnode use
         * count is decremented by direct call to vn_closefile() for
         * files that switched type in the cdevsw fdopen() method.
         */
        fp->f_vnode = vp;
        /*
         * If the file wasn't claimed by devfs bind it to the normal
         * vnode operations here.
         */
        if (fp->f_ops == &badfileops) {
                KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
                fp->f_seqcount = 1;
                /* Preserve FHASLOCK if it was set on fp during vn_open(). */
                finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK),
                    DTYPE_VNODE, vp, &vnops);
        }

        VOP_UNLOCK(vp, 0);
        /* O_TRUNC is applied through the file's own truncate method. */
        if (flags & O_TRUNC) {
                error = fo_truncate(fp, 0, td->td_ucred, td);
                if (error != 0)
                        goto bad;
        }
success:
        /*
         * If we haven't already installed the FD (for dupfdopen), do so now.
         */
        if (indx == -1) {
                struct filecaps *fcaps;

#ifdef CAPABILITIES
                /* Only strict-relative lookups pass capabilities along. */
                if ((nd.ni_lcf & NI_LCF_STRICTRELATIVE) != 0)
                        fcaps = &nd.ni_filecaps;
                else
#endif
                        fcaps = NULL;
                error = finstall(td, fp, &indx, flags, fcaps);
                /* On success finstall() consumes fcaps. */
                if (error != 0) {
                        filecaps_free(&nd.ni_filecaps);
                        goto bad;
                }
        } else {
                /* Descriptor came from dupfdopen(); lookup caps are unused. */
                filecaps_free(&nd.ni_filecaps);
        }

        /*
         * Release our private reference, leaving the one associated with
         * the descriptor table intact.
         */
        fdrop(fp, td);
        td->td_retval[0] = indx;
        return (0);
bad:
        /* Every path reaching here must not have installed a descriptor. */
        KASSERT(indx == -1, ("indx=%d, should be -1", indx));
        fdrop(fp, td);
        return (error);
}
1080
#ifdef COMPAT_43
/*
 * Create a file.
 *
 * Old creat-style entry point, compiled only with COMPAT_43.  It is
 * expressed as kern_openat() relative to AT_FDCWD with the fixed flag
 * set O_WRONLY | O_CREAT | O_TRUNC and the caller's mode.
 *
 * td:  calling thread.
 * uap: system call arguments (path, mode).
 *
 * Returns whatever kern_openat() returns.
 */
#ifndef _SYS_SYSPROTO_H_
struct ocreat_args {
        char    *path;
        int     mode;
};
#endif
int
ocreat(struct thread *td, struct ocreat_args *uap)
{

        return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
            O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
}
#endif /* COMPAT_43 */
1104
1105 /*
1106  * Create a special file.
1107  */
1108 #ifndef _SYS_SYSPROTO_H_
1109 struct mknod_args {
1110         char    *path;
1111         int     mode;
1112         int     dev;
1113 };
1114 #endif
1115 int
1116 sys_mknod(td, uap)
1117         struct thread *td;
1118         register struct mknod_args /* {
1119                 char *path;
1120                 int mode;
1121                 int dev;
1122         } */ *uap;
1123 {
1124
1125         return (kern_mknodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1126             uap->mode, uap->dev));
1127 }
1128
1129 #ifndef _SYS_SYSPROTO_H_
1130 struct mknodat_args {
1131         int     fd;
1132         char    *path;
1133         mode_t  mode;
1134         dev_t   dev;
1135 };
1136 #endif
1137 int
1138 sys_mknodat(struct thread *td, struct mknodat_args *uap)
1139 {
1140
1141         return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1142             uap->dev));
1143 }
1144
1145 int
1146 kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1147     int mode, int dev)
1148 {
1149         struct vnode *vp;
1150         struct mount *mp;
1151         struct vattr vattr;
1152         struct nameidata nd;
1153         cap_rights_t rights;
1154         int error, whiteout = 0;
1155
1156         AUDIT_ARG_MODE(mode);
1157         AUDIT_ARG_DEV(dev);
1158         switch (mode & S_IFMT) {
1159         case S_IFCHR:
1160         case S_IFBLK:
1161                 error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1162                 if (error == 0 && dev == VNOVAL)
1163                         error = EINVAL;
1164                 break;
1165         case S_IFMT:
1166                 error = priv_check(td, PRIV_VFS_MKNOD_BAD);
1167                 break;
1168         case S_IFWHT:
1169                 error = priv_check(td, PRIV_VFS_MKNOD_WHT);
1170                 break;
1171         case S_IFIFO:
1172                 if (dev == 0)
1173                         return (kern_mkfifoat(td, fd, path, pathseg, mode));
1174                 /* FALLTHROUGH */
1175         default:
1176                 error = EINVAL;
1177                 break;
1178         }
1179         if (error != 0)
1180                 return (error);
1181 restart:
1182         bwillwrite();
1183         NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1184             NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKNODAT),
1185             td);
1186         if ((error = namei(&nd)) != 0)
1187                 return (error);
1188         vp = nd.ni_vp;
1189         if (vp != NULL) {
1190                 NDFREE(&nd, NDF_ONLY_PNBUF);
1191                 if (vp == nd.ni_dvp)
1192                         vrele(nd.ni_dvp);
1193                 else
1194                         vput(nd.ni_dvp);
1195                 vrele(vp);
1196                 return (EEXIST);
1197         } else {
1198                 VATTR_NULL(&vattr);
1199                 vattr.va_mode = (mode & ALLPERMS) &
1200                     ~td->td_proc->p_fd->fd_cmask;
1201                 vattr.va_rdev = dev;
1202                 whiteout = 0;
1203
1204                 switch (mode & S_IFMT) {
1205                 case S_IFMT:    /* used by badsect to flag bad sectors */
1206                         vattr.va_type = VBAD;
1207                         break;
1208                 case S_IFCHR:
1209                         vattr.va_type = VCHR;
1210                         break;
1211                 case S_IFBLK:
1212                         vattr.va_type = VBLK;
1213                         break;
1214                 case S_IFWHT:
1215                         whiteout = 1;
1216                         break;
1217                 default:
1218                         panic("kern_mknod: invalid mode");
1219                 }
1220         }
1221         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1222                 NDFREE(&nd, NDF_ONLY_PNBUF);
1223                 vput(nd.ni_dvp);
1224                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1225                         return (error);
1226                 goto restart;
1227         }
1228 #ifdef MAC
1229         if (error == 0 && !whiteout)
1230                 error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
1231                     &nd.ni_cnd, &vattr);
1232 #endif
1233         if (error == 0) {
1234                 if (whiteout)
1235                         error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1236                 else {
1237                         error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1238                                                 &nd.ni_cnd, &vattr);
1239                         if (error == 0)
1240                                 vput(nd.ni_vp);
1241                 }
1242         }
1243         NDFREE(&nd, NDF_ONLY_PNBUF);
1244         vput(nd.ni_dvp);
1245         vn_finished_write(mp);
1246         return (error);
1247 }
1248
1249 /*
1250  * Create a named pipe.
1251  */
1252 #ifndef _SYS_SYSPROTO_H_
1253 struct mkfifo_args {
1254         char    *path;
1255         int     mode;
1256 };
1257 #endif
1258 int
1259 sys_mkfifo(td, uap)
1260         struct thread *td;
1261         register struct mkfifo_args /* {
1262                 char *path;
1263                 int mode;
1264         } */ *uap;
1265 {
1266
1267         return (kern_mkfifoat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1268             uap->mode));
1269 }
1270
1271 #ifndef _SYS_SYSPROTO_H_
1272 struct mkfifoat_args {
1273         int     fd;
1274         char    *path;
1275         mode_t  mode;
1276 };
1277 #endif
1278 int
1279 sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1280 {
1281
1282         return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1283             uap->mode));
1284 }
1285
1286 int
1287 kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1288     int mode)
1289 {
1290         struct mount *mp;
1291         struct vattr vattr;
1292         struct nameidata nd;
1293         cap_rights_t rights;
1294         int error;
1295
1296         AUDIT_ARG_MODE(mode);
1297 restart:
1298         bwillwrite();
1299         NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1300             NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKFIFOAT),
1301             td);
1302         if ((error = namei(&nd)) != 0)
1303                 return (error);
1304         if (nd.ni_vp != NULL) {
1305                 NDFREE(&nd, NDF_ONLY_PNBUF);
1306                 if (nd.ni_vp == nd.ni_dvp)
1307                         vrele(nd.ni_dvp);
1308                 else
1309                         vput(nd.ni_dvp);
1310                 vrele(nd.ni_vp);
1311                 return (EEXIST);
1312         }
1313         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1314                 NDFREE(&nd, NDF_ONLY_PNBUF);
1315                 vput(nd.ni_dvp);
1316                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1317                         return (error);
1318                 goto restart;
1319         }
1320         VATTR_NULL(&vattr);
1321         vattr.va_type = VFIFO;
1322         vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
1323 #ifdef MAC
1324         error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1325             &vattr);
1326         if (error != 0)
1327                 goto out;
1328 #endif
1329         error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1330         if (error == 0)
1331                 vput(nd.ni_vp);
1332 #ifdef MAC
1333 out:
1334 #endif
1335         vput(nd.ni_dvp);
1336         vn_finished_write(mp);
1337         NDFREE(&nd, NDF_ONLY_PNBUF);
1338         return (error);
1339 }
1340
1341 /*
1342  * Make a hard file link.
1343  */
1344 #ifndef _SYS_SYSPROTO_H_
1345 struct link_args {
1346         char    *path;
1347         char    *link;
1348 };
1349 #endif
1350 int
1351 sys_link(td, uap)
1352         struct thread *td;
1353         register struct link_args /* {
1354                 char *path;
1355                 char *link;
1356         } */ *uap;
1357 {
1358
1359         return (kern_linkat(td, AT_FDCWD, AT_FDCWD, uap->path, uap->link,
1360             UIO_USERSPACE, FOLLOW));
1361 }
1362
1363 #ifndef _SYS_SYSPROTO_H_
1364 struct linkat_args {
1365         int     fd1;
1366         char    *path1;
1367         int     fd2;
1368         char    *path2;
1369         int     flag;
1370 };
1371 #endif
1372 int
1373 sys_linkat(struct thread *td, struct linkat_args *uap)
1374 {
1375         int flag;
1376
1377         flag = uap->flag;
1378         if (flag & ~AT_SYMLINK_FOLLOW)
1379                 return (EINVAL);
1380
1381         return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1382             UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
1383 }
1384
/*
 * Tunables consulted by can_hardlink().  Both default to 0, in which case
 * can_hardlink() permits the link without looking at the file's owner or
 * group.  They are exported read-write under the _security_bsd sysctl
 * node.
 */
/* Non-zero: linking to a file owned by another uid needs PRIV_VFS_LINK. */
int hardlink_check_uid = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
    &hardlink_check_uid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "users");
/* Non-zero: linking to a file of a foreign group needs PRIV_VFS_LINK. */
static int hardlink_check_gid = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
    &hardlink_check_gid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "groups");
1395
1396 static int
1397 can_hardlink(struct vnode *vp, struct ucred *cred)
1398 {
1399         struct vattr va;
1400         int error;
1401
1402         if (!hardlink_check_uid && !hardlink_check_gid)
1403                 return (0);
1404
1405         error = VOP_GETATTR(vp, &va, cred);
1406         if (error != 0)
1407                 return (error);
1408
1409         if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1410                 error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1411                 if (error != 0)
1412                         return (error);
1413         }
1414
1415         if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1416                 error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1417                 if (error != 0)
1418                         return (error);
1419         }
1420
1421         return (0);
1422 }
1423
/*
 * Common implementation of sys_link() and sys_linkat().
 *
 * td:     calling thread.
 * fd1:    directory descriptor for the existing name, path1.
 * fd2:    directory descriptor for the new name, path2.
 * segflg: address space holding both paths.
 * follow: lookup flag OR-ed into the source lookup (the callers in this
 *         file pass FOLLOW or NOFOLLOW).
 *
 * Returns 0 on success or an errno value: EPERM if the source is a
 * directory, EEXIST if the target name exists, EXDEV if source and target
 * directory are on different mounts, or whatever the lookups, permission
 * checks or VOP_LINK() report.
 */
int
kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
    enum uio_seg segflg, int follow)
{
        struct vnode *vp;
        struct mount *mp;
        struct nameidata nd;
        cap_rights_t rights;
        int error;

        /*
         * Restart point: taken after a failed vn_lock() on the source or
         * after having to wait in vn_start_write().  Both lookups are
         * redone from scratch.
         */
again:
        bwillwrite();
        /* First lookup: the existing object to link to. */
        NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, segflg, path1, fd1,
            cap_rights_init(&rights, CAP_LINKAT_SOURCE), td);

        if ((error = namei(&nd)) != 0)
                return (error);
        NDFREE(&nd, NDF_ONLY_PNBUF);
        vp = nd.ni_vp;
        if (vp->v_type == VDIR) {
                vrele(vp);
                return (EPERM);         /* POSIX */
        }
        /* Second lookup: the name to create; nd is reused for it. */
        NDINIT_ATRIGHTS(&nd, CREATE,
            LOCKPARENT | SAVENAME | AUDITVNODE2 | NOCACHE, segflg, path2, fd2,
            cap_rights_init(&rights, CAP_LINKAT_TARGET), td);
        if ((error = namei(&nd)) == 0) {
                if (nd.ni_vp != NULL) {
                        /* Target name already exists. */
                        NDFREE(&nd, NDF_ONLY_PNBUF);
                        if (nd.ni_dvp == nd.ni_vp)
                                vrele(nd.ni_dvp);
                        else
                                vput(nd.ni_dvp);
                        vrele(nd.ni_vp);
                        vrele(vp);
                        return (EEXIST);
                } else if (nd.ni_dvp->v_mount != vp->v_mount) {
                        /*
                         * Cross-device link.  No need to recheck
                         * vp->v_type, since it cannot change, except
                         * to VBAD.
                         */
                        NDFREE(&nd, NDF_ONLY_PNBUF);
                        vput(nd.ni_dvp);
                        vrele(vp);
                        return (EXDEV);
                } else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) {
                        /* Policy checks: hardlink sysctls, then MAC. */
                        error = can_hardlink(vp, td->td_ucred);
#ifdef MAC
                        if (error == 0)
                                error = mac_vnode_check_link(td->td_ucred,
                                    nd.ni_dvp, vp, &nd.ni_cnd);
#endif
                        if (error != 0) {
                                vput(vp);
                                vput(nd.ni_dvp);
                                NDFREE(&nd, NDF_ONLY_PNBUF);
                                return (error);
                        }
                        error = vn_start_write(vp, &mp, V_NOWAIT);
                        if (error != 0) {
                                /*
                                 * Release both vnodes before sleeping in
                                 * vn_start_write(), then start over.
                                 */
                                vput(vp);
                                vput(nd.ni_dvp);
                                NDFREE(&nd, NDF_ONLY_PNBUF);
                                error = vn_start_write(NULL, &mp,
                                    V_XSLEEP | PCATCH);
                                if (error != 0)
                                        return (error);
                                goto again;
                        }
                        error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
                        /* Unlock only; the vrele(vp) below drops the ref. */
                        VOP_UNLOCK(vp, 0);
                        vput(nd.ni_dvp);
                        vn_finished_write(mp);
                        NDFREE(&nd, NDF_ONLY_PNBUF);
                } else {
                        /* Locking the source failed: retry everything. */
                        vput(nd.ni_dvp);
                        NDFREE(&nd, NDF_ONLY_PNBUF);
                        vrele(vp);
                        goto again;
                }
        }
        /* Reached on target lookup failure and after VOP_LINK(). */
        vrele(vp);
        return (error);
}
1509
1510 /*
1511  * Make a symbolic link.
1512  */
1513 #ifndef _SYS_SYSPROTO_H_
1514 struct symlink_args {
1515         char    *path;
1516         char    *link;
1517 };
1518 #endif
1519 int
1520 sys_symlink(td, uap)
1521         struct thread *td;
1522         register struct symlink_args /* {
1523                 char *path;
1524                 char *link;
1525         } */ *uap;
1526 {
1527
1528         return (kern_symlinkat(td, uap->path, AT_FDCWD, uap->link,
1529             UIO_USERSPACE));
1530 }
1531
1532 #ifndef _SYS_SYSPROTO_H_
1533 struct symlinkat_args {
1534         char    *path;
1535         int     fd;
1536         char    *path2;
1537 };
1538 #endif
1539 int
1540 sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1541 {
1542
1543         return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1544             UIO_USERSPACE));
1545 }
1546
1547 int
1548 kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
1549     enum uio_seg segflg)
1550 {
1551         struct mount *mp;
1552         struct vattr vattr;
1553         char *syspath;
1554         struct nameidata nd;
1555         int error;
1556         cap_rights_t rights;
1557
1558         if (segflg == UIO_SYSSPACE) {
1559                 syspath = path1;
1560         } else {
1561                 syspath = uma_zalloc(namei_zone, M_WAITOK);
1562                 if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
1563                         goto out;
1564         }
1565         AUDIT_ARG_TEXT(syspath);
1566 restart:
1567         bwillwrite();
1568         NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1569             NOCACHE, segflg, path2, fd, cap_rights_init(&rights, CAP_SYMLINKAT),
1570             td);
1571         if ((error = namei(&nd)) != 0)
1572                 goto out;
1573         if (nd.ni_vp) {
1574                 NDFREE(&nd, NDF_ONLY_PNBUF);
1575                 if (nd.ni_vp == nd.ni_dvp)
1576                         vrele(nd.ni_dvp);
1577                 else
1578                         vput(nd.ni_dvp);
1579                 vrele(nd.ni_vp);
1580                 error = EEXIST;
1581                 goto out;
1582         }
1583         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1584                 NDFREE(&nd, NDF_ONLY_PNBUF);
1585                 vput(nd.ni_dvp);
1586                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1587                         goto out;
1588                 goto restart;
1589         }
1590         VATTR_NULL(&vattr);
1591         vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
1592 #ifdef MAC
1593         vattr.va_type = VLNK;
1594         error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1595             &vattr);
1596         if (error != 0)
1597                 goto out2;
1598 #endif
1599         error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
1600         if (error == 0)
1601                 vput(nd.ni_vp);
1602 #ifdef MAC
1603 out2:
1604 #endif
1605         NDFREE(&nd, NDF_ONLY_PNBUF);
1606         vput(nd.ni_dvp);
1607         vn_finished_write(mp);
1608 out:
1609         if (segflg != UIO_SYSSPACE)
1610                 uma_zfree(namei_zone, syspath);
1611         return (error);
1612 }
1613
1614 /*
1615  * Delete a whiteout from the filesystem.
1616  */
1617 int
1618 sys_undelete(td, uap)
1619         struct thread *td;
1620         register struct undelete_args /* {
1621                 char *path;
1622         } */ *uap;
1623 {
1624         struct mount *mp;
1625         struct nameidata nd;
1626         int error;
1627
1628 restart:
1629         bwillwrite();
1630         NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
1631             UIO_USERSPACE, uap->path, td);
1632         error = namei(&nd);
1633         if (error != 0)
1634                 return (error);
1635
1636         if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
1637                 NDFREE(&nd, NDF_ONLY_PNBUF);
1638                 if (nd.ni_vp == nd.ni_dvp)
1639                         vrele(nd.ni_dvp);
1640                 else
1641                         vput(nd.ni_dvp);
1642                 if (nd.ni_vp)
1643                         vrele(nd.ni_vp);
1644                 return (EEXIST);
1645         }
1646         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1647                 NDFREE(&nd, NDF_ONLY_PNBUF);
1648                 vput(nd.ni_dvp);
1649                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1650                         return (error);
1651                 goto restart;
1652         }
1653         error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
1654         NDFREE(&nd, NDF_ONLY_PNBUF);
1655         vput(nd.ni_dvp);
1656         vn_finished_write(mp);
1657         return (error);
1658 }
1659
1660 /*
1661  * Delete a name from the filesystem.
1662  */
1663 #ifndef _SYS_SYSPROTO_H_
1664 struct unlink_args {
1665         char    *path;
1666 };
1667 #endif
1668 int
1669 sys_unlink(td, uap)
1670         struct thread *td;
1671         struct unlink_args /* {
1672                 char *path;
1673         } */ *uap;
1674 {
1675
1676         return (kern_unlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE, 0));
1677 }
1678
1679 #ifndef _SYS_SYSPROTO_H_
1680 struct unlinkat_args {
1681         int     fd;
1682         char    *path;
1683         int     flag;
1684 };
1685 #endif
1686 int
1687 sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1688 {
1689         int flag = uap->flag;
1690         int fd = uap->fd;
1691         char *path = uap->path;
1692
1693         if (flag & ~AT_REMOVEDIR)
1694                 return (EINVAL);
1695
1696         if (flag & AT_REMOVEDIR)
1697                 return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
1698         else
1699                 return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
1700 }
1701
/*
 * Common implementation of sys_unlink() and sys_unlinkat().
 *
 * td:      calling thread.
 * fd:      directory descriptor the path is resolved against.
 * path:    name to remove, located in the address space named by pathseg.
 * oldinum: when non-zero, the inode number the caller expects the name
 *          to refer to; if vn_stat() succeeds and reports a different
 *          st_ino the call fails with EIDRM.  When zero, directories are
 *          rejected with EPERM.
 *
 * Returns 0 on success or an errno value.  An EINVAL from the lookup is
 * reported to the caller as EPERM.
 */
int
kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
    ino_t oldinum)
{
        struct mount *mp;
        struct vnode *vp;
        struct nameidata nd;
        struct stat sb;
        cap_rights_t rights;
        int error;

restart:
        bwillwrite();
        NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
            pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
        if ((error = namei(&nd)) != 0)
                return (error == EINVAL ? EPERM : error);
        vp = nd.ni_vp;
        /*
         * error is 0 on entry to this chain.  Note that a failing
         * vn_stat() leaves its error in place and falls into the final
         * else branch, where it may still be replaced by EBUSY.
         */
        if (vp->v_type == VDIR && oldinum == 0) {
                error = EPERM;          /* POSIX */
        } else if (oldinum != 0 &&
                  ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
                  sb.st_ino != oldinum) {
                        error = EIDRM;  /* Identifier removed */
        } else {
                /*
                 * The root of a mounted filesystem cannot be deleted.
                 *
                 * XXX: can this only be a VDIR case?
                 */
                if (vp->v_vflag & VV_ROOT)
                        error = EBUSY;
        }
        if (error == 0) {
                if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
                        /*
                         * Could not start the write without sleeping:
                         * release parent and leaf, wait, then redo the
                         * lookup from scratch.
                         */
                        NDFREE(&nd, NDF_ONLY_PNBUF);
                        vput(nd.ni_dvp);
                        if (vp == nd.ni_dvp)
                                vrele(vp);
                        else
                                vput(vp);
                        if ((error = vn_start_write(NULL, &mp,
                            V_XSLEEP | PCATCH)) != 0)
                                return (error);
                        goto restart;
                }
#ifdef MAC
                error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
                    &nd.ni_cnd);
                if (error != 0)
                        goto out;
#endif
                vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
                error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
#ifdef MAC
out:
#endif
                vn_finished_write(mp);
        }
        /* Common release path for both success and failure. */
        NDFREE(&nd, NDF_ONLY_PNBUF);
        vput(nd.ni_dvp);
        /* When leaf and parent are the same vnode, only drop the extra ref. */
        if (vp == nd.ni_dvp)
                vrele(vp);
        else
                vput(vp);
        return (error);
}
1769
1770 /*
1771  * Reposition read/write file offset.
1772  */
1773 #ifndef _SYS_SYSPROTO_H_
1774 struct lseek_args {
1775         int     fd;
1776         int     pad;
1777         off_t   offset;
1778         int     whence;
1779 };
1780 #endif
1781 int
1782 sys_lseek(td, uap)
1783         struct thread *td;
1784         register struct lseek_args /* {
1785                 int fd;
1786                 int pad;
1787                 off_t offset;
1788                 int whence;
1789         } */ *uap;
1790 {
1791         struct file *fp;
1792         cap_rights_t rights;
1793         int error;
1794
1795         AUDIT_ARG_FD(uap->fd);
1796         error = fget(td, uap->fd, cap_rights_init(&rights, CAP_SEEK), &fp);
1797         if (error != 0)
1798                 return (error);
1799         error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
1800             fo_seek(fp, uap->offset, uap->whence, td) : ESPIPE;
1801         fdrop(fp, td);
1802         return (error);
1803 }
1804
#if defined(COMPAT_43)
/*
 * Reposition read/write file offset.
 *
 * Old lseek entry point taking a `long' offset, compiled only with
 * COMPAT_43.  The arguments are repacked into a struct lseek_args and
 * handed to sys_lseek().  The `pad' member of the repacked structure is
 * left unset; sys_lseek() does not read it.
 *
 * td:  calling thread.
 * uap: system call arguments (fd, offset, whence).
 *
 * Returns whatever sys_lseek() returns.
 */
#ifndef _SYS_SYSPROTO_H_
struct olseek_args {
        int     fd;
        long    offset;
        int     whence;
};
#endif
int
olseek(struct thread *td, struct olseek_args *uap)
{
        struct lseek_args /* {
                int fd;
                int pad;
                off_t offset;
                int whence;
        } */ nuap;

        nuap.fd = uap->fd;
        nuap.offset = uap->offset;
        nuap.whence = uap->whence;
        return (sys_lseek(td, &nuap));
}
#endif /* COMPAT_43 */
1838
#if defined(COMPAT_FREEBSD6)
/*
 * Version with the 'pad' argument.
 *
 * Compiled only with COMPAT_FREEBSD6.  Copies fd, offset and whence into
 * a struct lseek_args and hands it to sys_lseek(); the `pad' member of
 * the copy is left unset, and sys_lseek() does not read it.
 *
 * td:  calling thread.
 * uap: system call arguments.
 *
 * Returns whatever sys_lseek() returns.
 */
int
freebsd6_lseek(struct thread *td, struct freebsd6_lseek_args *uap)
{
        struct lseek_args ouap;

        ouap.fd = uap->fd;
        ouap.offset = uap->offset;
        ouap.whence = uap->whence;
        return (sys_lseek(td, &ouap));
}
#endif
1854
1855 /*
1856  * Check access permissions using passed credentials.
1857  */
1858 static int
1859 vn_access(vp, user_flags, cred, td)
1860         struct vnode    *vp;
1861         int             user_flags;
1862         struct ucred    *cred;
1863         struct thread   *td;
1864 {
1865         accmode_t accmode;
1866         int error;
1867
1868         /* Flags == 0 means only check for existence. */
1869         if (user_flags == 0)
1870                 return (0);
1871
1872         accmode = 0;
1873         if (user_flags & R_OK)
1874                 accmode |= VREAD;
1875         if (user_flags & W_OK)
1876                 accmode |= VWRITE;
1877         if (user_flags & X_OK)
1878                 accmode |= VEXEC;
1879 #ifdef MAC
1880         error = mac_vnode_check_access(cred, vp, accmode);
1881         if (error != 0)
1882                 return (error);
1883 #endif
1884         if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
1885                 error = VOP_ACCESS(vp, accmode, cred, td);
1886         return (error);
1887 }
1888
1889 /*
1890  * Check access permissions using "real" credentials.
1891  */
1892 #ifndef _SYS_SYSPROTO_H_
1893 struct access_args {
1894         char    *path;
1895         int     amode;
1896 };
1897 #endif
1898 int
1899 sys_access(td, uap)
1900         struct thread *td;
1901         register struct access_args /* {
1902                 char *path;
1903                 int amode;
1904         } */ *uap;
1905 {
1906
1907         return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1908             0, uap->amode));
1909 }
1910
1911 #ifndef _SYS_SYSPROTO_H_
1912 struct faccessat_args {
1913         int     dirfd;
1914         char    *path;
1915         int     amode;
1916         int     flag;
1917 }
1918 #endif
1919 int
1920 sys_faccessat(struct thread *td, struct faccessat_args *uap)
1921 {
1922
1923         return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1924             uap->amode));
1925 }
1926
1927 int
1928 kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1929     int flag, int amode)
1930 {
1931         struct ucred *cred, *usecred;
1932         struct vnode *vp;
1933         struct nameidata nd;
1934         cap_rights_t rights;
1935         int error;
1936
1937         if (flag & ~AT_EACCESS)
1938                 return (EINVAL);
1939         if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0)
1940                 return (EINVAL);
1941
1942         /*
1943          * Create and modify a temporary credential instead of one that
1944          * is potentially shared (if we need one).
1945          */
1946         cred = td->td_ucred;
1947         if ((flag & AT_EACCESS) == 0 &&
1948             ((cred->cr_uid != cred->cr_ruid ||
1949             cred->cr_rgid != cred->cr_groups[0]))) {
1950                 usecred = crdup(cred);
1951                 usecred->cr_uid = cred->cr_ruid;
1952                 usecred->cr_groups[0] = cred->cr_rgid;
1953                 td->td_ucred = usecred;
1954         } else
1955                 usecred = cred;
1956         AUDIT_ARG_VALUE(amode);
1957         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
1958             AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FSTAT),
1959             td);
1960         if ((error = namei(&nd)) != 0)
1961                 goto out;
1962         vp = nd.ni_vp;
1963
1964         error = vn_access(vp, amode, usecred, td);
1965         NDFREE(&nd, NDF_ONLY_PNBUF);
1966         vput(vp);
1967 out:
1968         if (usecred != cred) {
1969                 td->td_ucred = cred;
1970                 crfree(usecred);
1971         }
1972         return (error);
1973 }
1974
1975 /*
1976  * Check access permissions using "effective" credentials.
1977  */
1978 #ifndef _SYS_SYSPROTO_H_
1979 struct eaccess_args {
1980         char    *path;
1981         int     amode;
1982 };
1983 #endif
1984 int
1985 sys_eaccess(td, uap)
1986         struct thread *td;
1987         register struct eaccess_args /* {
1988                 char *path;
1989                 int amode;
1990         } */ *uap;
1991 {
1992
1993         return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1994             AT_EACCESS, uap->amode));
1995 }
1996
1997 #if defined(COMPAT_43)
1998 /*
1999  * Get file status; this version follows links.
2000  */
2001 #ifndef _SYS_SYSPROTO_H_
2002 struct ostat_args {
2003         char    *path;
2004         struct ostat *ub;
2005 };
2006 #endif
2007 int
2008 ostat(td, uap)
2009         struct thread *td;
2010         register struct ostat_args /* {
2011                 char *path;
2012                 struct ostat *ub;
2013         } */ *uap;
2014 {
2015         struct stat sb;
2016         struct ostat osb;
2017         int error;
2018
2019         error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
2020             &sb, NULL);
2021         if (error != 0)
2022                 return (error);
2023         cvtstat(&sb, &osb);
2024         return (copyout(&osb, uap->ub, sizeof (osb)));
2025 }
2026
2027 /*
2028  * Get file status; this version does not follow links.
2029  */
2030 #ifndef _SYS_SYSPROTO_H_
2031 struct olstat_args {
2032         char    *path;
2033         struct ostat *ub;
2034 };
2035 #endif
2036 int
2037 olstat(td, uap)
2038         struct thread *td;
2039         register struct olstat_args /* {
2040                 char *path;
2041                 struct ostat *ub;
2042         } */ *uap;
2043 {
2044         struct stat sb;
2045         struct ostat osb;
2046         int error;
2047
2048         error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
2049             UIO_USERSPACE, &sb, NULL);
2050         if (error != 0)
2051                 return (error);
2052         cvtstat(&sb, &osb);
2053         return (copyout(&osb, uap->ub, sizeof (osb)));
2054 }
2055
2056 /*
2057  * Convert from an old to a new stat structure.
2058  */
2059 void
2060 cvtstat(st, ost)
2061         struct stat *st;
2062         struct ostat *ost;
2063 {
2064
2065         bzero(ost, sizeof(*ost));
2066         ost->st_dev = st->st_dev;
2067         ost->st_ino = st->st_ino;
2068         ost->st_mode = st->st_mode;
2069         ost->st_nlink = st->st_nlink;
2070         ost->st_uid = st->st_uid;
2071         ost->st_gid = st->st_gid;
2072         ost->st_rdev = st->st_rdev;
2073         if (st->st_size < (quad_t)1 << 32)
2074                 ost->st_size = st->st_size;
2075         else
2076                 ost->st_size = -2;
2077         ost->st_atim = st->st_atim;
2078         ost->st_mtim = st->st_mtim;
2079         ost->st_ctim = st->st_ctim;
2080         ost->st_blksize = st->st_blksize;
2081         ost->st_blocks = st->st_blocks;
2082         ost->st_flags = st->st_flags;
2083         ost->st_gen = st->st_gen;
2084 }
2085 #endif /* COMPAT_43 */
2086
2087 /*
2088  * Get file status; this version follows links.
2089  */
2090 #ifndef _SYS_SYSPROTO_H_
2091 struct stat_args {
2092         char    *path;
2093         struct stat *ub;
2094 };
2095 #endif
2096 int
2097 sys_stat(td, uap)
2098         struct thread *td;
2099         register struct stat_args /* {
2100                 char *path;
2101                 struct stat *ub;
2102         } */ *uap;
2103 {
2104         struct stat sb;
2105         int error;
2106
2107         error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
2108             &sb, NULL);
2109         if (error == 0)
2110                 error = copyout(&sb, uap->ub, sizeof (sb));
2111         return (error);
2112 }
2113
2114 #ifndef _SYS_SYSPROTO_H_
2115 struct fstatat_args {
2116         int     fd;
2117         char    *path;
2118         struct stat     *buf;
2119         int     flag;
2120 }
2121 #endif
2122 int
2123 sys_fstatat(struct thread *td, struct fstatat_args *uap)
2124 {
2125         struct stat sb;
2126         int error;
2127
2128         error = kern_statat(td, uap->flag, uap->fd, uap->path,
2129             UIO_USERSPACE, &sb, NULL);
2130         if (error == 0)
2131                 error = copyout(&sb, uap->buf, sizeof (sb));
2132         return (error);
2133 }
2134
2135 int
2136 kern_statat(struct thread *td, int flag, int fd, char *path,
2137     enum uio_seg pathseg, struct stat *sbp,
2138     void (*hook)(struct vnode *vp, struct stat *sbp))
2139 {
2140         struct nameidata nd;
2141         struct stat sb;
2142         cap_rights_t rights;
2143         int error;
2144
2145         if (flag & ~AT_SYMLINK_NOFOLLOW)
2146                 return (EINVAL);
2147
2148         NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2149             FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd,
2150             cap_rights_init(&rights, CAP_FSTAT), td);
2151
2152         if ((error = namei(&nd)) != 0)
2153                 return (error);
2154         error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
2155         if (error == 0) {
2156                 SDT_PROBE2(vfs, , stat, mode, path, sb.st_mode);
2157                 if (S_ISREG(sb.st_mode))
2158                         SDT_PROBE2(vfs, , stat, reg, path, pathseg);
2159                 if (__predict_false(hook != NULL))
2160                         hook(nd.ni_vp, &sb);
2161         }
2162         NDFREE(&nd, NDF_ONLY_PNBUF);
2163         vput(nd.ni_vp);
2164         if (error != 0)
2165                 return (error);
2166         *sbp = sb;
2167 #ifdef KTRACE
2168         if (KTRPOINT(td, KTR_STRUCT))
2169                 ktrstat(&sb);
2170 #endif
2171         return (0);
2172 }
2173
2174 /*
2175  * Get file status; this version does not follow links.
2176  */
2177 #ifndef _SYS_SYSPROTO_H_
2178 struct lstat_args {
2179         char    *path;
2180         struct stat *ub;
2181 };
2182 #endif
2183 int
2184 sys_lstat(td, uap)
2185         struct thread *td;
2186         register struct lstat_args /* {
2187                 char *path;
2188                 struct stat *ub;
2189         } */ *uap;
2190 {
2191         struct stat sb;
2192         int error;
2193
2194         error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
2195             UIO_USERSPACE, &sb, NULL);
2196         if (error == 0)
2197                 error = copyout(&sb, uap->ub, sizeof (sb));
2198         return (error);
2199 }
2200
2201 /*
2202  * Implementation of the NetBSD [l]stat() functions.
2203  */
2204 void
2205 cvtnstat(sb, nsb)
2206         struct stat *sb;
2207         struct nstat *nsb;
2208 {
2209
2210         bzero(nsb, sizeof *nsb);
2211         nsb->st_dev = sb->st_dev;
2212         nsb->st_ino = sb->st_ino;
2213         nsb->st_mode = sb->st_mode;
2214         nsb->st_nlink = sb->st_nlink;
2215         nsb->st_uid = sb->st_uid;
2216         nsb->st_gid = sb->st_gid;
2217         nsb->st_rdev = sb->st_rdev;
2218         nsb->st_atim = sb->st_atim;
2219         nsb->st_mtim = sb->st_mtim;
2220         nsb->st_ctim = sb->st_ctim;
2221         nsb->st_size = sb->st_size;
2222         nsb->st_blocks = sb->st_blocks;
2223         nsb->st_blksize = sb->st_blksize;
2224         nsb->st_flags = sb->st_flags;
2225         nsb->st_gen = sb->st_gen;
2226         nsb->st_birthtim = sb->st_birthtim;
2227 }
2228
2229 #ifndef _SYS_SYSPROTO_H_
2230 struct nstat_args {
2231         char    *path;
2232         struct nstat *ub;
2233 };
2234 #endif
2235 int
2236 sys_nstat(td, uap)
2237         struct thread *td;
2238         register struct nstat_args /* {
2239                 char *path;
2240                 struct nstat *ub;
2241         } */ *uap;
2242 {
2243         struct stat sb;
2244         struct nstat nsb;
2245         int error;
2246
2247         error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
2248             &sb, NULL);
2249         if (error != 0)
2250                 return (error);
2251         cvtnstat(&sb, &nsb);
2252         return (copyout(&nsb, uap->ub, sizeof (nsb)));
2253 }
2254
2255 /*
2256  * NetBSD lstat.  Get file status; this version does not follow links.
2257  */
2258 #ifndef _SYS_SYSPROTO_H_
2259 struct lstat_args {
2260         char    *path;
2261         struct stat *ub;
2262 };
2263 #endif
2264 int
2265 sys_nlstat(td, uap)
2266         struct thread *td;
2267         register struct nlstat_args /* {
2268                 char *path;
2269                 struct nstat *ub;
2270         } */ *uap;
2271 {
2272         struct stat sb;
2273         struct nstat nsb;
2274         int error;
2275
2276         error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
2277             UIO_USERSPACE, &sb, NULL);
2278         if (error != 0)
2279                 return (error);
2280         cvtnstat(&sb, &nsb);
2281         return (copyout(&nsb, uap->ub, sizeof (nsb)));
2282 }
2283
2284 /*
2285  * Get configurable pathname variables.
2286  */
2287 #ifndef _SYS_SYSPROTO_H_
2288 struct pathconf_args {
2289         char    *path;
2290         int     name;
2291 };
2292 #endif
2293 int
2294 sys_pathconf(td, uap)
2295         struct thread *td;
2296         register struct pathconf_args /* {
2297                 char *path;
2298                 int name;
2299         } */ *uap;
2300 {
2301
2302         return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
2303 }
2304
2305 #ifndef _SYS_SYSPROTO_H_
2306 struct lpathconf_args {
2307         char    *path;
2308         int     name;
2309 };
2310 #endif
2311 int
2312 sys_lpathconf(td, uap)
2313         struct thread *td;
2314         register struct lpathconf_args /* {
2315                 char *path;
2316                 int name;
2317         } */ *uap;
2318 {
2319
2320         return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
2321             NOFOLLOW));
2322 }
2323
2324 int
2325 kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
2326     u_long flags)
2327 {
2328         struct nameidata nd;
2329         int error;
2330
2331         NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
2332             pathseg, path, td);
2333         if ((error = namei(&nd)) != 0)
2334                 return (error);
2335         NDFREE(&nd, NDF_ONLY_PNBUF);
2336
2337         error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
2338         vput(nd.ni_vp);
2339         return (error);
2340 }
2341
2342 /*
2343  * Return target name of a symbolic link.
2344  */
2345 #ifndef _SYS_SYSPROTO_H_
2346 struct readlink_args {
2347         char    *path;
2348         char    *buf;
2349         size_t  count;
2350 };
2351 #endif
2352 int
2353 sys_readlink(td, uap)
2354         struct thread *td;
2355         register struct readlink_args /* {
2356                 char *path;
2357                 char *buf;
2358                 size_t count;
2359         } */ *uap;
2360 {
2361
2362         return (kern_readlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2363             uap->buf, UIO_USERSPACE, uap->count));
2364 }
2365 #ifndef _SYS_SYSPROTO_H_
2366 struct readlinkat_args {
2367         int     fd;
2368         char    *path;
2369         char    *buf;
2370         size_t  bufsize;
2371 };
2372 #endif
2373 int
2374 sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2375 {
2376
2377         return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2378             uap->buf, UIO_USERSPACE, uap->bufsize));
2379 }
2380
2381 int
2382 kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2383     char *buf, enum uio_seg bufseg, size_t count)
2384 {
2385         struct vnode *vp;
2386         struct iovec aiov;
2387         struct uio auio;
2388         struct nameidata nd;
2389         int error;
2390
2391         if (count > IOSIZE_MAX)
2392                 return (EINVAL);
2393
2394         NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
2395             pathseg, path, fd, td);
2396
2397         if ((error = namei(&nd)) != 0)
2398                 return (error);
2399         NDFREE(&nd, NDF_ONLY_PNBUF);
2400         vp = nd.ni_vp;
2401 #ifdef MAC
2402         error = mac_vnode_check_readlink(td->td_ucred, vp);
2403         if (error != 0) {
2404                 vput(vp);
2405                 return (error);
2406         }
2407 #endif
2408         if (vp->v_type != VLNK)
2409                 error = EINVAL;
2410         else {
2411                 aiov.iov_base = buf;
2412                 aiov.iov_len = count;
2413                 auio.uio_iov = &aiov;
2414                 auio.uio_iovcnt = 1;
2415                 auio.uio_offset = 0;
2416                 auio.uio_rw = UIO_READ;
2417                 auio.uio_segflg = bufseg;
2418                 auio.uio_td = td;
2419                 auio.uio_resid = count;
2420                 error = VOP_READLINK(vp, &auio, td->td_ucred);
2421                 td->td_retval[0] = count - auio.uio_resid;
2422         }
2423         vput(vp);
2424         return (error);
2425 }
2426
2427 /*
2428  * Common implementation code for chflags() and fchflags().
2429  */
2430 static int
2431 setfflags(td, vp, flags)
2432         struct thread *td;
2433         struct vnode *vp;
2434         u_long flags;
2435 {
2436         struct mount *mp;
2437         struct vattr vattr;
2438         int error;
2439
2440         /* We can't support the value matching VNOVAL. */
2441         if (flags == VNOVAL)
2442                 return (EOPNOTSUPP);
2443
2444         /*
2445          * Prevent non-root users from setting flags on devices.  When
2446          * a device is reused, users can retain ownership of the device
2447          * if they are allowed to set flags and programs assume that
2448          * chown can't fail when done as root.
2449          */
2450         if (vp->v_type == VCHR || vp->v_type == VBLK) {
2451                 error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2452                 if (error != 0)
2453                         return (error);
2454         }
2455
2456         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2457                 return (error);
2458         VATTR_NULL(&vattr);
2459         vattr.va_flags = flags;
2460         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2461 #ifdef MAC
2462         error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2463         if (error == 0)
2464 #endif
2465                 error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2466         VOP_UNLOCK(vp, 0);
2467         vn_finished_write(mp);
2468         return (error);
2469 }
2470
2471 /*
2472  * Change flags of a file given a path name.
2473  */
2474 #ifndef _SYS_SYSPROTO_H_
2475 struct chflags_args {
2476         const char *path;
2477         u_long  flags;
2478 };
2479 #endif
2480 int
2481 sys_chflags(td, uap)
2482         struct thread *td;
2483         register struct chflags_args /* {
2484                 const char *path;
2485                 u_long flags;
2486         } */ *uap;
2487 {
2488
2489         return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2490             uap->flags, 0));
2491 }
2492
2493 #ifndef _SYS_SYSPROTO_H_
2494 struct chflagsat_args {
2495         int     fd;
2496         const char *path;
2497         u_long  flags;
2498         int     atflag;
2499 }
2500 #endif
2501 int
2502 sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
2503 {
2504         int fd = uap->fd;
2505         const char *path = uap->path;
2506         u_long flags = uap->flags;
2507         int atflag = uap->atflag;
2508
2509         if (atflag & ~AT_SYMLINK_NOFOLLOW)
2510                 return (EINVAL);
2511
2512         return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag));
2513 }
2514
2515 /*
2516  * Same as chflags() but doesn't follow symlinks.
2517  */
2518 int
2519 sys_lchflags(td, uap)
2520         struct thread *td;
2521         register struct lchflags_args /* {
2522                 const char *path;
2523                 u_long flags;
2524         } */ *uap;
2525 {
2526
2527         return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2528             uap->flags, AT_SYMLINK_NOFOLLOW));
2529 }
2530
2531 static int
2532 kern_chflagsat(struct thread *td, int fd, const char *path,
2533     enum uio_seg pathseg, u_long flags, int atflag)
2534 {
2535         struct nameidata nd;
2536         cap_rights_t rights;
2537         int error, follow;
2538
2539         AUDIT_ARG_FFLAGS(flags);
2540         follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2541         NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2542             cap_rights_init(&rights, CAP_FCHFLAGS), td);
2543         if ((error = namei(&nd)) != 0)
2544                 return (error);
2545         NDFREE(&nd, NDF_ONLY_PNBUF);
2546         error = setfflags(td, nd.ni_vp, flags);
2547         vrele(nd.ni_vp);
2548         return (error);
2549 }
2550
2551 /*
2552  * Change flags of a file given a file descriptor.
2553  */
2554 #ifndef _SYS_SYSPROTO_H_
2555 struct fchflags_args {
2556         int     fd;
2557         u_long  flags;
2558 };
2559 #endif
2560 int
2561 sys_fchflags(td, uap)
2562         struct thread *td;
2563         register struct fchflags_args /* {
2564                 int fd;
2565                 u_long flags;
2566         } */ *uap;
2567 {
2568         struct file *fp;
2569         cap_rights_t rights;
2570         int error;
2571
2572         AUDIT_ARG_FD(uap->fd);
2573         AUDIT_ARG_FFLAGS(uap->flags);
2574         error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_FCHFLAGS),
2575             &fp);
2576         if (error != 0)
2577                 return (error);
2578 #ifdef AUDIT
2579         vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2580         AUDIT_ARG_VNODE1(fp->f_vnode);
2581         VOP_UNLOCK(fp->f_vnode, 0);
2582 #endif
2583         error = setfflags(td, fp->f_vnode, uap->flags);
2584         fdrop(fp, td);
2585         return (error);
2586 }
2587
2588 /*
2589  * Common implementation code for chmod(), lchmod() and fchmod().
2590  */
2591 int
2592 setfmode(td, cred, vp, mode)
2593         struct thread *td;
2594         struct ucred *cred;
2595         struct vnode *vp;
2596         int mode;
2597 {
2598         struct mount *mp;
2599         struct vattr vattr;
2600         int error;
2601
2602         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2603                 return (error);
2604         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2605         VATTR_NULL(&vattr);
2606         vattr.va_mode = mode & ALLPERMS;
2607 #ifdef MAC
2608         error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2609         if (error == 0)
2610 #endif
2611                 error = VOP_SETATTR(vp, &vattr, cred);
2612         VOP_UNLOCK(vp, 0);
2613         vn_finished_write(mp);
2614         return (error);
2615 }
2616
2617 /*
2618  * Change mode of a file given path name.
2619  */
2620 #ifndef _SYS_SYSPROTO_H_
2621 struct chmod_args {
2622         char    *path;
2623         int     mode;
2624 };
2625 #endif
2626 int
2627 sys_chmod(td, uap)
2628         struct thread *td;
2629         register struct chmod_args /* {
2630                 char *path;
2631                 int mode;
2632         } */ *uap;
2633 {
2634
2635         return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2636             uap->mode, 0));
2637 }
2638
2639 #ifndef _SYS_SYSPROTO_H_
2640 struct fchmodat_args {
2641         int     dirfd;
2642         char    *path;
2643         mode_t  mode;
2644         int     flag;
2645 }
2646 #endif
2647 int
2648 sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2649 {
2650         int flag = uap->flag;
2651         int fd = uap->fd;
2652         char *path = uap->path;
2653         mode_t mode = uap->mode;
2654
2655         if (flag & ~AT_SYMLINK_NOFOLLOW)
2656                 return (EINVAL);
2657
2658         return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
2659 }
2660
2661 /*
2662  * Change mode of a file given path name (don't follow links.)
2663  */
2664 #ifndef _SYS_SYSPROTO_H_
2665 struct lchmod_args {
2666         char    *path;
2667         int     mode;
2668 };
2669 #endif
2670 int
2671 sys_lchmod(td, uap)
2672         struct thread *td;
2673         register struct lchmod_args /* {
2674                 char *path;
2675                 int mode;
2676         } */ *uap;
2677 {
2678
2679         return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2680             uap->mode, AT_SYMLINK_NOFOLLOW));
2681 }
2682
2683 int
2684 kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2685     mode_t mode, int flag)
2686 {
2687         struct nameidata nd;
2688         cap_rights_t rights;
2689         int error, follow;
2690
2691         AUDIT_ARG_MODE(mode);
2692         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2693         NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2694             cap_rights_init(&rights, CAP_FCHMOD), td);
2695         if ((error = namei(&nd)) != 0)
2696                 return (error);
2697         NDFREE(&nd, NDF_ONLY_PNBUF);
2698         error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
2699         vrele(nd.ni_vp);
2700         return (error);
2701 }
2702
2703 /*
2704  * Change mode of a file given a file descriptor.
2705  */
2706 #ifndef _SYS_SYSPROTO_H_
2707 struct fchmod_args {
2708         int     fd;
2709         int     mode;
2710 };
2711 #endif
2712 int
2713 sys_fchmod(struct thread *td, struct fchmod_args *uap)
2714 {
2715         struct file *fp;
2716         cap_rights_t rights;
2717         int error;
2718
2719         AUDIT_ARG_FD(uap->fd);
2720         AUDIT_ARG_MODE(uap->mode);
2721
2722         error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHMOD), &fp);
2723         if (error != 0)
2724                 return (error);
2725         error = fo_chmod(fp, uap->mode, td->td_ucred, td);
2726         fdrop(fp, td);
2727         return (error);
2728 }
2729
2730 /*
2731  * Common implementation for chown(), lchown(), and fchown()
2732  */
2733 int
2734 setfown(td, cred, vp, uid, gid)
2735         struct thread *td;
2736         struct ucred *cred;
2737         struct vnode *vp;
2738         uid_t uid;
2739         gid_t gid;
2740 {
2741         struct mount *mp;
2742         struct vattr vattr;
2743         int error;
2744
2745         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2746                 return (error);
2747         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2748         VATTR_NULL(&vattr);
2749         vattr.va_uid = uid;
2750         vattr.va_gid = gid;
2751 #ifdef MAC
2752         error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
2753             vattr.va_gid);
2754         if (error == 0)
2755 #endif
2756                 error = VOP_SETATTR(vp, &vattr, cred);
2757         VOP_UNLOCK(vp, 0);
2758         vn_finished_write(mp);
2759         return (error);
2760 }
2761
2762 /*
2763  * Set ownership given a path name.
2764  */
2765 #ifndef _SYS_SYSPROTO_H_
2766 struct chown_args {
2767         char    *path;
2768         int     uid;
2769         int     gid;
2770 };
2771 #endif
2772 int
2773 sys_chown(td, uap)
2774         struct thread *td;
2775         register struct chown_args /* {
2776                 char *path;
2777                 int uid;
2778                 int gid;
2779         } */ *uap;
2780 {
2781
2782         return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid,
2783             uap->gid, 0));
2784 }
2785
2786 #ifndef _SYS_SYSPROTO_H_
2787 struct fchownat_args {
2788         int fd;
2789         const char * path;
2790         uid_t uid;
2791         gid_t gid;
2792         int flag;
2793 };
2794 #endif
2795 int
2796 sys_fchownat(struct thread *td, struct fchownat_args *uap)
2797 {
2798         int flag;
2799
2800         flag = uap->flag;
2801         if (flag & ~AT_SYMLINK_NOFOLLOW)
2802                 return (EINVAL);
2803
2804         return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
2805             uap->gid, uap->flag));
2806 }
2807
2808 int
2809 kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2810     int uid, int gid, int flag)
2811 {
2812         struct nameidata nd;
2813         cap_rights_t rights;
2814         int error, follow;
2815
2816         AUDIT_ARG_OWNER(uid, gid);
2817         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2818         NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2819             cap_rights_init(&rights, CAP_FCHOWN), td);
2820
2821         if ((error = namei(&nd)) != 0)
2822                 return (error);
2823         NDFREE(&nd, NDF_ONLY_PNBUF);
2824         error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
2825         vrele(nd.ni_vp);
2826         return (error);
2827 }
2828
2829 /*
2830  * Set ownership given a path name, do not cross symlinks.
2831  */
2832 #ifndef _SYS_SYSPROTO_H_
2833 struct lchown_args {
2834         char    *path;
2835         int     uid;
2836         int     gid;
2837 };
2838 #endif
2839 int
2840 sys_lchown(td, uap)
2841         struct thread *td;
2842         register struct lchown_args /* {
2843                 char *path;
2844                 int uid;
2845                 int gid;
2846         } */ *uap;
2847 {
2848
2849         return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2850             uap->uid, uap->gid, AT_SYMLINK_NOFOLLOW));
2851 }
2852
2853 /*
2854  * Set ownership given a file descriptor.
2855  */
2856 #ifndef _SYS_SYSPROTO_H_
2857 struct fchown_args {
2858         int     fd;
2859         int     uid;
2860         int     gid;
2861 };
2862 #endif
2863 int
2864 sys_fchown(td, uap)
2865         struct thread *td;
2866         register struct fchown_args /* {
2867                 int fd;
2868                 int uid;
2869                 int gid;
2870         } */ *uap;
2871 {
2872         struct file *fp;
2873         cap_rights_t rights;
2874         int error;
2875
2876         AUDIT_ARG_FD(uap->fd);
2877         AUDIT_ARG_OWNER(uap->uid, uap->gid);
2878         error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHOWN), &fp);
2879         if (error != 0)
2880                 return (error);
2881         error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
2882         fdrop(fp, td);
2883         return (error);
2884 }
2885
2886 /*
2887  * Common implementation code for utimes(), lutimes(), and futimes().
2888  */
2889 static int
2890 getutimes(usrtvp, tvpseg, tsp)
2891         const struct timeval *usrtvp;
2892         enum uio_seg tvpseg;
2893         struct timespec *tsp;
2894 {
2895         struct timeval tv[2];
2896         const struct timeval *tvp;
2897         int error;
2898
2899         if (usrtvp == NULL) {
2900                 vfs_timestamp(&tsp[0]);
2901                 tsp[1] = tsp[0];
2902         } else {
2903                 if (tvpseg == UIO_SYSSPACE) {
2904                         tvp = usrtvp;
2905                 } else {
2906                         if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
2907                                 return (error);
2908                         tvp = tv;
2909                 }
2910
2911                 if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
2912                     tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
2913                         return (EINVAL);
2914                 TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
2915                 TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
2916         }
2917         return (0);
2918 }
2919
2920 /*
2921  * Common implementation code for futimens(), utimensat().
2922  */
2923 #define UTIMENS_NULL    0x1
2924 #define UTIMENS_EXIT    0x2
2925 static int
2926 getutimens(const struct timespec *usrtsp, enum uio_seg tspseg,
2927     struct timespec *tsp, int *retflags)
2928 {
2929         struct timespec tsnow;
2930         int error;
2931
2932         vfs_timestamp(&tsnow);
2933         *retflags = 0;
2934         if (usrtsp == NULL) {
2935                 tsp[0] = tsnow;
2936                 tsp[1] = tsnow;
2937                 *retflags |= UTIMENS_NULL;
2938                 return (0);
2939         }
2940         if (tspseg == UIO_SYSSPACE) {
2941                 tsp[0] = usrtsp[0];
2942                 tsp[1] = usrtsp[1];
2943         } else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0)
2944                 return (error);
2945         if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT)
2946                 *retflags |= UTIMENS_EXIT;
2947         if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW)
2948                 *retflags |= UTIMENS_NULL;
2949         if (tsp[0].tv_nsec == UTIME_OMIT)
2950                 tsp[0].tv_sec = VNOVAL;
2951         else if (tsp[0].tv_nsec == UTIME_NOW)
2952                 tsp[0] = tsnow;
2953         else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L)
2954                 return (EINVAL);
2955         if (tsp[1].tv_nsec == UTIME_OMIT)
2956                 tsp[1].tv_sec = VNOVAL;
2957         else if (tsp[1].tv_nsec == UTIME_NOW)
2958                 tsp[1] = tsnow;
2959         else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L)
2960                 return (EINVAL);
2961
2962         return (0);
2963 }
2964
2965 /*
2966  * Common implementation code for utimes(), lutimes(), futimes(), futimens(),
2967  * and utimensat().
2968  */
2969 static int
2970 setutimes(td, vp, ts, numtimes, nullflag)
2971         struct thread *td;
2972         struct vnode *vp;
2973         const struct timespec *ts;
2974         int numtimes;
2975         int nullflag;
2976 {
2977         struct mount *mp;
2978         struct vattr vattr;
2979         int error, setbirthtime;
2980
2981         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2982                 return (error);
2983         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2984         setbirthtime = 0;
2985         if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
2986             timespeccmp(&ts[1], &vattr.va_birthtime, < ))
2987                 setbirthtime = 1;
2988         VATTR_NULL(&vattr);
2989         vattr.va_atime = ts[0];
2990         vattr.va_mtime = ts[1];
2991         if (setbirthtime)
2992                 vattr.va_birthtime = ts[1];
2993         if (numtimes > 2)
2994                 vattr.va_birthtime = ts[2];
2995         if (nullflag)
2996                 vattr.va_vaflags |= VA_UTIMES_NULL;
2997 #ifdef MAC
2998         error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
2999             vattr.va_mtime);
3000 #endif
3001         if (error == 0)
3002                 error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3003         VOP_UNLOCK(vp, 0);
3004         vn_finished_write(mp);
3005         return (error);
3006 }
3007
3008 /*
3009  * Set the access and modification times of a file.
3010  */
3011 #ifndef _SYS_SYSPROTO_H_
3012 struct utimes_args {
3013         char    *path;
3014         struct  timeval *tptr;
3015 };
3016 #endif
3017 int
3018 sys_utimes(td, uap)
3019         struct thread *td;
3020         register struct utimes_args /* {
3021                 char *path;
3022                 struct timeval *tptr;
3023         } */ *uap;
3024 {
3025
3026         return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
3027             uap->tptr, UIO_USERSPACE));
3028 }
3029
3030 #ifndef _SYS_SYSPROTO_H_
3031 struct futimesat_args {
3032         int fd;
3033         const char * path;
3034         const struct timeval * times;
3035 };
3036 #endif
3037 int
3038 sys_futimesat(struct thread *td, struct futimesat_args *uap)
3039 {
3040
3041         return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
3042             uap->times, UIO_USERSPACE));
3043 }
3044
3045 int
3046 kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3047     struct timeval *tptr, enum uio_seg tptrseg)
3048 {
3049         struct nameidata nd;
3050         struct timespec ts[2];
3051         cap_rights_t rights;
3052         int error;
3053
3054         if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3055                 return (error);
3056         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
3057             cap_rights_init(&rights, CAP_FUTIMES), td);
3058
3059         if ((error = namei(&nd)) != 0)
3060                 return (error);
3061         NDFREE(&nd, NDF_ONLY_PNBUF);
3062         error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3063         vrele(nd.ni_vp);
3064         return (error);
3065 }
3066
3067 /*
3068  * Set the access and modification times of a file.
3069  */
3070 #ifndef _SYS_SYSPROTO_H_
3071 struct lutimes_args {
3072         char    *path;
3073         struct  timeval *tptr;
3074 };
3075 #endif
3076 int
3077 sys_lutimes(td, uap)
3078         struct thread *td;
3079         register struct lutimes_args /* {
3080                 char *path;
3081                 struct timeval *tptr;
3082         } */ *uap;
3083 {
3084
3085         return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3086             UIO_USERSPACE));
3087 }
3088
3089 int
3090 kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
3091     struct timeval *tptr, enum uio_seg tptrseg)
3092 {
3093         struct timespec ts[2];
3094         struct nameidata nd;
3095         int error;
3096
3097         if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3098                 return (error);
3099         NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
3100         if ((error = namei(&nd)) != 0)
3101                 return (error);
3102         NDFREE(&nd, NDF_ONLY_PNBUF);
3103         error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3104         vrele(nd.ni_vp);
3105         return (error);
3106 }
3107
3108 /*
3109  * Set the access and modification times of a file.
3110  */
3111 #ifndef _SYS_SYSPROTO_H_
3112 struct futimes_args {
3113         int     fd;
3114         struct  timeval *tptr;
3115 };
3116 #endif
3117 int
3118 sys_futimes(td, uap)
3119         struct thread *td;
3120         register struct futimes_args /* {
3121                 int  fd;
3122                 struct timeval *tptr;
3123         } */ *uap;
3124 {
3125
3126         return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
3127 }
3128
3129 int
3130 kern_futimes(struct thread *td, int fd, struct timeval *tptr,
3131     enum uio_seg tptrseg)
3132 {
3133         struct timespec ts[2];
3134         struct file *fp;
3135         cap_rights_t rights;
3136         int error;
3137
3138         AUDIT_ARG_FD(fd);
3139         error = getutimes(tptr, tptrseg, ts);
3140         if (error != 0)
3141                 return (error);
3142         error = getvnode(td, fd, cap_rights_init(&rights, CAP_FUTIMES), &fp);
3143         if (error != 0)
3144                 return (error);
3145 #ifdef AUDIT
3146         vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3147         AUDIT_ARG_VNODE1(fp->f_vnode);
3148         VOP_UNLOCK(fp->f_vnode, 0);
3149 #endif
3150         error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
3151         fdrop(fp, td);
3152         return (error);
3153 }
3154
3155 int
3156 sys_futimens(struct thread *td, struct futimens_args *uap)
3157 {
3158
3159         return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE));
3160 }
3161
3162 int
3163 kern_futimens(struct thread *td, int fd, struct timespec *tptr,
3164     enum uio_seg tptrseg)
3165 {
3166         struct timespec ts[2];
3167         struct file *fp;
3168         cap_rights_t rights;
3169         int error, flags;
3170
3171         AUDIT_ARG_FD(fd);
3172         error = getutimens(tptr, tptrseg, ts, &flags);
3173         if (error != 0)
3174                 return (error);
3175         if (flags & UTIMENS_EXIT)
3176                 return (0);
3177         error = getvnode(td, fd, cap_rights_init(&rights, CAP_FUTIMES), &fp);
3178         if (error != 0)
3179                 return (error);
3180 #ifdef AUDIT
3181         vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3182         AUDIT_ARG_VNODE1(fp->f_vnode);
3183         VOP_UNLOCK(fp->f_vnode, 0);
3184 #endif
3185         error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL);
3186         fdrop(fp, td);
3187         return (error);
3188 }
3189
3190 int
3191 sys_utimensat(struct thread *td, struct utimensat_args *uap)
3192 {
3193
3194         return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE,
3195             uap->times, UIO_USERSPACE, uap->flag));
3196 }
3197
3198 int
3199 kern_utimensat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3200     struct timespec *tptr, enum uio_seg tptrseg, int flag)
3201 {
3202         struct nameidata nd;
3203         struct timespec ts[2];
3204         cap_rights_t rights;
3205         int error, flags;
3206
3207         if (flag & ~AT_SYMLINK_NOFOLLOW)
3208                 return (EINVAL);
3209
3210         if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0)
3211                 return (error);
3212         NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
3213             FOLLOW) | AUDITVNODE1, pathseg, path, fd,
3214             cap_rights_init(&rights, CAP_FUTIMES), td);
3215         if ((error = namei(&nd)) != 0)
3216                 return (error);
3217         /*
3218          * We are allowed to call namei() regardless of 2xUTIME_OMIT.
3219          * POSIX states:
3220          * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected."
3221          * "Search permission is denied by a component of the path prefix."
3222          */
3223         NDFREE(&nd, NDF_ONLY_PNBUF);
3224         if ((flags & UTIMENS_EXIT) == 0)
3225                 error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL);
3226         vrele(nd.ni_vp);
3227         return (error);
3228 }
3229
3230 /*
3231  * Truncate a file given its path name.
3232  */
3233 #ifndef _SYS_SYSPROTO_H_
3234 struct truncate_args {
3235         char    *path;
3236         int     pad;
3237         off_t   length;
3238 };
3239 #endif
3240 int
3241 sys_truncate(td, uap)
3242         struct thread *td;
3243         register struct truncate_args /* {
3244                 char *path;
3245                 int pad;
3246                 off_t length;
3247         } */ *uap;
3248 {
3249
3250         return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3251 }
3252
3253 int
3254 kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
3255 {
3256         struct mount *mp;
3257         struct vnode *vp;
3258         void *rl_cookie;
3259         struct vattr vattr;
3260         struct nameidata nd;
3261         int error;
3262
3263         if (length < 0)
3264                 return(EINVAL);
3265         NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
3266         if ((error = namei(&nd)) != 0)
3267                 return (error);
3268         vp = nd.ni_vp;
3269         rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
3270         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
3271                 vn_rangelock_unlock(vp, rl_cookie);
3272                 vrele(vp);
3273                 return (error);
3274         }
3275         NDFREE(&nd, NDF_ONLY_PNBUF);
3276         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3277         if (vp->v_type == VDIR)
3278                 error = EISDIR;
3279 #ifdef MAC
3280         else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
3281         }
3282 #endif
3283         else if ((error = vn_writechk(vp)) == 0 &&
3284             (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
3285                 VATTR_NULL(&vattr);
3286                 vattr.va_size = length;
3287                 error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3288         }
3289         VOP_UNLOCK(vp, 0);
3290         vn_finished_write(mp);
3291         vn_rangelock_unlock(vp, rl_cookie);
3292         vrele(vp);
3293         return (error);
3294 }
3295
#if defined(COMPAT_43)
/*
 * Old (COMPAT_43) truncate: the length argument is a long.  The arguments
 * are repacked into a truncate_args and handed to sys_truncate().
 */
#ifndef _SYS_SYSPROTO_H_
struct otruncate_args {
        char    *path;
        long    length;
};
#endif
int
otruncate(struct thread *td, struct otruncate_args *uap)
{
        struct truncate_args /* {
                char *path;
                int pad;
                off_t length;
        } */ nuap;
        int error;

        /* The pad member is not consumed by sys_truncate() and is left unset. */
        nuap.path = uap->path;
        nuap.length = uap->length;
        error = sys_truncate(td, &nuap);
        return (error);
}
#endif /* COMPAT_43 */
3325
3326 #if defined(COMPAT_FREEBSD6)
3327 /* Versions with the pad argument */
3328 int
3329 freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
3330 {
3331         struct truncate_args ouap;
3332
3333         ouap.path = uap->path;
3334         ouap.length = uap->length;
3335         return (sys_truncate(td, &ouap));
3336 }
3337
3338 int
3339 freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
3340 {
3341         struct ftruncate_args ouap;
3342
3343         ouap.fd = uap->fd;
3344         ouap.length = uap->length;
3345         return (sys_ftruncate(td, &ouap));
3346 }
3347 #endif
3348
3349 int
3350 kern_fsync(struct thread *td, int fd, bool fullsync)
3351 {
3352         struct vnode *vp;
3353         struct mount *mp;
3354         struct file *fp;
3355         cap_rights_t rights;
3356         int error, lock_flags;
3357
3358         AUDIT_ARG_FD(fd);
3359         error = getvnode(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp);
3360         if (error != 0)
3361                 return (error);
3362         vp = fp->f_vnode;
3363 #if 0
3364         if (!fullsync)
3365                 /* XXXKIB: compete outstanding aio writes */;
3366 #endif
3367         error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
3368         if (error != 0)
3369                 goto drop;
3370         if (MNT_SHARED_WRITES(mp) ||
3371             ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
3372                 lock_flags = LK_SHARED;
3373         } else {
3374                 lock_flags = LK_EXCLUSIVE;
3375         }
3376         vn_lock(vp, lock_flags | LK_RETRY);
3377         AUDIT_ARG_VNODE1(vp);
3378         if (vp->v_object != NULL) {
3379                 VM_OBJECT_WLOCK(vp->v_object);
3380                 vm_object_page_clean(vp->v_object, 0, 0, 0);
3381                 VM_OBJECT_WUNLOCK(vp->v_object);
3382         }
3383         error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td);
3384         VOP_UNLOCK(vp, 0);
3385         vn_finished_write(mp);
3386 drop:
3387         fdrop(fp, td);
3388         return (error);
3389 }
3390
3391 /*
3392  * Sync an open file.
3393  */
3394 #ifndef _SYS_SYSPROTO_H_
3395 struct fsync_args {
3396         int     fd;
3397 };
3398 #endif
3399 int
3400 sys_fsync(struct thread *td, struct fsync_args *uap)
3401 {
3402
3403         return (kern_fsync(td, uap->fd, true));
3404 }
3405
3406 int
3407 sys_fdatasync(struct thread *td, struct fdatasync_args *uap)
3408 {
3409
3410         return (kern_fsync(td, uap->fd, false));
3411 }
3412
3413 /*
3414  * Rename files.  Source and destination must either both be directories, or
3415  * both not be directories.  If target is a directory, it must be empty.
3416  */
3417 #ifndef _SYS_SYSPROTO_H_
3418 struct rename_args {
3419         char    *from;
3420         char    *to;
3421 };
3422 #endif
3423 int
3424 sys_rename(td, uap)
3425         struct thread *td;
3426         register struct rename_args /* {
3427                 char *from;
3428                 char *to;
3429         } */ *uap;
3430 {
3431
3432         return (kern_renameat(td, AT_FDCWD, uap->from, AT_FDCWD,
3433             uap->to, UIO_USERSPACE));
3434 }
3435
3436 #ifndef _SYS_SYSPROTO_H_
3437 struct renameat_args {
3438         int     oldfd;
3439         char    *old;
3440         int     newfd;
3441         char    *new;
3442 };
3443 #endif
3444 int
3445 sys_renameat(struct thread *td, struct renameat_args *uap)
3446 {
3447
3448         return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3449             UIO_USERSPACE));
3450 }
3451
3452 int
3453 kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
3454     enum uio_seg pathseg)
3455 {
3456         struct mount *mp = NULL;
3457         struct vnode *tvp, *fvp, *tdvp;
3458         struct nameidata fromnd, tond;
3459         cap_rights_t rights;
3460         int error;
3461
3462 again:
3463         bwillwrite();
3464 #ifdef MAC
3465         NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
3466             AUDITVNODE1, pathseg, old, oldfd,
3467             cap_rights_init(&rights, CAP_RENAMEAT_SOURCE), td);
3468 #else
3469         NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
3470             pathseg, old, oldfd,
3471             cap_rights_init(&rights, CAP_RENAMEAT_SOURCE), td);
3472 #endif
3473
3474         if ((error = namei(&fromnd)) != 0)
3475                 return (error);
3476 #ifdef MAC
3477         error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
3478             fromnd.ni_vp, &fromnd.ni_cnd);
3479         VOP_UNLOCK(fromnd.ni_dvp, 0);
3480         if (fromnd.ni_dvp != fromnd.ni_vp)
3481                 VOP_UNLOCK(fromnd.ni_vp, 0);
3482 #endif
3483         fvp = fromnd.ni_vp;
3484         NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
3485             SAVESTART | AUDITVNODE2, pathseg, new, newfd,
3486             cap_rights_init(&rights, CAP_RENAMEAT_TARGET), td);
3487         if (fromnd.ni_vp->v_type == VDIR)
3488                 tond.ni_cnd.cn_flags |= WILLBEDIR;
3489         if ((error = namei(&tond)) != 0) {
3490                 /* Translate error code for rename("dir1", "dir2/."). */
3491                 if (error == EISDIR && fvp->v_type == VDIR)
3492                         error = EINVAL;
3493                 NDFREE(&fromnd, NDF_ONLY_PNBUF);
3494                 vrele(fromnd.ni_dvp);
3495                 vrele(fvp);
3496                 goto out1;
3497         }
3498         tdvp = tond.ni_dvp;
3499         tvp = tond.ni_vp;
3500         error = vn_start_write(fvp, &mp, V_NOWAIT);
3501         if (error != 0) {
3502                 NDFREE(&fromnd, NDF_ONLY_PNBUF);
3503                 NDFREE(&tond, NDF_ONLY_PNBUF);
3504                 if (tvp != NULL)
3505                         vput(tvp);
3506                 if (tdvp == tvp)
3507                         vrele(tdvp);
3508                 else
3509                         vput(tdvp);
3510                 vrele(fromnd.ni_dvp);
3511                 vrele(fvp);
3512                 vrele(tond.ni_startdir);
3513                 if (fromnd.ni_startdir != NULL)
3514                         vrele(fromnd.ni_startdir);
3515                 error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
3516                 if (error != 0)
3517                         return (error);
3518                 goto again;
3519         }
3520         if (tvp != NULL) {
3521                 if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
3522                         error = ENOTDIR;
3523                         goto out;
3524                 } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
3525                         error = EISDIR;
3526                         goto out;
3527                 }
3528 #ifdef CAPABILITIES
3529                 if (newfd != AT_FDCWD) {
3530                         /*
3531                          * If the target already exists we require CAP_UNLINKAT
3532                          * from 'newfd'.
3533                          */
3534                         error = cap_check(&tond.ni_filecaps.fc_rights,
3535                             cap_rights_init(&rights, CAP_UNLINKAT));
3536                         if (error != 0)
3537                                 goto out;
3538                 }
3539 #endif
3540         }
3541         if (fvp == tdvp) {
3542                 error = EINVAL;
3543                 goto out;
3544         }
3545         /*
3546          * If the source is the same as the destination (that is, if they
3547          * are links to the same vnode), then there is nothing to do.
3548          */
3549         if (fvp == tvp)
3550                 error = -1;
3551 #ifdef MAC
3552         else
3553                 error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
3554                     tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
3555 #endif
3556 out:
3557         if (error == 0) {
3558                 error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
3559                     tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
3560                 NDFREE(&fromnd, NDF_ONLY_PNBUF);
3561                 NDFREE(&tond, NDF_ONLY_PNBUF);
3562         } else {
3563                 NDFREE(&fromnd, NDF_ONLY_PNBUF);
3564                 NDFREE(&tond, NDF_ONLY_PNBUF);
3565                 if (tvp != NULL)
3566                         vput(tvp);
3567                 if (tdvp == tvp)
3568                         vrele(tdvp);
3569                 else
3570                         vput(tdvp);
3571                 vrele(fromnd.ni_dvp);
3572                 vrele(fvp);
3573         }
3574         vrele(tond.ni_startdir);
3575         vn_finished_write(mp);
3576 out1:
3577         if (fromnd.ni_startdir)
3578                 vrele(fromnd.ni_startdir);
3579         if (error == -1)
3580                 return (0);
3581         return (error);
3582 }
3583
3584 /*
3585  * Make a directory file.
3586  */
3587 #ifndef _SYS_SYSPROTO_H_
3588 struct mkdir_args {
3589         char    *path;
3590         int     mode;
3591 };
3592 #endif
3593 int
3594 sys_mkdir(td, uap)
3595         struct thread *td;
3596         register struct mkdir_args /* {
3597                 char *path;
3598                 int mode;
3599         } */ *uap;
3600 {
3601
3602         return (kern_mkdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
3603             uap->mode));
3604 }
3605
3606 #ifndef _SYS_SYSPROTO_H_
3607 struct mkdirat_args {
3608         int     fd;
3609         char    *path;
3610         mode_t  mode;
3611 };
3612 #endif
3613 int
3614 sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3615 {
3616
3617         return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3618 }
3619
3620 int
3621 kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
3622     int mode)
3623 {
3624         struct mount *mp;
3625         struct vnode *vp;
3626         struct vattr vattr;
3627         struct nameidata nd;
3628         cap_rights_t rights;
3629         int error;
3630
3631         AUDIT_ARG_MODE(mode);
3632 restart:
3633         bwillwrite();
3634         NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
3635             NOCACHE, segflg, path, fd, cap_rights_init(&rights, CAP_MKDIRAT),
3636             td);
3637         nd.ni_cnd.cn_flags |= WILLBEDIR;
3638         if ((error = namei(&nd)) != 0)
3639                 return (error);
3640         vp = nd.ni_vp;
3641         if (vp != NULL) {
3642                 NDFREE(&nd, NDF_ONLY_PNBUF);
3643                 /*
3644                  * XXX namei called with LOCKPARENT but not LOCKLEAF has
3645                  * the strange behaviour of leaving the vnode unlocked
3646                  * if the target is the same vnode as the parent.
3647                  */
3648                 if (vp == nd.ni_dvp)
3649                         vrele(nd.ni_dvp);
3650                 else
3651                         vput(nd.ni_dvp);
3652                 vrele(vp);
3653                 return (EEXIST);
3654         }
3655         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3656                 NDFREE(&nd, NDF_ONLY_PNBUF);
3657                 vput(nd.ni_dvp);
3658                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3659                         return (error);
3660                 goto restart;
3661         }
3662         VATTR_NULL(&vattr);
3663         vattr.va_type = VDIR;
3664         vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
3665 #ifdef MAC
3666         error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
3667             &vattr);
3668         if (error != 0)
3669                 goto out;
3670 #endif
3671         error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3672 #ifdef MAC
3673 out:
3674 #endif
3675         NDFREE(&nd, NDF_ONLY_PNBUF);
3676         vput(nd.ni_dvp);
3677         if (error == 0)
3678                 vput(nd.ni_vp);
3679         vn_finished_write(mp);
3680         return (error);
3681 }
3682
3683 /*
3684  * Remove a directory file.
3685  */
3686 #ifndef _SYS_SYSPROTO_H_
3687 struct rmdir_args {
3688         char    *path;
3689 };
3690 #endif
3691 int
3692 sys_rmdir(td, uap)
3693         struct thread *td;
3694         struct rmdir_args /* {
3695                 char *path;
3696         } */ *uap;
3697 {
3698
3699         return (kern_rmdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE));
3700 }
3701
3702 int
3703 kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
3704 {
3705         struct mount *mp;
3706         struct vnode *vp;
3707         struct nameidata nd;
3708         cap_rights_t rights;
3709         int error;
3710
3711 restart:
3712         bwillwrite();
3713         NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
3714             pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
3715         if ((error = namei(&nd)) != 0)
3716                 return (error);
3717         vp = nd.ni_vp;
3718         if (vp->v_type != VDIR) {
3719                 error = ENOTDIR;
3720                 goto out;
3721         }
3722         /*
3723          * No rmdir "." please.
3724          */
3725         if (nd.ni_dvp == vp) {
3726                 error = EINVAL;
3727                 goto out;
3728         }
3729         /*
3730          * The root of a mounted filesystem cannot be deleted.
3731          */
3732         if (vp->v_vflag & VV_ROOT) {
3733                 error = EBUSY;
3734                 goto out;
3735         }
3736 #ifdef MAC
3737         error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
3738             &nd.ni_cnd);
3739         if (error != 0)
3740                 goto out;
3741 #endif
3742         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3743                 NDFREE(&nd, NDF_ONLY_PNBUF);
3744                 vput(vp);
3745                 if (nd.ni_dvp == vp)
3746                         vrele(nd.ni_dvp);
3747                 else
3748                         vput(nd.ni_dvp);
3749                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3750                         return (error);
3751                 goto restart;
3752         }
3753         vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
3754         error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3755         vn_finished_write(mp);
3756 out:
3757         NDFREE(&nd, NDF_ONLY_PNBUF);
3758         vput(vp);
3759         if (nd.ni_dvp == vp)
3760                 vrele(nd.ni_dvp);
3761         else
3762                 vput(nd.ni_dvp);
3763         return (error);
3764 }
3765
3766 #ifdef COMPAT_43
3767 /*
3768  * Read a block of directory entries in a filesystem independent format.
3769  */
3770 #ifndef _SYS_SYSPROTO_H_
3771 struct ogetdirentries_args {
3772         int     fd;
3773         char    *buf;
3774         u_int   count;
3775         long    *basep;
3776 };
3777 #endif
3778 int
3779 ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
3780 {
3781         long loff;
3782         int error;
3783
3784         error = kern_ogetdirentries(td, uap, &loff);
3785         if (error == 0)
3786                 error = copyout(&loff, uap->basep, sizeof(long));
3787         return (error);
3788 }
3789
3790 int
3791 kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
3792     long *ploff)
3793 {
3794         struct vnode *vp;
3795         struct file *fp;
3796         struct uio auio, kuio;
3797         struct iovec aiov, kiov;
3798         struct dirent *dp, *edp;
3799         cap_rights_t rights;
3800         caddr_t dirbuf;
3801         int error, eofflag, readcnt;
3802         long loff;
3803         off_t foffset;
3804
3805         /* XXX arbitrary sanity limit on `count'. */
3806         if (uap->count > 64 * 1024)
3807                 return (EINVAL);
3808         error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_READ), &fp);
3809         if (error != 0)
3810                 return (error);
3811         if ((fp->f_flag & FREAD) == 0) {
3812                 fdrop(fp, td);
3813                 return (EBADF);
3814         }
3815         vp = fp->f_vnode;
3816         foffset = foffset_lock(fp, 0);
3817 unionread:
3818         if (vp->v_type != VDIR) {
3819                 foffset_unlock(fp, foffset, 0);
3820                 fdrop(fp, td);
3821                 return (EINVAL);
3822         }
3823         aiov.iov_base = uap->buf;
3824         aiov.iov_len = uap->count;
3825         auio.uio_iov = &aiov;
3826         auio.uio_iovcnt = 1;
3827         auio.uio_rw = UIO_READ;
3828         auio.uio_segflg = UIO_USERSPACE;
3829         auio.uio_td = td;
3830         auio.uio_resid = uap->count;
3831         vn_lock(vp, LK_SHARED | LK_RETRY);
3832         loff = auio.uio_offset = foffset;
3833 #ifdef MAC
3834         error = mac_vnode_check_readdir(td->td_ucred, vp);
3835         if (error != 0) {
3836                 VOP_UNLOCK(vp, 0);
3837                 foffset_unlock(fp, foffset, FOF_NOUPDATE);
3838                 fdrop(fp, td);
3839                 return (error);
3840         }
3841 #endif
3842 #       if (BYTE_ORDER != LITTLE_ENDIAN)
3843                 if (vp->v_mount->mnt_maxsymlinklen <= 0) {
3844                         error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
3845                             NULL, NULL);
3846                         foffset = auio.uio_offset;
3847                 } else
3848 #       endif
3849         {
3850                 kuio = auio;
3851                 kuio.uio_iov = &kiov;
3852                 kuio.uio_segflg = UIO_SYSSPACE;
3853                 kiov.iov_len = uap->count;
3854                 dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
3855                 kiov.iov_base = dirbuf;
3856                 error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
3857                             NULL, NULL);
3858                 foffset = kuio.uio_offset;
3859                 if (error == 0) {
3860                         readcnt = uap->count - kuio.uio_resid;
3861                         edp = (struct dirent *)&dirbuf[readcnt];
3862                         for (dp = (struct dirent *)dirbuf; dp < edp; ) {
3863 #                               if (BYTE_ORDER == LITTLE_ENDIAN)
3864                                         /*
3865                                          * The expected low byte of
3866                                          * dp->d_namlen is our dp->d_type.
3867                                          * The high MBZ byte of dp->d_namlen
3868                                          * is our dp->d_namlen.
3869                                          */
3870                                         dp->d_type = dp->d_namlen;
3871                                         dp->d_namlen = 0;
3872 #                               else
3873                                         /*
3874                                          * The dp->d_type is the high byte
3875                                          * of the expected dp->d_namlen,
3876                                          * so must be zero'ed.
3877                                          */
3878                                         dp->d_type = 0;
3879 #                               endif
3880                                 if (dp->d_reclen > 0) {
3881                                         dp = (struct dirent *)
3882                                             ((char *)dp + dp->d_reclen);
3883                                 } else {
3884                                         error = EIO;
3885                                         break;
3886                                 }
3887                         }
3888                         if (dp >= edp)
3889                                 error = uiomove(dirbuf, readcnt, &auio);
3890                 }
3891                 free(dirbuf, M_TEMP);
3892         }
3893         if (error != 0) {
3894                 VOP_UNLOCK(vp, 0);
3895                 foffset_unlock(fp, foffset, 0);
3896                 fdrop(fp, td);
3897                 return (error);
3898         }
3899         if (uap->count == auio.uio_resid &&
3900             (vp->v_vflag & VV_ROOT) &&
3901             (vp->v_mount->mnt_flag & MNT_UNION)) {
3902                 struct vnode *tvp = vp;
3903                 vp = vp->v_mount->mnt_vnodecovered;
3904                 VREF(vp);
3905                 fp->f_vnode = vp;
3906                 fp->f_data = vp;
3907                 foffset = 0;
3908                 vput(tvp);
3909                 goto unionread;
3910         }
3911         VOP_UNLOCK(vp, 0);
3912         foffset_unlock(fp, foffset, 0);
3913         fdrop(fp, td);
3914         td->td_retval[0] = uap->count - auio.uio_resid;
3915         if (error == 0)
3916                 *ploff = loff;
3917         return (error);
3918 }
3919 #endif /* COMPAT_43 */
3920
3921 /*
3922  * Read a block of directory entries in a filesystem independent format.
3923  */
3924 #ifndef _SYS_SYSPROTO_H_
3925 struct getdirentries_args {
3926         int     fd;
3927         char    *buf;
3928         u_int   count;
3929         long    *basep;
3930 };
3931 #endif
3932 int
3933 sys_getdirentries(td, uap)
3934         struct thread *td;
3935         register struct getdirentries_args /* {
3936                 int fd;
3937                 char *buf;
3938                 u_int count;
3939                 long *basep;
3940         } */ *uap;
3941 {
3942         long base;
3943         int error;
3944
3945         error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
3946             NULL, UIO_USERSPACE);
3947         if (error != 0)
3948                 return (error);
3949         if (uap->basep != NULL)
3950                 error = copyout(&base, uap->basep, sizeof(long));
3951         return (error);
3952 }
3953
3954 int
3955 kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
3956     long *basep, ssize_t *residp, enum uio_seg bufseg)
3957 {
3958         struct vnode *vp;
3959         struct file *fp;
3960         struct uio auio;
3961         struct iovec aiov;
3962         cap_rights_t rights;
3963         long loff;
3964         int error, eofflag;
3965         off_t foffset;
3966
3967         AUDIT_ARG_FD(fd);
3968         if (count > IOSIZE_MAX)
3969                 return (EINVAL);
3970         auio.uio_resid = count;
3971         error = getvnode(td, fd, cap_rights_init(&rights, CAP_READ), &fp);
3972         if (error != 0)
3973                 return (error);
3974         if ((fp->f_flag & FREAD) == 0) {
3975                 fdrop(fp, td);
3976                 return (EBADF);
3977         }
3978         vp = fp->f_vnode;
3979         foffset = foffset_lock(fp, 0);
3980 unionread:
3981         if (vp->v_type != VDIR) {
3982                 error = EINVAL;
3983                 goto fail;
3984         }
3985         aiov.iov_base = buf;
3986         aiov.iov_len = count;
3987         auio.uio_iov = &aiov;
3988         auio.uio_iovcnt = 1;
3989         auio.uio_rw = UIO_READ;
3990         auio.uio_segflg = bufseg;
3991         auio.uio_td = td;
3992         vn_lock(vp, LK_SHARED | LK_RETRY);
3993         AUDIT_ARG_VNODE1(vp);
3994         loff = auio.uio_offset = foffset;
3995 #ifdef MAC
3996         error = mac_vnode_check_readdir(td->td_ucred, vp);
3997         if (error == 0)
3998 #endif
3999                 error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
4000                     NULL);
4001         foffset = auio.uio_offset;
4002         if (error != 0) {
4003                 VOP_UNLOCK(vp, 0);
4004                 goto fail;
4005         }
4006         if (count == auio.uio_resid &&
4007             (vp->v_vflag & VV_ROOT) &&
4008             (vp->v_mount->mnt_flag & MNT_UNION)) {
4009                 struct vnode *tvp = vp;
4010
4011                 vp = vp->v_mount->mnt_vnodecovered;
4012                 VREF(vp);
4013                 fp->f_vnode = vp;
4014                 fp->f_data = vp;
4015                 foffset = 0;
4016                 vput(tvp);
4017                 goto unionread;
4018         }
4019         VOP_UNLOCK(vp, 0);
4020         *basep = loff;
4021         if (residp != NULL)
4022                 *residp = auio.uio_resid;
4023         td->td_retval[0] = count - auio.uio_resid;
4024 fail:
4025         foffset_unlock(fp, foffset, 0);
4026         fdrop(fp, td);
4027         return (error);
4028 }
4029
4030 #ifndef _SYS_SYSPROTO_H_
4031 struct getdents_args {
4032         int fd;
4033         char *buf;
4034         size_t count;
4035 };
4036 #endif
4037 int
4038 sys_getdents(td, uap)
4039         struct thread *td;
4040         register struct getdents_args /* {
4041                 int fd;
4042                 char *buf;
4043                 u_int count;
4044         } */ *uap;
4045 {
4046         struct getdirentries_args ap;
4047
4048         ap.fd = uap->fd;
4049         ap.buf = uap->buf;
4050         ap.count = uap->count;
4051         ap.basep = NULL;
4052         return (sys_getdirentries(td, &ap));
4053 }
4054
4055 /*
4056  * Set the mode mask for creation of filesystem nodes.
4057  */
4058 #ifndef _SYS_SYSPROTO_H_
4059 struct umask_args {
4060         int     newmask;
4061 };
4062 #endif
4063 int
4064 sys_umask(td, uap)
4065         struct thread *td;
4066         struct umask_args /* {
4067                 int newmask;
4068         } */ *uap;
4069 {
4070         struct filedesc *fdp;
4071
4072         fdp = td->td_proc->p_fd;
4073         FILEDESC_XLOCK(fdp);
4074         td->td_retval[0] = fdp->fd_cmask;
4075         fdp->fd_cmask = uap->newmask & ALLPERMS;
4076         FILEDESC_XUNLOCK(fdp);
4077         return (0);
4078 }
4079
4080 /*
4081  * Void all references to file by ripping underlying filesystem away from
4082  * vnode.
4083  */
4084 #ifndef _SYS_SYSPROTO_H_
4085 struct revoke_args {
4086         char    *path;
4087 };
4088 #endif
4089 int
4090 sys_revoke(td, uap)
4091         struct thread *td;
4092         register struct revoke_args /* {
4093                 char *path;
4094         } */ *uap;
4095 {
4096         struct vnode *vp;
4097         struct vattr vattr;
4098         struct nameidata nd;
4099         int error;
4100
4101         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4102             uap->path, td);
4103         if ((error = namei(&nd)) != 0)
4104                 return (error);
4105         vp = nd.ni_vp;
4106         NDFREE(&nd, NDF_ONLY_PNBUF);
4107         if (vp->v_type != VCHR || vp->v_rdev == NULL) {
4108                 error = EINVAL;
4109                 goto out;
4110         }
4111 #ifdef MAC
4112         error = mac_vnode_check_revoke(td->td_ucred, vp);
4113         if (error != 0)
4114                 goto out;
4115 #endif
4116         error = VOP_GETATTR(vp, &vattr, td->td_ucred);
4117         if (error != 0)
4118                 goto out;
4119         if (td->td_ucred->cr_uid != vattr.va_uid) {
4120                 error = priv_check(td, PRIV_VFS_ADMIN);
4121                 if (error != 0)
4122                         goto out;
4123         }
4124         if (vcount(vp) > 1)
4125                 VOP_REVOKE(vp, REVOKEALL);
4126 out:
4127         vput(vp);
4128         return (error);
4129 }
4130
4131 /*
4132  * Convert a user file descriptor to a kernel file entry and check that, if it
4133  * is a capability, the correct rights are present. A reference on the file
4134  * entry is held upon returning.
4135  */
4136 int
4137 getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
4138 {
4139         struct file *fp;
4140         int error;
4141
4142         error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp, NULL);
4143         if (error != 0)
4144                 return (error);
4145
4146         /*
4147          * The file could be not of the vnode type, or it may be not
4148          * yet fully initialized, in which case the f_vnode pointer
4149          * may be set, but f_ops is still badfileops.  E.g.,
4150          * devfs_open() transiently create such situation to
4151          * facilitate csw d_fdopen().
4152          *
4153          * Dupfdopen() handling in kern_openat() installs the
4154          * half-baked file into the process descriptor table, allowing
4155          * other thread to dereference it. Guard against the race by
4156          * checking f_ops.
4157          */
4158         if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
4159                 fdrop(fp, td);
4160                 return (EINVAL);
4161         }
4162         *fpp = fp;
4163         return (0);
4164 }
4165
4166
4167 /*
4168  * Get an (NFS) file handle.
4169  */
4170 #ifndef _SYS_SYSPROTO_H_
4171 struct lgetfh_args {
4172         char    *fname;
4173         fhandle_t *fhp;
4174 };
4175 #endif
4176 int
4177 sys_lgetfh(td, uap)
4178         struct thread *td;
4179         register struct lgetfh_args *uap;
4180 {
4181         struct nameidata nd;
4182         fhandle_t fh;
4183         register struct vnode *vp;
4184         int error;
4185
4186         error = priv_check(td, PRIV_VFS_GETFH);
4187         if (error != 0)
4188                 return (error);
4189         NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4190             uap->fname, td);
4191         error = namei(&nd);
4192         if (error != 0)
4193                 return (error);
4194         NDFREE(&nd, NDF_ONLY_PNBUF);
4195         vp = nd.ni_vp;
4196         bzero(&fh, sizeof(fh));
4197         fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4198         error = VOP_VPTOFH(vp, &fh.fh_fid);
4199         vput(vp);
4200         if (error == 0)
4201                 error = copyout(&fh, uap->fhp, sizeof (fh));
4202         return (error);
4203 }
4204
4205 #ifndef _SYS_SYSPROTO_H_
4206 struct getfh_args {
4207         char    *fname;
4208         fhandle_t *fhp;
4209 };
4210 #endif
4211 int
4212 sys_getfh(td, uap)
4213         struct thread *td;
4214         register struct getfh_args *uap;
4215 {
4216         struct nameidata nd;
4217         fhandle_t fh;
4218         register struct vnode *vp;
4219         int error;
4220
4221         error = priv_check(td, PRIV_VFS_GETFH);
4222         if (error != 0)
4223                 return (error);
4224         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4225             uap->fname, td);
4226         error = namei(&nd);
4227         if (error != 0)
4228                 return (error);
4229         NDFREE(&nd, NDF_ONLY_PNBUF);
4230         vp = nd.ni_vp;
4231         bzero(&fh, sizeof(fh));
4232         fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4233         error = VOP_VPTOFH(vp, &fh.fh_fid);
4234         vput(vp);
4235         if (error == 0)
4236                 error = copyout(&fh, uap->fhp, sizeof (fh));
4237         return (error);
4238 }
4239
4240 /*
4241  * syscall for the rpc.lockd to use to translate a NFS file handle into an
4242  * open descriptor.
4243  *
4244  * warning: do not remove the priv_check() call or this becomes one giant
4245  * security hole.
4246  */
4247 #ifndef _SYS_SYSPROTO_H_
4248 struct fhopen_args {
4249         const struct fhandle *u_fhp;
4250         int flags;
4251 };
4252 #endif
4253 int
4254 sys_fhopen(td, uap)
4255         struct thread *td;
4256         struct fhopen_args /* {
4257                 const struct fhandle *u_fhp;
4258                 int flags;
4259         } */ *uap;
4260 {
4261         struct mount *mp;
4262         struct vnode *vp;
4263         struct fhandle fhp;
4264         struct file *fp;
4265         int fmode, error;
4266         int indx;
4267
4268         error = priv_check(td, PRIV_VFS_FHOPEN);
4269         if (error != 0)
4270                 return (error);
4271         indx = -1;
4272         fmode = FFLAGS(uap->flags);
4273         /* why not allow a non-read/write open for our lockd? */
4274         if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4275                 return (EINVAL);
4276         error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4277         if (error != 0)
4278                 return(error);
4279         /* find the mount point */
4280         mp = vfs_busyfs(&fhp.fh_fsid);
4281         if (mp == NULL)
4282                 return (ESTALE);
4283         /* now give me my vnode, it gets returned to me locked */
4284         error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
4285         vfs_unbusy(mp);
4286         if (error != 0)
4287                 return (error);
4288
4289         error = falloc_noinstall(td, &fp);
4290         if (error != 0) {
4291                 vput(vp);
4292                 return (error);
4293         }
4294         /*
4295          * An extra reference on `fp' has been held for us by
4296          * falloc_noinstall().
4297          */
4298
4299 #ifdef INVARIANTS
4300         td->td_dupfd = -1;
4301 #endif
4302         error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
4303         if (error != 0) {
4304                 KASSERT(fp->f_ops == &badfileops,
4305                     ("VOP_OPEN in fhopen() set f_ops"));
4306                 KASSERT(td->td_dupfd < 0,
4307                     ("fhopen() encountered fdopen()"));
4308
4309                 vput(vp);
4310                 goto bad;
4311         }
4312 #ifdef INVARIANTS
4313         td->td_dupfd = 0;
4314 #endif
4315         fp->f_vnode = vp;
4316         fp->f_seqcount = 1;
4317         finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
4318             &vnops);
4319         VOP_UNLOCK(vp, 0);
4320         if ((fmode & O_TRUNC) != 0) {
4321                 error = fo_truncate(fp, 0, td->td_ucred, td);
4322                 if (error != 0)
4323                         goto bad;
4324         }
4325
4326         error = finstall(td, fp, &indx, fmode, NULL);
4327 bad:
4328         fdrop(fp, td);
4329         td->td_retval[0] = indx;
4330         return (error);
4331 }
4332
4333 /*
4334  * Stat an (NFS) file handle.
4335  */
4336 #ifndef _SYS_SYSPROTO_H_
4337 struct fhstat_args {
4338         struct fhandle *u_fhp;
4339         struct stat *sb;
4340 };
4341 #endif
4342 int
4343 sys_fhstat(td, uap)
4344         struct thread *td;
4345         register struct fhstat_args /* {
4346                 struct fhandle *u_fhp;
4347                 struct stat *sb;
4348         } */ *uap;
4349 {
4350         struct stat sb;
4351         struct fhandle fh;
4352         int error;
4353
4354         error = copyin(uap->u_fhp, &fh, sizeof(fh));
4355         if (error != 0)
4356                 return (error);
4357         error = kern_fhstat(td, fh, &sb);
4358         if (error == 0)
4359                 error = copyout(&sb, uap->sb, sizeof(sb));
4360         return (error);
4361 }
4362
4363 int
4364 kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
4365 {
4366         struct mount *mp;
4367         struct vnode *vp;
4368         int error;
4369
4370         error = priv_check(td, PRIV_VFS_FHSTAT);
4371         if (error != 0)
4372                 return (error);
4373         if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4374                 return (ESTALE);
4375         error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4376         vfs_unbusy(mp);
4377         if (error != 0)
4378                 return (error);
4379         error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
4380         vput(vp);
4381         return (error);
4382 }
4383
4384 /*
4385  * Implement fstatfs() for (NFS) file handles.
4386  */
4387 #ifndef _SYS_SYSPROTO_H_
4388 struct fhstatfs_args {
4389         struct fhandle *u_fhp;
4390         struct statfs *buf;
4391 };
4392 #endif
4393 int
4394 sys_fhstatfs(td, uap)
4395         struct thread *td;
4396         struct fhstatfs_args /* {
4397                 struct fhandle *u_fhp;
4398                 struct statfs *buf;
4399         } */ *uap;
4400 {
4401         struct statfs sf;
4402         fhandle_t fh;
4403         int error;
4404
4405         error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4406         if (error != 0)
4407                 return (error);
4408         error = kern_fhstatfs(td, fh, &sf);
4409         if (error != 0)
4410                 return (error);
4411         return (copyout(&sf, uap->buf, sizeof(sf)));
4412 }
4413
4414 int
4415 kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4416 {
4417         struct statfs *sp;
4418         struct mount *mp;
4419         struct vnode *vp;
4420         int error;
4421
4422         error = priv_check(td, PRIV_VFS_FHSTATFS);
4423         if (error != 0)
4424                 return (error);
4425         if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4426                 return (ESTALE);
4427         error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4428         if (error != 0) {
4429                 vfs_unbusy(mp);
4430                 return (error);
4431         }
4432         vput(vp);
4433         error = prison_canseemount(td->td_ucred, mp);
4434         if (error != 0)
4435                 goto out;
4436 #ifdef MAC
4437         error = mac_mount_check_stat(td->td_ucred, mp);
4438         if (error != 0)
4439                 goto out;
4440 #endif
4441         /*
4442          * Set these in case the underlying filesystem fails to do so.
4443          */
4444         sp = &mp->mnt_stat;
4445         sp->f_version = STATFS_VERSION;
4446         sp->f_namemax = NAME_MAX;
4447         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4448         error = VFS_STATFS(mp, sp);
4449         if (error == 0)
4450                 *buf = *sp;
4451 out:
4452         vfs_unbusy(mp);
4453         return (error);
4454 }
4455
4456 int
4457 kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
4458 {
4459         struct file *fp;
4460         struct mount *mp;
4461         struct vnode *vp;
4462         cap_rights_t rights;
4463         off_t olen, ooffset;
4464         int error;
4465
4466         if (offset < 0 || len <= 0)
4467                 return (EINVAL);
4468         /* Check for wrap. */
4469         if (offset > OFF_MAX - len)
4470                 return (EFBIG);
4471         error = fget(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
4472         if (error != 0)
4473                 return (error);
4474         if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4475                 error = ESPIPE;
4476                 goto out;
4477         }
4478         if ((fp->f_flag & FWRITE) == 0) {
4479                 error = EBADF;
4480                 goto out;
4481         }
4482         if (fp->f_type != DTYPE_VNODE) {
4483                 error = ENODEV;
4484                 goto out;
4485         }
4486         vp = fp->f_vnode;
4487         if (vp->v_type != VREG) {
4488                 error = ENODEV;
4489                 goto out;
4490         }
4491
4492         /* Allocating blocks may take a long time, so iterate. */
4493         for (;;) {
4494                 olen = len;
4495                 ooffset = offset;
4496
4497                 bwillwrite();
4498                 mp = NULL;
4499                 error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
4500                 if (error != 0)
4501                         break;
4502                 error = vn_lock(vp, LK_EXCLUSIVE);
4503                 if (error != 0) {
4504                         vn_finished_write(mp);
4505                         break;
4506                 }
4507 #ifdef MAC
4508                 error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
4509                 if (error == 0)
4510 #endif
4511                         error = VOP_ALLOCATE(vp, &offset, &len);
4512                 VOP_UNLOCK(vp, 0);
4513                 vn_finished_write(mp);
4514
4515                 if (olen + ooffset != offset + len) {
4516                         panic("offset + len changed from %jx/%jx to %jx/%jx",
4517                             ooffset, olen, offset, len);
4518                 }
4519                 if (error != 0 || len == 0)
4520                         break;
4521                 KASSERT(olen > len, ("Iteration did not make progress?"));
4522                 maybe_yield();
4523         }
4524  out:
4525         fdrop(fp, td);
4526         return (error);
4527 }
4528
4529 int
4530 sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
4531 {
4532         int error;
4533
4534         error = kern_posix_fallocate(td, uap->fd, uap->offset, uap->len);
4535         return (kern_posix_error(td, error));
4536 }
4537
4538 /*
4539  * Unlike madvise(2), we do not make a best effort to remember every
4540  * possible caching hint.  Instead, we remember the last setting with
4541  * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4542  * region of any current setting.
4543  */
4544 int
4545 kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4546     int advice)
4547 {
4548         struct fadvise_info *fa, *new;
4549         struct file *fp;
4550         struct vnode *vp;
4551         cap_rights_t rights;
4552         off_t end;
4553         int error;
4554
4555         if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4556                 return (EINVAL);
4557         switch (advice) {
4558         case POSIX_FADV_SEQUENTIAL:
4559         case POSIX_FADV_RANDOM:
4560         case POSIX_FADV_NOREUSE:
4561                 new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4562                 break;
4563         case POSIX_FADV_NORMAL:
4564         case POSIX_FADV_WILLNEED:
4565         case POSIX_FADV_DONTNEED:
4566                 new = NULL;
4567                 break;
4568         default:
4569                 return (EINVAL);
4570         }
4571         /* XXX: CAP_POSIX_FADVISE? */
4572         error = fget(td, fd, cap_rights_init(&rights), &fp);
4573         if (error != 0)
4574                 goto out;
4575         if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4576                 error = ESPIPE;
4577                 goto out;
4578         }
4579         if (fp->f_type != DTYPE_VNODE) {
4580                 error = ENODEV;
4581                 goto out;
4582         }
4583         vp = fp->f_vnode;
4584         if (vp->v_type != VREG) {
4585                 error = ENODEV;
4586                 goto out;
4587         }
4588         if (len == 0)
4589                 end = OFF_MAX;
4590         else
4591                 end = offset + len - 1;
4592         switch (advice) {
4593         case POSIX_FADV_SEQUENTIAL:
4594         case POSIX_FADV_RANDOM:
4595         case POSIX_FADV_NOREUSE:
4596                 /*
4597                  * Try to merge any existing non-standard region with
4598                  * this new region if possible, otherwise create a new
4599                  * non-standard region for this request.
4600                  */
4601                 mtx_pool_lock(mtxpool_sleep, fp);
4602                 fa = fp->f_advice;
4603                 if (fa != NULL && fa->fa_advice == advice &&
4604                     ((fa->fa_start <= end && fa->fa_end >= offset) ||
4605                     (end != OFF_MAX && fa->fa_start == end + 1) ||
4606                     (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4607                         if (offset < fa->fa_start)
4608                                 fa->fa_start = offset;
4609                         if (end > fa->fa_end)
4610                                 fa->fa_end = end;
4611                 } else {
4612                         new->fa_advice = advice;
4613                         new->fa_start = offset;
4614                         new->fa_end = end;
4615                         fp->f_advice = new;
4616                         new = fa;
4617                 }
4618                 mtx_pool_unlock(mtxpool_sleep, fp);
4619                 break;
4620         case POSIX_FADV_NORMAL:
4621                 /*
4622                  * If a the "normal" region overlaps with an existing
4623                  * non-standard region, trim or remove the
4624                  * non-standard region.
4625                  */
4626                 mtx_pool_lock(mtxpool_sleep, fp);
4627                 fa = fp->f_advice;
4628                 if (fa != NULL) {
4629                         if (offset <= fa->fa_start && end >= fa->fa_end) {
4630                                 new = fa;
4631                                 fp->f_advice = NULL;
4632                         } else if (offset <= fa->fa_start &&
4633                             end >= fa->fa_start)
4634                                 fa->fa_start = end + 1;
4635                         else if (offset <= fa->fa_end && end >= fa->fa_end)
4636                                 fa->fa_end = offset - 1;
4637                         else if (offset >= fa->fa_start && end <= fa->fa_end) {
4638                                 /*
4639                                  * If the "normal" region is a middle
4640                                  * portion of the existing
4641                                  * non-standard region, just remove
4642                                  * the whole thing rather than picking
4643                                  * one side or the other to
4644                                  * preserve.
4645                                  */
4646                                 new = fa;
4647                                 fp->f_advice = NULL;
4648                         }
4649                 }
4650                 mtx_pool_unlock(mtxpool_sleep, fp);
4651                 break;
4652         case POSIX_FADV_WILLNEED:
4653         case POSIX_FADV_DONTNEED:
4654                 error = VOP_ADVISE(vp, offset, end, advice);
4655                 break;
4656         }
4657 out:
4658         if (fp != NULL)
4659                 fdrop(fp, td);
4660         free(new, M_FADVISE);
4661         return (error);
4662 }
4663
4664 int
4665 sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
4666 {
4667         int error;
4668
4669         error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
4670             uap->advice);
4671         return (kern_posix_error(td, error));
4672 }