sys/kern/vfs_syscalls.c

   1 /*-
   2  * Copyright (c) 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 4. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_syscalls.c      8.13 (Berkeley) 4/15/94
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_capsicum.h"
  41 #include "opt_compat.h"
  42 #include "opt_kdtrace.h"
  43 #include "opt_ktrace.h"
  44
  45 #include <sys/param.h>
  46 #include <sys/systm.h>
  47 #include <sys/bio.h>
  48 #include <sys/buf.h>
  49 #include <sys/capsicum.h>
  50 #include <sys/disk.h>
  51 #include <sys/sysent.h>
  52 #include <sys/malloc.h>
  53 #include <sys/mount.h>
  54 #include <sys/mutex.h>
  55 #include <sys/sysproto.h>
  56 #include <sys/namei.h>
  57 #include <sys/filedesc.h>
  58 #include <sys/kernel.h>
  59 #include <sys/fcntl.h>
  60 #include <sys/file.h>
  61 #include <sys/filio.h>
  62 #include <sys/limits.h>
  63 #include <sys/linker.h>
  64 #include <sys/rwlock.h>
  65 #include <sys/sdt.h>
  66 #include <sys/stat.h>
  67 #include <sys/sx.h>
  68 #include <sys/unistd.h>
  69 #include <sys/vnode.h>
  70 #include <sys/priv.h>
  71 #include <sys/proc.h>
  72 #include <sys/dirent.h>
  73 #include <sys/jail.h>
  74 #include <sys/syscallsubr.h>
  75 #include <sys/sysctl.h>
  76 #ifdef KTRACE
  77 #include <sys/ktrace.h>
  78 #endif
  79
  80 #include <machine/stdarg.h>
  81
  82 #include <security/audit/audit.h>
  83 #include <security/mac/mac_framework.h>
  84
  85 #include <vm/vm.h>
  86 #include <vm/vm_object.h>
  87 #include <vm/vm_page.h>
  88 #include <vm/uma.h>
  89
  90 #include <ufs/ufs/quota.h>
  91
  92 MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
  93
  94 SDT_PROVIDER_DEFINE(vfs);
  95 SDT_PROBE_DEFINE2(vfs, , stat, mode, "char *", "int");
  96 SDT_PROBE_DEFINE2(vfs, , stat, reg, "char *", "int");
  97
  98 static int chroot_refuse_vdir_fds(struct filedesc *fdp);
  99 static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
 100 static int kern_chflags(struct thread *td, const char *path,
 101     enum uio_seg pathseg, u_long flags);
 102 static int kern_chflagsat(struct thread *td, int fd, const char *path,
 103     enum uio_seg pathseg, u_long flags, int atflag);
 104 static int setfflags(struct thread *td, struct vnode *, u_long);
 105 static int setutimes(struct thread *td, struct vnode *,
 106     const struct timespec *, int, int);
 107 static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
 108     struct thread *td);
 109
 110 /*
 111  * The module initialization routine for POSIX asynchronous I/O will
 112  * set this to the version of AIO that it implements.  (Zero means
 113  * that it is not implemented.)  This value is used here by pathconf()
 114  * and in kern_descrip.c by fpathconf().
 115  */
 116 int async_io_version;
 117
 118 /*
 119  * Sync each mounted filesystem.
 120  */
 121 #ifndef _SYS_SYSPROTO_H_
 122 struct sync_args {
 123         int     dummy;
 124 };
 125 #endif
 126 /* ARGSUSED */
 127 int
 128 sys_sync(td, uap)
 129         struct thread *td;
 130         struct sync_args *uap;
 131 {
 132         struct mount *mp, *nmp;
 133         int save;
 134
 135         mtx_lock(&mountlist_mtx);
 136         for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 137                 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 138                         nmp = TAILQ_NEXT(mp, mnt_list);
 139                         continue;
 140                 }
 141                 if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
 142                     vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
 143                         save = curthread_pflags_set(TDP_SYNCIO);
 144                         vfs_msync(mp, MNT_NOWAIT);
 145                         VFS_SYNC(mp, MNT_NOWAIT);
 146                         curthread_pflags_restore(save);
 147                         vn_finished_write(mp);
 148                 }
 149                 mtx_lock(&mountlist_mtx);
 150                 nmp = TAILQ_NEXT(mp, mnt_list);
 151                 vfs_unbusy(mp);
 152         }
 153         mtx_unlock(&mountlist_mtx);
 154         return (0);
 155 }
 156
 157 /*
 158  * Change filesystem quotas.
 159  */
 160 #ifndef _SYS_SYSPROTO_H_
 161 struct quotactl_args {
 162         char *path;
 163         int cmd;
 164         int uid;
 165         caddr_t arg;
 166 };
 167 #endif
 168 int
 169 sys_quotactl(td, uap)
 170         struct thread *td;
 171         register struct quotactl_args /* {
 172                 char *path;
 173                 int cmd;
 174                 int uid;
 175                 caddr_t arg;
 176         } */ *uap;
 177 {
 178         struct mount *mp;
 179         struct nameidata nd;
 180         int error;
 181
 182         AUDIT_ARG_CMD(uap->cmd);
 183         AUDIT_ARG_UID(uap->uid);
 184         if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
 185                 return (EPERM);
 186         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
 187             uap->path, td);
 188         if ((error = namei(&nd)) != 0)
 189                 return (error);
 190         NDFREE(&nd, NDF_ONLY_PNBUF);
 191         mp = nd.ni_vp->v_mount;
 192         vfs_ref(mp);
 193         vput(nd.ni_vp);
 194         error = vfs_busy(mp, 0);
 195         vfs_rel(mp);
 196         if (error != 0)
 197                 return (error);
 198         error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
 199
 200         /*
 201          * Since quota on operation typically needs to open quota
 202          * file, the Q_QUOTAON handler needs to unbusy the mount point
 203          * before calling into namei.  Otherwise, unmount might be
 204          * started between two vfs_busy() invocations (first is our,
 205          * second is from mount point cross-walk code in lookup()),
 206          * causing deadlock.
 207          *
 208          * Require that Q_QUOTAON handles the vfs_busy() reference on
 209          * its own, always returning with ubusied mount point.
 210          */
 211         if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
 212                 vfs_unbusy(mp);
 213         return (error);
 214 }
 215
 216 /*
 217  * Used by statfs conversion routines to scale the block size up if
 218  * necessary so that all of the block counts are <= 'max_size'.  Note
 219  * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
 220  * value of 'n'.
 221  */
 222 void
 223 statfs_scale_blocks(struct statfs *sf, long max_size)
 224 {
 225         uint64_t count;
 226         int shift;
 227
 228         KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
 229
 230         /*
 231          * Attempt to scale the block counts to give a more accurate
 232          * overview to userland of the ratio of free space to used
 233          * space.  To do this, find the largest block count and compute
 234          * a divisor that lets it fit into a signed integer <= max_size.
 235          */
 236         if (sf->f_bavail < 0)
 237                 count = -sf->f_bavail;
 238         else
 239                 count = sf->f_bavail;
 240         count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
 241         if (count <= max_size)
 242                 return;
 243
 244         count >>= flsl(max_size);
 245         shift = 0;
 246         while (count > 0) {
 247                 shift++;
 248                 count >>=1;
 249         }
 250
 251         sf->f_bsize <<= shift;
 252         sf->f_blocks >>= shift;
 253         sf->f_bfree >>= shift;
 254         sf->f_bavail >>= shift;
 255 }
 256
 257 /*
 258  * Get filesystem statistics.
 259  */
 260 #ifndef _SYS_SYSPROTO_H_
 261 struct statfs_args {
 262         char *path;
 263         struct statfs *buf;
 264 };
 265 #endif
 266 int
 267 sys_statfs(td, uap)
 268         struct thread *td;
 269         register struct statfs_args /* {
 270                 char *path;
 271                 struct statfs *buf;
 272         } */ *uap;
 273 {
 274         struct statfs sf;
 275         int error;
 276
 277         error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
 278         if (error == 0)
 279                 error = copyout(&sf, uap->buf, sizeof(sf));
 280         return (error);
 281 }
 282
 283 int
 284 kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
 285     struct statfs *buf)
 286 {
 287         struct mount *mp;
 288         struct statfs *sp, sb;
 289         struct nameidata nd;
 290         int error;
 291
 292         NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 293             pathseg, path, td);
 294         error = namei(&nd);
 295         if (error != 0)
 296                 return (error);
 297         mp = nd.ni_vp->v_mount;
 298         vfs_ref(mp);
 299         NDFREE(&nd, NDF_ONLY_PNBUF);
 300         vput(nd.ni_vp);
 301         error = vfs_busy(mp, 0);
 302         vfs_rel(mp);
 303         if (error != 0)
 304                 return (error);
 305 #ifdef MAC
 306         error = mac_mount_check_stat(td->td_ucred, mp);
 307         if (error != 0)
 308                 goto out;
 309 #endif
 310         /*
 311          * Set these in case the underlying filesystem fails to do so.
 312          */
 313         sp = &mp->mnt_stat;
 314         sp->f_version = STATFS_VERSION;
 315         sp->f_namemax = NAME_MAX;
 316         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 317         error = VFS_STATFS(mp, sp);
 318         if (error != 0)
 319                 goto out;
 320         if (priv_check(td, PRIV_VFS_GENERATION)) {
 321                 bcopy(sp, &sb, sizeof(sb));
 322                 sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
 323                 prison_enforce_statfs(td->td_ucred, mp, &sb);
 324                 sp = &sb;
 325         }
 326         *buf = *sp;
 327 out:
 328         vfs_unbusy(mp);
 329         return (error);
 330 }
 331
 332 /*
 333  * Get filesystem statistics.
 334  */
 335 #ifndef _SYS_SYSPROTO_H_
 336 struct fstatfs_args {
 337         int fd;
 338         struct statfs *buf;
 339 };
 340 #endif
 341 int
 342 sys_fstatfs(td, uap)
 343         struct thread *td;
 344         register struct fstatfs_args /* {
 345                 int fd;
 346                 struct statfs *buf;
 347         } */ *uap;
 348 {
 349         struct statfs sf;
 350         int error;
 351
 352         error = kern_fstatfs(td, uap->fd, &sf);
 353         if (error == 0)
 354                 error = copyout(&sf, uap->buf, sizeof(sf));
 355         return (error);
 356 }
 357
 358 int
 359 kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
 360 {
 361         struct file *fp;
 362         struct mount *mp;
 363         struct statfs *sp, sb;
 364         struct vnode *vp;
 365         cap_rights_t rights;
 366         int error;
 367
 368         AUDIT_ARG_FD(fd);
 369         error = getvnode(td->td_proc->p_fd, fd,
 370             cap_rights_init(&rights, CAP_FSTATFS), &fp);
 371         if (error != 0)
 372                 return (error);
 373         vp = fp->f_vnode;
 374         vn_lock(vp, LK_SHARED | LK_RETRY);
 375 #ifdef AUDIT
 376         AUDIT_ARG_VNODE1(vp);
 377 #endif
 378         mp = vp->v_mount;
 379         if (mp)
 380                 vfs_ref(mp);
 381         VOP_UNLOCK(vp, 0);
 382         fdrop(fp, td);
 383         if (mp == NULL) {
 384                 error = EBADF;
 385                 goto out;
 386         }
 387         error = vfs_busy(mp, 0);
 388         vfs_rel(mp);
 389         if (error != 0)
 390                 return (error);
 391 #ifdef MAC
 392         error = mac_mount_check_stat(td->td_ucred, mp);
 393         if (error != 0)
 394                 goto out;
 395 #endif
 396         /*
 397          * Set these in case the underlying filesystem fails to do so.
 398          */
 399         sp = &mp->mnt_stat;
 400         sp->f_version = STATFS_VERSION;
 401         sp->f_namemax = NAME_MAX;
 402         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 403         error = VFS_STATFS(mp, sp);
 404         if (error != 0)
 405                 goto out;
 406         if (priv_check(td, PRIV_VFS_GENERATION)) {
 407                 bcopy(sp, &sb, sizeof(sb));
 408                 sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
 409                 prison_enforce_statfs(td->td_ucred, mp, &sb);
 410                 sp = &sb;
 411         }
 412         *buf = *sp;
 413 out:
 414         if (mp)
 415                 vfs_unbusy(mp);
 416         return (error);
 417 }
 418
 419 /*
 420  * Get statistics on all filesystems.
 421  */
 422 #ifndef _SYS_SYSPROTO_H_
 423 struct getfsstat_args {
 424         struct statfs *buf;
 425         long bufsize;
 426         int flags;
 427 };
 428 #endif
 429 int
 430 sys_getfsstat(td, uap)
 431         struct thread *td;
 432         register struct getfsstat_args /* {
 433                 struct statfs *buf;
 434                 long bufsize;
 435                 int flags;
 436         } */ *uap;
 437 {
 438
 439         return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
 440             uap->flags));
 441 }
 442
 443 /*
 444  * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
 445  *      The caller is responsible for freeing memory which will be allocated
 446  *      in '*buf'.
 447  */
 448 int
 449 kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
 450     enum uio_seg bufseg, int flags)
 451 {
 452         struct mount *mp, *nmp;
 453         struct statfs *sfsp, *sp, sb;
 454         size_t count, maxcount;
 455         int error;
 456
 457         maxcount = bufsize / sizeof(struct statfs);
 458         if (bufsize == 0)
 459                 sfsp = NULL;
 460         else if (bufseg == UIO_USERSPACE)
 461                 sfsp = *buf;
 462         else /* if (bufseg == UIO_SYSSPACE) */ {
 463                 count = 0;
 464                 mtx_lock(&mountlist_mtx);
 465                 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 466                         count++;
 467                 }
 468                 mtx_unlock(&mountlist_mtx);
 469                 if (maxcount > count)
 470                         maxcount = count;
 471                 sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
 472                     M_WAITOK);
 473         }
 474         count = 0;
 475         mtx_lock(&mountlist_mtx);
 476         for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 477                 if (prison_canseemount(td->td_ucred, mp) != 0) {
 478                         nmp = TAILQ_NEXT(mp, mnt_list);
 479                         continue;
 480                 }
 481 #ifdef MAC
 482                 if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
 483                         nmp = TAILQ_NEXT(mp, mnt_list);
 484                         continue;
 485                 }
 486 #endif
 487                 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 488                         nmp = TAILQ_NEXT(mp, mnt_list);
 489                         continue;
 490                 }
 491                 if (sfsp && count < maxcount) {
 492                         sp = &mp->mnt_stat;
 493                         /*
 494                          * Set these in case the underlying filesystem
 495                          * fails to do so.
 496                          */
 497                         sp->f_version = STATFS_VERSION;
 498                         sp->f_namemax = NAME_MAX;
 499                         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 500                         /*
 501                          * If MNT_NOWAIT or MNT_LAZY is specified, do not
 502                          * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
 503                          * overrides MNT_WAIT.
 504                          */
 505                         if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
 506                             (flags & MNT_WAIT)) &&
 507                             (error = VFS_STATFS(mp, sp))) {
 508                                 mtx_lock(&mountlist_mtx);
 509                                 nmp = TAILQ_NEXT(mp, mnt_list);
 510                                 vfs_unbusy(mp);
 511                                 continue;
 512                         }
 513                         if (priv_check(td, PRIV_VFS_GENERATION)) {
 514                                 bcopy(sp, &sb, sizeof(sb));
 515                                 sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
 516                                 prison_enforce_statfs(td->td_ucred, mp, &sb);
 517                                 sp = &sb;
 518                         }
 519                         if (bufseg == UIO_SYSSPACE)
 520                                 bcopy(sp, sfsp, sizeof(*sp));
 521                         else /* if (bufseg == UIO_USERSPACE) */ {
 522                                 error = copyout(sp, sfsp, sizeof(*sp));
 523                                 if (error != 0) {
 524                                         vfs_unbusy(mp);
 525                                         return (error);
 526                                 }
 527                         }
 528                         sfsp++;
 529                 }
 530                 count++;
 531                 mtx_lock(&mountlist_mtx);
 532                 nmp = TAILQ_NEXT(mp, mnt_list);
 533                 vfs_unbusy(mp);
 534         }
 535         mtx_unlock(&mountlist_mtx);
 536         if (sfsp && count > maxcount)
 537                 td->td_retval[0] = maxcount;
 538         else
 539                 td->td_retval[0] = count;
 540         return (0);
 541 }
 542
 543 #ifdef COMPAT_FREEBSD4
 544 /*
 545  * Get old format filesystem statistics.
 546  */
 547 static void cvtstatfs(struct statfs *, struct ostatfs *);
 548
 549 #ifndef _SYS_SYSPROTO_H_
 550 struct freebsd4_statfs_args {
 551         char *path;
 552         struct ostatfs *buf;
 553 };
 554 #endif
 555 int
 556 freebsd4_statfs(td, uap)
 557         struct thread *td;
 558         struct freebsd4_statfs_args /* {
 559                 char *path;
 560                 struct ostatfs *buf;
 561         } */ *uap;
 562 {
 563         struct ostatfs osb;
 564         struct statfs sf;
 565         int error;
 566
 567         error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
 568         if (error != 0)
 569                 return (error);
 570         cvtstatfs(&sf, &osb);
 571         return (copyout(&osb, uap->buf, sizeof(osb)));
 572 }
 573
 574 /*
 575  * Get filesystem statistics.
 576  */
 577 #ifndef _SYS_SYSPROTO_H_
 578 struct freebsd4_fstatfs_args {
 579         int fd;
 580         struct ostatfs *buf;
 581 };
 582 #endif
 583 int
 584 freebsd4_fstatfs(td, uap)
 585         struct thread *td;
 586         struct freebsd4_fstatfs_args /* {
 587                 int fd;
 588                 struct ostatfs *buf;
 589         } */ *uap;
 590 {
 591         struct ostatfs osb;
 592         struct statfs sf;
 593         int error;
 594
 595         error = kern_fstatfs(td, uap->fd, &sf);
 596         if (error != 0)
 597                 return (error);
 598         cvtstatfs(&sf, &osb);
 599         return (copyout(&osb, uap->buf, sizeof(osb)));
 600 }
 601
 602 /*
 603  * Get statistics on all filesystems.
 604  */
 605 #ifndef _SYS_SYSPROTO_H_
 606 struct freebsd4_getfsstat_args {
 607         struct ostatfs *buf;
 608         long bufsize;
 609         int flags;
 610 };
 611 #endif
 612 int
 613 freebsd4_getfsstat(td, uap)
 614         struct thread *td;
 615         register struct freebsd4_getfsstat_args /* {
 616                 struct ostatfs *buf;
 617                 long bufsize;
 618                 int flags;
 619         } */ *uap;
 620 {
 621         struct statfs *buf, *sp;
 622         struct ostatfs osb;
 623         size_t count, size;
 624         int error;
 625
 626         count = uap->bufsize / sizeof(struct ostatfs);
 627         size = count * sizeof(struct statfs);
 628         error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
 629         if (size > 0) {
 630                 count = td->td_retval[0];
 631                 sp = buf;
 632                 while (count > 0 && error == 0) {
 633                         cvtstatfs(sp, &osb);
 634                         error = copyout(&osb, uap->buf, sizeof(osb));
 635                         sp++;
 636                         uap->buf++;
 637                         count--;
 638                 }
 639                 free(buf, M_TEMP);
 640         }
 641         return (error);
 642 }
 643
 644 /*
 645  * Implement fstatfs() for (NFS) file handles.
 646  */
 647 #ifndef _SYS_SYSPROTO_H_
 648 struct freebsd4_fhstatfs_args {
 649         struct fhandle *u_fhp;
 650         struct ostatfs *buf;
 651 };
 652 #endif
 653 int
 654 freebsd4_fhstatfs(td, uap)
 655         struct thread *td;
 656         struct freebsd4_fhstatfs_args /* {
 657                 struct fhandle *u_fhp;
 658                 struct ostatfs *buf;
 659         } */ *uap;
 660 {
 661         struct ostatfs osb;
 662         struct statfs sf;
 663         fhandle_t fh;
 664         int error;
 665
 666         error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 667         if (error != 0)
 668                 return (error);
 669         error = kern_fhstatfs(td, fh, &sf);
 670         if (error != 0)
 671                 return (error);
 672         cvtstatfs(&sf, &osb);
 673         return (copyout(&osb, uap->buf, sizeof(osb)));
 674 }
 675
 676 /*
 677  * Convert a new format statfs structure to an old format statfs structure.
 678  */
 679 static void
 680 cvtstatfs(nsp, osp)
 681         struct statfs *nsp;
 682         struct ostatfs *osp;
 683 {
 684
 685         statfs_scale_blocks(nsp, LONG_MAX);
 686         bzero(osp, sizeof(*osp));
 687         osp->f_bsize = nsp->f_bsize;
 688         osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
 689         osp->f_blocks = nsp->f_blocks;
 690         osp->f_bfree = nsp->f_bfree;
 691         osp->f_bavail = nsp->f_bavail;
 692         osp->f_files = MIN(nsp->f_files, LONG_MAX);
 693         osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
 694         osp->f_owner = nsp->f_owner;
 695         osp->f_type = nsp->f_type;
 696         osp->f_flags = nsp->f_flags;
 697         osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
 698         osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
 699         osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
 700         osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
 701         strlcpy(osp->f_fstypename, nsp->f_fstypename,
 702             MIN(MFSNAMELEN, OMFSNAMELEN));
 703         strlcpy(osp->f_mntonname, nsp->f_mntonname,
 704             MIN(MNAMELEN, OMNAMELEN));
 705         strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
 706             MIN(MNAMELEN, OMNAMELEN));
 707         osp->f_fsid = nsp->f_fsid;
 708 }
 709 #endif /* COMPAT_FREEBSD4 */
 710
 711 /*
 712  * Change current working directory to a given file descriptor.
 713  */
 714 #ifndef _SYS_SYSPROTO_H_
 715 struct fchdir_args {
 716         int     fd;
 717 };
 718 #endif
 719 int
 720 sys_fchdir(td, uap)
 721         struct thread *td;
 722         struct fchdir_args /* {
 723                 int fd;
 724         } */ *uap;
 725 {
 726         register struct filedesc *fdp = td->td_proc->p_fd;
 727         struct vnode *vp, *tdp, *vpold;
 728         struct mount *mp;
 729         struct file *fp;
 730         cap_rights_t rights;
 731         int error;
 732
 733         AUDIT_ARG_FD(uap->fd);
 734         error = getvnode(fdp, uap->fd, cap_rights_init(&rights, CAP_FCHDIR),
 735             &fp);
 736         if (error != 0)
 737                 return (error);
 738         vp = fp->f_vnode;
 739         VREF(vp);
 740         fdrop(fp, td);
 741         vn_lock(vp, LK_SHARED | LK_RETRY);
 742         AUDIT_ARG_VNODE1(vp);
 743         error = change_dir(vp, td);
 744         while (!error && (mp = vp->v_mountedhere) != NULL) {
 745                 if (vfs_busy(mp, 0))
 746                         continue;
 747                 error = VFS_ROOT(mp, LK_SHARED, &tdp);
 748                 vfs_unbusy(mp);
 749                 if (error != 0)
 750                         break;
 751                 vput(vp);
 752                 vp = tdp;
 753         }
 754         if (error != 0) {
 755                 vput(vp);
 756                 return (error);
 757         }
 758         VOP_UNLOCK(vp, 0);
 759         FILEDESC_XLOCK(fdp);
 760         vpold = fdp->fd_cdir;
 761         fdp->fd_cdir = vp;
 762         FILEDESC_XUNLOCK(fdp);
 763         vrele(vpold);
 764         return (0);
 765 }
 766
 767 /*
 768  * Change current working directory (``.'').
 769  */
 770 #ifndef _SYS_SYSPROTO_H_
 771 struct chdir_args {
 772         char    *path;
 773 };
 774 #endif
 775 int
 776 sys_chdir(td, uap)
 777         struct thread *td;
 778         struct chdir_args /* {
 779                 char *path;
 780         } */ *uap;
 781 {
 782
 783         return (kern_chdir(td, uap->path, UIO_USERSPACE));
 784 }
 785
 786 int
 787 kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
 788 {
 789         register struct filedesc *fdp = td->td_proc->p_fd;
 790         struct nameidata nd;
 791         struct vnode *vp;
 792         int error;
 793
 794         NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 795             pathseg, path, td);
 796         if ((error = namei(&nd)) != 0)
 797                 return (error);
 798         if ((error = change_dir(nd.ni_vp, td)) != 0) {
 799                 vput(nd.ni_vp);
 800                 NDFREE(&nd, NDF_ONLY_PNBUF);
 801                 return (error);
 802         }
 803         VOP_UNLOCK(nd.ni_vp, 0);
 804         NDFREE(&nd, NDF_ONLY_PNBUF);
 805         FILEDESC_XLOCK(fdp);
 806         vp = fdp->fd_cdir;
 807         fdp->fd_cdir = nd.ni_vp;
 808         FILEDESC_XUNLOCK(fdp);
 809         vrele(vp);
 810         return (0);
 811 }
 812
 813 /*
 814  * Helper function for raised chroot(2) security function:  Refuse if
 815  * any filedescriptors are open directories.
 816  */
 817 static int
 818 chroot_refuse_vdir_fds(fdp)
 819         struct filedesc *fdp;
 820 {
 821         struct vnode *vp;
 822         struct file *fp;
 823         int fd;
 824
 825         FILEDESC_LOCK_ASSERT(fdp);
 826
 827         for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
 828                 fp = fget_locked(fdp, fd);
 829                 if (fp == NULL)
 830                         continue;
 831                 if (fp->f_type == DTYPE_VNODE) {
 832                         vp = fp->f_vnode;
 833                         if (vp->v_type == VDIR)
 834                                 return (EPERM);
 835                 }
 836         }
 837         return (0);
 838 }
 839
 840 /*
 841  * This sysctl determines if we will allow a process to chroot(2) if it
 842  * has a directory open:
 843  *      0: disallowed for all processes.
 844  *      1: allowed for processes that were not already chroot(2)'ed.
 845  *      2: allowed for all processes.
 846  */
 847
 848 static int chroot_allow_open_directories = 1;
 849
 850 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
 851      &chroot_allow_open_directories, 0,
 852      "Allow a process to chroot(2) if it has a directory open");
 853
 854 /*
 855  * Change notion of root (``/'') directory.
 856  */
 857 #ifndef _SYS_SYSPROTO_H_
 858 struct chroot_args {
 859         char    *path;
 860 };
 861 #endif
 862 int
 863 sys_chroot(td, uap)
 864         struct thread *td;
 865         struct chroot_args /* {
 866                 char *path;
 867         } */ *uap;
 868 {
 869         struct nameidata nd;
 870         int error;
 871
 872         error = priv_check(td, PRIV_VFS_CHROOT);
 873         if (error != 0)
 874                 return (error);
 875         NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 876             UIO_USERSPACE, uap->path, td);
 877         error = namei(&nd);
 878         if (error != 0)
 879                 goto error;
 880         error = change_dir(nd.ni_vp, td);
 881         if (error != 0)
 882                 goto e_vunlock;
 883 #ifdef MAC
 884         error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
 885         if (error != 0)
 886                 goto e_vunlock;
 887 #endif
 888         VOP_UNLOCK(nd.ni_vp, 0);
 889         error = change_root(nd.ni_vp, td);
 890         vrele(nd.ni_vp);
 891         NDFREE(&nd, NDF_ONLY_PNBUF);
 892         return (error);
 893 e_vunlock:
 894         vput(nd.ni_vp);
 895 error:
 896         NDFREE(&nd, NDF_ONLY_PNBUF);
 897         return (error);
 898 }
 899
 900 /*
 901  * Common routine for chroot and chdir.  Callers must provide a locked vnode
 902  * instance.
 903  */
 904 int
 905 change_dir(vp, td)
 906         struct vnode *vp;
 907         struct thread *td;
 908 {
 909 #ifdef MAC
 910         int error;
 911 #endif
 912
 913         ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
 914         if (vp->v_type != VDIR)
 915                 return (ENOTDIR);
 916 #ifdef MAC
 917         error = mac_vnode_check_chdir(td->td_ucred, vp);
 918         if (error != 0)
 919                 return (error);
 920 #endif
 921         return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
 922 }
 923
 924 /*
 925  * Common routine for kern_chroot() and jail_attach().  The caller is
 926  * responsible for invoking priv_check() and mac_vnode_check_chroot() to
 927  * authorize this operation.
 928  */
 929 int
 930 change_root(vp, td)
 931         struct vnode *vp;
 932         struct thread *td;
 933 {
 934         struct filedesc *fdp;
 935         struct vnode *oldvp;
 936         int error;
 937
 938         fdp = td->td_proc->p_fd;
 939         FILEDESC_XLOCK(fdp);
 940         if (chroot_allow_open_directories == 0 ||
 941             (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
 942                 error = chroot_refuse_vdir_fds(fdp);
 943                 if (error != 0) {
 944                         FILEDESC_XUNLOCK(fdp);
 945                         return (error);
 946                 }
 947         }
 948         oldvp = fdp->fd_rdir;
 949         fdp->fd_rdir = vp;
 950         VREF(fdp->fd_rdir);
 951         if (!fdp->fd_jdir) {
 952                 fdp->fd_jdir = vp;
 953                 VREF(fdp->fd_jdir);
 954         }
 955         FILEDESC_XUNLOCK(fdp);
 956         vrele(oldvp);
 957         return (0);
 958 }
 959
 960 static __inline void
 961 flags_to_rights(int flags, cap_rights_t *rightsp)
 962 {
 963
 964         if (flags & O_EXEC) {
 965                 cap_rights_set(rightsp, CAP_FEXECVE);
 966         } else {
 967                 switch ((flags & O_ACCMODE)) {
 968                 case O_RDONLY:
 969                         cap_rights_set(rightsp, CAP_READ);
 970                         break;
 971                 case O_RDWR:
 972                         cap_rights_set(rightsp, CAP_READ);
 973                         /* FALLTHROUGH */
 974                 case O_WRONLY:
 975                         cap_rights_set(rightsp, CAP_WRITE);
 976                         if (!(flags & (O_APPEND | O_TRUNC)))
 977                                 cap_rights_set(rightsp, CAP_SEEK);
 978                         break;
 979                 }
 980         }
 981
 982         if (flags & O_CREAT)
 983                 cap_rights_set(rightsp, CAP_CREATE);
 984
 985         if (flags & O_TRUNC)
 986                 cap_rights_set(rightsp, CAP_FTRUNCATE);
 987
 988         if (flags & (O_SYNC | O_FSYNC))
 989                 cap_rights_set(rightsp, CAP_FSYNC);
 990
 991         if (flags & (O_EXLOCK | O_SHLOCK))
 992                 cap_rights_set(rightsp, CAP_FLOCK);
 993 }
 994
 995 /*
 996  * Check permissions, allocate an open file structure, and call the device
 997  * open routine if any.
 998  */
 999 #ifndef _SYS_SYSPROTO_H_
1000 struct open_args {
1001         char    *path;
1002         int     flags;
1003         int     mode;
1004 };
1005 #endif
1006 int
1007 sys_open(td, uap)
1008         struct thread *td;
1009         register struct open_args /* {
1010                 char *path;
1011                 int flags;
1012                 int mode;
1013         } */ *uap;
1014 {
1015
1016         return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode));
1017 }
1018
1019 #ifndef _SYS_SYSPROTO_H_
1020 struct openat_args {
1021         int     fd;
1022         char    *path;
1023         int     flag;
1024         int     mode;
1025 };
1026 #endif
1027 int
1028 sys_openat(struct thread *td, struct openat_args *uap)
1029 {
1030
1031         return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1032             uap->mode));
1033 }
1034
1035 int
1036 kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
1037     int mode)
1038 {
1039
1040         return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode));
1041 }
1042
1043 int
1044 kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1045     int flags, int mode)
1046 {
1047         struct proc *p = td->td_proc;
1048         struct filedesc *fdp = p->p_fd;
1049         struct file *fp;
1050         struct vnode *vp;
1051         struct nameidata nd;
1052         cap_rights_t rights;
1053         int cmode, error, indx;
1054
1055         indx = -1;
1056
1057         AUDIT_ARG_FFLAGS(flags);
1058         AUDIT_ARG_MODE(mode);
1059         /* XXX: audit dirfd */
1060         cap_rights_init(&rights, CAP_LOOKUP);
1061         flags_to_rights(flags, &rights);
1062         /*
1063          * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
1064          * may be specified.
1065          */
1066         if (flags & O_EXEC) {
1067                 if (flags & O_ACCMODE)
1068                         return (EINVAL);
1069         } else if ((flags & O_ACCMODE) == O_ACCMODE) {
1070                 return (EINVAL);
1071         } else {
1072                 flags = FFLAGS(flags);
1073         }
1074
1075         /*
1076          * Allocate the file descriptor, but don't install a descriptor yet.
1077          */
1078         error = falloc_noinstall(td, &fp);
1079         if (error != 0)
1080                 return (error);
1081         /*
1082          * An extra reference on `fp' has been held for us by
1083          * falloc_noinstall().
1084          */
1085         /* Set the flags early so the finit in devfs can pick them up. */
1086         fp->f_flag = flags & FMASK;
1087         cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
1088         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
1089             &rights, td);
1090         td->td_dupfd = -1;              /* XXX check for fdopen */
1091         error = vn_open(&nd, &flags, cmode, fp);
1092         if (error != 0) {
1093                 /*
1094                  * If the vn_open replaced the method vector, something
1095                  * wonderous happened deep below and we just pass it up
1096                  * pretending we know what we do.
1097                  */
1098                 if (error == ENXIO && fp->f_ops != &badfileops)
1099                         goto success;
1100
1101                 /*
1102                  * Handle special fdopen() case. bleh.
1103                  *
1104                  * Don't do this for relative (capability) lookups; we don't
1105                  * understand exactly what would happen, and we don't think
1106                  * that it ever should.
1107                  */
1108                 if (nd.ni_strictrelative == 0 &&
1109                     (error == ENODEV || error == ENXIO) &&
1110                     td->td_dupfd >= 0) {
1111                         error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
1112                             &indx);
1113                         if (error == 0)
1114                                 goto success;
1115                 }
1116
1117                 goto bad;
1118         }
1119         td->td_dupfd = 0;
1120         NDFREE(&nd, NDF_ONLY_PNBUF);
1121         vp = nd.ni_vp;
1122
1123         /*
1124          * Store the vnode, for any f_type. Typically, the vnode use
1125          * count is decremented by direct call to vn_closefile() for
1126          * files that switched type in the cdevsw fdopen() method.
1127          */
1128         fp->f_vnode = vp;
1129         /*
1130          * If the file wasn't claimed by devfs bind it to the normal
1131          * vnode operations here.
1132          */
1133         if (fp->f_ops == &badfileops) {
1134                 KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
1135                 fp->f_seqcount = 1;
1136                 finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK),
1137                     DTYPE_VNODE, vp, &vnops);
1138         }
1139
1140         VOP_UNLOCK(vp, 0);
1141         if (flags & O_TRUNC) {
1142                 error = fo_truncate(fp, 0, td->td_ucred, td);
1143                 if (error != 0)
1144                         goto bad;
1145         }
1146 success:
1147         /*
1148          * If we haven't already installed the FD (for dupfdopen), do so now.
1149          */
1150         if (indx == -1) {
1151                 struct filecaps *fcaps;
1152
1153 #ifdef CAPABILITIES
1154                 if (nd.ni_strictrelative == 1)
1155                         fcaps = &nd.ni_filecaps;
1156                 else
1157 #endif
1158                         fcaps = NULL;
1159                 error = finstall(td, fp, &indx, flags, fcaps);
1160                 /* On success finstall() consumes fcaps. */
1161                 if (error != 0) {
1162                         filecaps_free(&nd.ni_filecaps);
1163                         goto bad;
1164                 }
1165         } else {
1166                 filecaps_free(&nd.ni_filecaps);
1167         }
1168
1169         /*
1170          * Release our private reference, leaving the one associated with
1171          * the descriptor table intact.
1172          */
1173         fdrop(fp, td);
1174         td->td_retval[0] = indx;
1175         return (0);
1176 bad:
1177         KASSERT(indx == -1, ("indx=%d, should be -1", indx));
1178         fdrop(fp, td);
1179         return (error);
1180 }
1181
#ifdef COMPAT_43
/*
 * Old creat() entry point: opens the user-space path through kern_open()
 * with O_WRONLY | O_CREAT | O_TRUNC and the supplied mode.
 */
#ifndef _SYS_SYSPROTO_H_
struct ocreat_args {
	char	*path;
	int	mode;
};
#endif
int
ocreat(struct thread *td, struct ocreat_args *uap)
{
	int error;

	error = kern_open(td, uap->path, UIO_USERSPACE,
	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode);
	return (error);
}
#endif /* COMPAT_43 */
1205
1206 /*
1207  * Create a special file.
1208  */
1209 #ifndef _SYS_SYSPROTO_H_
1210 struct mknod_args {
1211         char    *path;
1212         int     mode;
1213         int     dev;
1214 };
1215 #endif
1216 int
1217 sys_mknod(td, uap)
1218         struct thread *td;
1219         register struct mknod_args /* {
1220                 char *path;
1221                 int mode;
1222                 int dev;
1223         } */ *uap;
1224 {
1225
1226         return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
1227 }
1228
1229 #ifndef _SYS_SYSPROTO_H_
1230 struct mknodat_args {
1231         int     fd;
1232         char    *path;
1233         mode_t  mode;
1234         dev_t   dev;
1235 };
1236 #endif
1237 int
1238 sys_mknodat(struct thread *td, struct mknodat_args *uap)
1239 {
1240
1241         return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1242             uap->dev));
1243 }
1244
1245 int
1246 kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
1247     int dev)
1248 {
1249
1250         return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev));
1251 }
1252
1253 int
1254 kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1255     int mode, int dev)
1256 {
1257         struct vnode *vp;
1258         struct mount *mp;
1259         struct vattr vattr;
1260         struct nameidata nd;
1261         cap_rights_t rights;
1262         int error, whiteout = 0;
1263
1264         AUDIT_ARG_MODE(mode);
1265         AUDIT_ARG_DEV(dev);
1266         switch (mode & S_IFMT) {
1267         case S_IFCHR:
1268         case S_IFBLK:
1269                 error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1270                 break;
1271         case S_IFMT:
1272                 error = priv_check(td, PRIV_VFS_MKNOD_BAD);
1273                 break;
1274         case S_IFWHT:
1275                 error = priv_check(td, PRIV_VFS_MKNOD_WHT);
1276                 break;
1277         case S_IFIFO:
1278                 if (dev == 0)
1279                         return (kern_mkfifoat(td, fd, path, pathseg, mode));
1280                 /* FALLTHROUGH */
1281         default:
1282                 error = EINVAL;
1283                 break;
1284         }
1285         if (error != 0)
1286                 return (error);
1287 restart:
1288         bwillwrite();
1289         NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1290             NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKNODAT),
1291             td);
1292         if ((error = namei(&nd)) != 0)
1293                 return (error);
1294         vp = nd.ni_vp;
1295         if (vp != NULL) {
1296                 NDFREE(&nd, NDF_ONLY_PNBUF);
1297                 if (vp == nd.ni_dvp)
1298                         vrele(nd.ni_dvp);
1299                 else
1300                         vput(nd.ni_dvp);
1301                 vrele(vp);
1302                 return (EEXIST);
1303         } else {
1304                 VATTR_NULL(&vattr);
1305                 vattr.va_mode = (mode & ALLPERMS) &
1306                     ~td->td_proc->p_fd->fd_cmask;
1307                 vattr.va_rdev = dev;
1308                 whiteout = 0;
1309
1310                 switch (mode & S_IFMT) {
1311                 case S_IFMT:    /* used by badsect to flag bad sectors */
1312                         vattr.va_type = VBAD;
1313                         break;
1314                 case S_IFCHR:
1315                         vattr.va_type = VCHR;
1316                         break;
1317                 case S_IFBLK:
1318                         vattr.va_type = VBLK;
1319                         break;
1320                 case S_IFWHT:
1321                         whiteout = 1;
1322                         break;
1323                 default:
1324                         panic("kern_mknod: invalid mode");
1325                 }
1326         }
1327         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1328                 NDFREE(&nd, NDF_ONLY_PNBUF);
1329                 vput(nd.ni_dvp);
1330                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1331                         return (error);
1332                 goto restart;
1333         }
1334 #ifdef MAC
1335         if (error == 0 && !whiteout)
1336                 error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
1337                     &nd.ni_cnd, &vattr);
1338 #endif
1339         if (error == 0) {
1340                 if (whiteout)
1341                         error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1342                 else {
1343                         error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1344                                                 &nd.ni_cnd, &vattr);
1345                         if (error == 0)
1346                                 vput(nd.ni_vp);
1347                 }
1348         }
1349         NDFREE(&nd, NDF_ONLY_PNBUF);
1350         vput(nd.ni_dvp);
1351         vn_finished_write(mp);
1352         return (error);
1353 }
1354
1355 /*
1356  * Create a named pipe.
1357  */
1358 #ifndef _SYS_SYSPROTO_H_
1359 struct mkfifo_args {
1360         char    *path;
1361         int     mode;
1362 };
1363 #endif
1364 int
1365 sys_mkfifo(td, uap)
1366         struct thread *td;
1367         register struct mkfifo_args /* {
1368                 char *path;
1369                 int mode;
1370         } */ *uap;
1371 {
1372
1373         return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
1374 }
1375
1376 #ifndef _SYS_SYSPROTO_H_
1377 struct mkfifoat_args {
1378         int     fd;
1379         char    *path;
1380         mode_t  mode;
1381 };
1382 #endif
1383 int
1384 sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1385 {
1386
1387         return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1388             uap->mode));
1389 }
1390
1391 int
1392 kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
1393 {
1394
1395         return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode));
1396 }
1397
1398 int
1399 kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1400     int mode)
1401 {
1402         struct mount *mp;
1403         struct vattr vattr;
1404         struct nameidata nd;
1405         cap_rights_t rights;
1406         int error;
1407
1408         AUDIT_ARG_MODE(mode);
1409 restart:
1410         bwillwrite();
1411         NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1412             NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKFIFOAT),
1413             td);
1414         if ((error = namei(&nd)) != 0)
1415                 return (error);
1416         if (nd.ni_vp != NULL) {
1417                 NDFREE(&nd, NDF_ONLY_PNBUF);
1418                 if (nd.ni_vp == nd.ni_dvp)
1419                         vrele(nd.ni_dvp);
1420                 else
1421                         vput(nd.ni_dvp);
1422                 vrele(nd.ni_vp);
1423                 return (EEXIST);
1424         }
1425         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1426                 NDFREE(&nd, NDF_ONLY_PNBUF);
1427                 vput(nd.ni_dvp);
1428                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1429                         return (error);
1430                 goto restart;
1431         }
1432         VATTR_NULL(&vattr);
1433         vattr.va_type = VFIFO;
1434         vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
1435 #ifdef MAC
1436         error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1437             &vattr);
1438         if (error != 0)
1439                 goto out;
1440 #endif
1441         error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1442         if (error == 0)
1443                 vput(nd.ni_vp);
1444 #ifdef MAC
1445 out:
1446 #endif
1447         vput(nd.ni_dvp);
1448         vn_finished_write(mp);
1449         NDFREE(&nd, NDF_ONLY_PNBUF);
1450         return (error);
1451 }
1452
1453 /*
1454  * Make a hard file link.
1455  */
1456 #ifndef _SYS_SYSPROTO_H_
1457 struct link_args {
1458         char    *path;
1459         char    *link;
1460 };
1461 #endif
1462 int
1463 sys_link(td, uap)
1464         struct thread *td;
1465         register struct link_args /* {
1466                 char *path;
1467                 char *link;
1468         } */ *uap;
1469 {
1470
1471         return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
1472 }
1473
1474 #ifndef _SYS_SYSPROTO_H_
1475 struct linkat_args {
1476         int     fd1;
1477         char    *path1;
1478         int     fd2;
1479         char    *path2;
1480         int     flag;
1481 };
1482 #endif
1483 int
1484 sys_linkat(struct thread *td, struct linkat_args *uap)
1485 {
1486         int flag;
1487
1488         flag = uap->flag;
1489         if (flag & ~AT_SYMLINK_FOLLOW)
1490                 return (EINVAL);
1491
1492         return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1493             UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
1494 }
1495
1496 int hardlink_check_uid = 0;
1497 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
1498     &hardlink_check_uid, 0,
1499     "Unprivileged processes cannot create hard links to files owned by other "
1500     "users");
1501 static int hardlink_check_gid = 0;
1502 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
1503     &hardlink_check_gid, 0,
1504     "Unprivileged processes cannot create hard links to files owned by other "
1505     "groups");
1506
1507 static int
1508 can_hardlink(struct vnode *vp, struct ucred *cred)
1509 {
1510         struct vattr va;
1511         int error;
1512
1513         if (!hardlink_check_uid && !hardlink_check_gid)
1514                 return (0);
1515
1516         error = VOP_GETATTR(vp, &va, cred);
1517         if (error != 0)
1518                 return (error);
1519
1520         if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1521                 error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1522                 if (error != 0)
1523                         return (error);
1524         }
1525
1526         if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1527                 error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1528                 if (error != 0)
1529                         return (error);
1530         }
1531
1532         return (0);
1533 }
1534
1535 int
1536 kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
1537 {
1538
1539         return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path,link, segflg, FOLLOW));
1540 }
1541
1542 int
1543 kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
1544     enum uio_seg segflg, int follow)
1545 {
1546         struct vnode *vp;
1547         struct mount *mp;
1548         struct nameidata nd;
1549         cap_rights_t rights;
1550         int error;
1551
1552 again:
1553         bwillwrite();
1554         NDINIT_AT(&nd, LOOKUP, follow | AUDITVNODE1, segflg, path1, fd1, td);
1555
1556         if ((error = namei(&nd)) != 0)
1557                 return (error);
1558         NDFREE(&nd, NDF_ONLY_PNBUF);
1559         vp = nd.ni_vp;
1560         if (vp->v_type == VDIR) {
1561                 vrele(vp);
1562                 return (EPERM);         /* POSIX */
1563         }
1564         NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE2 |
1565             NOCACHE, segflg, path2, fd2, cap_rights_init(&rights, CAP_LINKAT),
1566             td);
1567         if ((error = namei(&nd)) == 0) {
1568                 if (nd.ni_vp != NULL) {
1569                         NDFREE(&nd, NDF_ONLY_PNBUF);
1570                         if (nd.ni_dvp == nd.ni_vp)
1571                                 vrele(nd.ni_dvp);
1572                         else
1573                                 vput(nd.ni_dvp);
1574                         vrele(nd.ni_vp);
1575                         vrele(vp);
1576                         return (EEXIST);
1577                 } else if (nd.ni_dvp->v_mount != vp->v_mount) {
1578                         /*
1579                          * Cross-device link.  No need to recheck
1580                          * vp->v_type, since it cannot change, except
1581                          * to VBAD.
1582                          */
1583                         NDFREE(&nd, NDF_ONLY_PNBUF);
1584                         vput(nd.ni_dvp);
1585                         vrele(vp);
1586                         return (EXDEV);
1587                 } else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) {
1588                         error = can_hardlink(vp, td->td_ucred);
1589 #ifdef MAC
1590                         if (error == 0)
1591                                 error = mac_vnode_check_link(td->td_ucred,
1592                                     nd.ni_dvp, vp, &nd.ni_cnd);
1593 #endif
1594                         if (error != 0) {
1595                                 vput(vp);
1596                                 vput(nd.ni_dvp);
1597                                 NDFREE(&nd, NDF_ONLY_PNBUF);
1598                                 return (error);
1599                         }
1600                         error = vn_start_write(vp, &mp, V_NOWAIT);
1601                         if (error != 0) {
1602                                 vput(vp);
1603                                 vput(nd.ni_dvp);
1604                                 NDFREE(&nd, NDF_ONLY_PNBUF);
1605                                 error = vn_start_write(NULL, &mp,
1606                                     V_XSLEEP | PCATCH);
1607                                 if (error != 0)
1608                                         return (error);
1609                                 goto again;
1610                         }
1611                         error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1612                         VOP_UNLOCK(vp, 0);
1613                         vput(nd.ni_dvp);
1614                         vn_finished_write(mp);
1615                         NDFREE(&nd, NDF_ONLY_PNBUF);
1616                 } else {
1617                         vput(nd.ni_dvp);
1618                         NDFREE(&nd, NDF_ONLY_PNBUF);
1619                         vrele(vp);
1620                         goto again;
1621                 }
1622         }
1623         vrele(vp);
1624         return (error);
1625 }
1626
1627 /*
1628  * Make a symbolic link.
1629  */
1630 #ifndef _SYS_SYSPROTO_H_
1631 struct symlink_args {
1632         char    *path;
1633         char    *link;
1634 };
1635 #endif
1636 int
1637 sys_symlink(td, uap)
1638         struct thread *td;
1639         register struct symlink_args /* {
1640                 char *path;
1641                 char *link;
1642         } */ *uap;
1643 {
1644
1645         return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
1646 }
1647
1648 #ifndef _SYS_SYSPROTO_H_
1649 struct symlinkat_args {
1650         char    *path;
1651         int     fd;
1652         char    *path2;
1653 };
1654 #endif
1655 int
1656 sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1657 {
1658
1659         return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1660             UIO_USERSPACE));
1661 }
1662
1663 int
1664 kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
1665 {
1666
1667         return (kern_symlinkat(td, path, AT_FDCWD, link, segflg));
1668 }
1669
1670 int
1671 kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
1672     enum uio_seg segflg)
1673 {
1674         struct mount *mp;
1675         struct vattr vattr;
1676         char *syspath;
1677         struct nameidata nd;
1678         int error;
1679         cap_rights_t rights;
1680
1681         if (segflg == UIO_SYSSPACE) {
1682                 syspath = path1;
1683         } else {
1684                 syspath = uma_zalloc(namei_zone, M_WAITOK);
1685                 if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
1686                         goto out;
1687         }
1688         AUDIT_ARG_TEXT(syspath);
1689 restart:
1690         bwillwrite();
1691         NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1692             NOCACHE, segflg, path2, fd, cap_rights_init(&rights, CAP_SYMLINKAT),
1693             td);
1694         if ((error = namei(&nd)) != 0)
1695                 goto out;
1696         if (nd.ni_vp) {
1697                 NDFREE(&nd, NDF_ONLY_PNBUF);
1698                 if (nd.ni_vp == nd.ni_dvp)
1699                         vrele(nd.ni_dvp);
1700                 else
1701                         vput(nd.ni_dvp);
1702                 vrele(nd.ni_vp);
1703                 error = EEXIST;
1704                 goto out;
1705         }
1706         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1707                 NDFREE(&nd, NDF_ONLY_PNBUF);
1708                 vput(nd.ni_dvp);
1709                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1710                         goto out;
1711                 goto restart;
1712         }
1713         VATTR_NULL(&vattr);
1714         vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
1715 #ifdef MAC
1716         vattr.va_type = VLNK;
1717         error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1718             &vattr);
1719         if (error != 0)
1720                 goto out2;
1721 #endif
1722         error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
1723         if (error == 0)
1724                 vput(nd.ni_vp);
1725 #ifdef MAC
1726 out2:
1727 #endif
1728         NDFREE(&nd, NDF_ONLY_PNBUF);
1729         vput(nd.ni_dvp);
1730         vn_finished_write(mp);
1731 out:
1732         if (segflg != UIO_SYSSPACE)
1733                 uma_zfree(namei_zone, syspath);
1734         return (error);
1735 }
1736
1737 /*
1738  * Delete a whiteout from the filesystem.
1739  */
1740 int
1741 sys_undelete(td, uap)
1742         struct thread *td;
1743         register struct undelete_args /* {
1744                 char *path;
1745         } */ *uap;
1746 {
1747         struct mount *mp;
1748         struct nameidata nd;
1749         int error;
1750
1751 restart:
1752         bwillwrite();
1753         NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
1754             UIO_USERSPACE, uap->path, td);
1755         error = namei(&nd);
1756         if (error != 0)
1757                 return (error);
1758
1759         if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
1760                 NDFREE(&nd, NDF_ONLY_PNBUF);
1761                 if (nd.ni_vp == nd.ni_dvp)
1762                         vrele(nd.ni_dvp);
1763                 else
1764                         vput(nd.ni_dvp);
1765                 if (nd.ni_vp)
1766                         vrele(nd.ni_vp);
1767                 return (EEXIST);
1768         }
1769         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1770                 NDFREE(&nd, NDF_ONLY_PNBUF);
1771                 vput(nd.ni_dvp);
1772                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1773                         return (error);
1774                 goto restart;
1775         }
1776         error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
1777         NDFREE(&nd, NDF_ONLY_PNBUF);
1778         vput(nd.ni_dvp);
1779         vn_finished_write(mp);
1780         return (error);
1781 }
1782
1783 /*
1784  * Delete a name from the filesystem.
1785  */
1786 #ifndef _SYS_SYSPROTO_H_
1787 struct unlink_args {
1788         char    *path;
1789 };
1790 #endif
1791 int
1792 sys_unlink(td, uap)
1793         struct thread *td;
1794         struct unlink_args /* {
1795                 char *path;
1796         } */ *uap;
1797 {
1798
1799         return (kern_unlink(td, uap->path, UIO_USERSPACE));
1800 }
1801
1802 #ifndef _SYS_SYSPROTO_H_
1803 struct unlinkat_args {
1804         int     fd;
1805         char    *path;
1806         int     flag;
1807 };
1808 #endif
1809 int
1810 sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1811 {
1812         int flag = uap->flag;
1813         int fd = uap->fd;
1814         char *path = uap->path;
1815
1816         if (flag & ~AT_REMOVEDIR)
1817                 return (EINVAL);
1818
1819         if (flag & AT_REMOVEDIR)
1820                 return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
1821         else
1822                 return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
1823 }
1824
1825 int
1826 kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
1827 {
1828
1829         return (kern_unlinkat(td, AT_FDCWD, path, pathseg, 0));
1830 }
1831
1832 int
1833 kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1834     ino_t oldinum)
1835 {
1836         struct mount *mp;
1837         struct vnode *vp;
1838         struct nameidata nd;
1839         struct stat sb;
1840         cap_rights_t rights;
1841         int error;
1842
1843 restart:
1844         bwillwrite();
1845         NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
1846             pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
1847         if ((error = namei(&nd)) != 0)
1848                 return (error == EINVAL ? EPERM : error);
1849         vp = nd.ni_vp;
1850         if (vp->v_type == VDIR && oldinum == 0) {
1851                 error = EPERM;          /* POSIX */
1852         } else if (oldinum != 0 &&
1853                   ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
1854                   sb.st_ino != oldinum) {
1855                         error = EIDRM;  /* Identifier removed */
1856         } else {
1857                 /*
1858                  * The root of a mounted filesystem cannot be deleted.
1859                  *
1860                  * XXX: can this only be a VDIR case?
1861                  */
1862                 if (vp->v_vflag & VV_ROOT)
1863                         error = EBUSY;
1864         }
1865         if (error == 0) {
1866                 if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1867                         NDFREE(&nd, NDF_ONLY_PNBUF);
1868                         vput(nd.ni_dvp);
1869                         if (vp == nd.ni_dvp)
1870                                 vrele(vp);
1871                         else
1872                                 vput(vp);
1873                         if ((error = vn_start_write(NULL, &mp,
1874                             V_XSLEEP | PCATCH)) != 0)
1875                                 return (error);
1876                         goto restart;
1877                 }
1878 #ifdef MAC
1879                 error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
1880                     &nd.ni_cnd);
1881                 if (error != 0)
1882                         goto out;
1883 #endif
1884                 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
1885                 error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
1886 #ifdef MAC
1887 out:
1888 #endif
1889                 vn_finished_write(mp);
1890         }
1891         NDFREE(&nd, NDF_ONLY_PNBUF);
1892         vput(nd.ni_dvp);
1893         if (vp == nd.ni_dvp)
1894                 vrele(vp);
1895         else
1896                 vput(vp);
1897         return (error);
1898 }
1899
1900 /*
1901  * Reposition read/write file offset.
1902  */
1903 #ifndef _SYS_SYSPROTO_H_
1904 struct lseek_args {
1905         int     fd;
1906         int     pad;
1907         off_t   offset;
1908         int     whence;
1909 };
1910 #endif
1911 int
1912 sys_lseek(td, uap)
1913         struct thread *td;
1914         register struct lseek_args /* {
1915                 int fd;
1916                 int pad;
1917                 off_t offset;
1918                 int whence;
1919         } */ *uap;
1920 {
1921         struct file *fp;
1922         cap_rights_t rights;
1923         int error;
1924
1925         AUDIT_ARG_FD(uap->fd);
1926         error = fget(td, uap->fd, cap_rights_init(&rights, CAP_SEEK), &fp);
1927         if (error != 0)
1928                 return (error);
1929         error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
1930             fo_seek(fp, uap->offset, uap->whence, td) : ESPIPE;
1931         fdrop(fp, td);
1932         return (error);
1933 }
1934
1935 #if defined(COMPAT_43)
1936 /*
1937  * Reposition read/write file offset.
1938  */
1939 #ifndef _SYS_SYSPROTO_H_
1940 struct olseek_args {
1941         int     fd;
1942         long    offset;
1943         int     whence;
1944 };
1945 #endif
1946 int
1947 olseek(td, uap)
1948         struct thread *td;
1949         register struct olseek_args /* {
1950                 int fd;
1951                 long offset;
1952                 int whence;
1953         } */ *uap;
1954 {
1955         struct lseek_args /* {
1956                 int fd;
1957                 int pad;
1958                 off_t offset;
1959                 int whence;
1960         } */ nuap;
1961
1962         nuap.fd = uap->fd;
1963         nuap.offset = uap->offset;
1964         nuap.whence = uap->whence;
1965         return (sys_lseek(td, &nuap));
1966 }
1967 #endif /* COMPAT_43 */
1968
1969 /* Version with the 'pad' argument */
1970 int
1971 freebsd6_lseek(td, uap)
1972         struct thread *td;
1973         register struct freebsd6_lseek_args *uap;
1974 {
1975         struct lseek_args ouap;
1976
1977         ouap.fd = uap->fd;
1978         ouap.offset = uap->offset;
1979         ouap.whence = uap->whence;
1980         return (sys_lseek(td, &ouap));
1981 }
1982
1983 /*
1984  * Check access permissions using passed credentials.
1985  */
1986 static int
1987 vn_access(vp, user_flags, cred, td)
1988         struct vnode    *vp;
1989         int             user_flags;
1990         struct ucred    *cred;
1991         struct thread   *td;
1992 {
1993         accmode_t accmode;
1994         int error;
1995
1996         /* Flags == 0 means only check for existence. */
1997         error = 0;
1998         if (user_flags) {
1999                 accmode = 0;
2000                 if (user_flags & R_OK)
2001                         accmode |= VREAD;
2002                 if (user_flags & W_OK)
2003                         accmode |= VWRITE;
2004                 if (user_flags & X_OK)
2005                         accmode |= VEXEC;
2006 #ifdef MAC
2007                 error = mac_vnode_check_access(cred, vp, accmode);
2008                 if (error != 0)
2009                         return (error);
2010 #endif
2011                 if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
2012                         error = VOP_ACCESS(vp, accmode, cred, td);
2013         }
2014         return (error);
2015 }
2016
2017 /*
2018  * Check access permissions using "real" credentials.
2019  */
2020 #ifndef _SYS_SYSPROTO_H_
2021 struct access_args {
2022         char    *path;
2023         int     amode;
2024 };
2025 #endif
2026 int
2027 sys_access(td, uap)
2028         struct thread *td;
2029         register struct access_args /* {
2030                 char *path;
2031                 int amode;
2032         } */ *uap;
2033 {
2034
2035         return (kern_access(td, uap->path, UIO_USERSPACE, uap->amode));
2036 }
2037
2038 #ifndef _SYS_SYSPROTO_H_
2039 struct faccessat_args {
2040         int     dirfd;
2041         char    *path;
2042         int     amode;
2043         int     flag;
2044 }
2045 #endif
2046 int
2047 sys_faccessat(struct thread *td, struct faccessat_args *uap)
2048 {
2049
2050         if (uap->flag & ~AT_EACCESS)
2051                 return (EINVAL);
2052         return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
2053             uap->amode));
2054 }
2055
2056 int
2057 kern_access(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2058 {
2059
2060         return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, amode));
2061 }
2062
2063 int
2064 kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2065     int flag, int amode)
2066 {
2067         struct ucred *cred, *tmpcred;
2068         struct vnode *vp;
2069         struct nameidata nd;
2070         cap_rights_t rights;
2071         int error;
2072
2073         /*
2074          * Create and modify a temporary credential instead of one that
2075          * is potentially shared.
2076          */
2077         if (!(flag & AT_EACCESS)) {
2078                 cred = td->td_ucred;
2079                 tmpcred = crdup(cred);
2080                 tmpcred->cr_uid = cred->cr_ruid;
2081                 tmpcred->cr_groups[0] = cred->cr_rgid;
2082                 td->td_ucred = tmpcred;
2083         } else
2084                 cred = tmpcred = td->td_ucred;
2085         AUDIT_ARG_VALUE(amode);
2086         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
2087             AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FSTAT),
2088             td);
2089         if ((error = namei(&nd)) != 0)
2090                 goto out1;
2091         vp = nd.ni_vp;
2092
2093         error = vn_access(vp, amode, tmpcred, td);
2094         NDFREE(&nd, NDF_ONLY_PNBUF);
2095         vput(vp);
2096 out1:
2097         if (!(flag & AT_EACCESS)) {
2098                 td->td_ucred = cred;
2099                 crfree(tmpcred);
2100         }
2101         return (error);
2102 }
2103
2104 /*
2105  * Check access permissions using "effective" credentials.
2106  */
2107 #ifndef _SYS_SYSPROTO_H_
2108 struct eaccess_args {
2109         char    *path;
2110         int     amode;
2111 };
2112 #endif
2113 int
2114 sys_eaccess(td, uap)
2115         struct thread *td;
2116         register struct eaccess_args /* {
2117                 char *path;
2118                 int amode;
2119         } */ *uap;
2120 {
2121
2122         return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->amode));
2123 }
2124
2125 int
2126 kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2127 {
2128
2129         return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, amode));
2130 }
2131
2132 #if defined(COMPAT_43)
2133 /*
2134  * Get file status; this version follows links.
2135  */
2136 #ifndef _SYS_SYSPROTO_H_
2137 struct ostat_args {
2138         char    *path;
2139         struct ostat *ub;
2140 };
2141 #endif
2142 int
2143 ostat(td, uap)
2144         struct thread *td;
2145         register struct ostat_args /* {
2146                 char *path;
2147                 struct ostat *ub;
2148         } */ *uap;
2149 {
2150         struct stat sb;
2151         struct ostat osb;
2152         int error;
2153
2154         error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2155         if (error != 0)
2156                 return (error);
2157         cvtstat(&sb, &osb);
2158         return (copyout(&osb, uap->ub, sizeof (osb)));
2159 }
2160
2161 /*
2162  * Get file status; this version does not follow links.
2163  */
2164 #ifndef _SYS_SYSPROTO_H_
2165 struct olstat_args {
2166         char    *path;
2167         struct ostat *ub;
2168 };
2169 #endif
2170 int
2171 olstat(td, uap)
2172         struct thread *td;
2173         register struct olstat_args /* {
2174                 char *path;
2175                 struct ostat *ub;
2176         } */ *uap;
2177 {
2178         struct stat sb;
2179         struct ostat osb;
2180         int error;
2181
2182         error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2183         if (error != 0)
2184                 return (error);
2185         cvtstat(&sb, &osb);
2186         return (copyout(&osb, uap->ub, sizeof (osb)));
2187 }
2188
2189 /*
2190  * Convert from an old to a new stat structure.
2191  */
2192 void
2193 cvtstat(st, ost)
2194         struct stat *st;
2195         struct ostat *ost;
2196 {
2197
2198         ost->st_dev = st->st_dev;
2199         ost->st_ino = st->st_ino;
2200         ost->st_mode = st->st_mode;
2201         ost->st_nlink = st->st_nlink;
2202         ost->st_uid = st->st_uid;
2203         ost->st_gid = st->st_gid;
2204         ost->st_rdev = st->st_rdev;
2205         if (st->st_size < (quad_t)1 << 32)
2206                 ost->st_size = st->st_size;
2207         else
2208                 ost->st_size = -2;
2209         ost->st_atim = st->st_atim;
2210         ost->st_mtim = st->st_mtim;
2211         ost->st_ctim = st->st_ctim;
2212         ost->st_blksize = st->st_blksize;
2213         ost->st_blocks = st->st_blocks;
2214         ost->st_flags = st->st_flags;
2215         ost->st_gen = st->st_gen;
2216 }
2217 #endif /* COMPAT_43 */
2218
2219 /*
2220  * Get file status; this version follows links.
2221  */
2222 #ifndef _SYS_SYSPROTO_H_
2223 struct stat_args {
2224         char    *path;
2225         struct stat *ub;
2226 };
2227 #endif
2228 int
2229 sys_stat(td, uap)
2230         struct thread *td;
2231         register struct stat_args /* {
2232                 char *path;
2233                 struct stat *ub;
2234         } */ *uap;
2235 {
2236         struct stat sb;
2237         int error;
2238
2239         error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2240         if (error == 0)
2241                 error = copyout(&sb, uap->ub, sizeof (sb));
2242         return (error);
2243 }
2244
2245 #ifndef _SYS_SYSPROTO_H_
2246 struct fstatat_args {
2247         int     fd;
2248         char    *path;
2249         struct stat     *buf;
2250         int     flag;
2251 }
2252 #endif
2253 int
2254 sys_fstatat(struct thread *td, struct fstatat_args *uap)
2255 {
2256         struct stat sb;
2257         int error;
2258
2259         error = kern_statat(td, uap->flag, uap->fd, uap->path,
2260             UIO_USERSPACE, &sb);
2261         if (error == 0)
2262                 error = copyout(&sb, uap->buf, sizeof (sb));
2263         return (error);
2264 }
2265
2266 int
2267 kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2268 {
2269
2270         return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp));
2271 }
2272
2273 int
2274 kern_statat(struct thread *td, int flag, int fd, char *path,
2275     enum uio_seg pathseg, struct stat *sbp)
2276 {
2277
2278         return (kern_statat_vnhook(td, flag, fd, path, pathseg, sbp, NULL));
2279 }
2280
2281 int
2282 kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
2283     enum uio_seg pathseg, struct stat *sbp,
2284     void (*hook)(struct vnode *vp, struct stat *sbp))
2285 {
2286         struct nameidata nd;
2287         struct stat sb;
2288         cap_rights_t rights;
2289         int error;
2290
2291         if (flag & ~AT_SYMLINK_NOFOLLOW)
2292                 return (EINVAL);
2293
2294         NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2295             FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd,
2296             cap_rights_init(&rights, CAP_FSTAT), td);
2297
2298         if ((error = namei(&nd)) != 0)
2299                 return (error);
2300         error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
2301         if (error == 0) {
2302                 SDT_PROBE(vfs, , stat, mode, path, sb.st_mode, 0, 0, 0);
2303                 if (S_ISREG(sb.st_mode))
2304                         SDT_PROBE(vfs, , stat, reg, path, pathseg, 0, 0, 0);
2305                 if (__predict_false(hook != NULL))
2306                         hook(nd.ni_vp, &sb);
2307         }
2308         NDFREE(&nd, NDF_ONLY_PNBUF);
2309         vput(nd.ni_vp);
2310         if (error != 0)
2311                 return (error);
2312         *sbp = sb;
2313 #ifdef KTRACE
2314         if (KTRPOINT(td, KTR_STRUCT))
2315                 ktrstat(&sb);
2316 #endif
2317         return (0);
2318 }
2319
2320 /*
2321  * Get file status; this version does not follow links.
2322  */
2323 #ifndef _SYS_SYSPROTO_H_
2324 struct lstat_args {
2325         char    *path;
2326         struct stat *ub;
2327 };
2328 #endif
2329 int
2330 sys_lstat(td, uap)
2331         struct thread *td;
2332         register struct lstat_args /* {
2333                 char *path;
2334                 struct stat *ub;
2335         } */ *uap;
2336 {
2337         struct stat sb;
2338         int error;
2339
2340         error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2341         if (error == 0)
2342                 error = copyout(&sb, uap->ub, sizeof (sb));
2343         return (error);
2344 }
2345
2346 int
2347 kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2348 {
2349
2350         return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg,
2351             sbp));
2352 }
2353
2354 /*
2355  * Implementation of the NetBSD [l]stat() functions.
2356  */
2357 void
2358 cvtnstat(sb, nsb)
2359         struct stat *sb;
2360         struct nstat *nsb;
2361 {
2362
2363         bzero(nsb, sizeof *nsb);
2364         nsb->st_dev = sb->st_dev;
2365         nsb->st_ino = sb->st_ino;
2366         nsb->st_mode = sb->st_mode;
2367         nsb->st_nlink = sb->st_nlink;
2368         nsb->st_uid = sb->st_uid;
2369         nsb->st_gid = sb->st_gid;
2370         nsb->st_rdev = sb->st_rdev;
2371         nsb->st_atim = sb->st_atim;
2372         nsb->st_mtim = sb->st_mtim;
2373         nsb->st_ctim = sb->st_ctim;
2374         nsb->st_size = sb->st_size;
2375         nsb->st_blocks = sb->st_blocks;
2376         nsb->st_blksize = sb->st_blksize;
2377         nsb->st_flags = sb->st_flags;
2378         nsb->st_gen = sb->st_gen;
2379         nsb->st_birthtim = sb->st_birthtim;
2380 }
2381
2382 #ifndef _SYS_SYSPROTO_H_
2383 struct nstat_args {
2384         char    *path;
2385         struct nstat *ub;
2386 };
2387 #endif
2388 int
2389 sys_nstat(td, uap)
2390         struct thread *td;
2391         register struct nstat_args /* {
2392                 char *path;
2393                 struct nstat *ub;
2394         } */ *uap;
2395 {
2396         struct stat sb;
2397         struct nstat nsb;
2398         int error;
2399
2400         error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2401         if (error != 0)
2402                 return (error);
2403         cvtnstat(&sb, &nsb);
2404         return (copyout(&nsb, uap->ub, sizeof (nsb)));
2405 }
2406
2407 /*
2408  * NetBSD lstat.  Get file status; this version does not follow links.
2409  */
2410 #ifndef _SYS_SYSPROTO_H_
2411 struct lstat_args {
2412         char    *path;
2413         struct stat *ub;
2414 };
2415 #endif
2416 int
2417 sys_nlstat(td, uap)
2418         struct thread *td;
2419         register struct nlstat_args /* {
2420                 char *path;
2421                 struct nstat *ub;
2422         } */ *uap;
2423 {
2424         struct stat sb;
2425         struct nstat nsb;
2426         int error;
2427
2428         error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2429         if (error != 0)
2430                 return (error);
2431         cvtnstat(&sb, &nsb);
2432         return (copyout(&nsb, uap->ub, sizeof (nsb)));
2433 }
2434
2435 /*
2436  * Get configurable pathname variables.
2437  */
2438 #ifndef _SYS_SYSPROTO_H_
2439 struct pathconf_args {
2440         char    *path;
2441         int     name;
2442 };
2443 #endif
2444 int
2445 sys_pathconf(td, uap)
2446         struct thread *td;
2447         register struct pathconf_args /* {
2448                 char *path;
2449                 int name;
2450         } */ *uap;
2451 {
2452
2453         return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
2454 }
2455
2456 #ifndef _SYS_SYSPROTO_H_
2457 struct lpathconf_args {
2458         char    *path;
2459         int     name;
2460 };
2461 #endif
2462 int
2463 sys_lpathconf(td, uap)
2464         struct thread *td;
2465         register struct lpathconf_args /* {
2466                 char *path;
2467                 int name;
2468         } */ *uap;
2469 {
2470
2471         return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
2472             NOFOLLOW));
2473 }
2474
2475 int
2476 kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
2477     u_long flags)
2478 {
2479         struct nameidata nd;
2480         int error;
2481
2482         NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
2483             pathseg, path, td);
2484         if ((error = namei(&nd)) != 0)
2485                 return (error);
2486         NDFREE(&nd, NDF_ONLY_PNBUF);
2487
2488         /* If asynchronous I/O is available, it works for all files. */
2489         if (name == _PC_ASYNC_IO)
2490                 td->td_retval[0] = async_io_version;
2491         else
2492                 error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
2493         vput(nd.ni_vp);
2494         return (error);
2495 }
2496
2497 /*
2498  * Return target name of a symbolic link.
2499  */
2500 #ifndef _SYS_SYSPROTO_H_
2501 struct readlink_args {
2502         char    *path;
2503         char    *buf;
2504         size_t  count;
2505 };
2506 #endif
2507 int
2508 sys_readlink(td, uap)
2509         struct thread *td;
2510         register struct readlink_args /* {
2511                 char *path;
2512                 char *buf;
2513                 size_t count;
2514         } */ *uap;
2515 {
2516
2517         return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
2518             UIO_USERSPACE, uap->count));
2519 }
2520 #ifndef _SYS_SYSPROTO_H_
2521 struct readlinkat_args {
2522         int     fd;
2523         char    *path;
2524         char    *buf;
2525         size_t  bufsize;
2526 };
2527 #endif
2528 int
2529 sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2530 {
2531
2532         return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2533             uap->buf, UIO_USERSPACE, uap->bufsize));
2534 }
2535
2536 int
2537 kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
2538     enum uio_seg bufseg, size_t count)
2539 {
2540
2541         return (kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg,
2542             count));
2543 }
2544
2545 int
2546 kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2547     char *buf, enum uio_seg bufseg, size_t count)
2548 {
2549         struct vnode *vp;
2550         struct iovec aiov;
2551         struct uio auio;
2552         struct nameidata nd;
2553         int error;
2554
2555         if (count > IOSIZE_MAX)
2556                 return (EINVAL);
2557
2558         NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
2559             pathseg, path, fd, td);
2560
2561         if ((error = namei(&nd)) != 0)
2562                 return (error);
2563         NDFREE(&nd, NDF_ONLY_PNBUF);
2564         vp = nd.ni_vp;
2565 #ifdef MAC
2566         error = mac_vnode_check_readlink(td->td_ucred, vp);
2567         if (error != 0) {
2568                 vput(vp);
2569                 return (error);
2570         }
2571 #endif
2572         if (vp->v_type != VLNK)
2573                 error = EINVAL;
2574         else {
2575                 aiov.iov_base = buf;
2576                 aiov.iov_len = count;
2577                 auio.uio_iov = &aiov;
2578                 auio.uio_iovcnt = 1;
2579                 auio.uio_offset = 0;
2580                 auio.uio_rw = UIO_READ;
2581                 auio.uio_segflg = bufseg;
2582                 auio.uio_td = td;
2583                 auio.uio_resid = count;
2584                 error = VOP_READLINK(vp, &auio, td->td_ucred);
2585                 td->td_retval[0] = count - auio.uio_resid;
2586         }
2587         vput(vp);
2588         return (error);
2589 }
2590
2591 /*
2592  * Common implementation code for chflags() and fchflags().
2593  */
2594 static int
2595 setfflags(td, vp, flags)
2596         struct thread *td;
2597         struct vnode *vp;
2598         u_long flags;
2599 {
2600         struct mount *mp;
2601         struct vattr vattr;
2602         int error;
2603
2604         /* We can't support the value matching VNOVAL. */
2605         if (flags == VNOVAL)
2606                 return (EOPNOTSUPP);
2607
2608         /*
2609          * Prevent non-root users from setting flags on devices.  When
2610          * a device is reused, users can retain ownership of the device
2611          * if they are allowed to set flags and programs assume that
2612          * chown can't fail when done as root.
2613          */
2614         if (vp->v_type == VCHR || vp->v_type == VBLK) {
2615                 error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2616                 if (error != 0)
2617                         return (error);
2618         }
2619
2620         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2621                 return (error);
2622         VATTR_NULL(&vattr);
2623         vattr.va_flags = flags;
2624         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2625 #ifdef MAC
2626         error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2627         if (error == 0)
2628 #endif
2629                 error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2630         VOP_UNLOCK(vp, 0);
2631         vn_finished_write(mp);
2632         return (error);
2633 }
2634
2635 /*
2636  * Change flags of a file given a path name.
2637  */
2638 #ifndef _SYS_SYSPROTO_H_
2639 struct chflags_args {
2640         const char *path;
2641         u_long  flags;
2642 };
2643 #endif
2644 int
2645 sys_chflags(td, uap)
2646         struct thread *td;
2647         register struct chflags_args /* {
2648                 const char *path;
2649                 u_long flags;
2650         } */ *uap;
2651 {
2652
2653         return (kern_chflags(td, uap->path, UIO_USERSPACE, uap->flags));
2654 }
2655
2656 #ifndef _SYS_SYSPROTO_H_
2657 struct chflagsat_args {
2658         int     fd;
2659         const char *path;
2660         u_long  flags;
2661         int     atflag;
2662 }
2663 #endif
2664 int
2665 sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
2666 {
2667         int fd = uap->fd;
2668         const char *path = uap->path;
2669         u_long flags = uap->flags;
2670         int atflag = uap->atflag;
2671
2672         if (atflag & ~AT_SYMLINK_NOFOLLOW)
2673                 return (EINVAL);
2674
2675         return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag));
2676 }
2677
2678 static int
2679 kern_chflags(struct thread *td, const char *path, enum uio_seg pathseg,
2680     u_long flags)
2681 {
2682
2683         return (kern_chflagsat(td, AT_FDCWD, path, pathseg, flags, 0));
2684 }
2685
2686 /*
2687  * Same as chflags() but doesn't follow symlinks.
2688  */
2689 int
2690 sys_lchflags(td, uap)
2691         struct thread *td;
2692         register struct lchflags_args /* {
2693                 const char *path;
2694                 u_long flags;
2695         } */ *uap;
2696 {
2697
2698         return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2699             uap->flags, AT_SYMLINK_NOFOLLOW));
2700 }
2701
2702 static int
2703 kern_chflagsat(struct thread *td, int fd, const char *path,
2704     enum uio_seg pathseg, u_long flags, int atflag)
2705 {
2706         struct nameidata nd;
2707         cap_rights_t rights;
2708         int error, follow;
2709
2710         AUDIT_ARG_FFLAGS(flags);
2711         follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2712         NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2713             cap_rights_init(&rights, CAP_FCHFLAGS), td);
2714         if ((error = namei(&nd)) != 0)
2715                 return (error);
2716         NDFREE(&nd, NDF_ONLY_PNBUF);
2717         error = setfflags(td, nd.ni_vp, flags);
2718         vrele(nd.ni_vp);
2719         return (error);
2720 }
2721
2722 /*
2723  * Change flags of a file given a file descriptor.
2724  */
2725 #ifndef _SYS_SYSPROTO_H_
2726 struct fchflags_args {
2727         int     fd;
2728         u_long  flags;
2729 };
2730 #endif
2731 int
2732 sys_fchflags(td, uap)
2733         struct thread *td;
2734         register struct fchflags_args /* {
2735                 int fd;
2736                 u_long flags;
2737         } */ *uap;
2738 {
2739         struct file *fp;
2740         cap_rights_t rights;
2741         int error;
2742
2743         AUDIT_ARG_FD(uap->fd);
2744         AUDIT_ARG_FFLAGS(uap->flags);
2745         error = getvnode(td->td_proc->p_fd, uap->fd,
2746             cap_rights_init(&rights, CAP_FCHFLAGS), &fp);
2747         if (error != 0)
2748                 return (error);
2749 #ifdef AUDIT
2750         vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2751         AUDIT_ARG_VNODE1(fp->f_vnode);
2752         VOP_UNLOCK(fp->f_vnode, 0);
2753 #endif
2754         error = setfflags(td, fp->f_vnode, uap->flags);
2755         fdrop(fp, td);
2756         return (error);
2757 }
2758
2759 /*
2760  * Common implementation code for chmod(), lchmod() and fchmod().
2761  */
2762 int
2763 setfmode(td, cred, vp, mode)
2764         struct thread *td;
2765         struct ucred *cred;
2766         struct vnode *vp;
2767         int mode;
2768 {
2769         struct mount *mp;
2770         struct vattr vattr;
2771         int error;
2772
2773         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2774                 return (error);
2775         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2776         VATTR_NULL(&vattr);
2777         vattr.va_mode = mode & ALLPERMS;
2778 #ifdef MAC
2779         error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2780         if (error == 0)
2781 #endif
2782                 error = VOP_SETATTR(vp, &vattr, cred);
2783         VOP_UNLOCK(vp, 0);
2784         vn_finished_write(mp);
2785         return (error);
2786 }
2787
2788 /*
2789  * Change mode of a file given path name.
2790  */
2791 #ifndef _SYS_SYSPROTO_H_
2792 struct chmod_args {
2793         char    *path;
2794         int     mode;
2795 };
2796 #endif
2797 int
2798 sys_chmod(td, uap)
2799         struct thread *td;
2800         register struct chmod_args /* {
2801                 char *path;
2802                 int mode;
2803         } */ *uap;
2804 {
2805
2806         return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
2807 }
2808
2809 #ifndef _SYS_SYSPROTO_H_
2810 struct fchmodat_args {
2811         int     dirfd;
2812         char    *path;
2813         mode_t  mode;
2814         int     flag;
2815 }
2816 #endif
2817 int
2818 sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2819 {
2820         int flag = uap->flag;
2821         int fd = uap->fd;
2822         char *path = uap->path;
2823         mode_t mode = uap->mode;
2824
2825         if (flag & ~AT_SYMLINK_NOFOLLOW)
2826                 return (EINVAL);
2827
2828         return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
2829 }
2830
2831 int
2832 kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
2833 {
2834
2835         return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0));
2836 }
2837
2838 /*
2839  * Change mode of a file given path name (don't follow links.)
2840  */
2841 #ifndef _SYS_SYSPROTO_H_
2842 struct lchmod_args {
2843         char    *path;
2844         int     mode;
2845 };
2846 #endif
2847 int
2848 sys_lchmod(td, uap)
2849         struct thread *td;
2850         register struct lchmod_args /* {
2851                 char *path;
2852                 int mode;
2853         } */ *uap;
2854 {
2855
2856         return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2857             uap->mode, AT_SYMLINK_NOFOLLOW));
2858 }
2859
2860 int
2861 kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2862     mode_t mode, int flag)
2863 {
2864         struct nameidata nd;
2865         cap_rights_t rights;
2866         int error, follow;
2867
2868         AUDIT_ARG_MODE(mode);
2869         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2870         NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2871             cap_rights_init(&rights, CAP_FCHMOD), td);
2872         if ((error = namei(&nd)) != 0)
2873                 return (error);
2874         NDFREE(&nd, NDF_ONLY_PNBUF);
2875         error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
2876         vrele(nd.ni_vp);
2877         return (error);
2878 }
2879
2880 /*
2881  * Change mode of a file given a file descriptor.
2882  */
2883 #ifndef _SYS_SYSPROTO_H_
2884 struct fchmod_args {
2885         int     fd;
2886         int     mode;
2887 };
2888 #endif
2889 int
2890 sys_fchmod(struct thread *td, struct fchmod_args *uap)
2891 {
2892         struct file *fp;
2893         cap_rights_t rights;
2894         int error;
2895
2896         AUDIT_ARG_FD(uap->fd);
2897         AUDIT_ARG_MODE(uap->mode);
2898
2899         error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHMOD), &fp);
2900         if (error != 0)
2901                 return (error);
2902         error = fo_chmod(fp, uap->mode, td->td_ucred, td);
2903         fdrop(fp, td);
2904         return (error);
2905 }
2906
2907 /*
2908  * Common implementation for chown(), lchown(), and fchown()
2909  */
2910 int
2911 setfown(td, cred, vp, uid, gid)
2912         struct thread *td;
2913         struct ucred *cred;
2914         struct vnode *vp;
2915         uid_t uid;
2916         gid_t gid;
2917 {
2918         struct mount *mp;
2919         struct vattr vattr;
2920         int error;
2921
2922         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2923                 return (error);
2924         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2925         VATTR_NULL(&vattr);
2926         vattr.va_uid = uid;
2927         vattr.va_gid = gid;
2928 #ifdef MAC
2929         error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
2930             vattr.va_gid);
2931         if (error == 0)
2932 #endif
2933                 error = VOP_SETATTR(vp, &vattr, cred);
2934         VOP_UNLOCK(vp, 0);
2935         vn_finished_write(mp);
2936         return (error);
2937 }
2938
2939 /*
2940  * Set ownership given a path name.
2941  */
2942 #ifndef _SYS_SYSPROTO_H_
2943 struct chown_args {
2944         char    *path;
2945         int     uid;
2946         int     gid;
2947 };
2948 #endif
2949 int
2950 sys_chown(td, uap)
2951         struct thread *td;
2952         register struct chown_args /* {
2953                 char *path;
2954                 int uid;
2955                 int gid;
2956         } */ *uap;
2957 {
2958
2959         return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
2960 }
2961
2962 #ifndef _SYS_SYSPROTO_H_
2963 struct fchownat_args {
2964         int fd;
2965         const char * path;
2966         uid_t uid;
2967         gid_t gid;
2968         int flag;
2969 };
2970 #endif
2971 int
2972 sys_fchownat(struct thread *td, struct fchownat_args *uap)
2973 {
2974         int flag;
2975
2976         flag = uap->flag;
2977         if (flag & ~AT_SYMLINK_NOFOLLOW)
2978                 return (EINVAL);
2979
2980         return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
2981             uap->gid, uap->flag));
2982 }
2983
2984 int
2985 kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
2986     int gid)
2987 {
2988
2989         return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0));
2990 }
2991
2992 int
2993 kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2994     int uid, int gid, int flag)
2995 {
2996         struct nameidata nd;
2997         cap_rights_t rights;
2998         int error, follow;
2999
3000         AUDIT_ARG_OWNER(uid, gid);
3001         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3002         NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
3003             cap_rights_init(&rights, CAP_FCHOWN), td);
3004
3005         if ((error = namei(&nd)) != 0)
3006                 return (error);
3007         NDFREE(&nd, NDF_ONLY_PNBUF);
3008         error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
3009         vrele(nd.ni_vp);
3010         return (error);
3011 }
3012
3013 /*
3014  * Set ownership given a path name, do not cross symlinks.
3015  */
3016 #ifndef _SYS_SYSPROTO_H_
3017 struct lchown_args {
3018         char    *path;
3019         int     uid;
3020         int     gid;
3021 };
3022 #endif
3023 int
3024 sys_lchown(td, uap)
3025         struct thread *td;
3026         register struct lchown_args /* {
3027                 char *path;
3028                 int uid;
3029                 int gid;
3030         } */ *uap;
3031 {
3032
3033         return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
3034 }
3035
3036 int
3037 kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
3038     int gid)
3039 {
3040
3041         return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid,
3042             AT_SYMLINK_NOFOLLOW));
3043 }
3044
3045 /*
3046  * Set ownership given a file descriptor.
3047  */
3048 #ifndef _SYS_SYSPROTO_H_
3049 struct fchown_args {
3050         int     fd;
3051         int     uid;
3052         int     gid;
3053 };
3054 #endif
3055 int
3056 sys_fchown(td, uap)
3057         struct thread *td;
3058         register struct fchown_args /* {
3059                 int fd;
3060                 int uid;
3061                 int gid;
3062         } */ *uap;
3063 {
3064         struct file *fp;
3065         cap_rights_t rights;
3066         int error;
3067
3068         AUDIT_ARG_FD(uap->fd);
3069         AUDIT_ARG_OWNER(uap->uid, uap->gid);
3070         error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHOWN), &fp);
3071         if (error != 0)
3072                 return (error);
3073         error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
3074         fdrop(fp, td);
3075         return (error);
3076 }
3077
3078 /*
3079  * Common implementation code for utimes(), lutimes(), and futimes().
3080  */
3081 static int
3082 getutimes(usrtvp, tvpseg, tsp)
3083         const struct timeval *usrtvp;
3084         enum uio_seg tvpseg;
3085         struct timespec *tsp;
3086 {
3087         struct timeval tv[2];
3088         const struct timeval *tvp;
3089         int error;
3090
3091         if (usrtvp == NULL) {
3092                 vfs_timestamp(&tsp[0]);
3093                 tsp[1] = tsp[0];
3094         } else {
3095                 if (tvpseg == UIO_SYSSPACE) {
3096                         tvp = usrtvp;
3097                 } else {
3098                         if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
3099                                 return (error);
3100                         tvp = tv;
3101                 }
3102
3103                 if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
3104                     tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
3105                         return (EINVAL);
3106                 TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3107                 TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3108         }
3109         return (0);
3110 }
3111
3112 /*
3113  * Common implementation code for utimes(), lutimes(), and futimes().
3114  */
3115 static int
3116 setutimes(td, vp, ts, numtimes, nullflag)
3117         struct thread *td;
3118         struct vnode *vp;
3119         const struct timespec *ts;
3120         int numtimes;
3121         int nullflag;
3122 {
3123         struct mount *mp;
3124         struct vattr vattr;
3125         int error, setbirthtime;
3126
3127         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3128                 return (error);
3129         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3130         setbirthtime = 0;
3131         if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
3132             timespeccmp(&ts[1], &vattr.va_birthtime, < ))
3133                 setbirthtime = 1;
3134         VATTR_NULL(&vattr);
3135         vattr.va_atime = ts[0];
3136         vattr.va_mtime = ts[1];
3137         if (setbirthtime)
3138                 vattr.va_birthtime = ts[1];
3139         if (numtimes > 2)
3140                 vattr.va_birthtime = ts[2];
3141         if (nullflag)
3142                 vattr.va_vaflags |= VA_UTIMES_NULL;
3143 #ifdef MAC
3144         error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
3145             vattr.va_mtime);
3146 #endif
3147         if (error == 0)
3148                 error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3149         VOP_UNLOCK(vp, 0);
3150         vn_finished_write(mp);
3151         return (error);
3152 }
3153
3154 /*
3155  * Set the access and modification times of a file.
3156  */
3157 #ifndef _SYS_SYSPROTO_H_
3158 struct utimes_args {
3159         char    *path;
3160         struct  timeval *tptr;
3161 };
3162 #endif
3163 int
3164 sys_utimes(td, uap)
3165         struct thread *td;
3166         register struct utimes_args /* {
3167                 char *path;
3168                 struct timeval *tptr;
3169         } */ *uap;
3170 {
3171
3172         return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3173             UIO_USERSPACE));
3174 }
3175
3176 #ifndef _SYS_SYSPROTO_H_
3177 struct futimesat_args {
3178         int fd;
3179         const char * path;
3180         const struct timeval * times;
3181 };
3182 #endif
3183 int
3184 sys_futimesat(struct thread *td, struct futimesat_args *uap)
3185 {
3186
3187         return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
3188             uap->times, UIO_USERSPACE));
3189 }
3190
3191 int
3192 kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
3193     struct timeval *tptr, enum uio_seg tptrseg)
3194 {
3195
3196         return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg));
3197 }
3198
3199 int
3200 kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3201     struct timeval *tptr, enum uio_seg tptrseg)
3202 {
3203         struct nameidata nd;
3204         struct timespec ts[2];
3205         cap_rights_t rights;
3206         int error;
3207
3208         if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3209                 return (error);
3210         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
3211             cap_rights_init(&rights, CAP_FUTIMES), td);
3212
3213         if ((error = namei(&nd)) != 0)
3214                 return (error);
3215         NDFREE(&nd, NDF_ONLY_PNBUF);
3216         error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3217         vrele(nd.ni_vp);
3218         return (error);
3219 }
3220
3221 /*
3222  * Set the access and modification times of a file.
3223  */
3224 #ifndef _SYS_SYSPROTO_H_
3225 struct lutimes_args {
3226         char    *path;
3227         struct  timeval *tptr;
3228 };
3229 #endif
3230 int
3231 sys_lutimes(td, uap)
3232         struct thread *td;
3233         register struct lutimes_args /* {
3234                 char *path;
3235                 struct timeval *tptr;
3236         } */ *uap;
3237 {
3238
3239         return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3240             UIO_USERSPACE));
3241 }
3242
3243 int
3244 kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
3245     struct timeval *tptr, enum uio_seg tptrseg)
3246 {
3247         struct timespec ts[2];
3248         struct nameidata nd;
3249         int error;
3250
3251         if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3252                 return (error);
3253         NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
3254         if ((error = namei(&nd)) != 0)
3255                 return (error);
3256         NDFREE(&nd, NDF_ONLY_PNBUF);
3257         error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3258         vrele(nd.ni_vp);
3259         return (error);
3260 }
3261
3262 /*
3263  * Set the access and modification times of a file.
3264  */
3265 #ifndef _SYS_SYSPROTO_H_
3266 struct futimes_args {
3267         int     fd;
3268         struct  timeval *tptr;
3269 };
3270 #endif
3271 int
3272 sys_futimes(td, uap)
3273         struct thread *td;
3274         register struct futimes_args /* {
3275                 int  fd;
3276                 struct timeval *tptr;
3277         } */ *uap;
3278 {
3279
3280         return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
3281 }
3282
3283 int
3284 kern_futimes(struct thread *td, int fd, struct timeval *tptr,
3285     enum uio_seg tptrseg)
3286 {
3287         struct timespec ts[2];
3288         struct file *fp;
3289         cap_rights_t rights;
3290         int error;
3291
3292         AUDIT_ARG_FD(fd);
3293         error = getutimes(tptr, tptrseg, ts);
3294         if (error != 0)
3295                 return (error);
3296         error = getvnode(td->td_proc->p_fd, fd,
3297             cap_rights_init(&rights, CAP_FUTIMES), &fp);
3298         if (error != 0)
3299                 return (error);
3300 #ifdef AUDIT
3301         vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3302         AUDIT_ARG_VNODE1(fp->f_vnode);
3303         VOP_UNLOCK(fp->f_vnode, 0);
3304 #endif
3305         error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
3306         fdrop(fp, td);
3307         return (error);
3308 }
3309
3310 /*
3311  * Truncate a file given its path name.
3312  */
3313 #ifndef _SYS_SYSPROTO_H_
3314 struct truncate_args {
3315         char    *path;
3316         int     pad;
3317         off_t   length;
3318 };
3319 #endif
3320 int
3321 sys_truncate(td, uap)
3322         struct thread *td;
3323         register struct truncate_args /* {
3324                 char *path;
3325                 int pad;
3326                 off_t length;
3327         } */ *uap;
3328 {
3329
3330         return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3331 }
3332
3333 int
3334 kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
3335 {
3336         struct mount *mp;
3337         struct vnode *vp;
3338         void *rl_cookie;
3339         struct vattr vattr;
3340         struct nameidata nd;
3341         int error;
3342
3343         if (length < 0)
3344                 return(EINVAL);
3345         NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
3346         if ((error = namei(&nd)) != 0)
3347                 return (error);
3348         vp = nd.ni_vp;
3349         rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
3350         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
3351                 vn_rangelock_unlock(vp, rl_cookie);
3352                 vrele(vp);
3353                 return (error);
3354         }
3355         NDFREE(&nd, NDF_ONLY_PNBUF);
3356         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3357         if (vp->v_type == VDIR)
3358                 error = EISDIR;
3359 #ifdef MAC
3360         else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
3361         }
3362 #endif
3363         else if ((error = vn_writechk(vp)) == 0 &&
3364             (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
3365                 VATTR_NULL(&vattr);
3366                 vattr.va_size = length;
3367                 error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3368         }
3369         VOP_UNLOCK(vp, 0);
3370         vn_finished_write(mp);
3371         vn_rangelock_unlock(vp, rl_cookie);
3372         vrele(vp);
3373         return (error);
3374 }
3375
3376 #if defined(COMPAT_43)
3377 /*
3378  * Truncate a file given its path name.
3379  */
3380 #ifndef _SYS_SYSPROTO_H_
3381 struct otruncate_args {
3382         char    *path;
3383         long    length;
3384 };
3385 #endif
3386 int
3387 otruncate(td, uap)
3388         struct thread *td;
3389         register struct otruncate_args /* {
3390                 char *path;
3391                 long length;
3392         } */ *uap;
3393 {
3394         struct truncate_args /* {
3395                 char *path;
3396                 int pad;
3397                 off_t length;
3398         } */ nuap;
3399
3400         nuap.path = uap->path;
3401         nuap.length = uap->length;
3402         return (sys_truncate(td, &nuap));
3403 }
3404 #endif /* COMPAT_43 */
3405
3406 /* Versions with the pad argument */
3407 int
3408 freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
3409 {
3410         struct truncate_args ouap;
3411
3412         ouap.path = uap->path;
3413         ouap.length = uap->length;
3414         return (sys_truncate(td, &ouap));
3415 }
3416
3417 int
3418 freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
3419 {
3420         struct ftruncate_args ouap;
3421
3422         ouap.fd = uap->fd;
3423         ouap.length = uap->length;
3424         return (sys_ftruncate(td, &ouap));
3425 }
3426
3427 /*
3428  * Sync an open file.
3429  */
3430 #ifndef _SYS_SYSPROTO_H_
3431 struct fsync_args {
3432         int     fd;
3433 };
3434 #endif
3435 int
3436 sys_fsync(td, uap)
3437         struct thread *td;
3438         struct fsync_args /* {
3439                 int fd;
3440         } */ *uap;
3441 {
3442         struct vnode *vp;
3443         struct mount *mp;
3444         struct file *fp;
3445         cap_rights_t rights;
3446         int error, lock_flags;
3447
3448         AUDIT_ARG_FD(uap->fd);
3449         error = getvnode(td->td_proc->p_fd, uap->fd,
3450             cap_rights_init(&rights, CAP_FSYNC), &fp);
3451         if (error != 0)
3452                 return (error);
3453         vp = fp->f_vnode;
3454         error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
3455         if (error != 0)
3456                 goto drop;
3457         if (MNT_SHARED_WRITES(mp) ||
3458             ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
3459                 lock_flags = LK_SHARED;
3460         } else {
3461                 lock_flags = LK_EXCLUSIVE;
3462         }
3463         vn_lock(vp, lock_flags | LK_RETRY);
3464         AUDIT_ARG_VNODE1(vp);
3465         if (vp->v_object != NULL) {
3466                 VM_OBJECT_WLOCK(vp->v_object);
3467                 vm_object_page_clean(vp->v_object, 0, 0, 0);
3468                 VM_OBJECT_WUNLOCK(vp->v_object);
3469         }
3470         error = VOP_FSYNC(vp, MNT_WAIT, td);
3471
3472         VOP_UNLOCK(vp, 0);
3473         vn_finished_write(mp);
3474 drop:
3475         fdrop(fp, td);
3476         return (error);
3477 }
3478
3479 /*
3480  * Rename files.  Source and destination must either both be directories, or
3481  * both not be directories.  If target is a directory, it must be empty.
3482  */
3483 #ifndef _SYS_SYSPROTO_H_
3484 struct rename_args {
3485         char    *from;
3486         char    *to;
3487 };
3488 #endif
3489 int
3490 sys_rename(td, uap)
3491         struct thread *td;
3492         register struct rename_args /* {
3493                 char *from;
3494                 char *to;
3495         } */ *uap;
3496 {
3497
3498         return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
3499 }
3500
3501 #ifndef _SYS_SYSPROTO_H_
3502 struct renameat_args {
3503         int     oldfd;
3504         char    *old;
3505         int     newfd;
3506         char    *new;
3507 };
3508 #endif
3509 int
3510 sys_renameat(struct thread *td, struct renameat_args *uap)
3511 {
3512
3513         return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3514             UIO_USERSPACE));
3515 }
3516
3517 int
3518 kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
3519 {
3520
3521         return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg));
3522 }
3523
3524 int
3525 kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
3526     enum uio_seg pathseg)
3527 {
3528         struct mount *mp = NULL;
3529         struct vnode *tvp, *fvp, *tdvp;
3530         struct nameidata fromnd, tond;
3531         cap_rights_t rights;
3532         int error;
3533
3534 again:
3535         bwillwrite();
3536 #ifdef MAC
3537         NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
3538             AUDITVNODE1, pathseg, old, oldfd,
3539             cap_rights_init(&rights, CAP_RENAMEAT), td);
3540 #else
3541         NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
3542             pathseg, old, oldfd, cap_rights_init(&rights, CAP_RENAMEAT), td);
3543 #endif
3544
3545         if ((error = namei(&fromnd)) != 0)
3546                 return (error);
3547 #ifdef MAC
3548         error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
3549             fromnd.ni_vp, &fromnd.ni_cnd);
3550         VOP_UNLOCK(fromnd.ni_dvp, 0);
3551         if (fromnd.ni_dvp != fromnd.ni_vp)
3552                 VOP_UNLOCK(fromnd.ni_vp, 0);
3553 #endif
3554         fvp = fromnd.ni_vp;
3555         NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
3556             SAVESTART | AUDITVNODE2, pathseg, new, newfd,
3557             cap_rights_init(&rights, CAP_LINKAT), td);
3558         if (fromnd.ni_vp->v_type == VDIR)
3559                 tond.ni_cnd.cn_flags |= WILLBEDIR;
3560         if ((error = namei(&tond)) != 0) {
3561                 /* Translate error code for rename("dir1", "dir2/."). */
3562                 if (error == EISDIR && fvp->v_type == VDIR)
3563                         error = EINVAL;
3564                 NDFREE(&fromnd, NDF_ONLY_PNBUF);
3565                 vrele(fromnd.ni_dvp);
3566                 vrele(fvp);
3567                 goto out1;
3568         }
3569         tdvp = tond.ni_dvp;
3570         tvp = tond.ni_vp;
3571         error = vn_start_write(fvp, &mp, V_NOWAIT);
3572         if (error != 0) {
3573                 NDFREE(&fromnd, NDF_ONLY_PNBUF);
3574                 NDFREE(&tond, NDF_ONLY_PNBUF);
3575                 if (tvp != NULL)
3576                         vput(tvp);
3577                 if (tdvp == tvp)
3578                         vrele(tdvp);
3579                 else
3580                         vput(tdvp);
3581                 vrele(fromnd.ni_dvp);
3582                 vrele(fvp);
3583                 vrele(tond.ni_startdir);
3584                 if (fromnd.ni_startdir != NULL)
3585                         vrele(fromnd.ni_startdir);
3586                 error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
3587                 if (error != 0)
3588                         return (error);
3589                 goto again;
3590         }
3591         if (tvp != NULL) {
3592                 if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
3593                         error = ENOTDIR;
3594                         goto out;
3595                 } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
3596                         error = EISDIR;
3597                         goto out;
3598                 }
3599 #ifdef CAPABILITIES
3600                 if (newfd != AT_FDCWD) {
3601                         /*
3602                          * If the target already exists we require CAP_UNLINKAT
3603                          * from 'newfd'.
3604                          */
3605                         error = cap_check(&tond.ni_filecaps.fc_rights,
3606                             cap_rights_init(&rights, CAP_UNLINKAT));
3607                         if (error != 0)
3608                                 goto out;
3609                 }
3610 #endif
3611         }
3612         if (fvp == tdvp) {
3613                 error = EINVAL;
3614                 goto out;
3615         }
3616         /*
3617          * If the source is the same as the destination (that is, if they
3618          * are links to the same vnode), then there is nothing to do.
3619          */
3620         if (fvp == tvp)
3621                 error = -1;
3622 #ifdef MAC
3623         else
3624                 error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
3625                     tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
3626 #endif
3627 out:
3628         if (error == 0) {
3629                 error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
3630                     tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
3631                 NDFREE(&fromnd, NDF_ONLY_PNBUF);
3632                 NDFREE(&tond, NDF_ONLY_PNBUF);
3633         } else {
3634                 NDFREE(&fromnd, NDF_ONLY_PNBUF);
3635                 NDFREE(&tond, NDF_ONLY_PNBUF);
3636                 if (tvp != NULL)
3637                         vput(tvp);
3638                 if (tdvp == tvp)
3639                         vrele(tdvp);
3640                 else
3641                         vput(tdvp);
3642                 vrele(fromnd.ni_dvp);
3643                 vrele(fvp);
3644         }
3645         vrele(tond.ni_startdir);
3646         vn_finished_write(mp);
3647 out1:
3648         if (fromnd.ni_startdir)
3649                 vrele(fromnd.ni_startdir);
3650         if (error == -1)
3651                 return (0);
3652         return (error);
3653 }
3654
3655 /*
3656  * Make a directory file.
3657  */
3658 #ifndef _SYS_SYSPROTO_H_
3659 struct mkdir_args {
3660         char    *path;
3661         int     mode;
3662 };
3663 #endif
3664 int
3665 sys_mkdir(td, uap)
3666         struct thread *td;
3667         register struct mkdir_args /* {
3668                 char *path;
3669                 int mode;
3670         } */ *uap;
3671 {
3672
3673         return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
3674 }
3675
3676 #ifndef _SYS_SYSPROTO_H_
3677 struct mkdirat_args {
3678         int     fd;
3679         char    *path;
3680         mode_t  mode;
3681 };
3682 #endif
3683 int
3684 sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3685 {
3686
3687         return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3688 }
3689
3690 int
3691 kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
3692 {
3693
3694         return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode));
3695 }
3696
3697 int
3698 kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
3699     int mode)
3700 {
3701         struct mount *mp;
3702         struct vnode *vp;
3703         struct vattr vattr;
3704         struct nameidata nd;
3705         cap_rights_t rights;
3706         int error;
3707
3708         AUDIT_ARG_MODE(mode);
3709 restart:
3710         bwillwrite();
3711         NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
3712             NOCACHE, segflg, path, fd, cap_rights_init(&rights, CAP_MKDIRAT),
3713             td);
3714         nd.ni_cnd.cn_flags |= WILLBEDIR;
3715         if ((error = namei(&nd)) != 0)
3716                 return (error);
3717         vp = nd.ni_vp;
3718         if (vp != NULL) {
3719                 NDFREE(&nd, NDF_ONLY_PNBUF);
3720                 /*
3721                  * XXX namei called with LOCKPARENT but not LOCKLEAF has
3722                  * the strange behaviour of leaving the vnode unlocked
3723                  * if the target is the same vnode as the parent.
3724                  */
3725                 if (vp == nd.ni_dvp)
3726                         vrele(nd.ni_dvp);
3727                 else
3728                         vput(nd.ni_dvp);
3729                 vrele(vp);
3730                 return (EEXIST);
3731         }
3732         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3733                 NDFREE(&nd, NDF_ONLY_PNBUF);
3734                 vput(nd.ni_dvp);
3735                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3736                         return (error);
3737                 goto restart;
3738         }
3739         VATTR_NULL(&vattr);
3740         vattr.va_type = VDIR;
3741         vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
3742 #ifdef MAC
3743         error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
3744             &vattr);
3745         if (error != 0)
3746                 goto out;
3747 #endif
3748         error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3749 #ifdef MAC
3750 out:
3751 #endif
3752         NDFREE(&nd, NDF_ONLY_PNBUF);
3753         vput(nd.ni_dvp);
3754         if (error == 0)
3755                 vput(nd.ni_vp);
3756         vn_finished_write(mp);
3757         return (error);
3758 }
3759
3760 /*
3761  * Remove a directory file.
3762  */
3763 #ifndef _SYS_SYSPROTO_H_
3764 struct rmdir_args {
3765         char    *path;
3766 };
3767 #endif
3768 int
3769 sys_rmdir(td, uap)
3770         struct thread *td;
3771         struct rmdir_args /* {
3772                 char *path;
3773         } */ *uap;
3774 {
3775
3776         return (kern_rmdir(td, uap->path, UIO_USERSPACE));
3777 }
3778
3779 int
3780 kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
3781 {
3782
3783         return (kern_rmdirat(td, AT_FDCWD, path, pathseg));
3784 }
3785
3786 int
3787 kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
3788 {
3789         struct mount *mp;
3790         struct vnode *vp;
3791         struct nameidata nd;
3792         cap_rights_t rights;
3793         int error;
3794
3795 restart:
3796         bwillwrite();
3797         NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
3798             pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
3799         if ((error = namei(&nd)) != 0)
3800                 return (error);
3801         vp = nd.ni_vp;
3802         if (vp->v_type != VDIR) {
3803                 error = ENOTDIR;
3804                 goto out;
3805         }
3806         /*
3807          * No rmdir "." please.
3808          */
3809         if (nd.ni_dvp == vp) {
3810                 error = EINVAL;
3811                 goto out;
3812         }
3813         /*
3814          * The root of a mounted filesystem cannot be deleted.
3815          */
3816         if (vp->v_vflag & VV_ROOT) {
3817                 error = EBUSY;
3818                 goto out;
3819         }
3820 #ifdef MAC
3821         error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
3822             &nd.ni_cnd);
3823         if (error != 0)
3824                 goto out;
3825 #endif
3826         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3827                 NDFREE(&nd, NDF_ONLY_PNBUF);
3828                 vput(vp);
3829                 if (nd.ni_dvp == vp)
3830                         vrele(nd.ni_dvp);
3831                 else
3832                         vput(nd.ni_dvp);
3833                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3834                         return (error);
3835                 goto restart;
3836         }
3837         vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
3838         error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3839         vn_finished_write(mp);
3840 out:
3841         NDFREE(&nd, NDF_ONLY_PNBUF);
3842         vput(vp);
3843         if (nd.ni_dvp == vp)
3844                 vrele(nd.ni_dvp);
3845         else
3846                 vput(nd.ni_dvp);
3847         return (error);
3848 }
3849
3850 #ifdef COMPAT_43
3851 /*
3852  * Read a block of directory entries in a filesystem independent format.
3853  */
3854 #ifndef _SYS_SYSPROTO_H_
3855 struct ogetdirentries_args {
3856         int     fd;
3857         char    *buf;
3858         u_int   count;
3859         long    *basep;
3860 };
3861 #endif
3862 int
3863 ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
3864 {
3865         long loff;
3866         int error;
3867
3868         error = kern_ogetdirentries(td, uap, &loff);
3869         if (error == 0)
3870                 error = copyout(&loff, uap->basep, sizeof(long));
3871         return (error);
3872 }
3873
/*
 * Read directory entries from the directory open on uap->fd into the user
 * buffer uap->buf (at most uap->count bytes), rewriting each entry's
 * d_type/d_namlen bytes into the layout expected by old binaries.
 *
 * On success td->td_retval[0] holds the number of bytes transferred and
 * *ploff the file offset at which this read started.
 *
 * Errors produced here: EINVAL if uap->count exceeds 64KB or the vnode is
 * not a directory, EBADF if the descriptor is not open for reading, EIO if
 * an entry with a zero record length is encountered; errors from
 * getvnode(), the MAC readdir check, VOP_READDIR() and uiomove() are
 * passed through.
 */
int
kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
    long *ploff)
{
	struct vnode *vp;
	struct file *fp;
	struct uio auio, kuio;
	struct iovec aiov, kiov;
	struct dirent *dp, *edp;
	cap_rights_t rights;
	caddr_t dirbuf;
	int error, eofflag, readcnt;
	long loff;
	off_t foffset;

	/* XXX arbitrary sanity limit on `count'. */
	if (uap->count > 64 * 1024)
		return (EINVAL);
	error = getvnode(td->td_proc->p_fd, uap->fd,
	    cap_rights_init(&rights, CAP_READ), &fp);
	if (error != 0)
		return (error);
	if ((fp->f_flag & FREAD) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	vp = fp->f_vnode;
	/* The file offset stays locked until one of the foffset_unlock() calls. */
	foffset = foffset_lock(fp, 0);
unionread:
	if (vp->v_type != VDIR) {
		foffset_unlock(fp, foffset, 0);
		fdrop(fp, td);
		return (EINVAL);
	}
	/* auio describes the caller's user-space buffer. */
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_resid = uap->count;
	vn_lock(vp, LK_SHARED | LK_RETRY);
	/* Remember the starting offset; it is what gets reported via ploff. */
	loff = auio.uio_offset = foffset;
#ifdef MAC
	error = mac_vnode_check_readdir(td->td_ucred, vp);
	if (error != 0) {
		VOP_UNLOCK(vp, 0);
		foffset_unlock(fp, foffset, FOF_NOUPDATE);
		fdrop(fp, td);
		return (error);
	}
#endif
#       if (BYTE_ORDER != LITTLE_ENDIAN)
		/*
		 * On non-little-endian hosts, mounts with mnt_maxsymlinklen <= 0
		 * are read straight into the user buffer without conversion.
		 */
		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
			    NULL, NULL);
			foffset = auio.uio_offset;
		} else
#       endif
	{
		/*
		 * Read into a temporary kernel buffer (kuio is a copy of auio
		 * redirected to dirbuf), fix up each entry in place, then copy
		 * the result out to the user buffer.
		 */
		kuio = auio;
		kuio.uio_iov = &kiov;
		kuio.uio_segflg = UIO_SYSSPACE;
		kiov.iov_len = uap->count;
		dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
		kiov.iov_base = dirbuf;
		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
			    NULL, NULL);
		foffset = kuio.uio_offset;
		if (error == 0) {
			readcnt = uap->count - kuio.uio_resid;
			edp = (struct dirent *)&dirbuf[readcnt];
			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
#                               if (BYTE_ORDER == LITTLE_ENDIAN)
					/*
					 * The expected low byte of
					 * dp->d_namlen is our dp->d_type.
					 * The high MBZ byte of dp->d_namlen
					 * is our dp->d_namlen.
					 */
					dp->d_type = dp->d_namlen;
					dp->d_namlen = 0;
#                               else
					/*
					 * The dp->d_type is the high byte
					 * of the expected dp->d_namlen,
					 * so must be zero'ed.
					 */
					dp->d_type = 0;
#                               endif
				/* A zero record length would loop forever; fail with EIO. */
				if (dp->d_reclen > 0) {
					dp = (struct dirent *)
					    ((char *)dp + dp->d_reclen);
				} else {
					error = EIO;
					break;
				}
			}
			/* Copy out only if the walk covered every entry. */
			if (dp >= edp)
				error = uiomove(dirbuf, readcnt, &auio);
		}
		free(dirbuf, M_TEMP);
	}
	if (error != 0) {
		VOP_UNLOCK(vp, 0);
		foffset_unlock(fp, foffset, 0);
		fdrop(fp, td);
		return (error);
	}
	/*
	 * Nothing was transferred and vp is the root of a MNT_UNION mount:
	 * switch the open file over to the covered vnode and retry the read
	 * from offset 0.
	 */
	if (uap->count == auio.uio_resid &&
	    (vp->v_vflag & VV_ROOT) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		struct vnode *tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		VREF(vp);
		fp->f_vnode = vp;
		fp->f_data = vp;
		foffset = 0;
		vput(tvp);
		goto unionread;
	}
	VOP_UNLOCK(vp, 0);
	foffset_unlock(fp, foffset, 0);
	fdrop(fp, td);
	/* Bytes actually placed in the user buffer. */
	td->td_retval[0] = uap->count - auio.uio_resid;
	if (error == 0)
		*ploff = loff;
	return (error);
}
4004 #endif /* COMPAT_43 */
4005
4006 /*
4007  * Read a block of directory entries in a filesystem independent format.
4008  */
4009 #ifndef _SYS_SYSPROTO_H_
4010 struct getdirentries_args {
4011         int     fd;
4012         char    *buf;
4013         u_int   count;
4014         long    *basep;
4015 };
4016 #endif
4017 int
4018 sys_getdirentries(td, uap)
4019         struct thread *td;
4020         register struct getdirentries_args /* {
4021                 int fd;
4022                 char *buf;
4023                 u_int count;
4024                 long *basep;
4025         } */ *uap;
4026 {
4027         long base;
4028         int error;
4029
4030         error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
4031             NULL, UIO_USERSPACE);
4032         if (error != 0)
4033                 return (error);
4034         if (uap->basep != NULL)
4035                 error = copyout(&base, uap->basep, sizeof(long));
4036         return (error);
4037 }
4038
4039 int
4040 kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
4041     long *basep, ssize_t *residp, enum uio_seg bufseg)
4042 {
4043         struct vnode *vp;
4044         struct file *fp;
4045         struct uio auio;
4046         struct iovec aiov;
4047         cap_rights_t rights;
4048         long loff;
4049         int error, eofflag;
4050         off_t foffset;
4051
4052         AUDIT_ARG_FD(fd);
4053         if (count > IOSIZE_MAX)
4054                 return (EINVAL);
4055         auio.uio_resid = count;
4056         error = getvnode(td->td_proc->p_fd, fd,
4057             cap_rights_init(&rights, CAP_READ), &fp);
4058         if (error != 0)
4059                 return (error);
4060         if ((fp->f_flag & FREAD) == 0) {
4061                 fdrop(fp, td);
4062                 return (EBADF);
4063         }
4064         vp = fp->f_vnode;
4065         foffset = foffset_lock(fp, 0);
4066 unionread:
4067         if (vp->v_type != VDIR) {
4068                 error = EINVAL;
4069                 goto fail;
4070         }
4071         aiov.iov_base = buf;
4072         aiov.iov_len = count;
4073         auio.uio_iov = &aiov;
4074         auio.uio_iovcnt = 1;
4075         auio.uio_rw = UIO_READ;
4076         auio.uio_segflg = bufseg;
4077         auio.uio_td = td;
4078         vn_lock(vp, LK_SHARED | LK_RETRY);
4079         AUDIT_ARG_VNODE1(vp);
4080         loff = auio.uio_offset = foffset;
4081 #ifdef MAC
4082         error = mac_vnode_check_readdir(td->td_ucred, vp);
4083         if (error == 0)
4084 #endif
4085                 error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
4086                     NULL);
4087         foffset = auio.uio_offset;
4088         if (error != 0) {
4089                 VOP_UNLOCK(vp, 0);
4090                 goto fail;
4091         }
4092         if (count == auio.uio_resid &&
4093             (vp->v_vflag & VV_ROOT) &&
4094             (vp->v_mount->mnt_flag & MNT_UNION)) {
4095                 struct vnode *tvp = vp;
4096
4097                 vp = vp->v_mount->mnt_vnodecovered;
4098                 VREF(vp);
4099                 fp->f_vnode = vp;
4100                 fp->f_data = vp;
4101                 foffset = 0;
4102                 vput(tvp);
4103                 goto unionread;
4104         }
4105         VOP_UNLOCK(vp, 0);
4106         *basep = loff;
4107         if (residp != NULL)
4108                 *residp = auio.uio_resid;
4109         td->td_retval[0] = count - auio.uio_resid;
4110 fail:
4111         foffset_unlock(fp, foffset, 0);
4112         fdrop(fp, td);
4113         return (error);
4114 }
4115
4116 #ifndef _SYS_SYSPROTO_H_
4117 struct getdents_args {
4118         int fd;
4119         char *buf;
4120         size_t count;
4121 };
4122 #endif
4123 int
4124 sys_getdents(td, uap)
4125         struct thread *td;
4126         register struct getdents_args /* {
4127                 int fd;
4128                 char *buf;
4129                 u_int count;
4130         } */ *uap;
4131 {
4132         struct getdirentries_args ap;
4133
4134         ap.fd = uap->fd;
4135         ap.buf = uap->buf;
4136         ap.count = uap->count;
4137         ap.basep = NULL;
4138         return (sys_getdirentries(td, &ap));
4139 }
4140
4141 /*
4142  * Set the mode mask for creation of filesystem nodes.
4143  */
4144 #ifndef _SYS_SYSPROTO_H_
4145 struct umask_args {
4146         int     newmask;
4147 };
4148 #endif
4149 int
4150 sys_umask(td, uap)
4151         struct thread *td;
4152         struct umask_args /* {
4153                 int newmask;
4154         } */ *uap;
4155 {
4156         register struct filedesc *fdp;
4157
4158         FILEDESC_XLOCK(td->td_proc->p_fd);
4159         fdp = td->td_proc->p_fd;
4160         td->td_retval[0] = fdp->fd_cmask;
4161         fdp->fd_cmask = uap->newmask & ALLPERMS;
4162         FILEDESC_XUNLOCK(td->td_proc->p_fd);
4163         return (0);
4164 }
4165
4166 /*
4167  * Void all references to file by ripping underlying filesystem away from
4168  * vnode.
4169  */
4170 #ifndef _SYS_SYSPROTO_H_
4171 struct revoke_args {
4172         char    *path;
4173 };
4174 #endif
4175 int
4176 sys_revoke(td, uap)
4177         struct thread *td;
4178         register struct revoke_args /* {
4179                 char *path;
4180         } */ *uap;
4181 {
4182         struct vnode *vp;
4183         struct vattr vattr;
4184         struct nameidata nd;
4185         int error;
4186
4187         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4188             uap->path, td);
4189         if ((error = namei(&nd)) != 0)
4190                 return (error);
4191         vp = nd.ni_vp;
4192         NDFREE(&nd, NDF_ONLY_PNBUF);
4193         if (vp->v_type != VCHR || vp->v_rdev == NULL) {
4194                 error = EINVAL;
4195                 goto out;
4196         }
4197 #ifdef MAC
4198         error = mac_vnode_check_revoke(td->td_ucred, vp);
4199         if (error != 0)
4200                 goto out;
4201 #endif
4202         error = VOP_GETATTR(vp, &vattr, td->td_ucred);
4203         if (error != 0)
4204                 goto out;
4205         if (td->td_ucred->cr_uid != vattr.va_uid) {
4206                 error = priv_check(td, PRIV_VFS_ADMIN);
4207                 if (error != 0)
4208                         goto out;
4209         }
4210         if (vcount(vp) > 1)
4211                 VOP_REVOKE(vp, REVOKEALL);
4212 out:
4213         vput(vp);
4214         return (error);
4215 }
4216
4217 /*
4218  * Convert a user file descriptor to a kernel file entry and check that, if it
4219  * is a capability, the correct rights are present. A reference on the file
4220  * entry is held upon returning.
4221  */
4222 int
4223 getvnode(struct filedesc *fdp, int fd, cap_rights_t *rightsp, struct file **fpp)
4224 {
4225         struct file *fp;
4226         int error;
4227
4228         error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
4229         if (error != 0)
4230                 return (error);
4231
4232         /*
4233          * The file could be not of the vnode type, or it may be not
4234          * yet fully initialized, in which case the f_vnode pointer
4235          * may be set, but f_ops is still badfileops.  E.g.,
4236          * devfs_open() transiently create such situation to
4237          * facilitate csw d_fdopen().
4238          *
4239          * Dupfdopen() handling in kern_openat() installs the
4240          * half-baked file into the process descriptor table, allowing
4241          * other thread to dereference it. Guard against the race by
4242          * checking f_ops.
4243          */
4244         if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
4245                 fdrop(fp, curthread);
4246                 return (EINVAL);
4247         }
4248         *fpp = fp;
4249         return (0);
4250 }
4251
4252
4253 /*
4254  * Get an (NFS) file handle.
4255  */
4256 #ifndef _SYS_SYSPROTO_H_
4257 struct lgetfh_args {
4258         char    *fname;
4259         fhandle_t *fhp;
4260 };
4261 #endif
4262 int
4263 sys_lgetfh(td, uap)
4264         struct thread *td;
4265         register struct lgetfh_args *uap;
4266 {
4267         struct nameidata nd;
4268         fhandle_t fh;
4269         register struct vnode *vp;
4270         int error;
4271
4272         error = priv_check(td, PRIV_VFS_GETFH);
4273         if (error != 0)
4274                 return (error);
4275         NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4276             uap->fname, td);
4277         error = namei(&nd);
4278         if (error != 0)
4279                 return (error);
4280         NDFREE(&nd, NDF_ONLY_PNBUF);
4281         vp = nd.ni_vp;
4282         bzero(&fh, sizeof(fh));
4283         fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4284         error = VOP_VPTOFH(vp, &fh.fh_fid);
4285         vput(vp);
4286         if (error == 0)
4287                 error = copyout(&fh, uap->fhp, sizeof (fh));
4288         return (error);
4289 }
4290
4291 #ifndef _SYS_SYSPROTO_H_
4292 struct getfh_args {
4293         char    *fname;
4294         fhandle_t *fhp;
4295 };
4296 #endif
4297 int
4298 sys_getfh(td, uap)
4299         struct thread *td;
4300         register struct getfh_args *uap;
4301 {
4302         struct nameidata nd;
4303         fhandle_t fh;
4304         register struct vnode *vp;
4305         int error;
4306
4307         error = priv_check(td, PRIV_VFS_GETFH);
4308         if (error != 0)
4309                 return (error);
4310         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4311             uap->fname, td);
4312         error = namei(&nd);
4313         if (error != 0)
4314                 return (error);
4315         NDFREE(&nd, NDF_ONLY_PNBUF);
4316         vp = nd.ni_vp;
4317         bzero(&fh, sizeof(fh));
4318         fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4319         error = VOP_VPTOFH(vp, &fh.fh_fid);
4320         vput(vp);
4321         if (error == 0)
4322                 error = copyout(&fh, uap->fhp, sizeof (fh));
4323         return (error);
4324 }
4325
4326 /*
4327  * syscall for the rpc.lockd to use to translate a NFS file handle into an
4328  * open descriptor.
4329  *
4330  * warning: do not remove the priv_check() call or this becomes one giant
4331  * security hole.
4332  */
4333 #ifndef _SYS_SYSPROTO_H_
4334 struct fhopen_args {
4335         const struct fhandle *u_fhp;
4336         int flags;
4337 };
4338 #endif
4339 int
4340 sys_fhopen(td, uap)
4341         struct thread *td;
4342         struct fhopen_args /* {
4343                 const struct fhandle *u_fhp;
4344                 int flags;
4345         } */ *uap;
4346 {
4347         struct mount *mp;
4348         struct vnode *vp;
4349         struct fhandle fhp;
4350         struct file *fp;
4351         int fmode, error;
4352         int indx;
4353
4354         error = priv_check(td, PRIV_VFS_FHOPEN);
4355         if (error != 0)
4356                 return (error);
4357         indx = -1;
4358         fmode = FFLAGS(uap->flags);
4359         /* why not allow a non-read/write open for our lockd? */
4360         if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4361                 return (EINVAL);
4362         error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4363         if (error != 0)
4364                 return(error);
4365         /* find the mount point */
4366         mp = vfs_busyfs(&fhp.fh_fsid);
4367         if (mp == NULL)
4368                 return (ESTALE);
4369         /* now give me my vnode, it gets returned to me locked */
4370         error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
4371         vfs_unbusy(mp);
4372         if (error != 0)
4373                 return (error);
4374
4375         error = falloc_noinstall(td, &fp);
4376         if (error != 0) {
4377                 vput(vp);
4378                 return (error);
4379         }
4380         /*
4381          * An extra reference on `fp' has been held for us by
4382          * falloc_noinstall().
4383          */
4384
4385 #ifdef INVARIANTS
4386         td->td_dupfd = -1;
4387 #endif
4388         error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
4389         if (error != 0) {
4390                 KASSERT(fp->f_ops == &badfileops,
4391                     ("VOP_OPEN in fhopen() set f_ops"));
4392                 KASSERT(td->td_dupfd < 0,
4393                     ("fhopen() encountered fdopen()"));
4394
4395                 vput(vp);
4396                 goto bad;
4397         }
4398 #ifdef INVARIANTS
4399         td->td_dupfd = 0;
4400 #endif
4401         fp->f_vnode = vp;
4402         fp->f_seqcount = 1;
4403         finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
4404             &vnops);
4405         VOP_UNLOCK(vp, 0);
4406         if ((fmode & O_TRUNC) != 0) {
4407                 error = fo_truncate(fp, 0, td->td_ucred, td);
4408                 if (error != 0)
4409                         goto bad;
4410         }
4411
4412         error = finstall(td, fp, &indx, fmode, NULL);
4413 bad:
4414         fdrop(fp, td);
4415         td->td_retval[0] = indx;
4416         return (error);
4417 }
4418
4419 /*
4420  * Stat an (NFS) file handle.
4421  */
4422 #ifndef _SYS_SYSPROTO_H_
4423 struct fhstat_args {
4424         struct fhandle *u_fhp;
4425         struct stat *sb;
4426 };
4427 #endif
4428 int
4429 sys_fhstat(td, uap)
4430         struct thread *td;
4431         register struct fhstat_args /* {
4432                 struct fhandle *u_fhp;
4433                 struct stat *sb;
4434         } */ *uap;
4435 {
4436         struct stat sb;
4437         struct fhandle fh;
4438         int error;
4439
4440         error = copyin(uap->u_fhp, &fh, sizeof(fh));
4441         if (error != 0)
4442                 return (error);
4443         error = kern_fhstat(td, fh, &sb);
4444         if (error == 0)
4445                 error = copyout(&sb, uap->sb, sizeof(sb));
4446         return (error);
4447 }
4448
4449 int
4450 kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
4451 {
4452         struct mount *mp;
4453         struct vnode *vp;
4454         int error;
4455
4456         error = priv_check(td, PRIV_VFS_FHSTAT);
4457         if (error != 0)
4458                 return (error);
4459         if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4460                 return (ESTALE);
4461         error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4462         vfs_unbusy(mp);
4463         if (error != 0)
4464                 return (error);
4465         error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
4466         vput(vp);
4467         return (error);
4468 }
4469
4470 /*
4471  * Implement fstatfs() for (NFS) file handles.
4472  */
4473 #ifndef _SYS_SYSPROTO_H_
4474 struct fhstatfs_args {
4475         struct fhandle *u_fhp;
4476         struct statfs *buf;
4477 };
4478 #endif
4479 int
4480 sys_fhstatfs(td, uap)
4481         struct thread *td;
4482         struct fhstatfs_args /* {
4483                 struct fhandle *u_fhp;
4484                 struct statfs *buf;
4485         } */ *uap;
4486 {
4487         struct statfs sf;
4488         fhandle_t fh;
4489         int error;
4490
4491         error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4492         if (error != 0)
4493                 return (error);
4494         error = kern_fhstatfs(td, fh, &sf);
4495         if (error != 0)
4496                 return (error);
4497         return (copyout(&sf, uap->buf, sizeof(sf)));
4498 }
4499
4500 int
4501 kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4502 {
4503         struct statfs *sp;
4504         struct mount *mp;
4505         struct vnode *vp;
4506         int error;
4507
4508         error = priv_check(td, PRIV_VFS_FHSTATFS);
4509         if (error != 0)
4510                 return (error);
4511         if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4512                 return (ESTALE);
4513         error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4514         if (error != 0) {
4515                 vfs_unbusy(mp);
4516                 return (error);
4517         }
4518         vput(vp);
4519         error = prison_canseemount(td->td_ucred, mp);
4520         if (error != 0)
4521                 goto out;
4522 #ifdef MAC
4523         error = mac_mount_check_stat(td->td_ucred, mp);
4524         if (error != 0)
4525                 goto out;
4526 #endif
4527         /*
4528          * Set these in case the underlying filesystem fails to do so.
4529          */
4530         sp = &mp->mnt_stat;
4531         sp->f_version = STATFS_VERSION;
4532         sp->f_namemax = NAME_MAX;
4533         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4534         error = VFS_STATFS(mp, sp);
4535         if (error == 0)
4536                 *buf = *sp;
4537 out:
4538         vfs_unbusy(mp);
4539         return (error);
4540 }
4541
4542 int
4543 kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
4544 {
4545         struct file *fp;
4546         struct mount *mp;
4547         struct vnode *vp;
4548         cap_rights_t rights;
4549         off_t olen, ooffset;
4550         int error;
4551
4552         if (offset < 0 || len <= 0)
4553                 return (EINVAL);
4554         /* Check for wrap. */
4555         if (offset > OFF_MAX - len)
4556                 return (EFBIG);
4557         error = fget(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
4558         if (error != 0)
4559                 return (error);
4560         if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4561                 error = ESPIPE;
4562                 goto out;
4563         }
4564         if ((fp->f_flag & FWRITE) == 0) {
4565                 error = EBADF;
4566                 goto out;
4567         }
4568         if (fp->f_type != DTYPE_VNODE) {
4569                 error = ENODEV;
4570                 goto out;
4571         }
4572         vp = fp->f_vnode;
4573         if (vp->v_type != VREG) {
4574                 error = ENODEV;
4575                 goto out;
4576         }
4577
4578         /* Allocating blocks may take a long time, so iterate. */
4579         for (;;) {
4580                 olen = len;
4581                 ooffset = offset;
4582
4583                 bwillwrite();
4584                 mp = NULL;
4585                 error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
4586                 if (error != 0)
4587                         break;
4588                 error = vn_lock(vp, LK_EXCLUSIVE);
4589                 if (error != 0) {
4590                         vn_finished_write(mp);
4591                         break;
4592                 }
4593 #ifdef MAC
4594                 error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
4595                 if (error == 0)
4596 #endif
4597                         error = VOP_ALLOCATE(vp, &offset, &len);
4598                 VOP_UNLOCK(vp, 0);
4599                 vn_finished_write(mp);
4600
4601                 if (olen + ooffset != offset + len) {
4602                         panic("offset + len changed from %jx/%jx to %jx/%jx",
4603                             ooffset, olen, offset, len);
4604                 }
4605                 if (error != 0 || len == 0)
4606                         break;
4607                 KASSERT(olen > len, ("Iteration did not make progress?"));
4608                 maybe_yield();
4609         }
4610  out:
4611         fdrop(fp, td);
4612         return (error);
4613 }
4614
4615 int
4616 sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
4617 {
4618
4619         td->td_retval[0] = kern_posix_fallocate(td, uap->fd, uap->offset,
4620             uap->len);
4621         return (0);
4622 }
4623
4624 /*
4625  * Unlike madvise(2), we do not make a best effort to remember every
4626  * possible caching hint.  Instead, we remember the last setting with
4627  * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4628  * region of any current setting.
4629  */
4630 int
4631 kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4632     int advice)
4633 {
4634         struct fadvise_info *fa, *new;
4635         struct file *fp;
4636         struct vnode *vp;
4637         cap_rights_t rights;
4638         off_t end;
4639         int error;
4640
4641         if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4642                 return (EINVAL);
4643         switch (advice) {
4644         case POSIX_FADV_SEQUENTIAL:
4645         case POSIX_FADV_RANDOM:
4646         case POSIX_FADV_NOREUSE:
4647                 new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4648                 break;
4649         case POSIX_FADV_NORMAL:
4650         case POSIX_FADV_WILLNEED:
4651         case POSIX_FADV_DONTNEED:
4652                 new = NULL;
4653                 break;
4654         default:
4655                 return (EINVAL);
4656         }
4657         /* XXX: CAP_POSIX_FADVISE? */
4658         error = fget(td, fd, cap_rights_init(&rights), &fp);
4659         if (error != 0)
4660                 goto out;
4661         if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4662                 error = ESPIPE;
4663                 goto out;
4664         }
4665         if (fp->f_type != DTYPE_VNODE) {
4666                 error = ENODEV;
4667                 goto out;
4668         }
4669         vp = fp->f_vnode;
4670         if (vp->v_type != VREG) {
4671                 error = ENODEV;
4672                 goto out;
4673         }
4674         if (len == 0)
4675                 end = OFF_MAX;
4676         else
4677                 end = offset + len - 1;
4678         switch (advice) {
4679         case POSIX_FADV_SEQUENTIAL:
4680         case POSIX_FADV_RANDOM:
4681         case POSIX_FADV_NOREUSE:
4682                 /*
4683                  * Try to merge any existing non-standard region with
4684                  * this new region if possible, otherwise create a new
4685                  * non-standard region for this request.
4686                  */
4687                 mtx_pool_lock(mtxpool_sleep, fp);
4688                 fa = fp->f_advice;
4689                 if (fa != NULL && fa->fa_advice == advice &&
4690                     ((fa->fa_start <= end && fa->fa_end >= offset) ||
4691                     (end != OFF_MAX && fa->fa_start == end + 1) ||
4692                     (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4693                         if (offset < fa->fa_start)
4694                                 fa->fa_start = offset;
4695                         if (end > fa->fa_end)
4696                                 fa->fa_end = end;
4697                 } else {
4698                         new->fa_advice = advice;
4699                         new->fa_start = offset;
4700                         new->fa_end = end;
4701                         new->fa_prevstart = 0;
4702                         new->fa_prevend = 0;
4703                         fp->f_advice = new;
4704                         new = fa;
4705                 }
4706                 mtx_pool_unlock(mtxpool_sleep, fp);
4707                 break;
4708         case POSIX_FADV_NORMAL:
4709                 /*
4710                  * If a the "normal" region overlaps with an existing
4711                  * non-standard region, trim or remove the
4712                  * non-standard region.
4713                  */
4714                 mtx_pool_lock(mtxpool_sleep, fp);
4715                 fa = fp->f_advice;
4716                 if (fa != NULL) {
4717                         if (offset <= fa->fa_start && end >= fa->fa_end) {
4718                                 new = fa;
4719                                 fp->f_advice = NULL;
4720                         } else if (offset <= fa->fa_start &&
4721                             end >= fa->fa_start)
4722                                 fa->fa_start = end + 1;
4723                         else if (offset <= fa->fa_end && end >= fa->fa_end)
4724                                 fa->fa_end = offset - 1;
4725                         else if (offset >= fa->fa_start && end <= fa->fa_end) {
4726                                 /*
4727                                  * If the "normal" region is a middle
4728                                  * portion of the existing
4729                                  * non-standard region, just remove
4730                                  * the whole thing rather than picking
4731                                  * one side or the other to
4732                                  * preserve.
4733                                  */
4734                                 new = fa;
4735                                 fp->f_advice = NULL;
4736                         }
4737                 }
4738                 mtx_pool_unlock(mtxpool_sleep, fp);
4739                 break;
4740         case POSIX_FADV_WILLNEED:
4741         case POSIX_FADV_DONTNEED:
4742                 error = VOP_ADVISE(vp, offset, end, advice);
4743                 break;
4744         }
4745 out:
4746         if (fp != NULL)
4747                 fdrop(fp, td);
4748         free(new, M_FADVISE);
4749         return (error);
4750 }
4751
4752 int
4753 sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
4754 {
4755
4756         td->td_retval[0] = kern_posix_fadvise(td, uap->fd, uap->offset,
4757             uap->len, uap->advice);
4758         return (0);
4759 }