sys/kern/vfs_syscalls.c

   1 /*-
   2  * Copyright (c) 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 4. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_syscalls.c      8.13 (Berkeley) 4/15/94
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_capsicum.h"
  41 #include "opt_compat.h"
  42 #include "opt_ktrace.h"
  43
  44 #include <sys/param.h>
  45 #include <sys/systm.h>
  46 #include <sys/bio.h>
  47 #include <sys/buf.h>
  48 #include <sys/capsicum.h>
  49 #include <sys/disk.h>
  50 #include <sys/sysent.h>
  51 #include <sys/malloc.h>
  52 #include <sys/mount.h>
  53 #include <sys/mutex.h>
  54 #include <sys/sysproto.h>
  55 #include <sys/namei.h>
  56 #include <sys/filedesc.h>
  57 #include <sys/kernel.h>
  58 #include <sys/fcntl.h>
  59 #include <sys/file.h>
  60 #include <sys/filio.h>
  61 #include <sys/limits.h>
  62 #include <sys/linker.h>
  63 #include <sys/rwlock.h>
  64 #include <sys/sdt.h>
  65 #include <sys/stat.h>
  66 #include <sys/sx.h>
  67 #include <sys/unistd.h>
  68 #include <sys/vnode.h>
  69 #include <sys/priv.h>
  70 #include <sys/proc.h>
  71 #include <sys/dirent.h>
  72 #include <sys/jail.h>
  73 #include <sys/syscallsubr.h>
  74 #include <sys/sysctl.h>
  75 #ifdef KTRACE
  76 #include <sys/ktrace.h>
  77 #endif
  78
  79 #include <machine/stdarg.h>
  80
  81 #include <security/audit/audit.h>
  82 #include <security/mac/mac_framework.h>
  83
  84 #include <vm/vm.h>
  85 #include <vm/vm_object.h>
  86 #include <vm/vm_page.h>
  87 #include <vm/uma.h>
  88
  89 #include <ufs/ufs/quota.h>
  90
  91 MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
  92
  93 SDT_PROVIDER_DEFINE(vfs);
  94 SDT_PROBE_DEFINE2(vfs, , stat, mode, "char *", "int");
  95 SDT_PROBE_DEFINE2(vfs, , stat, reg, "char *", "int");
  96
  97 static int chroot_refuse_vdir_fds(struct filedesc *fdp);
  98 static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
  99 static int kern_chflags(struct thread *td, const char *path,
 100     enum uio_seg pathseg, u_long flags);
 101 static int kern_chflagsat(struct thread *td, int fd, const char *path,
 102     enum uio_seg pathseg, u_long flags, int atflag);
 103 static int setfflags(struct thread *td, struct vnode *, u_long);
 104 static int setutimes(struct thread *td, struct vnode *,
 105     const struct timespec *, int, int);
 106 static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
 107     struct thread *td);
 108
 109 /*
 110  * The module initialization routine for POSIX asynchronous I/O will
 111  * set this to the version of AIO that it implements.  (Zero means
 112  * that it is not implemented.)  This value is used here by pathconf()
 113  * and in kern_descrip.c by fpathconf().
 114  */
 115 int async_io_version;
 116
 117 #ifdef DEBUG
 118 static int syncprt = 0;
 119 SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
 120 #endif
 121
 122 /*
 123  * Sync each mounted filesystem.
 124  */
 125 #ifndef _SYS_SYSPROTO_H_
 126 struct sync_args {
 127         int     dummy;
 128 };
 129 #endif
 130 /* ARGSUSED */
 131 int
 132 sys_sync(td, uap)
 133         struct thread *td;
 134         struct sync_args *uap;
 135 {
 136         struct mount *mp, *nmp;
 137         int save;
 138
 139         mtx_lock(&mountlist_mtx);
 140         for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 141                 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 142                         nmp = TAILQ_NEXT(mp, mnt_list);
 143                         continue;
 144                 }
 145                 if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
 146                     vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
 147                         save = curthread_pflags_set(TDP_SYNCIO);
 148                         vfs_msync(mp, MNT_NOWAIT);
 149                         VFS_SYNC(mp, MNT_NOWAIT);
 150                         curthread_pflags_restore(save);
 151                         vn_finished_write(mp);
 152                 }
 153                 mtx_lock(&mountlist_mtx);
 154                 nmp = TAILQ_NEXT(mp, mnt_list);
 155                 vfs_unbusy(mp);
 156         }
 157         mtx_unlock(&mountlist_mtx);
 158         return (0);
 159 }
 160
 161 /*
 162  * Change filesystem quotas.
 163  */
 164 #ifndef _SYS_SYSPROTO_H_
 165 struct quotactl_args {
 166         char *path;
 167         int cmd;
 168         int uid;
 169         caddr_t arg;
 170 };
 171 #endif
 172 int
 173 sys_quotactl(td, uap)
 174         struct thread *td;
 175         register struct quotactl_args /* {
 176                 char *path;
 177                 int cmd;
 178                 int uid;
 179                 caddr_t arg;
 180         } */ *uap;
 181 {
 182         struct mount *mp;
 183         struct nameidata nd;
 184         int error;
 185
 186         AUDIT_ARG_CMD(uap->cmd);
 187         AUDIT_ARG_UID(uap->uid);
 188         if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
 189                 return (EPERM);
 190         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
 191             uap->path, td);
 192         if ((error = namei(&nd)) != 0)
 193                 return (error);
 194         NDFREE(&nd, NDF_ONLY_PNBUF);
 195         mp = nd.ni_vp->v_mount;
 196         vfs_ref(mp);
 197         vput(nd.ni_vp);
 198         error = vfs_busy(mp, 0);
 199         vfs_rel(mp);
 200         if (error != 0)
 201                 return (error);
 202         error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
 203
 204         /*
 205          * Since quota on operation typically needs to open quota
 206          * file, the Q_QUOTAON handler needs to unbusy the mount point
 207          * before calling into namei.  Otherwise, unmount might be
 208          * started between two vfs_busy() invocations (first is our,
 209          * second is from mount point cross-walk code in lookup()),
 210          * causing deadlock.
 211          *
 212          * Require that Q_QUOTAON handles the vfs_busy() reference on
 213          * its own, always returning with ubusied mount point.
 214          */
 215         if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
 216                 vfs_unbusy(mp);
 217         return (error);
 218 }
 219
 220 /*
 221  * Used by statfs conversion routines to scale the block size up if
 222  * necessary so that all of the block counts are <= 'max_size'.  Note
 223  * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
 224  * value of 'n'.
 225  */
 226 void
 227 statfs_scale_blocks(struct statfs *sf, long max_size)
 228 {
 229         uint64_t count;
 230         int shift;
 231
 232         KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
 233
 234         /*
 235          * Attempt to scale the block counts to give a more accurate
 236          * overview to userland of the ratio of free space to used
 237          * space.  To do this, find the largest block count and compute
 238          * a divisor that lets it fit into a signed integer <= max_size.
 239          */
 240         if (sf->f_bavail < 0)
 241                 count = -sf->f_bavail;
 242         else
 243                 count = sf->f_bavail;
 244         count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
 245         if (count <= max_size)
 246                 return;
 247
 248         count >>= flsl(max_size);
 249         shift = 0;
 250         while (count > 0) {
 251                 shift++;
 252                 count >>=1;
 253         }
 254
 255         sf->f_bsize <<= shift;
 256         sf->f_blocks >>= shift;
 257         sf->f_bfree >>= shift;
 258         sf->f_bavail >>= shift;
 259 }
 260
 261 /*
 262  * Get filesystem statistics.
 263  */
 264 #ifndef _SYS_SYSPROTO_H_
 265 struct statfs_args {
 266         char *path;
 267         struct statfs *buf;
 268 };
 269 #endif
 270 int
 271 sys_statfs(td, uap)
 272         struct thread *td;
 273         register struct statfs_args /* {
 274                 char *path;
 275                 struct statfs *buf;
 276         } */ *uap;
 277 {
 278         struct statfs sf;
 279         int error;
 280
 281         error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
 282         if (error == 0)
 283                 error = copyout(&sf, uap->buf, sizeof(sf));
 284         return (error);
 285 }
 286
 287 int
 288 kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
 289     struct statfs *buf)
 290 {
 291         struct mount *mp;
 292         struct statfs *sp, sb;
 293         struct nameidata nd;
 294         int error;
 295
 296         NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 297             pathseg, path, td);
 298         error = namei(&nd);
 299         if (error != 0)
 300                 return (error);
 301         mp = nd.ni_vp->v_mount;
 302         vfs_ref(mp);
 303         NDFREE(&nd, NDF_ONLY_PNBUF);
 304         vput(nd.ni_vp);
 305         error = vfs_busy(mp, 0);
 306         vfs_rel(mp);
 307         if (error != 0)
 308                 return (error);
 309 #ifdef MAC
 310         error = mac_mount_check_stat(td->td_ucred, mp);
 311         if (error != 0)
 312                 goto out;
 313 #endif
 314         /*
 315          * Set these in case the underlying filesystem fails to do so.
 316          */
 317         sp = &mp->mnt_stat;
 318         sp->f_version = STATFS_VERSION;
 319         sp->f_namemax = NAME_MAX;
 320         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 321         error = VFS_STATFS(mp, sp);
 322         if (error != 0)
 323                 goto out;
 324         if (priv_check(td, PRIV_VFS_GENERATION)) {
 325                 bcopy(sp, &sb, sizeof(sb));
 326                 sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
 327                 prison_enforce_statfs(td->td_ucred, mp, &sb);
 328                 sp = &sb;
 329         }
 330         *buf = *sp;
 331 out:
 332         vfs_unbusy(mp);
 333         return (error);
 334 }
 335
 336 /*
 337  * Get filesystem statistics.
 338  */
 339 #ifndef _SYS_SYSPROTO_H_
 340 struct fstatfs_args {
 341         int fd;
 342         struct statfs *buf;
 343 };
 344 #endif
 345 int
 346 sys_fstatfs(td, uap)
 347         struct thread *td;
 348         register struct fstatfs_args /* {
 349                 int fd;
 350                 struct statfs *buf;
 351         } */ *uap;
 352 {
 353         struct statfs sf;
 354         int error;
 355
 356         error = kern_fstatfs(td, uap->fd, &sf);
 357         if (error == 0)
 358                 error = copyout(&sf, uap->buf, sizeof(sf));
 359         return (error);
 360 }
 361
 362 int
 363 kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
 364 {
 365         struct file *fp;
 366         struct mount *mp;
 367         struct statfs *sp, sb;
 368         struct vnode *vp;
 369         cap_rights_t rights;
 370         int error;
 371
 372         AUDIT_ARG_FD(fd);
 373         error = getvnode(td->td_proc->p_fd, fd,
 374             cap_rights_init(&rights, CAP_FSTATFS), &fp);
 375         if (error != 0)
 376                 return (error);
 377         vp = fp->f_vnode;
 378         vn_lock(vp, LK_SHARED | LK_RETRY);
 379 #ifdef AUDIT
 380         AUDIT_ARG_VNODE1(vp);
 381 #endif
 382         mp = vp->v_mount;
 383         if (mp)
 384                 vfs_ref(mp);
 385         VOP_UNLOCK(vp, 0);
 386         fdrop(fp, td);
 387         if (mp == NULL) {
 388                 error = EBADF;
 389                 goto out;
 390         }
 391         error = vfs_busy(mp, 0);
 392         vfs_rel(mp);
 393         if (error != 0)
 394                 return (error);
 395 #ifdef MAC
 396         error = mac_mount_check_stat(td->td_ucred, mp);
 397         if (error != 0)
 398                 goto out;
 399 #endif
 400         /*
 401          * Set these in case the underlying filesystem fails to do so.
 402          */
 403         sp = &mp->mnt_stat;
 404         sp->f_version = STATFS_VERSION;
 405         sp->f_namemax = NAME_MAX;
 406         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 407         error = VFS_STATFS(mp, sp);
 408         if (error != 0)
 409                 goto out;
 410         if (priv_check(td, PRIV_VFS_GENERATION)) {
 411                 bcopy(sp, &sb, sizeof(sb));
 412                 sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
 413                 prison_enforce_statfs(td->td_ucred, mp, &sb);
 414                 sp = &sb;
 415         }
 416         *buf = *sp;
 417 out:
 418         if (mp)
 419                 vfs_unbusy(mp);
 420         return (error);
 421 }
 422
 423 /*
 424  * Get statistics on all filesystems.
 425  */
 426 #ifndef _SYS_SYSPROTO_H_
 427 struct getfsstat_args {
 428         struct statfs *buf;
 429         long bufsize;
 430         int flags;
 431 };
 432 #endif
 433 int
 434 sys_getfsstat(td, uap)
 435         struct thread *td;
 436         register struct getfsstat_args /* {
 437                 struct statfs *buf;
 438                 long bufsize;
 439                 int flags;
 440         } */ *uap;
 441 {
 442
 443         return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
 444             uap->flags));
 445 }
 446
 447 /*
 448  * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
 449  *      The caller is responsible for freeing memory which will be allocated
 450  *      in '*buf'.
 451  */
 452 int
 453 kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
 454     enum uio_seg bufseg, int flags)
 455 {
 456         struct mount *mp, *nmp;
 457         struct statfs *sfsp, *sp, sb;
 458         size_t count, maxcount;
 459         int error;
 460
 461         maxcount = bufsize / sizeof(struct statfs);
 462         if (bufsize == 0)
 463                 sfsp = NULL;
 464         else if (bufseg == UIO_USERSPACE)
 465                 sfsp = *buf;
 466         else /* if (bufseg == UIO_SYSSPACE) */ {
 467                 count = 0;
 468                 mtx_lock(&mountlist_mtx);
 469                 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 470                         count++;
 471                 }
 472                 mtx_unlock(&mountlist_mtx);
 473                 if (maxcount > count)
 474                         maxcount = count;
 475                 sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
 476                     M_WAITOK);
 477         }
 478         count = 0;
 479         mtx_lock(&mountlist_mtx);
 480         for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 481                 if (prison_canseemount(td->td_ucred, mp) != 0) {
 482                         nmp = TAILQ_NEXT(mp, mnt_list);
 483                         continue;
 484                 }
 485 #ifdef MAC
 486                 if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
 487                         nmp = TAILQ_NEXT(mp, mnt_list);
 488                         continue;
 489                 }
 490 #endif
 491                 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 492                         nmp = TAILQ_NEXT(mp, mnt_list);
 493                         continue;
 494                 }
 495                 if (sfsp && count < maxcount) {
 496                         sp = &mp->mnt_stat;
 497                         /*
 498                          * Set these in case the underlying filesystem
 499                          * fails to do so.
 500                          */
 501                         sp->f_version = STATFS_VERSION;
 502                         sp->f_namemax = NAME_MAX;
 503                         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 504                         /*
 505                          * If MNT_NOWAIT or MNT_LAZY is specified, do not
 506                          * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
 507                          * overrides MNT_WAIT.
 508                          */
 509                         if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
 510                             (flags & MNT_WAIT)) &&
 511                             (error = VFS_STATFS(mp, sp))) {
 512                                 mtx_lock(&mountlist_mtx);
 513                                 nmp = TAILQ_NEXT(mp, mnt_list);
 514                                 vfs_unbusy(mp);
 515                                 continue;
 516                         }
 517                         if (priv_check(td, PRIV_VFS_GENERATION)) {
 518                                 bcopy(sp, &sb, sizeof(sb));
 519                                 sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
 520                                 prison_enforce_statfs(td->td_ucred, mp, &sb);
 521                                 sp = &sb;
 522                         }
 523                         if (bufseg == UIO_SYSSPACE)
 524                                 bcopy(sp, sfsp, sizeof(*sp));
 525                         else /* if (bufseg == UIO_USERSPACE) */ {
 526                                 error = copyout(sp, sfsp, sizeof(*sp));
 527                                 if (error != 0) {
 528                                         vfs_unbusy(mp);
 529                                         return (error);
 530                                 }
 531                         }
 532                         sfsp++;
 533                 }
 534                 count++;
 535                 mtx_lock(&mountlist_mtx);
 536                 nmp = TAILQ_NEXT(mp, mnt_list);
 537                 vfs_unbusy(mp);
 538         }
 539         mtx_unlock(&mountlist_mtx);
 540         if (sfsp && count > maxcount)
 541                 td->td_retval[0] = maxcount;
 542         else
 543                 td->td_retval[0] = count;
 544         return (0);
 545 }
 546
 547 #ifdef COMPAT_FREEBSD4
 548 /*
 549  * Get old format filesystem statistics.
 550  */
 551 static void cvtstatfs(struct statfs *, struct ostatfs *);
 552
 553 #ifndef _SYS_SYSPROTO_H_
 554 struct freebsd4_statfs_args {
 555         char *path;
 556         struct ostatfs *buf;
 557 };
 558 #endif
 559 int
 560 freebsd4_statfs(td, uap)
 561         struct thread *td;
 562         struct freebsd4_statfs_args /* {
 563                 char *path;
 564                 struct ostatfs *buf;
 565         } */ *uap;
 566 {
 567         struct ostatfs osb;
 568         struct statfs sf;
 569         int error;
 570
 571         error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
 572         if (error != 0)
 573                 return (error);
 574         cvtstatfs(&sf, &osb);
 575         return (copyout(&osb, uap->buf, sizeof(osb)));
 576 }
 577
 578 /*
 579  * Get filesystem statistics.
 580  */
 581 #ifndef _SYS_SYSPROTO_H_
 582 struct freebsd4_fstatfs_args {
 583         int fd;
 584         struct ostatfs *buf;
 585 };
 586 #endif
 587 int
 588 freebsd4_fstatfs(td, uap)
 589         struct thread *td;
 590         struct freebsd4_fstatfs_args /* {
 591                 int fd;
 592                 struct ostatfs *buf;
 593         } */ *uap;
 594 {
 595         struct ostatfs osb;
 596         struct statfs sf;
 597         int error;
 598
 599         error = kern_fstatfs(td, uap->fd, &sf);
 600         if (error != 0)
 601                 return (error);
 602         cvtstatfs(&sf, &osb);
 603         return (copyout(&osb, uap->buf, sizeof(osb)));
 604 }
 605
 606 /*
 607  * Get statistics on all filesystems.
 608  */
 609 #ifndef _SYS_SYSPROTO_H_
 610 struct freebsd4_getfsstat_args {
 611         struct ostatfs *buf;
 612         long bufsize;
 613         int flags;
 614 };
 615 #endif
 616 int
 617 freebsd4_getfsstat(td, uap)
 618         struct thread *td;
 619         register struct freebsd4_getfsstat_args /* {
 620                 struct ostatfs *buf;
 621                 long bufsize;
 622                 int flags;
 623         } */ *uap;
 624 {
 625         struct statfs *buf, *sp;
 626         struct ostatfs osb;
 627         size_t count, size;
 628         int error;
 629
 630         count = uap->bufsize / sizeof(struct ostatfs);
 631         size = count * sizeof(struct statfs);
 632         error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
 633         if (size > 0) {
 634                 count = td->td_retval[0];
 635                 sp = buf;
 636                 while (count > 0 && error == 0) {
 637                         cvtstatfs(sp, &osb);
 638                         error = copyout(&osb, uap->buf, sizeof(osb));
 639                         sp++;
 640                         uap->buf++;
 641                         count--;
 642                 }
 643                 free(buf, M_TEMP);
 644         }
 645         return (error);
 646 }
 647
 648 /*
 649  * Implement fstatfs() for (NFS) file handles.
 650  */
 651 #ifndef _SYS_SYSPROTO_H_
 652 struct freebsd4_fhstatfs_args {
 653         struct fhandle *u_fhp;
 654         struct ostatfs *buf;
 655 };
 656 #endif
 657 int
 658 freebsd4_fhstatfs(td, uap)
 659         struct thread *td;
 660         struct freebsd4_fhstatfs_args /* {
 661                 struct fhandle *u_fhp;
 662                 struct ostatfs *buf;
 663         } */ *uap;
 664 {
 665         struct ostatfs osb;
 666         struct statfs sf;
 667         fhandle_t fh;
 668         int error;
 669
 670         error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 671         if (error != 0)
 672                 return (error);
 673         error = kern_fhstatfs(td, fh, &sf);
 674         if (error != 0)
 675                 return (error);
 676         cvtstatfs(&sf, &osb);
 677         return (copyout(&osb, uap->buf, sizeof(osb)));
 678 }
 679
 680 /*
 681  * Convert a new format statfs structure to an old format statfs structure.
 682  */
 683 static void
 684 cvtstatfs(nsp, osp)
 685         struct statfs *nsp;
 686         struct ostatfs *osp;
 687 {
 688
 689         statfs_scale_blocks(nsp, LONG_MAX);
 690         bzero(osp, sizeof(*osp));
 691         osp->f_bsize = nsp->f_bsize;
 692         osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
 693         osp->f_blocks = nsp->f_blocks;
 694         osp->f_bfree = nsp->f_bfree;
 695         osp->f_bavail = nsp->f_bavail;
 696         osp->f_files = MIN(nsp->f_files, LONG_MAX);
 697         osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
 698         osp->f_owner = nsp->f_owner;
 699         osp->f_type = nsp->f_type;
 700         osp->f_flags = nsp->f_flags;
 701         osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
 702         osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
 703         osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
 704         osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
 705         strlcpy(osp->f_fstypename, nsp->f_fstypename,
 706             MIN(MFSNAMELEN, OMFSNAMELEN));
 707         strlcpy(osp->f_mntonname, nsp->f_mntonname,
 708             MIN(MNAMELEN, OMNAMELEN));
 709         strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
 710             MIN(MNAMELEN, OMNAMELEN));
 711         osp->f_fsid = nsp->f_fsid;
 712 }
 713 #endif /* COMPAT_FREEBSD4 */
 714
 715 /*
 716  * Change current working directory to a given file descriptor.
 717  */
 718 #ifndef _SYS_SYSPROTO_H_
 719 struct fchdir_args {
 720         int     fd;
 721 };
 722 #endif
 723 int
 724 sys_fchdir(td, uap)
 725         struct thread *td;
 726         struct fchdir_args /* {
 727                 int fd;
 728         } */ *uap;
 729 {
 730         register struct filedesc *fdp = td->td_proc->p_fd;
 731         struct vnode *vp, *tdp, *vpold;
 732         struct mount *mp;
 733         struct file *fp;
 734         cap_rights_t rights;
 735         int error;
 736
 737         AUDIT_ARG_FD(uap->fd);
 738         error = getvnode(fdp, uap->fd, cap_rights_init(&rights, CAP_FCHDIR),
 739             &fp);
 740         if (error != 0)
 741                 return (error);
 742         vp = fp->f_vnode;
 743         VREF(vp);
 744         fdrop(fp, td);
 745         vn_lock(vp, LK_SHARED | LK_RETRY);
 746         AUDIT_ARG_VNODE1(vp);
 747         error = change_dir(vp, td);
 748         while (!error && (mp = vp->v_mountedhere) != NULL) {
 749                 if (vfs_busy(mp, 0))
 750                         continue;
 751                 error = VFS_ROOT(mp, LK_SHARED, &tdp);
 752                 vfs_unbusy(mp);
 753                 if (error != 0)
 754                         break;
 755                 vput(vp);
 756                 vp = tdp;
 757         }
 758         if (error != 0) {
 759                 vput(vp);
 760                 return (error);
 761         }
 762         VOP_UNLOCK(vp, 0);
 763         FILEDESC_XLOCK(fdp);
 764         vpold = fdp->fd_cdir;
 765         fdp->fd_cdir = vp;
 766         FILEDESC_XUNLOCK(fdp);
 767         vrele(vpold);
 768         return (0);
 769 }
 770
 771 /*
 772  * Change current working directory (``.'').
 773  */
 774 #ifndef _SYS_SYSPROTO_H_
 775 struct chdir_args {
 776         char    *path;
 777 };
 778 #endif
 779 int
 780 sys_chdir(td, uap)
 781         struct thread *td;
 782         struct chdir_args /* {
 783                 char *path;
 784         } */ *uap;
 785 {
 786
 787         return (kern_chdir(td, uap->path, UIO_USERSPACE));
 788 }
 789
 790 int
 791 kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
 792 {
 793         register struct filedesc *fdp = td->td_proc->p_fd;
 794         struct nameidata nd;
 795         struct vnode *vp;
 796         int error;
 797
 798         NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 799             pathseg, path, td);
 800         if ((error = namei(&nd)) != 0)
 801                 return (error);
 802         if ((error = change_dir(nd.ni_vp, td)) != 0) {
 803                 vput(nd.ni_vp);
 804                 NDFREE(&nd, NDF_ONLY_PNBUF);
 805                 return (error);
 806         }
 807         VOP_UNLOCK(nd.ni_vp, 0);
 808         NDFREE(&nd, NDF_ONLY_PNBUF);
 809         FILEDESC_XLOCK(fdp);
 810         vp = fdp->fd_cdir;
 811         fdp->fd_cdir = nd.ni_vp;
 812         FILEDESC_XUNLOCK(fdp);
 813         vrele(vp);
 814         return (0);
 815 }
 816
 817 /*
 818  * Helper function for raised chroot(2) security function:  Refuse if
 819  * any filedescriptors are open directories.
 820  */
 821 static int
 822 chroot_refuse_vdir_fds(fdp)
 823         struct filedesc *fdp;
 824 {
 825         struct vnode *vp;
 826         struct file *fp;
 827         int fd;
 828
 829         FILEDESC_LOCK_ASSERT(fdp);
 830
 831         for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
 832                 fp = fget_locked(fdp, fd);
 833                 if (fp == NULL)
 834                         continue;
 835                 if (fp->f_type == DTYPE_VNODE) {
 836                         vp = fp->f_vnode;
 837                         if (vp->v_type == VDIR)
 838                                 return (EPERM);
 839                 }
 840         }
 841         return (0);
 842 }
 843
 844 /*
 845  * This sysctl determines if we will allow a process to chroot(2) if it
 846  * has a directory open:
 847  *      0: disallowed for all processes.
 848  *      1: allowed for processes that were not already chroot(2)'ed.
 849  *      2: allowed for all processes.
 850  */
 851
 852 static int chroot_allow_open_directories = 1;
 853
 854 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
 855      &chroot_allow_open_directories, 0,
 856      "Allow a process to chroot(2) if it has a directory open");
 857
 858 /*
 859  * Change notion of root (``/'') directory.
 860  */
 861 #ifndef _SYS_SYSPROTO_H_
 862 struct chroot_args {
 863         char    *path;
 864 };
 865 #endif
 866 int
 867 sys_chroot(td, uap)
 868         struct thread *td;
 869         struct chroot_args /* {
 870                 char *path;
 871         } */ *uap;
 872 {
 873         struct nameidata nd;
 874         int error;
 875
 876         error = priv_check(td, PRIV_VFS_CHROOT);
 877         if (error != 0)
 878                 return (error);
 879         NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 880             UIO_USERSPACE, uap->path, td);
 881         error = namei(&nd);
 882         if (error != 0)
 883                 goto error;
 884         error = change_dir(nd.ni_vp, td);
 885         if (error != 0)
 886                 goto e_vunlock;
 887 #ifdef MAC
 888         error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
 889         if (error != 0)
 890                 goto e_vunlock;
 891 #endif
 892         VOP_UNLOCK(nd.ni_vp, 0);
 893         error = change_root(nd.ni_vp, td);
 894         vrele(nd.ni_vp);
 895         NDFREE(&nd, NDF_ONLY_PNBUF);
 896         return (error);
 897 e_vunlock:
 898         vput(nd.ni_vp);
 899 error:
 900         NDFREE(&nd, NDF_ONLY_PNBUF);
 901         return (error);
 902 }
 903
 904 /*
 905  * Common routine for chroot and chdir.  Callers must provide a locked vnode
 906  * instance.
 907  */
 908 int
 909 change_dir(vp, td)
 910         struct vnode *vp;
 911         struct thread *td;
 912 {
 913 #ifdef MAC
 914         int error;
 915 #endif
 916
 917         ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
 918         if (vp->v_type != VDIR)
 919                 return (ENOTDIR);
 920 #ifdef MAC
 921         error = mac_vnode_check_chdir(td->td_ucred, vp);
 922         if (error != 0)
 923                 return (error);
 924 #endif
 925         return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
 926 }
 927
 928 /*
 929  * Common routine for kern_chroot() and jail_attach().  The caller is
 930  * responsible for invoking priv_check() and mac_vnode_check_chroot() to
 931  * authorize this operation.
 932  */
 933 int
 934 change_root(vp, td)
 935         struct vnode *vp;
 936         struct thread *td;
 937 {
 938         struct filedesc *fdp;
 939         struct vnode *oldvp;
 940         int error;
 941
 942         fdp = td->td_proc->p_fd;
 943         FILEDESC_XLOCK(fdp);
 944         if (chroot_allow_open_directories == 0 ||
 945             (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
 946                 error = chroot_refuse_vdir_fds(fdp);
 947                 if (error != 0) {
 948                         FILEDESC_XUNLOCK(fdp);
 949                         return (error);
 950                 }
 951         }
 952         oldvp = fdp->fd_rdir;
 953         fdp->fd_rdir = vp;
 954         VREF(fdp->fd_rdir);
 955         if (!fdp->fd_jdir) {
 956                 fdp->fd_jdir = vp;
 957                 VREF(fdp->fd_jdir);
 958         }
 959         FILEDESC_XUNLOCK(fdp);
 960         vrele(oldvp);
 961         return (0);
 962 }
 963
 964 static __inline void
 965 flags_to_rights(int flags, cap_rights_t *rightsp)
 966 {
 967
 968         if (flags & O_EXEC) {
 969                 cap_rights_set(rightsp, CAP_FEXECVE);
 970         } else {
 971                 switch ((flags & O_ACCMODE)) {
 972                 case O_RDONLY:
 973                         cap_rights_set(rightsp, CAP_READ);
 974                         break;
 975                 case O_RDWR:
 976                         cap_rights_set(rightsp, CAP_READ);
 977                         /* FALLTHROUGH */
 978                 case O_WRONLY:
 979                         cap_rights_set(rightsp, CAP_WRITE);
 980                         if (!(flags & (O_APPEND | O_TRUNC)))
 981                                 cap_rights_set(rightsp, CAP_SEEK);
 982                         break;
 983                 }
 984         }
 985
 986         if (flags & O_CREAT)
 987                 cap_rights_set(rightsp, CAP_CREATE);
 988
 989         if (flags & O_TRUNC)
 990                 cap_rights_set(rightsp, CAP_FTRUNCATE);
 991
 992         if (flags & (O_SYNC | O_FSYNC))
 993                 cap_rights_set(rightsp, CAP_FSYNC);
 994
 995         if (flags & (O_EXLOCK | O_SHLOCK))
 996                 cap_rights_set(rightsp, CAP_FLOCK);
 997 }
 998
 999 /*
1000  * Check permissions, allocate an open file structure, and call the device
1001  * open routine if any.
1002  */
1003 #ifndef _SYS_SYSPROTO_H_
1004 struct open_args {
1005         char    *path;
1006         int     flags;
1007         int     mode;
1008 };
1009 #endif
1010 int
1011 sys_open(td, uap)
1012         struct thread *td;
1013         register struct open_args /* {
1014                 char *path;
1015                 int flags;
1016                 int mode;
1017         } */ *uap;
1018 {
1019
1020         return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode));
1021 }
1022
1023 #ifndef _SYS_SYSPROTO_H_
1024 struct openat_args {
1025         int     fd;
1026         char    *path;
1027         int     flag;
1028         int     mode;
1029 };
1030 #endif
1031 int
1032 sys_openat(struct thread *td, struct openat_args *uap)
1033 {
1034
1035         return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1036             uap->mode));
1037 }
1038
1039 int
1040 kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
1041     int mode)
1042 {
1043
1044         return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode));
1045 }
1046
1047 int
1048 kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1049     int flags, int mode)
1050 {
1051         struct proc *p = td->td_proc;
1052         struct filedesc *fdp = p->p_fd;
1053         struct file *fp;
1054         struct vnode *vp;
1055         struct nameidata nd;
1056         cap_rights_t rights;
1057         int cmode, error, indx;
1058
1059         indx = -1;
1060
1061         AUDIT_ARG_FFLAGS(flags);
1062         AUDIT_ARG_MODE(mode);
1063         /* XXX: audit dirfd */
1064         cap_rights_init(&rights, CAP_LOOKUP);
1065         flags_to_rights(flags, &rights);
1066         /*
1067          * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
1068          * may be specified.
1069          */
1070         if (flags & O_EXEC) {
1071                 if (flags & O_ACCMODE)
1072                         return (EINVAL);
1073         } else if ((flags & O_ACCMODE) == O_ACCMODE) {
1074                 return (EINVAL);
1075         } else {
1076                 flags = FFLAGS(flags);
1077         }
1078
1079         /*
1080          * Allocate the file descriptor, but don't install a descriptor yet.
1081          */
1082         error = falloc_noinstall(td, &fp);
1083         if (error != 0)
1084                 return (error);
1085         /*
1086          * An extra reference on `fp' has been held for us by
1087          * falloc_noinstall().
1088          */
1089         /* Set the flags early so the finit in devfs can pick them up. */
1090         fp->f_flag = flags & FMASK;
1091         cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
1092         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
1093             &rights, td);
1094         td->td_dupfd = -1;              /* XXX check for fdopen */
1095         error = vn_open(&nd, &flags, cmode, fp);
1096         if (error != 0) {
1097                 /*
1098                  * If the vn_open replaced the method vector, something
1099                  * wonderous happened deep below and we just pass it up
1100                  * pretending we know what we do.
1101                  */
1102                 if (error == ENXIO && fp->f_ops != &badfileops)
1103                         goto success;
1104
1105                 /*
1106                  * Handle special fdopen() case. bleh.
1107                  *
1108                  * Don't do this for relative (capability) lookups; we don't
1109                  * understand exactly what would happen, and we don't think
1110                  * that it ever should.
1111                  */
1112                 if (nd.ni_strictrelative == 0 &&
1113                     (error == ENODEV || error == ENXIO) &&
1114                     td->td_dupfd >= 0) {
1115                         error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
1116                             &indx);
1117                         if (error == 0)
1118                                 goto success;
1119                 }
1120
1121                 goto bad;
1122         }
1123         td->td_dupfd = 0;
1124         NDFREE(&nd, NDF_ONLY_PNBUF);
1125         vp = nd.ni_vp;
1126
1127         /*
1128          * Store the vnode, for any f_type. Typically, the vnode use
1129          * count is decremented by direct call to vn_closefile() for
1130          * files that switched type in the cdevsw fdopen() method.
1131          */
1132         fp->f_vnode = vp;
1133         /*
1134          * If the file wasn't claimed by devfs bind it to the normal
1135          * vnode operations here.
1136          */
1137         if (fp->f_ops == &badfileops) {
1138                 KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
1139                 fp->f_seqcount = 1;
1140                 finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK),
1141                     DTYPE_VNODE, vp, &vnops);
1142         }
1143
1144         VOP_UNLOCK(vp, 0);
1145         if (flags & O_TRUNC) {
1146                 error = fo_truncate(fp, 0, td->td_ucred, td);
1147                 if (error != 0)
1148                         goto bad;
1149         }
1150 success:
1151         /*
1152          * If we haven't already installed the FD (for dupfdopen), do so now.
1153          */
1154         if (indx == -1) {
1155                 struct filecaps *fcaps;
1156
1157 #ifdef CAPABILITIES
1158                 if (nd.ni_strictrelative == 1)
1159                         fcaps = &nd.ni_filecaps;
1160                 else
1161 #endif
1162                         fcaps = NULL;
1163                 error = finstall(td, fp, &indx, flags, fcaps);
1164                 /* On success finstall() consumes fcaps. */
1165                 if (error != 0) {
1166                         filecaps_free(&nd.ni_filecaps);
1167                         goto bad;
1168                 }
1169         } else {
1170                 filecaps_free(&nd.ni_filecaps);
1171         }
1172
1173         /*
1174          * Release our private reference, leaving the one associated with
1175          * the descriptor table intact.
1176          */
1177         fdrop(fp, td);
1178         td->td_retval[0] = indx;
1179         return (0);
1180 bad:
1181         KASSERT(indx == -1, ("indx=%d, should be -1", indx));
1182         fdrop(fp, td);
1183         return (error);
1184 }
1185
#ifdef COMPAT_43
/*
 * Create a file.
 *
 * Compatibility entry point: opens the user-space path in uap through
 * kern_open() with O_WRONLY | O_CREAT | O_TRUNC and the mode in uap.
 */
#ifndef _SYS_SYSPROTO_H_
struct ocreat_args {
	char	*path;
	int	mode;
};
#endif
int
ocreat(struct thread *td, struct ocreat_args *uap)
{

	return (kern_open(td, uap->path, UIO_USERSPACE,
	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
}
#endif /* COMPAT_43 */
1209
1210 /*
1211  * Create a special file.
1212  */
1213 #ifndef _SYS_SYSPROTO_H_
1214 struct mknod_args {
1215         char    *path;
1216         int     mode;
1217         int     dev;
1218 };
1219 #endif
1220 int
1221 sys_mknod(td, uap)
1222         struct thread *td;
1223         register struct mknod_args /* {
1224                 char *path;
1225                 int mode;
1226                 int dev;
1227         } */ *uap;
1228 {
1229
1230         return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
1231 }
1232
1233 #ifndef _SYS_SYSPROTO_H_
1234 struct mknodat_args {
1235         int     fd;
1236         char    *path;
1237         mode_t  mode;
1238         dev_t   dev;
1239 };
1240 #endif
1241 int
1242 sys_mknodat(struct thread *td, struct mknodat_args *uap)
1243 {
1244
1245         return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1246             uap->dev));
1247 }
1248
1249 int
1250 kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
1251     int dev)
1252 {
1253
1254         return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev));
1255 }
1256
1257 int
1258 kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1259     int mode, int dev)
1260 {
1261         struct vnode *vp;
1262         struct mount *mp;
1263         struct vattr vattr;
1264         struct nameidata nd;
1265         cap_rights_t rights;
1266         int error, whiteout = 0;
1267
1268         AUDIT_ARG_MODE(mode);
1269         AUDIT_ARG_DEV(dev);
1270         switch (mode & S_IFMT) {
1271         case S_IFCHR:
1272         case S_IFBLK:
1273                 error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1274                 break;
1275         case S_IFMT:
1276                 error = priv_check(td, PRIV_VFS_MKNOD_BAD);
1277                 break;
1278         case S_IFWHT:
1279                 error = priv_check(td, PRIV_VFS_MKNOD_WHT);
1280                 break;
1281         case S_IFIFO:
1282                 if (dev == 0)
1283                         return (kern_mkfifoat(td, fd, path, pathseg, mode));
1284                 /* FALLTHROUGH */
1285         default:
1286                 error = EINVAL;
1287                 break;
1288         }
1289         if (error != 0)
1290                 return (error);
1291 restart:
1292         bwillwrite();
1293         NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
1294             pathseg, path, fd, cap_rights_init(&rights, CAP_MKNODAT), td);
1295         if ((error = namei(&nd)) != 0)
1296                 return (error);
1297         vp = nd.ni_vp;
1298         if (vp != NULL) {
1299                 NDFREE(&nd, NDF_ONLY_PNBUF);
1300                 if (vp == nd.ni_dvp)
1301                         vrele(nd.ni_dvp);
1302                 else
1303                         vput(nd.ni_dvp);
1304                 vrele(vp);
1305                 return (EEXIST);
1306         } else {
1307                 VATTR_NULL(&vattr);
1308                 vattr.va_mode = (mode & ALLPERMS) &
1309                     ~td->td_proc->p_fd->fd_cmask;
1310                 vattr.va_rdev = dev;
1311                 whiteout = 0;
1312
1313                 switch (mode & S_IFMT) {
1314                 case S_IFMT:    /* used by badsect to flag bad sectors */
1315                         vattr.va_type = VBAD;
1316                         break;
1317                 case S_IFCHR:
1318                         vattr.va_type = VCHR;
1319                         break;
1320                 case S_IFBLK:
1321                         vattr.va_type = VBLK;
1322                         break;
1323                 case S_IFWHT:
1324                         whiteout = 1;
1325                         break;
1326                 default:
1327                         panic("kern_mknod: invalid mode");
1328                 }
1329         }
1330         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1331                 NDFREE(&nd, NDF_ONLY_PNBUF);
1332                 vput(nd.ni_dvp);
1333                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1334                         return (error);
1335                 goto restart;
1336         }
1337 #ifdef MAC
1338         if (error == 0 && !whiteout)
1339                 error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
1340                     &nd.ni_cnd, &vattr);
1341 #endif
1342         if (error == 0) {
1343                 if (whiteout)
1344                         error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1345                 else {
1346                         error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1347                                                 &nd.ni_cnd, &vattr);
1348                         if (error == 0)
1349                                 vput(nd.ni_vp);
1350                 }
1351         }
1352         NDFREE(&nd, NDF_ONLY_PNBUF);
1353         vput(nd.ni_dvp);
1354         vn_finished_write(mp);
1355         return (error);
1356 }
1357
1358 /*
1359  * Create a named pipe.
1360  */
1361 #ifndef _SYS_SYSPROTO_H_
1362 struct mkfifo_args {
1363         char    *path;
1364         int     mode;
1365 };
1366 #endif
1367 int
1368 sys_mkfifo(td, uap)
1369         struct thread *td;
1370         register struct mkfifo_args /* {
1371                 char *path;
1372                 int mode;
1373         } */ *uap;
1374 {
1375
1376         return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
1377 }
1378
1379 #ifndef _SYS_SYSPROTO_H_
1380 struct mkfifoat_args {
1381         int     fd;
1382         char    *path;
1383         mode_t  mode;
1384 };
1385 #endif
1386 int
1387 sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1388 {
1389
1390         return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1391             uap->mode));
1392 }
1393
1394 int
1395 kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
1396 {
1397
1398         return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode));
1399 }
1400
1401 int
1402 kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1403     int mode)
1404 {
1405         struct mount *mp;
1406         struct vattr vattr;
1407         struct nameidata nd;
1408         cap_rights_t rights;
1409         int error;
1410
1411         AUDIT_ARG_MODE(mode);
1412 restart:
1413         bwillwrite();
1414         NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
1415             pathseg, path, fd, cap_rights_init(&rights, CAP_MKFIFOAT), td);
1416         if ((error = namei(&nd)) != 0)
1417                 return (error);
1418         if (nd.ni_vp != NULL) {
1419                 NDFREE(&nd, NDF_ONLY_PNBUF);
1420                 if (nd.ni_vp == nd.ni_dvp)
1421                         vrele(nd.ni_dvp);
1422                 else
1423                         vput(nd.ni_dvp);
1424                 vrele(nd.ni_vp);
1425                 return (EEXIST);
1426         }
1427         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1428                 NDFREE(&nd, NDF_ONLY_PNBUF);
1429                 vput(nd.ni_dvp);
1430                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1431                         return (error);
1432                 goto restart;
1433         }
1434         VATTR_NULL(&vattr);
1435         vattr.va_type = VFIFO;
1436         vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
1437 #ifdef MAC
1438         error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1439             &vattr);
1440         if (error != 0)
1441                 goto out;
1442 #endif
1443         error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1444         if (error == 0)
1445                 vput(nd.ni_vp);
1446 #ifdef MAC
1447 out:
1448 #endif
1449         vput(nd.ni_dvp);
1450         vn_finished_write(mp);
1451         NDFREE(&nd, NDF_ONLY_PNBUF);
1452         return (error);
1453 }
1454
1455 /*
1456  * Make a hard file link.
1457  */
1458 #ifndef _SYS_SYSPROTO_H_
1459 struct link_args {
1460         char    *path;
1461         char    *link;
1462 };
1463 #endif
1464 int
1465 sys_link(td, uap)
1466         struct thread *td;
1467         register struct link_args /* {
1468                 char *path;
1469                 char *link;
1470         } */ *uap;
1471 {
1472
1473         return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
1474 }
1475
1476 #ifndef _SYS_SYSPROTO_H_
1477 struct linkat_args {
1478         int     fd1;
1479         char    *path1;
1480         int     fd2;
1481         char    *path2;
1482         int     flag;
1483 };
1484 #endif
1485 int
1486 sys_linkat(struct thread *td, struct linkat_args *uap)
1487 {
1488         int flag;
1489
1490         flag = uap->flag;
1491         if (flag & ~AT_SYMLINK_FOLLOW)
1492                 return (EINVAL);
1493
1494         return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1495             UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
1496 }
1497
1498 int hardlink_check_uid = 0;
1499 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
1500     &hardlink_check_uid, 0,
1501     "Unprivileged processes cannot create hard links to files owned by other "
1502     "users");
1503 static int hardlink_check_gid = 0;
1504 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
1505     &hardlink_check_gid, 0,
1506     "Unprivileged processes cannot create hard links to files owned by other "
1507     "groups");
1508
1509 static int
1510 can_hardlink(struct vnode *vp, struct ucred *cred)
1511 {
1512         struct vattr va;
1513         int error;
1514
1515         if (!hardlink_check_uid && !hardlink_check_gid)
1516                 return (0);
1517
1518         error = VOP_GETATTR(vp, &va, cred);
1519         if (error != 0)
1520                 return (error);
1521
1522         if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1523                 error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1524                 if (error != 0)
1525                         return (error);
1526         }
1527
1528         if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1529                 error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1530                 if (error != 0)
1531                         return (error);
1532         }
1533
1534         return (0);
1535 }
1536
1537 int
1538 kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
1539 {
1540
1541         return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path,link, segflg, FOLLOW));
1542 }
1543
1544 int
1545 kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
1546     enum uio_seg segflg, int follow)
1547 {
1548         struct vnode *vp;
1549         struct mount *mp;
1550         struct nameidata nd;
1551         cap_rights_t rights;
1552         int error;
1553
1554 again:
1555         bwillwrite();
1556         NDINIT_AT(&nd, LOOKUP, follow | AUDITVNODE1, segflg, path1, fd1, td);
1557
1558         if ((error = namei(&nd)) != 0)
1559                 return (error);
1560         NDFREE(&nd, NDF_ONLY_PNBUF);
1561         vp = nd.ni_vp;
1562         if (vp->v_type == VDIR) {
1563                 vrele(vp);
1564                 return (EPERM);         /* POSIX */
1565         }
1566         NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE2,
1567             segflg, path2, fd2, cap_rights_init(&rights, CAP_LINKAT), td);
1568         if ((error = namei(&nd)) == 0) {
1569                 if (nd.ni_vp != NULL) {
1570                         NDFREE(&nd, NDF_ONLY_PNBUF);
1571                         if (nd.ni_dvp == nd.ni_vp)
1572                                 vrele(nd.ni_dvp);
1573                         else
1574                                 vput(nd.ni_dvp);
1575                         vrele(nd.ni_vp);
1576                         vrele(vp);
1577                         return (EEXIST);
1578                 } else if (nd.ni_dvp->v_mount != vp->v_mount) {
1579                         /*
1580                          * Cross-device link.  No need to recheck
1581                          * vp->v_type, since it cannot change, except
1582                          * to VBAD.
1583                          */
1584                         NDFREE(&nd, NDF_ONLY_PNBUF);
1585                         vput(nd.ni_dvp);
1586                         vrele(vp);
1587                         return (EXDEV);
1588                 } else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) {
1589                         error = can_hardlink(vp, td->td_ucred);
1590 #ifdef MAC
1591                         if (error == 0)
1592                                 error = mac_vnode_check_link(td->td_ucred,
1593                                     nd.ni_dvp, vp, &nd.ni_cnd);
1594 #endif
1595                         if (error != 0) {
1596                                 vput(vp);
1597                                 vput(nd.ni_dvp);
1598                                 NDFREE(&nd, NDF_ONLY_PNBUF);
1599                                 return (error);
1600                         }
1601                         error = vn_start_write(vp, &mp, V_NOWAIT);
1602                         if (error != 0) {
1603                                 vput(vp);
1604                                 vput(nd.ni_dvp);
1605                                 NDFREE(&nd, NDF_ONLY_PNBUF);
1606                                 error = vn_start_write(NULL, &mp,
1607                                     V_XSLEEP | PCATCH);
1608                                 if (error != 0)
1609                                         return (error);
1610                                 goto again;
1611                         }
1612                         error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1613                         VOP_UNLOCK(vp, 0);
1614                         vput(nd.ni_dvp);
1615                         vn_finished_write(mp);
1616                         NDFREE(&nd, NDF_ONLY_PNBUF);
1617                 } else {
1618                         vput(nd.ni_dvp);
1619                         NDFREE(&nd, NDF_ONLY_PNBUF);
1620                         vrele(vp);
1621                         goto again;
1622                 }
1623         }
1624         vrele(vp);
1625         return (error);
1626 }
1627
1628 /*
1629  * Make a symbolic link.
1630  */
1631 #ifndef _SYS_SYSPROTO_H_
1632 struct symlink_args {
1633         char    *path;
1634         char    *link;
1635 };
1636 #endif
1637 int
1638 sys_symlink(td, uap)
1639         struct thread *td;
1640         register struct symlink_args /* {
1641                 char *path;
1642                 char *link;
1643         } */ *uap;
1644 {
1645
1646         return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
1647 }
1648
1649 #ifndef _SYS_SYSPROTO_H_
1650 struct symlinkat_args {
1651         char    *path;
1652         int     fd;
1653         char    *path2;
1654 };
1655 #endif
1656 int
1657 sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1658 {
1659
1660         return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1661             UIO_USERSPACE));
1662 }
1663
1664 int
1665 kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
1666 {
1667
1668         return (kern_symlinkat(td, path, AT_FDCWD, link, segflg));
1669 }
1670
1671 int
1672 kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
1673     enum uio_seg segflg)
1674 {
1675         struct mount *mp;
1676         struct vattr vattr;
1677         char *syspath;
1678         struct nameidata nd;
1679         int error;
1680         cap_rights_t rights;
1681
1682         if (segflg == UIO_SYSSPACE) {
1683                 syspath = path1;
1684         } else {
1685                 syspath = uma_zalloc(namei_zone, M_WAITOK);
1686                 if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
1687                         goto out;
1688         }
1689         AUDIT_ARG_TEXT(syspath);
1690 restart:
1691         bwillwrite();
1692         NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
1693             segflg, path2, fd, cap_rights_init(&rights, CAP_SYMLINKAT), td);
1694         if ((error = namei(&nd)) != 0)
1695                 goto out;
1696         if (nd.ni_vp) {
1697                 NDFREE(&nd, NDF_ONLY_PNBUF);
1698                 if (nd.ni_vp == nd.ni_dvp)
1699                         vrele(nd.ni_dvp);
1700                 else
1701                         vput(nd.ni_dvp);
1702                 vrele(nd.ni_vp);
1703                 error = EEXIST;
1704                 goto out;
1705         }
1706         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1707                 NDFREE(&nd, NDF_ONLY_PNBUF);
1708                 vput(nd.ni_dvp);
1709                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1710                         goto out;
1711                 goto restart;
1712         }
1713         VATTR_NULL(&vattr);
1714         vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
1715 #ifdef MAC
1716         vattr.va_type = VLNK;
1717         error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1718             &vattr);
1719         if (error != 0)
1720                 goto out2;
1721 #endif
1722         error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
1723         if (error == 0)
1724                 vput(nd.ni_vp);
1725 #ifdef MAC
1726 out2:
1727 #endif
1728         NDFREE(&nd, NDF_ONLY_PNBUF);
1729         vput(nd.ni_dvp);
1730         vn_finished_write(mp);
1731 out:
1732         if (segflg != UIO_SYSSPACE)
1733                 uma_zfree(namei_zone, syspath);
1734         return (error);
1735 }
1736
1737 /*
1738  * Delete a whiteout from the filesystem.
1739  */
1740 int
1741 sys_undelete(td, uap)
1742         struct thread *td;
1743         register struct undelete_args /* {
1744                 char *path;
1745         } */ *uap;
1746 {
1747         struct mount *mp;
1748         struct nameidata nd;
1749         int error;
1750
1751 restart:
1752         bwillwrite();
1753         NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
1754             UIO_USERSPACE, uap->path, td);
1755         error = namei(&nd);
1756         if (error != 0)
1757                 return (error);
1758
1759         if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
1760                 NDFREE(&nd, NDF_ONLY_PNBUF);
1761                 if (nd.ni_vp == nd.ni_dvp)
1762                         vrele(nd.ni_dvp);
1763                 else
1764                         vput(nd.ni_dvp);
1765                 if (nd.ni_vp)
1766                         vrele(nd.ni_vp);
1767                 return (EEXIST);
1768         }
1769         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1770                 NDFREE(&nd, NDF_ONLY_PNBUF);
1771                 vput(nd.ni_dvp);
1772                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1773                         return (error);
1774                 goto restart;
1775         }
1776         error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
1777         NDFREE(&nd, NDF_ONLY_PNBUF);
1778         vput(nd.ni_dvp);
1779         vn_finished_write(mp);
1780         return (error);
1781 }
1782
1783 /*
1784  * Delete a name from the filesystem.
1785  */
1786 #ifndef _SYS_SYSPROTO_H_
1787 struct unlink_args {
1788         char    *path;
1789 };
1790 #endif
1791 int
1792 sys_unlink(td, uap)
1793         struct thread *td;
1794         struct unlink_args /* {
1795                 char *path;
1796         } */ *uap;
1797 {
1798
1799         return (kern_unlink(td, uap->path, UIO_USERSPACE));
1800 }
1801
1802 #ifndef _SYS_SYSPROTO_H_
1803 struct unlinkat_args {
1804         int     fd;
1805         char    *path;
1806         int     flag;
1807 };
1808 #endif
1809 int
1810 sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1811 {
1812         int flag = uap->flag;
1813         int fd = uap->fd;
1814         char *path = uap->path;
1815
1816         if (flag & ~AT_REMOVEDIR)
1817                 return (EINVAL);
1818
1819         if (flag & AT_REMOVEDIR)
1820                 return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
1821         else
1822                 return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
1823 }
1824
1825 int
1826 kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
1827 {
1828
1829         return (kern_unlinkat(td, AT_FDCWD, path, pathseg, 0));
1830 }
1831
1832 int
1833 kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1834     ino_t oldinum)
1835 {
1836         struct mount *mp;
1837         struct vnode *vp;
1838         struct nameidata nd;
1839         struct stat sb;
1840         cap_rights_t rights;
1841         int error;
1842
1843 restart:
1844         bwillwrite();
1845         NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
1846             pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
1847         if ((error = namei(&nd)) != 0)
1848                 return (error == EINVAL ? EPERM : error);
1849         vp = nd.ni_vp;
1850         if (vp->v_type == VDIR && oldinum == 0) {
1851                 error = EPERM;          /* POSIX */
1852         } else if (oldinum != 0 &&
1853                   ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
1854                   sb.st_ino != oldinum) {
1855                         error = EIDRM;  /* Identifier removed */
1856         } else {
1857                 /*
1858                  * The root of a mounted filesystem cannot be deleted.
1859                  *
1860                  * XXX: can this only be a VDIR case?
1861                  */
1862                 if (vp->v_vflag & VV_ROOT)
1863                         error = EBUSY;
1864         }
1865         if (error == 0) {
1866                 if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1867                         NDFREE(&nd, NDF_ONLY_PNBUF);
1868                         vput(nd.ni_dvp);
1869                         if (vp == nd.ni_dvp)
1870                                 vrele(vp);
1871                         else
1872                                 vput(vp);
1873                         if ((error = vn_start_write(NULL, &mp,
1874                             V_XSLEEP | PCATCH)) != 0)
1875                                 return (error);
1876                         goto restart;
1877                 }
1878 #ifdef MAC
1879                 error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
1880                     &nd.ni_cnd);
1881                 if (error != 0)
1882                         goto out;
1883 #endif
1884                 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
1885                 error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
1886 #ifdef MAC
1887 out:
1888 #endif
1889                 vn_finished_write(mp);
1890         }
1891         NDFREE(&nd, NDF_ONLY_PNBUF);
1892         vput(nd.ni_dvp);
1893         if (vp == nd.ni_dvp)
1894                 vrele(vp);
1895         else
1896                 vput(vp);
1897         return (error);
1898 }
1899
1900 /*
1901  * Reposition read/write file offset.
1902  */
1903 #ifndef _SYS_SYSPROTO_H_
1904 struct lseek_args {
1905         int     fd;
1906         int     pad;
1907         off_t   offset;
1908         int     whence;
1909 };
1910 #endif
1911 int
1912 sys_lseek(td, uap)
1913         struct thread *td;
1914         register struct lseek_args /* {
1915                 int fd;
1916                 int pad;
1917                 off_t offset;
1918                 int whence;
1919         } */ *uap;
1920 {
1921         struct file *fp;
1922         cap_rights_t rights;
1923         int error;
1924
1925         AUDIT_ARG_FD(uap->fd);
1926         error = fget(td, uap->fd, cap_rights_init(&rights, CAP_SEEK), &fp);
1927         if (error != 0)
1928                 return (error);
1929         error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
1930             fo_seek(fp, uap->offset, uap->whence, td) : ESPIPE;
1931         fdrop(fp, td);
1932         return (error);
1933 }
1934
1935 #if defined(COMPAT_43)
1936 /*
1937  * Reposition read/write file offset.
1938  */
1939 #ifndef _SYS_SYSPROTO_H_
1940 struct olseek_args {
1941         int     fd;
1942         long    offset;
1943         int     whence;
1944 };
1945 #endif
1946 int
1947 olseek(td, uap)
1948         struct thread *td;
1949         register struct olseek_args /* {
1950                 int fd;
1951                 long offset;
1952                 int whence;
1953         } */ *uap;
1954 {
1955         struct lseek_args /* {
1956                 int fd;
1957                 int pad;
1958                 off_t offset;
1959                 int whence;
1960         } */ nuap;
1961
1962         nuap.fd = uap->fd;
1963         nuap.offset = uap->offset;
1964         nuap.whence = uap->whence;
1965         return (sys_lseek(td, &nuap));
1966 }
1967 #endif /* COMPAT_43 */
1968
1969 /* Version with the 'pad' argument */
1970 int
1971 freebsd6_lseek(td, uap)
1972         struct thread *td;
1973         register struct freebsd6_lseek_args *uap;
1974 {
1975         struct lseek_args ouap;
1976
1977         ouap.fd = uap->fd;
1978         ouap.offset = uap->offset;
1979         ouap.whence = uap->whence;
1980         return (sys_lseek(td, &ouap));
1981 }
1982
1983 /*
1984  * Check access permissions using passed credentials.
1985  */
1986 static int
1987 vn_access(vp, user_flags, cred, td)
1988         struct vnode    *vp;
1989         int             user_flags;
1990         struct ucred    *cred;
1991         struct thread   *td;
1992 {
1993         accmode_t accmode;
1994         int error;
1995
1996         /* Flags == 0 means only check for existence. */
1997         error = 0;
1998         if (user_flags) {
1999                 accmode = 0;
2000                 if (user_flags & R_OK)
2001                         accmode |= VREAD;
2002                 if (user_flags & W_OK)
2003                         accmode |= VWRITE;
2004                 if (user_flags & X_OK)
2005                         accmode |= VEXEC;
2006 #ifdef MAC
2007                 error = mac_vnode_check_access(cred, vp, accmode);
2008                 if (error != 0)
2009                         return (error);
2010 #endif
2011                 if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
2012                         error = VOP_ACCESS(vp, accmode, cred, td);
2013         }
2014         return (error);
2015 }
2016
2017 /*
2018  * Check access permissions using "real" credentials.
2019  */
2020 #ifndef _SYS_SYSPROTO_H_
2021 struct access_args {
2022         char    *path;
2023         int     amode;
2024 };
2025 #endif
2026 int
2027 sys_access(td, uap)
2028         struct thread *td;
2029         register struct access_args /* {
2030                 char *path;
2031                 int amode;
2032         } */ *uap;
2033 {
2034
2035         return (kern_access(td, uap->path, UIO_USERSPACE, uap->amode));
2036 }
2037
2038 #ifndef _SYS_SYSPROTO_H_
2039 struct faccessat_args {
2040         int     dirfd;
2041         char    *path;
2042         int     amode;
2043         int     flag;
2044 }
2045 #endif
2046 int
2047 sys_faccessat(struct thread *td, struct faccessat_args *uap)
2048 {
2049
2050         if (uap->flag & ~AT_EACCESS)
2051                 return (EINVAL);
2052         return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
2053             uap->amode));
2054 }
2055
2056 int
2057 kern_access(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2058 {
2059
2060         return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, amode));
2061 }
2062
2063 int
2064 kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2065     int flag, int amode)
2066 {
2067         struct ucred *cred, *tmpcred;
2068         struct vnode *vp;
2069         struct nameidata nd;
2070         cap_rights_t rights;
2071         int error;
2072
2073         if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0)
2074                 return (EINVAL);
2075
2076         /*
2077          * Create and modify a temporary credential instead of one that
2078          * is potentially shared.
2079          */
2080         if (!(flag & AT_EACCESS)) {
2081                 cred = td->td_ucred;
2082                 tmpcred = crdup(cred);
2083                 tmpcred->cr_uid = cred->cr_ruid;
2084                 tmpcred->cr_groups[0] = cred->cr_rgid;
2085                 td->td_ucred = tmpcred;
2086         } else
2087                 cred = tmpcred = td->td_ucred;
2088         AUDIT_ARG_VALUE(amode);
2089         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
2090             AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FSTAT),
2091             td);
2092         if ((error = namei(&nd)) != 0)
2093                 goto out1;
2094         vp = nd.ni_vp;
2095
2096         error = vn_access(vp, amode, tmpcred, td);
2097         NDFREE(&nd, NDF_ONLY_PNBUF);
2098         vput(vp);
2099 out1:
2100         if (!(flag & AT_EACCESS)) {
2101                 td->td_ucred = cred;
2102                 crfree(tmpcred);
2103         }
2104         return (error);
2105 }
2106
2107 /*
2108  * Check access permissions using "effective" credentials.
2109  */
2110 #ifndef _SYS_SYSPROTO_H_
2111 struct eaccess_args {
2112         char    *path;
2113         int     amode;
2114 };
2115 #endif
2116 int
2117 sys_eaccess(td, uap)
2118         struct thread *td;
2119         register struct eaccess_args /* {
2120                 char *path;
2121                 int amode;
2122         } */ *uap;
2123 {
2124
2125         return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->amode));
2126 }
2127
2128 int
2129 kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2130 {
2131
2132         return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, amode));
2133 }
2134
2135 #if defined(COMPAT_43)
2136 /*
2137  * Get file status; this version follows links.
2138  */
2139 #ifndef _SYS_SYSPROTO_H_
2140 struct ostat_args {
2141         char    *path;
2142         struct ostat *ub;
2143 };
2144 #endif
2145 int
2146 ostat(td, uap)
2147         struct thread *td;
2148         register struct ostat_args /* {
2149                 char *path;
2150                 struct ostat *ub;
2151         } */ *uap;
2152 {
2153         struct stat sb;
2154         struct ostat osb;
2155         int error;
2156
2157         error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2158         if (error != 0)
2159                 return (error);
2160         cvtstat(&sb, &osb);
2161         return (copyout(&osb, uap->ub, sizeof (osb)));
2162 }
2163
2164 /*
2165  * Get file status; this version does not follow links.
2166  */
2167 #ifndef _SYS_SYSPROTO_H_
2168 struct olstat_args {
2169         char    *path;
2170         struct ostat *ub;
2171 };
2172 #endif
2173 int
2174 olstat(td, uap)
2175         struct thread *td;
2176         register struct olstat_args /* {
2177                 char *path;
2178                 struct ostat *ub;
2179         } */ *uap;
2180 {
2181         struct stat sb;
2182         struct ostat osb;
2183         int error;
2184
2185         error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2186         if (error != 0)
2187                 return (error);
2188         cvtstat(&sb, &osb);
2189         return (copyout(&osb, uap->ub, sizeof (osb)));
2190 }
2191
2192 /*
2193  * Convert from an old to a new stat structure.
2194  */
2195 void
2196 cvtstat(st, ost)
2197         struct stat *st;
2198         struct ostat *ost;
2199 {
2200
2201         ost->st_dev = st->st_dev;
2202         ost->st_ino = st->st_ino;
2203         ost->st_mode = st->st_mode;
2204         ost->st_nlink = st->st_nlink;
2205         ost->st_uid = st->st_uid;
2206         ost->st_gid = st->st_gid;
2207         ost->st_rdev = st->st_rdev;
2208         if (st->st_size < (quad_t)1 << 32)
2209                 ost->st_size = st->st_size;
2210         else
2211                 ost->st_size = -2;
2212         ost->st_atim = st->st_atim;
2213         ost->st_mtim = st->st_mtim;
2214         ost->st_ctim = st->st_ctim;
2215         ost->st_blksize = st->st_blksize;
2216         ost->st_blocks = st->st_blocks;
2217         ost->st_flags = st->st_flags;
2218         ost->st_gen = st->st_gen;
2219 }
2220 #endif /* COMPAT_43 */
2221
2222 /*
2223  * Get file status; this version follows links.
2224  */
2225 #ifndef _SYS_SYSPROTO_H_
2226 struct stat_args {
2227         char    *path;
2228         struct stat *ub;
2229 };
2230 #endif
2231 int
2232 sys_stat(td, uap)
2233         struct thread *td;
2234         register struct stat_args /* {
2235                 char *path;
2236                 struct stat *ub;
2237         } */ *uap;
2238 {
2239         struct stat sb;
2240         int error;
2241
2242         error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2243         if (error == 0)
2244                 error = copyout(&sb, uap->ub, sizeof (sb));
2245         return (error);
2246 }
2247
2248 #ifndef _SYS_SYSPROTO_H_
2249 struct fstatat_args {
2250         int     fd;
2251         char    *path;
2252         struct stat     *buf;
2253         int     flag;
2254 }
2255 #endif
2256 int
2257 sys_fstatat(struct thread *td, struct fstatat_args *uap)
2258 {
2259         struct stat sb;
2260         int error;
2261
2262         error = kern_statat(td, uap->flag, uap->fd, uap->path,
2263             UIO_USERSPACE, &sb);
2264         if (error == 0)
2265                 error = copyout(&sb, uap->buf, sizeof (sb));
2266         return (error);
2267 }
2268
2269 int
2270 kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2271 {
2272
2273         return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp));
2274 }
2275
2276 int
2277 kern_statat(struct thread *td, int flag, int fd, char *path,
2278     enum uio_seg pathseg, struct stat *sbp)
2279 {
2280
2281         return (kern_statat_vnhook(td, flag, fd, path, pathseg, sbp, NULL));
2282 }
2283
2284 int
2285 kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
2286     enum uio_seg pathseg, struct stat *sbp,
2287     void (*hook)(struct vnode *vp, struct stat *sbp))
2288 {
2289         struct nameidata nd;
2290         struct stat sb;
2291         cap_rights_t rights;
2292         int error;
2293
2294         if (flag & ~AT_SYMLINK_NOFOLLOW)
2295                 return (EINVAL);
2296
2297         NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2298             FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd,
2299             cap_rights_init(&rights, CAP_FSTAT), td);
2300
2301         if ((error = namei(&nd)) != 0)
2302                 return (error);
2303         error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
2304         if (error == 0) {
2305                 SDT_PROBE(vfs, , stat, mode, path, sb.st_mode, 0, 0, 0);
2306                 if (S_ISREG(sb.st_mode))
2307                         SDT_PROBE(vfs, , stat, reg, path, pathseg, 0, 0, 0);
2308                 if (__predict_false(hook != NULL))
2309                         hook(nd.ni_vp, &sb);
2310         }
2311         NDFREE(&nd, NDF_ONLY_PNBUF);
2312         vput(nd.ni_vp);
2313         if (error != 0)
2314                 return (error);
2315         *sbp = sb;
2316 #ifdef KTRACE
2317         if (KTRPOINT(td, KTR_STRUCT))
2318                 ktrstat(&sb);
2319 #endif
2320         return (0);
2321 }
2322
2323 /*
2324  * Get file status; this version does not follow links.
2325  */
2326 #ifndef _SYS_SYSPROTO_H_
2327 struct lstat_args {
2328         char    *path;
2329         struct stat *ub;
2330 };
2331 #endif
2332 int
2333 sys_lstat(td, uap)
2334         struct thread *td;
2335         register struct lstat_args /* {
2336                 char *path;
2337                 struct stat *ub;
2338         } */ *uap;
2339 {
2340         struct stat sb;
2341         int error;
2342
2343         error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2344         if (error == 0)
2345                 error = copyout(&sb, uap->ub, sizeof (sb));
2346         return (error);
2347 }
2348
2349 int
2350 kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2351 {
2352
2353         return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg,
2354             sbp));
2355 }
2356
2357 /*
2358  * Implementation of the NetBSD [l]stat() functions.
2359  */
2360 void
2361 cvtnstat(sb, nsb)
2362         struct stat *sb;
2363         struct nstat *nsb;
2364 {
2365
2366         bzero(nsb, sizeof *nsb);
2367         nsb->st_dev = sb->st_dev;
2368         nsb->st_ino = sb->st_ino;
2369         nsb->st_mode = sb->st_mode;
2370         nsb->st_nlink = sb->st_nlink;
2371         nsb->st_uid = sb->st_uid;
2372         nsb->st_gid = sb->st_gid;
2373         nsb->st_rdev = sb->st_rdev;
2374         nsb->st_atim = sb->st_atim;
2375         nsb->st_mtim = sb->st_mtim;
2376         nsb->st_ctim = sb->st_ctim;
2377         nsb->st_size = sb->st_size;
2378         nsb->st_blocks = sb->st_blocks;
2379         nsb->st_blksize = sb->st_blksize;
2380         nsb->st_flags = sb->st_flags;
2381         nsb->st_gen = sb->st_gen;
2382         nsb->st_birthtim = sb->st_birthtim;
2383 }
2384
2385 #ifndef _SYS_SYSPROTO_H_
2386 struct nstat_args {
2387         char    *path;
2388         struct nstat *ub;
2389 };
2390 #endif
2391 int
2392 sys_nstat(td, uap)
2393         struct thread *td;
2394         register struct nstat_args /* {
2395                 char *path;
2396                 struct nstat *ub;
2397         } */ *uap;
2398 {
2399         struct stat sb;
2400         struct nstat nsb;
2401         int error;
2402
2403         error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2404         if (error != 0)
2405                 return (error);
2406         cvtnstat(&sb, &nsb);
2407         return (copyout(&nsb, uap->ub, sizeof (nsb)));
2408 }
2409
2410 /*
2411  * NetBSD lstat.  Get file status; this version does not follow links.
2412  */
2413 #ifndef _SYS_SYSPROTO_H_
2414 struct lstat_args {
2415         char    *path;
2416         struct stat *ub;
2417 };
2418 #endif
2419 int
2420 sys_nlstat(td, uap)
2421         struct thread *td;
2422         register struct nlstat_args /* {
2423                 char *path;
2424                 struct nstat *ub;
2425         } */ *uap;
2426 {
2427         struct stat sb;
2428         struct nstat nsb;
2429         int error;
2430
2431         error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2432         if (error != 0)
2433                 return (error);
2434         cvtnstat(&sb, &nsb);
2435         return (copyout(&nsb, uap->ub, sizeof (nsb)));
2436 }
2437
2438 /*
2439  * Get configurable pathname variables.
2440  */
2441 #ifndef _SYS_SYSPROTO_H_
2442 struct pathconf_args {
2443         char    *path;
2444         int     name;
2445 };
2446 #endif
2447 int
2448 sys_pathconf(td, uap)
2449         struct thread *td;
2450         register struct pathconf_args /* {
2451                 char *path;
2452                 int name;
2453         } */ *uap;
2454 {
2455
2456         return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
2457 }
2458
2459 #ifndef _SYS_SYSPROTO_H_
2460 struct lpathconf_args {
2461         char    *path;
2462         int     name;
2463 };
2464 #endif
2465 int
2466 sys_lpathconf(td, uap)
2467         struct thread *td;
2468         register struct lpathconf_args /* {
2469                 char *path;
2470                 int name;
2471         } */ *uap;
2472 {
2473
2474         return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
2475             NOFOLLOW));
2476 }
2477
2478 int
2479 kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
2480     u_long flags)
2481 {
2482         struct nameidata nd;
2483         int error;
2484
2485         NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
2486             pathseg, path, td);
2487         if ((error = namei(&nd)) != 0)
2488                 return (error);
2489         NDFREE(&nd, NDF_ONLY_PNBUF);
2490
2491         /* If asynchronous I/O is available, it works for all files. */
2492         if (name == _PC_ASYNC_IO)
2493                 td->td_retval[0] = async_io_version;
2494         else
2495                 error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
2496         vput(nd.ni_vp);
2497         return (error);
2498 }
2499
2500 /*
2501  * Return target name of a symbolic link.
2502  */
2503 #ifndef _SYS_SYSPROTO_H_
2504 struct readlink_args {
2505         char    *path;
2506         char    *buf;
2507         size_t  count;
2508 };
2509 #endif
2510 int
2511 sys_readlink(td, uap)
2512         struct thread *td;
2513         register struct readlink_args /* {
2514                 char *path;
2515                 char *buf;
2516                 size_t count;
2517         } */ *uap;
2518 {
2519
2520         return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
2521             UIO_USERSPACE, uap->count));
2522 }
2523 #ifndef _SYS_SYSPROTO_H_
2524 struct readlinkat_args {
2525         int     fd;
2526         char    *path;
2527         char    *buf;
2528         size_t  bufsize;
2529 };
2530 #endif
2531 int
2532 sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2533 {
2534
2535         return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2536             uap->buf, UIO_USERSPACE, uap->bufsize));
2537 }
2538
2539 int
2540 kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
2541     enum uio_seg bufseg, size_t count)
2542 {
2543
2544         return (kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg,
2545             count));
2546 }
2547
2548 int
2549 kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2550     char *buf, enum uio_seg bufseg, size_t count)
2551 {
2552         struct vnode *vp;
2553         struct iovec aiov;
2554         struct uio auio;
2555         struct nameidata nd;
2556         int error;
2557
2558         if (count > IOSIZE_MAX)
2559                 return (EINVAL);
2560
2561         NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
2562             pathseg, path, fd, td);
2563
2564         if ((error = namei(&nd)) != 0)
2565                 return (error);
2566         NDFREE(&nd, NDF_ONLY_PNBUF);
2567         vp = nd.ni_vp;
2568 #ifdef MAC
2569         error = mac_vnode_check_readlink(td->td_ucred, vp);
2570         if (error != 0) {
2571                 vput(vp);
2572                 return (error);
2573         }
2574 #endif
2575         if (vp->v_type != VLNK)
2576                 error = EINVAL;
2577         else {
2578                 aiov.iov_base = buf;
2579                 aiov.iov_len = count;
2580                 auio.uio_iov = &aiov;
2581                 auio.uio_iovcnt = 1;
2582                 auio.uio_offset = 0;
2583                 auio.uio_rw = UIO_READ;
2584                 auio.uio_segflg = bufseg;
2585                 auio.uio_td = td;
2586                 auio.uio_resid = count;
2587                 error = VOP_READLINK(vp, &auio, td->td_ucred);
2588                 td->td_retval[0] = count - auio.uio_resid;
2589         }
2590         vput(vp);
2591         return (error);
2592 }
2593
2594 /*
2595  * Common implementation code for chflags() and fchflags().
2596  */
2597 static int
2598 setfflags(td, vp, flags)
2599         struct thread *td;
2600         struct vnode *vp;
2601         u_long flags;
2602 {
2603         struct mount *mp;
2604         struct vattr vattr;
2605         int error;
2606
2607         /* We can't support the value matching VNOVAL. */
2608         if (flags == VNOVAL)
2609                 return (EOPNOTSUPP);
2610
2611         /*
2612          * Prevent non-root users from setting flags on devices.  When
2613          * a device is reused, users can retain ownership of the device
2614          * if they are allowed to set flags and programs assume that
2615          * chown can't fail when done as root.
2616          */
2617         if (vp->v_type == VCHR || vp->v_type == VBLK) {
2618                 error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2619                 if (error != 0)
2620                         return (error);
2621         }
2622
2623         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2624                 return (error);
2625         VATTR_NULL(&vattr);
2626         vattr.va_flags = flags;
2627         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2628 #ifdef MAC
2629         error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2630         if (error == 0)
2631 #endif
2632                 error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2633         VOP_UNLOCK(vp, 0);
2634         vn_finished_write(mp);
2635         return (error);
2636 }
2637
2638 /*
2639  * Change flags of a file given a path name.
2640  */
2641 #ifndef _SYS_SYSPROTO_H_
2642 struct chflags_args {
2643         const char *path;
2644         u_long  flags;
2645 };
2646 #endif
2647 int
2648 sys_chflags(td, uap)
2649         struct thread *td;
2650         register struct chflags_args /* {
2651                 const char *path;
2652                 u_long flags;
2653         } */ *uap;
2654 {
2655
2656         return (kern_chflags(td, uap->path, UIO_USERSPACE, uap->flags));
2657 }
2658
2659 #ifndef _SYS_SYSPROTO_H_
2660 struct chflagsat_args {
2661         int     fd;
2662         const char *path;
2663         u_long  flags;
2664         int     atflag;
2665 }
2666 #endif
2667 int
2668 sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
2669 {
2670         int fd = uap->fd;
2671         const char *path = uap->path;
2672         u_long flags = uap->flags;
2673         int atflag = uap->atflag;
2674
2675         if (atflag & ~AT_SYMLINK_NOFOLLOW)
2676                 return (EINVAL);
2677
2678         return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag));
2679 }
2680
2681 static int
2682 kern_chflags(struct thread *td, const char *path, enum uio_seg pathseg,
2683     u_long flags)
2684 {
2685
2686         return (kern_chflagsat(td, AT_FDCWD, path, pathseg, flags, 0));
2687 }
2688
2689 /*
2690  * Same as chflags() but doesn't follow symlinks.
2691  */
2692 int
2693 sys_lchflags(td, uap)
2694         struct thread *td;
2695         register struct lchflags_args /* {
2696                 const char *path;
2697                 u_long flags;
2698         } */ *uap;
2699 {
2700
2701         return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2702             uap->flags, AT_SYMLINK_NOFOLLOW));
2703 }
2704
2705 static int
2706 kern_chflagsat(struct thread *td, int fd, const char *path,
2707     enum uio_seg pathseg, u_long flags, int atflag)
2708 {
2709         struct nameidata nd;
2710         cap_rights_t rights;
2711         int error, follow;
2712
2713         AUDIT_ARG_FFLAGS(flags);
2714         follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2715         NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2716             cap_rights_init(&rights, CAP_FCHFLAGS), td);
2717         if ((error = namei(&nd)) != 0)
2718                 return (error);
2719         NDFREE(&nd, NDF_ONLY_PNBUF);
2720         error = setfflags(td, nd.ni_vp, flags);
2721         vrele(nd.ni_vp);
2722         return (error);
2723 }
2724
2725 /*
2726  * Change flags of a file given a file descriptor.
2727  */
2728 #ifndef _SYS_SYSPROTO_H_
2729 struct fchflags_args {
2730         int     fd;
2731         u_long  flags;
2732 };
2733 #endif
2734 int
2735 sys_fchflags(td, uap)
2736         struct thread *td;
2737         register struct fchflags_args /* {
2738                 int fd;
2739                 u_long flags;
2740         } */ *uap;
2741 {
2742         struct file *fp;
2743         cap_rights_t rights;
2744         int error;
2745
2746         AUDIT_ARG_FD(uap->fd);
2747         AUDIT_ARG_FFLAGS(uap->flags);
2748         error = getvnode(td->td_proc->p_fd, uap->fd,
2749             cap_rights_init(&rights, CAP_FCHFLAGS), &fp);
2750         if (error != 0)
2751                 return (error);
2752 #ifdef AUDIT
2753         vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2754         AUDIT_ARG_VNODE1(fp->f_vnode);
2755         VOP_UNLOCK(fp->f_vnode, 0);
2756 #endif
2757         error = setfflags(td, fp->f_vnode, uap->flags);
2758         fdrop(fp, td);
2759         return (error);
2760 }
2761
2762 /*
2763  * Common implementation code for chmod(), lchmod() and fchmod().
2764  */
2765 int
2766 setfmode(td, cred, vp, mode)
2767         struct thread *td;
2768         struct ucred *cred;
2769         struct vnode *vp;
2770         int mode;
2771 {
2772         struct mount *mp;
2773         struct vattr vattr;
2774         int error;
2775
2776         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2777                 return (error);
2778         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2779         VATTR_NULL(&vattr);
2780         vattr.va_mode = mode & ALLPERMS;
2781 #ifdef MAC
2782         error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2783         if (error == 0)
2784 #endif
2785                 error = VOP_SETATTR(vp, &vattr, cred);
2786         VOP_UNLOCK(vp, 0);
2787         vn_finished_write(mp);
2788         return (error);
2789 }
2790
2791 /*
2792  * Change mode of a file given path name.
2793  */
2794 #ifndef _SYS_SYSPROTO_H_
2795 struct chmod_args {
2796         char    *path;
2797         int     mode;
2798 };
2799 #endif
2800 int
2801 sys_chmod(td, uap)
2802         struct thread *td;
2803         register struct chmod_args /* {
2804                 char *path;
2805                 int mode;
2806         } */ *uap;
2807 {
2808
2809         return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
2810 }
2811
2812 #ifndef _SYS_SYSPROTO_H_
2813 struct fchmodat_args {
2814         int     dirfd;
2815         char    *path;
2816         mode_t  mode;
2817         int     flag;
2818 }
2819 #endif
2820 int
2821 sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2822 {
2823         int flag = uap->flag;
2824         int fd = uap->fd;
2825         char *path = uap->path;
2826         mode_t mode = uap->mode;
2827
2828         if (flag & ~AT_SYMLINK_NOFOLLOW)
2829                 return (EINVAL);
2830
2831         return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
2832 }
2833
2834 int
2835 kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
2836 {
2837
2838         return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0));
2839 }
2840
2841 /*
2842  * Change mode of a file given path name (don't follow links.)
2843  */
2844 #ifndef _SYS_SYSPROTO_H_
2845 struct lchmod_args {
2846         char    *path;
2847         int     mode;
2848 };
2849 #endif
2850 int
2851 sys_lchmod(td, uap)
2852         struct thread *td;
2853         register struct lchmod_args /* {
2854                 char *path;
2855                 int mode;
2856         } */ *uap;
2857 {
2858
2859         return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2860             uap->mode, AT_SYMLINK_NOFOLLOW));
2861 }
2862
2863 int
2864 kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2865     mode_t mode, int flag)
2866 {
2867         struct nameidata nd;
2868         cap_rights_t rights;
2869         int error, follow;
2870
2871         AUDIT_ARG_MODE(mode);
2872         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2873         NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2874             cap_rights_init(&rights, CAP_FCHMOD), td);
2875         if ((error = namei(&nd)) != 0)
2876                 return (error);
2877         NDFREE(&nd, NDF_ONLY_PNBUF);
2878         error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
2879         vrele(nd.ni_vp);
2880         return (error);
2881 }
2882
2883 /*
2884  * Change mode of a file given a file descriptor.
2885  */
2886 #ifndef _SYS_SYSPROTO_H_
2887 struct fchmod_args {
2888         int     fd;
2889         int     mode;
2890 };
2891 #endif
2892 int
2893 sys_fchmod(struct thread *td, struct fchmod_args *uap)
2894 {
2895         struct file *fp;
2896         cap_rights_t rights;
2897         int error;
2898
2899         AUDIT_ARG_FD(uap->fd);
2900         AUDIT_ARG_MODE(uap->mode);
2901
2902         error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHMOD), &fp);
2903         if (error != 0)
2904                 return (error);
2905         error = fo_chmod(fp, uap->mode, td->td_ucred, td);
2906         fdrop(fp, td);
2907         return (error);
2908 }
2909
2910 /*
2911  * Common implementation for chown(), lchown(), and fchown()
2912  */
2913 int
2914 setfown(td, cred, vp, uid, gid)
2915         struct thread *td;
2916         struct ucred *cred;
2917         struct vnode *vp;
2918         uid_t uid;
2919         gid_t gid;
2920 {
2921         struct mount *mp;
2922         struct vattr vattr;
2923         int error;
2924
2925         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2926                 return (error);
2927         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2928         VATTR_NULL(&vattr);
2929         vattr.va_uid = uid;
2930         vattr.va_gid = gid;
2931 #ifdef MAC
2932         error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
2933             vattr.va_gid);
2934         if (error == 0)
2935 #endif
2936                 error = VOP_SETATTR(vp, &vattr, cred);
2937         VOP_UNLOCK(vp, 0);
2938         vn_finished_write(mp);
2939         return (error);
2940 }
2941
2942 /*
2943  * Set ownership given a path name.
2944  */
2945 #ifndef _SYS_SYSPROTO_H_
2946 struct chown_args {
2947         char    *path;
2948         int     uid;
2949         int     gid;
2950 };
2951 #endif
2952 int
2953 sys_chown(td, uap)
2954         struct thread *td;
2955         register struct chown_args /* {
2956                 char *path;
2957                 int uid;
2958                 int gid;
2959         } */ *uap;
2960 {
2961
2962         return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
2963 }
2964
2965 #ifndef _SYS_SYSPROTO_H_
2966 struct fchownat_args {
2967         int fd;
2968         const char * path;
2969         uid_t uid;
2970         gid_t gid;
2971         int flag;
2972 };
2973 #endif
2974 int
2975 sys_fchownat(struct thread *td, struct fchownat_args *uap)
2976 {
2977         int flag;
2978
2979         flag = uap->flag;
2980         if (flag & ~AT_SYMLINK_NOFOLLOW)
2981                 return (EINVAL);
2982
2983         return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
2984             uap->gid, uap->flag));
2985 }
2986
2987 int
2988 kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
2989     int gid)
2990 {
2991
2992         return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0));
2993 }
2994
2995 int
2996 kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2997     int uid, int gid, int flag)
2998 {
2999         struct nameidata nd;
3000         cap_rights_t rights;
3001         int error, follow;
3002
3003         AUDIT_ARG_OWNER(uid, gid);
3004         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3005         NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
3006             cap_rights_init(&rights, CAP_FCHOWN), td);
3007
3008         if ((error = namei(&nd)) != 0)
3009                 return (error);
3010         NDFREE(&nd, NDF_ONLY_PNBUF);
3011         error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
3012         vrele(nd.ni_vp);
3013         return (error);
3014 }
3015
3016 /*
3017  * Set ownership given a path name, do not cross symlinks.
3018  */
3019 #ifndef _SYS_SYSPROTO_H_
3020 struct lchown_args {
3021         char    *path;
3022         int     uid;
3023         int     gid;
3024 };
3025 #endif
3026 int
3027 sys_lchown(td, uap)
3028         struct thread *td;
3029         register struct lchown_args /* {
3030                 char *path;
3031                 int uid;
3032                 int gid;
3033         } */ *uap;
3034 {
3035
3036         return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
3037 }
3038
3039 int
3040 kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
3041     int gid)
3042 {
3043
3044         return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid,
3045             AT_SYMLINK_NOFOLLOW));
3046 }
3047
3048 /*
3049  * Set ownership given a file descriptor.
3050  */
3051 #ifndef _SYS_SYSPROTO_H_
3052 struct fchown_args {
3053         int     fd;
3054         int     uid;
3055         int     gid;
3056 };
3057 #endif
3058 int
3059 sys_fchown(td, uap)
3060         struct thread *td;
3061         register struct fchown_args /* {
3062                 int fd;
3063                 int uid;
3064                 int gid;
3065         } */ *uap;
3066 {
3067         struct file *fp;
3068         cap_rights_t rights;
3069         int error;
3070
3071         AUDIT_ARG_FD(uap->fd);
3072         AUDIT_ARG_OWNER(uap->uid, uap->gid);
3073         error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHOWN), &fp);
3074         if (error != 0)
3075                 return (error);
3076         error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
3077         fdrop(fp, td);
3078         return (error);
3079 }
3080
3081 /*
3082  * Common implementation code for utimes(), lutimes(), and futimes().
3083  */
3084 static int
3085 getutimes(usrtvp, tvpseg, tsp)
3086         const struct timeval *usrtvp;
3087         enum uio_seg tvpseg;
3088         struct timespec *tsp;
3089 {
3090         struct timeval tv[2];
3091         const struct timeval *tvp;
3092         int error;
3093
3094         if (usrtvp == NULL) {
3095                 vfs_timestamp(&tsp[0]);
3096                 tsp[1] = tsp[0];
3097         } else {
3098                 if (tvpseg == UIO_SYSSPACE) {
3099                         tvp = usrtvp;
3100                 } else {
3101                         if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
3102                                 return (error);
3103                         tvp = tv;
3104                 }
3105
3106                 if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
3107                     tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
3108                         return (EINVAL);
3109                 TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3110                 TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3111         }
3112         return (0);
3113 }
3114
3115 /*
3116  * Common implementation code for utimes(), lutimes(), and futimes().
3117  */
3118 static int
3119 setutimes(td, vp, ts, numtimes, nullflag)
3120         struct thread *td;
3121         struct vnode *vp;
3122         const struct timespec *ts;
3123         int numtimes;
3124         int nullflag;
3125 {
3126         struct mount *mp;
3127         struct vattr vattr;
3128         int error, setbirthtime;
3129
3130         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3131                 return (error);
3132         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3133         setbirthtime = 0;
3134         if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
3135             timespeccmp(&ts[1], &vattr.va_birthtime, < ))
3136                 setbirthtime = 1;
3137         VATTR_NULL(&vattr);
3138         vattr.va_atime = ts[0];
3139         vattr.va_mtime = ts[1];
3140         if (setbirthtime)
3141                 vattr.va_birthtime = ts[1];
3142         if (numtimes > 2)
3143                 vattr.va_birthtime = ts[2];
3144         if (nullflag)
3145                 vattr.va_vaflags |= VA_UTIMES_NULL;
3146 #ifdef MAC
3147         error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
3148             vattr.va_mtime);
3149 #endif
3150         if (error == 0)
3151                 error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3152         VOP_UNLOCK(vp, 0);
3153         vn_finished_write(mp);
3154         return (error);
3155 }
3156
3157 /*
3158  * Set the access and modification times of a file.
3159  */
3160 #ifndef _SYS_SYSPROTO_H_
3161 struct utimes_args {
3162         char    *path;
3163         struct  timeval *tptr;
3164 };
3165 #endif
3166 int
3167 sys_utimes(td, uap)
3168         struct thread *td;
3169         register struct utimes_args /* {
3170                 char *path;
3171                 struct timeval *tptr;
3172         } */ *uap;
3173 {
3174
3175         return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3176             UIO_USERSPACE));
3177 }
3178
3179 #ifndef _SYS_SYSPROTO_H_
3180 struct futimesat_args {
3181         int fd;
3182         const char * path;
3183         const struct timeval * times;
3184 };
3185 #endif
3186 int
3187 sys_futimesat(struct thread *td, struct futimesat_args *uap)
3188 {
3189
3190         return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
3191             uap->times, UIO_USERSPACE));
3192 }
3193
3194 int
3195 kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
3196     struct timeval *tptr, enum uio_seg tptrseg)
3197 {
3198
3199         return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg));
3200 }
3201
3202 int
3203 kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3204     struct timeval *tptr, enum uio_seg tptrseg)
3205 {
3206         struct nameidata nd;
3207         struct timespec ts[2];
3208         cap_rights_t rights;
3209         int error;
3210
3211         if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3212                 return (error);
3213         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
3214             cap_rights_init(&rights, CAP_FUTIMES), td);
3215
3216         if ((error = namei(&nd)) != 0)
3217                 return (error);
3218         NDFREE(&nd, NDF_ONLY_PNBUF);
3219         error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3220         vrele(nd.ni_vp);
3221         return (error);
3222 }
3223
3224 /*
3225  * Set the access and modification times of a file.
3226  */
3227 #ifndef _SYS_SYSPROTO_H_
3228 struct lutimes_args {
3229         char    *path;
3230         struct  timeval *tptr;
3231 };
3232 #endif
3233 int
3234 sys_lutimes(td, uap)
3235         struct thread *td;
3236         register struct lutimes_args /* {
3237                 char *path;
3238                 struct timeval *tptr;
3239         } */ *uap;
3240 {
3241
3242         return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3243             UIO_USERSPACE));
3244 }
3245
3246 int
3247 kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
3248     struct timeval *tptr, enum uio_seg tptrseg)
3249 {
3250         struct timespec ts[2];
3251         struct nameidata nd;
3252         int error;
3253
3254         if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3255                 return (error);
3256         NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
3257         if ((error = namei(&nd)) != 0)
3258                 return (error);
3259         NDFREE(&nd, NDF_ONLY_PNBUF);
3260         error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3261         vrele(nd.ni_vp);
3262         return (error);
3263 }
3264
3265 /*
3266  * Set the access and modification times of a file.
3267  */
3268 #ifndef _SYS_SYSPROTO_H_
3269 struct futimes_args {
3270         int     fd;
3271         struct  timeval *tptr;
3272 };
3273 #endif
3274 int
3275 sys_futimes(td, uap)
3276         struct thread *td;
3277         register struct futimes_args /* {
3278                 int  fd;
3279                 struct timeval *tptr;
3280         } */ *uap;
3281 {
3282
3283         return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
3284 }
3285
3286 int
3287 kern_futimes(struct thread *td, int fd, struct timeval *tptr,
3288     enum uio_seg tptrseg)
3289 {
3290         struct timespec ts[2];
3291         struct file *fp;
3292         cap_rights_t rights;
3293         int error;
3294
3295         AUDIT_ARG_FD(fd);
3296         error = getutimes(tptr, tptrseg, ts);
3297         if (error != 0)
3298                 return (error);
3299         error = getvnode(td->td_proc->p_fd, fd,
3300             cap_rights_init(&rights, CAP_FUTIMES), &fp);
3301         if (error != 0)
3302                 return (error);
3303 #ifdef AUDIT
3304         vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3305         AUDIT_ARG_VNODE1(fp->f_vnode);
3306         VOP_UNLOCK(fp->f_vnode, 0);
3307 #endif
3308         error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
3309         fdrop(fp, td);
3310         return (error);
3311 }
3312
3313 /*
3314  * Truncate a file given its path name.
3315  */
3316 #ifndef _SYS_SYSPROTO_H_
3317 struct truncate_args {
3318         char    *path;
3319         int     pad;
3320         off_t   length;
3321 };
3322 #endif
3323 int
3324 sys_truncate(td, uap)
3325         struct thread *td;
3326         register struct truncate_args /* {
3327                 char *path;
3328                 int pad;
3329                 off_t length;
3330         } */ *uap;
3331 {
3332
3333         return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3334 }
3335
3336 int
3337 kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
3338 {
3339         struct mount *mp;
3340         struct vnode *vp;
3341         void *rl_cookie;
3342         struct vattr vattr;
3343         struct nameidata nd;
3344         int error;
3345
3346         if (length < 0)
3347                 return(EINVAL);
3348         NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
3349         if ((error = namei(&nd)) != 0)
3350                 return (error);
3351         vp = nd.ni_vp;
3352         rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
3353         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
3354                 vn_rangelock_unlock(vp, rl_cookie);
3355                 vrele(vp);
3356                 return (error);
3357         }
3358         NDFREE(&nd, NDF_ONLY_PNBUF);
3359         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3360         if (vp->v_type == VDIR)
3361                 error = EISDIR;
3362 #ifdef MAC
3363         else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
3364         }
3365 #endif
3366         else if ((error = vn_writechk(vp)) == 0 &&
3367             (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
3368                 VATTR_NULL(&vattr);
3369                 vattr.va_size = length;
3370                 error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3371         }
3372         VOP_UNLOCK(vp, 0);
3373         vn_finished_write(mp);
3374         vn_rangelock_unlock(vp, rl_cookie);
3375         vrele(vp);
3376         return (error);
3377 }
3378
#if defined(COMPAT_43)
/*
 * Old truncate entry point taking a long length: repackage the arguments
 * as a struct truncate_args and forward to sys_truncate().
 */
#ifndef _SYS_SYSPROTO_H_
struct otruncate_args {
	char	*path;
	long	length;
};
#endif
int
otruncate(struct thread *td, struct otruncate_args *uap)
{
	struct truncate_args /* {
		char *path;
		int pad;
		off_t length;
	} */ nuap;

	/* pad is left unset; sys_truncate() reads only path and length. */
	nuap.path = uap->path;
	nuap.length = uap->length;
	return (sys_truncate(td, &nuap));
}
#endif /* COMPAT_43 */
3408
3409 /* Versions with the pad argument */
3410 int
3411 freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
3412 {
3413         struct truncate_args ouap;
3414
3415         ouap.path = uap->path;
3416         ouap.length = uap->length;
3417         return (sys_truncate(td, &ouap));
3418 }
3419
3420 int
3421 freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
3422 {
3423         struct ftruncate_args ouap;
3424
3425         ouap.fd = uap->fd;
3426         ouap.length = uap->length;
3427         return (sys_ftruncate(td, &ouap));
3428 }
3429
3430 /*
3431  * Sync an open file.
3432  */
3433 #ifndef _SYS_SYSPROTO_H_
3434 struct fsync_args {
3435         int     fd;
3436 };
3437 #endif
3438 int
3439 sys_fsync(td, uap)
3440         struct thread *td;
3441         struct fsync_args /* {
3442                 int fd;
3443         } */ *uap;
3444 {
3445         struct vnode *vp;
3446         struct mount *mp;
3447         struct file *fp;
3448         cap_rights_t rights;
3449         int error, lock_flags;
3450
3451         AUDIT_ARG_FD(uap->fd);
3452         error = getvnode(td->td_proc->p_fd, uap->fd,
3453             cap_rights_init(&rights, CAP_FSYNC), &fp);
3454         if (error != 0)
3455                 return (error);
3456         vp = fp->f_vnode;
3457         error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
3458         if (error != 0)
3459                 goto drop;
3460         if (MNT_SHARED_WRITES(mp) ||
3461             ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
3462                 lock_flags = LK_SHARED;
3463         } else {
3464                 lock_flags = LK_EXCLUSIVE;
3465         }
3466         vn_lock(vp, lock_flags | LK_RETRY);
3467         AUDIT_ARG_VNODE1(vp);
3468         if (vp->v_object != NULL) {
3469                 VM_OBJECT_WLOCK(vp->v_object);
3470                 vm_object_page_clean(vp->v_object, 0, 0, 0);
3471                 VM_OBJECT_WUNLOCK(vp->v_object);
3472         }
3473         error = VOP_FSYNC(vp, MNT_WAIT, td);
3474
3475         VOP_UNLOCK(vp, 0);
3476         vn_finished_write(mp);
3477 drop:
3478         fdrop(fp, td);
3479         return (error);
3480 }
3481
3482 /*
3483  * Rename files.  Source and destination must either both be directories, or
3484  * both not be directories.  If target is a directory, it must be empty.
3485  */
3486 #ifndef _SYS_SYSPROTO_H_
3487 struct rename_args {
3488         char    *from;
3489         char    *to;
3490 };
3491 #endif
3492 int
3493 sys_rename(td, uap)
3494         struct thread *td;
3495         register struct rename_args /* {
3496                 char *from;
3497                 char *to;
3498         } */ *uap;
3499 {
3500
3501         return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
3502 }
3503
3504 #ifndef _SYS_SYSPROTO_H_
3505 struct renameat_args {
3506         int     oldfd;
3507         char    *old;
3508         int     newfd;
3509         char    *new;
3510 };
3511 #endif
3512 int
3513 sys_renameat(struct thread *td, struct renameat_args *uap)
3514 {
3515
3516         return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3517             UIO_USERSPACE));
3518 }
3519
3520 int
3521 kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
3522 {
3523
3524         return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg));
3525 }
3526
3527 int
3528 kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
3529     enum uio_seg pathseg)
3530 {
3531         struct mount *mp = NULL;
3532         struct vnode *tvp, *fvp, *tdvp;
3533         struct nameidata fromnd, tond;
3534         cap_rights_t rights;
3535         int error;
3536
3537 again:
3538         bwillwrite();
3539 #ifdef MAC
3540         NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
3541             AUDITVNODE1, pathseg, old, oldfd,
3542             cap_rights_init(&rights, CAP_RENAMEAT), td);
3543 #else
3544         NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
3545             pathseg, old, oldfd, cap_rights_init(&rights, CAP_RENAMEAT), td);
3546 #endif
3547
3548         if ((error = namei(&fromnd)) != 0)
3549                 return (error);
3550 #ifdef MAC
3551         error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
3552             fromnd.ni_vp, &fromnd.ni_cnd);
3553         VOP_UNLOCK(fromnd.ni_dvp, 0);
3554         if (fromnd.ni_dvp != fromnd.ni_vp)
3555                 VOP_UNLOCK(fromnd.ni_vp, 0);
3556 #endif
3557         fvp = fromnd.ni_vp;
3558         NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
3559             SAVESTART | AUDITVNODE2, pathseg, new, newfd,
3560             cap_rights_init(&rights, CAP_LINKAT), td);
3561         if (fromnd.ni_vp->v_type == VDIR)
3562                 tond.ni_cnd.cn_flags |= WILLBEDIR;
3563         if ((error = namei(&tond)) != 0) {
3564                 /* Translate error code for rename("dir1", "dir2/."). */
3565                 if (error == EISDIR && fvp->v_type == VDIR)
3566                         error = EINVAL;
3567                 NDFREE(&fromnd, NDF_ONLY_PNBUF);
3568                 vrele(fromnd.ni_dvp);
3569                 vrele(fvp);
3570                 goto out1;
3571         }
3572         tdvp = tond.ni_dvp;
3573         tvp = tond.ni_vp;
3574         error = vn_start_write(fvp, &mp, V_NOWAIT);
3575         if (error != 0) {
3576                 NDFREE(&fromnd, NDF_ONLY_PNBUF);
3577                 NDFREE(&tond, NDF_ONLY_PNBUF);
3578                 if (tvp != NULL)
3579                         vput(tvp);
3580                 if (tdvp == tvp)
3581                         vrele(tdvp);
3582                 else
3583                         vput(tdvp);
3584                 vrele(fromnd.ni_dvp);
3585                 vrele(fvp);
3586                 vrele(tond.ni_startdir);
3587                 if (fromnd.ni_startdir != NULL)
3588                         vrele(fromnd.ni_startdir);
3589                 error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
3590                 if (error != 0)
3591                         return (error);
3592                 goto again;
3593         }
3594         if (tvp != NULL) {
3595                 if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
3596                         error = ENOTDIR;
3597                         goto out;
3598                 } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
3599                         error = EISDIR;
3600                         goto out;
3601                 }
3602 #ifdef CAPABILITIES
3603                 if (newfd != AT_FDCWD) {
3604                         /*
3605                          * If the target already exists we require CAP_UNLINKAT
3606                          * from 'newfd'.
3607                          */
3608                         error = cap_check(&tond.ni_filecaps.fc_rights,
3609                             cap_rights_init(&rights, CAP_UNLINKAT));
3610                         if (error != 0)
3611                                 goto out;
3612                 }
3613 #endif
3614         }
3615         if (fvp == tdvp) {
3616                 error = EINVAL;
3617                 goto out;
3618         }
3619         /*
3620          * If the source is the same as the destination (that is, if they
3621          * are links to the same vnode), then there is nothing to do.
3622          */
3623         if (fvp == tvp)
3624                 error = -1;
3625 #ifdef MAC
3626         else
3627                 error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
3628                     tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
3629 #endif
3630 out:
3631         if (error == 0) {
3632                 error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
3633                     tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
3634                 NDFREE(&fromnd, NDF_ONLY_PNBUF);
3635                 NDFREE(&tond, NDF_ONLY_PNBUF);
3636         } else {
3637                 NDFREE(&fromnd, NDF_ONLY_PNBUF);
3638                 NDFREE(&tond, NDF_ONLY_PNBUF);
3639                 if (tvp != NULL)
3640                         vput(tvp);
3641                 if (tdvp == tvp)
3642                         vrele(tdvp);
3643                 else
3644                         vput(tdvp);
3645                 vrele(fromnd.ni_dvp);
3646                 vrele(fvp);
3647         }
3648         vrele(tond.ni_startdir);
3649         vn_finished_write(mp);
3650 out1:
3651         if (fromnd.ni_startdir)
3652                 vrele(fromnd.ni_startdir);
3653         if (error == -1)
3654                 return (0);
3655         return (error);
3656 }
3657
3658 /*
3659  * Make a directory file.
3660  */
3661 #ifndef _SYS_SYSPROTO_H_
3662 struct mkdir_args {
3663         char    *path;
3664         int     mode;
3665 };
3666 #endif
3667 int
3668 sys_mkdir(td, uap)
3669         struct thread *td;
3670         register struct mkdir_args /* {
3671                 char *path;
3672                 int mode;
3673         } */ *uap;
3674 {
3675
3676         return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
3677 }
3678
3679 #ifndef _SYS_SYSPROTO_H_
3680 struct mkdirat_args {
3681         int     fd;
3682         char    *path;
3683         mode_t  mode;
3684 };
3685 #endif
3686 int
3687 sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3688 {
3689
3690         return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3691 }
3692
3693 int
3694 kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
3695 {
3696
3697         return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode));
3698 }
3699
3700 int
3701 kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
3702     int mode)
3703 {
3704         struct mount *mp;
3705         struct vnode *vp;
3706         struct vattr vattr;
3707         struct nameidata nd;
3708         cap_rights_t rights;
3709         int error;
3710
3711         AUDIT_ARG_MODE(mode);
3712 restart:
3713         bwillwrite();
3714         NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
3715             segflg, path, fd, cap_rights_init(&rights, CAP_MKDIRAT), td);
3716         nd.ni_cnd.cn_flags |= WILLBEDIR;
3717         if ((error = namei(&nd)) != 0)
3718                 return (error);
3719         vp = nd.ni_vp;
3720         if (vp != NULL) {
3721                 NDFREE(&nd, NDF_ONLY_PNBUF);
3722                 /*
3723                  * XXX namei called with LOCKPARENT but not LOCKLEAF has
3724                  * the strange behaviour of leaving the vnode unlocked
3725                  * if the target is the same vnode as the parent.
3726                  */
3727                 if (vp == nd.ni_dvp)
3728                         vrele(nd.ni_dvp);
3729                 else
3730                         vput(nd.ni_dvp);
3731                 vrele(vp);
3732                 return (EEXIST);
3733         }
3734         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3735                 NDFREE(&nd, NDF_ONLY_PNBUF);
3736                 vput(nd.ni_dvp);
3737                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3738                         return (error);
3739                 goto restart;
3740         }
3741         VATTR_NULL(&vattr);
3742         vattr.va_type = VDIR;
3743         vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
3744 #ifdef MAC
3745         error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
3746             &vattr);
3747         if (error != 0)
3748                 goto out;
3749 #endif
3750         error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3751 #ifdef MAC
3752 out:
3753 #endif
3754         NDFREE(&nd, NDF_ONLY_PNBUF);
3755         vput(nd.ni_dvp);
3756         if (error == 0)
3757                 vput(nd.ni_vp);
3758         vn_finished_write(mp);
3759         return (error);
3760 }
3761
3762 /*
3763  * Remove a directory file.
3764  */
3765 #ifndef _SYS_SYSPROTO_H_
3766 struct rmdir_args {
3767         char    *path;
3768 };
3769 #endif
3770 int
3771 sys_rmdir(td, uap)
3772         struct thread *td;
3773         struct rmdir_args /* {
3774                 char *path;
3775         } */ *uap;
3776 {
3777
3778         return (kern_rmdir(td, uap->path, UIO_USERSPACE));
3779 }
3780
3781 int
3782 kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
3783 {
3784
3785         return (kern_rmdirat(td, AT_FDCWD, path, pathseg));
3786 }
3787
3788 int
3789 kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
3790 {
3791         struct mount *mp;
3792         struct vnode *vp;
3793         struct nameidata nd;
3794         cap_rights_t rights;
3795         int error;
3796
3797 restart:
3798         bwillwrite();
3799         NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
3800             pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
3801         if ((error = namei(&nd)) != 0)
3802                 return (error);
3803         vp = nd.ni_vp;
3804         if (vp->v_type != VDIR) {
3805                 error = ENOTDIR;
3806                 goto out;
3807         }
3808         /*
3809          * No rmdir "." please.
3810          */
3811         if (nd.ni_dvp == vp) {
3812                 error = EINVAL;
3813                 goto out;
3814         }
3815         /*
3816          * The root of a mounted filesystem cannot be deleted.
3817          */
3818         if (vp->v_vflag & VV_ROOT) {
3819                 error = EBUSY;
3820                 goto out;
3821         }
3822 #ifdef MAC
3823         error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
3824             &nd.ni_cnd);
3825         if (error != 0)
3826                 goto out;
3827 #endif
3828         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3829                 NDFREE(&nd, NDF_ONLY_PNBUF);
3830                 vput(vp);
3831                 if (nd.ni_dvp == vp)
3832                         vrele(nd.ni_dvp);
3833                 else
3834                         vput(nd.ni_dvp);
3835                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3836                         return (error);
3837                 goto restart;
3838         }
3839         vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
3840         error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3841         vn_finished_write(mp);
3842 out:
3843         NDFREE(&nd, NDF_ONLY_PNBUF);
3844         vput(vp);
3845         if (nd.ni_dvp == vp)
3846                 vrele(nd.ni_dvp);
3847         else
3848                 vput(nd.ni_dvp);
3849         return (error);
3850 }
3851
3852 #ifdef COMPAT_43
3853 /*
3854  * Read a block of directory entries in a filesystem independent format.
3855  */
3856 #ifndef _SYS_SYSPROTO_H_
3857 struct ogetdirentries_args {
3858         int     fd;
3859         char    *buf;
3860         u_int   count;
3861         long    *basep;
3862 };
3863 #endif
3864 int
3865 ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
3866 {
3867         long loff;
3868         int error;
3869
3870         error = kern_ogetdirentries(td, uap, &loff);
3871         if (error == 0)
3872                 error = copyout(&loff, uap->basep, sizeof(long));
3873         return (error);
3874 }
3875
3876 int
3877 kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
3878     long *ploff)
3879 {
3880         struct vnode *vp;
3881         struct file *fp;
3882         struct uio auio, kuio;
3883         struct iovec aiov, kiov;
3884         struct dirent *dp, *edp;
3885         cap_rights_t rights;
3886         caddr_t dirbuf;
3887         int error, eofflag, readcnt;
3888         long loff;
3889         off_t foffset;
3890
3891         /* XXX arbitrary sanity limit on `count'. */
3892         if (uap->count > 64 * 1024)
3893                 return (EINVAL);
3894         error = getvnode(td->td_proc->p_fd, uap->fd,
3895             cap_rights_init(&rights, CAP_READ), &fp);
3896         if (error != 0)
3897                 return (error);
3898         if ((fp->f_flag & FREAD) == 0) {
3899                 fdrop(fp, td);
3900                 return (EBADF);
3901         }
3902         vp = fp->f_vnode;
3903         foffset = foffset_lock(fp, 0);
3904 unionread:
3905         if (vp->v_type != VDIR) {
3906                 foffset_unlock(fp, foffset, 0);
3907                 fdrop(fp, td);
3908                 return (EINVAL);
3909         }
3910         aiov.iov_base = uap->buf;
3911         aiov.iov_len = uap->count;
3912         auio.uio_iov = &aiov;
3913         auio.uio_iovcnt = 1;
3914         auio.uio_rw = UIO_READ;
3915         auio.uio_segflg = UIO_USERSPACE;
3916         auio.uio_td = td;
3917         auio.uio_resid = uap->count;
3918         vn_lock(vp, LK_SHARED | LK_RETRY);
3919         loff = auio.uio_offset = foffset;
3920 #ifdef MAC
3921         error = mac_vnode_check_readdir(td->td_ucred, vp);
3922         if (error != 0) {
3923                 VOP_UNLOCK(vp, 0);
3924                 foffset_unlock(fp, foffset, FOF_NOUPDATE);
3925                 fdrop(fp, td);
3926                 return (error);
3927         }
3928 #endif
3929 #       if (BYTE_ORDER != LITTLE_ENDIAN)
3930                 if (vp->v_mount->mnt_maxsymlinklen <= 0) {
3931                         error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
3932                             NULL, NULL);
3933                         foffset = auio.uio_offset;
3934                 } else
3935 #       endif
3936         {
3937                 kuio = auio;
3938                 kuio.uio_iov = &kiov;
3939                 kuio.uio_segflg = UIO_SYSSPACE;
3940                 kiov.iov_len = uap->count;
3941                 dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
3942                 kiov.iov_base = dirbuf;
3943                 error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
3944                             NULL, NULL);
3945                 foffset = kuio.uio_offset;
3946                 if (error == 0) {
3947                         readcnt = uap->count - kuio.uio_resid;
3948                         edp = (struct dirent *)&dirbuf[readcnt];
3949                         for (dp = (struct dirent *)dirbuf; dp < edp; ) {
3950 #                               if (BYTE_ORDER == LITTLE_ENDIAN)
3951                                         /*
3952                                          * The expected low byte of
3953                                          * dp->d_namlen is our dp->d_type.
3954                                          * The high MBZ byte of dp->d_namlen
3955                                          * is our dp->d_namlen.
3956                                          */
3957                                         dp->d_type = dp->d_namlen;
3958                                         dp->d_namlen = 0;
3959 #                               else
3960                                         /*
3961                                          * The dp->d_type is the high byte
3962                                          * of the expected dp->d_namlen,
3963                                          * so must be zero'ed.
3964                                          */
3965                                         dp->d_type = 0;
3966 #                               endif
3967                                 if (dp->d_reclen > 0) {
3968                                         dp = (struct dirent *)
3969                                             ((char *)dp + dp->d_reclen);
3970                                 } else {
3971                                         error = EIO;
3972                                         break;
3973                                 }
3974                         }
3975                         if (dp >= edp)
3976                                 error = uiomove(dirbuf, readcnt, &auio);
3977                 }
3978                 free(dirbuf, M_TEMP);
3979         }
3980         if (error != 0) {
3981                 VOP_UNLOCK(vp, 0);
3982                 foffset_unlock(fp, foffset, 0);
3983                 fdrop(fp, td);
3984                 return (error);
3985         }
3986         if (uap->count == auio.uio_resid &&
3987             (vp->v_vflag & VV_ROOT) &&
3988             (vp->v_mount->mnt_flag & MNT_UNION)) {
3989                 struct vnode *tvp = vp;
3990                 vp = vp->v_mount->mnt_vnodecovered;
3991                 VREF(vp);
3992                 fp->f_vnode = vp;
3993                 fp->f_data = vp;
3994                 foffset = 0;
3995                 vput(tvp);
3996                 goto unionread;
3997         }
3998         VOP_UNLOCK(vp, 0);
3999         foffset_unlock(fp, foffset, 0);
4000         fdrop(fp, td);
4001         td->td_retval[0] = uap->count - auio.uio_resid;
4002         if (error == 0)
4003                 *ploff = loff;
4004         return (error);
4005 }
4006 #endif /* COMPAT_43 */
4007
4008 /*
4009  * Read a block of directory entries in a filesystem independent format.
4010  */
4011 #ifndef _SYS_SYSPROTO_H_
4012 struct getdirentries_args {
4013         int     fd;
4014         char    *buf;
4015         u_int   count;
4016         long    *basep;
4017 };
4018 #endif
4019 int
4020 sys_getdirentries(td, uap)
4021         struct thread *td;
4022         register struct getdirentries_args /* {
4023                 int fd;
4024                 char *buf;
4025                 u_int count;
4026                 long *basep;
4027         } */ *uap;
4028 {
4029         long base;
4030         int error;
4031
4032         error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
4033             NULL, UIO_USERSPACE);
4034         if (error != 0)
4035                 return (error);
4036         if (uap->basep != NULL)
4037                 error = copyout(&base, uap->basep, sizeof(long));
4038         return (error);
4039 }
4040
4041 int
4042 kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
4043     long *basep, ssize_t *residp, enum uio_seg bufseg)
4044 {
4045         struct vnode *vp;
4046         struct file *fp;
4047         struct uio auio;
4048         struct iovec aiov;
4049         cap_rights_t rights;
4050         long loff;
4051         int error, eofflag;
4052         off_t foffset;
4053
4054         AUDIT_ARG_FD(fd);
4055         if (count > IOSIZE_MAX)
4056                 return (EINVAL);
4057         auio.uio_resid = count;
4058         error = getvnode(td->td_proc->p_fd, fd,
4059             cap_rights_init(&rights, CAP_READ), &fp);
4060         if (error != 0)
4061                 return (error);
4062         if ((fp->f_flag & FREAD) == 0) {
4063                 fdrop(fp, td);
4064                 return (EBADF);
4065         }
4066         vp = fp->f_vnode;
4067         foffset = foffset_lock(fp, 0);
4068 unionread:
4069         if (vp->v_type != VDIR) {
4070                 error = EINVAL;
4071                 goto fail;
4072         }
4073         aiov.iov_base = buf;
4074         aiov.iov_len = count;
4075         auio.uio_iov = &aiov;
4076         auio.uio_iovcnt = 1;
4077         auio.uio_rw = UIO_READ;
4078         auio.uio_segflg = bufseg;
4079         auio.uio_td = td;
4080         vn_lock(vp, LK_SHARED | LK_RETRY);
4081         AUDIT_ARG_VNODE1(vp);
4082         loff = auio.uio_offset = foffset;
4083 #ifdef MAC
4084         error = mac_vnode_check_readdir(td->td_ucred, vp);
4085         if (error == 0)
4086 #endif
4087                 error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
4088                     NULL);
4089         foffset = auio.uio_offset;
4090         if (error != 0) {
4091                 VOP_UNLOCK(vp, 0);
4092                 goto fail;
4093         }
4094         if (count == auio.uio_resid &&
4095             (vp->v_vflag & VV_ROOT) &&
4096             (vp->v_mount->mnt_flag & MNT_UNION)) {
4097                 struct vnode *tvp = vp;
4098
4099                 vp = vp->v_mount->mnt_vnodecovered;
4100                 VREF(vp);
4101                 fp->f_vnode = vp;
4102                 fp->f_data = vp;
4103                 foffset = 0;
4104                 vput(tvp);
4105                 goto unionread;
4106         }
4107         VOP_UNLOCK(vp, 0);
4108         *basep = loff;
4109         if (residp != NULL)
4110                 *residp = auio.uio_resid;
4111         td->td_retval[0] = count - auio.uio_resid;
4112 fail:
4113         foffset_unlock(fp, foffset, 0);
4114         fdrop(fp, td);
4115         return (error);
4116 }
4117
4118 #ifndef _SYS_SYSPROTO_H_
4119 struct getdents_args {
4120         int fd;
4121         char *buf;
4122         size_t count;
4123 };
4124 #endif
4125 int
4126 sys_getdents(td, uap)
4127         struct thread *td;
4128         register struct getdents_args /* {
4129                 int fd;
4130                 char *buf;
4131                 u_int count;
4132         } */ *uap;
4133 {
4134         struct getdirentries_args ap;
4135
4136         ap.fd = uap->fd;
4137         ap.buf = uap->buf;
4138         ap.count = uap->count;
4139         ap.basep = NULL;
4140         return (sys_getdirentries(td, &ap));
4141 }
4142
4143 /*
4144  * Set the mode mask for creation of filesystem nodes.
4145  */
4146 #ifndef _SYS_SYSPROTO_H_
4147 struct umask_args {
4148         int     newmask;
4149 };
4150 #endif
4151 int
4152 sys_umask(td, uap)
4153         struct thread *td;
4154         struct umask_args /* {
4155                 int newmask;
4156         } */ *uap;
4157 {
4158         register struct filedesc *fdp;
4159
4160         FILEDESC_XLOCK(td->td_proc->p_fd);
4161         fdp = td->td_proc->p_fd;
4162         td->td_retval[0] = fdp->fd_cmask;
4163         fdp->fd_cmask = uap->newmask & ALLPERMS;
4164         FILEDESC_XUNLOCK(td->td_proc->p_fd);
4165         return (0);
4166 }
4167
4168 /*
4169  * Void all references to file by ripping underlying filesystem away from
4170  * vnode.
4171  */
4172 #ifndef _SYS_SYSPROTO_H_
4173 struct revoke_args {
4174         char    *path;
4175 };
4176 #endif
4177 int
4178 sys_revoke(td, uap)
4179         struct thread *td;
4180         register struct revoke_args /* {
4181                 char *path;
4182         } */ *uap;
4183 {
4184         struct vnode *vp;
4185         struct vattr vattr;
4186         struct nameidata nd;
4187         int error;
4188
4189         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4190             uap->path, td);
4191         if ((error = namei(&nd)) != 0)
4192                 return (error);
4193         vp = nd.ni_vp;
4194         NDFREE(&nd, NDF_ONLY_PNBUF);
4195         if (vp->v_type != VCHR || vp->v_rdev == NULL) {
4196                 error = EINVAL;
4197                 goto out;
4198         }
4199 #ifdef MAC
4200         error = mac_vnode_check_revoke(td->td_ucred, vp);
4201         if (error != 0)
4202                 goto out;
4203 #endif
4204         error = VOP_GETATTR(vp, &vattr, td->td_ucred);
4205         if (error != 0)
4206                 goto out;
4207         if (td->td_ucred->cr_uid != vattr.va_uid) {
4208                 error = priv_check(td, PRIV_VFS_ADMIN);
4209                 if (error != 0)
4210                         goto out;
4211         }
4212         if (vcount(vp) > 1)
4213                 VOP_REVOKE(vp, REVOKEALL);
4214 out:
4215         vput(vp);
4216         return (error);
4217 }
4218
4219 /*
4220  * Convert a user file descriptor to a kernel file entry and check that, if it
4221  * is a capability, the correct rights are present. A reference on the file
4222  * entry is held upon returning.
4223  */
4224 int
4225 getvnode(struct filedesc *fdp, int fd, cap_rights_t *rightsp, struct file **fpp)
4226 {
4227         struct file *fp;
4228         int error;
4229
4230         error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
4231         if (error != 0)
4232                 return (error);
4233
4234         /*
4235          * The file could be not of the vnode type, or it may be not
4236          * yet fully initialized, in which case the f_vnode pointer
4237          * may be set, but f_ops is still badfileops.  E.g.,
4238          * devfs_open() transiently create such situation to
4239          * facilitate csw d_fdopen().
4240          *
4241          * Dupfdopen() handling in kern_openat() installs the
4242          * half-baked file into the process descriptor table, allowing
4243          * other thread to dereference it. Guard against the race by
4244          * checking f_ops.
4245          */
4246         if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
4247                 fdrop(fp, curthread);
4248                 return (EINVAL);
4249         }
4250         *fpp = fp;
4251         return (0);
4252 }
4253
4254
4255 /*
4256  * Get an (NFS) file handle.
4257  */
4258 #ifndef _SYS_SYSPROTO_H_
4259 struct lgetfh_args {
4260         char    *fname;
4261         fhandle_t *fhp;
4262 };
4263 #endif
4264 int
4265 sys_lgetfh(td, uap)
4266         struct thread *td;
4267         register struct lgetfh_args *uap;
4268 {
4269         struct nameidata nd;
4270         fhandle_t fh;
4271         register struct vnode *vp;
4272         int error;
4273
4274         error = priv_check(td, PRIV_VFS_GETFH);
4275         if (error != 0)
4276                 return (error);
4277         NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4278             uap->fname, td);
4279         error = namei(&nd);
4280         if (error != 0)
4281                 return (error);
4282         NDFREE(&nd, NDF_ONLY_PNBUF);
4283         vp = nd.ni_vp;
4284         bzero(&fh, sizeof(fh));
4285         fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4286         error = VOP_VPTOFH(vp, &fh.fh_fid);
4287         vput(vp);
4288         if (error == 0)
4289                 error = copyout(&fh, uap->fhp, sizeof (fh));
4290         return (error);
4291 }
4292
4293 #ifndef _SYS_SYSPROTO_H_
4294 struct getfh_args {
4295         char    *fname;
4296         fhandle_t *fhp;
4297 };
4298 #endif
4299 int
4300 sys_getfh(td, uap)
4301         struct thread *td;
4302         register struct getfh_args *uap;
4303 {
4304         struct nameidata nd;
4305         fhandle_t fh;
4306         register struct vnode *vp;
4307         int error;
4308
4309         error = priv_check(td, PRIV_VFS_GETFH);
4310         if (error != 0)
4311                 return (error);
4312         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4313             uap->fname, td);
4314         error = namei(&nd);
4315         if (error != 0)
4316                 return (error);
4317         NDFREE(&nd, NDF_ONLY_PNBUF);
4318         vp = nd.ni_vp;
4319         bzero(&fh, sizeof(fh));
4320         fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4321         error = VOP_VPTOFH(vp, &fh.fh_fid);
4322         vput(vp);
4323         if (error == 0)
4324                 error = copyout(&fh, uap->fhp, sizeof (fh));
4325         return (error);
4326 }
4327
4328 /*
4329  * syscall for the rpc.lockd to use to translate a NFS file handle into an
4330  * open descriptor.
4331  *
4332  * warning: do not remove the priv_check() call or this becomes one giant
4333  * security hole.
4334  */
4335 #ifndef _SYS_SYSPROTO_H_
4336 struct fhopen_args {
4337         const struct fhandle *u_fhp;
4338         int flags;
4339 };
4340 #endif
4341 int
4342 sys_fhopen(td, uap)
4343         struct thread *td;
4344         struct fhopen_args /* {
4345                 const struct fhandle *u_fhp;
4346                 int flags;
4347         } */ *uap;
4348 {
4349         struct mount *mp;
4350         struct vnode *vp;
4351         struct fhandle fhp;
4352         struct file *fp;
4353         int fmode, error;
4354         int indx;
4355
4356         error = priv_check(td, PRIV_VFS_FHOPEN);
4357         if (error != 0)
4358                 return (error);
4359         indx = -1;
4360         fmode = FFLAGS(uap->flags);
4361         /* why not allow a non-read/write open for our lockd? */
4362         if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4363                 return (EINVAL);
4364         error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4365         if (error != 0)
4366                 return(error);
4367         /* find the mount point */
4368         mp = vfs_busyfs(&fhp.fh_fsid);
4369         if (mp == NULL)
4370                 return (ESTALE);
4371         /* now give me my vnode, it gets returned to me locked */
4372         error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
4373         vfs_unbusy(mp);
4374         if (error != 0)
4375                 return (error);
4376
4377         error = falloc_noinstall(td, &fp);
4378         if (error != 0) {
4379                 vput(vp);
4380                 return (error);
4381         }
4382         /*
4383          * An extra reference on `fp' has been held for us by
4384          * falloc_noinstall().
4385          */
4386
4387 #ifdef INVARIANTS
4388         td->td_dupfd = -1;
4389 #endif
4390         error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
4391         if (error != 0) {
4392                 KASSERT(fp->f_ops == &badfileops,
4393                     ("VOP_OPEN in fhopen() set f_ops"));
4394                 KASSERT(td->td_dupfd < 0,
4395                     ("fhopen() encountered fdopen()"));
4396
4397                 vput(vp);
4398                 goto bad;
4399         }
4400 #ifdef INVARIANTS
4401         td->td_dupfd = 0;
4402 #endif
4403         fp->f_vnode = vp;
4404         fp->f_seqcount = 1;
4405         finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
4406             &vnops);
4407         VOP_UNLOCK(vp, 0);
4408         if ((fmode & O_TRUNC) != 0) {
4409                 error = fo_truncate(fp, 0, td->td_ucred, td);
4410                 if (error != 0)
4411                         goto bad;
4412         }
4413
4414         error = finstall(td, fp, &indx, fmode, NULL);
4415 bad:
4416         fdrop(fp, td);
4417         td->td_retval[0] = indx;
4418         return (error);
4419 }
4420
4421 /*
4422  * Stat an (NFS) file handle.
4423  */
4424 #ifndef _SYS_SYSPROTO_H_
4425 struct fhstat_args {
4426         struct fhandle *u_fhp;
4427         struct stat *sb;
4428 };
4429 #endif
4430 int
4431 sys_fhstat(td, uap)
4432         struct thread *td;
4433         register struct fhstat_args /* {
4434                 struct fhandle *u_fhp;
4435                 struct stat *sb;
4436         } */ *uap;
4437 {
4438         struct stat sb;
4439         struct fhandle fh;
4440         int error;
4441
4442         error = copyin(uap->u_fhp, &fh, sizeof(fh));
4443         if (error != 0)
4444                 return (error);
4445         error = kern_fhstat(td, fh, &sb);
4446         if (error == 0)
4447                 error = copyout(&sb, uap->sb, sizeof(sb));
4448         return (error);
4449 }
4450
4451 int
4452 kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
4453 {
4454         struct mount *mp;
4455         struct vnode *vp;
4456         int error;
4457
4458         error = priv_check(td, PRIV_VFS_FHSTAT);
4459         if (error != 0)
4460                 return (error);
4461         if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4462                 return (ESTALE);
4463         error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4464         vfs_unbusy(mp);
4465         if (error != 0)
4466                 return (error);
4467         error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
4468         vput(vp);
4469         return (error);
4470 }
4471
4472 /*
4473  * Implement fstatfs() for (NFS) file handles.
4474  */
4475 #ifndef _SYS_SYSPROTO_H_
4476 struct fhstatfs_args {
4477         struct fhandle *u_fhp;
4478         struct statfs *buf;
4479 };
4480 #endif
4481 int
4482 sys_fhstatfs(td, uap)
4483         struct thread *td;
4484         struct fhstatfs_args /* {
4485                 struct fhandle *u_fhp;
4486                 struct statfs *buf;
4487         } */ *uap;
4488 {
4489         struct statfs sf;
4490         fhandle_t fh;
4491         int error;
4492
4493         error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4494         if (error != 0)
4495                 return (error);
4496         error = kern_fhstatfs(td, fh, &sf);
4497         if (error != 0)
4498                 return (error);
4499         return (copyout(&sf, uap->buf, sizeof(sf)));
4500 }
4501
4502 int
4503 kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4504 {
4505         struct statfs *sp;
4506         struct mount *mp;
4507         struct vnode *vp;
4508         int error;
4509
4510         error = priv_check(td, PRIV_VFS_FHSTATFS);
4511         if (error != 0)
4512                 return (error);
4513         if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4514                 return (ESTALE);
4515         error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4516         if (error != 0) {
4517                 vfs_unbusy(mp);
4518                 return (error);
4519         }
4520         vput(vp);
4521         error = prison_canseemount(td->td_ucred, mp);
4522         if (error != 0)
4523                 goto out;
4524 #ifdef MAC
4525         error = mac_mount_check_stat(td->td_ucred, mp);
4526         if (error != 0)
4527                 goto out;
4528 #endif
4529         /*
4530          * Set these in case the underlying filesystem fails to do so.
4531          */
4532         sp = &mp->mnt_stat;
4533         sp->f_version = STATFS_VERSION;
4534         sp->f_namemax = NAME_MAX;
4535         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4536         error = VFS_STATFS(mp, sp);
4537         if (error == 0)
4538                 *buf = *sp;
4539 out:
4540         vfs_unbusy(mp);
4541         return (error);
4542 }
4543
4544 int
4545 kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
4546 {
4547         struct file *fp;
4548         struct mount *mp;
4549         struct vnode *vp;
4550         cap_rights_t rights;
4551         off_t olen, ooffset;
4552         int error;
4553
4554         fp = NULL;
4555         error = fget(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
4556         if (error != 0)
4557                 goto out;
4558
4559         switch (fp->f_type) {
4560         case DTYPE_VNODE:
4561                 break;
4562         case DTYPE_PIPE:
4563         case DTYPE_FIFO:
4564                 error = ESPIPE;
4565                 goto out;
4566         default:
4567                 error = ENODEV;
4568                 goto out;
4569         }
4570         if ((fp->f_flag & FWRITE) == 0) {
4571                 error = EBADF;
4572                 goto out;
4573         }
4574         vp = fp->f_vnode;
4575         if (vp->v_type != VREG) {
4576                 error = ENODEV;
4577                 goto out;
4578         }
4579         if (offset < 0 || len <= 0) {
4580                 error = EINVAL;
4581                 goto out;
4582         }
4583         /* Check for wrap. */
4584         if (offset > OFF_MAX - len) {
4585                 error = EFBIG;
4586                 goto out;
4587         }
4588
4589         /* Allocating blocks may take a long time, so iterate. */
4590         for (;;) {
4591                 olen = len;
4592                 ooffset = offset;
4593
4594                 bwillwrite();
4595                 mp = NULL;
4596                 error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
4597                 if (error != 0)
4598                         break;
4599                 error = vn_lock(vp, LK_EXCLUSIVE);
4600                 if (error != 0) {
4601                         vn_finished_write(mp);
4602                         break;
4603                 }
4604 #ifdef MAC
4605                 error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
4606                 if (error == 0)
4607 #endif
4608                         error = VOP_ALLOCATE(vp, &offset, &len);
4609                 VOP_UNLOCK(vp, 0);
4610                 vn_finished_write(mp);
4611
4612                 if (olen + ooffset != offset + len) {
4613                         panic("offset + len changed from %jx/%jx to %jx/%jx",
4614                             ooffset, olen, offset, len);
4615                 }
4616                 if (error != 0 || len == 0)
4617                         break;
4618                 KASSERT(olen > len, ("Iteration did not make progress?"));
4619                 maybe_yield();
4620         }
4621  out:
4622         if (fp != NULL)
4623                 fdrop(fp, td);
4624         return (error);
4625 }
4626
4627 int
4628 sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
4629 {
4630
4631         td->td_retval[0] = kern_posix_fallocate(td, uap->fd, uap->offset,
4632             uap->len);
4633         return (0);
4634 }
4635
4636 /*
4637  * Unlike madvise(2), we do not make a best effort to remember every
4638  * possible caching hint.  Instead, we remember the last setting with
4639  * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4640  * region of any current setting.
4641  */
4642 int
4643 kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4644     int advice)
4645 {
4646         struct fadvise_info *fa, *new;
4647         struct file *fp;
4648         struct vnode *vp;
4649         cap_rights_t rights;
4650         off_t end;
4651         int error;
4652
4653         if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4654                 return (EINVAL);
4655         switch (advice) {
4656         case POSIX_FADV_SEQUENTIAL:
4657         case POSIX_FADV_RANDOM:
4658         case POSIX_FADV_NOREUSE:
4659                 new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4660                 break;
4661         case POSIX_FADV_NORMAL:
4662         case POSIX_FADV_WILLNEED:
4663         case POSIX_FADV_DONTNEED:
4664                 new = NULL;
4665                 break;
4666         default:
4667                 return (EINVAL);
4668         }
4669         /* XXX: CAP_POSIX_FADVISE? */
4670         error = fget(td, fd, cap_rights_init(&rights), &fp);
4671         if (error != 0)
4672                 goto out;
4673
4674         switch (fp->f_type) {
4675         case DTYPE_VNODE:
4676                 break;
4677         case DTYPE_PIPE:
4678         case DTYPE_FIFO:
4679                 error = ESPIPE;
4680                 goto out;
4681         default:
4682                 error = ENODEV;
4683                 goto out;
4684         }
4685         vp = fp->f_vnode;
4686         if (vp->v_type != VREG) {
4687                 error = ENODEV;
4688                 goto out;
4689         }
4690         if (len == 0)
4691                 end = OFF_MAX;
4692         else
4693                 end = offset + len - 1;
4694         switch (advice) {
4695         case POSIX_FADV_SEQUENTIAL:
4696         case POSIX_FADV_RANDOM:
4697         case POSIX_FADV_NOREUSE:
4698                 /*
4699                  * Try to merge any existing non-standard region with
4700                  * this new region if possible, otherwise create a new
4701                  * non-standard region for this request.
4702                  */
4703                 mtx_pool_lock(mtxpool_sleep, fp);
4704                 fa = fp->f_advice;
4705                 if (fa != NULL && fa->fa_advice == advice &&
4706                     ((fa->fa_start <= end && fa->fa_end >= offset) ||
4707                     (end != OFF_MAX && fa->fa_start == end + 1) ||
4708                     (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4709                         if (offset < fa->fa_start)
4710                                 fa->fa_start = offset;
4711                         if (end > fa->fa_end)
4712                                 fa->fa_end = end;
4713                 } else {
4714                         new->fa_advice = advice;
4715                         new->fa_start = offset;
4716                         new->fa_end = end;
4717                         new->fa_prevstart = 0;
4718                         new->fa_prevend = 0;
4719                         fp->f_advice = new;
4720                         new = fa;
4721                 }
4722                 mtx_pool_unlock(mtxpool_sleep, fp);
4723                 break;
4724         case POSIX_FADV_NORMAL:
4725                 /*
4726                  * If a the "normal" region overlaps with an existing
4727                  * non-standard region, trim or remove the
4728                  * non-standard region.
4729                  */
4730                 mtx_pool_lock(mtxpool_sleep, fp);
4731                 fa = fp->f_advice;
4732                 if (fa != NULL) {
4733                         if (offset <= fa->fa_start && end >= fa->fa_end) {
4734                                 new = fa;
4735                                 fp->f_advice = NULL;
4736                         } else if (offset <= fa->fa_start &&
4737                             end >= fa->fa_start)
4738                                 fa->fa_start = end + 1;
4739                         else if (offset <= fa->fa_end && end >= fa->fa_end)
4740                                 fa->fa_end = offset - 1;
4741                         else if (offset >= fa->fa_start && end <= fa->fa_end) {
4742                                 /*
4743                                  * If the "normal" region is a middle
4744                                  * portion of the existing
4745                                  * non-standard region, just remove
4746                                  * the whole thing rather than picking
4747                                  * one side or the other to
4748                                  * preserve.
4749                                  */
4750                                 new = fa;
4751                                 fp->f_advice = NULL;
4752                         }
4753                 }
4754                 mtx_pool_unlock(mtxpool_sleep, fp);
4755                 break;
4756         case POSIX_FADV_WILLNEED:
4757         case POSIX_FADV_DONTNEED:
4758                 error = VOP_ADVISE(vp, offset, end, advice);
4759                 break;
4760         }
4761 out:
4762         if (fp != NULL)
4763                 fdrop(fp, td);
4764         free(new, M_FADVISE);
4765         return (error);
4766 }
4767
4768 int
4769 sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
4770 {
4771
4772         td->td_retval[0] = kern_posix_fadvise(td, uap->fd, uap->offset,
4773             uap->len, uap->advice);
4774         return (0);
4775 }