sys/kern/vfs_vnops.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1982, 1986, 1989, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  * (c) UNIX System Laboratories, Inc.
   7  * All or some portions of this file are derived from material licensed
   8  * to the University of California by American Telephone and Telegraph
   9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  10  * the permission of UNIX System Laboratories, Inc.
  11  *
  12  * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
  13  * Copyright (c) 2013, 2014 The FreeBSD Foundation
  14  *
  15  * Portions of this software were developed by Konstantin Belousov
  16  * under sponsorship from the FreeBSD Foundation.
  17  *
  18  * Redistribution and use in source and binary forms, with or without
  19  * modification, are permitted provided that the following conditions
  20  * are met:
  21  * 1. Redistributions of source code must retain the above copyright
  22  *    notice, this list of conditions and the following disclaimer.
  23  * 2. Redistributions in binary form must reproduce the above copyright
  24  *    notice, this list of conditions and the following disclaimer in the
  25  *    documentation and/or other materials provided with the distribution.
  26  * 3. Neither the name of the University nor the names of its contributors
  27  *    may be used to endorse or promote products derived from this software
  28  *    without specific prior written permission.
  29  *
  30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  40  * SUCH DAMAGE.
  41  *
  42  *      @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
  43  */
  44
  45 #include <sys/cdefs.h>
  46 __FBSDID("$FreeBSD$");
  47
  48 #include "opt_hwpmc_hooks.h"
  49
  50 #include <sys/param.h>
  51 #include <sys/systm.h>
  52 #include <sys/disk.h>
  53 #include <sys/fail.h>
  54 #include <sys/fcntl.h>
  55 #include <sys/file.h>
  56 #include <sys/kdb.h>
  57 #include <sys/ktr.h>
  58 #include <sys/stat.h>
  59 #include <sys/priv.h>
  60 #include <sys/proc.h>
  61 #include <sys/limits.h>
  62 #include <sys/lock.h>
  63 #include <sys/mman.h>
  64 #include <sys/mount.h>
  65 #include <sys/mutex.h>
  66 #include <sys/namei.h>
  67 #include <sys/vnode.h>
  68 #include <sys/bio.h>
  69 #include <sys/buf.h>
  70 #include <sys/filio.h>
  71 #include <sys/resourcevar.h>
  72 #include <sys/rwlock.h>
  73 #include <sys/prng.h>
  74 #include <sys/sx.h>
  75 #include <sys/sleepqueue.h>
  76 #include <sys/sysctl.h>
  77 #include <sys/ttycom.h>
  78 #include <sys/conf.h>
  79 #include <sys/syslog.h>
  80 #include <sys/unistd.h>
  81 #include <sys/user.h>
  82
  83 #include <security/audit/audit.h>
  84 #include <security/mac/mac_framework.h>
  85
  86 #include <vm/vm.h>
  87 #include <vm/vm_extern.h>
  88 #include <vm/pmap.h>
  89 #include <vm/vm_map.h>
  90 #include <vm/vm_object.h>
  91 #include <vm/vm_page.h>
  92 #include <vm/vm_pager.h>
  93
  94 #ifdef HWPMC_HOOKS
  95 #include <sys/pmckern.h>
  96 #endif
  97
   /*
    * Forward declarations of the static file-operation handlers, each typed
    * with the matching fo_*_t function typedef.
    */
   98 static fo_rdwr_t        vn_read;
   99 static fo_rdwr_t        vn_write;
  100 static fo_rdwr_t        vn_io_fault;
  101 static fo_truncate_t    vn_truncate;
  102 static fo_ioctl_t       vn_ioctl;
  103 static fo_poll_t        vn_poll;
  104 static fo_kqfilter_t    vn_kqfilter;
  105 static fo_close_t       vn_closefile;
  106 static fo_mmap_t        vn_mmap;
  107 static fo_fallocate_t   vn_fallocate;
  108
      /*
       * File operations vector for vnode-backed files.  vn_open_vnode()
       * installs it (together with DTYPE_VNODE) on a file whose f_ops is
       * still badfileops when an open has to be unwound through fdrop().
       *
       * Both fo_read and fo_write are dispatched through vn_io_fault rather
       * than directly to vn_read/vn_write.
       */
  109 struct  fileops vnops = {
  110         .fo_read = vn_io_fault,
  111         .fo_write = vn_io_fault,
  112         .fo_truncate = vn_truncate,
  113         .fo_ioctl = vn_ioctl,
  114         .fo_poll = vn_poll,
  115         .fo_kqfilter = vn_kqfilter,
  116         .fo_stat = vn_statfile,
  117         .fo_close = vn_closefile,
  118         .fo_chmod = vn_chmod,
  119         .fo_chown = vn_chown,
  120         .fo_sendfile = vn_sendfile,
  121         .fo_seek = vn_seek,
  122         .fo_fill_kinfo = vn_fill_kinfo,
  123         .fo_mmap = vn_mmap,
  124         .fo_fallocate = vn_fallocate,
  125         .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
  126 };
 127
  /*
   * Knobs and statistics for the vn_io_fault machinery, registered under
   * the _debug sysctl node, followed by the allow_read_dir switch under
   * _security_bsd.  The description strings passed to the SYSCTL_* macros
   * summarize each variable.
   *
   * NOTE(review): io_hold_cnt is not referenced in the visible part of
   * this file; check its consumers before changing the value.
   */
  128 const u_int io_hold_cnt = 16;
      /* Consulted by do_vn_io_fault(); zero disables vn_io_fault mode. */
  129 static int vn_io_fault_enable = 1;
  130 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN,
  131     &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
  132 static int vn_io_fault_prefault = 0;
  133 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN,
  134     &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
  135 static int vn_io_pgcache_read_enable = 1;
  136 SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN,
  137     &vn_io_pgcache_read_enable, 0,
  138     "Enable copying from page cache for reads, avoiding fs");
      /* Exported with CTLFLAG_RD only: a counter, not a tunable. */
  139 static u_long vn_io_faults_cnt;
  140 SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
  141     &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
  142
      /* Off by default; registered with CTLFLAG_RW (no TUN flag). */
  143 static int vfs_allow_read_dir = 0;
  144 SYSCTL_INT(_security_bsd, OID_AUTO, allow_read_dir, CTLFLAG_RW,
  145     &vfs_allow_read_dir, 0,
  146     "Enable read(2) of directory by root for filesystems that support it");
 147
 148 /*
 149  * Returns true if vn_io_fault mode of handling the i/o request should
 150  * be used.
 151  */
 152 static bool
 153 do_vn_io_fault(struct vnode *vp, struct uio *uio)
 154 {
 155         struct mount *mp;
 156
 157         return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
 158             (mp = vp->v_mount) != NULL &&
 159             (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
 160 }
 161
  162 /*
  163  * Structure used to pass arguments to vn_io_fault1(), to do either
  164  * file- or vnode-based I/O calls.
       *
       * vn_rdwr() fills in the vnode flavour: kind = VN_IO_FAULT_VOP, cred,
       * flags (its ioflg) and args.vop_args.vp.
       *
       * NOTE(review): the VN_IO_FAULT_FOP flavour presumably pairs with
       * args.fop_args (file plus the fo_rdwr_t to invoke); its producer and
       * the consumer are outside the visible part of the file -- confirm.
  165  */
  166 struct vn_io_fault_args {
  167         enum {
  168                 VN_IO_FAULT_FOP,
  169                 VN_IO_FAULT_VOP
  170         } kind;
              /* Credentials and I/O flags handed along with the request. */
  171         struct ucred *cred;
  172         int flags;
  173         union {
  174                 struct fop_args_tag {
  175                         struct file *fp;
  176                         fo_rdwr_t *doio;
  177                 } fop_args;
  178                 struct vop_args_tag {
  179                         struct vnode *vp;
  180                 } vop_args;
  181         } args;
  182 };
  183
      /* Forward declaration; called from vn_rdwr() before its definition. */
  184 static int vn_io_fault1(struct vnode *vp, struct uio *uio,
  185     struct vn_io_fault_args *args, struct thread *td);
 186
 187 int
 188 vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp)
 189 {
 190         struct thread *td = ndp->ni_cnd.cn_thread;
 191
 192         return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
 193 }
 194
 195 static uint64_t
 196 open2nameif(int fmode, u_int vn_open_flags)
 197 {
 198         uint64_t res;
 199
 200         res = ISOPEN | LOCKLEAF;
 201         if ((fmode & O_RESOLVE_BENEATH) != 0)
 202                 res |= RBENEATH;
 203         if ((fmode & O_EMPTY_PATH) != 0)
 204                 res |= EMPTYPATH;
 205         if ((vn_open_flags & VN_OPEN_NOAUDIT) == 0)
 206                 res |= AUDITVNODE1;
 207         if ((vn_open_flags & VN_OPEN_NOCAPCHECK) != 0)
 208                 res |= NOCAPCHECK;
 209         return (res);
 210 }
 211
  212 /*
  213  * Common code for vnode open operations via a name lookup.
  214  * Lookup the vnode and invoke VOP_CREATE if needed.
  215  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
  216  *
  217  * Note that this does NOT free nameidata for the successful case,
  218  * due to the NDINIT being done elsewhere.
       *
       * Parameters:
       *   ndp            nameidata prepared by the caller; cn_nameiop and
       *                  cn_flags are overwritten here before namei().
       *   flagp          in/out open mode.  Read at every (re)start; written
       *                  back with the possibly reduced mode (O_TRUNC or
       *                  O_CREAT cleared) on success and on the `bad' path.
       *                  Early error returns leave it untouched.
       *   cmode          mode for a newly created file (stored in va_mode).
       *   vn_open_flags  VN_OPEN_* modifiers (NOAUDIT, NOCAPCHECK, INVFS,
       *                  NAMECACHE are examined here or in open2nameif()).
       *   cred           used for the MAC create check and passed on to
       *                  vn_open_vnode().
       *   fp             passed through to vn_open_vnode().
       *
       * Returns 0 on success, otherwise an errno value.  On the `bad' path
       * the vnode is released with vput() and ndp->ni_vp is cleared.
  219  */
  220 int
  221 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
  222     struct ucred *cred, struct file *fp)
  223 {
  224         struct vnode *vp;
  225         struct mount *mp;
  226         struct thread *td = ndp->ni_cnd.cn_thread;
  227         struct vattr vat;
  228         struct vattr *vap = &vat;
  229         int fmode, error;
  230         bool first_open;
  231
              /*
               * Re-entered after sleeping for a write-suspended filesystem
               * and after VOP_CREATE() returned ERELOOKUP; all per-attempt
               * state is recomputed from *flagp.
               */
  232 restart:
  233         first_open = false;
  234         fmode = *flagp;
              /*
               * Reject O_CREAT|O_EXCL|O_DIRECTORY and O_CREAT|O_EMPTY_PATH.
               * O_CREAT without O_DIRECTORY takes the create path; anything
               * else is a plain lookup.
               */
  235         if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT |
  236             O_EXCL | O_DIRECTORY) ||
  237             (fmode & (O_CREAT | O_EMPTY_PATH)) == (O_CREAT | O_EMPTY_PATH))
  238                 return (EINVAL);
  239         else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) {
  240                 ndp->ni_cnd.cn_nameiop = CREATE;
  241                 ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
  242                 /*
  243                  * Set NOCACHE to avoid flushing the cache when
  244                  * rolling in many files at once.
  245                  *
  246                  * Set NC_KEEPPOSENTRY to keep positive entries if they already
  247                  * exist despite NOCACHE.
  248                  */
  249                 ndp->ni_cnd.cn_flags |= LOCKPARENT | NOCACHE | NC_KEEPPOSENTRY;
                      /* Symlinks are followed only without O_EXCL and O_NOFOLLOW. */
  250                 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
  251                         ndp->ni_cnd.cn_flags |= FOLLOW;
  252                 if ((vn_open_flags & VN_OPEN_INVFS) == 0)
  253                         bwillwrite();
  254                 if ((error = namei(ndp)) != 0)
  255                         return (error);
  256                 if (ndp->ni_vp == NULL) {
                              /* Target does not exist: create a regular file. */
  257                         VATTR_NULL(vap);
  258                         vap->va_type = VREG;
  259                         vap->va_mode = cmode;
  260                         if (fmode & O_EXCL)
  261                                 vap->va_vaflags |= VA_EXCLUSIVE;
                              /*
                               * If write access cannot be started without
                               * sleeping, drop the path buffer and the parent
                               * directory, sleep (interruptibly, PCATCH) via
                               * V_XSLEEP, and redo the lookup from scratch.
                               */
  262                         if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
  263                                 NDFREE(ndp, NDF_ONLY_PNBUF);
  264                                 vput(ndp->ni_dvp);
  265                                 if ((error = vn_start_write(NULL, &mp,
  266                                     V_XSLEEP | PCATCH)) != 0)
  267                                         return (error);
  268                                 NDREINIT(ndp);
  269                                 goto restart;
  270                         }
  271                         if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
  272                                 ndp->ni_cnd.cn_flags |= MAKEENTRY;
                              /*
                               * With MAC compiled in, VOP_CREATE() below is the
                               * body of the `if (error == 0)' and only runs
                               * when the MAC check passes.
                               */
  273 #ifdef MAC
  274                         error = mac_vnode_check_create(cred, ndp->ni_dvp,
  275                             &ndp->ni_cnd, vap);
  276                         if (error == 0)
  277 #endif
  278                                 error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
  279                                     &ndp->ni_cnd, vap);
  280                         vp = ndp->ni_vp;
                              /*
                               * Exclusive create combined with O_EXLOCK or
                               * O_SHLOCK: flag the vnode VI_FOPENING until
                               * vn_open_vnode() has run; the flag is cleared
                               * and waiters woken below.
                               */
  281                         if (error == 0 && (fmode & O_EXCL) != 0 &&
  282                             (fmode & (O_EXLOCK | O_SHLOCK)) != 0) {
  283                                 VI_LOCK(vp);
  284                                 vp->v_iflag |= VI_FOPENING;
  285                                 VI_UNLOCK(vp);
  286                                 first_open = true;
  287                         }
  288                         VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &vp : NULL,
  289                             false);
  290                         vn_finished_write(mp);
  291                         if (error) {
  292                                 NDFREE(ndp, NDF_ONLY_PNBUF);
  293                                 if (error == ERELOOKUP) {
  294                                         NDREINIT(ndp);
  295                                         goto restart;
  296                                 }
  297                                 return (error);
  298                         }
                              /* The file was just created; O_TRUNC is dropped. */
  299                         fmode &= ~O_TRUNC;
  300                 } else {
                              /*
                               * Target already exists.  Release the parent:
                               * vrele() when it is the same vnode as the
                               * target, vput() otherwise.
                               */
  301                         if (ndp->ni_dvp == ndp->ni_vp)
  302                                 vrele(ndp->ni_dvp);
  303                         else
  304                                 vput(ndp->ni_dvp);
  305                         ndp->ni_dvp = NULL;
  306                         vp = ndp->ni_vp;
  307                         if (fmode & O_EXCL) {
  308                                 error = EEXIST;
  309                                 goto bad;
  310                         }
  311                         if (vp->v_type == VDIR) {
  312                                 error = EISDIR;
  313                                 goto bad;
  314                         }
                              /* Nothing was created; vn_open_vnode() sees no O_CREAT. */
  315                         fmode &= ~O_CREAT;
  316                 }
  317         } else {
                      /* Plain lookup; the leaf is locked shared unless opening for write. */
  318                 ndp->ni_cnd.cn_nameiop = LOOKUP;
  319                 ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
  320                 ndp->ni_cnd.cn_flags |= (fmode & O_NOFOLLOW) != 0 ? NOFOLLOW :
  321                     FOLLOW;
  322                 if ((fmode & FWRITE) == 0)
  323                         ndp->ni_cnd.cn_flags |= LOCKSHARED;
  324                 if ((error = namei(ndp)) != 0)
  325                         return (error);
  326                 vp = ndp->ni_vp;
  327         }
  328         error = vn_open_vnode(vp, fmode, cred, td, fp);
              /* End of the VI_FOPENING window, whether or not the open worked. */
  329         if (first_open) {
  330                 VI_LOCK(vp);
  331                 vp->v_iflag &= ~VI_FOPENING;
  332                 wakeup(vp);
  333                 VI_UNLOCK(vp);
  334         }
  335         if (error)
  336                 goto bad;
  337         *flagp = fmode;
  338         return (0);
              /* Failure with vp in hand: free the path buffer and drop the vnode. */
  339 bad:
  340         NDFREE(ndp, NDF_ONLY_PNBUF);
  341         vput(vp);
  342         *flagp = fmode;
  343         ndp->ni_vp = NULL;
  344         return (error);
  345 }
 346
 347 static int
 348 vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp)
 349 {
 350         struct flock lf;
 351         int error, lock_flags, type;
 352
 353         ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock");
 354         if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0)
 355                 return (0);
 356         KASSERT(fp != NULL, ("open with flock requires fp"));
 357         if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE)
 358                 return (EOPNOTSUPP);
 359
 360         lock_flags = VOP_ISLOCKED(vp);
 361         VOP_UNLOCK(vp);
 362
 363         lf.l_whence = SEEK_SET;
 364         lf.l_start = 0;
 365         lf.l_len = 0;
 366         lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK;
 367         type = F_FLOCK;
 368         if ((fmode & FNONBLOCK) == 0)
 369                 type |= F_WAIT;
 370         if ((fmode & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
 371                 type |= F_FIRSTOPEN;
 372         error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
 373         if (error == 0)
 374                 fp->f_flag |= FHASLOCK;
 375
 376         vn_lock(vp, lock_flags | LK_RETRY);
 377         return (error);
 378 }
 379
  380 /*
  381  * Common code for vnode open operations once a vnode is located.
  382  * Check permissions, and call the VOP_OPEN routine.
       *
       * Parameters:
       *   vp     vnode to open; asserted locked on the normal exit path.
       *   fmode  open mode bits (FREAD/FWRITE/FEXEC and O_* flags).
       *   cred   credentials for the MAC check, VOP_ACCESS() and VOP_OPEN().
       *   td     calling thread.
       *   fp     file being opened; may be NULL for a kernel-mode open (see
       *          the error unwinding at the end).
       *
       * Returns 0 on success, otherwise an errno value.  When an error occurs
       * after VOP_OPEN() succeeded, the open is undone either immediately
       * (no fp) or later through fdrop() (fp present).
       *
       * NOTE(review): the O_PATH branch dereferences fp without a NULL check;
       * callers using O_PATH are assumed to always supply a file -- confirm.
  383  */
  384 int
  385 vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
  386     struct thread *td, struct file *fp)
  387 {
  388         accmode_t accmode;
  389         int error;
  390
              /* A symlink may only be opened with O_PATH and without FEXEC. */
  391         if (vp->v_type == VLNK) {
  392                 if ((fmode & O_PATH) == 0 || (fmode & FEXEC) != 0)
  393                         return (EMLINK);
  394         }
  395         if (vp->v_type == VSOCK)
  396                 return (EOPNOTSUPP);
  397         if (vp->v_type != VDIR && fmode & O_DIRECTORY)
  398                 return (ENOTDIR);
  399
              /*
               * Build the access mask.  For O_PATH opens only VEXEC (and,
               * under MAC, VVERIFY) can end up in it.
               */
  400         accmode = 0;
  401         if ((fmode & O_PATH) == 0) {
  402                 if ((fmode & (FWRITE | O_TRUNC)) != 0) {
  403                         if (vp->v_type == VDIR)
  404                                 return (EISDIR);
  405                         accmode |= VWRITE;
  406                 }
  407                 if ((fmode & FREAD) != 0)
  408                         accmode |= VREAD;
  409                 if ((fmode & O_APPEND) && (fmode & FWRITE))
  410                         accmode |= VAPPEND;
  411 #ifdef MAC
  412                 if ((fmode & O_CREAT) != 0)
  413                         accmode |= VCREAT;
  414 #endif
  415         }
  416         if ((fmode & FEXEC) != 0)
  417                 accmode |= VEXEC;
  418 #ifdef MAC
  419         if ((fmode & O_VERIFY) != 0)
  420                 accmode |= VVERIFY;
  421         error = mac_vnode_check_open(cred, vp, accmode);
  422         if (error != 0)
  423                 return (error);
  424
              /* VCREAT and VVERIFY are for the MAC check only. */
  425         accmode &= ~(VCREAT | VVERIFY);
  426 #endif
              /*
               * The access check is skipped while O_CREAT is still set.
               * vn_open_cred() clears O_CREAT when the target already existed,
               * so there O_CREAT survives only for a file it just created.
               */
  427         if ((fmode & O_CREAT) == 0 && accmode != 0) {
  428                 error = VOP_ACCESS(vp, accmode, cred, td);
  429                 if (error != 0)
  430                         return (error);
  431         }
              /*
               * O_PATH open: VOP_OPEN() is not called.  The EPIPE/VOP_ACCESS
               * probe only decides whether FKQALLOWED is set on the file; the
               * function returns 0 either way.
               */
  432         if ((fmode & O_PATH) != 0) {
  433                 if (vp->v_type == VFIFO)
  434                         error = EPIPE;
  435                 else
  436                         error = VOP_ACCESS(vp, VREAD, cred, td);
  437                 if (error == 0)
  438                         fp->f_flag |= FKQALLOWED;
  439                 return (0);
  440         }
  441
              /* FIFOs are opened with the vnode lock held exclusively. */
  442         if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
  443                 vn_lock(vp, LK_UPGRADE | LK_RETRY);
  444         error = VOP_OPEN(vp, fmode, cred, td, fp);
  445         if (error != 0)
  446                 return (error);
  447
              /* The write count is only bumped if the advisory lock succeeded. */
  448         error = vn_open_vnode_advlock(vp, fmode, fp);
  449         if (error == 0 && (fmode & FWRITE) != 0) {
  450                 error = VOP_ADD_WRITECOUNT(vp, 1);
  451                 if (error == 0) {
  452                         CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
  453                              __func__, vp, vp->v_writecount);
  454                 }
  455         }
  456
  457         /*
  458          * Error from advlock or VOP_ADD_WRITECOUNT() still requires
  459          * calling VOP_CLOSE() to pair with earlier VOP_OPEN().
  460          */
  461         if (error != 0) {
  462                 if (fp != NULL) {
  463                         /*
  464                          * Arrange the call by having fdrop() to use
  465                          * vn_closefile().  This is to satisfy
  466                          * filesystems like devfs or tmpfs, which
  467                          * override fo_close().
  468                          */
  469                         fp->f_flag |= FOPENFAILED;
  470                         fp->f_vnode = vp;
  471                         if (fp->f_ops == &badfileops) {
  472                                 fp->f_type = DTYPE_VNODE;
  473                                 fp->f_ops = &vnops;
  474                         }
                              /* Extra reference for the file that now points at vp. */
  475                         vref(vp);
  476                 } else {
  477                         /*
  478                          * If there is no fp, due to kernel-mode open,
  479                          * we can call VOP_CLOSE() now.
  480                          */
  481                         if (vp->v_type != VFIFO && (fmode & FWRITE) != 0 &&
  482                             !MNT_EXTENDED_SHARED(vp->v_mount) &&
  483                             VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
  484                                 vn_lock(vp, LK_UPGRADE | LK_RETRY);
  485                         (void)VOP_CLOSE(vp, fmode & (FREAD | FWRITE | FEXEC),
  486                             cred, td);
  487                 }
  488         }
  489
  490         ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
  491         return (error);
  492
  493 }
 494
 495 /*
 496  * Check for write permissions on the specified vnode.
 497  * Prototype text segments cannot be written.
 498  * It is racy.
 499  */
 500 int
 501 vn_writechk(struct vnode *vp)
 502 {
 503
 504         ASSERT_VOP_LOCKED(vp, "vn_writechk");
 505         /*
 506          * If there's shared text associated with
 507          * the vnode, try to free it up once.  If
 508          * we fail, we can't allow writing.
 509          */
 510         if (VOP_IS_TEXT(vp))
 511                 return (ETXTBSY);
 512
 513         return (0);
 514 }
 515
 516 /*
 517  * Vnode close call
 518  */
 519 static int
 520 vn_close1(struct vnode *vp, int flags, struct ucred *file_cred,
 521     struct thread *td, bool keep_ref)
 522 {
 523         struct mount *mp;
 524         int error, lock_flags;
 525
 526         if (vp->v_type != VFIFO && (flags & FWRITE) == 0 &&
 527             MNT_EXTENDED_SHARED(vp->v_mount))
 528                 lock_flags = LK_SHARED;
 529         else
 530                 lock_flags = LK_EXCLUSIVE;
 531
 532         vn_start_write(vp, &mp, V_WAIT);
 533         vn_lock(vp, lock_flags | LK_RETRY);
 534         AUDIT_ARG_VNODE1(vp);
 535         if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) {
 536                 VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
 537                 CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
 538                     __func__, vp, vp->v_writecount);
 539         }
 540         error = VOP_CLOSE(vp, flags, file_cred, td);
 541         if (keep_ref)
 542                 VOP_UNLOCK(vp);
 543         else
 544                 vput(vp);
 545         vn_finished_write(mp);
 546         return (error);
 547 }
 548
 549 int
 550 vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
 551     struct thread *td)
 552 {
 553
 554         return (vn_close1(vp, flags, file_cred, td, false));
 555 }
 556
 557 /*
 558  * Heuristic to detect sequential operation.
 559  */
 560 static int
 561 sequential_heuristic(struct uio *uio, struct file *fp)
 562 {
 563         enum uio_rw rw;
 564
 565         ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
 566
 567         rw = uio->uio_rw;
 568         if (fp->f_flag & FRDAHEAD)
 569                 return (fp->f_seqcount[rw] << IO_SEQSHIFT);
 570
 571         /*
 572          * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
 573          * that the first I/O is normally considered to be slightly
 574          * sequential.  Seeking to offset 0 doesn't change sequentiality
 575          * unless previous seeks have reduced f_seqcount to 0, in which
 576          * case offset 0 is not special.
 577          */
 578         if ((uio->uio_offset == 0 && fp->f_seqcount[rw] > 0) ||
 579             uio->uio_offset == fp->f_nextoff[rw]) {
 580                 /*
 581                  * f_seqcount is in units of fixed-size blocks so that it
 582                  * depends mainly on the amount of sequential I/O and not
 583                  * much on the number of sequential I/O's.  The fixed size
 584                  * of 16384 is hard-coded here since it is (not quite) just
 585                  * a magic size that works well here.  This size is more
 586                  * closely related to the best I/O size for real disks than
 587                  * to any block size used by software.
 588                  */
 589                 if (uio->uio_resid >= IO_SEQMAX * 16384)
 590                         fp->f_seqcount[rw] = IO_SEQMAX;
 591                 else {
 592                         fp->f_seqcount[rw] += howmany(uio->uio_resid, 16384);
 593                         if (fp->f_seqcount[rw] > IO_SEQMAX)
 594                                 fp->f_seqcount[rw] = IO_SEQMAX;
 595                 }
 596                 return (fp->f_seqcount[rw] << IO_SEQSHIFT);
 597         }
 598
 599         /* Not sequential.  Quickly draw-down sequentiality. */
 600         if (fp->f_seqcount[rw] > 1)
 601                 fp->f_seqcount[rw] = 1;
 602         else
 603                 fp->f_seqcount[rw] = 0;
 604         return (0);
 605 }
 606
  607 /*
  608  * Package up an I/O request on a vnode into a uio and do it.
       *
       * Parameters:
       *   rw           UIO_READ or UIO_WRITE.
       *   vp           target vnode; must already be locked when ioflg has
       *                IO_NODELOCKED, otherwise it is locked and unlocked here.
       *   base, len    buffer and byte count (single iovec).
       *   offset       file offset; negative values are only accepted for
       *                VCHR vnodes.
       *   segflg       address space of base.
       *   ioflg        IO_* flags; IO_NODELOCKED, IO_RANGELOCKED, IO_APPEND and
       *                IO_NOMACCHECK are interpreted here, and the whole value
       *                is passed on to the VOP.
       *   active_cred  used for the MAC check and, if file_cred is NULL, for
       *                the I/O itself.
       *   file_cred    preferred credential for the I/O when non-NULL.
       *   aresid       if non-NULL, receives the residual byte count; if NULL,
       *                an otherwise successful short transfer becomes EIO.
       *   td           calling thread.
       *
       * Returns 0 or an errno value.
  609  */
  610 int
  611 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
  612     enum uio_seg segflg, int ioflg, struct ucred *active_cred,
  613     struct ucred *file_cred, ssize_t *aresid, struct thread *td)
  614 {
  615         struct uio auio;
  616         struct iovec aiov;
  617         struct mount *mp;
  618         struct ucred *cred;
  619         void *rl_cookie;
  620         struct vn_io_fault_args args;
  621         int error, lock_flags;
  622
  623         if (offset < 0 && vp->v_type != VCHR)
  624                 return (EINVAL);
  625         auio.uio_iov = &aiov;
  626         auio.uio_iovcnt = 1;
  627         aiov.iov_base = base;
  628         aiov.iov_len = len;
  629         auio.uio_resid = len;
  630         auio.uio_offset = offset;
  631         auio.uio_segflg = segflg;
  632         auio.uio_rw = rw;
  633         auio.uio_td = td;
  634         error = 0;
  635
              /*
               * Caller does not hold the vnode lock: take the range lock
               * (unless IO_RANGELOCKED), start the write if needed, then lock
               * the vnode.  Note that mp is only assigned inside this branch
               * and only read in the matching branch after the I/O.
               */
  636         if ((ioflg & IO_NODELOCKED) == 0) {
  637                 if ((ioflg & IO_RANGELOCKED) == 0) {
                              /*
                               * Reads lock [offset, offset + len) for reading;
                               * appends write-lock [0, OFF_MAX); other writes
                               * write-lock just the target range.
                               */
  638                         if (rw == UIO_READ) {
  639                                 rl_cookie = vn_rangelock_rlock(vp, offset,
  640                                     offset + len);
  641                         } else if ((ioflg & IO_APPEND) != 0) {
  642                                 rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
  643                         } else {
  644                                 rl_cookie = vn_rangelock_wlock(vp, offset,
  645                                     offset + len);
  646                         }
  647                 } else
  648                         rl_cookie = NULL;
  649                 mp = NULL;
  650                 if (rw == UIO_WRITE) {
                              /*
                               * vn_start_write() is skipped for VCHR.  On
                               * failure the vnode has not been locked yet, so
                               * `out' only has the range lock to release.
                               */
  651                         if (vp->v_type != VCHR &&
  652                             (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
  653                             != 0)
  654                                 goto out;
  655                         if (MNT_SHARED_WRITES(mp) ||
  656                             ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
  657                                 lock_flags = LK_SHARED;
  658                         else
  659                                 lock_flags = LK_EXCLUSIVE;
  660                 } else
  661                         lock_flags = LK_SHARED;
  662                 vn_lock(vp, lock_flags | LK_RETRY);
  663         } else
  664                 rl_cookie = NULL;
  665
  666         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
  667 #ifdef MAC
  668         if ((ioflg & IO_NOMACCHECK) == 0) {
  669                 if (rw == UIO_READ)
  670                         error = mac_vnode_check_read(active_cred, file_cred,
  671                             vp);
  672                 else
  673                         error = mac_vnode_check_write(active_cred, file_cred,
  674                             vp);
  675         }
  676 #endif
  677         if (error == 0) {
  678                 if (file_cred != NULL)
  679                         cred = file_cred;
  680                 else
  681                         cred = active_cred;
                      /*
                       * Requests that qualify for vn_io_fault mode go through
                       * vn_io_fault1() in its vnode flavour; everything else
                       * calls the VOP directly.
                       */
  682                 if (do_vn_io_fault(vp, &auio)) {
  683                         args.kind = VN_IO_FAULT_VOP;
  684                         args.cred = cred;
  685                         args.flags = ioflg;
  686                         args.args.vop_args.vp = vp;
  687                         error = vn_io_fault1(vp, &auio, &args, td);
  688                 } else if (rw == UIO_READ) {
  689                         error = VOP_READ(vp, &auio, ioflg, cred);
  690                 } else /* if (rw == UIO_WRITE) */ {
  691                         error = VOP_WRITE(vp, &auio, ioflg, cred);
  692                 }
  693         }
              /* Without aresid a short transfer cannot be reported, so fail it. */
  694         if (aresid)
  695                 *aresid = auio.uio_resid;
  696         else
  697                 if (auio.uio_resid && error == 0)
  698                         error = EIO;
  699         if ((ioflg & IO_NODELOCKED) == 0) {
  700                 VOP_UNLOCK(vp);
  701                 if (mp != NULL)
  702                         vn_finished_write(mp);
  703         }
  704  out:
  705         if (rl_cookie != NULL)
  706                 vn_rangelock_unlock(vp, rl_cookie);
  707         return (error);
  708 }
 709
 710 /*
 711  * Package up an I/O request on a vnode into a uio and do it.  The I/O
 712  * request is split up into smaller chunks and we try to avoid saturating
 713  * the buffer cache while potentially holding a vnode locked, so we
 714  * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
 715  * to give other processes a chance to lock the vnode (either other processes
 716  * core'ing the same binary, or unrelated processes scanning the directory).
 717  */
 718 int
 719 vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len,
 720     off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred,
 721     struct ucred *file_cred, size_t *aresid, struct thread *td)
 722 {
 723         int error = 0;
 724         ssize_t iaresid;
 725
 726         do {
 727                 int chunk;
 728
 729                 /*
 730                  * Force `offset' to a multiple of MAXBSIZE except possibly
 731                  * for the first chunk, so that filesystems only need to
 732                  * write full blocks except possibly for the first and last
 733                  * chunks.
 734                  */
 735                 chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
 736
 737                 if (chunk > len)
 738                         chunk = len;
 739                 if (rw != UIO_READ && vp->v_type == VREG)
 740                         bwillwrite();
 741                 iaresid = 0;
 742                 error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
 743                     ioflg, active_cred, file_cred, &iaresid, td);
 744                 len -= chunk;   /* aresid calc already includes length */
 745                 if (error)
 746                         break;
 747                 offset += chunk;
 748                 base = (char *)base + chunk;
 749                 kern_yield(PRI_USER);
 750         } while (len);
 751         if (aresid)
 752                 *aresid = len + iaresid;
 753         return (error);
 754 }
 755
 756 #if OFF_MAX <= LONG_MAX
  /*
   * Lock the file offset and return its current value (variant built when
   * OFF_MAX <= LONG_MAX, so f_offset can be accessed with the atomic *_long
   * primitives).
   *
   * flags must not contain FOF_OFFSET.  With FOF_NOLOCK the offset is read
   * and returned without taking the lock.  Otherwise FOFFSET_LOCKED is set
   * in fp->f_vnread_flags, sleeping on the sleepqueue keyed by
   * &fp->f_vnread_flags while another thread holds it.
   */
  757 off_t
  758 foffset_lock(struct file *fp, int flags)
  759 {
  760         volatile short *flagsp;
  761         off_t res;
  762         short state;
  763
  764         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
  765
  766         if ((flags & FOF_NOLOCK) != 0)
  767                 return (atomic_load_long(&fp->f_offset));
  768
  769         /*
  770          * According to McKusick the vn lock was protecting f_offset here.
  771          * It is now protected by the FOFFSET_LOCKED flag.
  772          */
  773         flagsp = &fp->f_vnread_flags;
              /* Fast path: the flags word is 0 (unlocked, nobody waiting). */
  774         if (atomic_cmpset_acq_16(flagsp, 0, FOFFSET_LOCKED))
  775                 return (atomic_load_long(&fp->f_offset));
  776
              /*
               * Slow path, run under the sleepqueue chain lock.  The loop
               * retries after a failed fcmpset without reloading `state':
               * atomic_fcmpset_*() is expected to store the observed value
               * into `state' on failure (see atomic(9)).
               */
  777         sleepq_lock(&fp->f_vnread_flags);
  778         state = atomic_load_16(flagsp);
  779         for (;;) {
  780                 if ((state & FOFFSET_LOCKED) == 0) {
  781                         if (!atomic_fcmpset_acq_16(flagsp, &state,
  782                             FOFFSET_LOCKED))
  783                                 continue;
                              /* Lock acquired; leave with the sleepq lock still held. */
  784                         break;
  785                 }
                      /* Advertise a waiter so that the unlocker takes its slow path. */
  786                 if ((state & FOFFSET_LOCK_WAITING) == 0) {
  787                         if (!atomic_fcmpset_acq_16(flagsp, &state,
  788                             state | FOFFSET_LOCK_WAITING))
  789                                 continue;
  790                 }
  791                 DROP_GIANT();
  792                 sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0);
  793                 sleepq_wait(&fp->f_vnread_flags, PUSER -1);
  794                 PICKUP_GIANT();
                      /* Woken up: retake the sleepq lock and re-examine the flags. */
  795                 sleepq_lock(&fp->f_vnread_flags);
  796                 state = atomic_load_16(flagsp);
  797         }
  798         res = atomic_load_long(&fp->f_offset);
  799         sleepq_release(&fp->f_vnread_flags);
  800         return (res);
  801 }
 802
 803 void
 804 foffset_unlock(struct file *fp, off_t val, int flags)
 805 {
 806         volatile short *flagsp;
 807         short state;
 808
 809         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 810
 811         if ((flags & FOF_NOUPDATE) == 0)
 812                 atomic_store_long(&fp->f_offset, val);
 813         if ((flags & FOF_NEXTOFF_R) != 0)
 814                 fp->f_nextoff[UIO_READ] = val;
 815         if ((flags & FOF_NEXTOFF_W) != 0)
 816                 fp->f_nextoff[UIO_WRITE] = val;
 817
 818         if ((flags & FOF_NOLOCK) != 0)
 819                 return;
 820
 821         flagsp = &fp->f_vnread_flags;
 822         state = atomic_load_16(flagsp);
 823         if ((state & FOFFSET_LOCK_WAITING) == 0 &&
 824             atomic_cmpset_rel_16(flagsp, state, 0))
 825                 return;
 826
 827         sleepq_lock(&fp->f_vnread_flags);
 828         MPASS((fp->f_vnread_flags & FOFFSET_LOCKED) != 0);
 829         MPASS((fp->f_vnread_flags & FOFFSET_LOCK_WAITING) != 0);
 830         fp->f_vnread_flags = 0;
 831         sleepq_broadcast(&fp->f_vnread_flags, SLEEPQ_SLEEP, 0, 0);
 832         sleepq_release(&fp->f_vnread_flags);
 833 }
 834 #else
 835 off_t
 836 foffset_lock(struct file *fp, int flags)
 837 {
 838         struct mtx *mtxp;
 839         off_t res;
 840
 841         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 842
 843         mtxp = mtx_pool_find(mtxpool_sleep, fp);
 844         mtx_lock(mtxp);
 845         if ((flags & FOF_NOLOCK) == 0) {
 846                 while (fp->f_vnread_flags & FOFFSET_LOCKED) {
 847                         fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
 848                         msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
 849                             "vofflock", 0);
 850                 }
 851                 fp->f_vnread_flags |= FOFFSET_LOCKED;
 852         }
 853         res = fp->f_offset;
 854         mtx_unlock(mtxp);
 855         return (res);
 856 }
 857
 858 void
 859 foffset_unlock(struct file *fp, off_t val, int flags)
 860 {
 861         struct mtx *mtxp;
 862
 863         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 864
 865         mtxp = mtx_pool_find(mtxpool_sleep, fp);
 866         mtx_lock(mtxp);
 867         if ((flags & FOF_NOUPDATE) == 0)
 868                 fp->f_offset = val;
 869         if ((flags & FOF_NEXTOFF_R) != 0)
 870                 fp->f_nextoff[UIO_READ] = val;
 871         if ((flags & FOF_NEXTOFF_W) != 0)
 872                 fp->f_nextoff[UIO_WRITE] = val;
 873         if ((flags & FOF_NOLOCK) == 0) {
 874                 KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
 875                     ("Lost FOFFSET_LOCKED"));
 876                 if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
 877                         wakeup(&fp->f_vnread_flags);
 878                 fp->f_vnread_flags = 0;
 879         }
 880         mtx_unlock(mtxp);
 881 }
 882 #endif
 883
 884 void
 885 foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
 886 {
 887
 888         if ((flags & FOF_OFFSET) == 0)
 889                 uio->uio_offset = foffset_lock(fp, flags);
 890 }
 891
 892 void
 893 foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
 894 {
 895
 896         if ((flags & FOF_OFFSET) == 0)
 897                 foffset_unlock(fp, uio->uio_offset, flags);
 898 }
 899
 900 static int
 901 get_advice(struct file *fp, struct uio *uio)
 902 {
 903         struct mtx *mtxp;
 904         int ret;
 905
 906         ret = POSIX_FADV_NORMAL;
 907         if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG)
 908                 return (ret);
 909
 910         mtxp = mtx_pool_find(mtxpool_sleep, fp);
 911         mtx_lock(mtxp);
 912         if (fp->f_advice != NULL &&
 913             uio->uio_offset >= fp->f_advice->fa_start &&
 914             uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
 915                 ret = fp->f_advice->fa_advice;
 916         mtx_unlock(mtxp);
 917         return (ret);
 918 }
 919
 920 int
 921 vn_read_from_obj(struct vnode *vp, struct uio *uio)
 922 {
 923         vm_object_t obj;
 924         vm_page_t ma[io_hold_cnt + 2];
 925         off_t off, vsz;
 926         ssize_t resid;
 927         int error, i, j;
 928
 929         MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2));
 930         obj = atomic_load_ptr(&vp->v_object);
 931         if (obj == NULL)
 932                 return (EJUSTRETURN);
 933
 934         /*
 935          * Depends on type stability of vm_objects.
 936          */
 937         vm_object_pip_add(obj, 1);
 938         if ((obj->flags & OBJ_DEAD) != 0) {
 939                 /*
 940                  * Note that object might be already reused from the
 941                  * vnode, and the OBJ_DEAD flag cleared.  This is fine,
 942                  * we recheck for DOOMED vnode state after all pages
 943                  * are busied, and retract then.
 944                  *
 945                  * But we check for OBJ_DEAD to ensure that we do not
 946                  * busy pages while vm_object_terminate_pages()
 947                  * processes the queue.
 948                  */
 949                 error = EJUSTRETURN;
 950                 goto out_pip;
 951         }
 952
 953         resid = uio->uio_resid;
 954         off = uio->uio_offset;
 955         for (i = 0; resid > 0; i++) {
 956                 MPASS(i < io_hold_cnt + 2);
 957                 ma[i] = vm_page_grab_unlocked(obj, atop(off),
 958                     VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY |
 959                     VM_ALLOC_NOWAIT);
 960                 if (ma[i] == NULL)
 961                         break;
 962
 963                 /*
 964                  * Skip invalid pages.  Valid mask can be partial only
 965                  * at EOF, and we clip later.
 966                  */
 967                 if (vm_page_none_valid(ma[i])) {
 968                         vm_page_sunbusy(ma[i]);
 969                         break;
 970                 }
 971
 972                 resid -= PAGE_SIZE;
 973                 off += PAGE_SIZE;
 974         }
 975         if (i == 0) {
 976                 error = EJUSTRETURN;
 977                 goto out_pip;
 978         }
 979
 980         /*
 981          * Check VIRF_DOOMED after we busied our pages.  Since
 982          * vgonel() terminates the vnode' vm_object, it cannot
 983          * process past pages busied by us.
 984          */
 985         if (VN_IS_DOOMED(vp)) {
 986                 error = EJUSTRETURN;
 987                 goto out;
 988         }
 989
 990         resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1);
 991         if (resid > uio->uio_resid)
 992                 resid = uio->uio_resid;
 993
 994         /*
 995          * Unlocked read of vnp_size is safe because truncation cannot
 996          * pass busied page.  But we load vnp_size into a local
 997          * variable so that possible concurrent extension does not
 998          * break calculation.
 999          */
1000 #if defined(__powerpc__) && !defined(__powerpc64__)
1001         vsz = obj->un_pager.vnp.vnp_size;
1002 #else
1003         vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size);
1004 #endif
1005         if (uio->uio_offset >= vsz) {
1006                 error = EJUSTRETURN;
1007                 goto out;
1008         }
1009         if (uio->uio_offset + resid > vsz)
1010                 resid = vsz - uio->uio_offset;
1011
1012         error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio);
1013
1014 out:
1015         for (j = 0; j < i; j++) {
1016                 if (error == 0)
1017                         vm_page_reference(ma[j]);
1018                 vm_page_sunbusy(ma[j]);
1019         }
1020 out_pip:
1021         vm_object_pip_wakeup(obj);
1022         if (error != 0)
1023                 return (error);
1024         return (uio->uio_resid == 0 ? 0 : EJUSTRETURN);
1025 }
1026
1027 /*
1028  * File table vnode read routine.
1029  */
1030 static int
1031 vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
1032     struct thread *td)
1033 {
1034         struct vnode *vp;
1035         off_t orig_offset;
1036         int error, ioflag;
1037         int advice;
1038
1039         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
1040             uio->uio_td, td));
1041         KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
1042         vp = fp->f_vnode;
1043         ioflag = 0;
1044         if (fp->f_flag & FNONBLOCK)
1045                 ioflag |= IO_NDELAY;
1046         if (fp->f_flag & O_DIRECT)
1047                 ioflag |= IO_DIRECT;
1048
1049         /*
1050          * Try to read from page cache.  VIRF_DOOMED check is racy but
1051          * allows us to avoid unneeded work outright.
1052          */
1053         if (vn_io_pgcache_read_enable && !mac_vnode_check_read_enabled() &&
1054             (vn_irflag_read(vp) & (VIRF_DOOMED | VIRF_PGREAD)) == VIRF_PGREAD) {
1055                 error = VOP_READ_PGCACHE(vp, uio, ioflag, fp->f_cred);
1056                 if (error == 0) {
1057                         fp->f_nextoff[UIO_READ] = uio->uio_offset;
1058                         return (0);
1059                 }
1060                 if (error != EJUSTRETURN)
1061                         return (error);
1062         }
1063
1064         advice = get_advice(fp, uio);
1065         vn_lock(vp, LK_SHARED | LK_RETRY);
1066
1067         switch (advice) {
1068         case POSIX_FADV_NORMAL:
1069         case POSIX_FADV_SEQUENTIAL:
1070         case POSIX_FADV_NOREUSE:
1071                 ioflag |= sequential_heuristic(uio, fp);
1072                 break;
1073         case POSIX_FADV_RANDOM:
1074                 /* Disable read-ahead for random I/O. */
1075                 break;
1076         }
1077         orig_offset = uio->uio_offset;
1078
1079 #ifdef MAC
1080         error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
1081         if (error == 0)
1082 #endif
1083                 error = VOP_READ(vp, uio, ioflag, fp->f_cred);
1084         fp->f_nextoff[UIO_READ] = uio->uio_offset;
1085         VOP_UNLOCK(vp);
1086         if (error == 0 && advice == POSIX_FADV_NOREUSE &&
1087             orig_offset != uio->uio_offset)
1088                 /*
1089                  * Use POSIX_FADV_DONTNEED to flush pages and buffers
1090                  * for the backing file after a POSIX_FADV_NOREUSE
1091                  * read(2).
1092                  */
1093                 error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
1094                     POSIX_FADV_DONTNEED);
1095         return (error);
1096 }
1097
1098 /*
1099  * File table vnode write routine.
1100  */
1101 static int
1102 vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
1103     struct thread *td)
1104 {
1105         struct vnode *vp;
1106         struct mount *mp;
1107         off_t orig_offset;
1108         int error, ioflag, lock_flags;
1109         int advice;
1110         bool need_finished_write;
1111
1112         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
1113             uio->uio_td, td));
1114         KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
1115         vp = fp->f_vnode;
1116         if (vp->v_type == VREG)
1117                 bwillwrite();
1118         ioflag = IO_UNIT;
1119         if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
1120                 ioflag |= IO_APPEND;
1121         if (fp->f_flag & FNONBLOCK)
1122                 ioflag |= IO_NDELAY;
1123         if (fp->f_flag & O_DIRECT)
1124                 ioflag |= IO_DIRECT;
1125         if (fp->f_flag & O_FSYNC) {
1126                 mp = atomic_load_ptr(&vp->v_mount);
1127                 if (mp != NULL && mp->mnt_flag & MNT_SYNCHRONOUS)
1128                         ioflag |= IO_SYNC;
1129         }
1130         /*
1131          * For O_DSYNC we set both IO_SYNC and IO_DATASYNC, so that VOP_WRITE()
1132          * implementations that don't understand IO_DATASYNC fall back to full
1133          * O_SYNC behavior.
1134          */
1135         if (fp->f_flag & O_DSYNC)
1136                 ioflag |= IO_SYNC | IO_DATASYNC;
1137         mp = NULL;
1138         need_finished_write = false;
1139         if (vp->v_type != VCHR) {
1140                 error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
1141                 if (error != 0)
1142                         goto unlock;
1143                 need_finished_write = true;
1144         }
1145
1146         advice = get_advice(fp, uio);
1147
1148         if (MNT_SHARED_WRITES(mp) ||
1149             (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) {
1150                 lock_flags = LK_SHARED;
1151         } else {
1152                 lock_flags = LK_EXCLUSIVE;
1153         }
1154
1155         vn_lock(vp, lock_flags | LK_RETRY);
1156         switch (advice) {
1157         case POSIX_FADV_NORMAL:
1158         case POSIX_FADV_SEQUENTIAL:
1159         case POSIX_FADV_NOREUSE:
1160                 ioflag |= sequential_heuristic(uio, fp);
1161                 break;
1162         case POSIX_FADV_RANDOM:
1163                 /* XXX: Is this correct? */
1164                 break;
1165         }
1166         orig_offset = uio->uio_offset;
1167
1168 #ifdef MAC
1169         error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
1170         if (error == 0)
1171 #endif
1172                 error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
1173         fp->f_nextoff[UIO_WRITE] = uio->uio_offset;
1174         VOP_UNLOCK(vp);
1175         if (need_finished_write)
1176                 vn_finished_write(mp);
1177         if (error == 0 && advice == POSIX_FADV_NOREUSE &&
1178             orig_offset != uio->uio_offset)
1179                 /*
1180                  * Use POSIX_FADV_DONTNEED to flush pages and buffers
1181                  * for the backing file after a POSIX_FADV_NOREUSE
1182                  * write(2).
1183                  */
1184                 error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
1185                     POSIX_FADV_DONTNEED);
1186 unlock:
1187         return (error);
1188 }
1189
1190 /*
1191  * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
1192  * prevent the following deadlock:
1193  *
1194  * Assume that the thread A reads from the vnode vp1 into userspace
1195  * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
1196  * currently not resident, then system ends up with the call chain
1197  *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
1198  *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
1199  * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
1200  * If, at the same time, thread B reads from vnode vp2 into buffer buf2
1201  * backed by the pages of vnode vp1, and some page in buf2 is not
1202  * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
1203  *
1204  * To prevent the lock order reversal and deadlock, vn_io_fault() does
1205  * not allow page faults to happen during VOP_READ() or VOP_WRITE().
1206  * Instead, it first tries to do the whole range i/o with pagefaults
1207  * disabled. If all pages in the i/o buffer are resident and mapped,
1208  * VOP will succeed (ignoring the genuine filesystem errors).
1209  * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
1210  * i/o in chunks, with all pages in the chunk prefaulted and held
1211  * using vm_fault_quick_hold_pages().
1212  *
1213  * Filesystems using this deadlock avoidance scheme should use the
1214  * array of the held pages from uio, saved in the curthread->td_ma,
1215  * instead of doing uiomove().  A helper function
1216  * vn_io_fault_uiomove() converts uiomove request into
1217  * uiomove_fromphys() over td_ma array.
1218  *
1219  * Since vnode locks do not cover the whole i/o anymore, rangelocks
1220  * make the current i/o request atomic with respect to other i/os and
1221  * truncations.
1222  */
1223
1224 /*
1225  * Decode vn_io_fault_args and perform the corresponding i/o.
1226  */
1227 static int
1228 vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
1229     struct thread *td)
1230 {
1231         int error, save;
1232
1233         error = 0;
1234         save = vm_fault_disable_pagefaults();
1235         switch (args->kind) {
1236         case VN_IO_FAULT_FOP:
1237                 error = (args->args.fop_args.doio)(args->args.fop_args.fp,
1238                     uio, args->cred, args->flags, td);
1239                 break;
1240         case VN_IO_FAULT_VOP:
1241                 if (uio->uio_rw == UIO_READ) {
1242                         error = VOP_READ(args->args.vop_args.vp, uio,
1243                             args->flags, args->cred);
1244                 } else if (uio->uio_rw == UIO_WRITE) {
1245                         error = VOP_WRITE(args->args.vop_args.vp, uio,
1246                             args->flags, args->cred);
1247                 }
1248                 break;
1249         default:
1250                 panic("vn_io_fault_doio: unknown kind of io %d %d",
1251                     args->kind, uio->uio_rw);
1252         }
1253         vm_fault_enable_pagefaults(save);
1254         return (error);
1255 }
1256
1257 static int
1258 vn_io_fault_touch(char *base, const struct uio *uio)
1259 {
1260         int r;
1261
1262         r = fubyte(base);
1263         if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1))
1264                 return (EFAULT);
1265         return (0);
1266 }
1267
1268 static int
1269 vn_io_fault_prefault_user(const struct uio *uio)
1270 {
1271         char *base;
1272         const struct iovec *iov;
1273         size_t len;
1274         ssize_t resid;
1275         int error, i;
1276
1277         KASSERT(uio->uio_segflg == UIO_USERSPACE,
1278             ("vn_io_fault_prefault userspace"));
1279
1280         error = i = 0;
1281         iov = uio->uio_iov;
1282         resid = uio->uio_resid;
1283         base = iov->iov_base;
1284         len = iov->iov_len;
1285         while (resid > 0) {
1286                 error = vn_io_fault_touch(base, uio);
1287                 if (error != 0)
1288                         break;
1289                 if (len < PAGE_SIZE) {
1290                         if (len != 0) {
1291                                 error = vn_io_fault_touch(base + len - 1, uio);
1292                                 if (error != 0)
1293                                         break;
1294                                 resid -= len;
1295                         }
1296                         if (++i >= uio->uio_iovcnt)
1297                                 break;
1298                         iov = uio->uio_iov + i;
1299                         base = iov->iov_base;
1300                         len = iov->iov_len;
1301                 } else {
1302                         len -= PAGE_SIZE;
1303                         base += PAGE_SIZE;
1304                         resid -= PAGE_SIZE;
1305                 }
1306         }
1307         return (error);
1308 }
1309
1310 /*
1311  * Common code for vn_io_fault(), agnostic to the kind of i/o request.
1312  * Uses vn_io_fault_doio() to make the call to an actual i/o function.
1313  * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
1314  * into args and call vn_io_fault1() to handle faults during the user
1315  * mode buffer accesses.
1316  */
1317 static int
1318 vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
1319     struct thread *td)
1320 {
1321         vm_page_t ma[io_hold_cnt + 2];
1322         struct uio *uio_clone, short_uio;
1323         struct iovec short_iovec[1];
1324         vm_page_t *prev_td_ma;
1325         vm_prot_t prot;
1326         vm_offset_t addr, end;
1327         size_t len, resid;
1328         ssize_t adv;
1329         int error, cnt, saveheld, prev_td_ma_cnt;
1330
1331         if (vn_io_fault_prefault) {
1332                 error = vn_io_fault_prefault_user(uio);
1333                 if (error != 0)
1334                         return (error); /* Or ignore ? */
1335         }
1336
1337         prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
1338
1339         /*
1340          * The UFS follows IO_UNIT directive and replays back both
1341          * uio_offset and uio_resid if an error is encountered during the
1342          * operation.  But, since the iovec may be already advanced,
1343          * uio is still in an inconsistent state.
1344          *
1345          * Cache a copy of the original uio, which is advanced to the redo
1346          * point using UIO_NOCOPY below.
1347          */
1348         uio_clone = cloneuio(uio);
1349         resid = uio->uio_resid;
1350
1351         short_uio.uio_segflg = UIO_USERSPACE;
1352         short_uio.uio_rw = uio->uio_rw;
1353         short_uio.uio_td = uio->uio_td;
1354
1355         error = vn_io_fault_doio(args, uio, td);
1356         if (error != EFAULT)
1357                 goto out;
1358
1359         atomic_add_long(&vn_io_faults_cnt, 1);
1360         uio_clone->uio_segflg = UIO_NOCOPY;
1361         uiomove(NULL, resid - uio->uio_resid, uio_clone);
1362         uio_clone->uio_segflg = uio->uio_segflg;
1363
1364         saveheld = curthread_pflags_set(TDP_UIOHELD);
1365         prev_td_ma = td->td_ma;
1366         prev_td_ma_cnt = td->td_ma_cnt;
1367
1368         while (uio_clone->uio_resid != 0) {
1369                 len = uio_clone->uio_iov->iov_len;
1370                 if (len == 0) {
1371                         KASSERT(uio_clone->uio_iovcnt >= 1,
1372                             ("iovcnt underflow"));
1373                         uio_clone->uio_iov++;
1374                         uio_clone->uio_iovcnt--;
1375                         continue;
1376                 }
1377                 if (len > ptoa(io_hold_cnt))
1378                         len = ptoa(io_hold_cnt);
1379                 addr = (uintptr_t)uio_clone->uio_iov->iov_base;
1380                 end = round_page(addr + len);
1381                 if (end < addr) {
1382                         error = EFAULT;
1383                         break;
1384                 }
1385                 cnt = atop(end - trunc_page(addr));
1386                 /*
1387                  * A perfectly misaligned address and length could cause
1388                  * both the start and the end of the chunk to use partial
1389                  * page.  +2 accounts for such a situation.
1390                  */
1391                 cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
1392                     addr, len, prot, ma, io_hold_cnt + 2);
1393                 if (cnt == -1) {
1394                         error = EFAULT;
1395                         break;
1396                 }
1397                 short_uio.uio_iov = &short_iovec[0];
1398                 short_iovec[0].iov_base = (void *)addr;
1399                 short_uio.uio_iovcnt = 1;
1400                 short_uio.uio_resid = short_iovec[0].iov_len = len;
1401                 short_uio.uio_offset = uio_clone->uio_offset;
1402                 td->td_ma = ma;
1403                 td->td_ma_cnt = cnt;
1404
1405                 error = vn_io_fault_doio(args, &short_uio, td);
1406                 vm_page_unhold_pages(ma, cnt);
1407                 adv = len - short_uio.uio_resid;
1408
1409                 uio_clone->uio_iov->iov_base =
1410                     (char *)uio_clone->uio_iov->iov_base + adv;
1411                 uio_clone->uio_iov->iov_len -= adv;
1412                 uio_clone->uio_resid -= adv;
1413                 uio_clone->uio_offset += adv;
1414
1415                 uio->uio_resid -= adv;
1416                 uio->uio_offset += adv;
1417
1418                 if (error != 0 || adv == 0)
1419                         break;
1420         }
1421         td->td_ma = prev_td_ma;
1422         td->td_ma_cnt = prev_td_ma_cnt;
1423         curthread_pflags_restore(saveheld);
1424 out:
1425         free(uio_clone, M_IOV);
1426         return (error);
1427 }
1428
1429 static int
1430 vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
1431     int flags, struct thread *td)
1432 {
1433         fo_rdwr_t *doio;
1434         struct vnode *vp;
1435         void *rl_cookie;
1436         struct vn_io_fault_args args;
1437         int error;
1438
1439         doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
1440         vp = fp->f_vnode;
1441
1442         /*
1443          * The ability to read(2) on a directory has historically been
1444          * allowed for all users, but this can and has been the source of
1445          * at least one security issue in the past.  As such, it is now hidden
1446          * away behind a sysctl for those that actually need it to use it, and
1447          * restricted to root when it's turned on to make it relatively safe to
1448          * leave on for longer sessions of need.
1449          */
1450         if (vp->v_type == VDIR) {
1451                 KASSERT(uio->uio_rw == UIO_READ,
1452                     ("illegal write attempted on a directory"));
1453                 if (!vfs_allow_read_dir)
1454                         return (EISDIR);
1455                 if ((error = priv_check(td, PRIV_VFS_READ_DIR)) != 0)
1456                         return (EISDIR);
1457         }
1458
1459         foffset_lock_uio(fp, uio, flags);
1460         if (do_vn_io_fault(vp, uio)) {
1461                 args.kind = VN_IO_FAULT_FOP;
1462                 args.args.fop_args.fp = fp;
1463                 args.args.fop_args.doio = doio;
1464                 args.cred = active_cred;
1465                 args.flags = flags | FOF_OFFSET;
1466                 if (uio->uio_rw == UIO_READ) {
1467                         rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
1468                             uio->uio_offset + uio->uio_resid);
1469                 } else if ((fp->f_flag & O_APPEND) != 0 ||
1470                     (flags & FOF_OFFSET) == 0) {
1471                         /* For appenders, punt and lock the whole range. */
1472                         rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1473                 } else {
1474                         rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
1475                             uio->uio_offset + uio->uio_resid);
1476                 }
1477                 error = vn_io_fault1(vp, uio, &args, td);
1478                 vn_rangelock_unlock(vp, rl_cookie);
1479         } else {
1480                 error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
1481         }
1482         foffset_unlock_uio(fp, uio, flags);
1483         return (error);
1484 }
1485
1486 /*
1487  * Helper function to perform the requested uiomove operation using
1488  * the held pages for io->uio_iov[0].iov_base buffer instead of
1489  * copyin/copyout.  Access to the pages with uiomove_fromphys()
1490  * instead of iov_base prevents page faults that could occur due to
1491  * pmap_collect() invalidating the mapping created by
1492  * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
1493  * object cleanup revoking the write access from page mappings.
1494  *
1495  * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
1496  * instead of plain uiomove().
1497  */
1498 int
1499 vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
1500 {
1501         struct uio transp_uio;
1502         struct iovec transp_iov[1];
1503         struct thread *td;
1504         size_t adv;
1505         int error, pgadv;
1506
1507         td = curthread;
1508         if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1509             uio->uio_segflg != UIO_USERSPACE)
1510                 return (uiomove(data, xfersize, uio));
1511
1512         KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1513         transp_iov[0].iov_base = data;
1514         transp_uio.uio_iov = &transp_iov[0];
1515         transp_uio.uio_iovcnt = 1;
1516         if (xfersize > uio->uio_resid)
1517                 xfersize = uio->uio_resid;
1518         transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
1519         transp_uio.uio_offset = 0;
1520         transp_uio.uio_segflg = UIO_SYSSPACE;
1521         /*
1522          * Since transp_iov points to data, and td_ma page array
1523          * corresponds to original uio->uio_iov, we need to invert the
1524          * direction of the i/o operation as passed to
1525          * uiomove_fromphys().
1526          */
1527         switch (uio->uio_rw) {
1528         case UIO_WRITE:
1529                 transp_uio.uio_rw = UIO_READ;
1530                 break;
1531         case UIO_READ:
1532                 transp_uio.uio_rw = UIO_WRITE;
1533                 break;
1534         }
1535         transp_uio.uio_td = uio->uio_td;
1536         error = uiomove_fromphys(td->td_ma,
1537             ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
1538             xfersize, &transp_uio);
1539         adv = xfersize - transp_uio.uio_resid;
1540         pgadv =
1541             (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
1542             (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
1543         td->td_ma += pgadv;
1544         KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1545             pgadv));
1546         td->td_ma_cnt -= pgadv;
1547         uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
1548         uio->uio_iov->iov_len -= adv;
1549         uio->uio_resid -= adv;
1550         uio->uio_offset += adv;
1551         return (error);
1552 }
1553
1554 int
1555 vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
1556     struct uio *uio)
1557 {
1558         struct thread *td;
1559         vm_offset_t iov_base;
1560         int cnt, pgadv;
1561
1562         td = curthread;
1563         if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1564             uio->uio_segflg != UIO_USERSPACE)
1565                 return (uiomove_fromphys(ma, offset, xfersize, uio));
1566
1567         KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1568         cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
1569         iov_base = (vm_offset_t)uio->uio_iov->iov_base;
1570         switch (uio->uio_rw) {
1571         case UIO_WRITE:
1572                 pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
1573                     offset, cnt);
1574                 break;
1575         case UIO_READ:
1576                 pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
1577                     cnt);
1578                 break;
1579         }
1580         pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
1581         td->td_ma += pgadv;
1582         KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1583             pgadv));
1584         td->td_ma_cnt -= pgadv;
1585         uio->uio_iov->iov_base = (char *)(iov_base + cnt);
1586         uio->uio_iov->iov_len -= cnt;
1587         uio->uio_resid -= cnt;
1588         uio->uio_offset += cnt;
1589         return (0);
1590 }
1591
1592 /*
1593  * File table truncate routine.
1594  */
1595 static int
1596 vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
1597     struct thread *td)
1598 {
1599         struct mount *mp;
1600         struct vnode *vp;
1601         void *rl_cookie;
1602         int error;
1603
1604         vp = fp->f_vnode;
1605
1606 retry:
1607         /*
1608          * Lock the whole range for truncation.  Otherwise split i/o
1609          * might happen partly before and partly after the truncation.
1610          */
1611         rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1612         error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
1613         if (error)
1614                 goto out1;
1615         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1616         AUDIT_ARG_VNODE1(vp);
1617         if (vp->v_type == VDIR) {
1618                 error = EISDIR;
1619                 goto out;
1620         }
1621 #ifdef MAC
1622         error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
1623         if (error)
1624                 goto out;
1625 #endif
1626         error = vn_truncate_locked(vp, length, (fp->f_flag & O_FSYNC) != 0,
1627             fp->f_cred);
1628 out:
1629         VOP_UNLOCK(vp);
1630         vn_finished_write(mp);
1631 out1:
1632         vn_rangelock_unlock(vp, rl_cookie);
1633         if (error == ERELOOKUP)
1634                 goto retry;
1635         return (error);
1636 }
1637
1638 /*
1639  * Truncate a file that is already locked.
1640  */
1641 int
1642 vn_truncate_locked(struct vnode *vp, off_t length, bool sync,
1643     struct ucred *cred)
1644 {
1645         struct vattr vattr;
1646         int error;
1647
1648         error = VOP_ADD_WRITECOUNT(vp, 1);
1649         if (error == 0) {
1650                 VATTR_NULL(&vattr);
1651                 vattr.va_size = length;
1652                 if (sync)
1653                         vattr.va_vaflags |= VA_SYNC;
1654                 error = VOP_SETATTR(vp, &vattr, cred);
1655                 VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
1656         }
1657         return (error);
1658 }
1659
1660 /*
1661  * File table vnode stat routine.
1662  */
1663 int
1664 vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred,
1665     struct thread *td)
1666 {
1667         struct vnode *vp = fp->f_vnode;
1668         int error;
1669
1670         vn_lock(vp, LK_SHARED | LK_RETRY);
1671         error = VOP_STAT(vp, sb, active_cred, fp->f_cred, td);
1672         VOP_UNLOCK(vp);
1673
1674         return (error);
1675 }
1676
1677 /*
1678  * File table vnode ioctl routine.
1679  */
1680 static int
1681 vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
1682     struct thread *td)
1683 {
1684         struct vattr vattr;
1685         struct vnode *vp;
1686         struct fiobmap2_arg *bmarg;
1687         int error;
1688
1689         vp = fp->f_vnode;
1690         switch (vp->v_type) {
1691         case VDIR:
1692         case VREG:
1693                 switch (com) {
1694                 case FIONREAD:
1695                         vn_lock(vp, LK_SHARED | LK_RETRY);
1696                         error = VOP_GETATTR(vp, &vattr, active_cred);
1697                         VOP_UNLOCK(vp);
1698                         if (error == 0)
1699                                 *(int *)data = vattr.va_size - fp->f_offset;
1700                         return (error);
1701                 case FIOBMAP2:
1702                         bmarg = (struct fiobmap2_arg *)data;
1703                         vn_lock(vp, LK_SHARED | LK_RETRY);
1704 #ifdef MAC
1705                         error = mac_vnode_check_read(active_cred, fp->f_cred,
1706                             vp);
1707                         if (error == 0)
1708 #endif
1709                                 error = VOP_BMAP(vp, bmarg->bn, NULL,
1710                                     &bmarg->bn, &bmarg->runp, &bmarg->runb);
1711                         VOP_UNLOCK(vp);
1712                         return (error);
1713                 case FIONBIO:
1714                 case FIOASYNC:
1715                         return (0);
1716                 default:
1717                         return (VOP_IOCTL(vp, com, data, fp->f_flag,
1718                             active_cred, td));
1719                 }
1720                 break;
1721         case VCHR:
1722                 return (VOP_IOCTL(vp, com, data, fp->f_flag,
1723                     active_cred, td));
1724         default:
1725                 return (ENOTTY);
1726         }
1727 }
1728
1729 /*
1730  * File table vnode poll routine.
1731  */
1732 static int
1733 vn_poll(struct file *fp, int events, struct ucred *active_cred,
1734     struct thread *td)
1735 {
1736         struct vnode *vp;
1737         int error;
1738
1739         vp = fp->f_vnode;
1740 #if defined(MAC) || defined(AUDIT)
1741         if (AUDITING_TD(td) || mac_vnode_check_poll_enabled()) {
1742                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1743                 AUDIT_ARG_VNODE1(vp);
1744                 error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
1745                 VOP_UNLOCK(vp);
1746                 if (error != 0)
1747                         return (error);
1748         }
1749 #endif
1750         error = VOP_POLL(vp, events, fp->f_cred, td);
1751         return (error);
1752 }
1753
1754 /*
1755  * Acquire the requested lock and then check for validity.  LK_RETRY
1756  * permits vn_lock to return doomed vnodes.
1757  */
1758 static int __noinline
1759 _vn_lock_fallback(struct vnode *vp, int flags, const char *file, int line,
1760     int error)
1761 {
1762
1763         KASSERT((flags & LK_RETRY) == 0 || error == 0,
1764             ("vn_lock: error %d incompatible with flags %#x", error, flags));
1765
1766         if (error == 0)
1767                 VNASSERT(VN_IS_DOOMED(vp), vp, ("vnode not doomed"));
1768
1769         if ((flags & LK_RETRY) == 0) {
1770                 if (error == 0) {
1771                         VOP_UNLOCK(vp);
1772                         error = ENOENT;
1773                 }
1774                 return (error);
1775         }
1776
1777         /*
1778          * LK_RETRY case.
1779          *
1780          * Nothing to do if we got the lock.
1781          */
1782         if (error == 0)
1783                 return (0);
1784
1785         /*
1786          * Interlock was dropped by the call in _vn_lock.
1787          */
1788         flags &= ~LK_INTERLOCK;
1789         do {
1790                 error = VOP_LOCK1(vp, flags, file, line);
1791         } while (error != 0);
1792         return (0);
1793 }
1794
1795 int
1796 _vn_lock(struct vnode *vp, int flags, const char *file, int line)
1797 {
1798         int error;
1799
1800         VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
1801             ("vn_lock: no locktype (%d passed)", flags));
1802         VNPASS(vp->v_holdcnt > 0, vp);
1803         error = VOP_LOCK1(vp, flags, file, line);
1804         if (__predict_false(error != 0 || VN_IS_DOOMED(vp)))
1805                 return (_vn_lock_fallback(vp, flags, file, line, error));
1806         return (0);
1807 }
1808
1809 /*
1810  * File table vnode close routine.
1811  */
1812 static int
1813 vn_closefile(struct file *fp, struct thread *td)
1814 {
1815         struct vnode *vp;
1816         struct flock lf;
1817         int error;
1818         bool ref;
1819
1820         vp = fp->f_vnode;
1821         fp->f_ops = &badfileops;
1822         ref = (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE;
1823
1824         error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref);
1825
1826         if (__predict_false(ref)) {
1827                 lf.l_whence = SEEK_SET;
1828                 lf.l_start = 0;
1829                 lf.l_len = 0;
1830                 lf.l_type = F_UNLCK;
1831                 (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
1832                 vrele(vp);
1833         }
1834         return (error);
1835 }
1836
1837 /*
1838  * Preparing to start a filesystem write operation. If the operation is
1839  * permitted, then we bump the count of operations in progress and
1840  * proceed. If a suspend request is in progress, we wait until the
1841  * suspension is over, and then proceed.
1842  */
1843 static int
1844 vn_start_write_refed(struct mount *mp, int flags, bool mplocked)
1845 {
1846         struct mount_pcpu *mpcpu;
1847         int error, mflags;
1848
1849         if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 &&
1850             vfs_op_thread_enter(mp, mpcpu)) {
1851                 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
1852                 vfs_mp_count_add_pcpu(mpcpu, writeopcount, 1);
1853                 vfs_op_thread_exit(mp, mpcpu);
1854                 return (0);
1855         }
1856
1857         if (mplocked)
1858                 mtx_assert(MNT_MTX(mp), MA_OWNED);
1859         else
1860                 MNT_ILOCK(mp);
1861
1862         error = 0;
1863
1864         /*
1865          * Check on status of suspension.
1866          */
1867         if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
1868             mp->mnt_susp_owner != curthread) {
1869                 mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ?
1870                     (flags & PCATCH) : 0) | (PUSER - 1);
1871                 while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1872                         if (flags & V_NOWAIT) {
1873                                 error = EWOULDBLOCK;
1874                                 goto unlock;
1875                         }
1876                         error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
1877                             "suspfs", 0);
1878                         if (error)
1879                                 goto unlock;
1880                 }
1881         }
1882         if (flags & V_XSLEEP)
1883                 goto unlock;
1884         mp->mnt_writeopcount++;
1885 unlock:
1886         if (error != 0 || (flags & V_XSLEEP) != 0)
1887                 MNT_REL(mp);
1888         MNT_IUNLOCK(mp);
1889         return (error);
1890 }
1891
1892 int
1893 vn_start_write(struct vnode *vp, struct mount **mpp, int flags)
1894 {
1895         struct mount *mp;
1896         int error;
1897
1898         KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
1899             ("V_MNTREF requires mp"));
1900
1901         error = 0;
1902         /*
1903          * If a vnode is provided, get and return the mount point that
1904          * to which it will write.
1905          */
1906         if (vp != NULL) {
1907                 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1908                         *mpp = NULL;
1909                         if (error != EOPNOTSUPP)
1910                                 return (error);
1911                         return (0);
1912                 }
1913         }
1914         if ((mp = *mpp) == NULL)
1915                 return (0);
1916
1917         /*
1918          * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1919          * a vfs_ref().
1920          * As long as a vnode is not provided we need to acquire a
1921          * refcount for the provided mountpoint too, in order to
1922          * emulate a vfs_ref().
1923          */
1924         if (vp == NULL && (flags & V_MNTREF) == 0)
1925                 vfs_ref(mp);
1926
1927         return (vn_start_write_refed(mp, flags, false));
1928 }
1929
1930 /*
1931  * Secondary suspension. Used by operations such as vop_inactive
1932  * routines that are needed by the higher level functions. These
1933  * are allowed to proceed until all the higher level functions have
1934  * completed (indicated by mnt_writeopcount dropping to zero). At that
1935  * time, these operations are halted until the suspension is over.
1936  */
1937 int
1938 vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags)
1939 {
1940         struct mount *mp;
1941         int error;
1942
1943         KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
1944             ("V_MNTREF requires mp"));
1945
1946  retry:
1947         if (vp != NULL) {
1948                 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1949                         *mpp = NULL;
1950                         if (error != EOPNOTSUPP)
1951                                 return (error);
1952                         return (0);
1953                 }
1954         }
1955         /*
1956          * If we are not suspended or have not yet reached suspended
1957          * mode, then let the operation proceed.
1958          */
1959         if ((mp = *mpp) == NULL)
1960                 return (0);
1961
1962         /*
1963          * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1964          * a vfs_ref().
1965          * As long as a vnode is not provided we need to acquire a
1966          * refcount for the provided mountpoint too, in order to
1967          * emulate a vfs_ref().
1968          */
1969         MNT_ILOCK(mp);
1970         if (vp == NULL && (flags & V_MNTREF) == 0)
1971                 MNT_REF(mp);
1972         if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
1973                 mp->mnt_secondary_writes++;
1974                 mp->mnt_secondary_accwrites++;
1975                 MNT_IUNLOCK(mp);
1976                 return (0);
1977         }
1978         if (flags & V_NOWAIT) {
1979                 MNT_REL(mp);
1980                 MNT_IUNLOCK(mp);
1981                 return (EWOULDBLOCK);
1982         }
1983         /*
1984          * Wait for the suspension to finish.
1985          */
1986         error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP |
1987             ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0),
1988             "suspfs", 0);
1989         vfs_rel(mp);
1990         if (error == 0)
1991                 goto retry;
1992         return (error);
1993 }
1994
1995 /*
1996  * Filesystem write operation has completed. If we are suspending and this
1997  * operation is the last one, notify the suspender that the suspension is
1998  * now in effect.
1999  */
2000 void
2001 vn_finished_write(struct mount *mp)
2002 {
2003         struct mount_pcpu *mpcpu;
2004         int c;
2005
2006         if (mp == NULL)
2007                 return;
2008
2009         if (vfs_op_thread_enter(mp, mpcpu)) {
2010                 vfs_mp_count_sub_pcpu(mpcpu, writeopcount, 1);
2011                 vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
2012                 vfs_op_thread_exit(mp, mpcpu);
2013                 return;
2014         }
2015
2016         MNT_ILOCK(mp);
2017         vfs_assert_mount_counters(mp);
2018         MNT_REL(mp);
2019         c = --mp->mnt_writeopcount;
2020         if (mp->mnt_vfs_ops == 0) {
2021                 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
2022                 MNT_IUNLOCK(mp);
2023                 return;
2024         }
2025         if (c < 0)
2026                 vfs_dump_mount_counters(mp);
2027         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0)
2028                 wakeup(&mp->mnt_writeopcount);
2029         MNT_IUNLOCK(mp);
2030 }
2031
2032 /*
2033  * Filesystem secondary write operation has completed. If we are
2034  * suspending and this operation is the last one, notify the suspender
2035  * that the suspension is now in effect.
2036  */
2037 void
2038 vn_finished_secondary_write(struct mount *mp)
2039 {
2040         if (mp == NULL)
2041                 return;
2042         MNT_ILOCK(mp);
2043         MNT_REL(mp);
2044         mp->mnt_secondary_writes--;
2045         if (mp->mnt_secondary_writes < 0)
2046                 panic("vn_finished_secondary_write: neg cnt");
2047         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
2048             mp->mnt_secondary_writes <= 0)
2049                 wakeup(&mp->mnt_secondary_writes);
2050         MNT_IUNLOCK(mp);
2051 }
2052
2053 /*
2054  * Request a filesystem to suspend write operations.
2055  */
2056 int
2057 vfs_write_suspend(struct mount *mp, int flags)
2058 {
2059         int error;
2060
2061         vfs_op_enter(mp);
2062
2063         MNT_ILOCK(mp);
2064         vfs_assert_mount_counters(mp);
2065         if (mp->mnt_susp_owner == curthread) {
2066                 vfs_op_exit_locked(mp);
2067                 MNT_IUNLOCK(mp);
2068                 return (EALREADY);
2069         }
2070         while (mp->mnt_kern_flag & MNTK_SUSPEND)
2071                 msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
2072
2073         /*
2074          * Unmount holds a write reference on the mount point.  If we
2075          * own busy reference and drain for writers, we deadlock with
2076          * the reference draining in the unmount path.  Callers of
2077          * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
2078          * vfs_busy() reference is owned and caller is not in the
2079          * unmount context.
2080          */
2081         if ((flags & VS_SKIP_UNMOUNT) != 0 &&
2082             (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
2083                 vfs_op_exit_locked(mp);
2084                 MNT_IUNLOCK(mp);
2085                 return (EBUSY);
2086         }
2087
2088         mp->mnt_kern_flag |= MNTK_SUSPEND;
2089         mp->mnt_susp_owner = curthread;
2090         if (mp->mnt_writeopcount > 0)
2091                 (void) msleep(&mp->mnt_writeopcount,
2092                     MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
2093         else
2094                 MNT_IUNLOCK(mp);
2095         if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) {
2096                 vfs_write_resume(mp, 0);
2097                 /* vfs_write_resume does vfs_op_exit() for us */
2098         }
2099         return (error);
2100 }
2101
2102 /*
2103  * Request a filesystem to resume write operations.
2104  */
2105 void
2106 vfs_write_resume(struct mount *mp, int flags)
2107 {
2108
2109         MNT_ILOCK(mp);
2110         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
2111                 KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
2112                 mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
2113                                        MNTK_SUSPENDED);
2114                 mp->mnt_susp_owner = NULL;
2115                 wakeup(&mp->mnt_writeopcount);
2116                 wakeup(&mp->mnt_flag);
2117                 curthread->td_pflags &= ~TDP_IGNSUSP;
2118                 if ((flags & VR_START_WRITE) != 0) {
2119                         MNT_REF(mp);
2120                         mp->mnt_writeopcount++;
2121                 }
2122                 MNT_IUNLOCK(mp);
2123                 if ((flags & VR_NO_SUSPCLR) == 0)
2124                         VFS_SUSP_CLEAN(mp);
2125                 vfs_op_exit(mp);
2126         } else if ((flags & VR_START_WRITE) != 0) {
2127                 MNT_REF(mp);
2128                 vn_start_write_refed(mp, 0, true);
2129         } else {
2130                 MNT_IUNLOCK(mp);
2131         }
2132 }
2133
2134 /*
2135  * Helper loop around vfs_write_suspend() for filesystem unmount VFS
2136  * methods.
2137  */
2138 int
2139 vfs_write_suspend_umnt(struct mount *mp)
2140 {
2141         int error;
2142
2143         KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
2144             ("vfs_write_suspend_umnt: recursed"));
2145
2146         /* dounmount() already called vn_start_write(). */
2147         for (;;) {
2148                 vn_finished_write(mp);
2149                 error = vfs_write_suspend(mp, 0);
2150                 if (error != 0) {
2151                         vn_start_write(NULL, &mp, V_WAIT);
2152                         return (error);
2153                 }
2154                 MNT_ILOCK(mp);
2155                 if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
2156                         break;
2157                 MNT_IUNLOCK(mp);
2158                 vn_start_write(NULL, &mp, V_WAIT);
2159         }
2160         mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
2161         wakeup(&mp->mnt_flag);
2162         MNT_IUNLOCK(mp);
2163         curthread->td_pflags |= TDP_IGNSUSP;
2164         return (0);
2165 }
2166
2167 /*
2168  * Implement kqueues for files by translating it to vnode operation.
2169  */
2170 static int
2171 vn_kqfilter(struct file *fp, struct knote *kn)
2172 {
2173
2174         return (VOP_KQFILTER(fp->f_vnode, kn));
2175 }
2176
2177 int
2178 vn_kqfilter_opath(struct file *fp, struct knote *kn)
2179 {
2180         if ((fp->f_flag & FKQALLOWED) == 0)
2181                 return (EBADF);
2182         return (vn_kqfilter(fp, kn));
2183 }
2184
2185 /*
2186  * Simplified in-kernel wrapper calls for extended attribute access.
2187  * Both calls pass in a NULL credential, authorizing as "kernel" access.
2188  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
2189  */
2190 int
2191 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
2192     const char *attrname, int *buflen, char *buf, struct thread *td)
2193 {
2194         struct uio      auio;
2195         struct iovec    iov;
2196         int     error;
2197
2198         iov.iov_len = *buflen;
2199         iov.iov_base = buf;
2200
2201         auio.uio_iov = &iov;
2202         auio.uio_iovcnt = 1;
2203         auio.uio_rw = UIO_READ;
2204         auio.uio_segflg = UIO_SYSSPACE;
2205         auio.uio_td = td;
2206         auio.uio_offset = 0;
2207         auio.uio_resid = *buflen;
2208
2209         if ((ioflg & IO_NODELOCKED) == 0)
2210                 vn_lock(vp, LK_SHARED | LK_RETRY);
2211
2212         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2213
2214         /* authorize attribute retrieval as kernel */
2215         error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
2216             td);
2217
2218         if ((ioflg & IO_NODELOCKED) == 0)
2219                 VOP_UNLOCK(vp);
2220
2221         if (error == 0) {
2222                 *buflen = *buflen - auio.uio_resid;
2223         }
2224
2225         return (error);
2226 }
2227
2228 /*
2229  * XXX failure mode if partially written?
2230  */
2231 int
2232 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
2233     const char *attrname, int buflen, char *buf, struct thread *td)
2234 {
2235         struct uio      auio;
2236         struct iovec    iov;
2237         struct mount    *mp;
2238         int     error;
2239
2240         iov.iov_len = buflen;
2241         iov.iov_base = buf;
2242
2243         auio.uio_iov = &iov;
2244         auio.uio_iovcnt = 1;
2245         auio.uio_rw = UIO_WRITE;
2246         auio.uio_segflg = UIO_SYSSPACE;
2247         auio.uio_td = td;
2248         auio.uio_offset = 0;
2249         auio.uio_resid = buflen;
2250
2251         if ((ioflg & IO_NODELOCKED) == 0) {
2252                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
2253                         return (error);
2254                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2255         }
2256
2257         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2258
2259         /* authorize attribute setting as kernel */
2260         error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
2261
2262         if ((ioflg & IO_NODELOCKED) == 0) {
2263                 vn_finished_write(mp);
2264                 VOP_UNLOCK(vp);
2265         }
2266
2267         return (error);
2268 }
2269
2270 int
2271 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
2272     const char *attrname, struct thread *td)
2273 {
2274         struct mount    *mp;
2275         int     error;
2276
2277         if ((ioflg & IO_NODELOCKED) == 0) {
2278                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
2279                         return (error);
2280                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2281         }
2282
2283         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2284
2285         /* authorize attribute removal as kernel */
2286         error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
2287         if (error == EOPNOTSUPP)
2288                 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
2289                     NULL, td);
2290
2291         if ((ioflg & IO_NODELOCKED) == 0) {
2292                 vn_finished_write(mp);
2293                 VOP_UNLOCK(vp);
2294         }
2295
2296         return (error);
2297 }
2298
2299 static int
2300 vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags,
2301     struct vnode **rvp)
2302 {
2303
2304         return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp));
2305 }
2306
2307 int
2308 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
2309 {
2310
2311         return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino,
2312             lkflags, rvp));
2313 }
2314
2315 int
2316 vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg,
2317     int lkflags, struct vnode **rvp)
2318 {
2319         struct mount *mp;
2320         int ltype, error;
2321
2322         ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get");
2323         mp = vp->v_mount;
2324         ltype = VOP_ISLOCKED(vp);
2325         KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
2326             ("vn_vget_ino: vp not locked"));
2327         error = vfs_busy(mp, MBF_NOWAIT);
2328         if (error != 0) {
2329                 vfs_ref(mp);
2330                 VOP_UNLOCK(vp);
2331                 error = vfs_busy(mp, 0);
2332                 vn_lock(vp, ltype | LK_RETRY);
2333                 vfs_rel(mp);
2334                 if (error != 0)
2335                         return (ENOENT);
2336                 if (VN_IS_DOOMED(vp)) {
2337                         vfs_unbusy(mp);
2338                         return (ENOENT);
2339                 }
2340         }
2341         VOP_UNLOCK(vp);
2342         error = alloc(mp, alloc_arg, lkflags, rvp);
2343         vfs_unbusy(mp);
2344         if (error != 0 || *rvp != vp)
2345                 vn_lock(vp, ltype | LK_RETRY);
2346         if (VN_IS_DOOMED(vp)) {
2347                 if (error == 0) {
2348                         if (*rvp == vp)
2349                                 vunref(vp);
2350                         else
2351                                 vput(*rvp);
2352                 }
2353                 error = ENOENT;
2354         }
2355         return (error);
2356 }
2357
2358 int
2359 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
2360     struct thread *td)
2361 {
2362
2363         if (vp->v_type != VREG || td == NULL)
2364                 return (0);
2365         if ((uoff_t)uio->uio_offset + uio->uio_resid >
2366             lim_cur(td, RLIMIT_FSIZE)) {
2367                 PROC_LOCK(td->td_proc);
2368                 kern_psignal(td->td_proc, SIGXFSZ);
2369                 PROC_UNLOCK(td->td_proc);
2370                 return (EFBIG);
2371         }
2372         return (0);
2373 }
2374
2375 int
2376 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
2377     struct thread *td)
2378 {
2379         struct vnode *vp;
2380
2381         vp = fp->f_vnode;
2382 #ifdef AUDIT
2383         vn_lock(vp, LK_SHARED | LK_RETRY);
2384         AUDIT_ARG_VNODE1(vp);
2385         VOP_UNLOCK(vp);
2386 #endif
2387         return (setfmode(td, active_cred, vp, mode));
2388 }
2389
2390 int
2391 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
2392     struct thread *td)
2393 {
2394         struct vnode *vp;
2395
2396         vp = fp->f_vnode;
2397 #ifdef AUDIT
2398         vn_lock(vp, LK_SHARED | LK_RETRY);
2399         AUDIT_ARG_VNODE1(vp);
2400         VOP_UNLOCK(vp);
2401 #endif
2402         return (setfown(td, active_cred, vp, uid, gid));
2403 }
2404
2405 void
2406 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
2407 {
2408         vm_object_t object;
2409
2410         if ((object = vp->v_object) == NULL)
2411                 return;
2412         VM_OBJECT_WLOCK(object);
2413         vm_object_page_remove(object, start, end, 0);
2414         VM_OBJECT_WUNLOCK(object);
2415 }
2416
2417 int
2418 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
2419 {
2420         struct vattr va;
2421         daddr_t bn, bnp;
2422         uint64_t bsize;
2423         off_t noff;
2424         int error;
2425
2426         KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
2427             ("Wrong command %lu", cmd));
2428
2429         if (vn_lock(vp, LK_SHARED) != 0)
2430                 return (EBADF);
2431         if (vp->v_type != VREG) {
2432                 error = ENOTTY;
2433                 goto unlock;
2434         }
2435         error = VOP_GETATTR(vp, &va, cred);
2436         if (error != 0)
2437                 goto unlock;
2438         noff = *off;
2439         if (noff >= va.va_size) {
2440                 error = ENXIO;
2441                 goto unlock;
2442         }
2443         bsize = vp->v_mount->mnt_stat.f_iosize;
2444         for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize -
2445             noff % bsize) {
2446                 error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
2447                 if (error == EOPNOTSUPP) {
2448                         error = ENOTTY;
2449                         goto unlock;
2450                 }
2451                 if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
2452                     (bnp != -1 && cmd == FIOSEEKDATA)) {
2453                         noff = bn * bsize;
2454                         if (noff < *off)
2455                                 noff = *off;
2456                         goto unlock;
2457                 }
2458         }
2459         if (noff > va.va_size)
2460                 noff = va.va_size;
2461         /* noff == va.va_size. There is an implicit hole at the end of file. */
2462         if (cmd == FIOSEEKDATA)
2463                 error = ENXIO;
2464 unlock:
2465         VOP_UNLOCK(vp);
2466         if (error == 0)
2467                 *off = noff;
2468         return (error);
2469 }
2470
2471 int
2472 vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
2473 {
2474         struct ucred *cred;
2475         struct vnode *vp;
2476         struct vattr vattr;
2477         off_t foffset, size;
2478         int error, noneg;
2479
2480         cred = td->td_ucred;
2481         vp = fp->f_vnode;
2482         foffset = foffset_lock(fp, 0);
2483         noneg = (vp->v_type != VCHR);
2484         error = 0;
2485         switch (whence) {
2486         case L_INCR:
2487                 if (noneg &&
2488                     (foffset < 0 ||
2489                     (offset > 0 && foffset > OFF_MAX - offset))) {
2490                         error = EOVERFLOW;
2491                         break;
2492                 }
2493                 offset += foffset;
2494                 break;
2495         case L_XTND:
2496                 vn_lock(vp, LK_SHARED | LK_RETRY);
2497                 error = VOP_GETATTR(vp, &vattr, cred);
2498                 VOP_UNLOCK(vp);
2499                 if (error)
2500                         break;
2501
2502                 /*
2503                  * If the file references a disk device, then fetch
2504                  * the media size and use that to determine the ending
2505                  * offset.
2506                  */
2507                 if (vattr.va_size == 0 && vp->v_type == VCHR &&
2508                     fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
2509                         vattr.va_size = size;
2510                 if (noneg &&
2511                     (vattr.va_size > OFF_MAX ||
2512                     (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
2513                         error = EOVERFLOW;
2514                         break;
2515                 }
2516                 offset += vattr.va_size;
2517                 break;
2518         case L_SET:
2519                 break;
2520         case SEEK_DATA:
2521                 error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
2522                 if (error == ENOTTY)
2523                         error = EINVAL;
2524                 break;
2525         case SEEK_HOLE:
2526                 error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
2527                 if (error == ENOTTY)
2528                         error = EINVAL;
2529                 break;
2530         default:
2531                 error = EINVAL;
2532         }
2533         if (error == 0 && noneg && offset < 0)
2534                 error = EINVAL;
2535         if (error != 0)
2536                 goto drop;
2537         VFS_KNOTE_UNLOCKED(vp, 0);
2538         td->td_uretoff.tdu_off = offset;
2539 drop:
2540         foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
2541         return (error);
2542 }
2543
2544 int
2545 vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred,
2546     struct thread *td)
2547 {
2548         int error;
2549
2550         /*
2551          * Grant permission if the caller is the owner of the file, or
2552          * the super-user, or has ACL_WRITE_ATTRIBUTES permission on
2553          * on the file.  If the time pointer is null, then write
2554          * permission on the file is also sufficient.
2555          *
2556          * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
2557          * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
2558          * will be allowed to set the times [..] to the current
2559          * server time.
2560          */
2561         error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td);
2562         if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0)
2563                 error = VOP_ACCESS(vp, VWRITE, cred, td);
2564         return (error);
2565 }
2566
2567 int
2568 vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
2569 {
2570         struct vnode *vp;
2571         int error;
2572
2573         if (fp->f_type == DTYPE_FIFO)
2574                 kif->kf_type = KF_TYPE_FIFO;
2575         else
2576                 kif->kf_type = KF_TYPE_VNODE;
2577         vp = fp->f_vnode;
2578         vref(vp);
2579         FILEDESC_SUNLOCK(fdp);
2580         error = vn_fill_kinfo_vnode(vp, kif);
2581         vrele(vp);
2582         FILEDESC_SLOCK(fdp);
2583         return (error);
2584 }
2585
2586 static inline void
2587 vn_fill_junk(struct kinfo_file *kif)
2588 {
2589         size_t len, olen;
2590
2591         /*
2592          * Simulate vn_fullpath returning changing values for a given
2593          * vp during e.g. coredump.
2594          */
2595         len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1;
2596         olen = strlen(kif->kf_path);
2597         if (len < olen)
2598                 strcpy(&kif->kf_path[len - 1], "$");
2599         else
2600                 for (; olen < len; olen++)
2601                         strcpy(&kif->kf_path[olen], "A");
2602 }
2603
2604 int
2605 vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif)
2606 {
2607         struct vattr va;
2608         char *fullpath, *freepath;
2609         int error;
2610
2611         kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type);
2612         freepath = NULL;
2613         fullpath = "-";
2614         error = vn_fullpath(vp, &fullpath, &freepath);
2615         if (error == 0) {
2616                 strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
2617         }
2618         if (freepath != NULL)
2619                 free(freepath, M_TEMP);
2620
2621         KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path,
2622                 vn_fill_junk(kif);
2623         );
2624
2625         /*
2626          * Retrieve vnode attributes.
2627          */
2628         va.va_fsid = VNOVAL;
2629         va.va_rdev = NODEV;
2630         vn_lock(vp, LK_SHARED | LK_RETRY);
2631         error = VOP_GETATTR(vp, &va, curthread->td_ucred);
2632         VOP_UNLOCK(vp);
2633         if (error != 0)
2634                 return (error);
2635         if (va.va_fsid != VNOVAL)
2636                 kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
2637         else
2638                 kif->kf_un.kf_file.kf_file_fsid =
2639                     vp->v_mount->mnt_stat.f_fsid.val[0];
2640         kif->kf_un.kf_file.kf_file_fsid_freebsd11 =
2641             kif->kf_un.kf_file.kf_file_fsid; /* truncate */
2642         kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
2643         kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
2644         kif->kf_un.kf_file.kf_file_size = va.va_size;
2645         kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
2646         kif->kf_un.kf_file.kf_file_rdev_freebsd11 =
2647             kif->kf_un.kf_file.kf_file_rdev; /* truncate */
2648         return (0);
2649 }
2650
2651 int
2652 vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
2653     vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
2654     struct thread *td)
2655 {
2656 #ifdef HWPMC_HOOKS
2657         struct pmckern_map_in pkm;
2658 #endif
2659         struct mount *mp;
2660         struct vnode *vp;
2661         vm_object_t object;
2662         vm_prot_t maxprot;
2663         boolean_t writecounted;
2664         int error;
2665
2666 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
2667     defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
2668         /*
2669          * POSIX shared-memory objects are defined to have
2670          * kernel persistence, and are not defined to support
2671          * read(2)/write(2) -- or even open(2).  Thus, we can
2672          * use MAP_ASYNC to trade on-disk coherence for speed.
2673          * The shm_open(3) library routine turns on the FPOSIXSHM
2674          * flag to request this behavior.
2675          */
2676         if ((fp->f_flag & FPOSIXSHM) != 0)
2677                 flags |= MAP_NOSYNC;
2678 #endif
2679         vp = fp->f_vnode;
2680
2681         /*
2682          * Ensure that file and memory protections are
2683          * compatible.  Note that we only worry about
2684          * writability if mapping is shared; in this case,
2685          * current and max prot are dictated by the open file.
2686          * XXX use the vnode instead?  Problem is: what
2687          * credentials do we use for determination? What if
2688          * proc does a setuid?
2689          */
2690         mp = vp->v_mount;
2691         if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
2692                 maxprot = VM_PROT_NONE;
2693                 if ((prot & VM_PROT_EXECUTE) != 0)
2694                         return (EACCES);
2695         } else
2696                 maxprot = VM_PROT_EXECUTE;
2697         if ((fp->f_flag & FREAD) != 0)
2698                 maxprot |= VM_PROT_READ;
2699         else if ((prot & VM_PROT_READ) != 0)
2700                 return (EACCES);
2701
2702         /*
2703          * If we are sharing potential changes via MAP_SHARED and we
2704          * are trying to get write permission although we opened it
2705          * without asking for it, bail out.
2706          */
2707         if ((flags & MAP_SHARED) != 0) {
2708                 if ((fp->f_flag & FWRITE) != 0)
2709                         maxprot |= VM_PROT_WRITE;
2710                 else if ((prot & VM_PROT_WRITE) != 0)
2711                         return (EACCES);
2712         } else {
2713                 maxprot |= VM_PROT_WRITE;
2714                 cap_maxprot |= VM_PROT_WRITE;
2715         }
2716         maxprot &= cap_maxprot;
2717
2718         /*
2719          * For regular files and shared memory, POSIX requires that
2720          * the value of foff be a legitimate offset within the data
2721          * object.  In particular, negative offsets are invalid.
2722          * Blocking negative offsets and overflows here avoids
2723          * possible wraparound or user-level access into reserved
2724          * ranges of the data object later.  In contrast, POSIX does
2725          * not dictate how offsets are used by device drivers, so in
2726          * the case of a device mapping a negative offset is passed
2727          * on.
2728          */
2729         if (
2730 #ifdef _LP64
2731             size > OFF_MAX ||
2732 #endif
2733             foff > OFF_MAX - size)
2734                 return (EINVAL);
2735
2736         writecounted = FALSE;
2737         error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp,
2738             &foff, &object, &writecounted);
2739         if (error != 0)
2740                 return (error);
2741         error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
2742             foff, writecounted, td);
2743         if (error != 0) {
2744                 /*
2745                  * If this mapping was accounted for in the vnode's
2746                  * writecount, then undo that now.
2747                  */
2748                 if (writecounted)
2749                         vm_pager_release_writecount(object, 0, size);
2750                 vm_object_deallocate(object);
2751         }
2752 #ifdef HWPMC_HOOKS
2753         /* Inform hwpmc(4) if an executable is being mapped. */
2754         if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) {
2755                 if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) {
2756                         pkm.pm_file = vp;
2757                         pkm.pm_address = (uintptr_t) *addr;
2758                         PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm);
2759                 }
2760         }
2761 #endif
2762         return (error);
2763 }
2764
2765 void
2766 vn_fsid(struct vnode *vp, struct vattr *va)
2767 {
2768         fsid_t *f;
2769
2770         f = &vp->v_mount->mnt_stat.f_fsid;
2771         va->va_fsid = (uint32_t)f->val[1];
2772         va->va_fsid <<= sizeof(f->val[1]) * NBBY;
2773         va->va_fsid += (uint32_t)f->val[0];
2774 }
2775
2776 int
2777 vn_fsync_buf(struct vnode *vp, int waitfor)
2778 {
2779         struct buf *bp, *nbp;
2780         struct bufobj *bo;
2781         struct mount *mp;
2782         int error, maxretry;
2783
2784         error = 0;
2785         maxretry = 10000;     /* large, arbitrarily chosen */
2786         mp = NULL;
2787         if (vp->v_type == VCHR) {
2788                 VI_LOCK(vp);
2789                 mp = vp->v_rdev->si_mountpt;
2790                 VI_UNLOCK(vp);
2791         }
2792         bo = &vp->v_bufobj;
2793         BO_LOCK(bo);
2794 loop1:
2795         /*
2796          * MARK/SCAN initialization to avoid infinite loops.
2797          */
2798         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
2799                 bp->b_vflags &= ~BV_SCANNED;
2800                 bp->b_error = 0;
2801         }
2802
2803         /*
2804          * Flush all dirty buffers associated with a vnode.
2805          */
2806 loop2:
2807         TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
2808                 if ((bp->b_vflags & BV_SCANNED) != 0)
2809                         continue;
2810                 bp->b_vflags |= BV_SCANNED;
2811                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
2812                         if (waitfor != MNT_WAIT)
2813                                 continue;
2814                         if (BUF_LOCK(bp,
2815                             LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL,
2816                             BO_LOCKPTR(bo)) != 0) {
2817                                 BO_LOCK(bo);
2818                                 goto loop1;
2819                         }
2820                         BO_LOCK(bo);
2821                 }
2822                 BO_UNLOCK(bo);
2823                 KASSERT(bp->b_bufobj == bo,
2824                     ("bp %p wrong b_bufobj %p should be %p",
2825                     bp, bp->b_bufobj, bo));
2826                 if ((bp->b_flags & B_DELWRI) == 0)
2827                         panic("fsync: not dirty");
2828                 if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) {
2829                         vfs_bio_awrite(bp);
2830                 } else {
2831                         bremfree(bp);
2832                         bawrite(bp);
2833                 }
2834                 if (maxretry < 1000)
2835                         pause("dirty", hz < 1000 ? 1 : hz / 1000);
2836                 BO_LOCK(bo);
2837                 goto loop2;
2838         }
2839
2840         /*
2841          * If synchronous the caller expects us to completely resolve all
2842          * dirty buffers in the system.  Wait for in-progress I/O to
2843          * complete (which could include background bitmap writes), then
2844          * retry if dirty blocks still exist.
2845          */
2846         if (waitfor == MNT_WAIT) {
2847                 bufobj_wwait(bo, 0, 0);
2848                 if (bo->bo_dirty.bv_cnt > 0) {
2849                         /*
2850                          * If we are unable to write any of these buffers
2851                          * then we fail now rather than trying endlessly
2852                          * to write them out.
2853                          */
2854                         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
2855                                 if ((error = bp->b_error) != 0)
2856                                         break;
2857                         if ((mp != NULL && mp->mnt_secondary_writes > 0) ||
2858                             (error == 0 && --maxretry >= 0))
2859                                 goto loop1;
2860                         if (error == 0)
2861                                 error = EAGAIN;
2862                 }
2863         }
2864         BO_UNLOCK(bo);
2865         if (error != 0)
2866                 vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error);
2867
2868         return (error);
2869 }
2870
2871 /*
2872  * Copies a byte range from invp to outvp.  Calls VOP_COPY_FILE_RANGE()
2873  * or vn_generic_copy_file_range() after rangelocking the byte ranges,
2874  * to do the actual copy.
2875  * vn_generic_copy_file_range() is factored out, so it can be called
2876  * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from
2877  * different file systems.
2878  */
2879 int
2880 vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp,
2881     off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred,
2882     struct ucred *outcred, struct thread *fsize_td)
2883 {
2884         int error;
2885         size_t len;
2886         uint64_t uval;
2887
2888         len = *lenp;
2889         *lenp = 0;              /* For error returns. */
2890         error = 0;
2891
2892         /* Do some sanity checks on the arguments. */
2893         if (invp->v_type == VDIR || outvp->v_type == VDIR)
2894                 error = EISDIR;
2895         else if (*inoffp < 0 || *outoffp < 0 ||
2896             invp->v_type != VREG || outvp->v_type != VREG)
2897                 error = EINVAL;
2898         if (error != 0)
2899                 goto out;
2900
2901         /* Ensure offset + len does not wrap around. */
2902         uval = *inoffp;
2903         uval += len;
2904         if (uval > INT64_MAX)
2905                 len = INT64_MAX - *inoffp;
2906         uval = *outoffp;
2907         uval += len;
2908         if (uval > INT64_MAX)
2909                 len = INT64_MAX - *outoffp;
2910         if (len == 0)
2911                 goto out;
2912
2913         /*
2914          * If the two vnode are for the same file system, call
2915          * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range()
2916          * which can handle copies across multiple file systems.
2917          */
2918         *lenp = len;
2919         if (invp->v_mount == outvp->v_mount)
2920                 error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp,
2921                     lenp, flags, incred, outcred, fsize_td);
2922         else
2923                 error = vn_generic_copy_file_range(invp, inoffp, outvp,
2924                     outoffp, lenp, flags, incred, outcred, fsize_td);
2925 out:
2926         return (error);
2927 }
2928
/*
 * Check whether the len bytes starting at dat are all zero.
 * The buffer is compared one unsigned int at a time, followed by a
 * byte-wise check of whatever is left over at the end.
 * Returns true if every byte is zero (or len is not positive), false
 * otherwise.
 * Expects dat to be well aligned.
 */
static bool
mem_iszero(void *dat, int len)
{
        const unsigned int *words;
        const char *tail;
        int i, ntail, nwords;

        if (len <= 0)
                return (true);

        nwords = len / (int)sizeof(*words);
        ntail = len % (int)sizeof(*words);

        /* Whole words first. */
        words = dat;
        for (i = 0; i < nwords; i++) {
                if (words[i] != 0)
                        return (false);
        }

        /* Then the trailing bytes that do not fill a word. */
        tail = (const char *)(words + nwords);
        for (i = 0; i < ntail; i++) {
                if (tail[i] != '\0')
                        return (false);
        }
        return (true);
}
2954
2955 /*
2956  * Look for a hole in the output file and, if found, adjust *outoffp
2957  * and *xferp to skip past the hole.
2958  * *xferp is the entire hole length to be written and xfer2 is how many bytes
2959  * to be written as 0's upon return.
2960  */
2961 static off_t
2962 vn_skip_hole(struct vnode *outvp, off_t xfer2, off_t *outoffp, off_t *xferp,
2963     off_t *dataoffp, off_t *holeoffp, struct ucred *cred)
2964 {
2965         int error;
2966         off_t delta;
2967
2968         if (*holeoffp == 0 || *holeoffp <= *outoffp) {
2969                 *dataoffp = *outoffp;
2970                 error = VOP_IOCTL(outvp, FIOSEEKDATA, dataoffp, 0, cred,
2971                     curthread);
2972                 if (error == 0) {
2973                         *holeoffp = *dataoffp;
2974                         error = VOP_IOCTL(outvp, FIOSEEKHOLE, holeoffp, 0, cred,
2975                             curthread);
2976                 }
2977                 if (error != 0 || *holeoffp == *dataoffp) {
2978                         /*
2979                          * Since outvp is unlocked, it may be possible for
2980                          * another thread to do a truncate(), lseek(), write()
2981                          * creating a hole at startoff between the above
2982                          * VOP_IOCTL() calls, if the other thread does not do
2983                          * rangelocking.
2984                          * If that happens, *holeoffp == *dataoffp and finding
2985                          * the hole has failed, so disable vn_skip_hole().
2986                          */
2987                         *holeoffp = -1; /* Disable use of vn_skip_hole(). */
2988                         return (xfer2);
2989                 }
2990                 KASSERT(*dataoffp >= *outoffp,
2991                     ("vn_skip_hole: dataoff=%jd < outoff=%jd",
2992                     (intmax_t)*dataoffp, (intmax_t)*outoffp));
2993                 KASSERT(*holeoffp > *dataoffp,
2994                     ("vn_skip_hole: holeoff=%jd <= dataoff=%jd",
2995                     (intmax_t)*holeoffp, (intmax_t)*dataoffp));
2996         }
2997
2998         /*
2999          * If there is a hole before the data starts, advance *outoffp and
3000          * *xferp past the hole.
3001          */
3002         if (*dataoffp > *outoffp) {
3003                 delta = *dataoffp - *outoffp;
3004                 if (delta >= *xferp) {
3005                         /* Entire *xferp is a hole. */
3006                         *outoffp += *xferp;
3007                         *xferp = 0;
3008                         return (0);
3009                 }
3010                 *xferp -= delta;
3011                 *outoffp += delta;
3012                 xfer2 = MIN(xfer2, *xferp);
3013         }
3014
3015         /*
3016          * If a hole starts before the end of this xfer2, reduce this xfer2 so
3017          * that the write ends at the start of the hole.
3018          * *holeoffp should always be greater than *outoffp, but for the
3019          * non-INVARIANTS case, check this to make sure xfer2 remains a sane
3020          * value.
3021          */
3022         if (*holeoffp > *outoffp && *holeoffp < *outoffp + xfer2)
3023                 xfer2 = *holeoffp - *outoffp;
3024         return (xfer2);
3025 }
3026
3027 /*
3028  * Write an xfer sized chunk to outvp in blksize blocks from dat.
3029  * dat is a maximum of blksize in length and can be written repeatedly in
3030  * the chunk.
3031  * If growfile == true, just grow the file via vn_truncate_locked() instead
3032  * of doing actual writes.
3033  * If checkhole == true, a hole is being punched, so skip over any hole
3034  * already in the output file.
3035  */
3036 static int
3037 vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer,
3038     u_long blksize, bool growfile, bool checkhole, struct ucred *cred)
3039 {
3040         struct mount *mp;
3041         off_t dataoff, holeoff, xfer2;
3042         int error, lckf;
3043
3044         /*
3045          * Loop around doing writes of blksize until write has been completed.
3046          * Lock/unlock on each loop iteration so that a bwillwrite() can be
3047          * done for each iteration, since the xfer argument can be very
3048          * large if there is a large hole to punch in the output file.
3049          */
3050         error = 0;
3051         holeoff = 0;
3052         do {
3053                 xfer2 = MIN(xfer, blksize);
3054                 if (checkhole) {
3055                         /*
3056                          * Punching a hole.  Skip writing if there is
3057                          * already a hole in the output file.
3058                          */
3059                         xfer2 = vn_skip_hole(outvp, xfer2, &outoff, &xfer,
3060                             &dataoff, &holeoff, cred);
3061                         if (xfer == 0)
3062                                 break;
3063                         if (holeoff < 0)
3064                                 checkhole = false;
3065                         KASSERT(xfer2 > 0, ("vn_write_outvp: xfer2=%jd",
3066                             (intmax_t)xfer2));
3067                 }
3068                 bwillwrite();
3069                 mp = NULL;
3070                 error = vn_start_write(outvp, &mp, V_WAIT);
3071                 if (error != 0)
3072                         break;
3073                 if (growfile) {
3074                         error = vn_lock(outvp, LK_EXCLUSIVE);
3075                         if (error == 0) {
3076                                 error = vn_truncate_locked(outvp, outoff + xfer,
3077                                     false, cred);
3078                                 VOP_UNLOCK(outvp);
3079                         }
3080                 } else {
3081                         if (MNT_SHARED_WRITES(mp))
3082                                 lckf = LK_SHARED;
3083                         else
3084                                 lckf = LK_EXCLUSIVE;
3085                         error = vn_lock(outvp, lckf);
3086                         if (error == 0) {
3087                                 error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2,
3088                                     outoff, UIO_SYSSPACE, IO_NODELOCKED,
3089                                     curthread->td_ucred, cred, NULL, curthread);
3090                                 outoff += xfer2;
3091                                 xfer -= xfer2;
3092                                 VOP_UNLOCK(outvp);
3093                         }
3094                 }
3095                 if (mp != NULL)
3096                         vn_finished_write(mp);
3097         } while (!growfile && xfer > 0 && error == 0);
3098         return (error);
3099 }
3100
3101 /*
3102  * Copy a byte range of one file to another.  This function can handle the
3103  * case where invp and outvp are on different file systems.
3104  * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there
3105  * is no better file system specific way to do it.
3106  */
3107 int
3108 vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
3109     struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags,
3110     struct ucred *incred, struct ucred *outcred, struct thread *fsize_td)
3111 {
3112         struct vattr va, inva;
3113         struct mount *mp;
3114         struct uio io;
3115         off_t startoff, endoff, xfer, xfer2;
3116         u_long blksize;
3117         int error, interrupted;
3118         bool cantseek, readzeros, eof, lastblock, holetoeof;
3119         ssize_t aresid;
3120         size_t copylen, len, rem, savlen;
3121         char *dat;
3122         long holein, holeout;
3123
3124         holein = holeout = 0;
3125         savlen = len = *lenp;
3126         error = 0;
3127         interrupted = 0;
3128         dat = NULL;
3129
3130         error = vn_lock(invp, LK_SHARED);
3131         if (error != 0)
3132                 goto out;
3133         if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0)
3134                 holein = 0;
3135         if (holein > 0)
3136                 error = VOP_GETATTR(invp, &inva, incred);
3137         VOP_UNLOCK(invp);
3138         if (error != 0)
3139                 goto out;
3140
3141         mp = NULL;
3142         error = vn_start_write(outvp, &mp, V_WAIT);
3143         if (error == 0)
3144                 error = vn_lock(outvp, LK_EXCLUSIVE);
3145         if (error == 0) {
3146                 /*
3147                  * If fsize_td != NULL, do a vn_rlimit_fsize() call,
3148                  * now that outvp is locked.
3149                  */
3150                 if (fsize_td != NULL) {
3151                         io.uio_offset = *outoffp;
3152                         io.uio_resid = len;
3153                         error = vn_rlimit_fsize(outvp, &io, fsize_td);
3154                         if (error != 0)
3155                                 error = EFBIG;
3156                 }
3157                 if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0)
3158                         holeout = 0;
3159                 /*
3160                  * Holes that are past EOF do not need to be written as a block
3161                  * of zero bytes.  So, truncate the output file as far as
3162                  * possible and then use va.va_size to decide if writing 0
3163                  * bytes is necessary in the loop below.
3164                  */
3165                 if (error == 0)
3166                         error = VOP_GETATTR(outvp, &va, outcred);
3167                 if (error == 0 && va.va_size > *outoffp && va.va_size <=
3168                     *outoffp + len) {
3169 #ifdef MAC
3170                         error = mac_vnode_check_write(curthread->td_ucred,
3171                             outcred, outvp);
3172                         if (error == 0)
3173 #endif
3174                                 error = vn_truncate_locked(outvp, *outoffp,
3175                                     false, outcred);
3176                         if (error == 0)
3177                                 va.va_size = *outoffp;
3178                 }
3179                 VOP_UNLOCK(outvp);
3180         }
3181         if (mp != NULL)
3182                 vn_finished_write(mp);
3183         if (error != 0)
3184                 goto out;
3185
3186         /*
3187          * Set the blksize to the larger of the hole sizes for invp and outvp.
3188          * If hole sizes aren't available, set the blksize to the larger
3189          * f_iosize of invp and outvp.
3190          * This code expects the hole sizes and f_iosizes to be powers of 2.
3191          * This value is clipped at 4Kbytes and 1Mbyte.
3192          */
3193         blksize = MAX(holein, holeout);
3194
3195         /* Clip len to end at an exact multiple of hole size. */
3196         if (blksize > 1) {
3197                 rem = *inoffp % blksize;
3198                 if (rem > 0)
3199                         rem = blksize - rem;
3200                 if (len > rem && len - rem > blksize)
3201                         len = savlen = rounddown(len - rem, blksize) + rem;
3202         }
3203
3204         if (blksize <= 1)
3205                 blksize = MAX(invp->v_mount->mnt_stat.f_iosize,
3206                     outvp->v_mount->mnt_stat.f_iosize);
3207         if (blksize < 4096)
3208                 blksize = 4096;
3209         else if (blksize > 1024 * 1024)
3210                 blksize = 1024 * 1024;
3211         dat = malloc(blksize, M_TEMP, M_WAITOK);
3212
3213         /*
3214          * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA
3215          * to find holes.  Otherwise, just scan the read block for all 0s
3216          * in the inner loop where the data copying is done.
3217          * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may
3218          * support holes on the server, but do not support FIOSEEKHOLE.
3219          */
3220         holetoeof = eof = false;
3221         while (len > 0 && error == 0 && !eof && interrupted == 0) {
3222                 endoff = 0;                     /* To shut up compilers. */
3223                 cantseek = true;
3224                 startoff = *inoffp;
3225                 copylen = len;
3226
3227                 /*
3228                  * Find the next data area.  If there is just a hole to EOF,
3229                  * FIOSEEKDATA should fail with ENXIO.
3230                  * (I do not know if any file system will report a hole to
3231                  *  EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA
3232                  *  will fail for those file systems.)
3233                  *
3234                  * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE,
3235                  * the code just falls through to the inner copy loop.
3236                  */
3237                 error = EINVAL;
3238                 if (holein > 0) {
3239                         error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0,
3240                             incred, curthread);
3241                         if (error == ENXIO) {
3242                                 startoff = endoff = inva.va_size;
3243                                 eof = holetoeof = true;
3244                                 error = 0;
3245                         }
3246                 }
3247                 if (error == 0 && !holetoeof) {
3248                         endoff = startoff;
3249                         error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0,
3250                             incred, curthread);
3251                         /*
3252                          * Since invp is unlocked, it may be possible for
3253                          * another thread to do a truncate(), lseek(), write()
3254                          * creating a hole at startoff between the above
3255                          * VOP_IOCTL() calls, if the other thread does not do
3256                          * rangelocking.
3257                          * If that happens, startoff == endoff and finding
3258                          * the hole has failed, so set an error.
3259                          */
3260                         if (error == 0 && startoff == endoff)
3261                                 error = EINVAL; /* Any error. Reset to 0. */
3262                 }
3263                 if (error == 0) {
3264                         if (startoff > *inoffp) {
3265                                 /* Found hole before data block. */
3266                                 xfer = MIN(startoff - *inoffp, len);
3267                                 if (*outoffp < va.va_size) {
3268                                         /* Must write 0s to punch hole. */
3269                                         xfer2 = MIN(va.va_size - *outoffp,
3270                                             xfer);
3271                                         memset(dat, 0, MIN(xfer2, blksize));
3272                                         error = vn_write_outvp(outvp, dat,
3273                                             *outoffp, xfer2, blksize, false,
3274                                             holeout > 0, outcred);
3275                                 }
3276
3277                                 if (error == 0 && *outoffp + xfer >
3278                                     va.va_size && (xfer == len || holetoeof)) {
3279                                         /* Grow output file (hole at end). */
3280                                         error = vn_write_outvp(outvp, dat,
3281                                             *outoffp, xfer, blksize, true,
3282                                             false, outcred);
3283                                 }
3284                                 if (error == 0) {
3285                                         *inoffp += xfer;
3286                                         *outoffp += xfer;
3287                                         len -= xfer;
3288                                         if (len < savlen)
3289                                                 interrupted = sig_intr();
3290                                 }
3291                         }
3292                         copylen = MIN(len, endoff - startoff);
3293                         cantseek = false;
3294                 } else {
3295                         cantseek = true;
3296                         startoff = *inoffp;
3297                         copylen = len;
3298                         error = 0;
3299                 }
3300
3301                 xfer = blksize;
3302                 if (cantseek) {
3303                         /*
3304                          * Set first xfer to end at a block boundary, so that
3305                          * holes are more likely detected in the loop below via
3306                          * the for all bytes 0 method.
3307                          */
3308                         xfer -= (*inoffp % blksize);
3309                 }
3310                 /* Loop copying the data block. */
3311                 while (copylen > 0 && error == 0 && !eof && interrupted == 0) {
3312                         if (copylen < xfer)
3313                                 xfer = copylen;
3314                         error = vn_lock(invp, LK_SHARED);
3315                         if (error != 0)
3316                                 goto out;
3317                         error = vn_rdwr(UIO_READ, invp, dat, xfer,
3318                             startoff, UIO_SYSSPACE, IO_NODELOCKED,
3319                             curthread->td_ucred, incred, &aresid,
3320                             curthread);
3321                         VOP_UNLOCK(invp);
3322                         lastblock = false;
3323                         if (error == 0 && aresid > 0) {
3324                                 /* Stop the copy at EOF on the input file. */
3325                                 xfer -= aresid;
3326                                 eof = true;
3327                                 lastblock = true;
3328                         }
3329                         if (error == 0) {
3330                                 /*
3331                                  * Skip the write for holes past the initial EOF
3332                                  * of the output file, unless this is the last
3333                                  * write of the output file at EOF.
3334                                  */
3335                                 readzeros = cantseek ? mem_iszero(dat, xfer) :
3336                                     false;
3337                                 if (xfer == len)
3338                                         lastblock = true;
3339                                 if (!cantseek || *outoffp < va.va_size ||
3340                                     lastblock || !readzeros)
3341                                         error = vn_write_outvp(outvp, dat,
3342                                             *outoffp, xfer, blksize,
3343                                             readzeros && lastblock &&
3344                                             *outoffp >= va.va_size, false,
3345                                             outcred);
3346                                 if (error == 0) {
3347                                         *inoffp += xfer;
3348                                         startoff += xfer;
3349                                         *outoffp += xfer;
3350                                         copylen -= xfer;
3351                                         len -= xfer;
3352                                         if (len < savlen)
3353                                                 interrupted = sig_intr();
3354                                 }
3355                         }
3356                         xfer = blksize;
3357                 }
3358         }
3359 out:
3360         *lenp = savlen - len;
3361         free(dat, M_TEMP);
3362         return (error);
3363 }
3364
3365 static int
3366 vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td)
3367 {
3368         struct mount *mp;
3369         struct vnode *vp;
3370         off_t olen, ooffset;
3371         int error;
3372 #ifdef AUDIT
3373         int audited_vnode1 = 0;
3374 #endif
3375
3376         vp = fp->f_vnode;
3377         if (vp->v_type != VREG)
3378                 return (ENODEV);
3379
3380         /* Allocating blocks may take a long time, so iterate. */
3381         for (;;) {
3382                 olen = len;
3383                 ooffset = offset;
3384
3385                 bwillwrite();
3386                 mp = NULL;
3387                 error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
3388                 if (error != 0)
3389                         break;
3390                 error = vn_lock(vp, LK_EXCLUSIVE);
3391                 if (error != 0) {
3392                         vn_finished_write(mp);
3393                         break;
3394                 }
3395 #ifdef AUDIT
3396                 if (!audited_vnode1) {
3397                         AUDIT_ARG_VNODE1(vp);
3398                         audited_vnode1 = 1;
3399                 }
3400 #endif
3401 #ifdef MAC
3402                 error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
3403                 if (error == 0)
3404 #endif
3405                         error = VOP_ALLOCATE(vp, &offset, &len);
3406                 VOP_UNLOCK(vp);
3407                 vn_finished_write(mp);
3408
3409                 if (olen + ooffset != offset + len) {
3410                         panic("offset + len changed from %jx/%jx to %jx/%jx",
3411                             ooffset, olen, offset, len);
3412                 }
3413                 if (error != 0 || len == 0)
3414                         break;
3415                 KASSERT(olen > len, ("Iteration did not make progress?"));
3416                 maybe_yield();
3417         }
3418
3419         return (error);
3420 }
3421
3422 static u_long vn_lock_pair_pause_cnt;
3423 SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD,
3424     &vn_lock_pair_pause_cnt, 0,
3425     "Count of vn_lock_pair deadlocks");
3426
3427 u_int vn_lock_pair_pause_max;
3428 SYSCTL_UINT(_debug, OID_AUTO, vn_lock_pair_pause_max, CTLFLAG_RW,
3429     &vn_lock_pair_pause_max, 0,
3430     "Max ticks for vn_lock_pair deadlock avoidance sleep");
3431
3432 static void
3433 vn_lock_pair_pause(const char *wmesg)
3434 {
3435         atomic_add_long(&vn_lock_pair_pause_cnt, 1);
3436         pause(wmesg, prng32_bounded(vn_lock_pair_pause_max));
3437 }
3438
3439 /*
3440  * Lock pair of vnodes vp1, vp2, avoiding lock order reversal.
3441  * vp1_locked indicates whether vp1 is exclusively locked; if not, vp1
3442  * must be unlocked.  Same for vp2 and vp2_locked.  One of the vnodes
3443  * can be NULL.
3444  *
3445  * The function returns with both vnodes exclusively locked, and
3446  * guarantees that it does not create lock order reversal with other
3447  * threads during its execution.  Both vnodes could be unlocked
3448  * temporary (and reclaimed).
3449  */
3450 void
3451 vn_lock_pair(struct vnode *vp1, bool vp1_locked, struct vnode *vp2,
3452     bool vp2_locked)
3453 {
3454         int error;
3455
3456         if (vp1 == NULL && vp2 == NULL)
3457                 return;
3458         if (vp1 != NULL) {
3459                 if (vp1_locked)
3460                         ASSERT_VOP_ELOCKED(vp1, "vp1");
3461                 else
3462                         ASSERT_VOP_UNLOCKED(vp1, "vp1");
3463         } else {
3464                 vp1_locked = true;
3465         }
3466         if (vp2 != NULL) {
3467                 if (vp2_locked)
3468                         ASSERT_VOP_ELOCKED(vp2, "vp2");
3469                 else
3470                         ASSERT_VOP_UNLOCKED(vp2, "vp2");
3471         } else {
3472                 vp2_locked = true;
3473         }
3474         if (!vp1_locked && !vp2_locked) {
3475                 vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY);
3476                 vp1_locked = true;
3477         }
3478
3479         for (;;) {
3480                 if (vp1_locked && vp2_locked)
3481                         break;
3482                 if (vp1_locked && vp2 != NULL) {
3483                         if (vp1 != NULL) {
3484                                 error = VOP_LOCK1(vp2, LK_EXCLUSIVE | LK_NOWAIT,
3485                                     __FILE__, __LINE__);
3486                                 if (error == 0)
3487                                         break;
3488                                 VOP_UNLOCK(vp1);
3489                                 vp1_locked = false;
3490                                 vn_lock_pair_pause("vlp1");
3491                         }
3492                         vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY);
3493                         vp2_locked = true;
3494                 }
3495                 if (vp2_locked && vp1 != NULL) {
3496                         if (vp2 != NULL) {
3497                                 error = VOP_LOCK1(vp1, LK_EXCLUSIVE | LK_NOWAIT,
3498                                     __FILE__, __LINE__);
3499                                 if (error == 0)
3500                                         break;
3501                                 VOP_UNLOCK(vp2);
3502                                 vp2_locked = false;
3503                                 vn_lock_pair_pause("vlp2");
3504                         }
3505                         vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY);
3506                         vp1_locked = true;
3507                 }
3508         }
3509         if (vp1 != NULL)
3510                 ASSERT_VOP_ELOCKED(vp1, "vp1 ret");
3511         if (vp2 != NULL)
3512                 ASSERT_VOP_ELOCKED(vp2, "vp2 ret");
3513 }