sys/kern/vfs_vnops.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1982, 1986, 1989, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  * (c) UNIX System Laboratories, Inc.
   7  * All or some portions of this file are derived from material licensed
   8  * to the University of California by American Telephone and Telegraph
   9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  10  * the permission of UNIX System Laboratories, Inc.
  11  *
  12  * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
  13  * Copyright (c) 2013, 2014 The FreeBSD Foundation
  14  *
  15  * Portions of this software were developed by Konstantin Belousov
  16  * under sponsorship from the FreeBSD Foundation.
  17  *
  18  * Redistribution and use in source and binary forms, with or without
  19  * modification, are permitted provided that the following conditions
  20  * are met:
  21  * 1. Redistributions of source code must retain the above copyright
  22  *    notice, this list of conditions and the following disclaimer.
  23  * 2. Redistributions in binary form must reproduce the above copyright
  24  *    notice, this list of conditions and the following disclaimer in the
  25  *    documentation and/or other materials provided with the distribution.
  26  * 3. Neither the name of the University nor the names of its contributors
  27  *    may be used to endorse or promote products derived from this software
  28  *    without specific prior written permission.
  29  *
  30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  40  * SUCH DAMAGE.
  41  *
  42  *      @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
  43  */
  44
  45 #include <sys/cdefs.h>
  46 __FBSDID("$FreeBSD$");
  47
  48 #include "opt_hwpmc_hooks.h"
  49
  50 #include <sys/param.h>
  51 #include <sys/systm.h>
  52 #include <sys/disk.h>
  53 #include <sys/fail.h>
  54 #include <sys/fcntl.h>
  55 #include <sys/file.h>
  56 #include <sys/kdb.h>
  57 #include <sys/ktr.h>
  58 #include <sys/stat.h>
  59 #include <sys/priv.h>
  60 #include <sys/proc.h>
  61 #include <sys/limits.h>
  62 #include <sys/lock.h>
  63 #include <sys/mman.h>
  64 #include <sys/mount.h>
  65 #include <sys/mutex.h>
  66 #include <sys/namei.h>
  67 #include <sys/vnode.h>
  68 #include <sys/bio.h>
  69 #include <sys/buf.h>
  70 #include <sys/filio.h>
  71 #include <sys/resourcevar.h>
  72 #include <sys/rwlock.h>
  73 #include <sys/prng.h>
  74 #include <sys/sx.h>
  75 #include <sys/sleepqueue.h>
  76 #include <sys/sysctl.h>
  77 #include <sys/ttycom.h>
  78 #include <sys/conf.h>
  79 #include <sys/syslog.h>
  80 #include <sys/unistd.h>
  81 #include <sys/user.h>
  82
  83 #include <security/audit/audit.h>
  84 #include <security/mac/mac_framework.h>
  85
  86 #include <vm/vm.h>
  87 #include <vm/vm_extern.h>
  88 #include <vm/pmap.h>
  89 #include <vm/vm_map.h>
  90 #include <vm/vm_object.h>
  91 #include <vm/vm_page.h>
  92 #include <vm/vm_pager.h>
  93
  94 #ifdef HWPMC_HOOKS
  95 #include <sys/pmckern.h>
  96 #endif
  97
  98 static fo_rdwr_t        vn_read;
  99 static fo_rdwr_t        vn_write;
 100 static fo_rdwr_t        vn_io_fault;
 101 static fo_truncate_t    vn_truncate;
 102 static fo_ioctl_t       vn_ioctl;
 103 static fo_poll_t        vn_poll;
 104 static fo_kqfilter_t    vn_kqfilter;
 105 static fo_close_t       vn_closefile;
 106 static fo_mmap_t        vn_mmap;
 107 static fo_fallocate_t   vn_fallocate;
 108
 109 struct  fileops vnops = {
 110         .fo_read = vn_io_fault,
 111         .fo_write = vn_io_fault,
 112         .fo_truncate = vn_truncate,
 113         .fo_ioctl = vn_ioctl,
 114         .fo_poll = vn_poll,
 115         .fo_kqfilter = vn_kqfilter,
 116         .fo_stat = vn_statfile,
 117         .fo_close = vn_closefile,
 118         .fo_chmod = vn_chmod,
 119         .fo_chown = vn_chown,
 120         .fo_sendfile = vn_sendfile,
 121         .fo_seek = vn_seek,
 122         .fo_fill_kinfo = vn_fill_kinfo,
 123         .fo_mmap = vn_mmap,
 124         .fo_fallocate = vn_fallocate,
 125         .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 126 };
 127
 128 const u_int io_hold_cnt = 16;
 129 static int vn_io_fault_enable = 1;
 130 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN,
 131     &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
 132 static int vn_io_fault_prefault = 0;
 133 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN,
 134     &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
 135 static int vn_io_pgcache_read_enable = 1;
 136 SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN,
 137     &vn_io_pgcache_read_enable, 0,
 138     "Enable copying from page cache for reads, avoiding fs");
 139 static u_long vn_io_faults_cnt;
 140 SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
 141     &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
 142
 143 static int vfs_allow_read_dir = 0;
 144 SYSCTL_INT(_security_bsd, OID_AUTO, allow_read_dir, CTLFLAG_RW,
 145     &vfs_allow_read_dir, 0,
 146     "Enable read(2) of directory by root for filesystems that support it");
 147
 148 /*
 149  * Returns true if vn_io_fault mode of handling the i/o request should
 150  * be used.
 151  */
 152 static bool
 153 do_vn_io_fault(struct vnode *vp, struct uio *uio)
 154 {
 155         struct mount *mp;
 156
 157         return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
 158             (mp = vp->v_mount) != NULL &&
 159             (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
 160 }
 161
 162 /*
 163  * Structure used to pass arguments to vn_io_fault1(), to do either
 164  * file- or vnode-based I/O calls.
 165  */
 166 struct vn_io_fault_args {
 167         enum {
 168                 VN_IO_FAULT_FOP,
 169                 VN_IO_FAULT_VOP
 170         } kind;
 171         struct ucred *cred;
 172         int flags;
 173         union {
 174                 struct fop_args_tag {
 175                         struct file *fp;
 176                         fo_rdwr_t *doio;
 177                 } fop_args;
 178                 struct vop_args_tag {
 179                         struct vnode *vp;
 180                 } vop_args;
 181         } args;
 182 };
 183
 184 static int vn_io_fault1(struct vnode *vp, struct uio *uio,
 185     struct vn_io_fault_args *args, struct thread *td);
 186
 187 int
 188 vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp)
 189 {
 190         struct thread *td = ndp->ni_cnd.cn_thread;
 191
 192         return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
 193 }
 194
 195 static uint64_t
 196 open2nameif(int fmode, u_int vn_open_flags)
 197 {
 198         uint64_t res;
 199
 200         res = ISOPEN | LOCKLEAF;
 201         if ((fmode & O_RESOLVE_BENEATH) != 0)
 202                 res |= RBENEATH;
 203         if ((vn_open_flags & VN_OPEN_NOAUDIT) == 0)
 204                 res |= AUDITVNODE1;
 205         if ((vn_open_flags & VN_OPEN_NOCAPCHECK) != 0)
 206                 res |= NOCAPCHECK;
 207         return (res);
 208 }
 209
 210 /*
 211  * Common code for vnode open operations via a name lookup.
 212  * Lookup the vnode and invoke VOP_CREATE if needed.
 213  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 214  *
 215  * Note that this does NOT free nameidata for the successful case,
 216  * due to the NDINIT being done elsewhere.
 217  */
 218 int
 219 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
 220     struct ucred *cred, struct file *fp)
 221 {
 222         struct vnode *vp;
 223         struct mount *mp;
 224         struct thread *td = ndp->ni_cnd.cn_thread;
 225         struct vattr vat;
 226         struct vattr *vap = &vat;
 227         int fmode, error;
 228         bool first_open;
 229
 230 restart:
 231         first_open = false;
 232         fmode = *flagp;
 233         if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT |
 234             O_EXCL | O_DIRECTORY))
 235                 return (EINVAL);
 236         else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) {
 237                 ndp->ni_cnd.cn_nameiop = CREATE;
 238                 ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
 239                 /*
 240                  * Set NOCACHE to avoid flushing the cache when
 241                  * rolling in many files at once.
 242                  *
 243                  * Set NC_KEEPPOSENTRY to keep positive entries if they already
 244                  * exist despite NOCACHE.
 245                  */
 246                 ndp->ni_cnd.cn_flags |= LOCKPARENT | NOCACHE | NC_KEEPPOSENTRY;
 247                 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
 248                         ndp->ni_cnd.cn_flags |= FOLLOW;
 249                 if ((vn_open_flags & VN_OPEN_INVFS) == 0)
 250                         bwillwrite();
 251                 if ((error = namei(ndp)) != 0)
 252                         return (error);
 253                 if (ndp->ni_vp == NULL) {
 254                         VATTR_NULL(vap);
 255                         vap->va_type = VREG;
 256                         vap->va_mode = cmode;
 257                         if (fmode & O_EXCL)
 258                                 vap->va_vaflags |= VA_EXCLUSIVE;
 259                         if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
 260                                 NDFREE(ndp, NDF_ONLY_PNBUF);
 261                                 vput(ndp->ni_dvp);
 262                                 if ((error = vn_start_write(NULL, &mp,
 263                                     V_XSLEEP | PCATCH)) != 0)
 264                                         return (error);
 265                                 NDREINIT(ndp);
 266                                 goto restart;
 267                         }
 268                         if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
 269                                 ndp->ni_cnd.cn_flags |= MAKEENTRY;
 270 #ifdef MAC
 271                         error = mac_vnode_check_create(cred, ndp->ni_dvp,
 272                             &ndp->ni_cnd, vap);
 273                         if (error == 0)
 274 #endif
 275                                 error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
 276                                     &ndp->ni_cnd, vap);
 277                         vp = ndp->ni_vp;
 278                         if (error == 0 && (fmode & O_EXCL) != 0 &&
 279                             (fmode & (O_EXLOCK | O_SHLOCK)) != 0) {
 280                                 VI_LOCK(vp);
 281                                 vp->v_iflag |= VI_FOPENING;
 282                                 VI_UNLOCK(vp);
 283                                 first_open = true;
 284                         }
 285                         VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &vp : NULL,
 286                             false);
 287                         vn_finished_write(mp);
 288                         if (error) {
 289                                 NDFREE(ndp, NDF_ONLY_PNBUF);
 290                                 if (error == ERELOOKUP) {
 291                                         NDREINIT(ndp);
 292                                         goto restart;
 293                                 }
 294                                 return (error);
 295                         }
 296                         fmode &= ~O_TRUNC;
 297                 } else {
 298                         if (ndp->ni_dvp == ndp->ni_vp)
 299                                 vrele(ndp->ni_dvp);
 300                         else
 301                                 vput(ndp->ni_dvp);
 302                         ndp->ni_dvp = NULL;
 303                         vp = ndp->ni_vp;
 304                         if (fmode & O_EXCL) {
 305                                 error = EEXIST;
 306                                 goto bad;
 307                         }
 308                         if (vp->v_type == VDIR) {
 309                                 error = EISDIR;
 310                                 goto bad;
 311                         }
 312                         fmode &= ~O_CREAT;
 313                 }
 314         } else {
 315                 ndp->ni_cnd.cn_nameiop = LOOKUP;
 316                 ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
 317                 ndp->ni_cnd.cn_flags |= (fmode & O_NOFOLLOW) != 0 ? NOFOLLOW :
 318                     FOLLOW;
 319                 if ((fmode & FWRITE) == 0)
 320                         ndp->ni_cnd.cn_flags |= LOCKSHARED;
 321                 if ((error = namei(ndp)) != 0)
 322                         return (error);
 323                 vp = ndp->ni_vp;
 324         }
 325         error = vn_open_vnode(vp, fmode, cred, td, fp);
 326         if (first_open) {
 327                 VI_LOCK(vp);
 328                 vp->v_iflag &= ~VI_FOPENING;
 329                 wakeup(vp);
 330                 VI_UNLOCK(vp);
 331         }
 332         if (error)
 333                 goto bad;
 334         *flagp = fmode;
 335         return (0);
 336 bad:
 337         NDFREE(ndp, NDF_ONLY_PNBUF);
 338         vput(vp);
 339         *flagp = fmode;
 340         ndp->ni_vp = NULL;
 341         return (error);
 342 }
 343
 344 static int
 345 vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp)
 346 {
 347         struct flock lf;
 348         int error, lock_flags, type;
 349
 350         ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock");
 351         if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0)
 352                 return (0);
 353         KASSERT(fp != NULL, ("open with flock requires fp"));
 354         if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE)
 355                 return (EOPNOTSUPP);
 356
 357         lock_flags = VOP_ISLOCKED(vp);
 358         VOP_UNLOCK(vp);
 359
 360         lf.l_whence = SEEK_SET;
 361         lf.l_start = 0;
 362         lf.l_len = 0;
 363         lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK;
 364         type = F_FLOCK;
 365         if ((fmode & FNONBLOCK) == 0)
 366                 type |= F_WAIT;
 367         if ((fmode & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
 368                 type |= F_FIRSTOPEN;
 369         error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
 370         if (error == 0)
 371                 fp->f_flag |= FHASLOCK;
 372
 373         vn_lock(vp, lock_flags | LK_RETRY);
 374         return (error);
 375 }
 376
 377 /*
 378  * Common code for vnode open operations once a vnode is located.
 379  * Check permissions, and call the VOP_OPEN routine.
 380  */
 381 int
 382 vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
 383     struct thread *td, struct file *fp)
 384 {
 385         accmode_t accmode;
 386         int error;
 387
 388         if (vp->v_type == VLNK) {
 389                 if ((fmode & O_PATH) == 0 || (fmode & FEXEC) != 0)
 390                         return (EMLINK);
 391         }
 392         if (vp->v_type == VSOCK)
 393                 return (EOPNOTSUPP);
 394         if (vp->v_type != VDIR && fmode & O_DIRECTORY)
 395                 return (ENOTDIR);
 396
 397         accmode = 0;
 398         if ((fmode & O_PATH) == 0) {
 399                 if ((fmode & (FWRITE | O_TRUNC)) != 0) {
 400                         if (vp->v_type == VDIR)
 401                                 return (EISDIR);
 402                         accmode |= VWRITE;
 403                 }
 404                 if ((fmode & FREAD) != 0)
 405                         accmode |= VREAD;
 406                 if ((fmode & O_APPEND) && (fmode & FWRITE))
 407                         accmode |= VAPPEND;
 408 #ifdef MAC
 409                 if ((fmode & O_CREAT) != 0)
 410                         accmode |= VCREAT;
 411 #endif
 412         }
 413         if ((fmode & FEXEC) != 0)
 414                 accmode |= VEXEC;
 415 #ifdef MAC
 416         if ((fmode & O_VERIFY) != 0)
 417                 accmode |= VVERIFY;
 418         error = mac_vnode_check_open(cred, vp, accmode);
 419         if (error != 0)
 420                 return (error);
 421
 422         accmode &= ~(VCREAT | VVERIFY);
 423 #endif
 424         if ((fmode & O_CREAT) == 0 && accmode != 0) {
 425                 error = VOP_ACCESS(vp, accmode, cred, td);
 426                 if (error != 0)
 427                         return (error);
 428         }
 429         if ((fmode & O_PATH) != 0)
 430                 return (0);
 431
 432         if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 433                 vn_lock(vp, LK_UPGRADE | LK_RETRY);
 434         error = VOP_OPEN(vp, fmode, cred, td, fp);
 435         if (error != 0)
 436                 return (error);
 437
 438         error = vn_open_vnode_advlock(vp, fmode, fp);
 439         if (error == 0 && (fmode & FWRITE) != 0) {
 440                 error = VOP_ADD_WRITECOUNT(vp, 1);
 441                 if (error == 0) {
 442                         CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
 443                              __func__, vp, vp->v_writecount);
 444                 }
 445         }
 446
 447         /*
 448          * Error from advlock or VOP_ADD_WRITECOUNT() still requires
 449          * calling VOP_CLOSE() to pair with earlier VOP_OPEN().
 450          * Arrange for that by having fdrop() to use vn_closefile().
 451          */
 452         if (error != 0) {
 453                 fp->f_flag |= FOPENFAILED;
 454                 fp->f_vnode = vp;
 455                 if (fp->f_ops == &badfileops) {
 456                         fp->f_type = DTYPE_VNODE;
 457                         fp->f_ops = &vnops;
 458                 }
 459                 vref(vp);
 460         }
 461
 462         ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
 463         return (error);
 464
 465 }
 466
 467 /*
 468  * Check for write permissions on the specified vnode.
 469  * Prototype text segments cannot be written.
 470  * It is racy.
 471  */
 472 int
 473 vn_writechk(struct vnode *vp)
 474 {
 475
 476         ASSERT_VOP_LOCKED(vp, "vn_writechk");
 477         /*
 478          * If there's shared text associated with
 479          * the vnode, try to free it up once.  If
 480          * we fail, we can't allow writing.
 481          */
 482         if (VOP_IS_TEXT(vp))
 483                 return (ETXTBSY);
 484
 485         return (0);
 486 }
 487
 488 /*
 489  * Vnode close call
 490  */
 491 static int
 492 vn_close1(struct vnode *vp, int flags, struct ucred *file_cred,
 493     struct thread *td, bool keep_ref)
 494 {
 495         struct mount *mp;
 496         int error, lock_flags;
 497
 498         if (vp->v_type != VFIFO && (flags & FWRITE) == 0 &&
 499             MNT_EXTENDED_SHARED(vp->v_mount))
 500                 lock_flags = LK_SHARED;
 501         else
 502                 lock_flags = LK_EXCLUSIVE;
 503
 504         vn_start_write(vp, &mp, V_WAIT);
 505         vn_lock(vp, lock_flags | LK_RETRY);
 506         AUDIT_ARG_VNODE1(vp);
 507         if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) {
 508                 VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
 509                 CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
 510                     __func__, vp, vp->v_writecount);
 511         }
 512         error = VOP_CLOSE(vp, flags, file_cred, td);
 513         if (keep_ref)
 514                 VOP_UNLOCK(vp);
 515         else
 516                 vput(vp);
 517         vn_finished_write(mp);
 518         return (error);
 519 }
 520
 521 int
 522 vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
 523     struct thread *td)
 524 {
 525
 526         return (vn_close1(vp, flags, file_cred, td, false));
 527 }
 528
 529 /*
 530  * Heuristic to detect sequential operation.
 531  */
 532 static int
 533 sequential_heuristic(struct uio *uio, struct file *fp)
 534 {
 535         enum uio_rw rw;
 536
 537         ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
 538
 539         rw = uio->uio_rw;
 540         if (fp->f_flag & FRDAHEAD)
 541                 return (fp->f_seqcount[rw] << IO_SEQSHIFT);
 542
 543         /*
 544          * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
 545          * that the first I/O is normally considered to be slightly
 546          * sequential.  Seeking to offset 0 doesn't change sequentiality
 547          * unless previous seeks have reduced f_seqcount to 0, in which
 548          * case offset 0 is not special.
 549          */
 550         if ((uio->uio_offset == 0 && fp->f_seqcount[rw] > 0) ||
 551             uio->uio_offset == fp->f_nextoff[rw]) {
 552                 /*
 553                  * f_seqcount is in units of fixed-size blocks so that it
 554                  * depends mainly on the amount of sequential I/O and not
 555                  * much on the number of sequential I/O's.  The fixed size
 556                  * of 16384 is hard-coded here since it is (not quite) just
 557                  * a magic size that works well here.  This size is more
 558                  * closely related to the best I/O size for real disks than
 559                  * to any block size used by software.
 560                  */
 561                 if (uio->uio_resid >= IO_SEQMAX * 16384)
 562                         fp->f_seqcount[rw] = IO_SEQMAX;
 563                 else {
 564                         fp->f_seqcount[rw] += howmany(uio->uio_resid, 16384);
 565                         if (fp->f_seqcount[rw] > IO_SEQMAX)
 566                                 fp->f_seqcount[rw] = IO_SEQMAX;
 567                 }
 568                 return (fp->f_seqcount[rw] << IO_SEQSHIFT);
 569         }
 570
 571         /* Not sequential.  Quickly draw-down sequentiality. */
 572         if (fp->f_seqcount[rw] > 1)
 573                 fp->f_seqcount[rw] = 1;
 574         else
 575                 fp->f_seqcount[rw] = 0;
 576         return (0);
 577 }
 578
 579 /*
 580  * Package up an I/O request on a vnode into a uio and do it.
 581  */
 582 int
 583 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
 584     enum uio_seg segflg, int ioflg, struct ucred *active_cred,
 585     struct ucred *file_cred, ssize_t *aresid, struct thread *td)
 586 {
 587         struct uio auio;
 588         struct iovec aiov;
 589         struct mount *mp;
 590         struct ucred *cred;
 591         void *rl_cookie;
 592         struct vn_io_fault_args args;
 593         int error, lock_flags;
 594
 595         if (offset < 0 && vp->v_type != VCHR)
 596                 return (EINVAL);
 597         auio.uio_iov = &aiov;
 598         auio.uio_iovcnt = 1;
 599         aiov.iov_base = base;
 600         aiov.iov_len = len;
 601         auio.uio_resid = len;
 602         auio.uio_offset = offset;
 603         auio.uio_segflg = segflg;
 604         auio.uio_rw = rw;
 605         auio.uio_td = td;
 606         error = 0;
 607
 608         if ((ioflg & IO_NODELOCKED) == 0) {
 609                 if ((ioflg & IO_RANGELOCKED) == 0) {
 610                         if (rw == UIO_READ) {
 611                                 rl_cookie = vn_rangelock_rlock(vp, offset,
 612                                     offset + len);
 613                         } else if ((ioflg & IO_APPEND) != 0) {
 614                                 rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 615                         } else {
 616                                 rl_cookie = vn_rangelock_wlock(vp, offset,
 617                                     offset + len);
 618                         }
 619                 } else
 620                         rl_cookie = NULL;
 621                 mp = NULL;
 622                 if (rw == UIO_WRITE) {
 623                         if (vp->v_type != VCHR &&
 624                             (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
 625                             != 0)
 626                                 goto out;
 627                         if (MNT_SHARED_WRITES(mp) ||
 628                             ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
 629                                 lock_flags = LK_SHARED;
 630                         else
 631                                 lock_flags = LK_EXCLUSIVE;
 632                 } else
 633                         lock_flags = LK_SHARED;
 634                 vn_lock(vp, lock_flags | LK_RETRY);
 635         } else
 636                 rl_cookie = NULL;
 637
 638         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 639 #ifdef MAC
 640         if ((ioflg & IO_NOMACCHECK) == 0) {
 641                 if (rw == UIO_READ)
 642                         error = mac_vnode_check_read(active_cred, file_cred,
 643                             vp);
 644                 else
 645                         error = mac_vnode_check_write(active_cred, file_cred,
 646                             vp);
 647         }
 648 #endif
 649         if (error == 0) {
 650                 if (file_cred != NULL)
 651                         cred = file_cred;
 652                 else
 653                         cred = active_cred;
 654                 if (do_vn_io_fault(vp, &auio)) {
 655                         args.kind = VN_IO_FAULT_VOP;
 656                         args.cred = cred;
 657                         args.flags = ioflg;
 658                         args.args.vop_args.vp = vp;
 659                         error = vn_io_fault1(vp, &auio, &args, td);
 660                 } else if (rw == UIO_READ) {
 661                         error = VOP_READ(vp, &auio, ioflg, cred);
 662                 } else /* if (rw == UIO_WRITE) */ {
 663                         error = VOP_WRITE(vp, &auio, ioflg, cred);
 664                 }
 665         }
 666         if (aresid)
 667                 *aresid = auio.uio_resid;
 668         else
 669                 if (auio.uio_resid && error == 0)
 670                         error = EIO;
 671         if ((ioflg & IO_NODELOCKED) == 0) {
 672                 VOP_UNLOCK(vp);
 673                 if (mp != NULL)
 674                         vn_finished_write(mp);
 675         }
 676  out:
 677         if (rl_cookie != NULL)
 678                 vn_rangelock_unlock(vp, rl_cookie);
 679         return (error);
 680 }
 681
 682 /*
 683  * Package up an I/O request on a vnode into a uio and do it.  The I/O
 684  * request is split up into smaller chunks and we try to avoid saturating
 685  * the buffer cache while potentially holding a vnode locked, so we
 686  * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
 687  * to give other processes a chance to lock the vnode (either other processes
 688  * core'ing the same binary, or unrelated processes scanning the directory).
 689  */
 690 int
 691 vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len,
 692     off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred,
 693     struct ucred *file_cred, size_t *aresid, struct thread *td)
 694 {
 695         int error = 0;
 696         ssize_t iaresid;
 697
 698         do {
 699                 int chunk;
 700
 701                 /*
 702                  * Force `offset' to a multiple of MAXBSIZE except possibly
 703                  * for the first chunk, so that filesystems only need to
 704                  * write full blocks except possibly for the first and last
 705                  * chunks.
 706                  */
 707                 chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
 708
 709                 if (chunk > len)
 710                         chunk = len;
 711                 if (rw != UIO_READ && vp->v_type == VREG)
 712                         bwillwrite();
 713                 iaresid = 0;
 714                 error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
 715                     ioflg, active_cred, file_cred, &iaresid, td);
 716                 len -= chunk;   /* aresid calc already includes length */
 717                 if (error)
 718                         break;
 719                 offset += chunk;
 720                 base = (char *)base + chunk;
 721                 kern_yield(PRI_USER);
 722         } while (len);
 723         if (aresid)
 724                 *aresid = len + iaresid;
 725         return (error);
 726 }
 727
 728 #if OFF_MAX <= LONG_MAX
 729 off_t
 730 foffset_lock(struct file *fp, int flags)
 731 {
 732         volatile short *flagsp;
 733         off_t res;
 734         short state;
 735
 736         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 737
 738         if ((flags & FOF_NOLOCK) != 0)
 739                 return (atomic_load_long(&fp->f_offset));
 740
 741         /*
 742          * According to McKusick the vn lock was protecting f_offset here.
 743          * It is now protected by the FOFFSET_LOCKED flag.
 744          */
 745         flagsp = &fp->f_vnread_flags;
 746         if (atomic_cmpset_acq_16(flagsp, 0, FOFFSET_LOCKED))
 747                 return (atomic_load_long(&fp->f_offset));
 748
 749         sleepq_lock(&fp->f_vnread_flags);
 750         state = atomic_load_16(flagsp);
 751         for (;;) {
 752                 if ((state & FOFFSET_LOCKED) == 0) {
 753                         if (!atomic_fcmpset_acq_16(flagsp, &state,
 754                             FOFFSET_LOCKED))
 755                                 continue;
 756                         break;
 757                 }
 758                 if ((state & FOFFSET_LOCK_WAITING) == 0) {
 759                         if (!atomic_fcmpset_acq_16(flagsp, &state,
 760                             state | FOFFSET_LOCK_WAITING))
 761                                 continue;
 762                 }
 763                 DROP_GIANT();
 764                 sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0);
 765                 sleepq_wait(&fp->f_vnread_flags, PUSER -1);
 766                 PICKUP_GIANT();
 767                 sleepq_lock(&fp->f_vnread_flags);
 768                 state = atomic_load_16(flagsp);
 769         }
 770         res = atomic_load_long(&fp->f_offset);
 771         sleepq_release(&fp->f_vnread_flags);
 772         return (res);
 773 }
 774
/*
 * Release the file offset lock on fp and optionally record a new offset.
 *
 * val is stored into f_offset unless FOF_NOUPDATE is set, and into
 * f_nextoff[UIO_READ] / f_nextoff[UIO_WRITE] when FOF_NEXTOFF_R /
 * FOF_NEXTOFF_W are set.  With FOF_NOLOCK only those stores happen and
 * the lock word is not modified.  FOF_OFFSET must not be passed.
 */
void
foffset_unlock(struct file *fp, off_t val, int flags)
{
        volatile short *flagsp;
        short state;

        KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));

        /* Record the requested values before the lock word is released. */
        if ((flags & FOF_NOUPDATE) == 0)
                atomic_store_long(&fp->f_offset, val);
        if ((flags & FOF_NEXTOFF_R) != 0)
                fp->f_nextoff[UIO_READ] = val;
        if ((flags & FOF_NEXTOFF_W) != 0)
                fp->f_nextoff[UIO_WRITE] = val;

        if ((flags & FOF_NOLOCK) != 0)
                return;

        /*
         * Fast path: no waiter bit was observed in the loaded state, so
         * try to clear the whole word with a release compare-and-set.
         * Success means there is nobody to wake up.
         */
        flagsp = &fp->f_vnread_flags;
        state = atomic_load_16(flagsp);
        if ((state & FOFFSET_LOCK_WAITING) == 0 &&
            atomic_cmpset_rel_16(flagsp, state, 0))
                return;

        /*
         * Slow path: the waiter bit was seen, or the compare-and-set
         * failed.  The assertions below expect both FOFFSET_LOCKED and
         * FOFFSET_LOCK_WAITING to be set at this point.  The word is
         * cleared and all sleepers are woken while the sleepqueue chain
         * for &fp->f_vnread_flags is locked.
         */
        sleepq_lock(&fp->f_vnread_flags);
        MPASS((fp->f_vnread_flags & FOFFSET_LOCKED) != 0);
        MPASS((fp->f_vnread_flags & FOFFSET_LOCK_WAITING) != 0);
        fp->f_vnread_flags = 0;
        sleepq_broadcast(&fp->f_vnread_flags, SLEEPQ_SLEEP, 0, 0);
        sleepq_release(&fp->f_vnread_flags);
}
 806 #else
 807 off_t
 808 foffset_lock(struct file *fp, int flags)
 809 {
 810         struct mtx *mtxp;
 811         off_t res;
 812
 813         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 814
 815         mtxp = mtx_pool_find(mtxpool_sleep, fp);
 816         mtx_lock(mtxp);
 817         if ((flags & FOF_NOLOCK) == 0) {
 818                 while (fp->f_vnread_flags & FOFFSET_LOCKED) {
 819                         fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
 820                         msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
 821                             "vofflock", 0);
 822                 }
 823                 fp->f_vnread_flags |= FOFFSET_LOCKED;
 824         }
 825         res = fp->f_offset;
 826         mtx_unlock(mtxp);
 827         return (res);
 828 }
 829
 830 void
 831 foffset_unlock(struct file *fp, off_t val, int flags)
 832 {
 833         struct mtx *mtxp;
 834
 835         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 836
 837         mtxp = mtx_pool_find(mtxpool_sleep, fp);
 838         mtx_lock(mtxp);
 839         if ((flags & FOF_NOUPDATE) == 0)
 840                 fp->f_offset = val;
 841         if ((flags & FOF_NEXTOFF_R) != 0)
 842                 fp->f_nextoff[UIO_READ] = val;
 843         if ((flags & FOF_NEXTOFF_W) != 0)
 844                 fp->f_nextoff[UIO_WRITE] = val;
 845         if ((flags & FOF_NOLOCK) == 0) {
 846                 KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
 847                     ("Lost FOFFSET_LOCKED"));
 848                 if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
 849                         wakeup(&fp->f_vnread_flags);
 850                 fp->f_vnread_flags = 0;
 851         }
 852         mtx_unlock(mtxp);
 853 }
 854 #endif
 855
 856 void
 857 foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
 858 {
 859
 860         if ((flags & FOF_OFFSET) == 0)
 861                 uio->uio_offset = foffset_lock(fp, flags);
 862 }
 863
 864 void
 865 foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
 866 {
 867
 868         if ((flags & FOF_OFFSET) == 0)
 869                 foffset_unlock(fp, uio->uio_offset, flags);
 870 }
 871
 872 static int
 873 get_advice(struct file *fp, struct uio *uio)
 874 {
 875         struct mtx *mtxp;
 876         int ret;
 877
 878         ret = POSIX_FADV_NORMAL;
 879         if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG)
 880                 return (ret);
 881
 882         mtxp = mtx_pool_find(mtxpool_sleep, fp);
 883         mtx_lock(mtxp);
 884         if (fp->f_advice != NULL &&
 885             uio->uio_offset >= fp->f_advice->fa_start &&
 886             uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
 887                 ret = fp->f_advice->fa_advice;
 888         mtx_unlock(mtxp);
 889         return (ret);
 890 }
 891
/*
 * Try to satisfy a read from pages that are already present and valid
 * in vp's VM object, copying them to uio with vn_io_fault_pgmove().
 *
 * Returns 0 only when the whole request was copied (uio_resid reached
 * zero).  Returns EJUSTRETURN when there is no object, the object is
 * dead, no usable page was found, the vnode is doomed, the offset is at
 * or past vnp_size, or only part of the request could be copied.  Any
 * other value is the error returned by vn_io_fault_pgmove().
 *
 * The caller must not request more than ptoa(io_hold_cnt + 2) bytes.
 */
int
vn_read_from_obj(struct vnode *vp, struct uio *uio)
{
        vm_object_t obj;
        vm_page_t ma[io_hold_cnt + 2];
        off_t off, vsz;
        ssize_t resid;
        int error, i, j;

        MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2));
        obj = atomic_load_ptr(&vp->v_object);
        if (obj == NULL)
                return (EJUSTRETURN);

        /*
         * Depends on type stability of vm_objects.
         */
        vm_object_pip_add(obj, 1);
        if ((obj->flags & OBJ_DEAD) != 0) {
                /*
                 * Note that object might be already reused from the
                 * vnode, and the OBJ_DEAD flag cleared.  This is fine,
                 * we recheck for DOOMED vnode state after all pages
                 * are busied, and retract then.
                 *
                 * But we check for OBJ_DEAD to ensure that we do not
                 * busy pages while vm_object_terminate_pages()
                 * processes the queue.
                 */
                error = EJUSTRETURN;
                goto out_pip;
        }

        /*
         * Shared-busy consecutive resident pages covering the request.
         * The walk stops at the first missing or fully invalid page, so
         * on exit i is the number of pages held in ma[].
         */
        resid = uio->uio_resid;
        off = uio->uio_offset;
        for (i = 0; resid > 0; i++) {
                MPASS(i < io_hold_cnt + 2);
                ma[i] = vm_page_grab_unlocked(obj, atop(off),
                    VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY |
                    VM_ALLOC_NOWAIT);
                if (ma[i] == NULL)
                        break;

                /*
                 * Skip invalid pages.  Valid mask can be partial only
                 * at EOF, and we clip later.
                 */
                if (vm_page_none_valid(ma[i])) {
                        vm_page_sunbusy(ma[i]);
                        break;
                }

                resid -= PAGE_SIZE;
                off += PAGE_SIZE;
        }
        if (i == 0) {
                error = EJUSTRETURN;
                goto out_pip;
        }

        /*
         * Check VIRF_DOOMED after we busied our pages.  Since
         * vgonel() terminates the vnode's vm_object, it cannot
         * process past pages busied by us.
         */
        if (VN_IS_DOOMED(vp)) {
                error = EJUSTRETURN;
                goto out;
        }

        /*
         * Bytes covered by the busied pages, counted from the (possibly
         * unaligned) starting offset, capped at the request size.
         */
        resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1);
        if (resid > uio->uio_resid)
                resid = uio->uio_resid;

        /*
         * Unlocked read of vnp_size is safe because truncation cannot
         * pass busied page.  But we load vnp_size into a local
         * variable so that possible concurrent extension does not
         * break calculation.
         */
#if defined(__powerpc__) && !defined(__powerpc64__)
        vsz = obj->un_pager.vnp.vnp_size;
#else
        vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size);
#endif
        if (uio->uio_offset >= vsz) {
                error = EJUSTRETURN;
                goto out;
        }
        /* Clip the copy at the object size. */
        if (uio->uio_offset + resid > vsz)
                resid = vsz - uio->uio_offset;

        error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio);

out:
        /* Drop the shared busy on every page taken above. */
        for (j = 0; j < i; j++) {
                if (error == 0)
                        vm_page_reference(ma[j]);
                vm_page_sunbusy(ma[j]);
        }
out_pip:
        vm_object_pip_wakeup(obj);
        if (error != 0)
                return (error);
        /* A partial copy is reported as EJUSTRETURN, not as success. */
        return (uio->uio_resid == 0 ? 0 : EJUSTRETURN);
}
 998
 999 /*
1000  * File table vnode read routine.
1001  */
1002 static int
1003 vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
1004     struct thread *td)
1005 {
1006         struct vnode *vp;
1007         off_t orig_offset;
1008         int error, ioflag;
1009         int advice;
1010
1011         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
1012             uio->uio_td, td));
1013         KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
1014         vp = fp->f_vnode;
1015         ioflag = 0;
1016         if (fp->f_flag & FNONBLOCK)
1017                 ioflag |= IO_NDELAY;
1018         if (fp->f_flag & O_DIRECT)
1019                 ioflag |= IO_DIRECT;
1020
1021         /*
1022          * Try to read from page cache.  VIRF_DOOMED check is racy but
1023          * allows us to avoid unneeded work outright.
1024          */
1025         if (vn_io_pgcache_read_enable && !mac_vnode_check_read_enabled() &&
1026             (vn_irflag_read(vp) & (VIRF_DOOMED | VIRF_PGREAD)) == VIRF_PGREAD) {
1027                 error = VOP_READ_PGCACHE(vp, uio, ioflag, fp->f_cred);
1028                 if (error == 0) {
1029                         fp->f_nextoff[UIO_READ] = uio->uio_offset;
1030                         return (0);
1031                 }
1032                 if (error != EJUSTRETURN)
1033                         return (error);
1034         }
1035
1036         advice = get_advice(fp, uio);
1037         vn_lock(vp, LK_SHARED | LK_RETRY);
1038
1039         switch (advice) {
1040         case POSIX_FADV_NORMAL:
1041         case POSIX_FADV_SEQUENTIAL:
1042         case POSIX_FADV_NOREUSE:
1043                 ioflag |= sequential_heuristic(uio, fp);
1044                 break;
1045         case POSIX_FADV_RANDOM:
1046                 /* Disable read-ahead for random I/O. */
1047                 break;
1048         }
1049         orig_offset = uio->uio_offset;
1050
1051 #ifdef MAC
1052         error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
1053         if (error == 0)
1054 #endif
1055                 error = VOP_READ(vp, uio, ioflag, fp->f_cred);
1056         fp->f_nextoff[UIO_READ] = uio->uio_offset;
1057         VOP_UNLOCK(vp);
1058         if (error == 0 && advice == POSIX_FADV_NOREUSE &&
1059             orig_offset != uio->uio_offset)
1060                 /*
1061                  * Use POSIX_FADV_DONTNEED to flush pages and buffers
1062                  * for the backing file after a POSIX_FADV_NOREUSE
1063                  * read(2).
1064                  */
1065                 error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
1066                     POSIX_FADV_DONTNEED);
1067         return (error);
1068 }
1069
1070 /*
1071  * File table vnode write routine.
1072  */
1073 static int
1074 vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
1075     struct thread *td)
1076 {
1077         struct vnode *vp;
1078         struct mount *mp;
1079         off_t orig_offset;
1080         int error, ioflag, lock_flags;
1081         int advice;
1082
1083         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
1084             uio->uio_td, td));
1085         KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
1086         vp = fp->f_vnode;
1087         if (vp->v_type == VREG)
1088                 bwillwrite();
1089         ioflag = IO_UNIT;
1090         if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
1091                 ioflag |= IO_APPEND;
1092         if (fp->f_flag & FNONBLOCK)
1093                 ioflag |= IO_NDELAY;
1094         if (fp->f_flag & O_DIRECT)
1095                 ioflag |= IO_DIRECT;
1096         if ((fp->f_flag & O_FSYNC) ||
1097             (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
1098                 ioflag |= IO_SYNC;
1099         /*
1100          * For O_DSYNC we set both IO_SYNC and IO_DATASYNC, so that VOP_WRITE()
1101          * implementations that don't understand IO_DATASYNC fall back to full
1102          * O_SYNC behavior.
1103          */
1104         if (fp->f_flag & O_DSYNC)
1105                 ioflag |= IO_SYNC | IO_DATASYNC;
1106         mp = NULL;
1107         if (vp->v_type != VCHR &&
1108             (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
1109                 goto unlock;
1110
1111         advice = get_advice(fp, uio);
1112
1113         if (MNT_SHARED_WRITES(mp) ||
1114             (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) {
1115                 lock_flags = LK_SHARED;
1116         } else {
1117                 lock_flags = LK_EXCLUSIVE;
1118         }
1119
1120         vn_lock(vp, lock_flags | LK_RETRY);
1121         switch (advice) {
1122         case POSIX_FADV_NORMAL:
1123         case POSIX_FADV_SEQUENTIAL:
1124         case POSIX_FADV_NOREUSE:
1125                 ioflag |= sequential_heuristic(uio, fp);
1126                 break;
1127         case POSIX_FADV_RANDOM:
1128                 /* XXX: Is this correct? */
1129                 break;
1130         }
1131         orig_offset = uio->uio_offset;
1132
1133 #ifdef MAC
1134         error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
1135         if (error == 0)
1136 #endif
1137                 error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
1138         fp->f_nextoff[UIO_WRITE] = uio->uio_offset;
1139         VOP_UNLOCK(vp);
1140         if (vp->v_type != VCHR)
1141                 vn_finished_write(mp);
1142         if (error == 0 && advice == POSIX_FADV_NOREUSE &&
1143             orig_offset != uio->uio_offset)
1144                 /*
1145                  * Use POSIX_FADV_DONTNEED to flush pages and buffers
1146                  * for the backing file after a POSIX_FADV_NOREUSE
1147                  * write(2).
1148                  */
1149                 error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
1150                     POSIX_FADV_DONTNEED);
1151 unlock:
1152         return (error);
1153 }
1154
1155 /*
1156  * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
1157  * prevent the following deadlock:
1158  *
 * Assume that thread A reads from the vnode vp1 into userspace
 * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
 * currently not resident, then the system ends up with the call chain
1162  *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
1163  *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
1164  * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
1165  * If, at the same time, thread B reads from vnode vp2 into buffer buf2
1166  * backed by the pages of vnode vp1, and some page in buf2 is not
1167  * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
1168  *
1169  * To prevent the lock order reversal and deadlock, vn_io_fault() does
1170  * not allow page faults to happen during VOP_READ() or VOP_WRITE().
1171  * Instead, it first tries to do the whole range i/o with pagefaults
1172  * disabled. If all pages in the i/o buffer are resident and mapped,
1173  * VOP will succeed (ignoring the genuine filesystem errors).
1174  * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
1175  * i/o in chunks, with all pages in the chunk prefaulted and held
1176  * using vm_fault_quick_hold_pages().
1177  *
1178  * Filesystems using this deadlock avoidance scheme should use the
1179  * array of the held pages from uio, saved in the curthread->td_ma,
1180  * instead of doing uiomove().  A helper function
1181  * vn_io_fault_uiomove() converts uiomove request into
1182  * uiomove_fromphys() over td_ma array.
1183  *
1184  * Since vnode locks do not cover the whole i/o anymore, rangelocks
1185  * make the current i/o request atomic with respect to other i/os and
1186  * truncations.
1187  */
1188
1189 /*
1190  * Decode vn_io_fault_args and perform the corresponding i/o.
1191  */
1192 static int
1193 vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
1194     struct thread *td)
1195 {
1196         int error, save;
1197
1198         error = 0;
1199         save = vm_fault_disable_pagefaults();
1200         switch (args->kind) {
1201         case VN_IO_FAULT_FOP:
1202                 error = (args->args.fop_args.doio)(args->args.fop_args.fp,
1203                     uio, args->cred, args->flags, td);
1204                 break;
1205         case VN_IO_FAULT_VOP:
1206                 if (uio->uio_rw == UIO_READ) {
1207                         error = VOP_READ(args->args.vop_args.vp, uio,
1208                             args->flags, args->cred);
1209                 } else if (uio->uio_rw == UIO_WRITE) {
1210                         error = VOP_WRITE(args->args.vop_args.vp, uio,
1211                             args->flags, args->cred);
1212                 }
1213                 break;
1214         default:
1215                 panic("vn_io_fault_doio: unknown kind of io %d %d",
1216                     args->kind, uio->uio_rw);
1217         }
1218         vm_fault_enable_pagefaults(save);
1219         return (error);
1220 }
1221
1222 static int
1223 vn_io_fault_touch(char *base, const struct uio *uio)
1224 {
1225         int r;
1226
1227         r = fubyte(base);
1228         if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1))
1229                 return (EFAULT);
1230         return (0);
1231 }
1232
1233 static int
1234 vn_io_fault_prefault_user(const struct uio *uio)
1235 {
1236         char *base;
1237         const struct iovec *iov;
1238         size_t len;
1239         ssize_t resid;
1240         int error, i;
1241
1242         KASSERT(uio->uio_segflg == UIO_USERSPACE,
1243             ("vn_io_fault_prefault userspace"));
1244
1245         error = i = 0;
1246         iov = uio->uio_iov;
1247         resid = uio->uio_resid;
1248         base = iov->iov_base;
1249         len = iov->iov_len;
1250         while (resid > 0) {
1251                 error = vn_io_fault_touch(base, uio);
1252                 if (error != 0)
1253                         break;
1254                 if (len < PAGE_SIZE) {
1255                         if (len != 0) {
1256                                 error = vn_io_fault_touch(base + len - 1, uio);
1257                                 if (error != 0)
1258                                         break;
1259                                 resid -= len;
1260                         }
1261                         if (++i >= uio->uio_iovcnt)
1262                                 break;
1263                         iov = uio->uio_iov + i;
1264                         base = iov->iov_base;
1265                         len = iov->iov_len;
1266                 } else {
1267                         len -= PAGE_SIZE;
1268                         base += PAGE_SIZE;
1269                         resid -= PAGE_SIZE;
1270                 }
1271         }
1272         return (error);
1273 }
1274
1275 /*
1276  * Common code for vn_io_fault(), agnostic to the kind of i/o request.
1277  * Uses vn_io_fault_doio() to make the call to an actual i/o function.
1278  * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
1279  * into args and call vn_io_fault1() to handle faults during the user
1280  * mode buffer accesses.
1281  */
1282 static int
1283 vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
1284     struct thread *td)
1285 {
1286         vm_page_t ma[io_hold_cnt + 2];
1287         struct uio *uio_clone, short_uio;
1288         struct iovec short_iovec[1];
1289         vm_page_t *prev_td_ma;
1290         vm_prot_t prot;
1291         vm_offset_t addr, end;
1292         size_t len, resid;
1293         ssize_t adv;
1294         int error, cnt, saveheld, prev_td_ma_cnt;
1295
1296         if (vn_io_fault_prefault) {
1297                 error = vn_io_fault_prefault_user(uio);
1298                 if (error != 0)
1299                         return (error); /* Or ignore ? */
1300         }
1301
1302         prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
1303
1304         /*
1305          * The UFS follows IO_UNIT directive and replays back both
1306          * uio_offset and uio_resid if an error is encountered during the
1307          * operation.  But, since the iovec may be already advanced,
1308          * uio is still in an inconsistent state.
1309          *
1310          * Cache a copy of the original uio, which is advanced to the redo
1311          * point using UIO_NOCOPY below.
1312          */
1313         uio_clone = cloneuio(uio);
1314         resid = uio->uio_resid;
1315
1316         short_uio.uio_segflg = UIO_USERSPACE;
1317         short_uio.uio_rw = uio->uio_rw;
1318         short_uio.uio_td = uio->uio_td;
1319
1320         error = vn_io_fault_doio(args, uio, td);
1321         if (error != EFAULT)
1322                 goto out;
1323
1324         atomic_add_long(&vn_io_faults_cnt, 1);
1325         uio_clone->uio_segflg = UIO_NOCOPY;
1326         uiomove(NULL, resid - uio->uio_resid, uio_clone);
1327         uio_clone->uio_segflg = uio->uio_segflg;
1328
1329         saveheld = curthread_pflags_set(TDP_UIOHELD);
1330         prev_td_ma = td->td_ma;
1331         prev_td_ma_cnt = td->td_ma_cnt;
1332
1333         while (uio_clone->uio_resid != 0) {
1334                 len = uio_clone->uio_iov->iov_len;
1335                 if (len == 0) {
1336                         KASSERT(uio_clone->uio_iovcnt >= 1,
1337                             ("iovcnt underflow"));
1338                         uio_clone->uio_iov++;
1339                         uio_clone->uio_iovcnt--;
1340                         continue;
1341                 }
1342                 if (len > ptoa(io_hold_cnt))
1343                         len = ptoa(io_hold_cnt);
1344                 addr = (uintptr_t)uio_clone->uio_iov->iov_base;
1345                 end = round_page(addr + len);
1346                 if (end < addr) {
1347                         error = EFAULT;
1348                         break;
1349                 }
1350                 cnt = atop(end - trunc_page(addr));
1351                 /*
1352                  * A perfectly misaligned address and length could cause
1353                  * both the start and the end of the chunk to use partial
1354                  * page.  +2 accounts for such a situation.
1355                  */
1356                 cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
1357                     addr, len, prot, ma, io_hold_cnt + 2);
1358                 if (cnt == -1) {
1359                         error = EFAULT;
1360                         break;
1361                 }
1362                 short_uio.uio_iov = &short_iovec[0];
1363                 short_iovec[0].iov_base = (void *)addr;
1364                 short_uio.uio_iovcnt = 1;
1365                 short_uio.uio_resid = short_iovec[0].iov_len = len;
1366                 short_uio.uio_offset = uio_clone->uio_offset;
1367                 td->td_ma = ma;
1368                 td->td_ma_cnt = cnt;
1369
1370                 error = vn_io_fault_doio(args, &short_uio, td);
1371                 vm_page_unhold_pages(ma, cnt);
1372                 adv = len - short_uio.uio_resid;
1373
1374                 uio_clone->uio_iov->iov_base =
1375                     (char *)uio_clone->uio_iov->iov_base + adv;
1376                 uio_clone->uio_iov->iov_len -= adv;
1377                 uio_clone->uio_resid -= adv;
1378                 uio_clone->uio_offset += adv;
1379
1380                 uio->uio_resid -= adv;
1381                 uio->uio_offset += adv;
1382
1383                 if (error != 0 || adv == 0)
1384                         break;
1385         }
1386         td->td_ma = prev_td_ma;
1387         td->td_ma_cnt = prev_td_ma_cnt;
1388         curthread_pflags_restore(saveheld);
1389 out:
1390         free(uio_clone, M_IOV);
1391         return (error);
1392 }
1393
1394 static int
1395 vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
1396     int flags, struct thread *td)
1397 {
1398         fo_rdwr_t *doio;
1399         struct vnode *vp;
1400         void *rl_cookie;
1401         struct vn_io_fault_args args;
1402         int error;
1403
1404         doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
1405         vp = fp->f_vnode;
1406
1407         /*
1408          * The ability to read(2) on a directory has historically been
1409          * allowed for all users, but this can and has been the source of
1410          * at least one security issue in the past.  As such, it is now hidden
1411          * away behind a sysctl for those that actually need it to use it, and
1412          * restricted to root when it's turned on to make it relatively safe to
1413          * leave on for longer sessions of need.
1414          */
1415         if (vp->v_type == VDIR) {
1416                 KASSERT(uio->uio_rw == UIO_READ,
1417                     ("illegal write attempted on a directory"));
1418                 if (!vfs_allow_read_dir)
1419                         return (EISDIR);
1420                 if ((error = priv_check(td, PRIV_VFS_READ_DIR)) != 0)
1421                         return (EISDIR);
1422         }
1423
1424         foffset_lock_uio(fp, uio, flags);
1425         if (do_vn_io_fault(vp, uio)) {
1426                 args.kind = VN_IO_FAULT_FOP;
1427                 args.args.fop_args.fp = fp;
1428                 args.args.fop_args.doio = doio;
1429                 args.cred = active_cred;
1430                 args.flags = flags | FOF_OFFSET;
1431                 if (uio->uio_rw == UIO_READ) {
1432                         rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
1433                             uio->uio_offset + uio->uio_resid);
1434                 } else if ((fp->f_flag & O_APPEND) != 0 ||
1435                     (flags & FOF_OFFSET) == 0) {
1436                         /* For appenders, punt and lock the whole range. */
1437                         rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1438                 } else {
1439                         rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
1440                             uio->uio_offset + uio->uio_resid);
1441                 }
1442                 error = vn_io_fault1(vp, uio, &args, td);
1443                 vn_rangelock_unlock(vp, rl_cookie);
1444         } else {
1445                 error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
1446         }
1447         foffset_unlock_uio(fp, uio, flags);
1448         return (error);
1449 }
1450
/*
 * Helper function to perform the requested uiomove operation using
 * the held pages for uio->uio_iov[0].iov_base buffer instead of
 * copyin/copyout.  Access to the pages with uiomove_fromphys()
 * instead of iov_base prevents page faults that could occur due to
 * pmap_collect() invalidating the mapping created by
 * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
 * object cleanup revoking the write access from page mappings.
 *
 * Filesystems that specify MNTK_NO_IOPF shall use vn_io_fault_uiomove()
 * instead of plain uiomove().
 */
1463 int
1464 vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
1465 {
1466         struct uio transp_uio;
1467         struct iovec transp_iov[1];
1468         struct thread *td;
1469         size_t adv;
1470         int error, pgadv;
1471
1472         td = curthread;
1473         if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1474             uio->uio_segflg != UIO_USERSPACE)
1475                 return (uiomove(data, xfersize, uio));
1476
1477         KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1478         transp_iov[0].iov_base = data;
1479         transp_uio.uio_iov = &transp_iov[0];
1480         transp_uio.uio_iovcnt = 1;
1481         if (xfersize > uio->uio_resid)
1482                 xfersize = uio->uio_resid;
1483         transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
1484         transp_uio.uio_offset = 0;
1485         transp_uio.uio_segflg = UIO_SYSSPACE;
1486         /*
1487          * Since transp_iov points to data, and td_ma page array
1488          * corresponds to original uio->uio_iov, we need to invert the
1489          * direction of the i/o operation as passed to
1490          * uiomove_fromphys().
1491          */
1492         switch (uio->uio_rw) {
1493         case UIO_WRITE:
1494                 transp_uio.uio_rw = UIO_READ;
1495                 break;
1496         case UIO_READ:
1497                 transp_uio.uio_rw = UIO_WRITE;
1498                 break;
1499         }
1500         transp_uio.uio_td = uio->uio_td;
1501         error = uiomove_fromphys(td->td_ma,
1502             ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
1503             xfersize, &transp_uio);
1504         adv = xfersize - transp_uio.uio_resid;
1505         pgadv =
1506             (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
1507             (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
1508         td->td_ma += pgadv;
1509         KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1510             pgadv));
1511         td->td_ma_cnt -= pgadv;
1512         uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
1513         uio->uio_iov->iov_len -= adv;
1514         uio->uio_resid -= adv;
1515         uio->uio_offset += adv;
1516         return (error);
1517 }
1518
1519 int
1520 vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
1521     struct uio *uio)
1522 {
1523         struct thread *td;
1524         vm_offset_t iov_base;
1525         int cnt, pgadv;
1526
1527         td = curthread;
1528         if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1529             uio->uio_segflg != UIO_USERSPACE)
1530                 return (uiomove_fromphys(ma, offset, xfersize, uio));
1531
1532         KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1533         cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
1534         iov_base = (vm_offset_t)uio->uio_iov->iov_base;
1535         switch (uio->uio_rw) {
1536         case UIO_WRITE:
1537                 pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
1538                     offset, cnt);
1539                 break;
1540         case UIO_READ:
1541                 pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
1542                     cnt);
1543                 break;
1544         }
1545         pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
1546         td->td_ma += pgadv;
1547         KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1548             pgadv));
1549         td->td_ma_cnt -= pgadv;
1550         uio->uio_iov->iov_base = (char *)(iov_base + cnt);
1551         uio->uio_iov->iov_len -= cnt;
1552         uio->uio_resid -= cnt;
1553         uio->uio_offset += cnt;
1554         return (0);
1555 }
1556
1557 /*
1558  * File table truncate routine.
1559  */
1560 static int
1561 vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
1562     struct thread *td)
1563 {
1564         struct mount *mp;
1565         struct vnode *vp;
1566         void *rl_cookie;
1567         int error;
1568
1569         vp = fp->f_vnode;
1570
1571 retry:
1572         /*
1573          * Lock the whole range for truncation.  Otherwise split i/o
1574          * might happen partly before and partly after the truncation.
1575          */
1576         rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1577         error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
1578         if (error)
1579                 goto out1;
1580         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1581         AUDIT_ARG_VNODE1(vp);
1582         if (vp->v_type == VDIR) {
1583                 error = EISDIR;
1584                 goto out;
1585         }
1586 #ifdef MAC
1587         error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
1588         if (error)
1589                 goto out;
1590 #endif
1591         error = vn_truncate_locked(vp, length, (fp->f_flag & O_FSYNC) != 0,
1592             fp->f_cred);
1593 out:
1594         VOP_UNLOCK(vp);
1595         vn_finished_write(mp);
1596 out1:
1597         vn_rangelock_unlock(vp, rl_cookie);
1598         if (error == ERELOOKUP)
1599                 goto retry;
1600         return (error);
1601 }
1602
1603 /*
1604  * Truncate a file that is already locked.
1605  */
1606 int
1607 vn_truncate_locked(struct vnode *vp, off_t length, bool sync,
1608     struct ucred *cred)
1609 {
1610         struct vattr vattr;
1611         int error;
1612
1613         error = VOP_ADD_WRITECOUNT(vp, 1);
1614         if (error == 0) {
1615                 VATTR_NULL(&vattr);
1616                 vattr.va_size = length;
1617                 if (sync)
1618                         vattr.va_vaflags |= VA_SYNC;
1619                 error = VOP_SETATTR(vp, &vattr, cred);
1620                 VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
1621         }
1622         return (error);
1623 }
1624
1625 /*
1626  * File table vnode stat routine.
1627  */
1628 int
1629 vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred,
1630     struct thread *td)
1631 {
1632         struct vnode *vp = fp->f_vnode;
1633         int error;
1634
1635         vn_lock(vp, LK_SHARED | LK_RETRY);
1636         error = VOP_STAT(vp, sb, active_cred, fp->f_cred, td);
1637         VOP_UNLOCK(vp);
1638
1639         return (error);
1640 }
1641
1642 /*
1643  * File table vnode ioctl routine.
1644  */
1645 static int
1646 vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
1647     struct thread *td)
1648 {
1649         struct vattr vattr;
1650         struct vnode *vp;
1651         struct fiobmap2_arg *bmarg;
1652         int error;
1653
1654         vp = fp->f_vnode;
1655         switch (vp->v_type) {
1656         case VDIR:
1657         case VREG:
1658                 switch (com) {
1659                 case FIONREAD:
1660                         vn_lock(vp, LK_SHARED | LK_RETRY);
1661                         error = VOP_GETATTR(vp, &vattr, active_cred);
1662                         VOP_UNLOCK(vp);
1663                         if (error == 0)
1664                                 *(int *)data = vattr.va_size - fp->f_offset;
1665                         return (error);
1666                 case FIOBMAP2:
1667                         bmarg = (struct fiobmap2_arg *)data;
1668                         vn_lock(vp, LK_SHARED | LK_RETRY);
1669 #ifdef MAC
1670                         error = mac_vnode_check_read(active_cred, fp->f_cred,
1671                             vp);
1672                         if (error == 0)
1673 #endif
1674                                 error = VOP_BMAP(vp, bmarg->bn, NULL,
1675                                     &bmarg->bn, &bmarg->runp, &bmarg->runb);
1676                         VOP_UNLOCK(vp);
1677                         return (error);
1678                 case FIONBIO:
1679                 case FIOASYNC:
1680                         return (0);
1681                 default:
1682                         return (VOP_IOCTL(vp, com, data, fp->f_flag,
1683                             active_cred, td));
1684                 }
1685                 break;
1686         case VCHR:
1687                 return (VOP_IOCTL(vp, com, data, fp->f_flag,
1688                     active_cred, td));
1689         default:
1690                 return (ENOTTY);
1691         }
1692 }
1693
1694 /*
1695  * File table vnode poll routine.
1696  */
1697 static int
1698 vn_poll(struct file *fp, int events, struct ucred *active_cred,
1699     struct thread *td)
1700 {
1701         struct vnode *vp;
1702         int error;
1703
1704         vp = fp->f_vnode;
1705 #if defined(MAC) || defined(AUDIT)
1706         if (AUDITING_TD(td) || mac_vnode_check_poll_enabled()) {
1707                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1708                 AUDIT_ARG_VNODE1(vp);
1709                 error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
1710                 VOP_UNLOCK(vp);
1711                 if (error != 0)
1712                         return (error);
1713         }
1714 #endif
1715         error = VOP_POLL(vp, events, fp->f_cred, td);
1716         return (error);
1717 }
1718
1719 /*
1720  * Acquire the requested lock and then check for validity.  LK_RETRY
1721  * permits vn_lock to return doomed vnodes.
1722  */
1723 static int __noinline
1724 _vn_lock_fallback(struct vnode *vp, int flags, const char *file, int line,
1725     int error)
1726 {
1727
1728         KASSERT((flags & LK_RETRY) == 0 || error == 0,
1729             ("vn_lock: error %d incompatible with flags %#x", error, flags));
1730
1731         if (error == 0)
1732                 VNASSERT(VN_IS_DOOMED(vp), vp, ("vnode not doomed"));
1733
1734         if ((flags & LK_RETRY) == 0) {
1735                 if (error == 0) {
1736                         VOP_UNLOCK(vp);
1737                         error = ENOENT;
1738                 }
1739                 return (error);
1740         }
1741
1742         /*
1743          * LK_RETRY case.
1744          *
1745          * Nothing to do if we got the lock.
1746          */
1747         if (error == 0)
1748                 return (0);
1749
1750         /*
1751          * Interlock was dropped by the call in _vn_lock.
1752          */
1753         flags &= ~LK_INTERLOCK;
1754         do {
1755                 error = VOP_LOCK1(vp, flags, file, line);
1756         } while (error != 0);
1757         return (0);
1758 }
1759
1760 int
1761 _vn_lock(struct vnode *vp, int flags, const char *file, int line)
1762 {
1763         int error;
1764
1765         VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
1766             ("vn_lock: no locktype (%d passed)", flags));
1767         VNPASS(vp->v_holdcnt > 0, vp);
1768         error = VOP_LOCK1(vp, flags, file, line);
1769         if (__predict_false(error != 0 || VN_IS_DOOMED(vp)))
1770                 return (_vn_lock_fallback(vp, flags, file, line, error));
1771         return (0);
1772 }
1773
1774 /*
1775  * File table vnode close routine.
1776  */
1777 static int
1778 vn_closefile(struct file *fp, struct thread *td)
1779 {
1780         struct vnode *vp;
1781         struct flock lf;
1782         int error;
1783         bool ref;
1784
1785         vp = fp->f_vnode;
1786         fp->f_ops = &badfileops;
1787         ref = (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE;
1788
1789         error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref);
1790
1791         if (__predict_false(ref)) {
1792                 lf.l_whence = SEEK_SET;
1793                 lf.l_start = 0;
1794                 lf.l_len = 0;
1795                 lf.l_type = F_UNLCK;
1796                 (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
1797                 vrele(vp);
1798         }
1799         return (error);
1800 }
1801
1802 /*
1803  * Preparing to start a filesystem write operation. If the operation is
1804  * permitted, then we bump the count of operations in progress and
1805  * proceed. If a suspend request is in progress, we wait until the
1806  * suspension is over, and then proceed.
1807  */
1808 static int
1809 vn_start_write_refed(struct mount *mp, int flags, bool mplocked)
1810 {
1811         struct mount_pcpu *mpcpu;
1812         int error, mflags;
1813
1814         if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 &&
1815             vfs_op_thread_enter(mp, mpcpu)) {
1816                 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
1817                 vfs_mp_count_add_pcpu(mpcpu, writeopcount, 1);
1818                 vfs_op_thread_exit(mp, mpcpu);
1819                 return (0);
1820         }
1821
1822         if (mplocked)
1823                 mtx_assert(MNT_MTX(mp), MA_OWNED);
1824         else
1825                 MNT_ILOCK(mp);
1826
1827         error = 0;
1828
1829         /*
1830          * Check on status of suspension.
1831          */
1832         if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
1833             mp->mnt_susp_owner != curthread) {
1834                 mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ?
1835                     (flags & PCATCH) : 0) | (PUSER - 1);
1836                 while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1837                         if (flags & V_NOWAIT) {
1838                                 error = EWOULDBLOCK;
1839                                 goto unlock;
1840                         }
1841                         error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
1842                             "suspfs", 0);
1843                         if (error)
1844                                 goto unlock;
1845                 }
1846         }
1847         if (flags & V_XSLEEP)
1848                 goto unlock;
1849         mp->mnt_writeopcount++;
1850 unlock:
1851         if (error != 0 || (flags & V_XSLEEP) != 0)
1852                 MNT_REL(mp);
1853         MNT_IUNLOCK(mp);
1854         return (error);
1855 }
1856
1857 int
1858 vn_start_write(struct vnode *vp, struct mount **mpp, int flags)
1859 {
1860         struct mount *mp;
1861         int error;
1862
1863         KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
1864             ("V_MNTREF requires mp"));
1865
1866         error = 0;
1867         /*
1868          * If a vnode is provided, get and return the mount point that
1869          * to which it will write.
1870          */
1871         if (vp != NULL) {
1872                 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1873                         *mpp = NULL;
1874                         if (error != EOPNOTSUPP)
1875                                 return (error);
1876                         return (0);
1877                 }
1878         }
1879         if ((mp = *mpp) == NULL)
1880                 return (0);
1881
1882         /*
1883          * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1884          * a vfs_ref().
1885          * As long as a vnode is not provided we need to acquire a
1886          * refcount for the provided mountpoint too, in order to
1887          * emulate a vfs_ref().
1888          */
1889         if (vp == NULL && (flags & V_MNTREF) == 0)
1890                 vfs_ref(mp);
1891
1892         return (vn_start_write_refed(mp, flags, false));
1893 }
1894
1895 /*
1896  * Secondary suspension. Used by operations such as vop_inactive
1897  * routines that are needed by the higher level functions. These
1898  * are allowed to proceed until all the higher level functions have
1899  * completed (indicated by mnt_writeopcount dropping to zero). At that
1900  * time, these operations are halted until the suspension is over.
1901  */
1902 int
1903 vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags)
1904 {
1905         struct mount *mp;
1906         int error;
1907
1908         KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
1909             ("V_MNTREF requires mp"));
1910
1911  retry:
1912         if (vp != NULL) {
1913                 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1914                         *mpp = NULL;
1915                         if (error != EOPNOTSUPP)
1916                                 return (error);
1917                         return (0);
1918                 }
1919         }
1920         /*
1921          * If we are not suspended or have not yet reached suspended
1922          * mode, then let the operation proceed.
1923          */
1924         if ((mp = *mpp) == NULL)
1925                 return (0);
1926
1927         /*
1928          * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1929          * a vfs_ref().
1930          * As long as a vnode is not provided we need to acquire a
1931          * refcount for the provided mountpoint too, in order to
1932          * emulate a vfs_ref().
1933          */
1934         MNT_ILOCK(mp);
1935         if (vp == NULL && (flags & V_MNTREF) == 0)
1936                 MNT_REF(mp);
1937         if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
1938                 mp->mnt_secondary_writes++;
1939                 mp->mnt_secondary_accwrites++;
1940                 MNT_IUNLOCK(mp);
1941                 return (0);
1942         }
1943         if (flags & V_NOWAIT) {
1944                 MNT_REL(mp);
1945                 MNT_IUNLOCK(mp);
1946                 return (EWOULDBLOCK);
1947         }
1948         /*
1949          * Wait for the suspension to finish.
1950          */
1951         error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP |
1952             ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0),
1953             "suspfs", 0);
1954         vfs_rel(mp);
1955         if (error == 0)
1956                 goto retry;
1957         return (error);
1958 }
1959
1960 /*
1961  * Filesystem write operation has completed. If we are suspending and this
1962  * operation is the last one, notify the suspender that the suspension is
1963  * now in effect.
1964  */
1965 void
1966 vn_finished_write(struct mount *mp)
1967 {
1968         struct mount_pcpu *mpcpu;
1969         int c;
1970
1971         if (mp == NULL)
1972                 return;
1973
1974         if (vfs_op_thread_enter(mp, mpcpu)) {
1975                 vfs_mp_count_sub_pcpu(mpcpu, writeopcount, 1);
1976                 vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
1977                 vfs_op_thread_exit(mp, mpcpu);
1978                 return;
1979         }
1980
1981         MNT_ILOCK(mp);
1982         vfs_assert_mount_counters(mp);
1983         MNT_REL(mp);
1984         c = --mp->mnt_writeopcount;
1985         if (mp->mnt_vfs_ops == 0) {
1986                 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
1987                 MNT_IUNLOCK(mp);
1988                 return;
1989         }
1990         if (c < 0)
1991                 vfs_dump_mount_counters(mp);
1992         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0)
1993                 wakeup(&mp->mnt_writeopcount);
1994         MNT_IUNLOCK(mp);
1995 }
1996
1997 /*
1998  * Filesystem secondary write operation has completed. If we are
1999  * suspending and this operation is the last one, notify the suspender
2000  * that the suspension is now in effect.
2001  */
2002 void
2003 vn_finished_secondary_write(struct mount *mp)
2004 {
2005         if (mp == NULL)
2006                 return;
2007         MNT_ILOCK(mp);
2008         MNT_REL(mp);
2009         mp->mnt_secondary_writes--;
2010         if (mp->mnt_secondary_writes < 0)
2011                 panic("vn_finished_secondary_write: neg cnt");
2012         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
2013             mp->mnt_secondary_writes <= 0)
2014                 wakeup(&mp->mnt_secondary_writes);
2015         MNT_IUNLOCK(mp);
2016 }
2017
2018 /*
2019  * Request a filesystem to suspend write operations.
2020  */
2021 int
2022 vfs_write_suspend(struct mount *mp, int flags)
2023 {
2024         int error;
2025
2026         vfs_op_enter(mp);
2027
2028         MNT_ILOCK(mp);
2029         vfs_assert_mount_counters(mp);
2030         if (mp->mnt_susp_owner == curthread) {
2031                 vfs_op_exit_locked(mp);
2032                 MNT_IUNLOCK(mp);
2033                 return (EALREADY);
2034         }
2035         while (mp->mnt_kern_flag & MNTK_SUSPEND)
2036                 msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
2037
2038         /*
2039          * Unmount holds a write reference on the mount point.  If we
2040          * own busy reference and drain for writers, we deadlock with
2041          * the reference draining in the unmount path.  Callers of
2042          * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
2043          * vfs_busy() reference is owned and caller is not in the
2044          * unmount context.
2045          */
2046         if ((flags & VS_SKIP_UNMOUNT) != 0 &&
2047             (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
2048                 vfs_op_exit_locked(mp);
2049                 MNT_IUNLOCK(mp);
2050                 return (EBUSY);
2051         }
2052
2053         mp->mnt_kern_flag |= MNTK_SUSPEND;
2054         mp->mnt_susp_owner = curthread;
2055         if (mp->mnt_writeopcount > 0)
2056                 (void) msleep(&mp->mnt_writeopcount,
2057                     MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
2058         else
2059                 MNT_IUNLOCK(mp);
2060         if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) {
2061                 vfs_write_resume(mp, 0);
2062                 /* vfs_write_resume does vfs_op_exit() for us */
2063         }
2064         return (error);
2065 }
2066
2067 /*
2068  * Request a filesystem to resume write operations.
2069  */
2070 void
2071 vfs_write_resume(struct mount *mp, int flags)
2072 {
2073
2074         MNT_ILOCK(mp);
2075         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
2076                 KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
2077                 mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
2078                                        MNTK_SUSPENDED);
2079                 mp->mnt_susp_owner = NULL;
2080                 wakeup(&mp->mnt_writeopcount);
2081                 wakeup(&mp->mnt_flag);
2082                 curthread->td_pflags &= ~TDP_IGNSUSP;
2083                 if ((flags & VR_START_WRITE) != 0) {
2084                         MNT_REF(mp);
2085                         mp->mnt_writeopcount++;
2086                 }
2087                 MNT_IUNLOCK(mp);
2088                 if ((flags & VR_NO_SUSPCLR) == 0)
2089                         VFS_SUSP_CLEAN(mp);
2090                 vfs_op_exit(mp);
2091         } else if ((flags & VR_START_WRITE) != 0) {
2092                 MNT_REF(mp);
2093                 vn_start_write_refed(mp, 0, true);
2094         } else {
2095                 MNT_IUNLOCK(mp);
2096         }
2097 }
2098
2099 /*
2100  * Helper loop around vfs_write_suspend() for filesystem unmount VFS
2101  * methods.
2102  */
2103 int
2104 vfs_write_suspend_umnt(struct mount *mp)
2105 {
2106         int error;
2107
2108         KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
2109             ("vfs_write_suspend_umnt: recursed"));
2110
2111         /* dounmount() already called vn_start_write(). */
2112         for (;;) {
2113                 vn_finished_write(mp);
2114                 error = vfs_write_suspend(mp, 0);
2115                 if (error != 0) {
2116                         vn_start_write(NULL, &mp, V_WAIT);
2117                         return (error);
2118                 }
2119                 MNT_ILOCK(mp);
2120                 if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
2121                         break;
2122                 MNT_IUNLOCK(mp);
2123                 vn_start_write(NULL, &mp, V_WAIT);
2124         }
2125         mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
2126         wakeup(&mp->mnt_flag);
2127         MNT_IUNLOCK(mp);
2128         curthread->td_pflags |= TDP_IGNSUSP;
2129         return (0);
2130 }
2131
2132 /*
2133  * Implement kqueues for files by translating it to vnode operation.
2134  */
2135 static int
2136 vn_kqfilter(struct file *fp, struct knote *kn)
2137 {
2138
2139         return (VOP_KQFILTER(fp->f_vnode, kn));
2140 }
2141
2142 /*
2143  * Simplified in-kernel wrapper calls for extended attribute access.
2144  * Both calls pass in a NULL credential, authorizing as "kernel" access.
2145  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
2146  */
2147 int
2148 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
2149     const char *attrname, int *buflen, char *buf, struct thread *td)
2150 {
2151         struct uio      auio;
2152         struct iovec    iov;
2153         int     error;
2154
2155         iov.iov_len = *buflen;
2156         iov.iov_base = buf;
2157
2158         auio.uio_iov = &iov;
2159         auio.uio_iovcnt = 1;
2160         auio.uio_rw = UIO_READ;
2161         auio.uio_segflg = UIO_SYSSPACE;
2162         auio.uio_td = td;
2163         auio.uio_offset = 0;
2164         auio.uio_resid = *buflen;
2165
2166         if ((ioflg & IO_NODELOCKED) == 0)
2167                 vn_lock(vp, LK_SHARED | LK_RETRY);
2168
2169         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2170
2171         /* authorize attribute retrieval as kernel */
2172         error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
2173             td);
2174
2175         if ((ioflg & IO_NODELOCKED) == 0)
2176                 VOP_UNLOCK(vp);
2177
2178         if (error == 0) {
2179                 *buflen = *buflen - auio.uio_resid;
2180         }
2181
2182         return (error);
2183 }
2184
2185 /*
2186  * XXX failure mode if partially written?
2187  */
2188 int
2189 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
2190     const char *attrname, int buflen, char *buf, struct thread *td)
2191 {
2192         struct uio      auio;
2193         struct iovec    iov;
2194         struct mount    *mp;
2195         int     error;
2196
2197         iov.iov_len = buflen;
2198         iov.iov_base = buf;
2199
2200         auio.uio_iov = &iov;
2201         auio.uio_iovcnt = 1;
2202         auio.uio_rw = UIO_WRITE;
2203         auio.uio_segflg = UIO_SYSSPACE;
2204         auio.uio_td = td;
2205         auio.uio_offset = 0;
2206         auio.uio_resid = buflen;
2207
2208         if ((ioflg & IO_NODELOCKED) == 0) {
2209                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
2210                         return (error);
2211                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2212         }
2213
2214         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2215
2216         /* authorize attribute setting as kernel */
2217         error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
2218
2219         if ((ioflg & IO_NODELOCKED) == 0) {
2220                 vn_finished_write(mp);
2221                 VOP_UNLOCK(vp);
2222         }
2223
2224         return (error);
2225 }
2226
2227 int
2228 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
2229     const char *attrname, struct thread *td)
2230 {
2231         struct mount    *mp;
2232         int     error;
2233
2234         if ((ioflg & IO_NODELOCKED) == 0) {
2235                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
2236                         return (error);
2237                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2238         }
2239
2240         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2241
2242         /* authorize attribute removal as kernel */
2243         error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
2244         if (error == EOPNOTSUPP)
2245                 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
2246                     NULL, td);
2247
2248         if ((ioflg & IO_NODELOCKED) == 0) {
2249                 vn_finished_write(mp);
2250                 VOP_UNLOCK(vp);
2251         }
2252
2253         return (error);
2254 }
2255
2256 static int
2257 vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags,
2258     struct vnode **rvp)
2259 {
2260
2261         return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp));
2262 }
2263
2264 int
2265 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
2266 {
2267
2268         return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino,
2269             lkflags, rvp));
2270 }
2271
2272 int
2273 vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg,
2274     int lkflags, struct vnode **rvp)
2275 {
2276         struct mount *mp;
2277         int ltype, error;
2278
2279         ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get");
2280         mp = vp->v_mount;
2281         ltype = VOP_ISLOCKED(vp);
2282         KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
2283             ("vn_vget_ino: vp not locked"));
2284         error = vfs_busy(mp, MBF_NOWAIT);
2285         if (error != 0) {
2286                 vfs_ref(mp);
2287                 VOP_UNLOCK(vp);
2288                 error = vfs_busy(mp, 0);
2289                 vn_lock(vp, ltype | LK_RETRY);
2290                 vfs_rel(mp);
2291                 if (error != 0)
2292                         return (ENOENT);
2293                 if (VN_IS_DOOMED(vp)) {
2294                         vfs_unbusy(mp);
2295                         return (ENOENT);
2296                 }
2297         }
2298         VOP_UNLOCK(vp);
2299         error = alloc(mp, alloc_arg, lkflags, rvp);
2300         vfs_unbusy(mp);
2301         if (error != 0 || *rvp != vp)
2302                 vn_lock(vp, ltype | LK_RETRY);
2303         if (VN_IS_DOOMED(vp)) {
2304                 if (error == 0) {
2305                         if (*rvp == vp)
2306                                 vunref(vp);
2307                         else
2308                                 vput(*rvp);
2309                 }
2310                 error = ENOENT;
2311         }
2312         return (error);
2313 }
2314
2315 int
2316 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
2317     struct thread *td)
2318 {
2319
2320         if (vp->v_type != VREG || td == NULL)
2321                 return (0);
2322         if ((uoff_t)uio->uio_offset + uio->uio_resid >
2323             lim_cur(td, RLIMIT_FSIZE)) {
2324                 PROC_LOCK(td->td_proc);
2325                 kern_psignal(td->td_proc, SIGXFSZ);
2326                 PROC_UNLOCK(td->td_proc);
2327                 return (EFBIG);
2328         }
2329         return (0);
2330 }
2331
2332 int
2333 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
2334     struct thread *td)
2335 {
2336         struct vnode *vp;
2337
2338         vp = fp->f_vnode;
2339 #ifdef AUDIT
2340         vn_lock(vp, LK_SHARED | LK_RETRY);
2341         AUDIT_ARG_VNODE1(vp);
2342         VOP_UNLOCK(vp);
2343 #endif
2344         return (setfmode(td, active_cred, vp, mode));
2345 }
2346
2347 int
2348 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
2349     struct thread *td)
2350 {
2351         struct vnode *vp;
2352
2353         vp = fp->f_vnode;
2354 #ifdef AUDIT
2355         vn_lock(vp, LK_SHARED | LK_RETRY);
2356         AUDIT_ARG_VNODE1(vp);
2357         VOP_UNLOCK(vp);
2358 #endif
2359         return (setfown(td, active_cred, vp, uid, gid));
2360 }
2361
2362 void
2363 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
2364 {
2365         vm_object_t object;
2366
2367         if ((object = vp->v_object) == NULL)
2368                 return;
2369         VM_OBJECT_WLOCK(object);
2370         vm_object_page_remove(object, start, end, 0);
2371         VM_OBJECT_WUNLOCK(object);
2372 }
2373
2374 int
2375 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
2376 {
2377         struct vattr va;
2378         daddr_t bn, bnp;
2379         uint64_t bsize;
2380         off_t noff;
2381         int error;
2382
2383         KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
2384             ("Wrong command %lu", cmd));
2385
2386         if (vn_lock(vp, LK_SHARED) != 0)
2387                 return (EBADF);
2388         if (vp->v_type != VREG) {
2389                 error = ENOTTY;
2390                 goto unlock;
2391         }
2392         error = VOP_GETATTR(vp, &va, cred);
2393         if (error != 0)
2394                 goto unlock;
2395         noff = *off;
2396         if (noff >= va.va_size) {
2397                 error = ENXIO;
2398                 goto unlock;
2399         }
2400         bsize = vp->v_mount->mnt_stat.f_iosize;
2401         for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize -
2402             noff % bsize) {
2403                 error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
2404                 if (error == EOPNOTSUPP) {
2405                         error = ENOTTY;
2406                         goto unlock;
2407                 }
2408                 if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
2409                     (bnp != -1 && cmd == FIOSEEKDATA)) {
2410                         noff = bn * bsize;
2411                         if (noff < *off)
2412                                 noff = *off;
2413                         goto unlock;
2414                 }
2415         }
2416         if (noff > va.va_size)
2417                 noff = va.va_size;
2418         /* noff == va.va_size. There is an implicit hole at the end of file. */
2419         if (cmd == FIOSEEKDATA)
2420                 error = ENXIO;
2421 unlock:
2422         VOP_UNLOCK(vp);
2423         if (error == 0)
2424                 *off = noff;
2425         return (error);
2426 }
2427
2428 int
2429 vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
2430 {
2431         struct ucred *cred;
2432         struct vnode *vp;
2433         struct vattr vattr;
2434         off_t foffset, size;
2435         int error, noneg;
2436
2437         cred = td->td_ucred;
2438         vp = fp->f_vnode;
2439         foffset = foffset_lock(fp, 0);
2440         noneg = (vp->v_type != VCHR);
2441         error = 0;
2442         switch (whence) {
2443         case L_INCR:
2444                 if (noneg &&
2445                     (foffset < 0 ||
2446                     (offset > 0 && foffset > OFF_MAX - offset))) {
2447                         error = EOVERFLOW;
2448                         break;
2449                 }
2450                 offset += foffset;
2451                 break;
2452         case L_XTND:
2453                 vn_lock(vp, LK_SHARED | LK_RETRY);
2454                 error = VOP_GETATTR(vp, &vattr, cred);
2455                 VOP_UNLOCK(vp);
2456                 if (error)
2457                         break;
2458
2459                 /*
2460                  * If the file references a disk device, then fetch
2461                  * the media size and use that to determine the ending
2462                  * offset.
2463                  */
2464                 if (vattr.va_size == 0 && vp->v_type == VCHR &&
2465                     fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
2466                         vattr.va_size = size;
2467                 if (noneg &&
2468                     (vattr.va_size > OFF_MAX ||
2469                     (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
2470                         error = EOVERFLOW;
2471                         break;
2472                 }
2473                 offset += vattr.va_size;
2474                 break;
2475         case L_SET:
2476                 break;
2477         case SEEK_DATA:
2478                 error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
2479                 if (error == ENOTTY)
2480                         error = EINVAL;
2481                 break;
2482         case SEEK_HOLE:
2483                 error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
2484                 if (error == ENOTTY)
2485                         error = EINVAL;
2486                 break;
2487         default:
2488                 error = EINVAL;
2489         }
2490         if (error == 0 && noneg && offset < 0)
2491                 error = EINVAL;
2492         if (error != 0)
2493                 goto drop;
2494         VFS_KNOTE_UNLOCKED(vp, 0);
2495         td->td_uretoff.tdu_off = offset;
2496 drop:
2497         foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
2498         return (error);
2499 }
2500
2501 int
2502 vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred,
2503     struct thread *td)
2504 {
2505         int error;
2506
2507         /*
2508          * Grant permission if the caller is the owner of the file, or
2509          * the super-user, or has ACL_WRITE_ATTRIBUTES permission on
2510          * on the file.  If the time pointer is null, then write
2511          * permission on the file is also sufficient.
2512          *
2513          * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
2514          * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
2515          * will be allowed to set the times [..] to the current
2516          * server time.
2517          */
2518         error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td);
2519         if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0)
2520                 error = VOP_ACCESS(vp, VWRITE, cred, td);
2521         return (error);
2522 }
2523
2524 int
2525 vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
2526 {
2527         struct vnode *vp;
2528         int error;
2529
2530         if (fp->f_type == DTYPE_FIFO)
2531                 kif->kf_type = KF_TYPE_FIFO;
2532         else
2533                 kif->kf_type = KF_TYPE_VNODE;
2534         vp = fp->f_vnode;
2535         vref(vp);
2536         FILEDESC_SUNLOCK(fdp);
2537         error = vn_fill_kinfo_vnode(vp, kif);
2538         vrele(vp);
2539         FILEDESC_SLOCK(fdp);
2540         return (error);
2541 }
2542
2543 static inline void
2544 vn_fill_junk(struct kinfo_file *kif)
2545 {
2546         size_t len, olen;
2547
2548         /*
2549          * Simulate vn_fullpath returning changing values for a given
2550          * vp during e.g. coredump.
2551          */
2552         len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1;
2553         olen = strlen(kif->kf_path);
2554         if (len < olen)
2555                 strcpy(&kif->kf_path[len - 1], "$");
2556         else
2557                 for (; olen < len; olen++)
2558                         strcpy(&kif->kf_path[olen], "A");
2559 }
2560
2561 int
2562 vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif)
2563 {
2564         struct vattr va;
2565         char *fullpath, *freepath;
2566         int error;
2567
2568         kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type);
2569         freepath = NULL;
2570         fullpath = "-";
2571         error = vn_fullpath(vp, &fullpath, &freepath);
2572         if (error == 0) {
2573                 strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
2574         }
2575         if (freepath != NULL)
2576                 free(freepath, M_TEMP);
2577
2578         KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path,
2579                 vn_fill_junk(kif);
2580         );
2581
2582         /*
2583          * Retrieve vnode attributes.
2584          */
2585         va.va_fsid = VNOVAL;
2586         va.va_rdev = NODEV;
2587         vn_lock(vp, LK_SHARED | LK_RETRY);
2588         error = VOP_GETATTR(vp, &va, curthread->td_ucred);
2589         VOP_UNLOCK(vp);
2590         if (error != 0)
2591                 return (error);
2592         if (va.va_fsid != VNOVAL)
2593                 kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
2594         else
2595                 kif->kf_un.kf_file.kf_file_fsid =
2596                     vp->v_mount->mnt_stat.f_fsid.val[0];
2597         kif->kf_un.kf_file.kf_file_fsid_freebsd11 =
2598             kif->kf_un.kf_file.kf_file_fsid; /* truncate */
2599         kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
2600         kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
2601         kif->kf_un.kf_file.kf_file_size = va.va_size;
2602         kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
2603         kif->kf_un.kf_file.kf_file_rdev_freebsd11 =
2604             kif->kf_un.kf_file.kf_file_rdev; /* truncate */
2605         return (0);
2606 }
2607
2608 int
2609 vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
2610     vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
2611     struct thread *td)
2612 {
2613 #ifdef HWPMC_HOOKS
2614         struct pmckern_map_in pkm;
2615 #endif
2616         struct mount *mp;
2617         struct vnode *vp;
2618         vm_object_t object;
2619         vm_prot_t maxprot;
2620         boolean_t writecounted;
2621         int error;
2622
2623 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
2624     defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
2625         /*
2626          * POSIX shared-memory objects are defined to have
2627          * kernel persistence, and are not defined to support
2628          * read(2)/write(2) -- or even open(2).  Thus, we can
2629          * use MAP_ASYNC to trade on-disk coherence for speed.
2630          * The shm_open(3) library routine turns on the FPOSIXSHM
2631          * flag to request this behavior.
2632          */
2633         if ((fp->f_flag & FPOSIXSHM) != 0)
2634                 flags |= MAP_NOSYNC;
2635 #endif
2636         vp = fp->f_vnode;
2637
2638         /*
2639          * Ensure that file and memory protections are
2640          * compatible.  Note that we only worry about
2641          * writability if mapping is shared; in this case,
2642          * current and max prot are dictated by the open file.
2643          * XXX use the vnode instead?  Problem is: what
2644          * credentials do we use for determination? What if
2645          * proc does a setuid?
2646          */
2647         mp = vp->v_mount;
2648         if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
2649                 maxprot = VM_PROT_NONE;
2650                 if ((prot & VM_PROT_EXECUTE) != 0)
2651                         return (EACCES);
2652         } else
2653                 maxprot = VM_PROT_EXECUTE;
2654         if ((fp->f_flag & FREAD) != 0)
2655                 maxprot |= VM_PROT_READ;
2656         else if ((prot & VM_PROT_READ) != 0)
2657                 return (EACCES);
2658
2659         /*
2660          * If we are sharing potential changes via MAP_SHARED and we
2661          * are trying to get write permission although we opened it
2662          * without asking for it, bail out.
2663          */
2664         if ((flags & MAP_SHARED) != 0) {
2665                 if ((fp->f_flag & FWRITE) != 0)
2666                         maxprot |= VM_PROT_WRITE;
2667                 else if ((prot & VM_PROT_WRITE) != 0)
2668                         return (EACCES);
2669         } else {
2670                 maxprot |= VM_PROT_WRITE;
2671                 cap_maxprot |= VM_PROT_WRITE;
2672         }
2673         maxprot &= cap_maxprot;
2674
2675         /*
2676          * For regular files and shared memory, POSIX requires that
2677          * the value of foff be a legitimate offset within the data
2678          * object.  In particular, negative offsets are invalid.
2679          * Blocking negative offsets and overflows here avoids
2680          * possible wraparound or user-level access into reserved
2681          * ranges of the data object later.  In contrast, POSIX does
2682          * not dictate how offsets are used by device drivers, so in
2683          * the case of a device mapping a negative offset is passed
2684          * on.
2685          */
2686         if (
2687 #ifdef _LP64
2688             size > OFF_MAX ||
2689 #endif
2690             foff > OFF_MAX - size)
2691                 return (EINVAL);
2692
2693         writecounted = FALSE;
2694         error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp,
2695             &foff, &object, &writecounted);
2696         if (error != 0)
2697                 return (error);
2698         error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
2699             foff, writecounted, td);
2700         if (error != 0) {
2701                 /*
2702                  * If this mapping was accounted for in the vnode's
2703                  * writecount, then undo that now.
2704                  */
2705                 if (writecounted)
2706                         vm_pager_release_writecount(object, 0, size);
2707                 vm_object_deallocate(object);
2708         }
2709 #ifdef HWPMC_HOOKS
2710         /* Inform hwpmc(4) if an executable is being mapped. */
2711         if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) {
2712                 if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) {
2713                         pkm.pm_file = vp;
2714                         pkm.pm_address = (uintptr_t) *addr;
2715                         PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm);
2716                 }
2717         }
2718 #endif
2719         return (error);
2720 }
2721
2722 void
2723 vn_fsid(struct vnode *vp, struct vattr *va)
2724 {
2725         fsid_t *f;
2726
2727         f = &vp->v_mount->mnt_stat.f_fsid;
2728         va->va_fsid = (uint32_t)f->val[1];
2729         va->va_fsid <<= sizeof(f->val[1]) * NBBY;
2730         va->va_fsid += (uint32_t)f->val[0];
2731 }
2732
2733 int
2734 vn_fsync_buf(struct vnode *vp, int waitfor)
2735 {
2736         struct buf *bp, *nbp;
2737         struct bufobj *bo;
2738         struct mount *mp;
2739         int error, maxretry;
2740
2741         error = 0;
2742         maxretry = 10000;     /* large, arbitrarily chosen */
2743         mp = NULL;
2744         if (vp->v_type == VCHR) {
2745                 VI_LOCK(vp);
2746                 mp = vp->v_rdev->si_mountpt;
2747                 VI_UNLOCK(vp);
2748         }
2749         bo = &vp->v_bufobj;
2750         BO_LOCK(bo);
2751 loop1:
2752         /*
2753          * MARK/SCAN initialization to avoid infinite loops.
2754          */
2755         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
2756                 bp->b_vflags &= ~BV_SCANNED;
2757                 bp->b_error = 0;
2758         }
2759
2760         /*
2761          * Flush all dirty buffers associated with a vnode.
2762          */
2763 loop2:
2764         TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
2765                 if ((bp->b_vflags & BV_SCANNED) != 0)
2766                         continue;
2767                 bp->b_vflags |= BV_SCANNED;
2768                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
2769                         if (waitfor != MNT_WAIT)
2770                                 continue;
2771                         if (BUF_LOCK(bp,
2772                             LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL,
2773                             BO_LOCKPTR(bo)) != 0) {
2774                                 BO_LOCK(bo);
2775                                 goto loop1;
2776                         }
2777                         BO_LOCK(bo);
2778                 }
2779                 BO_UNLOCK(bo);
2780                 KASSERT(bp->b_bufobj == bo,
2781                     ("bp %p wrong b_bufobj %p should be %p",
2782                     bp, bp->b_bufobj, bo));
2783                 if ((bp->b_flags & B_DELWRI) == 0)
2784                         panic("fsync: not dirty");
2785                 if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) {
2786                         vfs_bio_awrite(bp);
2787                 } else {
2788                         bremfree(bp);
2789                         bawrite(bp);
2790                 }
2791                 if (maxretry < 1000)
2792                         pause("dirty", hz < 1000 ? 1 : hz / 1000);
2793                 BO_LOCK(bo);
2794                 goto loop2;
2795         }
2796
2797         /*
2798          * If synchronous the caller expects us to completely resolve all
2799          * dirty buffers in the system.  Wait for in-progress I/O to
2800          * complete (which could include background bitmap writes), then
2801          * retry if dirty blocks still exist.
2802          */
2803         if (waitfor == MNT_WAIT) {
2804                 bufobj_wwait(bo, 0, 0);
2805                 if (bo->bo_dirty.bv_cnt > 0) {
2806                         /*
2807                          * If we are unable to write any of these buffers
2808                          * then we fail now rather than trying endlessly
2809                          * to write them out.
2810                          */
2811                         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
2812                                 if ((error = bp->b_error) != 0)
2813                                         break;
2814                         if ((mp != NULL && mp->mnt_secondary_writes > 0) ||
2815                             (error == 0 && --maxretry >= 0))
2816                                 goto loop1;
2817                         if (error == 0)
2818                                 error = EAGAIN;
2819                 }
2820         }
2821         BO_UNLOCK(bo);
2822         if (error != 0)
2823                 vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error);
2824
2825         return (error);
2826 }
2827
2828 /*
2829  * Copies a byte range from invp to outvp.  Calls VOP_COPY_FILE_RANGE()
2830  * or vn_generic_copy_file_range() after rangelocking the byte ranges,
2831  * to do the actual copy.
2832  * vn_generic_copy_file_range() is factored out, so it can be called
2833  * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from
2834  * different file systems.
2835  */
2836 int
2837 vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp,
2838     off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred,
2839     struct ucred *outcred, struct thread *fsize_td)
2840 {
2841         int error;
2842         size_t len;
2843         uint64_t uval;
2844
2845         len = *lenp;
2846         *lenp = 0;              /* For error returns. */
2847         error = 0;
2848
2849         /* Do some sanity checks on the arguments. */
2850         if (invp->v_type == VDIR || outvp->v_type == VDIR)
2851                 error = EISDIR;
2852         else if (*inoffp < 0 || *outoffp < 0 ||
2853             invp->v_type != VREG || outvp->v_type != VREG)
2854                 error = EINVAL;
2855         if (error != 0)
2856                 goto out;
2857
2858         /* Ensure offset + len does not wrap around. */
2859         uval = *inoffp;
2860         uval += len;
2861         if (uval > INT64_MAX)
2862                 len = INT64_MAX - *inoffp;
2863         uval = *outoffp;
2864         uval += len;
2865         if (uval > INT64_MAX)
2866                 len = INT64_MAX - *outoffp;
2867         if (len == 0)
2868                 goto out;
2869
2870         /*
2871          * If the two vnode are for the same file system, call
2872          * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range()
2873          * which can handle copies across multiple file systems.
2874          */
2875         *lenp = len;
2876         if (invp->v_mount == outvp->v_mount)
2877                 error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp,
2878                     lenp, flags, incred, outcred, fsize_td);
2879         else
2880                 error = vn_generic_copy_file_range(invp, inoffp, outvp,
2881                     outoffp, lenp, flags, incred, outcred, fsize_td);
2882 out:
2883         return (error);
2884 }
2885
/*
 * Test len bytes of data starting at dat for all bytes == 0.
 * Return true if all bytes are zero, false otherwise.
 * A len <= 0 is treated as an empty range and yields true.
 * Expects dat to be well aligned, since the bulk of the range is read one
 * unsigned int at a time; only the trailing len % sizeof(unsigned int)
 * bytes are read individually.
 */
static bool
mem_iszero(void *dat, int len)
{
	const unsigned int *wp;
	const char *cp;
	int i, ntail, nwords;

	if (len <= 0)
		return (true);

	/*
	 * Split the range up front so that all arithmetic stays in int;
	 * no signed/unsigned mixing is needed to terminate the loops.
	 */
	nwords = len / (int)sizeof(*wp);
	ntail = len % (int)sizeof(*wp);

	wp = dat;
	for (i = 0; i < nwords; i++, wp++)
		if (*wp != 0)
			return (false);

	cp = (const char *)wp;
	for (i = 0; i < ntail; i++, cp++)
		if (*cp != '\0')
			return (false);

	return (true);
}
2911
2912 /*
2913  * Look for a hole in the output file and, if found, adjust *outoffp
2914  * and *xferp to skip past the hole.
2915  * *xferp is the entire hole length to be written and xfer2 is how many bytes
2916  * to be written as 0's upon return.
2917  */
2918 static off_t
2919 vn_skip_hole(struct vnode *outvp, off_t xfer2, off_t *outoffp, off_t *xferp,
2920     off_t *dataoffp, off_t *holeoffp, struct ucred *cred)
2921 {
2922         int error;
2923         off_t delta;
2924
2925         if (*holeoffp == 0 || *holeoffp <= *outoffp) {
2926                 *dataoffp = *outoffp;
2927                 error = VOP_IOCTL(outvp, FIOSEEKDATA, dataoffp, 0, cred,
2928                     curthread);
2929                 if (error == 0) {
2930                         *holeoffp = *dataoffp;
2931                         error = VOP_IOCTL(outvp, FIOSEEKHOLE, holeoffp, 0, cred,
2932                             curthread);
2933                 }
2934                 if (error != 0 || *holeoffp == *dataoffp) {
2935                         /*
2936                          * Since outvp is unlocked, it may be possible for
2937                          * another thread to do a truncate(), lseek(), write()
2938                          * creating a hole at startoff between the above
2939                          * VOP_IOCTL() calls, if the other thread does not do
2940                          * rangelocking.
2941                          * If that happens, *holeoffp == *dataoffp and finding
2942                          * the hole has failed, so disable vn_skip_hole().
2943                          */
2944                         *holeoffp = -1; /* Disable use of vn_skip_hole(). */
2945                         return (xfer2);
2946                 }
2947                 KASSERT(*dataoffp >= *outoffp,
2948                     ("vn_skip_hole: dataoff=%jd < outoff=%jd",
2949                     (intmax_t)*dataoffp, (intmax_t)*outoffp));
2950                 KASSERT(*holeoffp > *dataoffp,
2951                     ("vn_skip_hole: holeoff=%jd <= dataoff=%jd",
2952                     (intmax_t)*holeoffp, (intmax_t)*dataoffp));
2953         }
2954
2955         /*
2956          * If there is a hole before the data starts, advance *outoffp and
2957          * *xferp past the hole.
2958          */
2959         if (*dataoffp > *outoffp) {
2960                 delta = *dataoffp - *outoffp;
2961                 if (delta >= *xferp) {
2962                         /* Entire *xferp is a hole. */
2963                         *outoffp += *xferp;
2964                         *xferp = 0;
2965                         return (0);
2966                 }
2967                 *xferp -= delta;
2968                 *outoffp += delta;
2969                 xfer2 = MIN(xfer2, *xferp);
2970         }
2971
2972         /*
2973          * If a hole starts before the end of this xfer2, reduce this xfer2 so
2974          * that the write ends at the start of the hole.
2975          * *holeoffp should always be greater than *outoffp, but for the
2976          * non-INVARIANTS case, check this to make sure xfer2 remains a sane
2977          * value.
2978          */
2979         if (*holeoffp > *outoffp && *holeoffp < *outoffp + xfer2)
2980                 xfer2 = *holeoffp - *outoffp;
2981         return (xfer2);
2982 }
2983
2984 /*
2985  * Write an xfer sized chunk to outvp in blksize blocks from dat.
2986  * dat is a maximum of blksize in length and can be written repeatedly in
2987  * the chunk.
2988  * If growfile == true, just grow the file via vn_truncate_locked() instead
2989  * of doing actual writes.
2990  * If checkhole == true, a hole is being punched, so skip over any hole
2991  * already in the output file.
2992  */
2993 static int
2994 vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer,
2995     u_long blksize, bool growfile, bool checkhole, struct ucred *cred)
2996 {
2997         struct mount *mp;
2998         off_t dataoff, holeoff, xfer2;
2999         int error, lckf;
3000
3001         /*
3002          * Loop around doing writes of blksize until write has been completed.
3003          * Lock/unlock on each loop iteration so that a bwillwrite() can be
3004          * done for each iteration, since the xfer argument can be very
3005          * large if there is a large hole to punch in the output file.
3006          */
3007         error = 0;
3008         holeoff = 0;
3009         do {
3010                 xfer2 = MIN(xfer, blksize);
3011                 if (checkhole) {
3012                         /*
3013                          * Punching a hole.  Skip writing if there is
3014                          * already a hole in the output file.
3015                          */
3016                         xfer2 = vn_skip_hole(outvp, xfer2, &outoff, &xfer,
3017                             &dataoff, &holeoff, cred);
3018                         if (xfer == 0)
3019                                 break;
3020                         if (holeoff < 0)
3021                                 checkhole = false;
3022                         KASSERT(xfer2 > 0, ("vn_write_outvp: xfer2=%jd",
3023                             (intmax_t)xfer2));
3024                 }
3025                 bwillwrite();
3026                 mp = NULL;
3027                 error = vn_start_write(outvp, &mp, V_WAIT);
3028                 if (error != 0)
3029                         break;
3030                 if (growfile) {
3031                         error = vn_lock(outvp, LK_EXCLUSIVE);
3032                         if (error == 0) {
3033                                 error = vn_truncate_locked(outvp, outoff + xfer,
3034                                     false, cred);
3035                                 VOP_UNLOCK(outvp);
3036                         }
3037                 } else {
3038                         if (MNT_SHARED_WRITES(mp))
3039                                 lckf = LK_SHARED;
3040                         else
3041                                 lckf = LK_EXCLUSIVE;
3042                         error = vn_lock(outvp, lckf);
3043                         if (error == 0) {
3044                                 error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2,
3045                                     outoff, UIO_SYSSPACE, IO_NODELOCKED,
3046                                     curthread->td_ucred, cred, NULL, curthread);
3047                                 outoff += xfer2;
3048                                 xfer -= xfer2;
3049                                 VOP_UNLOCK(outvp);
3050                         }
3051                 }
3052                 if (mp != NULL)
3053                         vn_finished_write(mp);
3054         } while (!growfile && xfer > 0 && error == 0);
3055         return (error);
3056 }
3057
3058 /*
3059  * Copy a byte range of one file to another.  This function can handle the
3060  * case where invp and outvp are on different file systems.
3061  * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there
3062  * is no better file system specific way to do it.
3063  */
3064 int
3065 vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
3066     struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags,
3067     struct ucred *incred, struct ucred *outcred, struct thread *fsize_td)
3068 {
3069         struct vattr va;
3070         struct mount *mp;
3071         struct uio io;
3072         off_t startoff, endoff, xfer, xfer2;
3073         u_long blksize;
3074         int error, interrupted;
3075         bool cantseek, readzeros, eof, lastblock;
3076         ssize_t aresid;
3077         size_t copylen, len, rem, savlen;
3078         char *dat;
3079         long holein, holeout;
3080
3081         holein = holeout = 0;
3082         savlen = len = *lenp;
3083         error = 0;
3084         interrupted = 0;
3085         dat = NULL;
3086
3087         error = vn_lock(invp, LK_SHARED);
3088         if (error != 0)
3089                 goto out;
3090         if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0)
3091                 holein = 0;
3092         VOP_UNLOCK(invp);
3093
3094         mp = NULL;
3095         error = vn_start_write(outvp, &mp, V_WAIT);
3096         if (error == 0)
3097                 error = vn_lock(outvp, LK_EXCLUSIVE);
3098         if (error == 0) {
3099                 /*
3100                  * If fsize_td != NULL, do a vn_rlimit_fsize() call,
3101                  * now that outvp is locked.
3102                  */
3103                 if (fsize_td != NULL) {
3104                         io.uio_offset = *outoffp;
3105                         io.uio_resid = len;
3106                         error = vn_rlimit_fsize(outvp, &io, fsize_td);
3107                         if (error != 0)
3108                                 error = EFBIG;
3109                 }
3110                 if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0)
3111                         holeout = 0;
3112                 /*
3113                  * Holes that are past EOF do not need to be written as a block
3114                  * of zero bytes.  So, truncate the output file as far as
3115                  * possible and then use va.va_size to decide if writing 0
3116                  * bytes is necessary in the loop below.
3117                  */
3118                 if (error == 0)
3119                         error = VOP_GETATTR(outvp, &va, outcred);
3120                 if (error == 0 && va.va_size > *outoffp && va.va_size <=
3121                     *outoffp + len) {
3122 #ifdef MAC
3123                         error = mac_vnode_check_write(curthread->td_ucred,
3124                             outcred, outvp);
3125                         if (error == 0)
3126 #endif
3127                                 error = vn_truncate_locked(outvp, *outoffp,
3128                                     false, outcred);
3129                         if (error == 0)
3130                                 va.va_size = *outoffp;
3131                 }
3132                 VOP_UNLOCK(outvp);
3133         }
3134         if (mp != NULL)
3135                 vn_finished_write(mp);
3136         if (error != 0)
3137                 goto out;
3138
3139         /*
3140          * Set the blksize to the larger of the hole sizes for invp and outvp.
3141          * If hole sizes aren't available, set the blksize to the larger
3142          * f_iosize of invp and outvp.
3143          * This code expects the hole sizes and f_iosizes to be powers of 2.
3144          * This value is clipped at 4Kbytes and 1Mbyte.
3145          */
3146         blksize = MAX(holein, holeout);
3147
3148         /* Clip len to end at an exact multiple of hole size. */
3149         if (blksize > 1) {
3150                 rem = *inoffp % blksize;
3151                 if (rem > 0)
3152                         rem = blksize - rem;
3153                 if (len > rem && len - rem > blksize)
3154                         len = savlen = rounddown(len - rem, blksize) + rem;
3155         }
3156
3157         if (blksize <= 1)
3158                 blksize = MAX(invp->v_mount->mnt_stat.f_iosize,
3159                     outvp->v_mount->mnt_stat.f_iosize);
3160         if (blksize < 4096)
3161                 blksize = 4096;
3162         else if (blksize > 1024 * 1024)
3163                 blksize = 1024 * 1024;
3164         dat = malloc(blksize, M_TEMP, M_WAITOK);
3165
3166         /*
3167          * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA
3168          * to find holes.  Otherwise, just scan the read block for all 0s
3169          * in the inner loop where the data copying is done.
3170          * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may
3171          * support holes on the server, but do not support FIOSEEKHOLE.
3172          */
3173         eof = false;
3174         while (len > 0 && error == 0 && !eof && interrupted == 0) {
3175                 endoff = 0;                     /* To shut up compilers. */
3176                 cantseek = true;
3177                 startoff = *inoffp;
3178                 copylen = len;
3179
3180                 /*
3181                  * Find the next data area.  If there is just a hole to EOF,
3182                  * FIOSEEKDATA should fail and then we drop down into the
3183                  * inner loop and create the hole on the outvp file.
3184                  * (I do not know if any file system will report a hole to
3185                  *  EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA
3186                  *  will fail for those file systems.)
3187                  *
3188                  * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE,
3189                  * the code just falls through to the inner copy loop.
3190                  */
3191                 error = EINVAL;
3192                 if (holein > 0)
3193                         error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0,
3194                             incred, curthread);
3195                 if (error == 0) {
3196                         endoff = startoff;
3197                         error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0,
3198                             incred, curthread);
3199                         /*
3200                          * Since invp is unlocked, it may be possible for
3201                          * another thread to do a truncate(), lseek(), write()
3202                          * creating a hole at startoff between the above
3203                          * VOP_IOCTL() calls, if the other thread does not do
3204                          * rangelocking.
3205                          * If that happens, startoff == endoff and finding
3206                          * the hole has failed, so set an error.
3207                          */
3208                         if (error == 0 && startoff == endoff)
3209                                 error = EINVAL; /* Any error. Reset to 0. */
3210                 }
3211                 if (error == 0) {
3212                         if (startoff > *inoffp) {
3213                                 /* Found hole before data block. */
3214                                 xfer = MIN(startoff - *inoffp, len);
3215                                 if (*outoffp < va.va_size) {
3216                                         /* Must write 0s to punch hole. */
3217                                         xfer2 = MIN(va.va_size - *outoffp,
3218                                             xfer);
3219                                         memset(dat, 0, MIN(xfer2, blksize));
3220                                         error = vn_write_outvp(outvp, dat,
3221                                             *outoffp, xfer2, blksize, false,
3222                                             holeout > 0, outcred);
3223                                 }
3224
3225                                 if (error == 0 && *outoffp + xfer >
3226                                     va.va_size && xfer == len)
3227                                         /* Grow last block. */
3228                                         error = vn_write_outvp(outvp, dat,
3229                                             *outoffp, xfer, blksize, true,
3230                                             false, outcred);
3231                                 if (error == 0) {
3232                                         *inoffp += xfer;
3233                                         *outoffp += xfer;
3234                                         len -= xfer;
3235                                         if (len < savlen)
3236                                                 interrupted = sig_intr();
3237                                 }
3238                         }
3239                         copylen = MIN(len, endoff - startoff);
3240                         cantseek = false;
3241                 } else {
3242                         cantseek = true;
3243                         startoff = *inoffp;
3244                         copylen = len;
3245                         error = 0;
3246                 }
3247
3248                 xfer = blksize;
3249                 if (cantseek) {
3250                         /*
3251                          * Set first xfer to end at a block boundary, so that
3252                          * holes are more likely detected in the loop below via
3253                          * the for all bytes 0 method.
3254                          */
3255                         xfer -= (*inoffp % blksize);
3256                 }
3257                 /* Loop copying the data block. */
3258                 while (copylen > 0 && error == 0 && !eof && interrupted == 0) {
3259                         if (copylen < xfer)
3260                                 xfer = copylen;
3261                         error = vn_lock(invp, LK_SHARED);
3262                         if (error != 0)
3263                                 goto out;
3264                         error = vn_rdwr(UIO_READ, invp, dat, xfer,
3265                             startoff, UIO_SYSSPACE, IO_NODELOCKED,
3266                             curthread->td_ucred, incred, &aresid,
3267                             curthread);
3268                         VOP_UNLOCK(invp);
3269                         lastblock = false;
3270                         if (error == 0 && aresid > 0) {
3271                                 /* Stop the copy at EOF on the input file. */
3272                                 xfer -= aresid;
3273                                 eof = true;
3274                                 lastblock = true;
3275                         }
3276                         if (error == 0) {
3277                                 /*
3278                                  * Skip the write for holes past the initial EOF
3279                                  * of the output file, unless this is the last
3280                                  * write of the output file at EOF.
3281                                  */
3282                                 readzeros = cantseek ? mem_iszero(dat, xfer) :
3283                                     false;
3284                                 if (xfer == len)
3285                                         lastblock = true;
3286                                 if (!cantseek || *outoffp < va.va_size ||
3287                                     lastblock || !readzeros)
3288                                         error = vn_write_outvp(outvp, dat,
3289                                             *outoffp, xfer, blksize,
3290                                             readzeros && lastblock &&
3291                                             *outoffp >= va.va_size, false,
3292                                             outcred);
3293                                 if (error == 0) {
3294                                         *inoffp += xfer;
3295                                         startoff += xfer;
3296                                         *outoffp += xfer;
3297                                         copylen -= xfer;
3298                                         len -= xfer;
3299                                         if (len < savlen)
3300                                                 interrupted = sig_intr();
3301                                 }
3302                         }
3303                         xfer = blksize;
3304                 }
3305         }
3306 out:
3307         *lenp = savlen - len;
3308         free(dat, M_TEMP);
3309         return (error);
3310 }
3311
3312 static int
3313 vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td)
3314 {
3315         struct mount *mp;
3316         struct vnode *vp;
3317         off_t olen, ooffset;
3318         int error;
3319 #ifdef AUDIT
3320         int audited_vnode1 = 0;
3321 #endif
3322
3323         vp = fp->f_vnode;
3324         if (vp->v_type != VREG)
3325                 return (ENODEV);
3326
3327         /* Allocating blocks may take a long time, so iterate. */
3328         for (;;) {
3329                 olen = len;
3330                 ooffset = offset;
3331
3332                 bwillwrite();
3333                 mp = NULL;
3334                 error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
3335                 if (error != 0)
3336                         break;
3337                 error = vn_lock(vp, LK_EXCLUSIVE);
3338                 if (error != 0) {
3339                         vn_finished_write(mp);
3340                         break;
3341                 }
3342 #ifdef AUDIT
3343                 if (!audited_vnode1) {
3344                         AUDIT_ARG_VNODE1(vp);
3345                         audited_vnode1 = 1;
3346                 }
3347 #endif
3348 #ifdef MAC
3349                 error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
3350                 if (error == 0)
3351 #endif
3352                         error = VOP_ALLOCATE(vp, &offset, &len);
3353                 VOP_UNLOCK(vp);
3354                 vn_finished_write(mp);
3355
3356                 if (olen + ooffset != offset + len) {
3357                         panic("offset + len changed from %jx/%jx to %jx/%jx",
3358                             ooffset, olen, offset, len);
3359                 }
3360                 if (error != 0 || len == 0)
3361                         break;
3362                 KASSERT(olen > len, ("Iteration did not make progress?"));
3363                 maybe_yield();
3364         }
3365
3366         return (error);
3367 }
3368
3369 static u_long vn_lock_pair_pause_cnt;
3370 SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD,
3371     &vn_lock_pair_pause_cnt, 0,
3372     "Count of vn_lock_pair deadlocks");
3373
3374 u_int vn_lock_pair_pause_max;
3375 SYSCTL_UINT(_debug, OID_AUTO, vn_lock_pair_pause_max, CTLFLAG_RW,
3376     &vn_lock_pair_pause_max, 0,
3377     "Max ticks for vn_lock_pair deadlock avoidance sleep");
3378
3379 static void
3380 vn_lock_pair_pause(const char *wmesg)
3381 {
3382         atomic_add_long(&vn_lock_pair_pause_cnt, 1);
3383         pause(wmesg, prng32_bounded(vn_lock_pair_pause_max));
3384 }
3385
3386 /*
3387  * Lock pair of vnodes vp1, vp2, avoiding lock order reversal.
3388  * vp1_locked indicates whether vp1 is exclusively locked; if not, vp1
3389  * must be unlocked.  Same for vp2 and vp2_locked.  One of the vnodes
3390  * can be NULL.
3391  *
3392  * The function returns with both vnodes exclusively locked, and
3393  * guarantees that it does not create lock order reversal with other
3394  * threads during its execution.  Both vnodes could be unlocked
3395  * temporary (and reclaimed).
3396  */
3397 void
3398 vn_lock_pair(struct vnode *vp1, bool vp1_locked, struct vnode *vp2,
3399     bool vp2_locked)
3400 {
3401         int error;
3402
3403         if (vp1 == NULL && vp2 == NULL)
3404                 return;
3405         if (vp1 != NULL) {
3406                 if (vp1_locked)
3407                         ASSERT_VOP_ELOCKED(vp1, "vp1");
3408                 else
3409                         ASSERT_VOP_UNLOCKED(vp1, "vp1");
3410         } else {
3411                 vp1_locked = true;
3412         }
3413         if (vp2 != NULL) {
3414                 if (vp2_locked)
3415                         ASSERT_VOP_ELOCKED(vp2, "vp2");
3416                 else
3417                         ASSERT_VOP_UNLOCKED(vp2, "vp2");
3418         } else {
3419                 vp2_locked = true;
3420         }
3421         if (!vp1_locked && !vp2_locked) {
3422                 vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY);
3423                 vp1_locked = true;
3424         }
3425
3426         for (;;) {
3427                 if (vp1_locked && vp2_locked)
3428                         break;
3429                 if (vp1_locked && vp2 != NULL) {
3430                         if (vp1 != NULL) {
3431                                 error = VOP_LOCK1(vp2, LK_EXCLUSIVE | LK_NOWAIT,
3432                                     __FILE__, __LINE__);
3433                                 if (error == 0)
3434                                         break;
3435                                 VOP_UNLOCK(vp1);
3436                                 vp1_locked = false;
3437                                 vn_lock_pair_pause("vlp1");
3438                         }
3439                         vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY);
3440                         vp2_locked = true;
3441                 }
3442                 if (vp2_locked && vp1 != NULL) {
3443                         if (vp2 != NULL) {
3444                                 error = VOP_LOCK1(vp1, LK_EXCLUSIVE | LK_NOWAIT,
3445                                     __FILE__, __LINE__);
3446                                 if (error == 0)
3447                                         break;
3448                                 VOP_UNLOCK(vp2);
3449                                 vp2_locked = false;
3450                                 vn_lock_pair_pause("vlp2");
3451                         }
3452                         vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY);
3453                         vp1_locked = true;
3454                 }
3455         }
3456         if (vp1 != NULL)
3457                 ASSERT_VOP_ELOCKED(vp1, "vp1 ret");
3458         if (vp2 != NULL)
3459                 ASSERT_VOP_ELOCKED(vp2, "vp2 ret");
3460 }