sys/kern/vfs_vnops.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1982, 1986, 1989, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  * (c) UNIX System Laboratories, Inc.
   7  * All or some portions of this file are derived from material licensed
   8  * to the University of California by American Telephone and Telegraph
   9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  10  * the permission of UNIX System Laboratories, Inc.
  11  *
  12  * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
  13  * Copyright (c) 2013, 2014 The FreeBSD Foundation
  14  *
  15  * Portions of this software were developed by Konstantin Belousov
  16  * under sponsorship from the FreeBSD Foundation.
  17  *
  18  * Redistribution and use in source and binary forms, with or without
  19  * modification, are permitted provided that the following conditions
  20  * are met:
  21  * 1. Redistributions of source code must retain the above copyright
  22  *    notice, this list of conditions and the following disclaimer.
  23  * 2. Redistributions in binary form must reproduce the above copyright
  24  *    notice, this list of conditions and the following disclaimer in the
  25  *    documentation and/or other materials provided with the distribution.
  26  * 3. Neither the name of the University nor the names of its contributors
  27  *    may be used to endorse or promote products derived from this software
  28  *    without specific prior written permission.
  29  *
  30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  40  * SUCH DAMAGE.
  41  *
  42  *      @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
  43  */
  44
  45 #include <sys/cdefs.h>
  46 __FBSDID("$FreeBSD$");
  47
  48 #include "opt_hwpmc_hooks.h"
  49
  50 #include <sys/param.h>
  51 #include <sys/systm.h>
  52 #include <sys/disk.h>
  53 #include <sys/fail.h>
  54 #include <sys/fcntl.h>
  55 #include <sys/file.h>
  56 #include <sys/kdb.h>
  57 #include <sys/ktr.h>
  58 #include <sys/stat.h>
  59 #include <sys/priv.h>
  60 #include <sys/proc.h>
  61 #include <sys/limits.h>
  62 #include <sys/lock.h>
  63 #include <sys/mman.h>
  64 #include <sys/mount.h>
  65 #include <sys/mutex.h>
  66 #include <sys/namei.h>
  67 #include <sys/vnode.h>
  68 #include <sys/bio.h>
  69 #include <sys/buf.h>
  70 #include <sys/filio.h>
  71 #include <sys/resourcevar.h>
  72 #include <sys/rwlock.h>
  73 #include <sys/prng.h>
  74 #include <sys/sx.h>
  75 #include <sys/sleepqueue.h>
  76 #include <sys/sysctl.h>
  77 #include <sys/ttycom.h>
  78 #include <sys/conf.h>
  79 #include <sys/syslog.h>
  80 #include <sys/unistd.h>
  81 #include <sys/user.h>
  82 #include <sys/ktrace.h>
  83
  84 #include <security/audit/audit.h>
  85 #include <security/mac/mac_framework.h>
  86
  87 #include <vm/vm.h>
  88 #include <vm/vm_extern.h>
  89 #include <vm/pmap.h>
  90 #include <vm/vm_map.h>
  91 #include <vm/vm_object.h>
  92 #include <vm/vm_page.h>
  93 #include <vm/vm_pager.h>
  94
  95 #ifdef HWPMC_HOOKS
  96 #include <sys/pmckern.h>
  97 #endif
  98
     /*
      * Forward declarations of the static file-operation handlers.  Most of
      * them are installed in the vnops table that follows; vn_read and
      * vn_write are not referenced by that table, which routes both fo_read
      * and fo_write through vn_io_fault instead.
      */
  99 static fo_rdwr_t        vn_read;
 100 static fo_rdwr_t        vn_write;
 101 static fo_rdwr_t        vn_io_fault;
 102 static fo_truncate_t    vn_truncate;
 103 static fo_ioctl_t       vn_ioctl;
 104 static fo_poll_t        vn_poll;
 105 static fo_kqfilter_t    vn_kqfilter;
 106 static fo_close_t       vn_closefile;
 107 static fo_mmap_t        vn_mmap;
 108 static fo_fallocate_t   vn_fallocate;
 109 static fo_fspacectl_t   vn_fspacectl;
 110
     /*
      * File operations table for files backed by a vnode.  Reads and writes
      * share the vn_io_fault entry point.  vn_open_vnode() installs this
      * table on a file whose f_ops is still badfileops when an open fails
      * after VOP_OPEN().  fo_flags carries DFLAG_PASSABLE and DFLAG_SEEKABLE.
      */
 111 struct  fileops vnops = {
 112         .fo_read = vn_io_fault,
 113         .fo_write = vn_io_fault,
 114         .fo_truncate = vn_truncate,
 115         .fo_ioctl = vn_ioctl,
 116         .fo_poll = vn_poll,
 117         .fo_kqfilter = vn_kqfilter,
 118         .fo_stat = vn_statfile,
 119         .fo_close = vn_closefile,
 120         .fo_chmod = vn_chmod,
 121         .fo_chown = vn_chown,
 122         .fo_sendfile = vn_sendfile,
 123         .fo_seek = vn_seek,
 124         .fo_fill_kinfo = vn_fill_kinfo,
 125         .fo_mmap = vn_mmap,
 126         .fo_fallocate = vn_fallocate,
 127         .fo_fspacectl = vn_fspacectl,
 128         .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 129 };
 130
     /*
      * NOTE(review): io_hold_cnt is not referenced in the visible part of
      * this file; presumably it bounds the number of pages held at once by
      * the vn_io_fault code -- confirm against its users.
      */
 131 const u_int io_hold_cnt = 16;
     /* Master switch tested by do_vn_io_fault(); 0 disables that path. */
 132 static int vn_io_fault_enable = 1;
 133 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN,
 134     &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
     /* Prefaulting knob, off by default; its consumer is not visible here. */
 135 static int vn_io_fault_prefault = 0;
 136 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN,
 137     &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
     /* Page-cache read knob, on by default; its consumer is not visible here. */
 138 static int vn_io_pgcache_read_enable = 1;
 139 SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN,
 140     &vn_io_pgcache_read_enable, 0,
 141     "Enable copying from page cache for reads, avoiding fs");
     /* Read-only statistic; the code that increments it is not visible here. */
 142 static u_long vn_io_faults_cnt;
 143 SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
 144     &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
 145
     /* Policy knob under security.bsd, off by default. */
 146 static int vfs_allow_read_dir = 0;
 147 SYSCTL_INT(_security_bsd, OID_AUTO, allow_read_dir, CTLFLAG_RW,
 148     &vfs_allow_read_dir, 0,
 149     "Enable read(2) of directory by root for filesystems that support it");
 150
 151 /*
 152  * Returns true if vn_io_fault mode of handling the i/o request should
 153  * be used.
 154  */
 155 static bool
 156 do_vn_io_fault(struct vnode *vp, struct uio *uio)
 157 {
 158         struct mount *mp;
 159
 160         return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
 161             (mp = vp->v_mount) != NULL &&
 162             (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
 163 }
 164
 165 /*
 166  * Structure used to pass arguments to vn_io_fault1(), to do either
 167  * file- or vnode-based I/O calls.
      *
      * 'kind' records which member of the 'args' union was filled in by the
      * caller.  vn_rdwr() uses VN_IO_FAULT_VOP: it stores the vnode in
      * args.vop_args.vp, its ioflg value in 'flags' and the credential it
      * selected in 'cred'.  The VN_IO_FAULT_FOP producer is not visible in
      * this part of the file.
 168  */
 169 struct vn_io_fault_args {
 170         enum {
 171                 VN_IO_FAULT_FOP,        /* args.fop_args is in use */
 172                 VN_IO_FAULT_VOP         /* args.vop_args is in use */
 173         } kind;
 174         struct ucred *cred;             /* credential for the I/O */
 175         int flags;                      /* I/O flags for the request */
 176         union {
 177                 struct fop_args_tag {
 178                         struct file *fp;        /* file for the request */
 179                         fo_rdwr_t *doio;        /* file-level I/O routine */
 180                 } fop_args;
 181                 struct vop_args_tag {
 182                         struct vnode *vp;       /* vnode for the request */
 183                 } vop_args;
 184         } args;
 185 };
 186
     /*
      * Forward declaration; the definition lies outside the part of the file
      * shown here.  vn_rdwr() calls it when do_vn_io_fault() returns true.
      */
 187 static int vn_io_fault1(struct vnode *vp, struct uio *uio,
 188     struct vn_io_fault_args *args, struct thread *td);
 189
 190 int
 191 vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp)
 192 {
 193         struct thread *td = ndp->ni_cnd.cn_thread;
 194
 195         return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
 196 }
 197
 198 static uint64_t
 199 open2nameif(int fmode, u_int vn_open_flags)
 200 {
 201         uint64_t res;
 202
 203         res = ISOPEN | LOCKLEAF;
 204         if ((fmode & O_RESOLVE_BENEATH) != 0)
 205                 res |= RBENEATH;
 206         if ((fmode & O_EMPTY_PATH) != 0)
 207                 res |= EMPTYPATH;
 208         if ((fmode & FREAD) != 0)
 209                 res |= OPENREAD;
 210         if ((fmode & FWRITE) != 0)
 211                 res |= OPENWRITE;
 212         if ((vn_open_flags & VN_OPEN_NOAUDIT) == 0)
 213                 res |= AUDITVNODE1;
 214         if ((vn_open_flags & VN_OPEN_NOCAPCHECK) != 0)
 215                 res |= NOCAPCHECK;
 216         return (res);
 217 }
 218
 219 /*
 220  * Common code for vnode open operations via a name lookup.
 221  * Lookup the vnode and invoke VOP_CREATE if needed.
 222  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 223  *
 224  * Note that this does NOT free nameidata for the successful case,
 225  * due to the NDINIT being done elsewhere.
      *
      * Parameters:
      *   ndp            nameidata set up by the caller; cn_nameiop and
      *                  cn_flags are overwritten here.
      *   flagp          in: open mode bits.  out: the adjusted mode, stored
      *                  on success and on the "bad" error path (O_TRUNC is
      *                  cleared after a file was created, O_CREAT when the
      *                  file already existed).
      *   cmode          mode bits for a newly created file (va_mode).
      *   vn_open_flags  VN_OPEN_* modifiers.
      *   cred           credential for the MAC create check and for
      *                  vn_open_vnode().
      *   fp             file handed through to vn_open_vnode().
      *
      * Returns 0 with the opened vnode in ndp->ni_vp, or an errno value.
      * On the "bad" path the vnode is released with vput() and ndp->ni_vp
      * is set to NULL.
 226  */
 227 int
 228 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
 229     struct ucred *cred, struct file *fp)
 230 {
 231         struct vnode *vp;
 232         struct mount *mp;
 233         struct thread *td = ndp->ni_cnd.cn_thread;
 234         struct vattr vat;
 235         struct vattr *vap = &vat;
 236         int fmode, error;
 237         bool first_open;
 238
             /*
              * The lookup is redone from here after waiting out a failed
              * vn_start_write() and when VOP_CREATE() returns ERELOOKUP.
              */
 239 restart:
 240         first_open = false;
 241         fmode = *flagp;
             /* O_CREAT cannot be combined with O_EXCL|O_DIRECTORY or O_EMPTY_PATH. */
 242         if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT |
 243             O_EXCL | O_DIRECTORY) ||
 244             (fmode & (O_CREAT | O_EMPTY_PATH)) == (O_CREAT | O_EMPTY_PATH))
 245                 return (EINVAL);
 246         else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) {
 247                 ndp->ni_cnd.cn_nameiop = CREATE;
 248                 ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
 249                 /*
 250                  * Set NOCACHE to avoid flushing the cache when
 251                  * rolling in many files at once.
 252                  *
 253                  * Set NC_KEEPPOSENTRY to keep positive entries if they already
 254                  * exist despite NOCACHE.
 255                  */
 256                 ndp->ni_cnd.cn_flags |= LOCKPARENT | NOCACHE | NC_KEEPPOSENTRY;
 257                 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
 258                         ndp->ni_cnd.cn_flags |= FOLLOW;
 259                 if ((vn_open_flags & VN_OPEN_INVFS) == 0)
 260                         bwillwrite();
 261                 if ((error = namei(ndp)) != 0)
 262                         return (error);
 263                 if (ndp->ni_vp == NULL) {
                             /* The name does not exist: create a regular file. */
 264                         VATTR_NULL(vap);
 265                         vap->va_type = VREG;
 266                         vap->va_mode = cmode;
 267                         if (fmode & O_EXCL)
 268                                 vap->va_vaflags |= VA_EXCLUSIVE;
                             /*
                              * Try to start the write without waiting.  On
                              * failure, drop the path buffer and the parent
                              * directory, wait via V_XSLEEP, and redo the
                              * lookup from the top.
                              */
 269                         if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
 270                                 NDFREE(ndp, NDF_ONLY_PNBUF);
 271                                 vput(ndp->ni_dvp);
 272                                 if ((error = vn_start_write(NULL, &mp,
 273                                     V_XSLEEP | PCATCH)) != 0)
 274                                         return (error);
 275                                 NDREINIT(ndp);
 276                                 goto restart;
 277                         }
 278                         if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
 279                                 ndp->ni_cnd.cn_flags |= MAKEENTRY;
                             /*
                              * With MAC compiled in, VOP_CREATE() below is
                              * the body of the "if (error == 0)" and so runs
                              * only when the MAC check passed.
                              */
 280 #ifdef MAC
 281                         error = mac_vnode_check_create(cred, ndp->ni_dvp,
 282                             &ndp->ni_cnd, vap);
 283                         if (error == 0)
 284 #endif
 285                                 error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
 286                                     &ndp->ni_cnd, vap);
 287                         vp = ndp->ni_vp;
                             /*
                              * Exclusive create combined with an open-time
                              * flock request: mark the vnode VI_FOPENING.
                              * The mark is removed, with a wakeup, once
                              * vn_open_vnode() has returned.
                              */
 288                         if (error == 0 && (fmode & O_EXCL) != 0 &&
 289                             (fmode & (O_EXLOCK | O_SHLOCK)) != 0) {
 290                                 VI_LOCK(vp);
 291                                 vp->v_iflag |= VI_FOPENING;
 292                                 VI_UNLOCK(vp);
 293                                 first_open = true;
 294                         }
 295                         VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &vp : NULL,
 296                             false);
 297                         vn_finished_write(mp);
 298                         if (error) {
 299                                 NDFREE(ndp, NDF_ONLY_PNBUF);
 300                                 if (error == ERELOOKUP) {
 301                                         NDREINIT(ndp);
 302                                         goto restart;
 303                                 }
 304                                 return (error);
 305                         }
                             /* The file was just created; O_TRUNC is dropped. */
 306                         fmode &= ~O_TRUNC;
 307                 } else {
                             /*
                              * The name already exists.  Release the parent:
                              * vrele() when it is the same vnode as the
                              * target, vput() otherwise.
                              */
 308                         if (ndp->ni_dvp == ndp->ni_vp)
 309                                 vrele(ndp->ni_dvp);
 310                         else
 311                                 vput(ndp->ni_dvp);
 312                         ndp->ni_dvp = NULL;
 313                         vp = ndp->ni_vp;
 314                         if (fmode & O_EXCL) {
 315                                 error = EEXIST;
 316                                 goto bad;
 317                         }
 318                         if (vp->v_type == VDIR) {
 319                                 error = EISDIR;
 320                                 goto bad;
 321                         }
                             /* Nothing was created; continue as a plain open. */
 322                         fmode &= ~O_CREAT;
 323                 }
 324         } else {
                     /* Plain lookup; a shared lock is requested for non-write opens. */
 325                 ndp->ni_cnd.cn_nameiop = LOOKUP;
 326                 ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
 327                 ndp->ni_cnd.cn_flags |= (fmode & O_NOFOLLOW) != 0 ? NOFOLLOW :
 328                     FOLLOW;
 329                 if ((fmode & FWRITE) == 0)
 330                         ndp->ni_cnd.cn_flags |= LOCKSHARED;
 331                 if ((error = namei(ndp)) != 0)
 332                         return (error);
 333                 vp = ndp->ni_vp;
 334         }
 335         error = vn_open_vnode(vp, fmode, cred, td, fp);
             /* Clear VI_FOPENING and wake sleepers whether or not the open worked. */
 336         if (first_open) {
 337                 VI_LOCK(vp);
 338                 vp->v_iflag &= ~VI_FOPENING;
 339                 wakeup(vp);
 340                 VI_UNLOCK(vp);
 341         }
 342         if (error)
 343                 goto bad;
 344         *flagp = fmode;
 345         return (0);
             /* Error exit once a vnode is held: free the path buffer and drop vp. */
 346 bad:
 347         NDFREE(ndp, NDF_ONLY_PNBUF);
 348         vput(vp);
 349         *flagp = fmode;
 350         ndp->ni_vp = NULL;
 351         return (error);
 352 }
 353
 354 static int
 355 vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp)
 356 {
 357         struct flock lf;
 358         int error, lock_flags, type;
 359
 360         ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock");
 361         if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0)
 362                 return (0);
 363         KASSERT(fp != NULL, ("open with flock requires fp"));
 364         if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE)
 365                 return (EOPNOTSUPP);
 366
 367         lock_flags = VOP_ISLOCKED(vp);
 368         VOP_UNLOCK(vp);
 369
 370         lf.l_whence = SEEK_SET;
 371         lf.l_start = 0;
 372         lf.l_len = 0;
 373         lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK;
 374         type = F_FLOCK;
 375         if ((fmode & FNONBLOCK) == 0)
 376                 type |= F_WAIT;
 377         if ((fmode & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
 378                 type |= F_FIRSTOPEN;
 379         error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
 380         if (error == 0)
 381                 fp->f_flag |= FHASLOCK;
 382
 383         vn_lock(vp, lock_flags | LK_RETRY);
 384         return (error);
 385 }
 386
 387 /*
 388  * Common code for vnode open operations once a vnode is located.
 389  * Check permissions, and call the VOP_OPEN routine.
 390  */
 391 int
 392 vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
 393     struct thread *td, struct file *fp)
 394 {
 395         accmode_t accmode;
 396         int error;
 397
 398         if (vp->v_type == VLNK) {
 399                 if ((fmode & O_PATH) == 0 || (fmode & FEXEC) != 0)
 400                         return (EMLINK);
 401         }
 402         if (vp->v_type == VSOCK)
 403                 return (EOPNOTSUPP);
 404         if (vp->v_type != VDIR && fmode & O_DIRECTORY)
 405                 return (ENOTDIR);
 406
 407         accmode = 0;
 408         if ((fmode & O_PATH) == 0) {
 409                 if ((fmode & (FWRITE | O_TRUNC)) != 0) {
 410                         if (vp->v_type == VDIR)
 411                                 return (EISDIR);
 412                         accmode |= VWRITE;
 413                 }
 414                 if ((fmode & FREAD) != 0)
 415                         accmode |= VREAD;
 416                 if ((fmode & O_APPEND) && (fmode & FWRITE))
 417                         accmode |= VAPPEND;
 418 #ifdef MAC
 419                 if ((fmode & O_CREAT) != 0)
 420                         accmode |= VCREAT;
 421 #endif
 422         }
 423         if ((fmode & FEXEC) != 0)
 424                 accmode |= VEXEC;
 425 #ifdef MAC
 426         if ((fmode & O_VERIFY) != 0)
 427                 accmode |= VVERIFY;
 428         error = mac_vnode_check_open(cred, vp, accmode);
 429         if (error != 0)
 430                 return (error);
 431
 432         accmode &= ~(VCREAT | VVERIFY);
 433 #endif
 434         if ((fmode & O_CREAT) == 0 && accmode != 0) {
 435                 error = VOP_ACCESS(vp, accmode, cred, td);
 436                 if (error != 0)
 437                         return (error);
 438         }
 439         if ((fmode & O_PATH) != 0) {
 440                 if (vp->v_type == VFIFO)
 441                         error = EPIPE;
 442                 else
 443                         error = VOP_ACCESS(vp, VREAD, cred, td);
 444                 if (error == 0)
 445                         fp->f_flag |= FKQALLOWED;
 446                 return (0);
 447         }
 448
 449         if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 450                 vn_lock(vp, LK_UPGRADE | LK_RETRY);
 451         error = VOP_OPEN(vp, fmode, cred, td, fp);
 452         if (error != 0)
 453                 return (error);
 454
 455         error = vn_open_vnode_advlock(vp, fmode, fp);
 456         if (error == 0 && (fmode & FWRITE) != 0) {
 457                 error = VOP_ADD_WRITECOUNT(vp, 1);
 458                 if (error == 0) {
 459                         CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
 460                              __func__, vp, vp->v_writecount);
 461                 }
 462         }
 463
 464         /*
 465          * Error from advlock or VOP_ADD_WRITECOUNT() still requires
 466          * calling VOP_CLOSE() to pair with earlier VOP_OPEN().
 467          */
 468         if (error != 0) {
 469                 if (fp != NULL) {
 470                         /*
 471                          * Arrange the call by having fdrop() to use
 472                          * vn_closefile().  This is to satisfy
 473                          * filesystems like devfs or tmpfs, which
 474                          * override fo_close().
 475                          */
 476                         fp->f_flag |= FOPENFAILED;
 477                         fp->f_vnode = vp;
 478                         if (fp->f_ops == &badfileops) {
 479                                 fp->f_type = DTYPE_VNODE;
 480                                 fp->f_ops = &vnops;
 481                         }
 482                         vref(vp);
 483                 } else {
 484                         /*
 485                          * If there is no fp, due to kernel-mode open,
 486                          * we can call VOP_CLOSE() now.
 487                          */
 488                         if (vp->v_type != VFIFO && (fmode & FWRITE) != 0 &&
 489                             !MNT_EXTENDED_SHARED(vp->v_mount) &&
 490                             VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 491                                 vn_lock(vp, LK_UPGRADE | LK_RETRY);
 492                         (void)VOP_CLOSE(vp, fmode & (FREAD | FWRITE | FEXEC),
 493                             cred, td);
 494                 }
 495         }
 496
 497         ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
 498         return (error);
 499
 500 }
 501
 502 /*
 503  * Check for write permissions on the specified vnode.
 504  * Prototype text segments cannot be written.
 505  * It is racy.
 506  */
 507 int
 508 vn_writechk(struct vnode *vp)
 509 {
 510
 511         ASSERT_VOP_LOCKED(vp, "vn_writechk");
 512         /*
 513          * If there's shared text associated with
 514          * the vnode, try to free it up once.  If
 515          * we fail, we can't allow writing.
 516          */
 517         if (VOP_IS_TEXT(vp))
 518                 return (ETXTBSY);
 519
 520         return (0);
 521 }
 522
 523 /*
 524  * Vnode close call
 525  */
 526 static int
 527 vn_close1(struct vnode *vp, int flags, struct ucred *file_cred,
 528     struct thread *td, bool keep_ref)
 529 {
 530         struct mount *mp;
 531         int error, lock_flags;
 532
 533         if (vp->v_type != VFIFO && (flags & FWRITE) == 0 &&
 534             MNT_EXTENDED_SHARED(vp->v_mount))
 535                 lock_flags = LK_SHARED;
 536         else
 537                 lock_flags = LK_EXCLUSIVE;
 538
 539         vn_start_write(vp, &mp, V_WAIT);
 540         vn_lock(vp, lock_flags | LK_RETRY);
 541         AUDIT_ARG_VNODE1(vp);
 542         if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) {
 543                 VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
 544                 CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
 545                     __func__, vp, vp->v_writecount);
 546         }
 547         error = VOP_CLOSE(vp, flags, file_cred, td);
 548         if (keep_ref)
 549                 VOP_UNLOCK(vp);
 550         else
 551                 vput(vp);
 552         vn_finished_write(mp);
 553         return (error);
 554 }
 555
 556 int
 557 vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
 558     struct thread *td)
 559 {
 560
 561         return (vn_close1(vp, flags, file_cred, td, false));
 562 }
 563
 564 /*
 565  * Heuristic to detect sequential operation.
 566  */
 567 static int
 568 sequential_heuristic(struct uio *uio, struct file *fp)
 569 {
 570         enum uio_rw rw;
 571
 572         ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
 573
 574         rw = uio->uio_rw;
 575         if (fp->f_flag & FRDAHEAD)
 576                 return (fp->f_seqcount[rw] << IO_SEQSHIFT);
 577
 578         /*
 579          * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
 580          * that the first I/O is normally considered to be slightly
 581          * sequential.  Seeking to offset 0 doesn't change sequentiality
 582          * unless previous seeks have reduced f_seqcount to 0, in which
 583          * case offset 0 is not special.
 584          */
 585         if ((uio->uio_offset == 0 && fp->f_seqcount[rw] > 0) ||
 586             uio->uio_offset == fp->f_nextoff[rw]) {
 587                 /*
 588                  * f_seqcount is in units of fixed-size blocks so that it
 589                  * depends mainly on the amount of sequential I/O and not
 590                  * much on the number of sequential I/O's.  The fixed size
 591                  * of 16384 is hard-coded here since it is (not quite) just
 592                  * a magic size that works well here.  This size is more
 593                  * closely related to the best I/O size for real disks than
 594                  * to any block size used by software.
 595                  */
 596                 if (uio->uio_resid >= IO_SEQMAX * 16384)
 597                         fp->f_seqcount[rw] = IO_SEQMAX;
 598                 else {
 599                         fp->f_seqcount[rw] += howmany(uio->uio_resid, 16384);
 600                         if (fp->f_seqcount[rw] > IO_SEQMAX)
 601                                 fp->f_seqcount[rw] = IO_SEQMAX;
 602                 }
 603                 return (fp->f_seqcount[rw] << IO_SEQSHIFT);
 604         }
 605
 606         /* Not sequential.  Quickly draw-down sequentiality. */
 607         if (fp->f_seqcount[rw] > 1)
 608                 fp->f_seqcount[rw] = 1;
 609         else
 610                 fp->f_seqcount[rw] = 0;
 611         return (0);
 612 }
 613
 614 /*
 615  * Package up an I/O request on a vnode into a uio and do it.
 616  */
 617 int
 618 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
 619     enum uio_seg segflg, int ioflg, struct ucred *active_cred,
 620     struct ucred *file_cred, ssize_t *aresid, struct thread *td)
 621 {
 622         struct uio auio;
 623         struct iovec aiov;
 624         struct mount *mp;
 625         struct ucred *cred;
 626         void *rl_cookie;
 627         struct vn_io_fault_args args;
 628         int error, lock_flags;
 629
 630         if (offset < 0 && vp->v_type != VCHR)
 631                 return (EINVAL);
 632         auio.uio_iov = &aiov;
 633         auio.uio_iovcnt = 1;
 634         aiov.iov_base = base;
 635         aiov.iov_len = len;
 636         auio.uio_resid = len;
 637         auio.uio_offset = offset;
 638         auio.uio_segflg = segflg;
 639         auio.uio_rw = rw;
 640         auio.uio_td = td;
 641         error = 0;
 642
 643         if ((ioflg & IO_NODELOCKED) == 0) {
 644                 if ((ioflg & IO_RANGELOCKED) == 0) {
 645                         if (rw == UIO_READ) {
 646                                 rl_cookie = vn_rangelock_rlock(vp, offset,
 647                                     offset + len);
 648                         } else if ((ioflg & IO_APPEND) != 0) {
 649                                 rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 650                         } else {
 651                                 rl_cookie = vn_rangelock_wlock(vp, offset,
 652                                     offset + len);
 653                         }
 654                 } else
 655                         rl_cookie = NULL;
 656                 mp = NULL;
 657                 if (rw == UIO_WRITE) {
 658                         if (vp->v_type != VCHR &&
 659                             (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
 660                             != 0)
 661                                 goto out;
 662                         lock_flags = vn_lktype_write(mp, vp);
 663                 } else
 664                         lock_flags = LK_SHARED;
 665                 vn_lock(vp, lock_flags | LK_RETRY);
 666         } else
 667                 rl_cookie = NULL;
 668
 669         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 670 #ifdef MAC
 671         if ((ioflg & IO_NOMACCHECK) == 0) {
 672                 if (rw == UIO_READ)
 673                         error = mac_vnode_check_read(active_cred, file_cred,
 674                             vp);
 675                 else
 676                         error = mac_vnode_check_write(active_cred, file_cred,
 677                             vp);
 678         }
 679 #endif
 680         if (error == 0) {
 681                 if (file_cred != NULL)
 682                         cred = file_cred;
 683                 else
 684                         cred = active_cred;
 685                 if (do_vn_io_fault(vp, &auio)) {
 686                         args.kind = VN_IO_FAULT_VOP;
 687                         args.cred = cred;
 688                         args.flags = ioflg;
 689                         args.args.vop_args.vp = vp;
 690                         error = vn_io_fault1(vp, &auio, &args, td);
 691                 } else if (rw == UIO_READ) {
 692                         error = VOP_READ(vp, &auio, ioflg, cred);
 693                 } else /* if (rw == UIO_WRITE) */ {
 694                         error = VOP_WRITE(vp, &auio, ioflg, cred);
 695                 }
 696         }
 697         if (aresid)
 698                 *aresid = auio.uio_resid;
 699         else
 700                 if (auio.uio_resid && error == 0)
 701                         error = EIO;
 702         if ((ioflg & IO_NODELOCKED) == 0) {
 703                 VOP_UNLOCK(vp);
 704                 if (mp != NULL)
 705                         vn_finished_write(mp);
 706         }
 707  out:
 708         if (rl_cookie != NULL)
 709                 vn_rangelock_unlock(vp, rl_cookie);
 710         return (error);
 711 }
 712
 713 /*
 714  * Package up an I/O request on a vnode into a uio and do it.  The I/O
 715  * request is split up into smaller chunks and we try to avoid saturating
 716  * the buffer cache while potentially holding a vnode locked, so we
 717  * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
 718  * to give other processes a chance to lock the vnode (either other processes
 719  * core'ing the same binary, or unrelated processes scanning the directory).
 720  */
 721 int
 722 vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len,
 723     off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred,
 724     struct ucred *file_cred, size_t *aresid, struct thread *td)
 725 {
 726         int error = 0;
 727         ssize_t iaresid;
 728
 729         do {
 730                 int chunk;
 731
 732                 /*
 733                  * Force `offset' to a multiple of MAXBSIZE except possibly
 734                  * for the first chunk, so that filesystems only need to
 735                  * write full blocks except possibly for the first and last
 736                  * chunks.
 737                  */
 738                 chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
 739
 740                 if (chunk > len)
 741                         chunk = len;
 742                 if (rw != UIO_READ && vp->v_type == VREG)
 743                         bwillwrite();
 744                 iaresid = 0;
 745                 error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
 746                     ioflg, active_cred, file_cred, &iaresid, td);
 747                 len -= chunk;   /* aresid calc already includes length */
 748                 if (error)
 749                         break;
 750                 offset += chunk;
 751                 base = (char *)base + chunk;
 752                 kern_yield(PRI_USER);
 753         } while (len);
 754         if (aresid)
 755                 *aresid = len + iaresid;
 756         return (error);
 757 }
 758
 759 #if OFF_MAX <= LONG_MAX
     /*
      * Lock the offset of the file and return its current value.
      *
      * This variant is compiled when OFF_MAX <= LONG_MAX and accesses
      * f_offset with atomic_load_long().  With FOF_NOLOCK the offset is
      * returned without taking the lock.  FOF_OFFSET must not be passed
      * (asserted).  The lock is the FOFFSET_LOCKED bit in fp->f_vnread_flags;
      * contended callers sleep on the address of that field.
      */
 760 off_t
 761 foffset_lock(struct file *fp, int flags)
 762 {
 763         volatile short *flagsp;
 764         off_t res;
 765         short state;
 766
 767         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 768
 769         if ((flags & FOF_NOLOCK) != 0)
 770                 return (atomic_load_long(&fp->f_offset));
 771
 772         /*
 773          * According to McKusick the vn lock was protecting f_offset here.
 774          * It is now protected by the FOFFSET_LOCKED flag.
 775          */
 776         flagsp = &fp->f_vnread_flags;
             /* Fast path: nobody holds the lock and nobody is waiting. */
 777         if (atomic_cmpset_acq_16(flagsp, 0, FOFFSET_LOCKED))
 778                 return (atomic_load_long(&fp->f_offset));
 779
             /* Slow path: all further flag changes are made under the sleepq lock. */
 780         sleepq_lock(&fp->f_vnread_flags);
 781         state = atomic_load_16(flagsp);
 782         for (;;) {
                     /*
                      * A failed fcmpset refreshes 'state' with the current
                      * value (see atomic(9)), so "continue" retries without
                      * an explicit reload.
                      */
 783                 if ((state & FOFFSET_LOCKED) == 0) {
 784                         if (!atomic_fcmpset_acq_16(flagsp, &state,
 785                             FOFFSET_LOCKED))
 786                                 continue;
 787                         break;
 788                 }
                     /* Locked by someone else: advertise a waiter before sleeping. */
 789                 if ((state & FOFFSET_LOCK_WAITING) == 0) {
 790                         if (!atomic_fcmpset_acq_16(flagsp, &state,
 791                             state | FOFFSET_LOCK_WAITING))
 792                                 continue;
 793                 }
 794                 DROP_GIANT();
 795                 sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0);
 796                 sleepq_wait(&fp->f_vnread_flags, PUSER -1);
 797                 PICKUP_GIANT();
                     /* Woken up: retake the sleepq lock and re-examine the flags. */
 798                 sleepq_lock(&fp->f_vnread_flags);
 799                 state = atomic_load_16(flagsp);
 800         }
 801         res = atomic_load_long(&fp->f_offset);
 802         sleepq_release(&fp->f_vnread_flags);
 803         return (res);
 804 }
 805
     /*
      * Publish the new file offset and release the offset lock taken by
      * foffset_lock().  Variant built when OFF_MAX <= LONG_MAX.
      *
      * val is stored into f_offset unless FOF_NOUPDATE is set, and into
      * f_nextoff[UIO_READ] / f_nextoff[UIO_WRITE] when FOF_NEXTOFF_R /
      * FOF_NEXTOFF_W are set.  With FOF_NOLOCK nothing is unlocked.
      * FOF_OFFSET must not be passed (asserted).
      */
 806 void
 807 foffset_unlock(struct file *fp, off_t val, int flags)
 808 {
 809         volatile short *flagsp;
 810         short state;
 811
 812         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 813
 814         if ((flags & FOF_NOUPDATE) == 0)
 815                 atomic_store_long(&fp->f_offset, val);
 816         if ((flags & FOF_NEXTOFF_R) != 0)
 817                 fp->f_nextoff[UIO_READ] = val;
 818         if ((flags & FOF_NEXTOFF_W) != 0)
 819                 fp->f_nextoff[UIO_WRITE] = val;
 820
 821         if ((flags & FOF_NOLOCK) != 0)
 822                 return;
 823
 824         flagsp = &fp->f_vnread_flags;
 825         state = atomic_load_16(flagsp);
             /*
              * Fast path: nobody flagged itself as waiting, so try to clear
              * the flags with a single release cmpset.  If the cmpset fails
              * the flags changed under us and the slow path below is taken.
              */
 826         if ((state & FOFFSET_LOCK_WAITING) == 0 &&
 827             atomic_cmpset_rel_16(flagsp, state, 0))
 828                 return;
 829
             /*
              * Slow path: with the sleepqueue lock held both bits are
              * expected to be set; clear them and wake every sleeper on the
              * channel so that they retry in foffset_lock().
              */
 830         sleepq_lock(&fp->f_vnread_flags);
 831         MPASS((fp->f_vnread_flags & FOFFSET_LOCKED) != 0);
 832         MPASS((fp->f_vnread_flags & FOFFSET_LOCK_WAITING) != 0);
 833         fp->f_vnread_flags = 0;
 834         sleepq_broadcast(&fp->f_vnread_flags, SLEEPQ_SLEEP, 0, 0);
 835         sleepq_release(&fp->f_vnread_flags);
 836 }
 837 #else
 838 off_t
 839 foffset_lock(struct file *fp, int flags)
 840 {
 841         struct mtx *mtxp;
 842         off_t res;
 843
 844         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 845
 846         mtxp = mtx_pool_find(mtxpool_sleep, fp);
 847         mtx_lock(mtxp);
 848         if ((flags & FOF_NOLOCK) == 0) {
 849                 while (fp->f_vnread_flags & FOFFSET_LOCKED) {
 850                         fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
 851                         msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
 852                             "vofflock", 0);
 853                 }
 854                 fp->f_vnread_flags |= FOFFSET_LOCKED;
 855         }
 856         res = fp->f_offset;
 857         mtx_unlock(mtxp);
 858         return (res);
 859 }
 860
 861 void
 862 foffset_unlock(struct file *fp, off_t val, int flags)
 863 {
 864         struct mtx *mtxp;
 865
 866         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 867
 868         mtxp = mtx_pool_find(mtxpool_sleep, fp);
 869         mtx_lock(mtxp);
 870         if ((flags & FOF_NOUPDATE) == 0)
 871                 fp->f_offset = val;
 872         if ((flags & FOF_NEXTOFF_R) != 0)
 873                 fp->f_nextoff[UIO_READ] = val;
 874         if ((flags & FOF_NEXTOFF_W) != 0)
 875                 fp->f_nextoff[UIO_WRITE] = val;
 876         if ((flags & FOF_NOLOCK) == 0) {
 877                 KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
 878                     ("Lost FOFFSET_LOCKED"));
 879                 if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
 880                         wakeup(&fp->f_vnread_flags);
 881                 fp->f_vnread_flags = 0;
 882         }
 883         mtx_unlock(mtxp);
 884 }
 885 #endif
 886
 887 void
 888 foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
 889 {
 890
 891         if ((flags & FOF_OFFSET) == 0)
 892                 uio->uio_offset = foffset_lock(fp, flags);
 893 }
 894
 895 void
 896 foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
 897 {
 898
 899         if ((flags & FOF_OFFSET) == 0)
 900                 foffset_unlock(fp, uio->uio_offset, flags);
 901 }
 902
 903 static int
 904 get_advice(struct file *fp, struct uio *uio)
 905 {
 906         struct mtx *mtxp;
 907         int ret;
 908
 909         ret = POSIX_FADV_NORMAL;
 910         if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG)
 911                 return (ret);
 912
 913         mtxp = mtx_pool_find(mtxpool_sleep, fp);
 914         mtx_lock(mtxp);
 915         if (fp->f_advice != NULL &&
 916             uio->uio_offset >= fp->f_advice->fa_start &&
 917             uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
 918                 ret = fp->f_advice->fa_advice;
 919         mtx_unlock(mtxp);
 920         return (ret);
 921 }
 922
 923 static int
 924 get_write_ioflag(struct file *fp)
 925 {
 926         int ioflag;
 927         struct mount *mp;
 928         struct vnode *vp;
 929
 930         ioflag = 0;
 931         vp = fp->f_vnode;
 932         mp = atomic_load_ptr(&vp->v_mount);
 933
 934         if ((fp->f_flag & O_DIRECT) != 0)
 935                 ioflag |= IO_DIRECT;
 936
 937         if ((fp->f_flag & O_FSYNC) != 0 ||
 938             (mp != NULL && (mp->mnt_flag & MNT_SYNCHRONOUS) != 0))
 939                 ioflag |= IO_SYNC;
 940
 941         /*
 942          * For O_DSYNC we set both IO_SYNC and IO_DATASYNC, so that VOP_WRITE()
 943          * or VOP_DEALLOCATE() implementations that don't understand IO_DATASYNC
 944          * fall back to full O_SYNC behavior.
 945          */
 946         if ((fp->f_flag & O_DSYNC) != 0)
 947                 ioflag |= IO_SYNC | IO_DATASYNC;
 948
 949         return (ioflag);
 950 }
 951
     /*
      * Try to satisfy a read from the pages already resident in the VM
      * object of vp.  No vnode lock is taken in this function; pages are
      * shared-busied instead.
      *
      * The request must fit into io_hold_cnt + 2 pages (asserted).
      *
      * Returns 0 only when the whole request was copied (uio_resid reached
      * zero).  EJUSTRETURN is returned when nothing, or only a part, could
      * be served from the object; in the partial case uio has already been
      * advanced past the copied bytes.  Any other value is the error from
      * vn_io_fault_pgmove().  EJUSTRETURN presumably asks the caller to
      * continue with the regular read path, the way vn_read() treats that
      * value from VOP_READ_PGCACHE().
      */
 952 int
 953 vn_read_from_obj(struct vnode *vp, struct uio *uio)
 954 {
 955         vm_object_t obj;
 956         vm_page_t ma[io_hold_cnt + 2];
 957         off_t off, vsz;
 958         ssize_t resid;
 959         int error, i, j;
 960
 961         MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2));
 962         obj = atomic_load_ptr(&vp->v_object);
 963         if (obj == NULL)
 964                 return (EJUSTRETURN);
 965
 966         /*
 967          * Depends on type stability of vm_objects.
 968          */
 969         vm_object_pip_add(obj, 1);
 970         if ((obj->flags & OBJ_DEAD) != 0) {
 971                 /*
 972                  * Note that object might be already reused from the
 973                  * vnode, and the OBJ_DEAD flag cleared.  This is fine,
 974                  * we recheck for DOOMED vnode state after all pages
 975                  * are busied, and retract then.
 976                  *
 977                  * But we check for OBJ_DEAD to ensure that we do not
 978                  * busy pages while vm_object_terminate_pages()
 979                  * processes the queue.
 980                  */
 981                 error = EJUSTRETURN;
 982                 goto out_pip;
 983         }
 984
             /*
              * Shared-busy consecutive resident pages covering the request.
              * Nothing is allocated (VM_ALLOC_NOCREAT) and nothing is waited
              * for (VM_ALLOC_NOWAIT); the walk stops at the first page that
              * is missing or has no valid bits.  On exit i is the number of
              * pages left busied in ma[].
              */
 985         resid = uio->uio_resid;
 986         off = uio->uio_offset;
 987         for (i = 0; resid > 0; i++) {
 988                 MPASS(i < io_hold_cnt + 2);
 989                 ma[i] = vm_page_grab_unlocked(obj, atop(off),
 990                     VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY |
 991                     VM_ALLOC_NOWAIT);
 992                 if (ma[i] == NULL)
 993                         break;
 994
 995                 /*
 996                  * Skip invalid pages.  Valid mask can be partial only
 997                  * at EOF, and we clip later.
 998                  */
 999                 if (vm_page_none_valid(ma[i])) {
1000                         vm_page_sunbusy(ma[i]);
1001                         break;
1002                 }
1003
1004                 resid -= PAGE_SIZE;
1005                 off += PAGE_SIZE;
1006         }
             /* Not even the first page is usable: nothing to unbusy. */
1007         if (i == 0) {
1008                 error = EJUSTRETURN;
1009                 goto out_pip;
1010         }
1011
1012         /*
1013          * Check VIRF_DOOMED after we busied our pages.  Since
1014          * vgonel() terminates the vnode's vm_object, it cannot
1015          * process past pages busied by us.
1016          */
1017         if (VN_IS_DOOMED(vp)) {
1018                 error = EJUSTRETURN;
1019                 goto out;
1020         }
1021
             /*
              * Number of bytes the i busied pages cover, counted from
              * uio_offset: the rest of the first page plus (i - 1) whole
              * pages, capped at the size of the request.
              */
1022         resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1);
1023         if (resid > uio->uio_resid)
1024                 resid = uio->uio_resid;
1025
1026         /*
1027          * Unlocked read of vnp_size is safe because truncation cannot
1028          * pass busied page.  But we load vnp_size into a local
1029          * variable so that possible concurrent extension does not
1030          * break calculation.
1031          */
1032 #if defined(__powerpc__) && !defined(__powerpc64__)
1033         vsz = obj->un_pager.vnp.vnp_size;
1034 #else
1035         vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size);
1036 #endif
             /* Starting at or beyond EOF: leave it to the regular path. */
1037         if (uio->uio_offset >= vsz) {
1038                 error = EJUSTRETURN;
1039                 goto out;
1040         }
             /* Clip the copy at EOF. */
1041         if (uio->uio_offset + resid > vsz)
1042                 resid = vsz - uio->uio_offset;
1043
1044         error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio);
1045
1046 out:
             /*
              * Release the busied pages; they are marked referenced only
              * when the copy succeeded.
              */
1047         for (j = 0; j < i; j++) {
1048                 if (error == 0)
1049                         vm_page_reference(ma[j]);
1050                 vm_page_sunbusy(ma[j]);
1051         }
1052 out_pip:
1053         vm_object_pip_wakeup(obj);
1054         if (error != 0)
1055                 return (error);
             /* Success is reported only for a completely satisfied request. */
1056         return (uio->uio_resid == 0 ? 0 : EJUSTRETURN);
1057 }
1058
1059 /*
1060  * File table vnode read routine.
      *
      * The caller must pass FOF_OFFSET in flags and uio->uio_td must be td
      * (both asserted); the read starts at uio->uio_offset.  FNONBLOCK and
      * O_DIRECT on the file are translated into IO_NDELAY and IO_DIRECT.
      *
      * VOP_READ_PGCACHE() is tried first when it is enabled and applicable.
      * If it returns EJUSTRETURN, or is not attempted, the read is done by
      * VOP_READ() under a shared vnode lock.  f_nextoff[UIO_READ] is set to
      * the resulting uio_offset when VOP_READ_PGCACHE() succeeds and after
      * VOP_READ(); errors from VOP_READ_PGCACHE() other than EJUSTRETURN are
      * returned without updating it.
      *
      * Returns 0 or an errno value.  After a successful POSIX_FADV_NOREUSE
      * read that moved the offset, the result of VOP_ADVISE() is returned.
1061  */
1062 static int
1063 vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
1064     struct thread *td)
1065 {
1066         struct vnode *vp;
1067         off_t orig_offset;
1068         int error, ioflag;
1069         int advice;
1070
1071         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
1072             uio->uio_td, td));
1073         KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
1074         vp = fp->f_vnode;
1075         ioflag = 0;
1076         if (fp->f_flag & FNONBLOCK)
1077                 ioflag |= IO_NDELAY;
1078         if (fp->f_flag & O_DIRECT)
1079                 ioflag |= IO_DIRECT;
1080
1081         /*
1082          * Try to read from page cache.  VIRF_DOOMED check is racy but
1083          * allows us to avoid unneeded work outright.
1084          */
1085         if (vn_io_pgcache_read_enable && !mac_vnode_check_read_enabled() &&
1086             (vn_irflag_read(vp) & (VIRF_DOOMED | VIRF_PGREAD)) == VIRF_PGREAD) {
1087                 error = VOP_READ_PGCACHE(vp, uio, ioflag, fp->f_cred);
1088                 if (error == 0) {
1089                         fp->f_nextoff[UIO_READ] = uio->uio_offset;
1090                         return (0);
1091                 }
                     /* EJUSTRETURN: continue with the locked path below. */
1092                 if (error != EJUSTRETURN)
1093                         return (error);
1094         }
1095
             /* The advice is looked up before the vnode lock is taken. */
1096         advice = get_advice(fp, uio);
1097         vn_lock(vp, LK_SHARED | LK_RETRY);
1098
1099         switch (advice) {
1100         case POSIX_FADV_NORMAL:
1101         case POSIX_FADV_SEQUENTIAL:
1102         case POSIX_FADV_NOREUSE:
1103                 ioflag |= sequential_heuristic(uio, fp);
1104                 break;
1105         case POSIX_FADV_RANDOM:
1106                 /* Disable read-ahead for random I/O. */
1107                 break;
1108         }
             /* Remember where the read began for the NOREUSE handling. */
1109         orig_offset = uio->uio_offset;
1110
             /*
              * With MAC compiled in, the 'if' inside the #ifdef guards the
              * VOP_READ() call that follows the #endif.
              */
1111 #ifdef MAC
1112         error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
1113         if (error == 0)
1114 #endif
1115                 error = VOP_READ(vp, uio, ioflag, fp->f_cred);
1116         fp->f_nextoff[UIO_READ] = uio->uio_offset;
1117         VOP_UNLOCK(vp);
1118         if (error == 0 && advice == POSIX_FADV_NOREUSE &&
1119             orig_offset != uio->uio_offset)
1120                 /*
1121                  * Use POSIX_FADV_DONTNEED to flush pages and buffers
1122                  * for the backing file after a POSIX_FADV_NOREUSE
1123                  * read(2).
1124                  */
1125                 error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
1126                     POSIX_FADV_DONTNEED);
1127         return (error);
1128 }
1129
1130 /*
1131  * File table vnode write routine.
1132  */
1133 static int
1134 vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
1135     struct thread *td)
1136 {
1137         struct vnode *vp;
1138         struct mount *mp;
1139         off_t orig_offset;
1140         int error, ioflag;
1141         int advice;
1142         bool need_finished_write;
1143
1144         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
1145             uio->uio_td, td));
1146         KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
1147         vp = fp->f_vnode;
1148         if (vp->v_type == VREG)
1149                 bwillwrite();
1150         ioflag = IO_UNIT;
1151         if (vp->v_type == VREG && (fp->f_flag & O_APPEND) != 0)
1152                 ioflag |= IO_APPEND;
1153         if ((fp->f_flag & FNONBLOCK) != 0)
1154                 ioflag |= IO_NDELAY;
1155         ioflag |= get_write_ioflag(fp);
1156
1157         mp = NULL;
1158         need_finished_write = false;
1159         if (vp->v_type != VCHR) {
1160                 error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
1161                 if (error != 0)
1162                         goto unlock;
1163                 need_finished_write = true;
1164         }
1165
1166         advice = get_advice(fp, uio);
1167
1168         vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);
1169         switch (advice) {
1170         case POSIX_FADV_NORMAL:
1171         case POSIX_FADV_SEQUENTIAL:
1172         case POSIX_FADV_NOREUSE:
1173                 ioflag |= sequential_heuristic(uio, fp);
1174                 break;
1175         case POSIX_FADV_RANDOM:
1176                 /* XXX: Is this correct? */
1177                 break;
1178         }
1179         orig_offset = uio->uio_offset;
1180
1181 #ifdef MAC
1182         error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
1183         if (error == 0)
1184 #endif
1185                 error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
1186         fp->f_nextoff[UIO_WRITE] = uio->uio_offset;
1187         VOP_UNLOCK(vp);
1188         if (need_finished_write)
1189                 vn_finished_write(mp);
1190         if (error == 0 && advice == POSIX_FADV_NOREUSE &&
1191             orig_offset != uio->uio_offset)
1192                 /*
1193                  * Use POSIX_FADV_DONTNEED to flush pages and buffers
1194                  * for the backing file after a POSIX_FADV_NOREUSE
1195                  * write(2).
1196                  */
1197                 error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
1198                     POSIX_FADV_DONTNEED);
1199 unlock:
1200         return (error);
1201 }
1202
1203 /*
1204  * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
1205  * prevent the following deadlock:
1206  *
1207  * Assume that the thread A reads from the vnode vp1 into userspace
1208  * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
1209  * currently not resident, then the system ends up with the call chain
1210  *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
1211  *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
1212  * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
1213  * If, at the same time, thread B reads from vnode vp2 into buffer buf2
1214  * backed by the pages of vnode vp1, and some page in buf2 is not
1215  * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
1216  *
1217  * To prevent the lock order reversal and deadlock, vn_io_fault() does
1218  * not allow page faults to happen during VOP_READ() or VOP_WRITE().
1219  * Instead, it first tries to do the whole range i/o with pagefaults
1220  * disabled. If all pages in the i/o buffer are resident and mapped,
1221  * VOP will succeed (ignoring the genuine filesystem errors).
1222  * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
1223  * i/o in chunks, with all pages in the chunk prefaulted and held
1224  * using vm_fault_quick_hold_pages().
1225  *
1226  * Filesystems using this deadlock avoidance scheme should use the
1227  * array of the held pages from uio, saved in the curthread->td_ma,
1228  * instead of doing uiomove().  A helper function
1229  * vn_io_fault_uiomove() converts uiomove request into
1230  * uiomove_fromphys() over td_ma array.
1231  *
1232  * Since vnode locks do not cover the whole i/o anymore, rangelocks
1233  * make the current i/o request atomic with respect to other i/os and
1234  * truncations.
1235  */
1236
1237 /*
1238  * Decode vn_io_fault_args and perform the corresponding i/o.
1239  */
1240 static int
1241 vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
1242     struct thread *td)
1243 {
1244         int error, save;
1245
1246         error = 0;
1247         save = vm_fault_disable_pagefaults();
1248         switch (args->kind) {
1249         case VN_IO_FAULT_FOP:
1250                 error = (args->args.fop_args.doio)(args->args.fop_args.fp,
1251                     uio, args->cred, args->flags, td);
1252                 break;
1253         case VN_IO_FAULT_VOP:
1254                 if (uio->uio_rw == UIO_READ) {
1255                         error = VOP_READ(args->args.vop_args.vp, uio,
1256                             args->flags, args->cred);
1257                 } else if (uio->uio_rw == UIO_WRITE) {
1258                         error = VOP_WRITE(args->args.vop_args.vp, uio,
1259                             args->flags, args->cred);
1260                 }
1261                 break;
1262         default:
1263                 panic("vn_io_fault_doio: unknown kind of io %d %d",
1264                     args->kind, uio->uio_rw);
1265         }
1266         vm_fault_enable_pagefaults(save);
1267         return (error);
1268 }
1269
1270 static int
1271 vn_io_fault_touch(char *base, const struct uio *uio)
1272 {
1273         int r;
1274
1275         r = fubyte(base);
1276         if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1))
1277                 return (EFAULT);
1278         return (0);
1279 }
1280
1281 static int
1282 vn_io_fault_prefault_user(const struct uio *uio)
1283 {
1284         char *base;
1285         const struct iovec *iov;
1286         size_t len;
1287         ssize_t resid;
1288         int error, i;
1289
1290         KASSERT(uio->uio_segflg == UIO_USERSPACE,
1291             ("vn_io_fault_prefault userspace"));
1292
1293         error = i = 0;
1294         iov = uio->uio_iov;
1295         resid = uio->uio_resid;
1296         base = iov->iov_base;
1297         len = iov->iov_len;
1298         while (resid > 0) {
1299                 error = vn_io_fault_touch(base, uio);
1300                 if (error != 0)
1301                         break;
1302                 if (len < PAGE_SIZE) {
1303                         if (len != 0) {
1304                                 error = vn_io_fault_touch(base + len - 1, uio);
1305                                 if (error != 0)
1306                                         break;
1307                                 resid -= len;
1308                         }
1309                         if (++i >= uio->uio_iovcnt)
1310                                 break;
1311                         iov = uio->uio_iov + i;
1312                         base = iov->iov_base;
1313                         len = iov->iov_len;
1314                 } else {
1315                         len -= PAGE_SIZE;
1316                         base += PAGE_SIZE;
1317                         resid -= PAGE_SIZE;
1318                 }
1319         }
1320         return (error);
1321 }
1322
1323 /*
1324  * Common code for vn_io_fault(), agnostic to the kind of i/o request.
1325  * Uses vn_io_fault_doio() to make the call to an actual i/o function.
1326  * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
1327  * into args and call vn_io_fault1() to handle faults during the user
1328  * mode buffer accesses.
1329  */
1330 static int
1331 vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
1332     struct thread *td)
1333 {
1334         vm_page_t ma[io_hold_cnt + 2];
1335         struct uio *uio_clone, short_uio;
1336         struct iovec short_iovec[1];
1337         vm_page_t *prev_td_ma;
1338         vm_prot_t prot;
1339         vm_offset_t addr, end;
1340         size_t len, resid;
1341         ssize_t adv;
1342         int error, cnt, saveheld, prev_td_ma_cnt;
1343
1344         if (vn_io_fault_prefault) {
1345                 error = vn_io_fault_prefault_user(uio);
1346                 if (error != 0)
1347                         return (error); /* Or ignore ? */
1348         }
1349
1350         prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
1351
1352         /*
1353          * The UFS follows IO_UNIT directive and replays back both
1354          * uio_offset and uio_resid if an error is encountered during the
1355          * operation.  But, since the iovec may be already advanced,
1356          * uio is still in an inconsistent state.
1357          *
1358          * Cache a copy of the original uio, which is advanced to the redo
1359          * point using UIO_NOCOPY below.
1360          */
1361         uio_clone = cloneuio(uio);
1362         resid = uio->uio_resid;
1363
1364         short_uio.uio_segflg = UIO_USERSPACE;
1365         short_uio.uio_rw = uio->uio_rw;
1366         short_uio.uio_td = uio->uio_td;
1367
1368         error = vn_io_fault_doio(args, uio, td);
1369         if (error != EFAULT)
1370                 goto out;
1371
1372         atomic_add_long(&vn_io_faults_cnt, 1);
1373         uio_clone->uio_segflg = UIO_NOCOPY;
1374         uiomove(NULL, resid - uio->uio_resid, uio_clone);
1375         uio_clone->uio_segflg = uio->uio_segflg;
1376
1377         saveheld = curthread_pflags_set(TDP_UIOHELD);
1378         prev_td_ma = td->td_ma;
1379         prev_td_ma_cnt = td->td_ma_cnt;
1380
1381         while (uio_clone->uio_resid != 0) {
1382                 len = uio_clone->uio_iov->iov_len;
1383                 if (len == 0) {
1384                         KASSERT(uio_clone->uio_iovcnt >= 1,
1385                             ("iovcnt underflow"));
1386                         uio_clone->uio_iov++;
1387                         uio_clone->uio_iovcnt--;
1388                         continue;
1389                 }
1390                 if (len > ptoa(io_hold_cnt))
1391                         len = ptoa(io_hold_cnt);
1392                 addr = (uintptr_t)uio_clone->uio_iov->iov_base;
1393                 end = round_page(addr + len);
1394                 if (end < addr) {
1395                         error = EFAULT;
1396                         break;
1397                 }
1398                 cnt = atop(end - trunc_page(addr));
1399                 /*
1400                  * A perfectly misaligned address and length could cause
1401                  * both the start and the end of the chunk to use partial
1402                  * page.  +2 accounts for such a situation.
1403                  */
1404                 cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
1405                     addr, len, prot, ma, io_hold_cnt + 2);
1406                 if (cnt == -1) {
1407                         error = EFAULT;
1408                         break;
1409                 }
1410                 short_uio.uio_iov = &short_iovec[0];
1411                 short_iovec[0].iov_base = (void *)addr;
1412                 short_uio.uio_iovcnt = 1;
1413                 short_uio.uio_resid = short_iovec[0].iov_len = len;
1414                 short_uio.uio_offset = uio_clone->uio_offset;
1415                 td->td_ma = ma;
1416                 td->td_ma_cnt = cnt;
1417
1418                 error = vn_io_fault_doio(args, &short_uio, td);
1419                 vm_page_unhold_pages(ma, cnt);
1420                 adv = len - short_uio.uio_resid;
1421
1422                 uio_clone->uio_iov->iov_base =
1423                     (char *)uio_clone->uio_iov->iov_base + adv;
1424                 uio_clone->uio_iov->iov_len -= adv;
1425                 uio_clone->uio_resid -= adv;
1426                 uio_clone->uio_offset += adv;
1427
1428                 uio->uio_resid -= adv;
1429                 uio->uio_offset += adv;
1430
1431                 if (error != 0 || adv == 0)
1432                         break;
1433         }
1434         td->td_ma = prev_td_ma;
1435         td->td_ma_cnt = prev_td_ma_cnt;
1436         curthread_pflags_restore(saveheld);
1437 out:
1438         free(uio_clone, M_IOV);
1439         return (error);
1440 }
1441
1442 static int
1443 vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
1444     int flags, struct thread *td)
1445 {
1446         fo_rdwr_t *doio;
1447         struct vnode *vp;
1448         void *rl_cookie;
1449         struct vn_io_fault_args args;
1450         int error;
1451
1452         doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
1453         vp = fp->f_vnode;
1454
1455         /*
1456          * The ability to read(2) on a directory has historically been
1457          * allowed for all users, but this can and has been the source of
1458          * at least one security issue in the past.  As such, it is now hidden
1459          * away behind a sysctl for those that actually need it to use it, and
1460          * restricted to root when it's turned on to make it relatively safe to
1461          * leave on for longer sessions of need.
1462          */
1463         if (vp->v_type == VDIR) {
1464                 KASSERT(uio->uio_rw == UIO_READ,
1465                     ("illegal write attempted on a directory"));
1466                 if (!vfs_allow_read_dir)
1467                         return (EISDIR);
1468                 if ((error = priv_check(td, PRIV_VFS_READ_DIR)) != 0)
1469                         return (EISDIR);
1470         }
1471
1472         foffset_lock_uio(fp, uio, flags);
1473         if (do_vn_io_fault(vp, uio)) {
1474                 args.kind = VN_IO_FAULT_FOP;
1475                 args.args.fop_args.fp = fp;
1476                 args.args.fop_args.doio = doio;
1477                 args.cred = active_cred;
1478                 args.flags = flags | FOF_OFFSET;
1479                 if (uio->uio_rw == UIO_READ) {
1480                         rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
1481                             uio->uio_offset + uio->uio_resid);
1482                 } else if ((fp->f_flag & O_APPEND) != 0 ||
1483                     (flags & FOF_OFFSET) == 0) {
1484                         /* For appenders, punt and lock the whole range. */
1485                         rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1486                 } else {
1487                         rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
1488                             uio->uio_offset + uio->uio_resid);
1489                 }
1490                 error = vn_io_fault1(vp, uio, &args, td);
1491                 vn_rangelock_unlock(vp, rl_cookie);
1492         } else {
1493                 error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
1494         }
1495         foffset_unlock_uio(fp, uio, flags);
1496         return (error);
1497 }
1498
1499 /*
1500  * Helper function to perform the requested uiomove operation using
1501  * the held pages for uio->uio_iov[0].iov_base buffer instead of
1502  * copyin/copyout.  Access to the pages with uiomove_fromphys()
1503  * instead of iov_base prevents page faults that could occur due to
1504  * pmap_collect() invalidating the mapping created by
1505  * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
1506  * object cleanup revoking the write access from page mappings.
1507  *
1508  * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
1509  * instead of plain uiomove().
      *
      * data is the kernel buffer and xfersize the number of bytes to move;
      * xfersize is clipped to uio->uio_resid.  When the thread is not in
      * held-pages mode (TDP_UIOHELD clear) or uio is not a userspace uio,
      * this is exactly uiomove().  Otherwise uio must have a single iovec
      * (asserted); uio, td_ma and td_ma_cnt are advanced by the amount
      * actually moved and the error from uiomove_fromphys() is returned.
1510  */
1511 int
1512 vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
1513 {
1514         struct uio transp_uio;
1515         struct iovec transp_iov[1];
1516         struct thread *td;
1517         size_t adv;
1518         int error, pgadv;
1519
1520         td = curthread;
1521         if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1522             uio->uio_segflg != UIO_USERSPACE)
1523                 return (uiomove(data, xfersize, uio));
1524
             /*
              * Build a kernel-space uio over 'data'; the held user pages
              * are supplied separately as the page array td->td_ma.
              */
1525         KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1526         transp_iov[0].iov_base = data;
1527         transp_uio.uio_iov = &transp_iov[0];
1528         transp_uio.uio_iovcnt = 1;
1529         if (xfersize > uio->uio_resid)
1530                 xfersize = uio->uio_resid;
1531         transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
1532         transp_uio.uio_offset = 0;
1533         transp_uio.uio_segflg = UIO_SYSSPACE;
1534         /*
1535          * Since transp_iov points to data, and td_ma page array
1536          * corresponds to original uio->uio_iov, we need to invert the
1537          * direction of the i/o operation as passed to
1538          * uiomove_fromphys().
1539          */
1540         switch (uio->uio_rw) {
1541         case UIO_WRITE:
1542                 transp_uio.uio_rw = UIO_READ;
1543                 break;
1544         case UIO_READ:
1545                 transp_uio.uio_rw = UIO_WRITE;
1546                 break;
1547         }
1548         transp_uio.uio_td = uio->uio_td;
             /* The copy starts at the in-page offset of the user address. */
1549         error = uiomove_fromphys(td->td_ma,
1550             ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
1551             xfersize, &transp_uio);
             /*
              * adv is the number of bytes actually moved; pgadv is the
              * number of page boundaries the user address crosses when
              * advanced by adv.  td_ma is moved forward by pgadv so that its
              * first entry keeps matching the page of the current iov_base.
              */
1552         adv = xfersize - transp_uio.uio_resid;
1553         pgadv =
1554             (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
1555             (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
1556         td->td_ma += pgadv;
1557         KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1558             pgadv));
1559         td->td_ma_cnt -= pgadv;
1560         uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
1561         uio->uio_iov->iov_len -= adv;
1562         uio->uio_resid -= adv;
1563         uio->uio_offset += adv;
1564         return (error);
1565 }
1566
     /*
      * Page-array counterpart of vn_io_fault_uiomove(): move up to xfersize
      * bytes between the pages in ma[], starting at byte offset 'offset',
      * and the buffer described by uio.
      *
      * When the thread is not in held-pages mode (TDP_UIOHELD clear) or uio
      * is not a userspace uio, this is exactly uiomove_fromphys().
      * Otherwise uio must have a single iovec (asserted), the transfer is
      * clipped to uio->uio_resid and done page-to-page against td->td_ma
      * with pmap_copy_pages(); uio, td_ma and td_ma_cnt are advanced by the
      * full clipped count and 0 is returned.
      */
1567 int
1568 vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
1569     struct uio *uio)
1570 {
1571         struct thread *td;
1572         vm_offset_t iov_base;
1573         int cnt, pgadv;
1574
1575         td = curthread;
1576         if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1577             uio->uio_segflg != UIO_USERSPACE)
1578                 return (uiomove_fromphys(ma, offset, xfersize, uio));
1579
1580         KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1581         cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
1582         iov_base = (vm_offset_t)uio->uio_iov->iov_base;
             /*
              * NOTE(review): the argument order suggests pmap_copy_pages()
              * takes (source pages, source offset, destination pages,
              * destination offset, length), i.e. a write copies from the
              * held user pages into ma[] and a read copies the other way --
              * confirm against the pmap interface.
              */
1583         switch (uio->uio_rw) {
1584         case UIO_WRITE:
1585                 pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
1586                     offset, cnt);
1587                 break;
1588         case UIO_READ:
1589                 pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
1590                     cnt);
1591                 break;
1592         }
             /*
              * Advance td_ma by the number of page boundaries the user
              * address crosses, so that its first entry keeps matching the
              * page of the current iov_base.
              */
1593         pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
1594         td->td_ma += pgadv;
1595         KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1596             pgadv));
1597         td->td_ma_cnt -= pgadv;
1598         uio->uio_iov->iov_base = (char *)(iov_base + cnt);
1599         uio->uio_iov->iov_len -= cnt;
1600         uio->uio_resid -= cnt;
1601         uio->uio_offset += cnt;
1602         return (0);
1603 }
1604
1605 /*
1606  * File table truncate routine.
      *
      * Sets the size of the file behind fp to length.  The operation runs
      * with the whole file range write-locked, inside a vn_start_write() /
      * vn_finished_write() bracket and with the vnode exclusively locked.
      * Directories are rejected with EISDIR.  The size change itself is
      * delegated to vn_truncate_locked(), synchronously if the file was
      * opened with O_FSYNC.
      *
      * If the result is ERELOOKUP, everything is released and the whole
      * sequence is repeated.  Returns 0 or an errno value.
1607  */
1608 static int
1609 vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
1610     struct thread *td)
1611 {
1612         struct mount *mp;
1613         struct vnode *vp;
1614         void *rl_cookie;
1615         int error;
1616
1617         vp = fp->f_vnode;
1618
1619 retry:
1620         /*
1621          * Lock the whole range for truncation.  Otherwise split i/o
1622          * might happen partly before and partly after the truncation.
1623          */
1624         rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1625         error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
             /* Only the rangelock is held at this point. */
1626         if (error)
1627                 goto out1;
1628         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1629         AUDIT_ARG_VNODE1(vp);
1630         if (vp->v_type == VDIR) {
1631                 error = EISDIR;
1632                 goto out;
1633         }
1634 #ifdef MAC
1635         error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
1636         if (error)
1637                 goto out;
1638 #endif
1639         error = vn_truncate_locked(vp, length, (fp->f_flag & O_FSYNC) != 0,
1640             fp->f_cred);
             /* Release in the reverse order of acquisition. */
1641 out:
1642         VOP_UNLOCK(vp);
1643         vn_finished_write(mp);
1644 out1:
1645         vn_rangelock_unlock(vp, rl_cookie);
             /* Start over from scratch when asked to redo the operation. */
1646         if (error == ERELOOKUP)
1647                 goto retry;
1648         return (error);
1649 }
1650
1651 /*
1652  * Truncate a file that is already locked.
1653  */
1654 int
1655 vn_truncate_locked(struct vnode *vp, off_t length, bool sync,
1656     struct ucred *cred)
1657 {
1658         struct vattr vattr;
1659         int error;
1660
1661         error = VOP_ADD_WRITECOUNT(vp, 1);
1662         if (error == 0) {
1663                 VATTR_NULL(&vattr);
1664                 vattr.va_size = length;
1665                 if (sync)
1666                         vattr.va_vaflags |= VA_SYNC;
1667                 error = VOP_SETATTR(vp, &vattr, cred);
1668                 VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
1669         }
1670         return (error);
1671 }
1672
1673 /*
1674  * File table vnode stat routine.
1675  */
1676 int
1677 vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred,
1678     struct thread *td)
1679 {
1680         struct vnode *vp = fp->f_vnode;
1681         int error;
1682
1683         vn_lock(vp, LK_SHARED | LK_RETRY);
1684         error = VOP_STAT(vp, sb, active_cred, fp->f_cred, td);
1685         VOP_UNLOCK(vp);
1686
1687         return (error);
1688 }
1689
1690 /*
1691  * File table vnode ioctl routine.
1692  */
1693 static int
1694 vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
1695     struct thread *td)
1696 {
1697         struct vattr vattr;
1698         struct vnode *vp;
1699         struct fiobmap2_arg *bmarg;
1700         int error;
1701
1702         vp = fp->f_vnode;
1703         switch (vp->v_type) {
1704         case VDIR:
1705         case VREG:
1706                 switch (com) {
1707                 case FIONREAD:
1708                         vn_lock(vp, LK_SHARED | LK_RETRY);
1709                         error = VOP_GETATTR(vp, &vattr, active_cred);
1710                         VOP_UNLOCK(vp);
1711                         if (error == 0)
1712                                 *(int *)data = vattr.va_size - fp->f_offset;
1713                         return (error);
1714                 case FIOBMAP2:
1715                         bmarg = (struct fiobmap2_arg *)data;
1716                         vn_lock(vp, LK_SHARED | LK_RETRY);
1717 #ifdef MAC
1718                         error = mac_vnode_check_read(active_cred, fp->f_cred,
1719                             vp);
1720                         if (error == 0)
1721 #endif
1722                                 error = VOP_BMAP(vp, bmarg->bn, NULL,
1723                                     &bmarg->bn, &bmarg->runp, &bmarg->runb);
1724                         VOP_UNLOCK(vp);
1725                         return (error);
1726                 case FIONBIO:
1727                 case FIOASYNC:
1728                         return (0);
1729                 default:
1730                         return (VOP_IOCTL(vp, com, data, fp->f_flag,
1731                             active_cred, td));
1732                 }
1733                 break;
1734         case VCHR:
1735                 return (VOP_IOCTL(vp, com, data, fp->f_flag,
1736                     active_cred, td));
1737         default:
1738                 return (ENOTTY);
1739         }
1740 }
1741
1742 /*
1743  * File table vnode poll routine.
1744  */
1745 static int
1746 vn_poll(struct file *fp, int events, struct ucred *active_cred,
1747     struct thread *td)
1748 {
1749         struct vnode *vp;
1750         int error;
1751
1752         vp = fp->f_vnode;
1753 #if defined(MAC) || defined(AUDIT)
1754         if (AUDITING_TD(td) || mac_vnode_check_poll_enabled()) {
1755                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1756                 AUDIT_ARG_VNODE1(vp);
1757                 error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
1758                 VOP_UNLOCK(vp);
1759                 if (error != 0)
1760                         return (error);
1761         }
1762 #endif
1763         error = VOP_POLL(vp, events, fp->f_cred, td);
1764         return (error);
1765 }
1766
1767 /*
1768  * Acquire the requested lock and then check for validity.  LK_RETRY
1769  * permits vn_lock to return doomed vnodes.
1770  */
1771 static int __noinline
1772 _vn_lock_fallback(struct vnode *vp, int flags, const char *file, int line,
1773     int error)
1774 {
1775
1776         KASSERT((flags & LK_RETRY) == 0 || error == 0,
1777             ("vn_lock: error %d incompatible with flags %#x", error, flags));
1778
1779         if (error == 0)
1780                 VNASSERT(VN_IS_DOOMED(vp), vp, ("vnode not doomed"));
1781
1782         if ((flags & LK_RETRY) == 0) {
1783                 if (error == 0) {
1784                         VOP_UNLOCK(vp);
1785                         error = ENOENT;
1786                 }
1787                 return (error);
1788         }
1789
1790         /*
1791          * LK_RETRY case.
1792          *
1793          * Nothing to do if we got the lock.
1794          */
1795         if (error == 0)
1796                 return (0);
1797
1798         /*
1799          * Interlock was dropped by the call in _vn_lock.
1800          */
1801         flags &= ~LK_INTERLOCK;
1802         do {
1803                 error = VOP_LOCK1(vp, flags, file, line);
1804         } while (error != 0);
1805         return (0);
1806 }
1807
1808 int
1809 _vn_lock(struct vnode *vp, int flags, const char *file, int line)
1810 {
1811         int error;
1812
1813         VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
1814             ("vn_lock: no locktype (%d passed)", flags));
1815         VNPASS(vp->v_holdcnt > 0, vp);
1816         error = VOP_LOCK1(vp, flags, file, line);
1817         if (__predict_false(error != 0 || VN_IS_DOOMED(vp)))
1818                 return (_vn_lock_fallback(vp, flags, file, line, error));
1819         return (0);
1820 }
1821
1822 /*
1823  * File table vnode close routine.
1824  */
1825 static int
1826 vn_closefile(struct file *fp, struct thread *td)
1827 {
1828         struct vnode *vp;
1829         struct flock lf;
1830         int error;
1831         bool ref;
1832
1833         vp = fp->f_vnode;
1834         fp->f_ops = &badfileops;
1835         ref = (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE;
1836
1837         error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref);
1838
1839         if (__predict_false(ref)) {
1840                 lf.l_whence = SEEK_SET;
1841                 lf.l_start = 0;
1842                 lf.l_len = 0;
1843                 lf.l_type = F_UNLCK;
1844                 (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
1845                 vrele(vp);
1846         }
1847         return (error);
1848 }
1849
1850 /*
1851  * Preparing to start a filesystem write operation. If the operation is
1852  * permitted, then we bump the count of operations in progress and
1853  * proceed. If a suspend request is in progress, we wait until the
1854  * suspension is over, and then proceed.
1855  */
1856 static int
1857 vn_start_write_refed(struct mount *mp, int flags, bool mplocked)
1858 {
1859         struct mount_pcpu *mpcpu;
1860         int error, mflags;
1861
1862         if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 &&
1863             vfs_op_thread_enter(mp, mpcpu)) {
1864                 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
1865                 vfs_mp_count_add_pcpu(mpcpu, writeopcount, 1);
1866                 vfs_op_thread_exit(mp, mpcpu);
1867                 return (0);
1868         }
1869
1870         if (mplocked)
1871                 mtx_assert(MNT_MTX(mp), MA_OWNED);
1872         else
1873                 MNT_ILOCK(mp);
1874
1875         error = 0;
1876
1877         /*
1878          * Check on status of suspension.
1879          */
1880         if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
1881             mp->mnt_susp_owner != curthread) {
1882                 mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ?
1883                     (flags & PCATCH) : 0) | (PUSER - 1);
1884                 while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1885                         if (flags & V_NOWAIT) {
1886                                 error = EWOULDBLOCK;
1887                                 goto unlock;
1888                         }
1889                         error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
1890                             "suspfs", 0);
1891                         if (error)
1892                                 goto unlock;
1893                 }
1894         }
1895         if (flags & V_XSLEEP)
1896                 goto unlock;
1897         mp->mnt_writeopcount++;
1898 unlock:
1899         if (error != 0 || (flags & V_XSLEEP) != 0)
1900                 MNT_REL(mp);
1901         MNT_IUNLOCK(mp);
1902         return (error);
1903 }
1904
1905 int
1906 vn_start_write(struct vnode *vp, struct mount **mpp, int flags)
1907 {
1908         struct mount *mp;
1909         int error;
1910
1911         KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
1912             ("V_MNTREF requires mp"));
1913
1914         error = 0;
1915         /*
1916          * If a vnode is provided, get and return the mount point that
1917          * to which it will write.
1918          */
1919         if (vp != NULL) {
1920                 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1921                         *mpp = NULL;
1922                         if (error != EOPNOTSUPP)
1923                                 return (error);
1924                         return (0);
1925                 }
1926         }
1927         if ((mp = *mpp) == NULL)
1928                 return (0);
1929
1930         /*
1931          * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1932          * a vfs_ref().
1933          * As long as a vnode is not provided we need to acquire a
1934          * refcount for the provided mountpoint too, in order to
1935          * emulate a vfs_ref().
1936          */
1937         if (vp == NULL && (flags & V_MNTREF) == 0)
1938                 vfs_ref(mp);
1939
1940         return (vn_start_write_refed(mp, flags, false));
1941 }
1942
1943 /*
1944  * Secondary suspension. Used by operations such as vop_inactive
1945  * routines that are needed by the higher level functions. These
1946  * are allowed to proceed until all the higher level functions have
1947  * completed (indicated by mnt_writeopcount dropping to zero). At that
1948  * time, these operations are halted until the suspension is over.
1949  */
1950 int
1951 vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags)
1952 {
1953         struct mount *mp;
1954         int error;
1955
1956         KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
1957             ("V_MNTREF requires mp"));
1958
1959  retry:
1960         if (vp != NULL) {
1961                 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1962                         *mpp = NULL;
1963                         if (error != EOPNOTSUPP)
1964                                 return (error);
1965                         return (0);
1966                 }
1967         }
1968         /*
1969          * If we are not suspended or have not yet reached suspended
1970          * mode, then let the operation proceed.
1971          */
1972         if ((mp = *mpp) == NULL)
1973                 return (0);
1974
1975         /*
1976          * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1977          * a vfs_ref().
1978          * As long as a vnode is not provided we need to acquire a
1979          * refcount for the provided mountpoint too, in order to
1980          * emulate a vfs_ref().
1981          */
1982         MNT_ILOCK(mp);
1983         if (vp == NULL && (flags & V_MNTREF) == 0)
1984                 MNT_REF(mp);
1985         if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
1986                 mp->mnt_secondary_writes++;
1987                 mp->mnt_secondary_accwrites++;
1988                 MNT_IUNLOCK(mp);
1989                 return (0);
1990         }
1991         if (flags & V_NOWAIT) {
1992                 MNT_REL(mp);
1993                 MNT_IUNLOCK(mp);
1994                 return (EWOULDBLOCK);
1995         }
1996         /*
1997          * Wait for the suspension to finish.
1998          */
1999         error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP |
2000             ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0),
2001             "suspfs", 0);
2002         vfs_rel(mp);
2003         if (error == 0)
2004                 goto retry;
2005         return (error);
2006 }
2007
2008 /*
2009  * Filesystem write operation has completed. If we are suspending and this
2010  * operation is the last one, notify the suspender that the suspension is
2011  * now in effect.
2012  */
2013 void
2014 vn_finished_write(struct mount *mp)
2015 {
2016         struct mount_pcpu *mpcpu;
2017         int c;
2018
2019         if (mp == NULL)
2020                 return;
2021
2022         if (vfs_op_thread_enter(mp, mpcpu)) {
2023                 vfs_mp_count_sub_pcpu(mpcpu, writeopcount, 1);
2024                 vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
2025                 vfs_op_thread_exit(mp, mpcpu);
2026                 return;
2027         }
2028
2029         MNT_ILOCK(mp);
2030         vfs_assert_mount_counters(mp);
2031         MNT_REL(mp);
2032         c = --mp->mnt_writeopcount;
2033         if (mp->mnt_vfs_ops == 0) {
2034                 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
2035                 MNT_IUNLOCK(mp);
2036                 return;
2037         }
2038         if (c < 0)
2039                 vfs_dump_mount_counters(mp);
2040         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0)
2041                 wakeup(&mp->mnt_writeopcount);
2042         MNT_IUNLOCK(mp);
2043 }
2044
2045 /*
2046  * Filesystem secondary write operation has completed. If we are
2047  * suspending and this operation is the last one, notify the suspender
2048  * that the suspension is now in effect.
2049  */
2050 void
2051 vn_finished_secondary_write(struct mount *mp)
2052 {
2053         if (mp == NULL)
2054                 return;
2055         MNT_ILOCK(mp);
2056         MNT_REL(mp);
2057         mp->mnt_secondary_writes--;
2058         if (mp->mnt_secondary_writes < 0)
2059                 panic("vn_finished_secondary_write: neg cnt");
2060         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
2061             mp->mnt_secondary_writes <= 0)
2062                 wakeup(&mp->mnt_secondary_writes);
2063         MNT_IUNLOCK(mp);
2064 }
2065
2066 /*
2067  * Request a filesystem to suspend write operations.
2068  */
2069 int
2070 vfs_write_suspend(struct mount *mp, int flags)
2071 {
2072         int error;
2073
2074         vfs_op_enter(mp);
2075
2076         MNT_ILOCK(mp);
2077         vfs_assert_mount_counters(mp);
2078         if (mp->mnt_susp_owner == curthread) {
2079                 vfs_op_exit_locked(mp);
2080                 MNT_IUNLOCK(mp);
2081                 return (EALREADY);
2082         }
2083         while (mp->mnt_kern_flag & MNTK_SUSPEND)
2084                 msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
2085
2086         /*
2087          * Unmount holds a write reference on the mount point.  If we
2088          * own busy reference and drain for writers, we deadlock with
2089          * the reference draining in the unmount path.  Callers of
2090          * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
2091          * vfs_busy() reference is owned and caller is not in the
2092          * unmount context.
2093          */
2094         if ((flags & VS_SKIP_UNMOUNT) != 0 &&
2095             (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
2096                 vfs_op_exit_locked(mp);
2097                 MNT_IUNLOCK(mp);
2098                 return (EBUSY);
2099         }
2100
2101         mp->mnt_kern_flag |= MNTK_SUSPEND;
2102         mp->mnt_susp_owner = curthread;
2103         if (mp->mnt_writeopcount > 0)
2104                 (void) msleep(&mp->mnt_writeopcount,
2105                     MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
2106         else
2107                 MNT_IUNLOCK(mp);
2108         if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) {
2109                 vfs_write_resume(mp, 0);
2110                 /* vfs_write_resume does vfs_op_exit() for us */
2111         }
2112         return (error);
2113 }
2114
2115 /*
2116  * Request a filesystem to resume write operations.
2117  */
2118 void
2119 vfs_write_resume(struct mount *mp, int flags)
2120 {
2121
2122         MNT_ILOCK(mp);
2123         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
2124                 KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
2125                 mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
2126                                        MNTK_SUSPENDED);
2127                 mp->mnt_susp_owner = NULL;
2128                 wakeup(&mp->mnt_writeopcount);
2129                 wakeup(&mp->mnt_flag);
2130                 curthread->td_pflags &= ~TDP_IGNSUSP;
2131                 if ((flags & VR_START_WRITE) != 0) {
2132                         MNT_REF(mp);
2133                         mp->mnt_writeopcount++;
2134                 }
2135                 MNT_IUNLOCK(mp);
2136                 if ((flags & VR_NO_SUSPCLR) == 0)
2137                         VFS_SUSP_CLEAN(mp);
2138                 vfs_op_exit(mp);
2139         } else if ((flags & VR_START_WRITE) != 0) {
2140                 MNT_REF(mp);
2141                 vn_start_write_refed(mp, 0, true);
2142         } else {
2143                 MNT_IUNLOCK(mp);
2144         }
2145 }
2146
2147 /*
2148  * Helper loop around vfs_write_suspend() for filesystem unmount VFS
2149  * methods.
2150  */
2151 int
2152 vfs_write_suspend_umnt(struct mount *mp)
2153 {
2154         int error;
2155
2156         KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
2157             ("vfs_write_suspend_umnt: recursed"));
2158
2159         /* dounmount() already called vn_start_write(). */
2160         for (;;) {
2161                 vn_finished_write(mp);
2162                 error = vfs_write_suspend(mp, 0);
2163                 if (error != 0) {
2164                         vn_start_write(NULL, &mp, V_WAIT);
2165                         return (error);
2166                 }
2167                 MNT_ILOCK(mp);
2168                 if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
2169                         break;
2170                 MNT_IUNLOCK(mp);
2171                 vn_start_write(NULL, &mp, V_WAIT);
2172         }
2173         mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
2174         wakeup(&mp->mnt_flag);
2175         MNT_IUNLOCK(mp);
2176         curthread->td_pflags |= TDP_IGNSUSP;
2177         return (0);
2178 }
2179
2180 /*
2181  * Implement kqueues for files by translating it to vnode operation.
2182  */
2183 static int
2184 vn_kqfilter(struct file *fp, struct knote *kn)
2185 {
2186
2187         return (VOP_KQFILTER(fp->f_vnode, kn));
2188 }
2189
2190 int
2191 vn_kqfilter_opath(struct file *fp, struct knote *kn)
2192 {
2193         if ((fp->f_flag & FKQALLOWED) == 0)
2194                 return (EBADF);
2195         return (vn_kqfilter(fp, kn));
2196 }
2197
2198 /*
2199  * Simplified in-kernel wrapper calls for extended attribute access.
2200  * Both calls pass in a NULL credential, authorizing as "kernel" access.
2201  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
2202  */
2203 int
2204 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
2205     const char *attrname, int *buflen, char *buf, struct thread *td)
2206 {
2207         struct uio      auio;
2208         struct iovec    iov;
2209         int     error;
2210
2211         iov.iov_len = *buflen;
2212         iov.iov_base = buf;
2213
2214         auio.uio_iov = &iov;
2215         auio.uio_iovcnt = 1;
2216         auio.uio_rw = UIO_READ;
2217         auio.uio_segflg = UIO_SYSSPACE;
2218         auio.uio_td = td;
2219         auio.uio_offset = 0;
2220         auio.uio_resid = *buflen;
2221
2222         if ((ioflg & IO_NODELOCKED) == 0)
2223                 vn_lock(vp, LK_SHARED | LK_RETRY);
2224
2225         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2226
2227         /* authorize attribute retrieval as kernel */
2228         error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
2229             td);
2230
2231         if ((ioflg & IO_NODELOCKED) == 0)
2232                 VOP_UNLOCK(vp);
2233
2234         if (error == 0) {
2235                 *buflen = *buflen - auio.uio_resid;
2236         }
2237
2238         return (error);
2239 }
2240
2241 /*
2242  * XXX failure mode if partially written?
2243  */
2244 int
2245 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
2246     const char *attrname, int buflen, char *buf, struct thread *td)
2247 {
2248         struct uio      auio;
2249         struct iovec    iov;
2250         struct mount    *mp;
2251         int     error;
2252
2253         iov.iov_len = buflen;
2254         iov.iov_base = buf;
2255
2256         auio.uio_iov = &iov;
2257         auio.uio_iovcnt = 1;
2258         auio.uio_rw = UIO_WRITE;
2259         auio.uio_segflg = UIO_SYSSPACE;
2260         auio.uio_td = td;
2261         auio.uio_offset = 0;
2262         auio.uio_resid = buflen;
2263
2264         if ((ioflg & IO_NODELOCKED) == 0) {
2265                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
2266                         return (error);
2267                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2268         }
2269
2270         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2271
2272         /* authorize attribute setting as kernel */
2273         error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
2274
2275         if ((ioflg & IO_NODELOCKED) == 0) {
2276                 vn_finished_write(mp);
2277                 VOP_UNLOCK(vp);
2278         }
2279
2280         return (error);
2281 }
2282
2283 int
2284 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
2285     const char *attrname, struct thread *td)
2286 {
2287         struct mount    *mp;
2288         int     error;
2289
2290         if ((ioflg & IO_NODELOCKED) == 0) {
2291                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
2292                         return (error);
2293                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2294         }
2295
2296         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2297
2298         /* authorize attribute removal as kernel */
2299         error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
2300         if (error == EOPNOTSUPP)
2301                 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
2302                     NULL, td);
2303
2304         if ((ioflg & IO_NODELOCKED) == 0) {
2305                 vn_finished_write(mp);
2306                 VOP_UNLOCK(vp);
2307         }
2308
2309         return (error);
2310 }
2311
2312 static int
2313 vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags,
2314     struct vnode **rvp)
2315 {
2316
2317         return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp));
2318 }
2319
2320 int
2321 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
2322 {
2323
2324         return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino,
2325             lkflags, rvp));
2326 }
2327
2328 int
2329 vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg,
2330     int lkflags, struct vnode **rvp)
2331 {
2332         struct mount *mp;
2333         int ltype, error;
2334
2335         ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get");
2336         mp = vp->v_mount;
2337         ltype = VOP_ISLOCKED(vp);
2338         KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
2339             ("vn_vget_ino: vp not locked"));
2340         error = vfs_busy(mp, MBF_NOWAIT);
2341         if (error != 0) {
2342                 vfs_ref(mp);
2343                 VOP_UNLOCK(vp);
2344                 error = vfs_busy(mp, 0);
2345                 vn_lock(vp, ltype | LK_RETRY);
2346                 vfs_rel(mp);
2347                 if (error != 0)
2348                         return (ENOENT);
2349                 if (VN_IS_DOOMED(vp)) {
2350                         vfs_unbusy(mp);
2351                         return (ENOENT);
2352                 }
2353         }
2354         VOP_UNLOCK(vp);
2355         error = alloc(mp, alloc_arg, lkflags, rvp);
2356         vfs_unbusy(mp);
2357         if (error != 0 || *rvp != vp)
2358                 vn_lock(vp, ltype | LK_RETRY);
2359         if (VN_IS_DOOMED(vp)) {
2360                 if (error == 0) {
2361                         if (*rvp == vp)
2362                                 vunref(vp);
2363                         else
2364                                 vput(*rvp);
2365                 }
2366                 error = ENOENT;
2367         }
2368         return (error);
2369 }
2370
2371 int
2372 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
2373     struct thread *td)
2374 {
2375         off_t lim;
2376         bool ktr_write;
2377
2378         if (td == NULL)
2379                 return (0);
2380
2381         /*
2382          * There are conditions where the limit is to be ignored.
2383          * However, since it is almost never reached, check it first.
2384          */
2385         ktr_write = (td->td_pflags & TDP_INKTRACE) != 0;
2386         lim = lim_cur(td, RLIMIT_FSIZE);
2387         if (__predict_false(ktr_write))
2388                 lim = td->td_ktr_io_lim;
2389         if (__predict_true((uoff_t)uio->uio_offset + uio->uio_resid <= lim))
2390                 return (0);
2391
2392         /*
2393          * The limit is reached.
2394          */
2395         if (vp->v_type != VREG ||
2396             (td->td_pflags2 & TDP2_ACCT) != 0)
2397                 return (0);
2398
2399         if (!ktr_write || ktr_filesize_limit_signal) {
2400                 PROC_LOCK(td->td_proc);
2401                 kern_psignal(td->td_proc, SIGXFSZ);
2402                 PROC_UNLOCK(td->td_proc);
2403         }
2404         return (EFBIG);
2405 }
2406
2407 int
2408 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
2409     struct thread *td)
2410 {
2411         struct vnode *vp;
2412
2413         vp = fp->f_vnode;
2414 #ifdef AUDIT
2415         vn_lock(vp, LK_SHARED | LK_RETRY);
2416         AUDIT_ARG_VNODE1(vp);
2417         VOP_UNLOCK(vp);
2418 #endif
2419         return (setfmode(td, active_cred, vp, mode));
2420 }
2421
2422 int
2423 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
2424     struct thread *td)
2425 {
2426         struct vnode *vp;
2427
2428         vp = fp->f_vnode;
2429 #ifdef AUDIT
2430         vn_lock(vp, LK_SHARED | LK_RETRY);
2431         AUDIT_ARG_VNODE1(vp);
2432         VOP_UNLOCK(vp);
2433 #endif
2434         return (setfown(td, active_cred, vp, uid, gid));
2435 }
2436
2437 void
2438 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
2439 {
2440         vm_object_t object;
2441
2442         if ((object = vp->v_object) == NULL)
2443                 return;
2444         VM_OBJECT_WLOCK(object);
2445         vm_object_page_remove(object, start, end, 0);
2446         VM_OBJECT_WUNLOCK(object);
2447 }
2448
2449 int
2450 vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off,
2451     struct ucred *cred)
2452 {
2453         struct vattr va;
2454         daddr_t bn, bnp;
2455         uint64_t bsize;
2456         off_t noff;
2457         int error;
2458
2459         KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
2460             ("%s: Wrong command %lu", __func__, cmd));
2461         ASSERT_VOP_LOCKED(vp, "vn_bmap_seekhole_locked");
2462
2463         if (vp->v_type != VREG) {
2464                 error = ENOTTY;
2465                 goto out;
2466         }
2467         error = VOP_GETATTR(vp, &va, cred);
2468         if (error != 0)
2469                 goto out;
2470         noff = *off;
2471         if (noff >= va.va_size) {
2472                 error = ENXIO;
2473                 goto out;
2474         }
2475         bsize = vp->v_mount->mnt_stat.f_iosize;
2476         for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize -
2477             noff % bsize) {
2478                 error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
2479                 if (error == EOPNOTSUPP) {
2480                         error = ENOTTY;
2481                         goto out;
2482                 }
2483                 if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
2484                     (bnp != -1 && cmd == FIOSEEKDATA)) {
2485                         noff = bn * bsize;
2486                         if (noff < *off)
2487                                 noff = *off;
2488                         goto out;
2489                 }
2490         }
2491         if (noff > va.va_size)
2492                 noff = va.va_size;
2493         /* noff == va.va_size. There is an implicit hole at the end of file. */
2494         if (cmd == FIOSEEKDATA)
2495                 error = ENXIO;
2496 out:
2497         if (error == 0)
2498                 *off = noff;
2499         return (error);
2500 }
2501
2502 int
2503 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
2504 {
2505         int error;
2506
2507         KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
2508             ("%s: Wrong command %lu", __func__, cmd));
2509
2510         if (vn_lock(vp, LK_SHARED) != 0)
2511                 return (EBADF);
2512         error = vn_bmap_seekhole_locked(vp, cmd, off, cred);
2513         VOP_UNLOCK(vp);
2514         return (error);
2515 }
2516
2517 int
2518 vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
2519 {
2520         struct ucred *cred;
2521         struct vnode *vp;
2522         struct vattr vattr;
2523         off_t foffset, size;
2524         int error, noneg;
2525
2526         cred = td->td_ucred;
2527         vp = fp->f_vnode;
2528         foffset = foffset_lock(fp, 0);
2529         noneg = (vp->v_type != VCHR);
2530         error = 0;
2531         switch (whence) {
2532         case L_INCR:
2533                 if (noneg &&
2534                     (foffset < 0 ||
2535                     (offset > 0 && foffset > OFF_MAX - offset))) {
2536                         error = EOVERFLOW;
2537                         break;
2538                 }
2539                 offset += foffset;
2540                 break;
2541         case L_XTND:
2542                 vn_lock(vp, LK_SHARED | LK_RETRY);
2543                 error = VOP_GETATTR(vp, &vattr, cred);
2544                 VOP_UNLOCK(vp);
2545                 if (error)
2546                         break;
2547
2548                 /*
2549                  * If the file references a disk device, then fetch
2550                  * the media size and use that to determine the ending
2551                  * offset.
2552                  */
2553                 if (vattr.va_size == 0 && vp->v_type == VCHR &&
2554                     fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
2555                         vattr.va_size = size;
2556                 if (noneg &&
2557                     (vattr.va_size > OFF_MAX ||
2558                     (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
2559                         error = EOVERFLOW;
2560                         break;
2561                 }
2562                 offset += vattr.va_size;
2563                 break;
2564         case L_SET:
2565                 break;
2566         case SEEK_DATA:
2567                 error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
2568                 if (error == ENOTTY)
2569                         error = EINVAL;
2570                 break;
2571         case SEEK_HOLE:
2572                 error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
2573                 if (error == ENOTTY)
2574                         error = EINVAL;
2575                 break;
2576         default:
2577                 error = EINVAL;
2578         }
2579         if (error == 0 && noneg && offset < 0)
2580                 error = EINVAL;
2581         if (error != 0)
2582                 goto drop;
2583         VFS_KNOTE_UNLOCKED(vp, 0);
2584         td->td_uretoff.tdu_off = offset;
2585 drop:
2586         foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
2587         return (error);
2588 }
2589
2590 int
2591 vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred,
2592     struct thread *td)
2593 {
2594         int error;
2595
2596         /*
2597          * Grant permission if the caller is the owner of the file, or
2598          * the super-user, or has ACL_WRITE_ATTRIBUTES permission on
2599          * on the file.  If the time pointer is null, then write
2600          * permission on the file is also sufficient.
2601          *
2602          * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
2603          * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
2604          * will be allowed to set the times [..] to the current
2605          * server time.
2606          */
2607         error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td);
2608         if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0)
2609                 error = VOP_ACCESS(vp, VWRITE, cred, td);
2610         return (error);
2611 }
2612
2613 int
2614 vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
2615 {
2616         struct vnode *vp;
2617         int error;
2618
2619         if (fp->f_type == DTYPE_FIFO)
2620                 kif->kf_type = KF_TYPE_FIFO;
2621         else
2622                 kif->kf_type = KF_TYPE_VNODE;
2623         vp = fp->f_vnode;
2624         vref(vp);
2625         FILEDESC_SUNLOCK(fdp);
2626         error = vn_fill_kinfo_vnode(vp, kif);
2627         vrele(vp);
2628         FILEDESC_SLOCK(fdp);
2629         return (error);
2630 }
2631
2632 static inline void
2633 vn_fill_junk(struct kinfo_file *kif)
2634 {
2635         size_t len, olen;
2636
2637         /*
2638          * Simulate vn_fullpath returning changing values for a given
2639          * vp during e.g. coredump.
2640          */
2641         len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1;
2642         olen = strlen(kif->kf_path);
2643         if (len < olen)
2644                 strcpy(&kif->kf_path[len - 1], "$");
2645         else
2646                 for (; olen < len; olen++)
2647                         strcpy(&kif->kf_path[olen], "A");
2648 }
2649
2650 int
2651 vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif)
2652 {
2653         struct vattr va;
2654         char *fullpath, *freepath;
2655         int error;
2656
2657         kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type);
2658         freepath = NULL;
2659         fullpath = "-";
2660         error = vn_fullpath(vp, &fullpath, &freepath);
2661         if (error == 0) {
2662                 strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
2663         }
2664         if (freepath != NULL)
2665                 free(freepath, M_TEMP);
2666
2667         KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path,
2668                 vn_fill_junk(kif);
2669         );
2670
2671         /*
2672          * Retrieve vnode attributes.
2673          */
2674         va.va_fsid = VNOVAL;
2675         va.va_rdev = NODEV;
2676         vn_lock(vp, LK_SHARED | LK_RETRY);
2677         error = VOP_GETATTR(vp, &va, curthread->td_ucred);
2678         VOP_UNLOCK(vp);
2679         if (error != 0)
2680                 return (error);
2681         if (va.va_fsid != VNOVAL)
2682                 kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
2683         else
2684                 kif->kf_un.kf_file.kf_file_fsid =
2685                     vp->v_mount->mnt_stat.f_fsid.val[0];
2686         kif->kf_un.kf_file.kf_file_fsid_freebsd11 =
2687             kif->kf_un.kf_file.kf_file_fsid; /* truncate */
2688         kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
2689         kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
2690         kif->kf_un.kf_file.kf_file_size = va.va_size;
2691         kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
2692         kif->kf_un.kf_file.kf_file_rdev_freebsd11 =
2693             kif->kf_un.kf_file.kf_file_rdev; /* truncate */
2694         return (0);
2695 }
2696
/*
 * Map the vnode behind fp into the address space described by map.
 *
 * fp           open file being mapped; its f_flag (FREAD/FWRITE) bounds
 *              the maximum protection.
 * map, addr    target map and in/out mapping address, passed on to
 *              vm_mmap_object().
 * size, foff   length and file offset of the mapping; rejected with
 *              EINVAL when foff + size would exceed OFF_MAX.
 * prot         requested protection.
 * cap_maxprot  additional upper bound on the maximum protection, masked
 *              into the file-derived maximum.
 * flags        mmap flags (MAP_SHARED etc.).
 * td           calling thread.
 *
 * Returns 0 on success; EACCES when the requested protection is not
 * permitted by the mount (MNT_NOEXEC) or by the file's open mode;
 * EINVAL for an out-of-range offset; or an error from vm_mmap_vnode()
 * or vm_mmap_object().
 */
int
vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
    vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
    struct thread *td)
{
#ifdef HWPMC_HOOKS
        struct pmckern_map_in pkm;
#endif
        struct mount *mp;
        struct vnode *vp;
        vm_object_t object;
        vm_prot_t maxprot;
        boolean_t writecounted;
        int error;

#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
        /*
         * POSIX shared-memory objects are defined to have
         * kernel persistence, and are not defined to support
         * read(2)/write(2) -- or even open(2).  Thus, we can
         * use MAP_NOSYNC to trade on-disk coherence for speed.
         * The shm_open(3) library routine turns on the FPOSIXSHM
         * flag to request this behavior.
         */
        if ((fp->f_flag & FPOSIXSHM) != 0)
                flags |= MAP_NOSYNC;
#endif
        vp = fp->f_vnode;

        /*
         * Ensure that file and memory protections are
         * compatible.  Note that we only worry about
         * writability if mapping is shared; in this case,
         * current and max prot are dictated by the open file.
         * XXX use the vnode instead?  Problem is: what
         * credentials do we use for determination? What if
         * proc does a setuid?
         */
        mp = vp->v_mount;
        if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
                /* No execute permission is ever granted on a noexec mount. */
                maxprot = VM_PROT_NONE;
                if ((prot & VM_PROT_EXECUTE) != 0)
                        return (EACCES);
        } else
                maxprot = VM_PROT_EXECUTE;
        if ((fp->f_flag & FREAD) != 0)
                maxprot |= VM_PROT_READ;
        else if ((prot & VM_PROT_READ) != 0)
                return (EACCES);

        /*
         * If we are sharing potential changes via MAP_SHARED and we
         * are trying to get write permission although we opened it
         * without asking for it, bail out.
         */
        if ((flags & MAP_SHARED) != 0) {
                if ((fp->f_flag & FWRITE) != 0)
                        maxprot |= VM_PROT_WRITE;
                else if ((prot & VM_PROT_WRITE) != 0)
                        return (EACCES);
        } else {
                /*
                 * Not MAP_SHARED: write access is always allowed in the
                 * maximum protection, regardless of the file's open mode
                 * or of cap_maxprot.
                 */
                maxprot |= VM_PROT_WRITE;
                cap_maxprot |= VM_PROT_WRITE;
        }
        maxprot &= cap_maxprot;

        /*
         * For regular files and shared memory, POSIX requires that
         * the value of foff be a legitimate offset within the data
         * object.  In particular, negative offsets are invalid.
         * Blocking negative offsets and overflows here avoids
         * possible wraparound or user-level access into reserved
         * ranges of the data object later.  In contrast, POSIX does
         * not dictate how offsets are used by device drivers, so in
         * the case of a device mapping a negative offset is passed
         * on.
         */
        if (
#ifdef _LP64
            size > OFF_MAX ||
#endif
            foff > OFF_MAX - size)
                return (EINVAL);

        /*
         * vm_mmap_vnode() receives maxprot, flags and foff by reference
         * and hands back the object to map; writecounted is set by it
         * and consulted below when undoing a failed mapping.
         */
        writecounted = FALSE;
        error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp,
            &foff, &object, &writecounted);
        if (error != 0)
                return (error);
        error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
            foff, writecounted, td);
        if (error != 0) {
                /*
                 * If this mapping was accounted for in the vnode's
                 * writecount, then undo that now.
                 */
                if (writecounted)
                        vm_pager_release_writecount(object, 0, size);
                vm_object_deallocate(object);
        }
#ifdef HWPMC_HOOKS
        /* Inform hwpmc(4) if an executable is being mapped. */
        if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) {
                if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) {
                        pkm.pm_file = vp;
                        pkm.pm_address = (uintptr_t) *addr;
                        PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm);
                }
        }
#endif
        return (error);
}
2810
2811 void
2812 vn_fsid(struct vnode *vp, struct vattr *va)
2813 {
2814         fsid_t *f;
2815
2816         f = &vp->v_mount->mnt_stat.f_fsid;
2817         va->va_fsid = (uint32_t)f->val[1];
2818         va->va_fsid <<= sizeof(f->val[1]) * NBBY;
2819         va->va_fsid += (uint32_t)f->val[0];
2820 }
2821
/*
 * Write out the dirty buffers queued on vp's buffer object.
 *
 * vp       vnode whose v_bufobj dirty list is flushed.
 * waitfor  when MNT_WAIT, buffers that are currently locked are waited
 *          for, outstanding writes are waited for with bufobj_wwait(),
 *          and the whole pass is repeated while dirty buffers remain.
 *          For any other value, locked buffers are skipped and a single
 *          pass is made.
 *
 * Returns 0 on success.  In the MNT_WAIT case, if dirty buffers remain
 * and the function gives up, returns the b_error of a dirty buffer, or
 * EAGAIN when no buffer recorded an error but the retry budget
 * (maxretry) was exhausted; a diagnostic is printed via vn_printf().
 */
int
vn_fsync_buf(struct vnode *vp, int waitfor)
{
        struct buf *bp, *nbp;
        struct bufobj *bo;
        struct mount *mp;
        int error, maxretry;

        error = 0;
        maxretry = 10000;     /* large, arbitrarily chosen */
        mp = NULL;
        /*
         * For a character device, remember the mount point recorded in
         * its cdev (read under the vnode interlock); it is consulted
         * below for pending secondary writes.
         */
        if (vp->v_type == VCHR) {
                VI_LOCK(vp);
                mp = vp->v_rdev->si_mountpt;
                VI_UNLOCK(vp);
        }
        bo = &vp->v_bufobj;
        BO_LOCK(bo);
loop1:
        /*
         * MARK/SCAN initialization to avoid infinite loops.
         */
        TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
                bp->b_vflags &= ~BV_SCANNED;
                bp->b_error = 0;
        }

        /*
         * Flush all dirty buffers associated with a vnode.
         */
loop2:
        TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
                /* Visit each buffer at most once per MARK/SCAN pass. */
                if ((bp->b_vflags & BV_SCANNED) != 0)
                        continue;
                bp->b_vflags |= BV_SCANNED;
                if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
                        /* Buffer is busy: skip it unless we must wait. */
                        if (waitfor != MNT_WAIT)
                                continue;
                        /*
                         * Block for the buffer lock, passing the bufobj
                         * lock as interlock.  BO_LOCK() is retaken on
                         * both outcomes; with LK_SLEEPFAIL a failure
                         * restarts the whole scan from loop1.
                         */
                        if (BUF_LOCK(bp,
                            LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL,
                            BO_LOCKPTR(bo)) != 0) {
                                BO_LOCK(bo);
                                goto loop1;
                        }
                        BO_LOCK(bo);
                }
                BO_UNLOCK(bo);
                KASSERT(bp->b_bufobj == bo,
                    ("bp %p wrong b_bufobj %p should be %p",
                    bp, bp->b_bufobj, bo));
                if ((bp->b_flags & B_DELWRI) == 0)
                        panic("fsync: not dirty");
                /*
                 * Start an asynchronous write: via vfs_bio_awrite() when
                 * the vnode has a VM object and the buffer allows
                 * clustering, otherwise with a plain bawrite().
                 */
                if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) {
                        vfs_bio_awrite(bp);
                } else {
                        bremfree(bp);
                        bawrite(bp);
                }
                /*
                 * Once the retry budget has dropped below 1000, throttle
                 * by pausing for hz / 1000 ticks (at least one tick)
                 * after each write.
                 */
                if (maxretry < 1000)
                        pause("dirty", hz < 1000 ? 1 : hz / 1000);
                /*
                 * The bufobj lock was dropped, so the list may have
                 * changed; rescan from the head.  BV_SCANNED keeps
                 * already-visited buffers from being processed again.
                 */
                BO_LOCK(bo);
                goto loop2;
        }

        /*
         * If synchronous the caller expects us to completely resolve all
         * dirty buffers in the system.  Wait for in-progress I/O to
         * complete (which could include background bitmap writes), then
         * retry if dirty blocks still exist.
         */
        if (waitfor == MNT_WAIT) {
                bufobj_wwait(bo, 0, 0);
                if (bo->bo_dirty.bv_cnt > 0) {
                        /*
                         * If we are unable to write any of these buffers
                         * then we fail now rather than trying endlessly
                         * to write them out.
                         */
                        TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
                                if ((error = bp->b_error) != 0)
                                        break;
                        /*
                         * Retry while the device's mount still has
                         * secondary writes pending (regardless of error),
                         * or while no buffer reported an error and the
                         * retry budget is not yet exhausted.
                         */
                        if ((mp != NULL && mp->mnt_secondary_writes > 0) ||
                            (error == 0 && --maxretry >= 0))
                                goto loop1;
                        if (error == 0)
                                error = EAGAIN;
                }
        }
        BO_UNLOCK(bo);
        if (error != 0)
                vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error);

        return (error);
}
2916
2917 /*
2918  * Copies a byte range from invp to outvp.  Calls VOP_COPY_FILE_RANGE()
2919  * or vn_generic_copy_file_range() after rangelocking the byte ranges,
2920  * to do the actual copy.
2921  * vn_generic_copy_file_range() is factored out, so it can be called
2922  * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from
2923  * different file systems.
2924  */
2925 int
2926 vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp,
2927     off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred,
2928     struct ucred *outcred, struct thread *fsize_td)
2929 {
2930         int error;
2931         size_t len;
2932         uint64_t uval;
2933
2934         len = *lenp;
2935         *lenp = 0;              /* For error returns. */
2936         error = 0;
2937
2938         /* Do some sanity checks on the arguments. */
2939         if (invp->v_type == VDIR || outvp->v_type == VDIR)
2940                 error = EISDIR;
2941         else if (*inoffp < 0 || *outoffp < 0 ||
2942             invp->v_type != VREG || outvp->v_type != VREG)
2943                 error = EINVAL;
2944         if (error != 0)
2945                 goto out;
2946
2947         /* Ensure offset + len does not wrap around. */
2948         uval = *inoffp;
2949         uval += len;
2950         if (uval > INT64_MAX)
2951                 len = INT64_MAX - *inoffp;
2952         uval = *outoffp;
2953         uval += len;
2954         if (uval > INT64_MAX)
2955                 len = INT64_MAX - *outoffp;
2956         if (len == 0)
2957                 goto out;
2958
2959         /*
2960          * If the two vnode are for the same file system, call
2961          * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range()
2962          * which can handle copies across multiple file systems.
2963          */
2964         *lenp = len;
2965         if (invp->v_mount == outvp->v_mount)
2966                 error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp,
2967                     lenp, flags, incred, outcred, fsize_td);
2968         else
2969                 error = vn_generic_copy_file_range(invp, inoffp, outvp,
2970                     outoffp, lenp, flags, incred, outcred, fsize_td);
2971 out:
2972         return (error);
2973 }
2974
/*
 * Test len bytes of data starting at dat for all bytes == 0.
 * Return true if all bytes are zero, false otherwise.
 * A len of zero or less yields true.
 * Expects dat to be well aligned, since the bulk of the buffer is
 * examined one unsigned int at a time.
 */
static bool
mem_iszero(void *dat, int len)
{
        const unsigned int *wp;
        const char *bp;

        /* Whole words first. */
        wp = dat;
        while (len >= (int)sizeof(*wp)) {
                if (*wp != 0)
                        return (false);
                wp++;
                len -= (int)sizeof(*wp);
        }

        /* Then whatever bytes are left over after the last full word. */
        bp = (const char *)wp;
        while (len > 0) {
                if (*bp != '\0')
                        return (false);
                bp++;
                len--;
        }
        return (true);
}
3000
/*
 * Look for a hole in the output file and, if found, adjust *outoffp
 * and *xferp to skip past the hole.
 * *xferp is the entire hole length to be written and xfer2 is how many bytes
 * to be written as 0's upon return.
 *
 * outvp     output vnode, queried with FIOSEEKDATA/FIOSEEKHOLE.
 * xfer2     size of the write the caller intends to do next.
 * outoffp   in/out: current output offset; advanced past any hole found.
 * xferp     in/out: bytes remaining to be zeroed; reduced by the amount
 *           skipped, and set to 0 when the whole remainder is a hole.
 * dataoffp  in/out: cached start of the next data region.
 * holeoffp  in/out: cached start of the hole following that data region.
 *           A value of 0, or one not beyond *outoffp, triggers a fresh
 *           lookup.  Set to -1 when the lookup fails, which tells the
 *           caller to stop using vn_skip_hole().
 * cred      credentials for the VOP_IOCTL() calls.
 *
 * Returns the number of bytes the caller should write at *outoffp:
 * xfer2, possibly reduced so that the write ends where the next hole
 * begins, or 0 when nothing remains to be written.
 */
static off_t
vn_skip_hole(struct vnode *outvp, off_t xfer2, off_t *outoffp, off_t *xferp,
    off_t *dataoffp, off_t *holeoffp, struct ucred *cred)
{
        int error;
        off_t delta;

        /*
         * Refresh the cached data/hole offsets on first use or once
         * the output offset has reached the previously found hole.
         */
        if (*holeoffp == 0 || *holeoffp <= *outoffp) {
                *dataoffp = *outoffp;
                error = VOP_IOCTL(outvp, FIOSEEKDATA, dataoffp, 0, cred,
                    curthread);
                if (error == 0) {
                        *holeoffp = *dataoffp;
                        error = VOP_IOCTL(outvp, FIOSEEKHOLE, holeoffp, 0, cred,
                            curthread);
                }
                if (error != 0 || *holeoffp == *dataoffp) {
                        /*
                         * Since outvp is unlocked, it may be possible for
                         * another thread to do a truncate(), lseek(), write()
                         * creating a hole at startoff between the above
                         * VOP_IOCTL() calls, if the other thread does not do
                         * rangelocking.
                         * If that happens, *holeoffp == *dataoffp and finding
                         * the hole has failed, so disable vn_skip_hole().
                         */
                        *holeoffp = -1; /* Disable use of vn_skip_hole(). */
                        return (xfer2);
                }
                KASSERT(*dataoffp >= *outoffp,
                    ("vn_skip_hole: dataoff=%jd < outoff=%jd",
                    (intmax_t)*dataoffp, (intmax_t)*outoffp));
                KASSERT(*holeoffp > *dataoffp,
                    ("vn_skip_hole: holeoff=%jd <= dataoff=%jd",
                    (intmax_t)*holeoffp, (intmax_t)*dataoffp));
        }

        /*
         * If there is a hole before the data starts, advance *outoffp and
         * *xferp past the hole.
         */
        if (*dataoffp > *outoffp) {
                delta = *dataoffp - *outoffp;
                if (delta >= *xferp) {
                        /* Entire *xferp is a hole. */
                        *outoffp += *xferp;
                        *xferp = 0;
                        return (0);
                }
                *xferp -= delta;
                *outoffp += delta;
                /* Less remains to be written now; re-clip the chunk size. */
                xfer2 = MIN(xfer2, *xferp);
        }

        /*
         * If a hole starts before the end of this xfer2, reduce this xfer2 so
         * that the write ends at the start of the hole.
         * *holeoffp should always be greater than *outoffp, but for the
         * non-INVARIANTS case, check this to make sure xfer2 remains a sane
         * value.
         */
        if (*holeoffp > *outoffp && *holeoffp < *outoffp + xfer2)
                xfer2 = *holeoffp - *outoffp;
        return (xfer2);
}
3072
/*
 * Write an xfer sized chunk to outvp in blksize blocks from dat.
 * dat is a maximum of blksize in length and can be written repeatedly in
 * the chunk.
 * If growfile == true, just grow the file via vn_truncate_locked() instead
 * of doing actual writes.
 * If checkhole == true, a hole is being punched, so skip over any hole
 * already in the output file.
 *
 * outvp      output vnode; must be unlocked on entry, since it is locked
 *            and unlocked here on every iteration.
 * dat        source buffer, reused for every blksize piece.
 * outoff     starting offset in outvp.
 * xfer       total number of bytes to write (or to grow by).
 * blksize    maximum size of a single write.
 * cred       file credentials for vn_rdwr()/vn_truncate_locked() and for
 *            the hole lookups.
 *
 * Returns 0 on success or the first error from vn_start_write(),
 * vn_lock(), vn_truncate_locked() or vn_rdwr().
 */
static int
vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer,
    u_long blksize, bool growfile, bool checkhole, struct ucred *cred)
{
        struct mount *mp;
        off_t dataoff, holeoff, xfer2;
        int error;

        /*
         * Loop around doing writes of blksize until write has been completed.
         * Lock/unlock on each loop iteration so that a bwillwrite() can be
         * done for each iteration, since the xfer argument can be very
         * large if there is a large hole to punch in the output file.
         */
        error = 0;
        /* holeoff == 0 makes the first vn_skip_hole() call do a lookup. */
        holeoff = 0;
        do {
                xfer2 = MIN(xfer, blksize);
                if (checkhole) {
                        /*
                         * Punching a hole.  Skip writing if there is
                         * already a hole in the output file.
                         */
                        xfer2 = vn_skip_hole(outvp, xfer2, &outoff, &xfer,
                            &dataoff, &holeoff, cred);
                        /* Everything left was already a hole: done. */
                        if (xfer == 0)
                                break;
                        /* A negative holeoff means hole lookup failed. */
                        if (holeoff < 0)
                                checkhole = false;
                        KASSERT(xfer2 > 0, ("vn_write_outvp: xfer2=%jd",
                            (intmax_t)xfer2));
                }
                bwillwrite();
                mp = NULL;
                error = vn_start_write(outvp, &mp, V_WAIT);
                if (error != 0)
                        break;
                if (growfile) {
                        /* Extend the file to outoff + xfer in one step. */
                        error = vn_lock(outvp, LK_EXCLUSIVE);
                        if (error == 0) {
                                error = vn_truncate_locked(outvp, outoff + xfer,
                                    false, cred);
                                VOP_UNLOCK(outvp);
                        }
                } else {
                        error = vn_lock(outvp, vn_lktype_write(mp, outvp));
                        if (error == 0) {
                                error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2,
                                    outoff, UIO_SYSSPACE, IO_NODELOCKED,
                                    curthread->td_ucred, cred, NULL, curthread);
                                /*
                                 * Advance unconditionally; on error the
                                 * loop condition below ends the loop.
                                 */
                                outoff += xfer2;
                                xfer -= xfer2;
                                VOP_UNLOCK(outvp);
                        }
                }
                if (mp != NULL)
                        vn_finished_write(mp);
                /* The growfile case is a single pass by construction. */
        } while (!growfile && xfer > 0 && error == 0);
        return (error);
}
3142
3143 /*
3144  * Copy a byte range of one file to another.  This function can handle the
3145  * case where invp and outvp are on different file systems.
3146  * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there
3147  * is no better file system specific way to do it.
3148  */
3149 int
3150 vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
3151     struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags,
3152     struct ucred *incred, struct ucred *outcred, struct thread *fsize_td)
3153 {
3154         struct vattr va, inva;
3155         struct mount *mp;
3156         struct uio io;
3157         off_t startoff, endoff, xfer, xfer2;
3158         u_long blksize;
3159         int error, interrupted;
3160         bool cantseek, readzeros, eof, lastblock, holetoeof;
3161         ssize_t aresid;
3162         size_t copylen, len, rem, savlen;
3163         char *dat;
3164         long holein, holeout;
3165         struct timespec curts, endts;
3166
3167         holein = holeout = 0;
3168         savlen = len = *lenp;
3169         error = 0;
3170         interrupted = 0;
3171         dat = NULL;
3172
3173         error = vn_lock(invp, LK_SHARED);
3174         if (error != 0)
3175                 goto out;
3176         if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0)
3177                 holein = 0;
3178         if (holein > 0)
3179                 error = VOP_GETATTR(invp, &inva, incred);
3180         VOP_UNLOCK(invp);
3181         if (error != 0)
3182                 goto out;
3183
3184         mp = NULL;
3185         error = vn_start_write(outvp, &mp, V_WAIT);
3186         if (error == 0)
3187                 error = vn_lock(outvp, LK_EXCLUSIVE);
3188         if (error == 0) {
3189                 /*
3190                  * If fsize_td != NULL, do a vn_rlimit_fsize() call,
3191                  * now that outvp is locked.
3192                  */
3193                 if (fsize_td != NULL) {
3194                         io.uio_offset = *outoffp;
3195                         io.uio_resid = len;
3196                         error = vn_rlimit_fsize(outvp, &io, fsize_td);
3197                         if (error != 0)
3198                                 error = EFBIG;
3199                 }
3200                 if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0)
3201                         holeout = 0;
3202                 /*
3203                  * Holes that are past EOF do not need to be written as a block
3204                  * of zero bytes.  So, truncate the output file as far as
3205                  * possible and then use va.va_size to decide if writing 0
3206                  * bytes is necessary in the loop below.
3207                  */
3208                 if (error == 0)
3209                         error = VOP_GETATTR(outvp, &va, outcred);
3210                 if (error == 0 && va.va_size > *outoffp && va.va_size <=
3211                     *outoffp + len) {
3212 #ifdef MAC
3213                         error = mac_vnode_check_write(curthread->td_ucred,
3214                             outcred, outvp);
3215                         if (error == 0)
3216 #endif
3217                                 error = vn_truncate_locked(outvp, *outoffp,
3218                                     false, outcred);
3219                         if (error == 0)
3220                                 va.va_size = *outoffp;
3221                 }
3222                 VOP_UNLOCK(outvp);
3223         }
3224         if (mp != NULL)
3225                 vn_finished_write(mp);
3226         if (error != 0)
3227                 goto out;
3228
3229         /*
3230          * Set the blksize to the larger of the hole sizes for invp and outvp.
3231          * If hole sizes aren't available, set the blksize to the larger
3232          * f_iosize of invp and outvp.
3233          * This code expects the hole sizes and f_iosizes to be powers of 2.
3234          * This value is clipped at 4Kbytes and 1Mbyte.
3235          */
3236         blksize = MAX(holein, holeout);
3237
3238         /* Clip len to end at an exact multiple of hole size. */
3239         if (blksize > 1) {
3240                 rem = *inoffp % blksize;
3241                 if (rem > 0)
3242                         rem = blksize - rem;
3243                 if (len > rem && len - rem > blksize)
3244                         len = savlen = rounddown(len - rem, blksize) + rem;
3245         }
3246
3247         if (blksize <= 1)
3248                 blksize = MAX(invp->v_mount->mnt_stat.f_iosize,
3249                     outvp->v_mount->mnt_stat.f_iosize);
3250         if (blksize < 4096)
3251                 blksize = 4096;
3252         else if (blksize > 1024 * 1024)
3253                 blksize = 1024 * 1024;
3254         dat = malloc(blksize, M_TEMP, M_WAITOK);
3255
3256         /*
3257          * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA
3258          * to find holes.  Otherwise, just scan the read block for all 0s
3259          * in the inner loop where the data copying is done.
3260          * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may
3261          * support holes on the server, but do not support FIOSEEKHOLE.
3262          * The kernel flag COPY_FILE_RANGE_TIMEO1SEC is used to indicate
3263          * that this function should return after 1second with a partial
3264          * completion.
3265          */
3266         if ((flags & COPY_FILE_RANGE_TIMEO1SEC) != 0) {
3267                 getnanouptime(&endts);
3268                 endts.tv_sec++;
3269         } else
3270                 timespecclear(&endts);
3271         holetoeof = eof = false;
3272         while (len > 0 && error == 0 && !eof && interrupted == 0) {
3273                 endoff = 0;                     /* To shut up compilers. */
3274                 cantseek = true;
3275                 startoff = *inoffp;
3276                 copylen = len;
3277
3278                 /*
3279                  * Find the next data area.  If there is just a hole to EOF,
3280                  * FIOSEEKDATA should fail with ENXIO.
3281                  * (I do not know if any file system will report a hole to
3282                  *  EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA
3283                  *  will fail for those file systems.)
3284                  *
3285                  * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE,
3286                  * the code just falls through to the inner copy loop.
3287                  */
3288                 error = EINVAL;
3289                 if (holein > 0) {
3290                         error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0,
3291                             incred, curthread);
3292                         if (error == ENXIO) {
3293                                 startoff = endoff = inva.va_size;
3294                                 eof = holetoeof = true;
3295                                 error = 0;
3296                         }
3297                 }
3298                 if (error == 0 && !holetoeof) {
3299                         endoff = startoff;
3300                         error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0,
3301                             incred, curthread);
3302                         /*
3303                          * Since invp is unlocked, it may be possible for
3304                          * another thread to do a truncate(), lseek(), write()
3305                          * creating a hole at startoff between the above
3306                          * VOP_IOCTL() calls, if the other thread does not do
3307                          * rangelocking.
3308                          * If that happens, startoff == endoff and finding
3309                          * the hole has failed, so set an error.
3310                          */
3311                         if (error == 0 && startoff == endoff)
3312                                 error = EINVAL; /* Any error. Reset to 0. */
3313                 }
3314                 if (error == 0) {
3315                         if (startoff > *inoffp) {
3316                                 /* Found hole before data block. */
3317                                 xfer = MIN(startoff - *inoffp, len);
3318                                 if (*outoffp < va.va_size) {
3319                                         /* Must write 0s to punch hole. */
3320                                         xfer2 = MIN(va.va_size - *outoffp,
3321                                             xfer);
3322                                         memset(dat, 0, MIN(xfer2, blksize));
3323                                         error = vn_write_outvp(outvp, dat,
3324                                             *outoffp, xfer2, blksize, false,
3325                                             holeout > 0, outcred);
3326                                 }
3327
3328                                 if (error == 0 && *outoffp + xfer >
3329                                     va.va_size && (xfer == len || holetoeof)) {
3330                                         /* Grow output file (hole at end). */
3331                                         error = vn_write_outvp(outvp, dat,
3332                                             *outoffp, xfer, blksize, true,
3333                                             false, outcred);
3334                                 }
3335                                 if (error == 0) {
3336                                         *inoffp += xfer;
3337                                         *outoffp += xfer;
3338                                         len -= xfer;
3339                                         if (len < savlen) {
3340                                                 interrupted = sig_intr();
3341                                                 if (timespecisset(&endts) &&
3342                                                     interrupted == 0) {
3343                                                         getnanouptime(&curts);
3344                                                         if (timespeccmp(&curts,
3345                                                             &endts, >=))
3346                                                                 interrupted =
3347                                                                     EINTR;
3348                                                 }
3349                                         }
3350                                 }
3351                         }
3352                         copylen = MIN(len, endoff - startoff);
3353                         cantseek = false;
3354                 } else {
3355                         cantseek = true;
3356                         startoff = *inoffp;
3357                         copylen = len;
3358                         error = 0;
3359                 }
3360
3361                 xfer = blksize;
3362                 if (cantseek) {
3363                         /*
3364                          * Set first xfer to end at a block boundary, so that
3365                          * holes are more likely detected in the loop below via
3366                          * the for all bytes 0 method.
3367                          */
3368                         xfer -= (*inoffp % blksize);
3369                 }
3370                 /* Loop copying the data block. */
3371                 while (copylen > 0 && error == 0 && !eof && interrupted == 0) {
3372                         if (copylen < xfer)
3373                                 xfer = copylen;
3374                         error = vn_lock(invp, LK_SHARED);
3375                         if (error != 0)
3376                                 goto out;
3377                         error = vn_rdwr(UIO_READ, invp, dat, xfer,
3378                             startoff, UIO_SYSSPACE, IO_NODELOCKED,
3379                             curthread->td_ucred, incred, &aresid,
3380                             curthread);
3381                         VOP_UNLOCK(invp);
3382                         lastblock = false;
3383                         if (error == 0 && aresid > 0) {
3384                                 /* Stop the copy at EOF on the input file. */
3385                                 xfer -= aresid;
3386                                 eof = true;
3387                                 lastblock = true;
3388                         }
3389                         if (error == 0) {
3390                                 /*
3391                                  * Skip the write for holes past the initial EOF
3392                                  * of the output file, unless this is the last
3393                                  * write of the output file at EOF.
3394                                  */
3395                                 readzeros = cantseek ? mem_iszero(dat, xfer) :
3396                                     false;
3397                                 if (xfer == len)
3398                                         lastblock = true;
3399                                 if (!cantseek || *outoffp < va.va_size ||
3400                                     lastblock || !readzeros)
3401                                         error = vn_write_outvp(outvp, dat,
3402                                             *outoffp, xfer, blksize,
3403                                             readzeros && lastblock &&
3404                                             *outoffp >= va.va_size, false,
3405                                             outcred);
3406                                 if (error == 0) {
3407                                         *inoffp += xfer;
3408                                         startoff += xfer;
3409                                         *outoffp += xfer;
3410                                         copylen -= xfer;
3411                                         len -= xfer;
3412                                         if (len < savlen) {
3413                                                 interrupted = sig_intr();
3414                                                 if (timespecisset(&endts) &&
3415                                                     interrupted == 0) {
3416                                                         getnanouptime(&curts);
3417                                                         if (timespeccmp(&curts,
3418                                                             &endts, >=))
3419                                                                 interrupted =
3420                                                                     EINTR;
3421                                                 }
3422                                         }
3423                                 }
3424                         }
3425                         xfer = blksize;
3426                 }
3427         }
3428 out:
3429         *lenp = savlen - len;
3430         free(dat, M_TEMP);
3431         return (error);
3432 }
3433
3434 static int
3435 vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td)
3436 {
3437         struct mount *mp;
3438         struct vnode *vp;
3439         off_t olen, ooffset;
3440         int error;
3441 #ifdef AUDIT
3442         int audited_vnode1 = 0;
3443 #endif
3444
3445         vp = fp->f_vnode;
3446         if (vp->v_type != VREG)
3447                 return (ENODEV);
3448
3449         /* Allocating blocks may take a long time, so iterate. */
3450         for (;;) {
3451                 olen = len;
3452                 ooffset = offset;
3453
3454                 bwillwrite();
3455                 mp = NULL;
3456                 error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
3457                 if (error != 0)
3458                         break;
3459                 error = vn_lock(vp, LK_EXCLUSIVE);
3460                 if (error != 0) {
3461                         vn_finished_write(mp);
3462                         break;
3463                 }
3464 #ifdef AUDIT
3465                 if (!audited_vnode1) {
3466                         AUDIT_ARG_VNODE1(vp);
3467                         audited_vnode1 = 1;
3468                 }
3469 #endif
3470 #ifdef MAC
3471                 error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
3472                 if (error == 0)
3473 #endif
3474                         error = VOP_ALLOCATE(vp, &offset, &len);
3475                 VOP_UNLOCK(vp);
3476                 vn_finished_write(mp);
3477
3478                 if (olen + ooffset != offset + len) {
3479                         panic("offset + len changed from %jx/%jx to %jx/%jx",
3480                             ooffset, olen, offset, len);
3481                 }
3482                 if (error != 0 || len == 0)
3483                         break;
3484                 KASSERT(olen > len, ("Iteration did not make progress?"));
3485                 maybe_yield();
3486         }
3487
3488         return (error);
3489 }
3490
3491 static int
3492 vn_deallocate_impl(struct vnode *vp, off_t *offset, off_t *length, int flags,
3493     int ioflag, struct ucred *cred, struct ucred *active_cred,
3494     struct ucred *file_cred)
3495 {
3496         struct mount *mp;
3497         void *rl_cookie;
3498         off_t off, len;
3499         int error;
3500 #ifdef AUDIT
3501         bool audited_vnode1 = false;
3502 #endif
3503
3504         rl_cookie = NULL;
3505         error = 0;
3506         mp = NULL;
3507         off = *offset;
3508         len = *length;
3509
3510         if ((ioflag & (IO_NODELOCKED | IO_RANGELOCKED)) == 0)
3511                 rl_cookie = vn_rangelock_wlock(vp, off, off + len);
3512         while (len > 0 && error == 0) {
3513                 /*
3514                  * Try to deallocate the longest range in one pass.
3515                  * In case a pass takes too long to be executed, it returns
3516                  * partial result. The residue will be proceeded in the next
3517                  * pass.
3518                  */
3519
3520                 if ((ioflag & IO_NODELOCKED) == 0) {
3521                         bwillwrite();
3522                         if ((error = vn_start_write(vp, &mp,
3523                             V_WAIT | PCATCH)) != 0)
3524                                 goto out;
3525                         vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);
3526                 }
3527 #ifdef AUDIT
3528                 if (!audited_vnode1) {
3529                         AUDIT_ARG_VNODE1(vp);
3530                         audited_vnode1 = true;
3531                 }
3532 #endif
3533
3534 #ifdef MAC
3535                 if ((ioflag & IO_NOMACCHECK) == 0)
3536                         error = mac_vnode_check_write(active_cred, file_cred,
3537                             vp);
3538 #endif
3539                 if (error == 0)
3540                         error = VOP_DEALLOCATE(vp, &off, &len, flags, ioflag,
3541                             cred);
3542
3543                 if ((ioflag & IO_NODELOCKED) == 0) {
3544                         VOP_UNLOCK(vp);
3545                         if (mp != NULL) {
3546                                 vn_finished_write(mp);
3547                                 mp = NULL;
3548                         }
3549                 }
3550                 if (error == 0 && len != 0)
3551                         maybe_yield();
3552         }
3553 out:
3554         if (rl_cookie != NULL)
3555                 vn_rangelock_unlock(vp, rl_cookie);
3556         *offset = off;
3557         *length = len;
3558         return (error);
3559 }
3560
3561 /*
3562  * This function is supposed to be used in the situations where the deallocation
3563  * is not triggered by a user request.
3564  */
3565 int
3566 vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags,
3567     int ioflag, struct ucred *active_cred, struct ucred *file_cred)
3568 {
3569         struct ucred *cred;
3570
3571         if (*offset < 0 || *length <= 0 || *length > OFF_MAX - *offset ||
3572             flags != 0)
3573                 return (EINVAL);
3574         if (vp->v_type != VREG)
3575                 return (ENODEV);
3576
3577         cred = file_cred != NOCRED ? file_cred : active_cred;
3578         return (vn_deallocate_impl(vp, offset, length, flags, ioflag, cred,
3579             active_cred, file_cred));
3580 }
3581
3582 static int
3583 vn_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags,
3584     struct ucred *active_cred, struct thread *td)
3585 {
3586         int error;
3587         struct vnode *vp;
3588         int ioflag;
3589
3590         vp = fp->f_vnode;
3591
3592         if (cmd != SPACECTL_DEALLOC || *offset < 0 || *length <= 0 ||
3593             *length > OFF_MAX - *offset || flags != 0)
3594                 return (EINVAL);
3595         if (vp->v_type != VREG)
3596                 return (ENODEV);
3597
3598         ioflag = get_write_ioflag(fp);
3599
3600         switch (cmd) {
3601         case SPACECTL_DEALLOC:
3602                 error = vn_deallocate_impl(vp, offset, length, flags, ioflag,
3603                     active_cred, active_cred, fp->f_cred);
3604                 break;
3605         default:
3606                 panic("vn_fspacectl: unknown cmd %d", cmd);
3607         }
3608
3609         return (error);
3610 }
3611
3612 static u_long vn_lock_pair_pause_cnt;
3613 SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD,
3614     &vn_lock_pair_pause_cnt, 0,
3615     "Count of vn_lock_pair deadlocks");
3616
3617 u_int vn_lock_pair_pause_max;
3618 SYSCTL_UINT(_debug, OID_AUTO, vn_lock_pair_pause_max, CTLFLAG_RW,
3619     &vn_lock_pair_pause_max, 0,
3620     "Max ticks for vn_lock_pair deadlock avoidance sleep");
3621
3622 static void
3623 vn_lock_pair_pause(const char *wmesg)
3624 {
3625         atomic_add_long(&vn_lock_pair_pause_cnt, 1);
3626         pause(wmesg, prng32_bounded(vn_lock_pair_pause_max));
3627 }
3628
3629 /*
3630  * Lock pair of vnodes vp1, vp2, avoiding lock order reversal.
3631  * vp1_locked indicates whether vp1 is exclusively locked; if not, vp1
3632  * must be unlocked.  Same for vp2 and vp2_locked.  One of the vnodes
3633  * can be NULL.
3634  *
3635  * The function returns with both vnodes exclusively locked, and
3636  * guarantees that it does not create lock order reversal with other
3637  * threads during its execution.  Both vnodes could be unlocked
3638  * temporary (and reclaimed).
3639  */
3640 void
3641 vn_lock_pair(struct vnode *vp1, bool vp1_locked, struct vnode *vp2,
3642     bool vp2_locked)
3643 {
3644         int error;
3645
3646         if (vp1 == NULL && vp2 == NULL)
3647                 return;
3648         if (vp1 != NULL) {
3649                 if (vp1_locked)
3650                         ASSERT_VOP_ELOCKED(vp1, "vp1");
3651                 else
3652                         ASSERT_VOP_UNLOCKED(vp1, "vp1");
3653         } else {
3654                 vp1_locked = true;
3655         }
3656         if (vp2 != NULL) {
3657                 if (vp2_locked)
3658                         ASSERT_VOP_ELOCKED(vp2, "vp2");
3659                 else
3660                         ASSERT_VOP_UNLOCKED(vp2, "vp2");
3661         } else {
3662                 vp2_locked = true;
3663         }
3664         if (!vp1_locked && !vp2_locked) {
3665                 vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY);
3666                 vp1_locked = true;
3667         }
3668
3669         for (;;) {
3670                 if (vp1_locked && vp2_locked)
3671                         break;
3672                 if (vp1_locked && vp2 != NULL) {
3673                         if (vp1 != NULL) {
3674                                 error = VOP_LOCK1(vp2, LK_EXCLUSIVE | LK_NOWAIT,
3675                                     __FILE__, __LINE__);
3676                                 if (error == 0)
3677                                         break;
3678                                 VOP_UNLOCK(vp1);
3679                                 vp1_locked = false;
3680                                 vn_lock_pair_pause("vlp1");
3681                         }
3682                         vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY);
3683                         vp2_locked = true;
3684                 }
3685                 if (vp2_locked && vp1 != NULL) {
3686                         if (vp2 != NULL) {
3687                                 error = VOP_LOCK1(vp1, LK_EXCLUSIVE | LK_NOWAIT,
3688                                     __FILE__, __LINE__);
3689                                 if (error == 0)
3690                                         break;
3691                                 VOP_UNLOCK(vp2);
3692                                 vp2_locked = false;
3693                                 vn_lock_pair_pause("vlp2");
3694                         }
3695                         vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY);
3696                         vp1_locked = true;
3697                 }
3698         }
3699         if (vp1 != NULL)
3700                 ASSERT_VOP_ELOCKED(vp1, "vp1 ret");
3701         if (vp2 != NULL)
3702                 ASSERT_VOP_ELOCKED(vp2, "vp2 ret");
3703 }
3704
3705 int
3706 vn_lktype_write(struct mount *mp, struct vnode *vp)
3707 {
3708         if (MNT_SHARED_WRITES(mp) ||
3709             (mp == NULL && MNT_SHARED_WRITES(vp->v_mount)))
3710                 return (LK_SHARED);
3711         return (LK_EXCLUSIVE);
3712 }