sys/kern/vfs_vnops.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1982, 1986, 1989, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  * (c) UNIX System Laboratories, Inc.
   7  * All or some portions of this file are derived from material licensed
   8  * to the University of California by American Telephone and Telegraph
   9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  10  * the permission of UNIX System Laboratories, Inc.
  11  *
  12  * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
  13  * Copyright (c) 2013, 2014 The FreeBSD Foundation
  14  *
  15  * Portions of this software were developed by Konstantin Belousov
  16  * under sponsorship from the FreeBSD Foundation.
  17  *
  18  * Redistribution and use in source and binary forms, with or without
  19  * modification, are permitted provided that the following conditions
  20  * are met:
  21  * 1. Redistributions of source code must retain the above copyright
  22  *    notice, this list of conditions and the following disclaimer.
  23  * 2. Redistributions in binary form must reproduce the above copyright
  24  *    notice, this list of conditions and the following disclaimer in the
  25  *    documentation and/or other materials provided with the distribution.
  26  * 3. Neither the name of the University nor the names of its contributors
  27  *    may be used to endorse or promote products derived from this software
  28  *    without specific prior written permission.
  29  *
  30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  40  * SUCH DAMAGE.
  41  *
  42  *      @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
  43  */
  44
  45 #include <sys/cdefs.h>
  46 __FBSDID("$FreeBSD$");
  47
  48 #include "opt_hwpmc_hooks.h"
  49
  50 #include <sys/param.h>
  51 #include <sys/systm.h>
  52 #include <sys/disk.h>
  53 #include <sys/fail.h>
  54 #include <sys/fcntl.h>
  55 #include <sys/file.h>
  56 #include <sys/kdb.h>
  57 #include <sys/ktr.h>
  58 #include <sys/stat.h>
  59 #include <sys/priv.h>
  60 #include <sys/proc.h>
  61 #include <sys/limits.h>
  62 #include <sys/lock.h>
  63 #include <sys/mman.h>
  64 #include <sys/mount.h>
  65 #include <sys/mutex.h>
  66 #include <sys/namei.h>
  67 #include <sys/vnode.h>
  68 #include <sys/bio.h>
  69 #include <sys/buf.h>
  70 #include <sys/filio.h>
  71 #include <sys/resourcevar.h>
  72 #include <sys/rwlock.h>
  73 #include <sys/prng.h>
  74 #include <sys/sx.h>
  75 #include <sys/sleepqueue.h>
  76 #include <sys/sysctl.h>
  77 #include <sys/ttycom.h>
  78 #include <sys/conf.h>
  79 #include <sys/syslog.h>
  80 #include <sys/unistd.h>
  81 #include <sys/user.h>
  82
  83 #include <security/audit/audit.h>
  84 #include <security/mac/mac_framework.h>
  85
  86 #include <vm/vm.h>
  87 #include <vm/vm_extern.h>
  88 #include <vm/pmap.h>
  89 #include <vm/vm_map.h>
  90 #include <vm/vm_object.h>
  91 #include <vm/vm_page.h>
  92 #include <vm/vm_pager.h>
  93
  94 #ifdef HWPMC_HOOKS
  95 #include <sys/pmckern.h>
  96 #endif
  97
  98 static fo_rdwr_t        vn_read;
  99 static fo_rdwr_t        vn_write;
 100 static fo_rdwr_t        vn_io_fault;
 101 static fo_truncate_t    vn_truncate;
 102 static fo_ioctl_t       vn_ioctl;
 103 static fo_poll_t        vn_poll;
 104 static fo_kqfilter_t    vn_kqfilter;
 105 static fo_stat_t        vn_statfile;
 106 static fo_close_t       vn_closefile;
 107 static fo_mmap_t        vn_mmap;
 108 static fo_fallocate_t   vn_fallocate;
 109
 110 struct  fileops vnops = {
 111         .fo_read = vn_io_fault,
 112         .fo_write = vn_io_fault,
 113         .fo_truncate = vn_truncate,
 114         .fo_ioctl = vn_ioctl,
 115         .fo_poll = vn_poll,
 116         .fo_kqfilter = vn_kqfilter,
 117         .fo_stat = vn_statfile,
 118         .fo_close = vn_closefile,
 119         .fo_chmod = vn_chmod,
 120         .fo_chown = vn_chown,
 121         .fo_sendfile = vn_sendfile,
 122         .fo_seek = vn_seek,
 123         .fo_fill_kinfo = vn_fill_kinfo,
 124         .fo_mmap = vn_mmap,
 125         .fo_fallocate = vn_fallocate,
 126         .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 127 };
 128
 129 const u_int io_hold_cnt = 16;
 130 static int vn_io_fault_enable = 1;
 131 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN,
 132     &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
 133 static int vn_io_fault_prefault = 0;
 134 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN,
 135     &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
 136 static int vn_io_pgcache_read_enable = 1;
 137 SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN,
 138     &vn_io_pgcache_read_enable, 0,
 139     "Enable copying from page cache for reads, avoiding fs");
 140 static u_long vn_io_faults_cnt;
 141 SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
 142     &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
 143
 144 static int vfs_allow_read_dir = 0;
 145 SYSCTL_INT(_security_bsd, OID_AUTO, allow_read_dir, CTLFLAG_RW,
 146     &vfs_allow_read_dir, 0,
 147     "Enable read(2) of directory by root for filesystems that support it");
 148
 149 /*
 150  * Returns true if vn_io_fault mode of handling the i/o request should
 151  * be used.
 152  */
 153 static bool
 154 do_vn_io_fault(struct vnode *vp, struct uio *uio)
 155 {
 156         struct mount *mp;
 157
 158         return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
 159             (mp = vp->v_mount) != NULL &&
 160             (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
 161 }
 162
 163 /*
 164  * Structure used to pass arguments to vn_io_fault1(), to do either
 165  * file- or vnode-based I/O calls.
 166  */
 167 struct vn_io_fault_args {
 168         enum {
 169                 VN_IO_FAULT_FOP,
 170                 VN_IO_FAULT_VOP
 171         } kind;
 172         struct ucred *cred;
 173         int flags;
 174         union {
 175                 struct fop_args_tag {
 176                         struct file *fp;
 177                         fo_rdwr_t *doio;
 178                 } fop_args;
 179                 struct vop_args_tag {
 180                         struct vnode *vp;
 181                 } vop_args;
 182         } args;
 183 };
 184
 185 static int vn_io_fault1(struct vnode *vp, struct uio *uio,
 186     struct vn_io_fault_args *args, struct thread *td);
 187
 188 int
 189 vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp)
 190 {
 191         struct thread *td = ndp->ni_cnd.cn_thread;
 192
 193         return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
 194 }
 195
 196 static uint64_t
 197 open2nameif(int fmode, u_int vn_open_flags)
 198 {
 199         uint64_t res;
 200
 201         res = ISOPEN | LOCKLEAF;
 202         if ((fmode & O_BENEATH) != 0)
 203                 res |= BENEATH;
 204         if ((fmode & O_RESOLVE_BENEATH) != 0)
 205                 res |= RBENEATH;
 206         if ((vn_open_flags & VN_OPEN_NOAUDIT) == 0)
 207                 res |= AUDITVNODE1;
 208         if ((vn_open_flags & VN_OPEN_NOCAPCHECK) != 0)
 209                 res |= NOCAPCHECK;
 210         return (res);
 211 }
 212
 213 /*
 214  * Common code for vnode open operations via a name lookup.
 215  * Lookup the vnode and invoke VOP_CREATE if needed.
 216  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 217  *
 218  * Note that this does NOT free nameidata for the successful case,
 219  * due to the NDINIT being done elsewhere.
 220  */
 221 int
 222 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
 223     struct ucred *cred, struct file *fp)
 224 {
 225         struct vnode *vp;
 226         struct mount *mp;
 227         struct thread *td = ndp->ni_cnd.cn_thread;
 228         struct vattr vat;
 229         struct vattr *vap = &vat;
 230         int fmode, error;
 231
 232 restart:
 233         fmode = *flagp;
 234         if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT |
 235             O_EXCL | O_DIRECTORY))
 236                 return (EINVAL);
 237         else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) {
 238                 ndp->ni_cnd.cn_nameiop = CREATE;
 239                 ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
 240                 /*
 241                  * Set NOCACHE to avoid flushing the cache when
 242                  * rolling in many files at once.
 243                  *
 244                  * Set NC_KEEPPOSENTRY to keep positive entries if they already
 245                  * exist despite NOCACHE.
 246                  */
 247                 ndp->ni_cnd.cn_flags |= LOCKPARENT | NOCACHE | NC_KEEPPOSENTRY;
 248                 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
 249                         ndp->ni_cnd.cn_flags |= FOLLOW;
 250                 if ((vn_open_flags & VN_OPEN_INVFS) == 0)
 251                         bwillwrite();
 252                 if ((error = namei(ndp)) != 0)
 253                         return (error);
 254                 if (ndp->ni_vp == NULL) {
 255                         VATTR_NULL(vap);
 256                         vap->va_type = VREG;
 257                         vap->va_mode = cmode;
 258                         if (fmode & O_EXCL)
 259                                 vap->va_vaflags |= VA_EXCLUSIVE;
 260                         if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
 261                                 NDFREE(ndp, NDF_ONLY_PNBUF);
 262                                 vput(ndp->ni_dvp);
 263                                 if ((error = vn_start_write(NULL, &mp,
 264                                     V_XSLEEP | PCATCH)) != 0)
 265                                         return (error);
 266                                 NDREINIT(ndp);
 267                                 goto restart;
 268                         }
 269                         if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
 270                                 ndp->ni_cnd.cn_flags |= MAKEENTRY;
 271 #ifdef MAC
 272                         error = mac_vnode_check_create(cred, ndp->ni_dvp,
 273                             &ndp->ni_cnd, vap);
 274                         if (error == 0)
 275 #endif
 276                                 error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
 277                                                    &ndp->ni_cnd, vap);
 278                         vput(ndp->ni_dvp);
 279                         vn_finished_write(mp);
 280                         if (error) {
 281                                 NDFREE(ndp, NDF_ONLY_PNBUF);
 282                                 if (error == ERELOOKUP) {
 283                                         NDREINIT(ndp);
 284                                         goto restart;
 285                                 }
 286                                 return (error);
 287                         }
 288                         fmode &= ~O_TRUNC;
 289                         vp = ndp->ni_vp;
 290                 } else {
 291                         if (ndp->ni_dvp == ndp->ni_vp)
 292                                 vrele(ndp->ni_dvp);
 293                         else
 294                                 vput(ndp->ni_dvp);
 295                         ndp->ni_dvp = NULL;
 296                         vp = ndp->ni_vp;
 297                         if (fmode & O_EXCL) {
 298                                 error = EEXIST;
 299                                 goto bad;
 300                         }
 301                         if (vp->v_type == VDIR) {
 302                                 error = EISDIR;
 303                                 goto bad;
 304                         }
 305                         fmode &= ~O_CREAT;
 306                 }
 307         } else {
 308                 ndp->ni_cnd.cn_nameiop = LOOKUP;
 309                 ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
 310                 ndp->ni_cnd.cn_flags |= (fmode & O_NOFOLLOW) != 0 ? NOFOLLOW :
 311                     FOLLOW;
 312                 if ((fmode & FWRITE) == 0)
 313                         ndp->ni_cnd.cn_flags |= LOCKSHARED;
 314                 if ((error = namei(ndp)) != 0)
 315                         return (error);
 316                 vp = ndp->ni_vp;
 317         }
 318         error = vn_open_vnode(vp, fmode, cred, td, fp);
 319         if (error)
 320                 goto bad;
 321         *flagp = fmode;
 322         return (0);
 323 bad:
 324         NDFREE(ndp, NDF_ONLY_PNBUF);
 325         vput(vp);
 326         *flagp = fmode;
 327         ndp->ni_vp = NULL;
 328         return (error);
 329 }
 330
 331 static int
 332 vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp)
 333 {
 334         struct flock lf;
 335         int error, lock_flags, type;
 336
 337         ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock");
 338         if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0)
 339                 return (0);
 340         KASSERT(fp != NULL, ("open with flock requires fp"));
 341         if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE)
 342                 return (EOPNOTSUPP);
 343
 344         lock_flags = VOP_ISLOCKED(vp);
 345         VOP_UNLOCK(vp);
 346
 347         lf.l_whence = SEEK_SET;
 348         lf.l_start = 0;
 349         lf.l_len = 0;
 350         lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK;
 351         type = F_FLOCK;
 352         if ((fmode & FNONBLOCK) == 0)
 353                 type |= F_WAIT;
 354         error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
 355         if (error == 0)
 356                 fp->f_flag |= FHASLOCK;
 357
 358         vn_lock(vp, lock_flags | LK_RETRY);
 359         if (error == 0 && VN_IS_DOOMED(vp))
 360                 error = ENOENT;
 361         return (error);
 362 }
 363
 364 /*
 365  * Common code for vnode open operations once a vnode is located.
 366  * Check permissions, and call the VOP_OPEN routine.
 367  */
 368 int
 369 vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
 370     struct thread *td, struct file *fp)
 371 {
 372         accmode_t accmode;
 373         int error;
 374
 375         if (vp->v_type == VLNK)
 376                 return (EMLINK);
 377         if (vp->v_type == VSOCK)
 378                 return (EOPNOTSUPP);
 379         if (vp->v_type != VDIR && fmode & O_DIRECTORY)
 380                 return (ENOTDIR);
 381         accmode = 0;
 382         if (fmode & (FWRITE | O_TRUNC)) {
 383                 if (vp->v_type == VDIR)
 384                         return (EISDIR);
 385                 accmode |= VWRITE;
 386         }
 387         if (fmode & FREAD)
 388                 accmode |= VREAD;
 389         if (fmode & FEXEC)
 390                 accmode |= VEXEC;
 391         if ((fmode & O_APPEND) && (fmode & FWRITE))
 392                 accmode |= VAPPEND;
 393 #ifdef MAC
 394         if (fmode & O_CREAT)
 395                 accmode |= VCREAT;
 396         if (fmode & O_VERIFY)
 397                 accmode |= VVERIFY;
 398         error = mac_vnode_check_open(cred, vp, accmode);
 399         if (error)
 400                 return (error);
 401
 402         accmode &= ~(VCREAT | VVERIFY);
 403 #endif
 404         if ((fmode & O_CREAT) == 0 && accmode != 0) {
 405                 error = VOP_ACCESS(vp, accmode, cred, td);
 406                 if (error != 0)
 407                         return (error);
 408         }
 409         if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 410                 vn_lock(vp, LK_UPGRADE | LK_RETRY);
 411         error = VOP_OPEN(vp, fmode, cred, td, fp);
 412         if (error != 0)
 413                 return (error);
 414
 415         error = vn_open_vnode_advlock(vp, fmode, fp);
 416         if (error == 0 && (fmode & FWRITE) != 0) {
 417                 error = VOP_ADD_WRITECOUNT(vp, 1);
 418                 if (error == 0) {
 419                         CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
 420                              __func__, vp, vp->v_writecount);
 421                 }
 422         }
 423
 424         /*
 425          * Error from advlock or VOP_ADD_WRITECOUNT() still requires
 426          * calling VOP_CLOSE() to pair with earlier VOP_OPEN().
 427          * Arrange for that by having fdrop() to use vn_closefile().
 428          */
 429         if (error != 0) {
 430                 fp->f_flag |= FOPENFAILED;
 431                 fp->f_vnode = vp;
 432                 if (fp->f_ops == &badfileops) {
 433                         fp->f_type = DTYPE_VNODE;
 434                         fp->f_ops = &vnops;
 435                 }
 436                 vref(vp);
 437         }
 438
 439         ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
 440         return (error);
 441
 442 }
 443
 444 /*
 445  * Check for write permissions on the specified vnode.
 446  * Prototype text segments cannot be written.
 447  * It is racy.
 448  */
 449 int
 450 vn_writechk(struct vnode *vp)
 451 {
 452
 453         ASSERT_VOP_LOCKED(vp, "vn_writechk");
 454         /*
 455          * If there's shared text associated with
 456          * the vnode, try to free it up once.  If
 457          * we fail, we can't allow writing.
 458          */
 459         if (VOP_IS_TEXT(vp))
 460                 return (ETXTBSY);
 461
 462         return (0);
 463 }
 464
 465 /*
 466  * Vnode close call
 467  */
 468 static int
 469 vn_close1(struct vnode *vp, int flags, struct ucred *file_cred,
 470     struct thread *td, bool keep_ref)
 471 {
 472         struct mount *mp;
 473         int error, lock_flags;
 474
 475         if (vp->v_type != VFIFO && (flags & FWRITE) == 0 &&
 476             MNT_EXTENDED_SHARED(vp->v_mount))
 477                 lock_flags = LK_SHARED;
 478         else
 479                 lock_flags = LK_EXCLUSIVE;
 480
 481         vn_start_write(vp, &mp, V_WAIT);
 482         vn_lock(vp, lock_flags | LK_RETRY);
 483         AUDIT_ARG_VNODE1(vp);
 484         if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) {
 485                 VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
 486                 CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
 487                     __func__, vp, vp->v_writecount);
 488         }
 489         error = VOP_CLOSE(vp, flags, file_cred, td);
 490         if (keep_ref)
 491                 VOP_UNLOCK(vp);
 492         else
 493                 vput(vp);
 494         vn_finished_write(mp);
 495         return (error);
 496 }
 497
 498 int
 499 vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
 500     struct thread *td)
 501 {
 502
 503         return (vn_close1(vp, flags, file_cred, td, false));
 504 }
 505
 506 /*
 507  * Heuristic to detect sequential operation.
 508  */
 509 static int
 510 sequential_heuristic(struct uio *uio, struct file *fp)
 511 {
 512         enum uio_rw rw;
 513
 514         ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
 515
 516         rw = uio->uio_rw;
 517         if (fp->f_flag & FRDAHEAD)
 518                 return (fp->f_seqcount[rw] << IO_SEQSHIFT);
 519
 520         /*
 521          * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
 522          * that the first I/O is normally considered to be slightly
 523          * sequential.  Seeking to offset 0 doesn't change sequentiality
 524          * unless previous seeks have reduced f_seqcount to 0, in which
 525          * case offset 0 is not special.
 526          */
 527         if ((uio->uio_offset == 0 && fp->f_seqcount[rw] > 0) ||
 528             uio->uio_offset == fp->f_nextoff[rw]) {
 529                 /*
 530                  * f_seqcount is in units of fixed-size blocks so that it
 531                  * depends mainly on the amount of sequential I/O and not
 532                  * much on the number of sequential I/O's.  The fixed size
 533                  * of 16384 is hard-coded here since it is (not quite) just
 534                  * a magic size that works well here.  This size is more
 535                  * closely related to the best I/O size for real disks than
 536                  * to any block size used by software.
 537                  */
 538                 if (uio->uio_resid >= IO_SEQMAX * 16384)
 539                         fp->f_seqcount[rw] = IO_SEQMAX;
 540                 else {
 541                         fp->f_seqcount[rw] += howmany(uio->uio_resid, 16384);
 542                         if (fp->f_seqcount[rw] > IO_SEQMAX)
 543                                 fp->f_seqcount[rw] = IO_SEQMAX;
 544                 }
 545                 return (fp->f_seqcount[rw] << IO_SEQSHIFT);
 546         }
 547
 548         /* Not sequential.  Quickly draw-down sequentiality. */
 549         if (fp->f_seqcount[rw] > 1)
 550                 fp->f_seqcount[rw] = 1;
 551         else
 552                 fp->f_seqcount[rw] = 0;
 553         return (0);
 554 }
 555
 556 /*
 557  * Package up an I/O request on a vnode into a uio and do it.
 558  */
 559 int
 560 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
 561     enum uio_seg segflg, int ioflg, struct ucred *active_cred,
 562     struct ucred *file_cred, ssize_t *aresid, struct thread *td)
 563 {
 564         struct uio auio;
 565         struct iovec aiov;
 566         struct mount *mp;
 567         struct ucred *cred;
 568         void *rl_cookie;
 569         struct vn_io_fault_args args;
 570         int error, lock_flags;
 571
 572         if (offset < 0 && vp->v_type != VCHR)
 573                 return (EINVAL);
 574         auio.uio_iov = &aiov;
 575         auio.uio_iovcnt = 1;
 576         aiov.iov_base = base;
 577         aiov.iov_len = len;
 578         auio.uio_resid = len;
 579         auio.uio_offset = offset;
 580         auio.uio_segflg = segflg;
 581         auio.uio_rw = rw;
 582         auio.uio_td = td;
 583         error = 0;
 584
 585         if ((ioflg & IO_NODELOCKED) == 0) {
 586                 if ((ioflg & IO_RANGELOCKED) == 0) {
 587                         if (rw == UIO_READ) {
 588                                 rl_cookie = vn_rangelock_rlock(vp, offset,
 589                                     offset + len);
 590                         } else if ((ioflg & IO_APPEND) != 0) {
 591                                 rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 592                         } else {
 593                                 rl_cookie = vn_rangelock_wlock(vp, offset,
 594                                     offset + len);
 595                         }
 596                 } else
 597                         rl_cookie = NULL;
 598                 mp = NULL;
 599                 if (rw == UIO_WRITE) {
 600                         if (vp->v_type != VCHR &&
 601                             (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
 602                             != 0)
 603                                 goto out;
 604                         if (MNT_SHARED_WRITES(mp) ||
 605                             ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
 606                                 lock_flags = LK_SHARED;
 607                         else
 608                                 lock_flags = LK_EXCLUSIVE;
 609                 } else
 610                         lock_flags = LK_SHARED;
 611                 vn_lock(vp, lock_flags | LK_RETRY);
 612         } else
 613                 rl_cookie = NULL;
 614
 615         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 616 #ifdef MAC
 617         if ((ioflg & IO_NOMACCHECK) == 0) {
 618                 if (rw == UIO_READ)
 619                         error = mac_vnode_check_read(active_cred, file_cred,
 620                             vp);
 621                 else
 622                         error = mac_vnode_check_write(active_cred, file_cred,
 623                             vp);
 624         }
 625 #endif
 626         if (error == 0) {
 627                 if (file_cred != NULL)
 628                         cred = file_cred;
 629                 else
 630                         cred = active_cred;
 631                 if (do_vn_io_fault(vp, &auio)) {
 632                         args.kind = VN_IO_FAULT_VOP;
 633                         args.cred = cred;
 634                         args.flags = ioflg;
 635                         args.args.vop_args.vp = vp;
 636                         error = vn_io_fault1(vp, &auio, &args, td);
 637                 } else if (rw == UIO_READ) {
 638                         error = VOP_READ(vp, &auio, ioflg, cred);
 639                 } else /* if (rw == UIO_WRITE) */ {
 640                         error = VOP_WRITE(vp, &auio, ioflg, cred);
 641                 }
 642         }
 643         if (aresid)
 644                 *aresid = auio.uio_resid;
 645         else
 646                 if (auio.uio_resid && error == 0)
 647                         error = EIO;
 648         if ((ioflg & IO_NODELOCKED) == 0) {
 649                 VOP_UNLOCK(vp);
 650                 if (mp != NULL)
 651                         vn_finished_write(mp);
 652         }
 653  out:
 654         if (rl_cookie != NULL)
 655                 vn_rangelock_unlock(vp, rl_cookie);
 656         return (error);
 657 }
 658
 659 /*
 660  * Package up an I/O request on a vnode into a uio and do it.  The I/O
 661  * request is split up into smaller chunks and we try to avoid saturating
 662  * the buffer cache while potentially holding a vnode locked, so we
 663  * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
 664  * to give other processes a chance to lock the vnode (either other processes
 665  * core'ing the same binary, or unrelated processes scanning the directory).
 666  */
 667 int
 668 vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len,
 669     off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred,
 670     struct ucred *file_cred, size_t *aresid, struct thread *td)
 671 {
 672         int error = 0;
 673         ssize_t iaresid;
 674
 675         do {
 676                 int chunk;
 677
 678                 /*
 679                  * Force `offset' to a multiple of MAXBSIZE except possibly
 680                  * for the first chunk, so that filesystems only need to
 681                  * write full blocks except possibly for the first and last
 682                  * chunks.
 683                  */
 684                 chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
 685
 686                 if (chunk > len)
 687                         chunk = len;
 688                 if (rw != UIO_READ && vp->v_type == VREG)
 689                         bwillwrite();
 690                 iaresid = 0;
 691                 error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
 692                     ioflg, active_cred, file_cred, &iaresid, td);
 693                 len -= chunk;   /* aresid calc already includes length */
 694                 if (error)
 695                         break;
 696                 offset += chunk;
 697                 base = (char *)base + chunk;
 698                 kern_yield(PRI_USER);
 699         } while (len);
 700         if (aresid)
 701                 *aresid = len + iaresid;
 702         return (error);
 703 }
 704
 705 #if OFF_MAX <= LONG_MAX
 706 off_t
 707 foffset_lock(struct file *fp, int flags)
 708 {
 709         volatile short *flagsp;
 710         off_t res;
 711         short state;
 712
 713         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 714
 715         if ((flags & FOF_NOLOCK) != 0)
 716                 return (atomic_load_long(&fp->f_offset));
 717
 718         /*
 719          * According to McKusick the vn lock was protecting f_offset here.
 720          * It is now protected by the FOFFSET_LOCKED flag.
 721          */
 722         flagsp = &fp->f_vnread_flags;
 723         if (atomic_cmpset_acq_16(flagsp, 0, FOFFSET_LOCKED))
 724                 return (atomic_load_long(&fp->f_offset));
 725
 726         sleepq_lock(&fp->f_vnread_flags);
 727         state = atomic_load_16(flagsp);
 728         for (;;) {
 729                 if ((state & FOFFSET_LOCKED) == 0) {
 730                         if (!atomic_fcmpset_acq_16(flagsp, &state,
 731                             FOFFSET_LOCKED))
 732                                 continue;
 733                         break;
 734                 }
 735                 if ((state & FOFFSET_LOCK_WAITING) == 0) {
 736                         if (!atomic_fcmpset_acq_16(flagsp, &state,
 737                             state | FOFFSET_LOCK_WAITING))
 738                                 continue;
 739                 }
 740                 DROP_GIANT();
 741                 sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0);
 742                 sleepq_wait(&fp->f_vnread_flags, PUSER -1);
 743                 PICKUP_GIANT();
 744                 sleepq_lock(&fp->f_vnread_flags);
 745                 state = atomic_load_16(flagsp);
 746         }
 747         res = atomic_load_long(&fp->f_offset);
 748         sleepq_release(&fp->f_vnread_flags);
 749         return (res);
 750 }
 751
 752 void
 753 foffset_unlock(struct file *fp, off_t val, int flags)
 754 {
 755         volatile short *flagsp;
 756         short state;
 757
 758         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 759
 760         if ((flags & FOF_NOUPDATE) == 0)
 761                 atomic_store_long(&fp->f_offset, val);
 762         if ((flags & FOF_NEXTOFF_R) != 0)
 763                 fp->f_nextoff[UIO_READ] = val;
 764         if ((flags & FOF_NEXTOFF_W) != 0)
 765                 fp->f_nextoff[UIO_WRITE] = val;
 766
 767         if ((flags & FOF_NOLOCK) != 0)
 768                 return;
 769
 770         flagsp = &fp->f_vnread_flags;
 771         state = atomic_load_16(flagsp);
 772         if ((state & FOFFSET_LOCK_WAITING) == 0 &&
 773             atomic_cmpset_rel_16(flagsp, state, 0))
 774                 return;
 775
 776         sleepq_lock(&fp->f_vnread_flags);
 777         MPASS((fp->f_vnread_flags & FOFFSET_LOCKED) != 0);
 778         MPASS((fp->f_vnread_flags & FOFFSET_LOCK_WAITING) != 0);
 779         fp->f_vnread_flags = 0;
 780         sleepq_broadcast(&fp->f_vnread_flags, SLEEPQ_SLEEP, 0, 0);
 781         sleepq_release(&fp->f_vnread_flags);
 782 }
 783 #else
 784 off_t
 785 foffset_lock(struct file *fp, int flags)
 786 {
 787         struct mtx *mtxp;
 788         off_t res;
 789
 790         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 791
 792         mtxp = mtx_pool_find(mtxpool_sleep, fp);
 793         mtx_lock(mtxp);
 794         if ((flags & FOF_NOLOCK) == 0) {
 795                 while (fp->f_vnread_flags & FOFFSET_LOCKED) {
 796                         fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
 797                         msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
 798                             "vofflock", 0);
 799                 }
 800                 fp->f_vnread_flags |= FOFFSET_LOCKED;
 801         }
 802         res = fp->f_offset;
 803         mtx_unlock(mtxp);
 804         return (res);
 805 }
 806
 807 void
 808 foffset_unlock(struct file *fp, off_t val, int flags)
 809 {
 810         struct mtx *mtxp;
 811
 812         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 813
 814         mtxp = mtx_pool_find(mtxpool_sleep, fp);
 815         mtx_lock(mtxp);
 816         if ((flags & FOF_NOUPDATE) == 0)
 817                 fp->f_offset = val;
 818         if ((flags & FOF_NEXTOFF_R) != 0)
 819                 fp->f_nextoff[UIO_READ] = val;
 820         if ((flags & FOF_NEXTOFF_W) != 0)
 821                 fp->f_nextoff[UIO_WRITE] = val;
 822         if ((flags & FOF_NOLOCK) == 0) {
 823                 KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
 824                     ("Lost FOFFSET_LOCKED"));
 825                 if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
 826                         wakeup(&fp->f_vnread_flags);
 827                 fp->f_vnread_flags = 0;
 828         }
 829         mtx_unlock(mtxp);
 830 }
 831 #endif
 832
 833 void
 834 foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
 835 {
 836
 837         if ((flags & FOF_OFFSET) == 0)
 838                 uio->uio_offset = foffset_lock(fp, flags);
 839 }
 840
 841 void
 842 foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
 843 {
 844
 845         if ((flags & FOF_OFFSET) == 0)
 846                 foffset_unlock(fp, uio->uio_offset, flags);
 847 }
 848
 849 static int
 850 get_advice(struct file *fp, struct uio *uio)
 851 {
 852         struct mtx *mtxp;
 853         int ret;
 854
 855         ret = POSIX_FADV_NORMAL;
 856         if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG)
 857                 return (ret);
 858
 859         mtxp = mtx_pool_find(mtxpool_sleep, fp);
 860         mtx_lock(mtxp);
 861         if (fp->f_advice != NULL &&
 862             uio->uio_offset >= fp->f_advice->fa_start &&
 863             uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
 864                 ret = fp->f_advice->fa_advice;
 865         mtx_unlock(mtxp);
 866         return (ret);
 867 }
 868
 869 int
 870 vn_read_from_obj(struct vnode *vp, struct uio *uio)
 871 {
 872         vm_object_t obj;
 873         vm_page_t ma[io_hold_cnt + 2];
 874         off_t off, vsz;
 875         ssize_t resid;
 876         int error, i, j;
 877
 878         MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2));
 879         obj = atomic_load_ptr(&vp->v_object);
 880         if (obj == NULL)
 881                 return (EJUSTRETURN);
 882
 883         /*
 884          * Depends on type stability of vm_objects.
 885          */
 886         vm_object_pip_add(obj, 1);
 887         if ((obj->flags & OBJ_DEAD) != 0) {
 888                 /*
 889                  * Note that object might be already reused from the
 890                  * vnode, and the OBJ_DEAD flag cleared.  This is fine,
 891                  * we recheck for DOOMED vnode state after all pages
 892                  * are busied, and retract then.
 893                  *
 894                  * But we check for OBJ_DEAD to ensure that we do not
 895                  * busy pages while vm_object_terminate_pages()
 896                  * processes the queue.
 897                  */
 898                 error = EJUSTRETURN;
 899                 goto out_pip;
 900         }
 901
 902         resid = uio->uio_resid;
 903         off = uio->uio_offset;
 904         for (i = 0; resid > 0; i++) {
 905                 MPASS(i < io_hold_cnt + 2);
 906                 ma[i] = vm_page_grab_unlocked(obj, atop(off),
 907                     VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY |
 908                     VM_ALLOC_NOWAIT);
 909                 if (ma[i] == NULL)
 910                         break;
 911
 912                 /*
 913                  * Skip invalid pages.  Valid mask can be partial only
 914                  * at EOF, and we clip later.
 915                  */
 916                 if (vm_page_none_valid(ma[i])) {
 917                         vm_page_sunbusy(ma[i]);
 918                         break;
 919                 }
 920
 921                 resid -= PAGE_SIZE;
 922                 off += PAGE_SIZE;
 923         }
 924         if (i == 0) {
 925                 error = EJUSTRETURN;
 926                 goto out_pip;
 927         }
 928
 929         /*
 930          * Check VIRF_DOOMED after we busied our pages.  Since
 931          * vgonel() terminates the vnode' vm_object, it cannot
 932          * process past pages busied by us.
 933          */
 934         if (VN_IS_DOOMED(vp)) {
 935                 error = EJUSTRETURN;
 936                 goto out;
 937         }
 938
 939         resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1);
 940         if (resid > uio->uio_resid)
 941                 resid = uio->uio_resid;
 942
 943         /*
 944          * Unlocked read of vnp_size is safe because truncation cannot
 945          * pass busied page.  But we load vnp_size into a local
 946          * variable so that possible concurrent extension does not
 947          * break calculation.
 948          */
 949 #if defined(__powerpc__) && !defined(__powerpc64__)
 950         vsz = obj->un_pager.vnp.vnp_size;
 951 #else
 952         vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size);
 953 #endif
 954         if (uio->uio_offset + resid > vsz)
 955                 resid = vsz - uio->uio_offset;
 956
 957         error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio);
 958
 959 out:
 960         for (j = 0; j < i; j++) {
 961                 if (error == 0)
 962                         vm_page_reference(ma[j]);
 963                 vm_page_sunbusy(ma[j]);
 964         }
 965 out_pip:
 966         vm_object_pip_wakeup(obj);
 967         if (error != 0)
 968                 return (error);
 969         return (uio->uio_resid == 0 ? 0 : EJUSTRETURN);
 970 }
 971
 972 /*
 973  * File table vnode read routine.
 974  */
 975 static int
 976 vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
 977     struct thread *td)
 978 {
 979         struct vnode *vp;
 980         off_t orig_offset;
 981         int error, ioflag;
 982         int advice;
 983
 984         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 985             uio->uio_td, td));
 986         KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 987         vp = fp->f_vnode;
 988         ioflag = 0;
 989         if (fp->f_flag & FNONBLOCK)
 990                 ioflag |= IO_NDELAY;
 991         if (fp->f_flag & O_DIRECT)
 992                 ioflag |= IO_DIRECT;
 993
 994         /*
 995          * Try to read from page cache.  VIRF_DOOMED check is racy but
 996          * allows us to avoid unneeded work outright.
 997          */
 998         if (vn_io_pgcache_read_enable && !mac_vnode_check_read_enabled() &&
 999             (vn_irflag_read(vp) & (VIRF_DOOMED | VIRF_PGREAD)) == VIRF_PGREAD) {
1000                 error = VOP_READ_PGCACHE(vp, uio, ioflag, fp->f_cred);
1001                 if (error == 0) {
1002                         fp->f_nextoff[UIO_READ] = uio->uio_offset;
1003                         return (0);
1004                 }
1005                 if (error != EJUSTRETURN)
1006                         return (error);
1007         }
1008
1009         advice = get_advice(fp, uio);
1010         vn_lock(vp, LK_SHARED | LK_RETRY);
1011
1012         switch (advice) {
1013         case POSIX_FADV_NORMAL:
1014         case POSIX_FADV_SEQUENTIAL:
1015         case POSIX_FADV_NOREUSE:
1016                 ioflag |= sequential_heuristic(uio, fp);
1017                 break;
1018         case POSIX_FADV_RANDOM:
1019                 /* Disable read-ahead for random I/O. */
1020                 break;
1021         }
1022         orig_offset = uio->uio_offset;
1023
1024 #ifdef MAC
1025         error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
1026         if (error == 0)
1027 #endif
1028                 error = VOP_READ(vp, uio, ioflag, fp->f_cred);
1029         fp->f_nextoff[UIO_READ] = uio->uio_offset;
1030         VOP_UNLOCK(vp);
1031         if (error == 0 && advice == POSIX_FADV_NOREUSE &&
1032             orig_offset != uio->uio_offset)
1033                 /*
1034                  * Use POSIX_FADV_DONTNEED to flush pages and buffers
1035                  * for the backing file after a POSIX_FADV_NOREUSE
1036                  * read(2).
1037                  */
1038                 error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
1039                     POSIX_FADV_DONTNEED);
1040         return (error);
1041 }
1042
1043 /*
1044  * File table vnode write routine.
1045  */
1046 static int
1047 vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
1048     struct thread *td)
1049 {
1050         struct vnode *vp;
1051         struct mount *mp;
1052         off_t orig_offset;
1053         int error, ioflag, lock_flags;
1054         int advice;
1055
1056         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
1057             uio->uio_td, td));
1058         KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
1059         vp = fp->f_vnode;
1060         if (vp->v_type == VREG)
1061                 bwillwrite();
1062         ioflag = IO_UNIT;
1063         if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
1064                 ioflag |= IO_APPEND;
1065         if (fp->f_flag & FNONBLOCK)
1066                 ioflag |= IO_NDELAY;
1067         if (fp->f_flag & O_DIRECT)
1068                 ioflag |= IO_DIRECT;
1069         if ((fp->f_flag & O_FSYNC) ||
1070             (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
1071                 ioflag |= IO_SYNC;
1072         /*
1073          * For O_DSYNC we set both IO_SYNC and IO_DATASYNC, so that VOP_WRITE()
1074          * implementations that don't understand IO_DATASYNC fall back to full
1075          * O_SYNC behavior.
1076          */
1077         if (fp->f_flag & O_DSYNC)
1078                 ioflag |= IO_SYNC | IO_DATASYNC;
1079         mp = NULL;
1080         if (vp->v_type != VCHR &&
1081             (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
1082                 goto unlock;
1083
1084         advice = get_advice(fp, uio);
1085
1086         if (MNT_SHARED_WRITES(mp) ||
1087             (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) {
1088                 lock_flags = LK_SHARED;
1089         } else {
1090                 lock_flags = LK_EXCLUSIVE;
1091         }
1092
1093         vn_lock(vp, lock_flags | LK_RETRY);
1094         switch (advice) {
1095         case POSIX_FADV_NORMAL:
1096         case POSIX_FADV_SEQUENTIAL:
1097         case POSIX_FADV_NOREUSE:
1098                 ioflag |= sequential_heuristic(uio, fp);
1099                 break;
1100         case POSIX_FADV_RANDOM:
1101                 /* XXX: Is this correct? */
1102                 break;
1103         }
1104         orig_offset = uio->uio_offset;
1105
1106 #ifdef MAC
1107         error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
1108         if (error == 0)
1109 #endif
1110                 error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
1111         fp->f_nextoff[UIO_WRITE] = uio->uio_offset;
1112         VOP_UNLOCK(vp);
1113         if (vp->v_type != VCHR)
1114                 vn_finished_write(mp);
1115         if (error == 0 && advice == POSIX_FADV_NOREUSE &&
1116             orig_offset != uio->uio_offset)
1117                 /*
1118                  * Use POSIX_FADV_DONTNEED to flush pages and buffers
1119                  * for the backing file after a POSIX_FADV_NOREUSE
1120                  * write(2).
1121                  */
1122                 error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
1123                     POSIX_FADV_DONTNEED);
1124 unlock:
1125         return (error);
1126 }
1127
1128 /*
1129  * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
1130  * prevent the following deadlock:
1131  *
1132  * Assume that the thread A reads from the vnode vp1 into userspace
1133  * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
1134  * currently not resident, then system ends up with the call chain
1135  *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
1136  *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
1137  * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
1138  * If, at the same time, thread B reads from vnode vp2 into buffer buf2
1139  * backed by the pages of vnode vp1, and some page in buf2 is not
1140  * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
1141  *
1142  * To prevent the lock order reversal and deadlock, vn_io_fault() does
1143  * not allow page faults to happen during VOP_READ() or VOP_WRITE().
1144  * Instead, it first tries to do the whole range i/o with pagefaults
1145  * disabled. If all pages in the i/o buffer are resident and mapped,
1146  * VOP will succeed (ignoring the genuine filesystem errors).
1147  * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
1148  * i/o in chunks, with all pages in the chunk prefaulted and held
1149  * using vm_fault_quick_hold_pages().
1150  *
1151  * Filesystems using this deadlock avoidance scheme should use the
1152  * array of the held pages from uio, saved in the curthread->td_ma,
1153  * instead of doing uiomove().  A helper function
1154  * vn_io_fault_uiomove() converts uiomove request into
1155  * uiomove_fromphys() over td_ma array.
1156  *
1157  * Since vnode locks do not cover the whole i/o anymore, rangelocks
1158  * make the current i/o request atomic with respect to other i/os and
1159  * truncations.
1160  */
1161
1162 /*
1163  * Decode vn_io_fault_args and perform the corresponding i/o.
1164  */
1165 static int
1166 vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
1167     struct thread *td)
1168 {
1169         int error, save;
1170
1171         error = 0;
1172         save = vm_fault_disable_pagefaults();
1173         switch (args->kind) {
1174         case VN_IO_FAULT_FOP:
1175                 error = (args->args.fop_args.doio)(args->args.fop_args.fp,
1176                     uio, args->cred, args->flags, td);
1177                 break;
1178         case VN_IO_FAULT_VOP:
1179                 if (uio->uio_rw == UIO_READ) {
1180                         error = VOP_READ(args->args.vop_args.vp, uio,
1181                             args->flags, args->cred);
1182                 } else if (uio->uio_rw == UIO_WRITE) {
1183                         error = VOP_WRITE(args->args.vop_args.vp, uio,
1184                             args->flags, args->cred);
1185                 }
1186                 break;
1187         default:
1188                 panic("vn_io_fault_doio: unknown kind of io %d %d",
1189                     args->kind, uio->uio_rw);
1190         }
1191         vm_fault_enable_pagefaults(save);
1192         return (error);
1193 }
1194
1195 static int
1196 vn_io_fault_touch(char *base, const struct uio *uio)
1197 {
1198         int r;
1199
1200         r = fubyte(base);
1201         if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1))
1202                 return (EFAULT);
1203         return (0);
1204 }
1205
1206 static int
1207 vn_io_fault_prefault_user(const struct uio *uio)
1208 {
1209         char *base;
1210         const struct iovec *iov;
1211         size_t len;
1212         ssize_t resid;
1213         int error, i;
1214
1215         KASSERT(uio->uio_segflg == UIO_USERSPACE,
1216             ("vn_io_fault_prefault userspace"));
1217
1218         error = i = 0;
1219         iov = uio->uio_iov;
1220         resid = uio->uio_resid;
1221         base = iov->iov_base;
1222         len = iov->iov_len;
1223         while (resid > 0) {
1224                 error = vn_io_fault_touch(base, uio);
1225                 if (error != 0)
1226                         break;
1227                 if (len < PAGE_SIZE) {
1228                         if (len != 0) {
1229                                 error = vn_io_fault_touch(base + len - 1, uio);
1230                                 if (error != 0)
1231                                         break;
1232                                 resid -= len;
1233                         }
1234                         if (++i >= uio->uio_iovcnt)
1235                                 break;
1236                         iov = uio->uio_iov + i;
1237                         base = iov->iov_base;
1238                         len = iov->iov_len;
1239                 } else {
1240                         len -= PAGE_SIZE;
1241                         base += PAGE_SIZE;
1242                         resid -= PAGE_SIZE;
1243                 }
1244         }
1245         return (error);
1246 }
1247
1248 /*
1249  * Common code for vn_io_fault(), agnostic to the kind of i/o request.
1250  * Uses vn_io_fault_doio() to make the call to an actual i/o function.
1251  * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
1252  * into args and call vn_io_fault1() to handle faults during the user
1253  * mode buffer accesses.
1254  */
1255 static int
1256 vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
1257     struct thread *td)
1258 {
1259         vm_page_t ma[io_hold_cnt + 2];
1260         struct uio *uio_clone, short_uio;
1261         struct iovec short_iovec[1];
1262         vm_page_t *prev_td_ma;
1263         vm_prot_t prot;
1264         vm_offset_t addr, end;
1265         size_t len, resid;
1266         ssize_t adv;
1267         int error, cnt, saveheld, prev_td_ma_cnt;
1268
1269         if (vn_io_fault_prefault) {
1270                 error = vn_io_fault_prefault_user(uio);
1271                 if (error != 0)
1272                         return (error); /* Or ignore ? */
1273         }
1274
1275         prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
1276
1277         /*
1278          * The UFS follows IO_UNIT directive and replays back both
1279          * uio_offset and uio_resid if an error is encountered during the
1280          * operation.  But, since the iovec may be already advanced,
1281          * uio is still in an inconsistent state.
1282          *
1283          * Cache a copy of the original uio, which is advanced to the redo
1284          * point using UIO_NOCOPY below.
1285          */
1286         uio_clone = cloneuio(uio);
1287         resid = uio->uio_resid;
1288
1289         short_uio.uio_segflg = UIO_USERSPACE;
1290         short_uio.uio_rw = uio->uio_rw;
1291         short_uio.uio_td = uio->uio_td;
1292
1293         error = vn_io_fault_doio(args, uio, td);
1294         if (error != EFAULT)
1295                 goto out;
1296
1297         atomic_add_long(&vn_io_faults_cnt, 1);
1298         uio_clone->uio_segflg = UIO_NOCOPY;
1299         uiomove(NULL, resid - uio->uio_resid, uio_clone);
1300         uio_clone->uio_segflg = uio->uio_segflg;
1301
1302         saveheld = curthread_pflags_set(TDP_UIOHELD);
1303         prev_td_ma = td->td_ma;
1304         prev_td_ma_cnt = td->td_ma_cnt;
1305
1306         while (uio_clone->uio_resid != 0) {
1307                 len = uio_clone->uio_iov->iov_len;
1308                 if (len == 0) {
1309                         KASSERT(uio_clone->uio_iovcnt >= 1,
1310                             ("iovcnt underflow"));
1311                         uio_clone->uio_iov++;
1312                         uio_clone->uio_iovcnt--;
1313                         continue;
1314                 }
1315                 if (len > ptoa(io_hold_cnt))
1316                         len = ptoa(io_hold_cnt);
1317                 addr = (uintptr_t)uio_clone->uio_iov->iov_base;
1318                 end = round_page(addr + len);
1319                 if (end < addr) {
1320                         error = EFAULT;
1321                         break;
1322                 }
1323                 cnt = atop(end - trunc_page(addr));
1324                 /*
1325                  * A perfectly misaligned address and length could cause
1326                  * both the start and the end of the chunk to use partial
1327                  * page.  +2 accounts for such a situation.
1328                  */
1329                 cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
1330                     addr, len, prot, ma, io_hold_cnt + 2);
1331                 if (cnt == -1) {
1332                         error = EFAULT;
1333                         break;
1334                 }
1335                 short_uio.uio_iov = &short_iovec[0];
1336                 short_iovec[0].iov_base = (void *)addr;
1337                 short_uio.uio_iovcnt = 1;
1338                 short_uio.uio_resid = short_iovec[0].iov_len = len;
1339                 short_uio.uio_offset = uio_clone->uio_offset;
1340                 td->td_ma = ma;
1341                 td->td_ma_cnt = cnt;
1342
1343                 error = vn_io_fault_doio(args, &short_uio, td);
1344                 vm_page_unhold_pages(ma, cnt);
1345                 adv = len - short_uio.uio_resid;
1346
1347                 uio_clone->uio_iov->iov_base =
1348                     (char *)uio_clone->uio_iov->iov_base + adv;
1349                 uio_clone->uio_iov->iov_len -= adv;
1350                 uio_clone->uio_resid -= adv;
1351                 uio_clone->uio_offset += adv;
1352
1353                 uio->uio_resid -= adv;
1354                 uio->uio_offset += adv;
1355
1356                 if (error != 0 || adv == 0)
1357                         break;
1358         }
1359         td->td_ma = prev_td_ma;
1360         td->td_ma_cnt = prev_td_ma_cnt;
1361         curthread_pflags_restore(saveheld);
1362 out:
1363         free(uio_clone, M_IOV);
1364         return (error);
1365 }
1366
1367 static int
1368 vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
1369     int flags, struct thread *td)
1370 {
1371         fo_rdwr_t *doio;
1372         struct vnode *vp;
1373         void *rl_cookie;
1374         struct vn_io_fault_args args;
1375         int error;
1376
1377         doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
1378         vp = fp->f_vnode;
1379
1380         /*
1381          * The ability to read(2) on a directory has historically been
1382          * allowed for all users, but this can and has been the source of
1383          * at least one security issue in the past.  As such, it is now hidden
1384          * away behind a sysctl for those that actually need it to use it, and
1385          * restricted to root when it's turned on to make it relatively safe to
1386          * leave on for longer sessions of need.
1387          */
1388         if (vp->v_type == VDIR) {
1389                 KASSERT(uio->uio_rw == UIO_READ,
1390                     ("illegal write attempted on a directory"));
1391                 if (!vfs_allow_read_dir)
1392                         return (EISDIR);
1393                 if ((error = priv_check(td, PRIV_VFS_READ_DIR)) != 0)
1394                         return (EISDIR);
1395         }
1396
1397         foffset_lock_uio(fp, uio, flags);
1398         if (do_vn_io_fault(vp, uio)) {
1399                 args.kind = VN_IO_FAULT_FOP;
1400                 args.args.fop_args.fp = fp;
1401                 args.args.fop_args.doio = doio;
1402                 args.cred = active_cred;
1403                 args.flags = flags | FOF_OFFSET;
1404                 if (uio->uio_rw == UIO_READ) {
1405                         rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
1406                             uio->uio_offset + uio->uio_resid);
1407                 } else if ((fp->f_flag & O_APPEND) != 0 ||
1408                     (flags & FOF_OFFSET) == 0) {
1409                         /* For appenders, punt and lock the whole range. */
1410                         rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1411                 } else {
1412                         rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
1413                             uio->uio_offset + uio->uio_resid);
1414                 }
1415                 error = vn_io_fault1(vp, uio, &args, td);
1416                 vn_rangelock_unlock(vp, rl_cookie);
1417         } else {
1418                 error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
1419         }
1420         foffset_unlock_uio(fp, uio, flags);
1421         return (error);
1422 }
1423
1424 /*
1425  * Helper function to perform the requested uiomove operation using
1426  * the held pages for io->uio_iov[0].iov_base buffer instead of
1427  * copyin/copyout.  Access to the pages with uiomove_fromphys()
1428  * instead of iov_base prevents page faults that could occur due to
1429  * pmap_collect() invalidating the mapping created by
1430  * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
1431  * object cleanup revoking the write access from page mappings.
1432  *
1433  * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
1434  * instead of plain uiomove().
1435  */
1436 int
1437 vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
1438 {
1439         struct uio transp_uio;
1440         struct iovec transp_iov[1];
1441         struct thread *td;
1442         size_t adv;
1443         int error, pgadv;
1444
1445         td = curthread;
1446         if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1447             uio->uio_segflg != UIO_USERSPACE)
1448                 return (uiomove(data, xfersize, uio));
1449
1450         KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1451         transp_iov[0].iov_base = data;
1452         transp_uio.uio_iov = &transp_iov[0];
1453         transp_uio.uio_iovcnt = 1;
1454         if (xfersize > uio->uio_resid)
1455                 xfersize = uio->uio_resid;
1456         transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
1457         transp_uio.uio_offset = 0;
1458         transp_uio.uio_segflg = UIO_SYSSPACE;
1459         /*
1460          * Since transp_iov points to data, and td_ma page array
1461          * corresponds to original uio->uio_iov, we need to invert the
1462          * direction of the i/o operation as passed to
1463          * uiomove_fromphys().
1464          */
1465         switch (uio->uio_rw) {
1466         case UIO_WRITE:
1467                 transp_uio.uio_rw = UIO_READ;
1468                 break;
1469         case UIO_READ:
1470                 transp_uio.uio_rw = UIO_WRITE;
1471                 break;
1472         }
1473         transp_uio.uio_td = uio->uio_td;
1474         error = uiomove_fromphys(td->td_ma,
1475             ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
1476             xfersize, &transp_uio);
1477         adv = xfersize - transp_uio.uio_resid;
1478         pgadv =
1479             (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
1480             (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
1481         td->td_ma += pgadv;
1482         KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1483             pgadv));
1484         td->td_ma_cnt -= pgadv;
1485         uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
1486         uio->uio_iov->iov_len -= adv;
1487         uio->uio_resid -= adv;
1488         uio->uio_offset += adv;
1489         return (error);
1490 }
1491
1492 int
1493 vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
1494     struct uio *uio)
1495 {
1496         struct thread *td;
1497         vm_offset_t iov_base;
1498         int cnt, pgadv;
1499
1500         td = curthread;
1501         if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1502             uio->uio_segflg != UIO_USERSPACE)
1503                 return (uiomove_fromphys(ma, offset, xfersize, uio));
1504
1505         KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1506         cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
1507         iov_base = (vm_offset_t)uio->uio_iov->iov_base;
1508         switch (uio->uio_rw) {
1509         case UIO_WRITE:
1510                 pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
1511                     offset, cnt);
1512                 break;
1513         case UIO_READ:
1514                 pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
1515                     cnt);
1516                 break;
1517         }
1518         pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
1519         td->td_ma += pgadv;
1520         KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1521             pgadv));
1522         td->td_ma_cnt -= pgadv;
1523         uio->uio_iov->iov_base = (char *)(iov_base + cnt);
1524         uio->uio_iov->iov_len -= cnt;
1525         uio->uio_resid -= cnt;
1526         uio->uio_offset += cnt;
1527         return (0);
1528 }
1529
1530 /*
1531  * File table truncate routine.
1532  */
1533 static int
1534 vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
1535     struct thread *td)
1536 {
1537         struct mount *mp;
1538         struct vnode *vp;
1539         void *rl_cookie;
1540         int error;
1541
1542         vp = fp->f_vnode;
1543
1544 retry:
1545         /*
1546          * Lock the whole range for truncation.  Otherwise split i/o
1547          * might happen partly before and partly after the truncation.
1548          */
1549         rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1550         error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
1551         if (error)
1552                 goto out1;
1553         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1554         AUDIT_ARG_VNODE1(vp);
1555         if (vp->v_type == VDIR) {
1556                 error = EISDIR;
1557                 goto out;
1558         }
1559 #ifdef MAC
1560         error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
1561         if (error)
1562                 goto out;
1563 #endif
1564         error = vn_truncate_locked(vp, length, (fp->f_flag & O_FSYNC) != 0,
1565             fp->f_cred);
1566 out:
1567         VOP_UNLOCK(vp);
1568         vn_finished_write(mp);
1569 out1:
1570         vn_rangelock_unlock(vp, rl_cookie);
1571         if (error == ERELOOKUP)
1572                 goto retry;
1573         return (error);
1574 }
1575
1576 /*
1577  * Truncate a file that is already locked.
1578  */
1579 int
1580 vn_truncate_locked(struct vnode *vp, off_t length, bool sync,
1581     struct ucred *cred)
1582 {
1583         struct vattr vattr;
1584         int error;
1585
1586         error = VOP_ADD_WRITECOUNT(vp, 1);
1587         if (error == 0) {
1588                 VATTR_NULL(&vattr);
1589                 vattr.va_size = length;
1590                 if (sync)
1591                         vattr.va_vaflags |= VA_SYNC;
1592                 error = VOP_SETATTR(vp, &vattr, cred);
1593                 VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
1594         }
1595         return (error);
1596 }
1597
1598 /*
1599  * File table vnode stat routine.
1600  */
1601 static int
1602 vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred,
1603     struct thread *td)
1604 {
1605         struct vnode *vp = fp->f_vnode;
1606         int error;
1607
1608         vn_lock(vp, LK_SHARED | LK_RETRY);
1609         error = VOP_STAT(vp, sb, active_cred, fp->f_cred, td);
1610         VOP_UNLOCK(vp);
1611
1612         return (error);
1613 }
1614
1615 /*
1616  * File table vnode ioctl routine.
1617  */
1618 static int
1619 vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
1620     struct thread *td)
1621 {
1622         struct vattr vattr;
1623         struct vnode *vp;
1624         struct fiobmap2_arg *bmarg;
1625         int error;
1626
1627         vp = fp->f_vnode;
1628         switch (vp->v_type) {
1629         case VDIR:
1630         case VREG:
1631                 switch (com) {
1632                 case FIONREAD:
1633                         vn_lock(vp, LK_SHARED | LK_RETRY);
1634                         error = VOP_GETATTR(vp, &vattr, active_cred);
1635                         VOP_UNLOCK(vp);
1636                         if (error == 0)
1637                                 *(int *)data = vattr.va_size - fp->f_offset;
1638                         return (error);
1639                 case FIOBMAP2:
1640                         bmarg = (struct fiobmap2_arg *)data;
1641                         vn_lock(vp, LK_SHARED | LK_RETRY);
1642 #ifdef MAC
1643                         error = mac_vnode_check_read(active_cred, fp->f_cred,
1644                             vp);
1645                         if (error == 0)
1646 #endif
1647                                 error = VOP_BMAP(vp, bmarg->bn, NULL,
1648                                     &bmarg->bn, &bmarg->runp, &bmarg->runb);
1649                         VOP_UNLOCK(vp);
1650                         return (error);
1651                 case FIONBIO:
1652                 case FIOASYNC:
1653                         return (0);
1654                 default:
1655                         return (VOP_IOCTL(vp, com, data, fp->f_flag,
1656                             active_cred, td));
1657                 }
1658                 break;
1659         case VCHR:
1660                 return (VOP_IOCTL(vp, com, data, fp->f_flag,
1661                     active_cred, td));
1662         default:
1663                 return (ENOTTY);
1664         }
1665 }
1666
1667 /*
1668  * File table vnode poll routine.
1669  */
1670 static int
1671 vn_poll(struct file *fp, int events, struct ucred *active_cred,
1672     struct thread *td)
1673 {
1674         struct vnode *vp;
1675         int error;
1676
1677         vp = fp->f_vnode;
1678 #if defined(MAC) || defined(AUDIT)
1679         if (AUDITING_TD(td) || mac_vnode_check_poll_enabled()) {
1680                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1681                 AUDIT_ARG_VNODE1(vp);
1682                 error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
1683                 VOP_UNLOCK(vp);
1684                 if (error != 0)
1685                         return (error);
1686         }
1687 #endif
1688         error = VOP_POLL(vp, events, fp->f_cred, td);
1689         return (error);
1690 }
1691
1692 /*
1693  * Acquire the requested lock and then check for validity.  LK_RETRY
1694  * permits vn_lock to return doomed vnodes.
1695  */
1696 static int __noinline
1697 _vn_lock_fallback(struct vnode *vp, int flags, const char *file, int line,
1698     int error)
1699 {
1700
1701         KASSERT((flags & LK_RETRY) == 0 || error == 0,
1702             ("vn_lock: error %d incompatible with flags %#x", error, flags));
1703
1704         if (error == 0)
1705                 VNASSERT(VN_IS_DOOMED(vp), vp, ("vnode not doomed"));
1706
1707         if ((flags & LK_RETRY) == 0) {
1708                 if (error == 0) {
1709                         VOP_UNLOCK(vp);
1710                         error = ENOENT;
1711                 }
1712                 return (error);
1713         }
1714
1715         /*
1716          * LK_RETRY case.
1717          *
1718          * Nothing to do if we got the lock.
1719          */
1720         if (error == 0)
1721                 return (0);
1722
1723         /*
1724          * Interlock was dropped by the call in _vn_lock.
1725          */
1726         flags &= ~LK_INTERLOCK;
1727         do {
1728                 error = VOP_LOCK1(vp, flags, file, line);
1729         } while (error != 0);
1730         return (0);
1731 }
1732
1733 int
1734 _vn_lock(struct vnode *vp, int flags, const char *file, int line)
1735 {
1736         int error;
1737
1738         VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
1739             ("vn_lock: no locktype (%d passed)", flags));
1740         VNPASS(vp->v_holdcnt > 0, vp);
1741         error = VOP_LOCK1(vp, flags, file, line);
1742         if (__predict_false(error != 0 || VN_IS_DOOMED(vp)))
1743                 return (_vn_lock_fallback(vp, flags, file, line, error));
1744         return (0);
1745 }
1746
1747 /*
1748  * File table vnode close routine.
1749  */
1750 static int
1751 vn_closefile(struct file *fp, struct thread *td)
1752 {
1753         struct vnode *vp;
1754         struct flock lf;
1755         int error;
1756         bool ref;
1757
1758         vp = fp->f_vnode;
1759         fp->f_ops = &badfileops;
1760         ref= (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE;
1761
1762         error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref);
1763
1764         if (__predict_false(ref)) {
1765                 lf.l_whence = SEEK_SET;
1766                 lf.l_start = 0;
1767                 lf.l_len = 0;
1768                 lf.l_type = F_UNLCK;
1769                 (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
1770                 vrele(vp);
1771         }
1772         return (error);
1773 }
1774
1775 /*
1776  * Preparing to start a filesystem write operation. If the operation is
1777  * permitted, then we bump the count of operations in progress and
1778  * proceed. If a suspend request is in progress, we wait until the
1779  * suspension is over, and then proceed.
1780  */
1781 static int
1782 vn_start_write_refed(struct mount *mp, int flags, bool mplocked)
1783 {
1784         struct mount_pcpu *mpcpu;
1785         int error, mflags;
1786
1787         if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 &&
1788             vfs_op_thread_enter(mp, mpcpu)) {
1789                 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
1790                 vfs_mp_count_add_pcpu(mpcpu, writeopcount, 1);
1791                 vfs_op_thread_exit(mp, mpcpu);
1792                 return (0);
1793         }
1794
1795         if (mplocked)
1796                 mtx_assert(MNT_MTX(mp), MA_OWNED);
1797         else
1798                 MNT_ILOCK(mp);
1799
1800         error = 0;
1801
1802         /*
1803          * Check on status of suspension.
1804          */
1805         if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
1806             mp->mnt_susp_owner != curthread) {
1807                 mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ?
1808                     (flags & PCATCH) : 0) | (PUSER - 1);
1809                 while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1810                         if (flags & V_NOWAIT) {
1811                                 error = EWOULDBLOCK;
1812                                 goto unlock;
1813                         }
1814                         error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
1815                             "suspfs", 0);
1816                         if (error)
1817                                 goto unlock;
1818                 }
1819         }
1820         if (flags & V_XSLEEP)
1821                 goto unlock;
1822         mp->mnt_writeopcount++;
1823 unlock:
1824         if (error != 0 || (flags & V_XSLEEP) != 0)
1825                 MNT_REL(mp);
1826         MNT_IUNLOCK(mp);
1827         return (error);
1828 }
1829
1830 int
1831 vn_start_write(struct vnode *vp, struct mount **mpp, int flags)
1832 {
1833         struct mount *mp;
1834         int error;
1835
1836         KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
1837             ("V_MNTREF requires mp"));
1838
1839         error = 0;
1840         /*
1841          * If a vnode is provided, get and return the mount point that
1842          * to which it will write.
1843          */
1844         if (vp != NULL) {
1845                 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1846                         *mpp = NULL;
1847                         if (error != EOPNOTSUPP)
1848                                 return (error);
1849                         return (0);
1850                 }
1851         }
1852         if ((mp = *mpp) == NULL)
1853                 return (0);
1854
1855         /*
1856          * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1857          * a vfs_ref().
1858          * As long as a vnode is not provided we need to acquire a
1859          * refcount for the provided mountpoint too, in order to
1860          * emulate a vfs_ref().
1861          */
1862         if (vp == NULL && (flags & V_MNTREF) == 0)
1863                 vfs_ref(mp);
1864
1865         return (vn_start_write_refed(mp, flags, false));
1866 }
1867
1868 /*
1869  * Secondary suspension. Used by operations such as vop_inactive
1870  * routines that are needed by the higher level functions. These
1871  * are allowed to proceed until all the higher level functions have
1872  * completed (indicated by mnt_writeopcount dropping to zero). At that
1873  * time, these operations are halted until the suspension is over.
1874  */
1875 int
1876 vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags)
1877 {
1878         struct mount *mp;
1879         int error;
1880
1881         KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
1882             ("V_MNTREF requires mp"));
1883
1884  retry:
1885         if (vp != NULL) {
1886                 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1887                         *mpp = NULL;
1888                         if (error != EOPNOTSUPP)
1889                                 return (error);
1890                         return (0);
1891                 }
1892         }
1893         /*
1894          * If we are not suspended or have not yet reached suspended
1895          * mode, then let the operation proceed.
1896          */
1897         if ((mp = *mpp) == NULL)
1898                 return (0);
1899
1900         /*
1901          * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1902          * a vfs_ref().
1903          * As long as a vnode is not provided we need to acquire a
1904          * refcount for the provided mountpoint too, in order to
1905          * emulate a vfs_ref().
1906          */
1907         MNT_ILOCK(mp);
1908         if (vp == NULL && (flags & V_MNTREF) == 0)
1909                 MNT_REF(mp);
1910         if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
1911                 mp->mnt_secondary_writes++;
1912                 mp->mnt_secondary_accwrites++;
1913                 MNT_IUNLOCK(mp);
1914                 return (0);
1915         }
1916         if (flags & V_NOWAIT) {
1917                 MNT_REL(mp);
1918                 MNT_IUNLOCK(mp);
1919                 return (EWOULDBLOCK);
1920         }
1921         /*
1922          * Wait for the suspension to finish.
1923          */
1924         error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP |
1925             ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0),
1926             "suspfs", 0);
1927         vfs_rel(mp);
1928         if (error == 0)
1929                 goto retry;
1930         return (error);
1931 }
1932
1933 /*
1934  * Filesystem write operation has completed. If we are suspending and this
1935  * operation is the last one, notify the suspender that the suspension is
1936  * now in effect.
1937  */
1938 void
1939 vn_finished_write(struct mount *mp)
1940 {
1941         struct mount_pcpu *mpcpu;
1942         int c;
1943
1944         if (mp == NULL)
1945                 return;
1946
1947         if (vfs_op_thread_enter(mp, mpcpu)) {
1948                 vfs_mp_count_sub_pcpu(mpcpu, writeopcount, 1);
1949                 vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
1950                 vfs_op_thread_exit(mp, mpcpu);
1951                 return;
1952         }
1953
1954         MNT_ILOCK(mp);
1955         vfs_assert_mount_counters(mp);
1956         MNT_REL(mp);
1957         c = --mp->mnt_writeopcount;
1958         if (mp->mnt_vfs_ops == 0) {
1959                 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
1960                 MNT_IUNLOCK(mp);
1961                 return;
1962         }
1963         if (c < 0)
1964                 vfs_dump_mount_counters(mp);
1965         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0)
1966                 wakeup(&mp->mnt_writeopcount);
1967         MNT_IUNLOCK(mp);
1968 }
1969
1970 /*
1971  * Filesystem secondary write operation has completed. If we are
1972  * suspending and this operation is the last one, notify the suspender
1973  * that the suspension is now in effect.
1974  */
1975 void
1976 vn_finished_secondary_write(struct mount *mp)
1977 {
1978         if (mp == NULL)
1979                 return;
1980         MNT_ILOCK(mp);
1981         MNT_REL(mp);
1982         mp->mnt_secondary_writes--;
1983         if (mp->mnt_secondary_writes < 0)
1984                 panic("vn_finished_secondary_write: neg cnt");
1985         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1986             mp->mnt_secondary_writes <= 0)
1987                 wakeup(&mp->mnt_secondary_writes);
1988         MNT_IUNLOCK(mp);
1989 }
1990
1991 /*
1992  * Request a filesystem to suspend write operations.
1993  */
1994 int
1995 vfs_write_suspend(struct mount *mp, int flags)
1996 {
1997         int error;
1998
1999         vfs_op_enter(mp);
2000
2001         MNT_ILOCK(mp);
2002         vfs_assert_mount_counters(mp);
2003         if (mp->mnt_susp_owner == curthread) {
2004                 vfs_op_exit_locked(mp);
2005                 MNT_IUNLOCK(mp);
2006                 return (EALREADY);
2007         }
2008         while (mp->mnt_kern_flag & MNTK_SUSPEND)
2009                 msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
2010
2011         /*
2012          * Unmount holds a write reference on the mount point.  If we
2013          * own busy reference and drain for writers, we deadlock with
2014          * the reference draining in the unmount path.  Callers of
2015          * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
2016          * vfs_busy() reference is owned and caller is not in the
2017          * unmount context.
2018          */
2019         if ((flags & VS_SKIP_UNMOUNT) != 0 &&
2020             (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
2021                 vfs_op_exit_locked(mp);
2022                 MNT_IUNLOCK(mp);
2023                 return (EBUSY);
2024         }
2025
2026         mp->mnt_kern_flag |= MNTK_SUSPEND;
2027         mp->mnt_susp_owner = curthread;
2028         if (mp->mnt_writeopcount > 0)
2029                 (void) msleep(&mp->mnt_writeopcount,
2030                     MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
2031         else
2032                 MNT_IUNLOCK(mp);
2033         if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) {
2034                 vfs_write_resume(mp, 0);
2035                 /* vfs_write_resume does vfs_op_exit() for us */
2036         }
2037         return (error);
2038 }
2039
2040 /*
2041  * Request a filesystem to resume write operations.
2042  */
2043 void
2044 vfs_write_resume(struct mount *mp, int flags)
2045 {
2046
2047         MNT_ILOCK(mp);
2048         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
2049                 KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
2050                 mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
2051                                        MNTK_SUSPENDED);
2052                 mp->mnt_susp_owner = NULL;
2053                 wakeup(&mp->mnt_writeopcount);
2054                 wakeup(&mp->mnt_flag);
2055                 curthread->td_pflags &= ~TDP_IGNSUSP;
2056                 if ((flags & VR_START_WRITE) != 0) {
2057                         MNT_REF(mp);
2058                         mp->mnt_writeopcount++;
2059                 }
2060                 MNT_IUNLOCK(mp);
2061                 if ((flags & VR_NO_SUSPCLR) == 0)
2062                         VFS_SUSP_CLEAN(mp);
2063                 vfs_op_exit(mp);
2064         } else if ((flags & VR_START_WRITE) != 0) {
2065                 MNT_REF(mp);
2066                 vn_start_write_refed(mp, 0, true);
2067         } else {
2068                 MNT_IUNLOCK(mp);
2069         }
2070 }
2071
2072 /*
2073  * Helper loop around vfs_write_suspend() for filesystem unmount VFS
2074  * methods.
2075  */
2076 int
2077 vfs_write_suspend_umnt(struct mount *mp)
2078 {
2079         int error;
2080
2081         KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
2082             ("vfs_write_suspend_umnt: recursed"));
2083
2084         /* dounmount() already called vn_start_write(). */
2085         for (;;) {
2086                 vn_finished_write(mp);
2087                 error = vfs_write_suspend(mp, 0);
2088                 if (error != 0) {
2089                         vn_start_write(NULL, &mp, V_WAIT);
2090                         return (error);
2091                 }
2092                 MNT_ILOCK(mp);
2093                 if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
2094                         break;
2095                 MNT_IUNLOCK(mp);
2096                 vn_start_write(NULL, &mp, V_WAIT);
2097         }
2098         mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
2099         wakeup(&mp->mnt_flag);
2100         MNT_IUNLOCK(mp);
2101         curthread->td_pflags |= TDP_IGNSUSP;
2102         return (0);
2103 }
2104
2105 /*
2106  * Implement kqueues for files by translating it to vnode operation.
2107  */
2108 static int
2109 vn_kqfilter(struct file *fp, struct knote *kn)
2110 {
2111
2112         return (VOP_KQFILTER(fp->f_vnode, kn));
2113 }
2114
2115 /*
2116  * Simplified in-kernel wrapper calls for extended attribute access.
2117  * Both calls pass in a NULL credential, authorizing as "kernel" access.
2118  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
2119  */
2120 int
2121 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
2122     const char *attrname, int *buflen, char *buf, struct thread *td)
2123 {
2124         struct uio      auio;
2125         struct iovec    iov;
2126         int     error;
2127
2128         iov.iov_len = *buflen;
2129         iov.iov_base = buf;
2130
2131         auio.uio_iov = &iov;
2132         auio.uio_iovcnt = 1;
2133         auio.uio_rw = UIO_READ;
2134         auio.uio_segflg = UIO_SYSSPACE;
2135         auio.uio_td = td;
2136         auio.uio_offset = 0;
2137         auio.uio_resid = *buflen;
2138
2139         if ((ioflg & IO_NODELOCKED) == 0)
2140                 vn_lock(vp, LK_SHARED | LK_RETRY);
2141
2142         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2143
2144         /* authorize attribute retrieval as kernel */
2145         error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
2146             td);
2147
2148         if ((ioflg & IO_NODELOCKED) == 0)
2149                 VOP_UNLOCK(vp);
2150
2151         if (error == 0) {
2152                 *buflen = *buflen - auio.uio_resid;
2153         }
2154
2155         return (error);
2156 }
2157
2158 /*
2159  * XXX failure mode if partially written?
2160  */
2161 int
2162 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
2163     const char *attrname, int buflen, char *buf, struct thread *td)
2164 {
2165         struct uio      auio;
2166         struct iovec    iov;
2167         struct mount    *mp;
2168         int     error;
2169
2170         iov.iov_len = buflen;
2171         iov.iov_base = buf;
2172
2173         auio.uio_iov = &iov;
2174         auio.uio_iovcnt = 1;
2175         auio.uio_rw = UIO_WRITE;
2176         auio.uio_segflg = UIO_SYSSPACE;
2177         auio.uio_td = td;
2178         auio.uio_offset = 0;
2179         auio.uio_resid = buflen;
2180
2181         if ((ioflg & IO_NODELOCKED) == 0) {
2182                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
2183                         return (error);
2184                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2185         }
2186
2187         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2188
2189         /* authorize attribute setting as kernel */
2190         error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
2191
2192         if ((ioflg & IO_NODELOCKED) == 0) {
2193                 vn_finished_write(mp);
2194                 VOP_UNLOCK(vp);
2195         }
2196
2197         return (error);
2198 }
2199
2200 int
2201 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
2202     const char *attrname, struct thread *td)
2203 {
2204         struct mount    *mp;
2205         int     error;
2206
2207         if ((ioflg & IO_NODELOCKED) == 0) {
2208                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
2209                         return (error);
2210                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2211         }
2212
2213         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2214
2215         /* authorize attribute removal as kernel */
2216         error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
2217         if (error == EOPNOTSUPP)
2218                 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
2219                     NULL, td);
2220
2221         if ((ioflg & IO_NODELOCKED) == 0) {
2222                 vn_finished_write(mp);
2223                 VOP_UNLOCK(vp);
2224         }
2225
2226         return (error);
2227 }
2228
2229 static int
2230 vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags,
2231     struct vnode **rvp)
2232 {
2233
2234         return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp));
2235 }
2236
2237 int
2238 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
2239 {
2240
2241         return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino,
2242             lkflags, rvp));
2243 }
2244
2245 int
2246 vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg,
2247     int lkflags, struct vnode **rvp)
2248 {
2249         struct mount *mp;
2250         int ltype, error;
2251
2252         ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get");
2253         mp = vp->v_mount;
2254         ltype = VOP_ISLOCKED(vp);
2255         KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
2256             ("vn_vget_ino: vp not locked"));
2257         error = vfs_busy(mp, MBF_NOWAIT);
2258         if (error != 0) {
2259                 vfs_ref(mp);
2260                 VOP_UNLOCK(vp);
2261                 error = vfs_busy(mp, 0);
2262                 vn_lock(vp, ltype | LK_RETRY);
2263                 vfs_rel(mp);
2264                 if (error != 0)
2265                         return (ENOENT);
2266                 if (VN_IS_DOOMED(vp)) {
2267                         vfs_unbusy(mp);
2268                         return (ENOENT);
2269                 }
2270         }
2271         VOP_UNLOCK(vp);
2272         error = alloc(mp, alloc_arg, lkflags, rvp);
2273         vfs_unbusy(mp);
2274         if (error != 0 || *rvp != vp)
2275                 vn_lock(vp, ltype | LK_RETRY);
2276         if (VN_IS_DOOMED(vp)) {
2277                 if (error == 0) {
2278                         if (*rvp == vp)
2279                                 vunref(vp);
2280                         else
2281                                 vput(*rvp);
2282                 }
2283                 error = ENOENT;
2284         }
2285         return (error);
2286 }
2287
2288 int
2289 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
2290     struct thread *td)
2291 {
2292
2293         if (vp->v_type != VREG || td == NULL)
2294                 return (0);
2295         if ((uoff_t)uio->uio_offset + uio->uio_resid >
2296             lim_cur(td, RLIMIT_FSIZE)) {
2297                 PROC_LOCK(td->td_proc);
2298                 kern_psignal(td->td_proc, SIGXFSZ);
2299                 PROC_UNLOCK(td->td_proc);
2300                 return (EFBIG);
2301         }
2302         return (0);
2303 }
2304
2305 int
2306 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
2307     struct thread *td)
2308 {
2309         struct vnode *vp;
2310
2311         vp = fp->f_vnode;
2312 #ifdef AUDIT
2313         vn_lock(vp, LK_SHARED | LK_RETRY);
2314         AUDIT_ARG_VNODE1(vp);
2315         VOP_UNLOCK(vp);
2316 #endif
2317         return (setfmode(td, active_cred, vp, mode));
2318 }
2319
2320 int
2321 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
2322     struct thread *td)
2323 {
2324         struct vnode *vp;
2325
2326         vp = fp->f_vnode;
2327 #ifdef AUDIT
2328         vn_lock(vp, LK_SHARED | LK_RETRY);
2329         AUDIT_ARG_VNODE1(vp);
2330         VOP_UNLOCK(vp);
2331 #endif
2332         return (setfown(td, active_cred, vp, uid, gid));
2333 }
2334
2335 void
2336 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
2337 {
2338         vm_object_t object;
2339
2340         if ((object = vp->v_object) == NULL)
2341                 return;
2342         VM_OBJECT_WLOCK(object);
2343         vm_object_page_remove(object, start, end, 0);
2344         VM_OBJECT_WUNLOCK(object);
2345 }
2346
2347 int
2348 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
2349 {
2350         struct vattr va;
2351         daddr_t bn, bnp;
2352         uint64_t bsize;
2353         off_t noff;
2354         int error;
2355
2356         KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
2357             ("Wrong command %lu", cmd));
2358
2359         if (vn_lock(vp, LK_SHARED) != 0)
2360                 return (EBADF);
2361         if (vp->v_type != VREG) {
2362                 error = ENOTTY;
2363                 goto unlock;
2364         }
2365         error = VOP_GETATTR(vp, &va, cred);
2366         if (error != 0)
2367                 goto unlock;
2368         noff = *off;
2369         if (noff >= va.va_size) {
2370                 error = ENXIO;
2371                 goto unlock;
2372         }
2373         bsize = vp->v_mount->mnt_stat.f_iosize;
2374         for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize -
2375             noff % bsize) {
2376                 error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
2377                 if (error == EOPNOTSUPP) {
2378                         error = ENOTTY;
2379                         goto unlock;
2380                 }
2381                 if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
2382                     (bnp != -1 && cmd == FIOSEEKDATA)) {
2383                         noff = bn * bsize;
2384                         if (noff < *off)
2385                                 noff = *off;
2386                         goto unlock;
2387                 }
2388         }
2389         if (noff > va.va_size)
2390                 noff = va.va_size;
2391         /* noff == va.va_size. There is an implicit hole at the end of file. */
2392         if (cmd == FIOSEEKDATA)
2393                 error = ENXIO;
2394 unlock:
2395         VOP_UNLOCK(vp);
2396         if (error == 0)
2397                 *off = noff;
2398         return (error);
2399 }
2400
2401 int
2402 vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
2403 {
2404         struct ucred *cred;
2405         struct vnode *vp;
2406         struct vattr vattr;
2407         off_t foffset, size;
2408         int error, noneg;
2409
2410         cred = td->td_ucred;
2411         vp = fp->f_vnode;
2412         foffset = foffset_lock(fp, 0);
2413         noneg = (vp->v_type != VCHR);
2414         error = 0;
2415         switch (whence) {
2416         case L_INCR:
2417                 if (noneg &&
2418                     (foffset < 0 ||
2419                     (offset > 0 && foffset > OFF_MAX - offset))) {
2420                         error = EOVERFLOW;
2421                         break;
2422                 }
2423                 offset += foffset;
2424                 break;
2425         case L_XTND:
2426                 vn_lock(vp, LK_SHARED | LK_RETRY);
2427                 error = VOP_GETATTR(vp, &vattr, cred);
2428                 VOP_UNLOCK(vp);
2429                 if (error)
2430                         break;
2431
2432                 /*
2433                  * If the file references a disk device, then fetch
2434                  * the media size and use that to determine the ending
2435                  * offset.
2436                  */
2437                 if (vattr.va_size == 0 && vp->v_type == VCHR &&
2438                     fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
2439                         vattr.va_size = size;
2440                 if (noneg &&
2441                     (vattr.va_size > OFF_MAX ||
2442                     (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
2443                         error = EOVERFLOW;
2444                         break;
2445                 }
2446                 offset += vattr.va_size;
2447                 break;
2448         case L_SET:
2449                 break;
2450         case SEEK_DATA:
2451                 error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
2452                 if (error == ENOTTY)
2453                         error = EINVAL;
2454                 break;
2455         case SEEK_HOLE:
2456                 error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
2457                 if (error == ENOTTY)
2458                         error = EINVAL;
2459                 break;
2460         default:
2461                 error = EINVAL;
2462         }
2463         if (error == 0 && noneg && offset < 0)
2464                 error = EINVAL;
2465         if (error != 0)
2466                 goto drop;
2467         VFS_KNOTE_UNLOCKED(vp, 0);
2468         td->td_uretoff.tdu_off = offset;
2469 drop:
2470         foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
2471         return (error);
2472 }
2473
2474 int
2475 vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred,
2476     struct thread *td)
2477 {
2478         int error;
2479
2480         /*
2481          * Grant permission if the caller is the owner of the file, or
2482          * the super-user, or has ACL_WRITE_ATTRIBUTES permission on
2483          * on the file.  If the time pointer is null, then write
2484          * permission on the file is also sufficient.
2485          *
2486          * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
2487          * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
2488          * will be allowed to set the times [..] to the current
2489          * server time.
2490          */
2491         error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td);
2492         if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0)
2493                 error = VOP_ACCESS(vp, VWRITE, cred, td);
2494         return (error);
2495 }
2496
2497 int
2498 vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
2499 {
2500         struct vnode *vp;
2501         int error;
2502
2503         if (fp->f_type == DTYPE_FIFO)
2504                 kif->kf_type = KF_TYPE_FIFO;
2505         else
2506                 kif->kf_type = KF_TYPE_VNODE;
2507         vp = fp->f_vnode;
2508         vref(vp);
2509         FILEDESC_SUNLOCK(fdp);
2510         error = vn_fill_kinfo_vnode(vp, kif);
2511         vrele(vp);
2512         FILEDESC_SLOCK(fdp);
2513         return (error);
2514 }
2515
2516 static inline void
2517 vn_fill_junk(struct kinfo_file *kif)
2518 {
2519         size_t len, olen;
2520
2521         /*
2522          * Simulate vn_fullpath returning changing values for a given
2523          * vp during e.g. coredump.
2524          */
2525         len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1;
2526         olen = strlen(kif->kf_path);
2527         if (len < olen)
2528                 strcpy(&kif->kf_path[len - 1], "$");
2529         else
2530                 for (; olen < len; olen++)
2531                         strcpy(&kif->kf_path[olen], "A");
2532 }
2533
2534 int
2535 vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif)
2536 {
2537         struct vattr va;
2538         char *fullpath, *freepath;
2539         int error;
2540
2541         kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type);
2542         freepath = NULL;
2543         fullpath = "-";
2544         error = vn_fullpath(vp, &fullpath, &freepath);
2545         if (error == 0) {
2546                 strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
2547         }
2548         if (freepath != NULL)
2549                 free(freepath, M_TEMP);
2550
2551         KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path,
2552                 vn_fill_junk(kif);
2553         );
2554
2555         /*
2556          * Retrieve vnode attributes.
2557          */
2558         va.va_fsid = VNOVAL;
2559         va.va_rdev = NODEV;
2560         vn_lock(vp, LK_SHARED | LK_RETRY);
2561         error = VOP_GETATTR(vp, &va, curthread->td_ucred);
2562         VOP_UNLOCK(vp);
2563         if (error != 0)
2564                 return (error);
2565         if (va.va_fsid != VNOVAL)
2566                 kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
2567         else
2568                 kif->kf_un.kf_file.kf_file_fsid =
2569                     vp->v_mount->mnt_stat.f_fsid.val[0];
2570         kif->kf_un.kf_file.kf_file_fsid_freebsd11 =
2571             kif->kf_un.kf_file.kf_file_fsid; /* truncate */
2572         kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
2573         kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
2574         kif->kf_un.kf_file.kf_file_size = va.va_size;
2575         kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
2576         kif->kf_un.kf_file.kf_file_rdev_freebsd11 =
2577             kif->kf_un.kf_file.kf_file_rdev; /* truncate */
2578         return (0);
2579 }
2580
2581 int
2582 vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
2583     vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
2584     struct thread *td)
2585 {
2586 #ifdef HWPMC_HOOKS
2587         struct pmckern_map_in pkm;
2588 #endif
2589         struct mount *mp;
2590         struct vnode *vp;
2591         vm_object_t object;
2592         vm_prot_t maxprot;
2593         boolean_t writecounted;
2594         int error;
2595
2596 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
2597     defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
2598         /*
2599          * POSIX shared-memory objects are defined to have
2600          * kernel persistence, and are not defined to support
2601          * read(2)/write(2) -- or even open(2).  Thus, we can
2602          * use MAP_ASYNC to trade on-disk coherence for speed.
2603          * The shm_open(3) library routine turns on the FPOSIXSHM
2604          * flag to request this behavior.
2605          */
2606         if ((fp->f_flag & FPOSIXSHM) != 0)
2607                 flags |= MAP_NOSYNC;
2608 #endif
2609         vp = fp->f_vnode;
2610
2611         /*
2612          * Ensure that file and memory protections are
2613          * compatible.  Note that we only worry about
2614          * writability if mapping is shared; in this case,
2615          * current and max prot are dictated by the open file.
2616          * XXX use the vnode instead?  Problem is: what
2617          * credentials do we use for determination? What if
2618          * proc does a setuid?
2619          */
2620         mp = vp->v_mount;
2621         if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
2622                 maxprot = VM_PROT_NONE;
2623                 if ((prot & VM_PROT_EXECUTE) != 0)
2624                         return (EACCES);
2625         } else
2626                 maxprot = VM_PROT_EXECUTE;
2627         if ((fp->f_flag & FREAD) != 0)
2628                 maxprot |= VM_PROT_READ;
2629         else if ((prot & VM_PROT_READ) != 0)
2630                 return (EACCES);
2631
2632         /*
2633          * If we are sharing potential changes via MAP_SHARED and we
2634          * are trying to get write permission although we opened it
2635          * without asking for it, bail out.
2636          */
2637         if ((flags & MAP_SHARED) != 0) {
2638                 if ((fp->f_flag & FWRITE) != 0)
2639                         maxprot |= VM_PROT_WRITE;
2640                 else if ((prot & VM_PROT_WRITE) != 0)
2641                         return (EACCES);
2642         } else {
2643                 maxprot |= VM_PROT_WRITE;
2644                 cap_maxprot |= VM_PROT_WRITE;
2645         }
2646         maxprot &= cap_maxprot;
2647
2648         /*
2649          * For regular files and shared memory, POSIX requires that
2650          * the value of foff be a legitimate offset within the data
2651          * object.  In particular, negative offsets are invalid.
2652          * Blocking negative offsets and overflows here avoids
2653          * possible wraparound or user-level access into reserved
2654          * ranges of the data object later.  In contrast, POSIX does
2655          * not dictate how offsets are used by device drivers, so in
2656          * the case of a device mapping a negative offset is passed
2657          * on.
2658          */
2659         if (
2660 #ifdef _LP64
2661             size > OFF_MAX ||
2662 #endif
2663             foff > OFF_MAX - size)
2664                 return (EINVAL);
2665
2666         writecounted = FALSE;
2667         error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp,
2668             &foff, &object, &writecounted);
2669         if (error != 0)
2670                 return (error);
2671         error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
2672             foff, writecounted, td);
2673         if (error != 0) {
2674                 /*
2675                  * If this mapping was accounted for in the vnode's
2676                  * writecount, then undo that now.
2677                  */
2678                 if (writecounted)
2679                         vm_pager_release_writecount(object, 0, size);
2680                 vm_object_deallocate(object);
2681         }
2682 #ifdef HWPMC_HOOKS
2683         /* Inform hwpmc(4) if an executable is being mapped. */
2684         if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) {
2685                 if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) {
2686                         pkm.pm_file = vp;
2687                         pkm.pm_address = (uintptr_t) *addr;
2688                         PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm);
2689                 }
2690         }
2691 #endif
2692         return (error);
2693 }
2694
2695 void
2696 vn_fsid(struct vnode *vp, struct vattr *va)
2697 {
2698         fsid_t *f;
2699
2700         f = &vp->v_mount->mnt_stat.f_fsid;
2701         va->va_fsid = (uint32_t)f->val[1];
2702         va->va_fsid <<= sizeof(f->val[1]) * NBBY;
2703         va->va_fsid += (uint32_t)f->val[0];
2704 }
2705
2706 int
2707 vn_fsync_buf(struct vnode *vp, int waitfor)
2708 {
2709         struct buf *bp, *nbp;
2710         struct bufobj *bo;
2711         struct mount *mp;
2712         int error, maxretry;
2713
2714         error = 0;
2715         maxretry = 10000;     /* large, arbitrarily chosen */
2716         mp = NULL;
2717         if (vp->v_type == VCHR) {
2718                 VI_LOCK(vp);
2719                 mp = vp->v_rdev->si_mountpt;
2720                 VI_UNLOCK(vp);
2721         }
2722         bo = &vp->v_bufobj;
2723         BO_LOCK(bo);
2724 loop1:
2725         /*
2726          * MARK/SCAN initialization to avoid infinite loops.
2727          */
2728         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
2729                 bp->b_vflags &= ~BV_SCANNED;
2730                 bp->b_error = 0;
2731         }
2732
2733         /*
2734          * Flush all dirty buffers associated with a vnode.
2735          */
2736 loop2:
2737         TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
2738                 if ((bp->b_vflags & BV_SCANNED) != 0)
2739                         continue;
2740                 bp->b_vflags |= BV_SCANNED;
2741                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
2742                         if (waitfor != MNT_WAIT)
2743                                 continue;
2744                         if (BUF_LOCK(bp,
2745                             LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL,
2746                             BO_LOCKPTR(bo)) != 0) {
2747                                 BO_LOCK(bo);
2748                                 goto loop1;
2749                         }
2750                         BO_LOCK(bo);
2751                 }
2752                 BO_UNLOCK(bo);
2753                 KASSERT(bp->b_bufobj == bo,
2754                     ("bp %p wrong b_bufobj %p should be %p",
2755                     bp, bp->b_bufobj, bo));
2756                 if ((bp->b_flags & B_DELWRI) == 0)
2757                         panic("fsync: not dirty");
2758                 if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) {
2759                         vfs_bio_awrite(bp);
2760                 } else {
2761                         bremfree(bp);
2762                         bawrite(bp);
2763                 }
2764                 if (maxretry < 1000)
2765                         pause("dirty", hz < 1000 ? 1 : hz / 1000);
2766                 BO_LOCK(bo);
2767                 goto loop2;
2768         }
2769
2770         /*
2771          * If synchronous the caller expects us to completely resolve all
2772          * dirty buffers in the system.  Wait for in-progress I/O to
2773          * complete (which could include background bitmap writes), then
2774          * retry if dirty blocks still exist.
2775          */
2776         if (waitfor == MNT_WAIT) {
2777                 bufobj_wwait(bo, 0, 0);
2778                 if (bo->bo_dirty.bv_cnt > 0) {
2779                         /*
2780                          * If we are unable to write any of these buffers
2781                          * then we fail now rather than trying endlessly
2782                          * to write them out.
2783                          */
2784                         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
2785                                 if ((error = bp->b_error) != 0)
2786                                         break;
2787                         if ((mp != NULL && mp->mnt_secondary_writes > 0) ||
2788                             (error == 0 && --maxretry >= 0))
2789                                 goto loop1;
2790                         if (error == 0)
2791                                 error = EAGAIN;
2792                 }
2793         }
2794         BO_UNLOCK(bo);
2795         if (error != 0)
2796                 vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error);
2797
2798         return (error);
2799 }
2800
2801 /*
2802  * Copies a byte range from invp to outvp.  Calls VOP_COPY_FILE_RANGE()
2803  * or vn_generic_copy_file_range() after rangelocking the byte ranges,
2804  * to do the actual copy.
2805  * vn_generic_copy_file_range() is factored out, so it can be called
2806  * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from
2807  * different file systems.
2808  */
2809 int
2810 vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp,
2811     off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred,
2812     struct ucred *outcred, struct thread *fsize_td)
2813 {
2814         int error;
2815         size_t len;
2816         uint64_t uval;
2817
2818         len = *lenp;
2819         *lenp = 0;              /* For error returns. */
2820         error = 0;
2821
2822         /* Do some sanity checks on the arguments. */
2823         if (invp->v_type == VDIR || outvp->v_type == VDIR)
2824                 error = EISDIR;
2825         else if (*inoffp < 0 || *outoffp < 0 ||
2826             invp->v_type != VREG || outvp->v_type != VREG)
2827                 error = EINVAL;
2828         if (error != 0)
2829                 goto out;
2830
2831         /* Ensure offset + len does not wrap around. */
2832         uval = *inoffp;
2833         uval += len;
2834         if (uval > INT64_MAX)
2835                 len = INT64_MAX - *inoffp;
2836         uval = *outoffp;
2837         uval += len;
2838         if (uval > INT64_MAX)
2839                 len = INT64_MAX - *outoffp;
2840         if (len == 0)
2841                 goto out;
2842
2843         /*
2844          * If the two vnode are for the same file system, call
2845          * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range()
2846          * which can handle copies across multiple file systems.
2847          */
2848         *lenp = len;
2849         if (invp->v_mount == outvp->v_mount)
2850                 error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp,
2851                     lenp, flags, incred, outcred, fsize_td);
2852         else
2853                 error = vn_generic_copy_file_range(invp, inoffp, outvp,
2854                     outoffp, lenp, flags, incred, outcred, fsize_td);
2855 out:
2856         return (error);
2857 }
2858
/*
 * Test whether the len bytes starting at dat are all zero.
 *
 * The buffer is examined one unsigned int at a time, followed by a
 * byte-wise check of whatever is left over when len is not a multiple of
 * sizeof(unsigned int).  dat is expected to be suitably aligned for
 * unsigned int accesses.
 *
 * Returns true if every byte is zero (including when len <= 0), false
 * otherwise.
 */
static bool
mem_iszero(void *dat, int len)
{
	const unsigned int *words;
	const char *tail;
	int k, ntail, nwords;

	if (len <= 0)
		return (true);

	nwords = len / (int)sizeof(*words);
	ntail = len % (int)sizeof(*words);

	words = dat;
	for (k = 0; k < nwords; k++) {
		if (words[k] != 0)
			return (false);
	}

	tail = (const char *)(words + nwords);
	for (k = 0; k < ntail; k++) {
		if (tail[k] != '\0')
			return (false);
	}
	return (true);
}
2884
2885 /*
2886  * Look for a hole in the output file and, if found, adjust *outoffp
2887  * and *xferp to skip past the hole.
2888  * *xferp is the entire hole length to be written and xfer2 is how many bytes
2889  * to be written as 0's upon return.
2890  */
2891 static off_t
2892 vn_skip_hole(struct vnode *outvp, off_t xfer2, off_t *outoffp, off_t *xferp,
2893     off_t *dataoffp, off_t *holeoffp, struct ucred *cred)
2894 {
2895         int error;
2896         off_t delta;
2897
2898         if (*holeoffp == 0 || *holeoffp <= *outoffp) {
2899                 *dataoffp = *outoffp;
2900                 error = VOP_IOCTL(outvp, FIOSEEKDATA, dataoffp, 0, cred,
2901                     curthread);
2902                 if (error == 0) {
2903                         *holeoffp = *dataoffp;
2904                         error = VOP_IOCTL(outvp, FIOSEEKHOLE, holeoffp, 0, cred,
2905                             curthread);
2906                 }
2907                 if (error != 0 || *holeoffp == *dataoffp) {
2908                         /*
2909                          * Since outvp is unlocked, it may be possible for
2910                          * another thread to do a truncate(), lseek(), write()
2911                          * creating a hole at startoff between the above
2912                          * VOP_IOCTL() calls, if the other thread does not do
2913                          * rangelocking.
2914                          * If that happens, *holeoffp == *dataoffp and finding
2915                          * the hole has failed, so disable vn_skip_hole().
2916                          */
2917                         *holeoffp = -1; /* Disable use of vn_skip_hole(). */
2918                         return (xfer2);
2919                 }
2920                 KASSERT(*dataoffp >= *outoffp,
2921                     ("vn_skip_hole: dataoff=%jd < outoff=%jd",
2922                     (intmax_t)*dataoffp, (intmax_t)*outoffp));
2923                 KASSERT(*holeoffp > *dataoffp,
2924                     ("vn_skip_hole: holeoff=%jd <= dataoff=%jd",
2925                     (intmax_t)*holeoffp, (intmax_t)*dataoffp));
2926         }
2927
2928         /*
2929          * If there is a hole before the data starts, advance *outoffp and
2930          * *xferp past the hole.
2931          */
2932         if (*dataoffp > *outoffp) {
2933                 delta = *dataoffp - *outoffp;
2934                 if (delta >= *xferp) {
2935                         /* Entire *xferp is a hole. */
2936                         *outoffp += *xferp;
2937                         *xferp = 0;
2938                         return (0);
2939                 }
2940                 *xferp -= delta;
2941                 *outoffp += delta;
2942                 xfer2 = MIN(xfer2, *xferp);
2943         }
2944
2945         /*
2946          * If a hole starts before the end of this xfer2, reduce this xfer2 so
2947          * that the write ends at the start of the hole.
2948          * *holeoffp should always be greater than *outoffp, but for the
2949          * non-INVARIANTS case, check this to make sure xfer2 remains a sane
2950          * value.
2951          */
2952         if (*holeoffp > *outoffp && *holeoffp < *outoffp + xfer2)
2953                 xfer2 = *holeoffp - *outoffp;
2954         return (xfer2);
2955 }
2956
2957 /*
2958  * Write an xfer sized chunk to outvp in blksize blocks from dat.
2959  * dat is a maximum of blksize in length and can be written repeatedly in
2960  * the chunk.
2961  * If growfile == true, just grow the file via vn_truncate_locked() instead
2962  * of doing actual writes.
2963  * If checkhole == true, a hole is being punched, so skip over any hole
2964  * already in the output file.
2965  */
2966 static int
2967 vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer,
2968     u_long blksize, bool growfile, bool checkhole, struct ucred *cred)
2969 {
2970         struct mount *mp;
2971         off_t dataoff, holeoff, xfer2;
2972         int error, lckf;
2973
2974         /*
2975          * Loop around doing writes of blksize until write has been completed.
2976          * Lock/unlock on each loop iteration so that a bwillwrite() can be
2977          * done for each iteration, since the xfer argument can be very
2978          * large if there is a large hole to punch in the output file.
2979          */
2980         error = 0;
2981         holeoff = 0;
2982         do {
2983                 xfer2 = MIN(xfer, blksize);
2984                 if (checkhole) {
2985                         /*
2986                          * Punching a hole.  Skip writing if there is
2987                          * already a hole in the output file.
2988                          */
2989                         xfer2 = vn_skip_hole(outvp, xfer2, &outoff, &xfer,
2990                             &dataoff, &holeoff, cred);
2991                         if (xfer == 0)
2992                                 break;
2993                         if (holeoff < 0)
2994                                 checkhole = false;
2995                         KASSERT(xfer2 > 0, ("vn_write_outvp: xfer2=%jd",
2996                             (intmax_t)xfer2));
2997                 }
2998                 bwillwrite();
2999                 mp = NULL;
3000                 error = vn_start_write(outvp, &mp, V_WAIT);
3001                 if (error != 0)
3002                         break;
3003                 if (growfile) {
3004                         error = vn_lock(outvp, LK_EXCLUSIVE);
3005                         if (error == 0) {
3006                                 error = vn_truncate_locked(outvp, outoff + xfer,
3007                                     false, cred);
3008                                 VOP_UNLOCK(outvp);
3009                         }
3010                 } else {
3011                         if (MNT_SHARED_WRITES(mp))
3012                                 lckf = LK_SHARED;
3013                         else
3014                                 lckf = LK_EXCLUSIVE;
3015                         error = vn_lock(outvp, lckf);
3016                         if (error == 0) {
3017                                 error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2,
3018                                     outoff, UIO_SYSSPACE, IO_NODELOCKED,
3019                                     curthread->td_ucred, cred, NULL, curthread);
3020                                 outoff += xfer2;
3021                                 xfer -= xfer2;
3022                                 VOP_UNLOCK(outvp);
3023                         }
3024                 }
3025                 if (mp != NULL)
3026                         vn_finished_write(mp);
3027         } while (!growfile && xfer > 0 && error == 0);
3028         return (error);
3029 }
3030
3031 /*
3032  * Copy a byte range of one file to another.  This function can handle the
3033  * case where invp and outvp are on different file systems.
3034  * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there
3035  * is no better file system specific way to do it.
3036  */
3037 int
3038 vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
3039     struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags,
3040     struct ucred *incred, struct ucred *outcred, struct thread *fsize_td)
3041 {
3042         struct vattr va;
3043         struct mount *mp;
3044         struct uio io;
3045         off_t startoff, endoff, xfer, xfer2;
3046         u_long blksize;
3047         int error, interrupted;
3048         bool cantseek, readzeros, eof, lastblock;
3049         ssize_t aresid;
3050         size_t copylen, len, rem, savlen;
3051         char *dat;
3052         long holein, holeout;
3053
3054         holein = holeout = 0;
3055         savlen = len = *lenp;
3056         error = 0;
3057         interrupted = 0;
3058         dat = NULL;
3059
3060         error = vn_lock(invp, LK_SHARED);
3061         if (error != 0)
3062                 goto out;
3063         if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0)
3064                 holein = 0;
3065         VOP_UNLOCK(invp);
3066
3067         mp = NULL;
3068         error = vn_start_write(outvp, &mp, V_WAIT);
3069         if (error == 0)
3070                 error = vn_lock(outvp, LK_EXCLUSIVE);
3071         if (error == 0) {
3072                 /*
3073                  * If fsize_td != NULL, do a vn_rlimit_fsize() call,
3074                  * now that outvp is locked.
3075                  */
3076                 if (fsize_td != NULL) {
3077                         io.uio_offset = *outoffp;
3078                         io.uio_resid = len;
3079                         error = vn_rlimit_fsize(outvp, &io, fsize_td);
3080                         if (error != 0)
3081                                 error = EFBIG;
3082                 }
3083                 if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0)
3084                         holeout = 0;
3085                 /*
3086                  * Holes that are past EOF do not need to be written as a block
3087                  * of zero bytes.  So, truncate the output file as far as
3088                  * possible and then use va.va_size to decide if writing 0
3089                  * bytes is necessary in the loop below.
3090                  */
3091                 if (error == 0)
3092                         error = VOP_GETATTR(outvp, &va, outcred);
3093                 if (error == 0 && va.va_size > *outoffp && va.va_size <=
3094                     *outoffp + len) {
3095 #ifdef MAC
3096                         error = mac_vnode_check_write(curthread->td_ucred,
3097                             outcred, outvp);
3098                         if (error == 0)
3099 #endif
3100                                 error = vn_truncate_locked(outvp, *outoffp,
3101                                     false, outcred);
3102                         if (error == 0)
3103                                 va.va_size = *outoffp;
3104                 }
3105                 VOP_UNLOCK(outvp);
3106         }
3107         if (mp != NULL)
3108                 vn_finished_write(mp);
3109         if (error != 0)
3110                 goto out;
3111
3112         /*
3113          * Set the blksize to the larger of the hole sizes for invp and outvp.
3114          * If hole sizes aren't available, set the blksize to the larger
3115          * f_iosize of invp and outvp.
3116          * This code expects the hole sizes and f_iosizes to be powers of 2.
3117          * This value is clipped at 4Kbytes and 1Mbyte.
3118          */
3119         blksize = MAX(holein, holeout);
3120
3121         /* Clip len to end at an exact multiple of hole size. */
3122         if (blksize > 1) {
3123                 rem = *inoffp % blksize;
3124                 if (rem > 0)
3125                         rem = blksize - rem;
3126                 if (len - rem > blksize)
3127                         len = savlen = rounddown(len - rem, blksize) + rem;
3128         }
3129
3130         if (blksize <= 1)
3131                 blksize = MAX(invp->v_mount->mnt_stat.f_iosize,
3132                     outvp->v_mount->mnt_stat.f_iosize);
3133         if (blksize < 4096)
3134                 blksize = 4096;
3135         else if (blksize > 1024 * 1024)
3136                 blksize = 1024 * 1024;
3137         dat = malloc(blksize, M_TEMP, M_WAITOK);
3138
3139         /*
3140          * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA
3141          * to find holes.  Otherwise, just scan the read block for all 0s
3142          * in the inner loop where the data copying is done.
3143          * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may
3144          * support holes on the server, but do not support FIOSEEKHOLE.
3145          */
3146         eof = false;
3147         while (len > 0 && error == 0 && !eof && interrupted == 0) {
3148                 endoff = 0;                     /* To shut up compilers. */
3149                 cantseek = true;
3150                 startoff = *inoffp;
3151                 copylen = len;
3152
3153                 /*
3154                  * Find the next data area.  If there is just a hole to EOF,
3155                  * FIOSEEKDATA should fail and then we drop down into the
3156                  * inner loop and create the hole on the outvp file.
3157                  * (I do not know if any file system will report a hole to
3158                  *  EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA
3159                  *  will fail for those file systems.)
3160                  *
3161                  * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE,
3162                  * the code just falls through to the inner copy loop.
3163                  */
3164                 error = EINVAL;
3165                 if (holein > 0)
3166                         error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0,
3167                             incred, curthread);
3168                 if (error == 0) {
3169                         endoff = startoff;
3170                         error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0,
3171                             incred, curthread);
3172                         /*
3173                          * Since invp is unlocked, it may be possible for
3174                          * another thread to do a truncate(), lseek(), write()
3175                          * creating a hole at startoff between the above
3176                          * VOP_IOCTL() calls, if the other thread does not do
3177                          * rangelocking.
3178                          * If that happens, startoff == endoff and finding
3179                          * the hole has failed, so set an error.
3180                          */
3181                         if (error == 0 && startoff == endoff)
3182                                 error = EINVAL; /* Any error. Reset to 0. */
3183                 }
3184                 if (error == 0) {
3185                         if (startoff > *inoffp) {
3186                                 /* Found hole before data block. */
3187                                 xfer = MIN(startoff - *inoffp, len);
3188                                 if (*outoffp < va.va_size) {
3189                                         /* Must write 0s to punch hole. */
3190                                         xfer2 = MIN(va.va_size - *outoffp,
3191                                             xfer);
3192                                         memset(dat, 0, MIN(xfer2, blksize));
3193                                         error = vn_write_outvp(outvp, dat,
3194                                             *outoffp, xfer2, blksize, false,
3195                                             holeout > 0, outcred);
3196                                 }
3197
3198                                 if (error == 0 && *outoffp + xfer >
3199                                     va.va_size && xfer == len)
3200                                         /* Grow last block. */
3201                                         error = vn_write_outvp(outvp, dat,
3202                                             *outoffp, xfer, blksize, true,
3203                                             false, outcred);
3204                                 if (error == 0) {
3205                                         *inoffp += xfer;
3206                                         *outoffp += xfer;
3207                                         len -= xfer;
3208                                         if (len < savlen)
3209                                                 interrupted = sig_intr();
3210                                 }
3211                         }
3212                         copylen = MIN(len, endoff - startoff);
3213                         cantseek = false;
3214                 } else {
3215                         cantseek = true;
3216                         startoff = *inoffp;
3217                         copylen = len;
3218                         error = 0;
3219                 }
3220
3221                 xfer = blksize;
3222                 if (cantseek) {
3223                         /*
3224                          * Set first xfer to end at a block boundary, so that
3225                          * holes are more likely detected in the loop below via
3226                          * the for all bytes 0 method.
3227                          */
3228                         xfer -= (*inoffp % blksize);
3229                 }
3230                 /* Loop copying the data block. */
3231                 while (copylen > 0 && error == 0 && !eof && interrupted == 0) {
3232                         if (copylen < xfer)
3233                                 xfer = copylen;
3234                         error = vn_lock(invp, LK_SHARED);
3235                         if (error != 0)
3236                                 goto out;
3237                         error = vn_rdwr(UIO_READ, invp, dat, xfer,
3238                             startoff, UIO_SYSSPACE, IO_NODELOCKED,
3239                             curthread->td_ucred, incred, &aresid,
3240                             curthread);
3241                         VOP_UNLOCK(invp);
3242                         lastblock = false;
3243                         if (error == 0 && aresid > 0) {
3244                                 /* Stop the copy at EOF on the input file. */
3245                                 xfer -= aresid;
3246                                 eof = true;
3247                                 lastblock = true;
3248                         }
3249                         if (error == 0) {
3250                                 /*
3251                                  * Skip the write for holes past the initial EOF
3252                                  * of the output file, unless this is the last
3253                                  * write of the output file at EOF.
3254                                  */
3255                                 readzeros = cantseek ? mem_iszero(dat, xfer) :
3256                                     false;
3257                                 if (xfer == len)
3258                                         lastblock = true;
3259                                 if (!cantseek || *outoffp < va.va_size ||
3260                                     lastblock || !readzeros)
3261                                         error = vn_write_outvp(outvp, dat,
3262                                             *outoffp, xfer, blksize,
3263                                             readzeros && lastblock &&
3264                                             *outoffp >= va.va_size, false,
3265                                             outcred);
3266                                 if (error == 0) {
3267                                         *inoffp += xfer;
3268                                         startoff += xfer;
3269                                         *outoffp += xfer;
3270                                         copylen -= xfer;
3271                                         len -= xfer;
3272                                         if (len < savlen)
3273                                                 interrupted = sig_intr();
3274                                 }
3275                         }
3276                         xfer = blksize;
3277                 }
3278         }
3279 out:
3280         *lenp = savlen - len;
3281         free(dat, M_TEMP);
3282         return (error);
3283 }
3284
3285 static int
3286 vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td)
3287 {
3288         struct mount *mp;
3289         struct vnode *vp;
3290         off_t olen, ooffset;
3291         int error;
3292 #ifdef AUDIT
3293         int audited_vnode1 = 0;
3294 #endif
3295
3296         vp = fp->f_vnode;
3297         if (vp->v_type != VREG)
3298                 return (ENODEV);
3299
3300         /* Allocating blocks may take a long time, so iterate. */
3301         for (;;) {
3302                 olen = len;
3303                 ooffset = offset;
3304
3305                 bwillwrite();
3306                 mp = NULL;
3307                 error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
3308                 if (error != 0)
3309                         break;
3310                 error = vn_lock(vp, LK_EXCLUSIVE);
3311                 if (error != 0) {
3312                         vn_finished_write(mp);
3313                         break;
3314                 }
3315 #ifdef AUDIT
3316                 if (!audited_vnode1) {
3317                         AUDIT_ARG_VNODE1(vp);
3318                         audited_vnode1 = 1;
3319                 }
3320 #endif
3321 #ifdef MAC
3322                 error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
3323                 if (error == 0)
3324 #endif
3325                         error = VOP_ALLOCATE(vp, &offset, &len);
3326                 VOP_UNLOCK(vp);
3327                 vn_finished_write(mp);
3328
3329                 if (olen + ooffset != offset + len) {
3330                         panic("offset + len changed from %jx/%jx to %jx/%jx",
3331                             ooffset, olen, offset, len);
3332                 }
3333                 if (error != 0 || len == 0)
3334                         break;
3335                 KASSERT(olen > len, ("Iteration did not make progress?"));
3336                 maybe_yield();
3337         }
3338
3339         return (error);
3340 }
3341
3342 static u_long vn_lock_pair_pause_cnt;
3343 SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD,
3344     &vn_lock_pair_pause_cnt, 0,
3345     "Count of vn_lock_pair deadlocks");
3346
3347 u_int vn_lock_pair_pause_max;
3348 SYSCTL_UINT(_debug, OID_AUTO, vn_lock_pair_pause_max, CTLFLAG_RW,
3349     &vn_lock_pair_pause_max, 0,
3350     "Max ticks for vn_lock_pair deadlock avoidance sleep");
3351
3352 static void
3353 vn_lock_pair_pause(const char *wmesg)
3354 {
3355         atomic_add_long(&vn_lock_pair_pause_cnt, 1);
3356         pause(wmesg, prng32_bounded(vn_lock_pair_pause_max));
3357 }
3358
3359 /*
3360  * Lock pair of vnodes vp1, vp2, avoiding lock order reversal.
3361  * vp1_locked indicates whether vp1 is exclusively locked; if not, vp1
3362  * must be unlocked.  Same for vp2 and vp2_locked.  One of the vnodes
3363  * can be NULL.
3364  *
3365  * The function returns with both vnodes exclusively locked, and
3366  * guarantees that it does not create lock order reversal with other
3367  * threads during its execution.  Both vnodes could be unlocked
3368  * temporary (and reclaimed).
3369  */
3370 void
3371 vn_lock_pair(struct vnode *vp1, bool vp1_locked, struct vnode *vp2,
3372     bool vp2_locked)
3373 {
3374         int error;
3375
3376         if (vp1 == NULL && vp2 == NULL)
3377                 return;
3378         if (vp1 != NULL) {
3379                 if (vp1_locked)
3380                         ASSERT_VOP_ELOCKED(vp1, "vp1");
3381                 else
3382                         ASSERT_VOP_UNLOCKED(vp1, "vp1");
3383         } else {
3384                 vp1_locked = true;
3385         }
3386         if (vp2 != NULL) {
3387                 if (vp2_locked)
3388                         ASSERT_VOP_ELOCKED(vp2, "vp2");
3389                 else
3390                         ASSERT_VOP_UNLOCKED(vp2, "vp2");
3391         } else {
3392                 vp2_locked = true;
3393         }
3394         if (!vp1_locked && !vp2_locked) {
3395                 vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY);
3396                 vp1_locked = true;
3397         }
3398
3399         for (;;) {
3400                 if (vp1_locked && vp2_locked)
3401                         break;
3402                 if (vp1_locked && vp2 != NULL) {
3403                         if (vp1 != NULL) {
3404                                 error = VOP_LOCK1(vp2, LK_EXCLUSIVE | LK_NOWAIT,
3405                                     __FILE__, __LINE__);
3406                                 if (error == 0)
3407                                         break;
3408                                 VOP_UNLOCK(vp1);
3409                                 vp1_locked = false;
3410                                 vn_lock_pair_pause("vlp1");
3411                         }
3412                         vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY);
3413                         vp2_locked = true;
3414                 }
3415                 if (vp2_locked && vp1 != NULL) {
3416                         if (vp2 != NULL) {
3417                                 error = VOP_LOCK1(vp1, LK_EXCLUSIVE | LK_NOWAIT,
3418                                     __FILE__, __LINE__);
3419                                 if (error == 0)
3420                                         break;
3421                                 VOP_UNLOCK(vp2);
3422                                 vp2_locked = false;
3423                                 vn_lock_pair_pause("vlp2");
3424                         }
3425                         vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY);
3426                         vp1_locked = true;
3427                 }
3428         }
3429         if (vp1 != NULL)
3430                 ASSERT_VOP_ELOCKED(vp1, "vp1 ret");
3431         if (vp2 != NULL)
3432                 ASSERT_VOP_ELOCKED(vp2, "vp2 ret");
3433 }