sys/kern/vfs_vnops.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1982, 1986, 1989, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  * (c) UNIX System Laboratories, Inc.
   7  * All or some portions of this file are derived from material licensed
   8  * to the University of California by American Telephone and Telegraph
   9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  10  * the permission of UNIX System Laboratories, Inc.
  11  *
  12  * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
  13  * Copyright (c) 2013, 2014 The FreeBSD Foundation
  14  *
  15  * Portions of this software were developed by Konstantin Belousov
  16  * under sponsorship from the FreeBSD Foundation.
  17  *
  18  * Redistribution and use in source and binary forms, with or without
  19  * modification, are permitted provided that the following conditions
  20  * are met:
  21  * 1. Redistributions of source code must retain the above copyright
  22  *    notice, this list of conditions and the following disclaimer.
  23  * 2. Redistributions in binary form must reproduce the above copyright
  24  *    notice, this list of conditions and the following disclaimer in the
  25  *    documentation and/or other materials provided with the distribution.
  26  * 3. Neither the name of the University nor the names of its contributors
  27  *    may be used to endorse or promote products derived from this software
  28  *    without specific prior written permission.
  29  *
  30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  40  * SUCH DAMAGE.
  41  *
  42  *      @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
  43  */
  44
  45 #include <sys/cdefs.h>
  46 __FBSDID("$FreeBSD$");
  47
  48 #include "opt_hwpmc_hooks.h"
  49
  50 #include <sys/param.h>
  51 #include <sys/systm.h>
  52 #include <sys/disk.h>
  53 #include <sys/fail.h>
  54 #include <sys/fcntl.h>
  55 #include <sys/file.h>
  56 #include <sys/kdb.h>
  57 #include <sys/ktr.h>
  58 #include <sys/stat.h>
  59 #include <sys/priv.h>
  60 #include <sys/proc.h>
  61 #include <sys/limits.h>
  62 #include <sys/lock.h>
  63 #include <sys/mman.h>
  64 #include <sys/mount.h>
  65 #include <sys/mutex.h>
  66 #include <sys/namei.h>
  67 #include <sys/vnode.h>
  68 #include <sys/bio.h>
  69 #include <sys/buf.h>
  70 #include <sys/filio.h>
  71 #include <sys/resourcevar.h>
  72 #include <sys/rwlock.h>
  73 #include <sys/prng.h>
  74 #include <sys/sx.h>
  75 #include <sys/sleepqueue.h>
  76 #include <sys/sysctl.h>
  77 #include <sys/ttycom.h>
  78 #include <sys/conf.h>
  79 #include <sys/syslog.h>
  80 #include <sys/unistd.h>
  81 #include <sys/user.h>
  82
  83 #include <security/audit/audit.h>
  84 #include <security/mac/mac_framework.h>
  85
  86 #include <vm/vm.h>
  87 #include <vm/vm_extern.h>
  88 #include <vm/pmap.h>
  89 #include <vm/vm_map.h>
  90 #include <vm/vm_object.h>
  91 #include <vm/vm_page.h>
  92 #include <vm/vm_pager.h>
  93
  94 #ifdef HWPMC_HOOKS
  95 #include <sys/pmckern.h>
  96 #endif
  97
  /*
   * Forward declarations of the file-local handlers.  Most of them are
   * installed in the vnops table that follows; vn_read and vn_write are
   * declared here but are not referenced by that table, which routes both
   * read and write through vn_io_fault.
   */
  98 static fo_rdwr_t        vn_read;
  99 static fo_rdwr_t        vn_write;
 100 static fo_rdwr_t        vn_io_fault;
 101 static fo_truncate_t    vn_truncate;
 102 static fo_ioctl_t       vn_ioctl;
 103 static fo_poll_t        vn_poll;
 104 static fo_kqfilter_t    vn_kqfilter;
 105 static fo_stat_t        vn_statfile;
 106 static fo_close_t       vn_closefile;
 107 static fo_mmap_t        vn_mmap;
 108 static fo_fallocate_t   vn_fallocate;
 109
 /*
  * File operations vector used for files backed by a vnode
  * (vn_open_vnode() pairs it with DTYPE_VNODE).  Read and write share
  * the vn_io_fault entry point.
  */
 110 struct  fileops vnops = {
 111         .fo_read = vn_io_fault,
 112         .fo_write = vn_io_fault,
 113         .fo_truncate = vn_truncate,
 114         .fo_ioctl = vn_ioctl,
 115         .fo_poll = vn_poll,
 116         .fo_kqfilter = vn_kqfilter,
 117         .fo_stat = vn_statfile,
 118         .fo_close = vn_closefile,
 119         .fo_chmod = vn_chmod,
 120         .fo_chown = vn_chown,
 121         .fo_sendfile = vn_sendfile,
 122         .fo_seek = vn_seek,
 123         .fo_fill_kinfo = vn_fill_kinfo,
 124         .fo_mmap = vn_mmap,
 125         .fo_fallocate = vn_fallocate,
 126         .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 127 };
 128
 /*
  * Tunables and counters for the vnode I/O paths.
  *
  * NOTE(review): io_hold_cnt is not referenced in the visible part of
  * this file; its consumer needs to be confirmed before changing it.
  */
 129 const u_int io_hold_cnt = 16;
     /* debug.vn_io_fault_enable: consulted by do_vn_io_fault(). */
 130 static int vn_io_fault_enable = 1;
 131 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN,
 132     &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
     /* debug.vn_io_fault_prefault: off by default. */
 133 static int vn_io_fault_prefault = 0;
 134 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN,
 135     &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
     /* debug.vn_io_pgcache_read_enable: on by default. */
 136 static int vn_io_pgcache_read_enable = 1;
 137 SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN,
 138     &vn_io_pgcache_read_enable, 0,
 139     "Enable copying from page cache for reads, avoiding fs");
     /* debug.vn_io_faults: read-only counter exported to userland. */
 140 static u_long vn_io_faults_cnt;
 141 SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
 142     &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
 143
     /* security.bsd.allow_read_dir: off by default. */
 144 static int vfs_allow_read_dir = 0;
 145 SYSCTL_INT(_security_bsd, OID_AUTO, allow_read_dir, CTLFLAG_RW,
 146     &vfs_allow_read_dir, 0,
 147     "Enable read(2) of directory by root for filesystems that support it");
 148
 149 /*
 150  * Returns true if vn_io_fault mode of handling the i/o request should
 151  * be used.
 152  */
 153 static bool
 154 do_vn_io_fault(struct vnode *vp, struct uio *uio)
 155 {
 156         struct mount *mp;
 157
 158         return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
 159             (mp = vp->v_mount) != NULL &&
 160             (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
 161 }
 162
 163 /*
 164  * Structure used to pass arguments to vn_io_fault1(), to do either
 165  * file- or vnode-based I/O calls.
      *
      * `kind' records which member of the `args' union was filled in.
      * vn_rdwr() builds the VN_IO_FAULT_VOP form: it stores the vnode in
      * args.vop_args.vp, the credential it selected in `cred' and its
      * ioflg value in `flags'.  The VN_IO_FAULT_FOP form carries a
      * struct file and an fo_rdwr_t callback instead; its producer is
      * not in the visible part of this file.
 166  */
 167 struct vn_io_fault_args {
 168         enum {
 169                 VN_IO_FAULT_FOP,
 170                 VN_IO_FAULT_VOP
 171         } kind;
 172         struct ucred *cred;
 173         int flags;
 174         union {
 175                 struct fop_args_tag {
 176                         struct file *fp;
 177                         fo_rdwr_t *doio;
 178                 } fop_args;
 179                 struct vop_args_tag {
 180                         struct vnode *vp;
 181                 } vop_args;
 182         } args;
 183 };
 184
     /* Defined later in the file; called from vn_rdwr() below. */
 185 static int vn_io_fault1(struct vnode *vp, struct uio *uio,
 186     struct vn_io_fault_args *args, struct thread *td);
 187
 188 int
 189 vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp)
 190 {
 191         struct thread *td = ndp->ni_cnd.cn_thread;
 192
 193         return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
 194 }
 195
 196 static uint64_t
 197 open2nameif(int fmode, u_int vn_open_flags)
 198 {
 199         uint64_t res;
 200
 201         res = ISOPEN | LOCKLEAF;
 202         if ((fmode & O_BENEATH) != 0)
 203                 res |= BENEATH;
 204         if ((fmode & O_RESOLVE_BENEATH) != 0)
 205                 res |= RBENEATH;
 206         if ((vn_open_flags & VN_OPEN_NOAUDIT) == 0)
 207                 res |= AUDITVNODE1;
 208         if ((vn_open_flags & VN_OPEN_NOCAPCHECK) != 0)
 209                 res |= NOCAPCHECK;
 210         return (res);
 211 }
 212
 213 /*
 214  * Common code for vnode open operations via a name lookup.
 215  * Lookup the vnode and invoke VOP_CREATE if needed.
 216  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 217  *
 218  * Note that this does NOT free nameidata for the successful case,
 219  * due to the NDINIT being done elsewhere.
      *
      * ndp            lookup state; cn_nameiop and cn_flags are overwritten
      *                here.  On success ndp->ni_vp is the opened vnode; on
      *                the `bad' path it is reset to NULL.
      * flagp          in/out open flags.  The value is re-read on every
      *                restart and written back on success and on the `bad'
      *                path (not on the early returns): O_TRUNC is cleared
      *                when the file was created here, O_CREAT is cleared
      *                when the file already existed.
      * cmode          mode stored in va_mode for a newly created file.
      * vn_open_flags  VN_OPEN_* modifiers (NOAUDIT, NOCAPCHECK, INVFS and
      *                NAMECACHE are the ones examined here or in
      *                open2nameif()).
      * cred, fp       passed on to the MAC create check and to
      *                vn_open_vnode().
      *
      * Returns 0 or an errno value.  Errors generated directly by this
      * function: EINVAL for O_CREAT | O_EXCL | O_DIRECTORY, EEXIST for
      * O_EXCL on an existing file, EISDIR for O_CREAT on a directory.
 220  */
 221 int
 222 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
 223     struct ucred *cred, struct file *fp)
 224 {
 225         struct vnode *vp;
 226         struct mount *mp;
 227         struct thread *td = ndp->ni_cnd.cn_thread;
 228         struct vattr vat;
 229         struct vattr *vap = &vat;
 230         int fmode, error;
 231
 232 restart:
             /* Start every pass from the caller's flags, not the local copy. */
 233         fmode = *flagp;
 234         if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT |
 235             O_EXCL | O_DIRECTORY))
 236                 return (EINVAL);
 237         else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) {
                     /* O_CREAT without O_DIRECTORY: lookup for creation. */
 238                 ndp->ni_cnd.cn_nameiop = CREATE;
 239                 ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
 240                 /*
 241                  * Set NOCACHE to avoid flushing the cache when
 242                  * rolling in many files at once.
 243                  *
 244                  * Set NC_KEEPPOSENTRY to keep positive entries if they already
 245                  * exist despite NOCACHE.
 246                  */
 247                 ndp->ni_cnd.cn_flags |= LOCKPARENT | NOCACHE | NC_KEEPPOSENTRY;
 248                 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
 249                         ndp->ni_cnd.cn_flags |= FOLLOW;
 250                 if ((vn_open_flags & VN_OPEN_INVFS) == 0)
 251                         bwillwrite();
 252                 if ((error = namei(ndp)) != 0)
 253                         return (error);
 254                 if (ndp->ni_vp == NULL) {
                             /* Name does not exist: create a regular file. */
 255                         VATTR_NULL(vap);
 256                         vap->va_type = VREG;
 257                         vap->va_mode = cmode;
 258                         if (fmode & O_EXCL)
 259                                 vap->va_vaflags |= VA_EXCLUSIVE;
                             /*
                              * If write access cannot be started without
                              * sleeping, release the lookup results, wait
                              * (interruptibly) and redo the whole lookup.
                              */
 260                         if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
 261                                 NDFREE(ndp, NDF_ONLY_PNBUF);
 262                                 vput(ndp->ni_dvp);
 263                                 if ((error = vn_start_write(NULL, &mp,
 264                                     V_XSLEEP | PCATCH)) != 0)
 265                                         return (error);
 266                                 NDREINIT(ndp);
 267                                 goto restart;
 268                         }
 269                         if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
 270                                 ndp->ni_cnd.cn_flags |= MAKEENTRY;
 271 #ifdef MAC
 272                         error = mac_vnode_check_create(cred, ndp->ni_dvp,
 273                             &ndp->ni_cnd, vap);
 274                         if (error == 0)
 275 #endif
 276                                 error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
 277                                     &ndp->ni_cnd, vap);
                             /* The new vnode is passed along only on success. */
 278                         VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &ndp->ni_vp :
 279                             NULL, false);
 280                         vn_finished_write(mp);
 281                         if (error) {
 282                                 NDFREE(ndp, NDF_ONLY_PNBUF);
                                     /* ERELOOKUP asks for the lookup to be redone. */
 283                                 if (error == ERELOOKUP) {
 284                                         NDREINIT(ndp);
 285                                         goto restart;
 286                                 }
 287                                 return (error);
 288                         }
                             /* A file created just now is not truncated. */
 289                         fmode &= ~O_TRUNC;
 290                         vp = ndp->ni_vp;
 291                 } else {
                             /*
                              * Name exists.  Release the parent; when parent
                              * and target are the same vnode only vrele() is
                              * used, so the target is not vput() here.
                              */
 292                         if (ndp->ni_dvp == ndp->ni_vp)
 293                                 vrele(ndp->ni_dvp);
 294                         else
 295                                 vput(ndp->ni_dvp);
 296                         ndp->ni_dvp = NULL;
 297                         vp = ndp->ni_vp;
 298                         if (fmode & O_EXCL) {
 299                                 error = EEXIST;
 300                                 goto bad;
 301                         }
 302                         if (vp->v_type == VDIR) {
 303                                 error = EISDIR;
 304                                 goto bad;
 305                         }
                             /* Nothing was created; continue as a plain open. */
 306                         fmode &= ~O_CREAT;
 307                 }
 308         } else {
                     /* Plain lookup; the leaf is locked shared unless opened for write. */
 309                 ndp->ni_cnd.cn_nameiop = LOOKUP;
 310                 ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
 311                 ndp->ni_cnd.cn_flags |= (fmode & O_NOFOLLOW) != 0 ? NOFOLLOW :
 312                     FOLLOW;
 313                 if ((fmode & FWRITE) == 0)
 314                         ndp->ni_cnd.cn_flags |= LOCKSHARED;
 315                 if ((error = namei(ndp)) != 0)
 316                         return (error);
 317                 vp = ndp->ni_vp;
 318         }
 319         error = vn_open_vnode(vp, fmode, cred, td, fp);
 320         if (error)
 321                 goto bad;
 322         *flagp = fmode;
 323         return (0);
 324 bad:
             /* Failure with a vnode in hand: free the path buffer, vput the vnode. */
 325         NDFREE(ndp, NDF_ONLY_PNBUF);
 326         vput(vp);
 327         *flagp = fmode;
 328         ndp->ni_vp = NULL;
 329         return (error);
 330 }
 331
 332 static int
 333 vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp)
 334 {
 335         struct flock lf;
 336         int error, lock_flags, type;
 337
 338         ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock");
 339         if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0)
 340                 return (0);
 341         KASSERT(fp != NULL, ("open with flock requires fp"));
 342         if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE)
 343                 return (EOPNOTSUPP);
 344
 345         lock_flags = VOP_ISLOCKED(vp);
 346         VOP_UNLOCK(vp);
 347
 348         lf.l_whence = SEEK_SET;
 349         lf.l_start = 0;
 350         lf.l_len = 0;
 351         lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK;
 352         type = F_FLOCK;
 353         if ((fmode & FNONBLOCK) == 0)
 354                 type |= F_WAIT;
 355         error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
 356         if (error == 0)
 357                 fp->f_flag |= FHASLOCK;
 358
 359         vn_lock(vp, lock_flags | LK_RETRY);
 360         return (error);
 361 }
 362
 363 /*
 364  * Common code for vnode open operations once a vnode is located.
 365  * Check permissions, and call the VOP_OPEN routine.
 366  */
 367 int
 368 vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
 369     struct thread *td, struct file *fp)
 370 {
 371         accmode_t accmode;
 372         int error;
 373
 374         if (vp->v_type == VLNK)
 375                 return (EMLINK);
 376         if (vp->v_type == VSOCK)
 377                 return (EOPNOTSUPP);
 378         if (vp->v_type != VDIR && fmode & O_DIRECTORY)
 379                 return (ENOTDIR);
 380         accmode = 0;
 381         if (fmode & (FWRITE | O_TRUNC)) {
 382                 if (vp->v_type == VDIR)
 383                         return (EISDIR);
 384                 accmode |= VWRITE;
 385         }
 386         if (fmode & FREAD)
 387                 accmode |= VREAD;
 388         if (fmode & FEXEC)
 389                 accmode |= VEXEC;
 390         if ((fmode & O_APPEND) && (fmode & FWRITE))
 391                 accmode |= VAPPEND;
 392 #ifdef MAC
 393         if (fmode & O_CREAT)
 394                 accmode |= VCREAT;
 395         if (fmode & O_VERIFY)
 396                 accmode |= VVERIFY;
 397         error = mac_vnode_check_open(cred, vp, accmode);
 398         if (error)
 399                 return (error);
 400
 401         accmode &= ~(VCREAT | VVERIFY);
 402 #endif
 403         if ((fmode & O_CREAT) == 0 && accmode != 0) {
 404                 error = VOP_ACCESS(vp, accmode, cred, td);
 405                 if (error != 0)
 406                         return (error);
 407         }
 408         if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 409                 vn_lock(vp, LK_UPGRADE | LK_RETRY);
 410         error = VOP_OPEN(vp, fmode, cred, td, fp);
 411         if (error != 0)
 412                 return (error);
 413
 414         error = vn_open_vnode_advlock(vp, fmode, fp);
 415         if (error == 0 && (fmode & FWRITE) != 0) {
 416                 error = VOP_ADD_WRITECOUNT(vp, 1);
 417                 if (error == 0) {
 418                         CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
 419                              __func__, vp, vp->v_writecount);
 420                 }
 421         }
 422
 423         /*
 424          * Error from advlock or VOP_ADD_WRITECOUNT() still requires
 425          * calling VOP_CLOSE() to pair with earlier VOP_OPEN().
 426          * Arrange for that by having fdrop() to use vn_closefile().
 427          */
 428         if (error != 0) {
 429                 fp->f_flag |= FOPENFAILED;
 430                 fp->f_vnode = vp;
 431                 if (fp->f_ops == &badfileops) {
 432                         fp->f_type = DTYPE_VNODE;
 433                         fp->f_ops = &vnops;
 434                 }
 435                 vref(vp);
 436         }
 437
 438         ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
 439         return (error);
 440
 441 }
 442
 443 /*
 444  * Check for write permissions on the specified vnode.
 445  * Prototype text segments cannot be written.
 446  * It is racy.
 447  */
 448 int
 449 vn_writechk(struct vnode *vp)
 450 {
 451
 452         ASSERT_VOP_LOCKED(vp, "vn_writechk");
 453         /*
 454          * If there's shared text associated with
 455          * the vnode, try to free it up once.  If
 456          * we fail, we can't allow writing.
 457          */
 458         if (VOP_IS_TEXT(vp))
 459                 return (ETXTBSY);
 460
 461         return (0);
 462 }
 463
 464 /*
 465  * Vnode close call
 466  */
 467 static int
 468 vn_close1(struct vnode *vp, int flags, struct ucred *file_cred,
 469     struct thread *td, bool keep_ref)
 470 {
 471         struct mount *mp;
 472         int error, lock_flags;
 473
 474         if (vp->v_type != VFIFO && (flags & FWRITE) == 0 &&
 475             MNT_EXTENDED_SHARED(vp->v_mount))
 476                 lock_flags = LK_SHARED;
 477         else
 478                 lock_flags = LK_EXCLUSIVE;
 479
 480         vn_start_write(vp, &mp, V_WAIT);
 481         vn_lock(vp, lock_flags | LK_RETRY);
 482         AUDIT_ARG_VNODE1(vp);
 483         if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) {
 484                 VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
 485                 CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
 486                     __func__, vp, vp->v_writecount);
 487         }
 488         error = VOP_CLOSE(vp, flags, file_cred, td);
 489         if (keep_ref)
 490                 VOP_UNLOCK(vp);
 491         else
 492                 vput(vp);
 493         vn_finished_write(mp);
 494         return (error);
 495 }
 496
 497 int
 498 vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
 499     struct thread *td)
 500 {
 501
 502         return (vn_close1(vp, flags, file_cred, td, false));
 503 }
 504
 505 /*
 506  * Heuristic to detect sequential operation.
 507  */
 508 static int
 509 sequential_heuristic(struct uio *uio, struct file *fp)
 510 {
 511         enum uio_rw rw;
 512
 513         ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
 514
 515         rw = uio->uio_rw;
 516         if (fp->f_flag & FRDAHEAD)
 517                 return (fp->f_seqcount[rw] << IO_SEQSHIFT);
 518
 519         /*
 520          * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
 521          * that the first I/O is normally considered to be slightly
 522          * sequential.  Seeking to offset 0 doesn't change sequentiality
 523          * unless previous seeks have reduced f_seqcount to 0, in which
 524          * case offset 0 is not special.
 525          */
 526         if ((uio->uio_offset == 0 && fp->f_seqcount[rw] > 0) ||
 527             uio->uio_offset == fp->f_nextoff[rw]) {
 528                 /*
 529                  * f_seqcount is in units of fixed-size blocks so that it
 530                  * depends mainly on the amount of sequential I/O and not
 531                  * much on the number of sequential I/O's.  The fixed size
 532                  * of 16384 is hard-coded here since it is (not quite) just
 533                  * a magic size that works well here.  This size is more
 534                  * closely related to the best I/O size for real disks than
 535                  * to any block size used by software.
 536                  */
 537                 if (uio->uio_resid >= IO_SEQMAX * 16384)
 538                         fp->f_seqcount[rw] = IO_SEQMAX;
 539                 else {
 540                         fp->f_seqcount[rw] += howmany(uio->uio_resid, 16384);
 541                         if (fp->f_seqcount[rw] > IO_SEQMAX)
 542                                 fp->f_seqcount[rw] = IO_SEQMAX;
 543                 }
 544                 return (fp->f_seqcount[rw] << IO_SEQSHIFT);
 545         }
 546
 547         /* Not sequential.  Quickly draw-down sequentiality. */
 548         if (fp->f_seqcount[rw] > 1)
 549                 fp->f_seqcount[rw] = 1;
 550         else
 551                 fp->f_seqcount[rw] = 0;
 552         return (0);
 553 }
 554
 555 /*
 556  * Package up an I/O request on a vnode into a uio and do it.
      *
      * A single-segment uio describing `len' bytes at `base', positioned
      * at `offset', is built and handed to VOP_READ()/VOP_WRITE(), or to
      * vn_io_fault1() when do_vn_io_fault() selects that mode.
      *
      * ioflg bits examined here:
      *   IO_NODELOCKED   the caller already holds the vnode lock; no range
      *                   lock, vn_start_write() or vn_lock() is done here.
      *   IO_RANGELOCKED  do not take the range lock.
      *   IO_APPEND       for writes, the range lock covers 0..OFF_MAX.
      *   IO_NOMACCHECK   skip the MAC read/write check (MAC kernels only).
      * The full ioflg value is also passed through to the VOP.
      *
      * file_cred is used for the I/O when non-NULL, otherwise active_cred.
      *
      * If aresid is non-NULL it receives the residual count; if it is
      * NULL, an otherwise successful short transfer is turned into EIO.
      *
      * Returns 0 or an errno value; EINVAL for a negative offset on
      * anything but a VCHR vnode.
 557  */
 558 int
 559 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
 560     enum uio_seg segflg, int ioflg, struct ucred *active_cred,
 561     struct ucred *file_cred, ssize_t *aresid, struct thread *td)
 562 {
 563         struct uio auio;
 564         struct iovec aiov;
 565         struct mount *mp;
 566         struct ucred *cred;
 567         void *rl_cookie;
 568         struct vn_io_fault_args args;
 569         int error, lock_flags;
 570
 571         if (offset < 0 && vp->v_type != VCHR)
 572                 return (EINVAL);
 573         auio.uio_iov = &aiov;
 574         auio.uio_iovcnt = 1;
 575         aiov.iov_base = base;
 576         aiov.iov_len = len;
 577         auio.uio_resid = len;
 578         auio.uio_offset = offset;
 579         auio.uio_segflg = segflg;
 580         auio.uio_rw = rw;
 581         auio.uio_td = td;
 582         error = 0;
 583
             /*
              * Unless the caller holds the vnode lock, acquire in order:
              * range lock, write access (writes only), vnode lock.
              */
 584         if ((ioflg & IO_NODELOCKED) == 0) {
 585                 if ((ioflg & IO_RANGELOCKED) == 0) {
 586                         if (rw == UIO_READ) {
 587                                 rl_cookie = vn_rangelock_rlock(vp, offset,
 588                                     offset + len);
 589                         } else if ((ioflg & IO_APPEND) != 0) {
 590                                 rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 591                         } else {
 592                                 rl_cookie = vn_rangelock_wlock(vp, offset,
 593                                     offset + len);
 594                         }
 595                 } else
 596                         rl_cookie = NULL;
 597                 mp = NULL;
 598                 if (rw == UIO_WRITE) {
                             /*
                              * On failure the vnode is not locked yet, so
                              * only the range lock is released at `out'.
                              * *aresid is not written on that path.
                              */
 599                         if (vp->v_type != VCHR &&
 600                             (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
 601                             != 0)
 602                                 goto out;
                             /*
                              * mp is still NULL here for VCHR vnodes, for
                              * which vn_start_write() is skipped above; the
                              * vnode's own mount is consulted in that case.
                              */
 603                         if (MNT_SHARED_WRITES(mp) ||
 604                             ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
 605                                 lock_flags = LK_SHARED;
 606                         else
 607                                 lock_flags = LK_EXCLUSIVE;
 608                 } else
 609                         lock_flags = LK_SHARED;
 610                 vn_lock(vp, lock_flags | LK_RETRY);
 611         } else
 612                 rl_cookie = NULL;
 613
 614         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 615 #ifdef MAC
 616         if ((ioflg & IO_NOMACCHECK) == 0) {
 617                 if (rw == UIO_READ)
 618                         error = mac_vnode_check_read(active_cred, file_cred,
 619                             vp);
 620                 else
 621                         error = mac_vnode_check_write(active_cred, file_cred,
 622                             vp);
 623         }
 624 #endif
 625         if (error == 0) {
 626                 if (file_cred != NULL)
 627                         cred = file_cred;
 628                 else
 629                         cred = active_cred;
 630                 if (do_vn_io_fault(vp, &auio)) {
 631                         args.kind = VN_IO_FAULT_VOP;
 632                         args.cred = cred;
 633                         args.flags = ioflg;
 634                         args.args.vop_args.vp = vp;
 635                         error = vn_io_fault1(vp, &auio, &args, td);
 636                 } else if (rw == UIO_READ) {
 637                         error = VOP_READ(vp, &auio, ioflg, cred);
 638                 } else /* if (rw == UIO_WRITE) */ {
 639                         error = VOP_WRITE(vp, &auio, ioflg, cred);
 640                 }
 641         }
             /* Report the residual, or treat a silent short transfer as EIO. */
 642         if (aresid)
 643                 *aresid = auio.uio_resid;
 644         else
 645                 if (auio.uio_resid && error == 0)
 646                         error = EIO;
             /* mp is only assigned, and only read, when IO_NODELOCKED is clear. */
 647         if ((ioflg & IO_NODELOCKED) == 0) {
 648                 VOP_UNLOCK(vp);
 649                 if (mp != NULL)
 650                         vn_finished_write(mp);
 651         }
 652  out:
 653         if (rl_cookie != NULL)
 654                 vn_rangelock_unlock(vp, rl_cookie);
 655         return (error);
 656 }
 657
 658 /*
 659  * Package up an I/O request on a vnode into a uio and do it.  The I/O
 660  * request is split up into smaller chunks and we try to avoid saturating
 661  * the buffer cache while potentially holding a vnode locked, so we
 662  * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
 663  * to give other processes a chance to lock the vnode (either other processes
 664  * core'ing the same binary, or unrelated processes scanning the directory).
 665  */
 666 int
 667 vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len,
 668     off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred,
 669     struct ucred *file_cred, size_t *aresid, struct thread *td)
 670 {
 671         int error = 0;
 672         ssize_t iaresid;
 673
 674         do {
 675                 int chunk;
 676
 677                 /*
 678                  * Force `offset' to a multiple of MAXBSIZE except possibly
 679                  * for the first chunk, so that filesystems only need to
 680                  * write full blocks except possibly for the first and last
 681                  * chunks.
 682                  */
 683                 chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
 684
 685                 if (chunk > len)
 686                         chunk = len;
 687                 if (rw != UIO_READ && vp->v_type == VREG)
 688                         bwillwrite();
 689                 iaresid = 0;
 690                 error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
 691                     ioflg, active_cred, file_cred, &iaresid, td);
 692                 len -= chunk;   /* aresid calc already includes length */
 693                 if (error)
 694                         break;
 695                 offset += chunk;
 696                 base = (char *)base + chunk;
 697                 kern_yield(PRI_USER);
 698         } while (len);
 699         if (aresid)
 700                 *aresid = len + iaresid;
 701         return (error);
 702 }
 703
 704 #if OFF_MAX <= LONG_MAX
 /*
  * Return the current f_offset of fp and, unless FOF_NOLOCK is passed,
  * first acquire the file offset lock (the FOFFSET_LOCKED bit in
  * f_vnread_flags), sleeping until it can be taken.  FOF_OFFSET must
  * not be set.
  *
  * This is the variant compiled when OFF_MAX <= LONG_MAX; the offset
  * is read with atomic_load_long() and the flag word is manipulated
  * with 16-bit atomics.
  */
 705 off_t
 706 foffset_lock(struct file *fp, int flags)
 707 {
 708         volatile short *flagsp;
 709         off_t res;
 710         short state;
 711
 712         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 713
 714         if ((flags & FOF_NOLOCK) != 0)
 715                 return (atomic_load_long(&fp->f_offset));
 716
 717         /*
 718          * According to McKusick the vn lock was protecting f_offset here.
 719          * It is now protected by the FOFFSET_LOCKED flag.
 720          */
 721         flagsp = &fp->f_vnread_flags;
             /* Fast path: nobody holds the lock and nobody is waiting. */
 722         if (atomic_cmpset_acq_16(flagsp, 0, FOFFSET_LOCKED))
 723                 return (atomic_load_long(&fp->f_offset));
 724
             /*
              * Slow path, run with the sleep queue chain for the flag
              * word locked.  The `continue's re-test `state' without
              * reloading it; this relies on atomic_fcmpset refreshing
              * `state' when it fails -- see atomic(9).
              */
 725         sleepq_lock(&fp->f_vnread_flags);
 726         state = atomic_load_16(flagsp);
 727         for (;;) {
 728                 if ((state & FOFFSET_LOCKED) == 0) {
                             /* Lock is free: try to take it. */
 729                         if (!atomic_fcmpset_acq_16(flagsp, &state,
 730                             FOFFSET_LOCKED))
 731                                 continue;
 732                         break;
 733                 }
                     /* Lock is held: make sure the waiting bit is set. */
 734                 if ((state & FOFFSET_LOCK_WAITING) == 0) {
 735                         if (!atomic_fcmpset_acq_16(flagsp, &state,
 736                             state | FOFFSET_LOCK_WAITING))
 737                                 continue;
 738                 }
 739                 DROP_GIANT();
 740                 sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0);
 741                 sleepq_wait(&fp->f_vnread_flags, PUSER -1);
 742                 PICKUP_GIANT();
                     /* Re-take the chain lock and look at the flags again. */
 743                 sleepq_lock(&fp->f_vnread_flags);
 744                 state = atomic_load_16(flagsp);
 745         }
 746         res = atomic_load_long(&fp->f_offset);
 747         sleepq_release(&fp->f_vnread_flags);
 748         return (res);
 749 }
 750
 /*
  * Counterpart of foffset_lock() for the OFF_MAX <= LONG_MAX case.
  *
  * Stores val into f_offset unless FOF_NOUPDATE is set, records val as
  * the next expected read and/or write offset when FOF_NEXTOFF_R /
  * FOF_NEXTOFF_W are set, and then, unless FOF_NOLOCK is set, releases
  * the offset lock and wakes up any waiters.  FOF_OFFSET must not be
  * set.
  */
 751 void
 752 foffset_unlock(struct file *fp, off_t val, int flags)
 753 {
 754         volatile short *flagsp;
 755         short state;
 756
 757         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 758
 759         if ((flags & FOF_NOUPDATE) == 0)
 760                 atomic_store_long(&fp->f_offset, val);
 761         if ((flags & FOF_NEXTOFF_R) != 0)
 762                 fp->f_nextoff[UIO_READ] = val;
 763         if ((flags & FOF_NEXTOFF_W) != 0)
 764                 fp->f_nextoff[UIO_WRITE] = val;
 765
 766         if ((flags & FOF_NOLOCK) != 0)
 767                 return;
 768
 769         flagsp = &fp->f_vnread_flags;
 770         state = atomic_load_16(flagsp);
             /*
              * Fast path: no waiter was seen, so try to clear the flags
              * without taking the sleep queue lock.  If the compare-and-set
              * fails, fall through to the slow path.
              */
 771         if ((state & FOFFSET_LOCK_WAITING) == 0 &&
 772             atomic_cmpset_rel_16(flagsp, state, 0))
 773                 return;
 774
             /* Slow path: clear both bits and wake every waiter. */
 775         sleepq_lock(&fp->f_vnread_flags);
 776         MPASS((fp->f_vnread_flags & FOFFSET_LOCKED) != 0);
 777         MPASS((fp->f_vnread_flags & FOFFSET_LOCK_WAITING) != 0);
 778         fp->f_vnread_flags = 0;
 779         sleepq_broadcast(&fp->f_vnread_flags, SLEEPQ_SLEEP, 0, 0);
 780         sleepq_release(&fp->f_vnread_flags);
 781 }
 782 #else
 783 off_t
 784 foffset_lock(struct file *fp, int flags)
 785 {
 786         struct mtx *mtxp;
 787         off_t res;
 788
 789         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 790
 791         mtxp = mtx_pool_find(mtxpool_sleep, fp);
 792         mtx_lock(mtxp);
 793         if ((flags & FOF_NOLOCK) == 0) {
 794                 while (fp->f_vnread_flags & FOFFSET_LOCKED) {
 795                         fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
 796                         msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
 797                             "vofflock", 0);
 798                 }
 799                 fp->f_vnread_flags |= FOFFSET_LOCKED;
 800         }
 801         res = fp->f_offset;
 802         mtx_unlock(mtxp);
 803         return (res);
 804 }
 805
 806 void
 807 foffset_unlock(struct file *fp, off_t val, int flags)
 808 {
 809         struct mtx *mtxp;
 810
 811         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 812
 813         mtxp = mtx_pool_find(mtxpool_sleep, fp);
 814         mtx_lock(mtxp);
 815         if ((flags & FOF_NOUPDATE) == 0)
 816                 fp->f_offset = val;
 817         if ((flags & FOF_NEXTOFF_R) != 0)
 818                 fp->f_nextoff[UIO_READ] = val;
 819         if ((flags & FOF_NEXTOFF_W) != 0)
 820                 fp->f_nextoff[UIO_WRITE] = val;
 821         if ((flags & FOF_NOLOCK) == 0) {
 822                 KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
 823                     ("Lost FOFFSET_LOCKED"));
 824                 if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
 825                         wakeup(&fp->f_vnread_flags);
 826                 fp->f_vnread_flags = 0;
 827         }
 828         mtx_unlock(mtxp);
 829 }
 830 #endif
 831
 832 void
 833 foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
 834 {
 835
 836         if ((flags & FOF_OFFSET) == 0)
 837                 uio->uio_offset = foffset_lock(fp, flags);
 838 }
 839
 840 void
 841 foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
 842 {
 843
 844         if ((flags & FOF_OFFSET) == 0)
 845                 foffset_unlock(fp, uio->uio_offset, flags);
 846 }
 847
 848 static int
 849 get_advice(struct file *fp, struct uio *uio)
 850 {
 851         struct mtx *mtxp;
 852         int ret;
 853
 854         ret = POSIX_FADV_NORMAL;
 855         if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG)
 856                 return (ret);
 857
 858         mtxp = mtx_pool_find(mtxpool_sleep, fp);
 859         mtx_lock(mtxp);
 860         if (fp->f_advice != NULL &&
 861             uio->uio_offset >= fp->f_advice->fa_start &&
 862             uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
 863                 ret = fp->f_advice->fa_advice;
 864         mtx_unlock(mtxp);
 865         return (ret);
 866 }
 867
 868 int
 869 vn_read_from_obj(struct vnode *vp, struct uio *uio)
 870 {
 871         vm_object_t obj;
 872         vm_page_t ma[io_hold_cnt + 2];
 873         off_t off, vsz;
 874         ssize_t resid;
 875         int error, i, j;
 876
 877         MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2));
 878         obj = atomic_load_ptr(&vp->v_object);
 879         if (obj == NULL)
 880                 return (EJUSTRETURN);
 881
 882         /*
 883          * Depends on type stability of vm_objects.
 884          */
 885         vm_object_pip_add(obj, 1);
 886         if ((obj->flags & OBJ_DEAD) != 0) {
 887                 /*
 888                  * Note that object might be already reused from the
 889                  * vnode, and the OBJ_DEAD flag cleared.  This is fine,
 890                  * we recheck for DOOMED vnode state after all pages
 891                  * are busied, and retract then.
 892                  *
 893                  * But we check for OBJ_DEAD to ensure that we do not
 894                  * busy pages while vm_object_terminate_pages()
 895                  * processes the queue.
 896                  */
 897                 error = EJUSTRETURN;
 898                 goto out_pip;
 899         }
 900
 901         resid = uio->uio_resid;
 902         off = uio->uio_offset;
 903         for (i = 0; resid > 0; i++) {
 904                 MPASS(i < io_hold_cnt + 2);
 905                 ma[i] = vm_page_grab_unlocked(obj, atop(off),
 906                     VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY |
 907                     VM_ALLOC_NOWAIT);
 908                 if (ma[i] == NULL)
 909                         break;
 910
 911                 /*
 912                  * Skip invalid pages.  Valid mask can be partial only
 913                  * at EOF, and we clip later.
 914                  */
 915                 if (vm_page_none_valid(ma[i])) {
 916                         vm_page_sunbusy(ma[i]);
 917                         break;
 918                 }
 919
 920                 resid -= PAGE_SIZE;
 921                 off += PAGE_SIZE;
 922         }
 923         if (i == 0) {
 924                 error = EJUSTRETURN;
 925                 goto out_pip;
 926         }
 927
 928         /*
 929          * Check VIRF_DOOMED after we busied our pages.  Since
 930          * vgonel() terminates the vnode' vm_object, it cannot
 931          * process past pages busied by us.
 932          */
 933         if (VN_IS_DOOMED(vp)) {
 934                 error = EJUSTRETURN;
 935                 goto out;
 936         }
 937
 938         resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1);
 939         if (resid > uio->uio_resid)
 940                 resid = uio->uio_resid;
 941
 942         /*
 943          * Unlocked read of vnp_size is safe because truncation cannot
 944          * pass busied page.  But we load vnp_size into a local
 945          * variable so that possible concurrent extension does not
 946          * break calculation.
 947          */
 948 #if defined(__powerpc__) && !defined(__powerpc64__)
 949         vsz = obj->un_pager.vnp.vnp_size;
 950 #else
 951         vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size);
 952 #endif
 953         if (uio->uio_offset + resid > vsz)
 954                 resid = vsz - uio->uio_offset;
 955
 956         error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio);
 957
 958 out:
 959         for (j = 0; j < i; j++) {
 960                 if (error == 0)
 961                         vm_page_reference(ma[j]);
 962                 vm_page_sunbusy(ma[j]);
 963         }
 964 out_pip:
 965         vm_object_pip_wakeup(obj);
 966         if (error != 0)
 967                 return (error);
 968         return (uio->uio_resid == 0 ? 0 : EJUSTRETURN);
 969 }
 970
 971 /*
 972  * File table vnode read routine.
 973  */
 974 static int
 975 vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
 976     struct thread *td)
 977 {
 978         struct vnode *vp;
 979         off_t orig_offset;
 980         int error, ioflag;
 981         int advice;
 982
 983         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 984             uio->uio_td, td));
 985         KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 986         vp = fp->f_vnode;
 987         ioflag = 0;
 988         if (fp->f_flag & FNONBLOCK)
 989                 ioflag |= IO_NDELAY;
 990         if (fp->f_flag & O_DIRECT)
 991                 ioflag |= IO_DIRECT;
 992
 993         /*
 994          * Try to read from page cache.  VIRF_DOOMED check is racy but
 995          * allows us to avoid unneeded work outright.
 996          */
 997         if (vn_io_pgcache_read_enable && !mac_vnode_check_read_enabled() &&
 998             (vn_irflag_read(vp) & (VIRF_DOOMED | VIRF_PGREAD)) == VIRF_PGREAD) {
 999                 error = VOP_READ_PGCACHE(vp, uio, ioflag, fp->f_cred);
1000                 if (error == 0) {
1001                         fp->f_nextoff[UIO_READ] = uio->uio_offset;
1002                         return (0);
1003                 }
1004                 if (error != EJUSTRETURN)
1005                         return (error);
1006         }
1007
1008         advice = get_advice(fp, uio);
1009         vn_lock(vp, LK_SHARED | LK_RETRY);
1010
1011         switch (advice) {
1012         case POSIX_FADV_NORMAL:
1013         case POSIX_FADV_SEQUENTIAL:
1014         case POSIX_FADV_NOREUSE:
1015                 ioflag |= sequential_heuristic(uio, fp);
1016                 break;
1017         case POSIX_FADV_RANDOM:
1018                 /* Disable read-ahead for random I/O. */
1019                 break;
1020         }
1021         orig_offset = uio->uio_offset;
1022
1023 #ifdef MAC
1024         error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
1025         if (error == 0)
1026 #endif
1027                 error = VOP_READ(vp, uio, ioflag, fp->f_cred);
1028         fp->f_nextoff[UIO_READ] = uio->uio_offset;
1029         VOP_UNLOCK(vp);
1030         if (error == 0 && advice == POSIX_FADV_NOREUSE &&
1031             orig_offset != uio->uio_offset)
1032                 /*
1033                  * Use POSIX_FADV_DONTNEED to flush pages and buffers
1034                  * for the backing file after a POSIX_FADV_NOREUSE
1035                  * read(2).
1036                  */
1037                 error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
1038                     POSIX_FADV_DONTNEED);
1039         return (error);
1040 }
1041
1042 /*
1043  * File table vnode write routine.
1044  */
1045 static int
1046 vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
1047     struct thread *td)
1048 {
1049         struct vnode *vp;
1050         struct mount *mp;
1051         off_t orig_offset;
1052         int error, ioflag, lock_flags;
1053         int advice;
1054
1055         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
1056             uio->uio_td, td));
1057         KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
1058         vp = fp->f_vnode;
1059         if (vp->v_type == VREG)
1060                 bwillwrite();
1061         ioflag = IO_UNIT;
1062         if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
1063                 ioflag |= IO_APPEND;
1064         if (fp->f_flag & FNONBLOCK)
1065                 ioflag |= IO_NDELAY;
1066         if (fp->f_flag & O_DIRECT)
1067                 ioflag |= IO_DIRECT;
1068         if ((fp->f_flag & O_FSYNC) ||
1069             (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
1070                 ioflag |= IO_SYNC;
1071         /*
1072          * For O_DSYNC we set both IO_SYNC and IO_DATASYNC, so that VOP_WRITE()
1073          * implementations that don't understand IO_DATASYNC fall back to full
1074          * O_SYNC behavior.
1075          */
1076         if (fp->f_flag & O_DSYNC)
1077                 ioflag |= IO_SYNC | IO_DATASYNC;
1078         mp = NULL;
1079         if (vp->v_type != VCHR &&
1080             (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
1081                 goto unlock;
1082
1083         advice = get_advice(fp, uio);
1084
1085         if (MNT_SHARED_WRITES(mp) ||
1086             (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) {
1087                 lock_flags = LK_SHARED;
1088         } else {
1089                 lock_flags = LK_EXCLUSIVE;
1090         }
1091
1092         vn_lock(vp, lock_flags | LK_RETRY);
1093         switch (advice) {
1094         case POSIX_FADV_NORMAL:
1095         case POSIX_FADV_SEQUENTIAL:
1096         case POSIX_FADV_NOREUSE:
1097                 ioflag |= sequential_heuristic(uio, fp);
1098                 break;
1099         case POSIX_FADV_RANDOM:
1100                 /* XXX: Is this correct? */
1101                 break;
1102         }
1103         orig_offset = uio->uio_offset;
1104
1105 #ifdef MAC
1106         error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
1107         if (error == 0)
1108 #endif
1109                 error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
1110         fp->f_nextoff[UIO_WRITE] = uio->uio_offset;
1111         VOP_UNLOCK(vp);
1112         if (vp->v_type != VCHR)
1113                 vn_finished_write(mp);
1114         if (error == 0 && advice == POSIX_FADV_NOREUSE &&
1115             orig_offset != uio->uio_offset)
1116                 /*
1117                  * Use POSIX_FADV_DONTNEED to flush pages and buffers
1118                  * for the backing file after a POSIX_FADV_NOREUSE
1119                  * write(2).
1120                  */
1121                 error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
1122                     POSIX_FADV_DONTNEED);
1123 unlock:
1124         return (error);
1125 }
1126
1127 /*
1128  * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
1129  * prevent the following deadlock:
1130  *
 * Assume that thread A reads from the vnode vp1 into userspace
 * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
 * currently not resident, then the system ends up with the call chain
1134  *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
1135  *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
1136  * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
1137  * If, at the same time, thread B reads from vnode vp2 into buffer buf2
1138  * backed by the pages of vnode vp1, and some page in buf2 is not
1139  * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
1140  *
1141  * To prevent the lock order reversal and deadlock, vn_io_fault() does
1142  * not allow page faults to happen during VOP_READ() or VOP_WRITE().
1143  * Instead, it first tries to do the whole range i/o with pagefaults
1144  * disabled. If all pages in the i/o buffer are resident and mapped,
1145  * VOP will succeed (ignoring the genuine filesystem errors).
1146  * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
1147  * i/o in chunks, with all pages in the chunk prefaulted and held
1148  * using vm_fault_quick_hold_pages().
1149  *
1150  * Filesystems using this deadlock avoidance scheme should use the
1151  * array of the held pages from uio, saved in the curthread->td_ma,
1152  * instead of doing uiomove().  A helper function
1153  * vn_io_fault_uiomove() converts uiomove request into
1154  * uiomove_fromphys() over td_ma array.
1155  *
1156  * Since vnode locks do not cover the whole i/o anymore, rangelocks
1157  * make the current i/o request atomic with respect to other i/os and
1158  * truncations.
1159  */
1160
1161 /*
1162  * Decode vn_io_fault_args and perform the corresponding i/o.
1163  */
1164 static int
1165 vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
1166     struct thread *td)
1167 {
1168         int error, save;
1169
1170         error = 0;
1171         save = vm_fault_disable_pagefaults();
1172         switch (args->kind) {
1173         case VN_IO_FAULT_FOP:
1174                 error = (args->args.fop_args.doio)(args->args.fop_args.fp,
1175                     uio, args->cred, args->flags, td);
1176                 break;
1177         case VN_IO_FAULT_VOP:
1178                 if (uio->uio_rw == UIO_READ) {
1179                         error = VOP_READ(args->args.vop_args.vp, uio,
1180                             args->flags, args->cred);
1181                 } else if (uio->uio_rw == UIO_WRITE) {
1182                         error = VOP_WRITE(args->args.vop_args.vp, uio,
1183                             args->flags, args->cred);
1184                 }
1185                 break;
1186         default:
1187                 panic("vn_io_fault_doio: unknown kind of io %d %d",
1188                     args->kind, uio->uio_rw);
1189         }
1190         vm_fault_enable_pagefaults(save);
1191         return (error);
1192 }
1193
1194 static int
1195 vn_io_fault_touch(char *base, const struct uio *uio)
1196 {
1197         int r;
1198
1199         r = fubyte(base);
1200         if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1))
1201                 return (EFAULT);
1202         return (0);
1203 }
1204
1205 static int
1206 vn_io_fault_prefault_user(const struct uio *uio)
1207 {
1208         char *base;
1209         const struct iovec *iov;
1210         size_t len;
1211         ssize_t resid;
1212         int error, i;
1213
1214         KASSERT(uio->uio_segflg == UIO_USERSPACE,
1215             ("vn_io_fault_prefault userspace"));
1216
1217         error = i = 0;
1218         iov = uio->uio_iov;
1219         resid = uio->uio_resid;
1220         base = iov->iov_base;
1221         len = iov->iov_len;
1222         while (resid > 0) {
1223                 error = vn_io_fault_touch(base, uio);
1224                 if (error != 0)
1225                         break;
1226                 if (len < PAGE_SIZE) {
1227                         if (len != 0) {
1228                                 error = vn_io_fault_touch(base + len - 1, uio);
1229                                 if (error != 0)
1230                                         break;
1231                                 resid -= len;
1232                         }
1233                         if (++i >= uio->uio_iovcnt)
1234                                 break;
1235                         iov = uio->uio_iov + i;
1236                         base = iov->iov_base;
1237                         len = iov->iov_len;
1238                 } else {
1239                         len -= PAGE_SIZE;
1240                         base += PAGE_SIZE;
1241                         resid -= PAGE_SIZE;
1242                 }
1243         }
1244         return (error);
1245 }
1246
1247 /*
1248  * Common code for vn_io_fault(), agnostic to the kind of i/o request.
1249  * Uses vn_io_fault_doio() to make the call to an actual i/o function.
1250  * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
1251  * into args and call vn_io_fault1() to handle faults during the user
1252  * mode buffer accesses.
1253  */
1254 static int
1255 vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
1256     struct thread *td)
1257 {
1258         vm_page_t ma[io_hold_cnt + 2];
1259         struct uio *uio_clone, short_uio;
1260         struct iovec short_iovec[1];
1261         vm_page_t *prev_td_ma;
1262         vm_prot_t prot;
1263         vm_offset_t addr, end;
1264         size_t len, resid;
1265         ssize_t adv;
1266         int error, cnt, saveheld, prev_td_ma_cnt;
1267
1268         if (vn_io_fault_prefault) {
1269                 error = vn_io_fault_prefault_user(uio);
1270                 if (error != 0)
1271                         return (error); /* Or ignore ? */
1272         }
1273
1274         prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
1275
1276         /*
1277          * The UFS follows IO_UNIT directive and replays back both
1278          * uio_offset and uio_resid if an error is encountered during the
1279          * operation.  But, since the iovec may be already advanced,
1280          * uio is still in an inconsistent state.
1281          *
1282          * Cache a copy of the original uio, which is advanced to the redo
1283          * point using UIO_NOCOPY below.
1284          */
1285         uio_clone = cloneuio(uio);
1286         resid = uio->uio_resid;
1287
1288         short_uio.uio_segflg = UIO_USERSPACE;
1289         short_uio.uio_rw = uio->uio_rw;
1290         short_uio.uio_td = uio->uio_td;
1291
1292         error = vn_io_fault_doio(args, uio, td);
1293         if (error != EFAULT)
1294                 goto out;
1295
1296         atomic_add_long(&vn_io_faults_cnt, 1);
1297         uio_clone->uio_segflg = UIO_NOCOPY;
1298         uiomove(NULL, resid - uio->uio_resid, uio_clone);
1299         uio_clone->uio_segflg = uio->uio_segflg;
1300
1301         saveheld = curthread_pflags_set(TDP_UIOHELD);
1302         prev_td_ma = td->td_ma;
1303         prev_td_ma_cnt = td->td_ma_cnt;
1304
1305         while (uio_clone->uio_resid != 0) {
1306                 len = uio_clone->uio_iov->iov_len;
1307                 if (len == 0) {
1308                         KASSERT(uio_clone->uio_iovcnt >= 1,
1309                             ("iovcnt underflow"));
1310                         uio_clone->uio_iov++;
1311                         uio_clone->uio_iovcnt--;
1312                         continue;
1313                 }
1314                 if (len > ptoa(io_hold_cnt))
1315                         len = ptoa(io_hold_cnt);
1316                 addr = (uintptr_t)uio_clone->uio_iov->iov_base;
1317                 end = round_page(addr + len);
1318                 if (end < addr) {
1319                         error = EFAULT;
1320                         break;
1321                 }
1322                 cnt = atop(end - trunc_page(addr));
1323                 /*
1324                  * A perfectly misaligned address and length could cause
1325                  * both the start and the end of the chunk to use partial
1326                  * page.  +2 accounts for such a situation.
1327                  */
1328                 cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
1329                     addr, len, prot, ma, io_hold_cnt + 2);
1330                 if (cnt == -1) {
1331                         error = EFAULT;
1332                         break;
1333                 }
1334                 short_uio.uio_iov = &short_iovec[0];
1335                 short_iovec[0].iov_base = (void *)addr;
1336                 short_uio.uio_iovcnt = 1;
1337                 short_uio.uio_resid = short_iovec[0].iov_len = len;
1338                 short_uio.uio_offset = uio_clone->uio_offset;
1339                 td->td_ma = ma;
1340                 td->td_ma_cnt = cnt;
1341
1342                 error = vn_io_fault_doio(args, &short_uio, td);
1343                 vm_page_unhold_pages(ma, cnt);
1344                 adv = len - short_uio.uio_resid;
1345
1346                 uio_clone->uio_iov->iov_base =
1347                     (char *)uio_clone->uio_iov->iov_base + adv;
1348                 uio_clone->uio_iov->iov_len -= adv;
1349                 uio_clone->uio_resid -= adv;
1350                 uio_clone->uio_offset += adv;
1351
1352                 uio->uio_resid -= adv;
1353                 uio->uio_offset += adv;
1354
1355                 if (error != 0 || adv == 0)
1356                         break;
1357         }
1358         td->td_ma = prev_td_ma;
1359         td->td_ma_cnt = prev_td_ma_cnt;
1360         curthread_pflags_restore(saveheld);
1361 out:
1362         free(uio_clone, M_IOV);
1363         return (error);
1364 }
1365
1366 static int
1367 vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
1368     int flags, struct thread *td)
1369 {
1370         fo_rdwr_t *doio;
1371         struct vnode *vp;
1372         void *rl_cookie;
1373         struct vn_io_fault_args args;
1374         int error;
1375
1376         doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
1377         vp = fp->f_vnode;
1378
1379         /*
1380          * The ability to read(2) on a directory has historically been
1381          * allowed for all users, but this can and has been the source of
1382          * at least one security issue in the past.  As such, it is now hidden
1383          * away behind a sysctl for those that actually need it to use it, and
1384          * restricted to root when it's turned on to make it relatively safe to
1385          * leave on for longer sessions of need.
1386          */
1387         if (vp->v_type == VDIR) {
1388                 KASSERT(uio->uio_rw == UIO_READ,
1389                     ("illegal write attempted on a directory"));
1390                 if (!vfs_allow_read_dir)
1391                         return (EISDIR);
1392                 if ((error = priv_check(td, PRIV_VFS_READ_DIR)) != 0)
1393                         return (EISDIR);
1394         }
1395
1396         foffset_lock_uio(fp, uio, flags);
1397         if (do_vn_io_fault(vp, uio)) {
1398                 args.kind = VN_IO_FAULT_FOP;
1399                 args.args.fop_args.fp = fp;
1400                 args.args.fop_args.doio = doio;
1401                 args.cred = active_cred;
1402                 args.flags = flags | FOF_OFFSET;
1403                 if (uio->uio_rw == UIO_READ) {
1404                         rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
1405                             uio->uio_offset + uio->uio_resid);
1406                 } else if ((fp->f_flag & O_APPEND) != 0 ||
1407                     (flags & FOF_OFFSET) == 0) {
1408                         /* For appenders, punt and lock the whole range. */
1409                         rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1410                 } else {
1411                         rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
1412                             uio->uio_offset + uio->uio_resid);
1413                 }
1414                 error = vn_io_fault1(vp, uio, &args, td);
1415                 vn_rangelock_unlock(vp, rl_cookie);
1416         } else {
1417                 error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
1418         }
1419         foffset_unlock_uio(fp, uio, flags);
1420         return (error);
1421 }
1422
1423 /*
1424  * Helper function to perform the requested uiomove operation using
1425  * the held pages for io->uio_iov[0].iov_base buffer instead of
1426  * copyin/copyout.  Access to the pages with uiomove_fromphys()
1427  * instead of iov_base prevents page faults that could occur due to
1428  * pmap_collect() invalidating the mapping created by
1429  * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
1430  * object cleanup revoking the write access from page mappings.
1431  *
1432  * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
1433  * instead of plain uiomove().
1434  */
1435 int
1436 vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
1437 {
1438         struct uio transp_uio;
1439         struct iovec transp_iov[1];
1440         struct thread *td;
1441         size_t adv;
1442         int error, pgadv;
1443
1444         td = curthread;
1445         if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1446             uio->uio_segflg != UIO_USERSPACE)
1447                 return (uiomove(data, xfersize, uio));
1448
1449         KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1450         transp_iov[0].iov_base = data;
1451         transp_uio.uio_iov = &transp_iov[0];
1452         transp_uio.uio_iovcnt = 1;
1453         if (xfersize > uio->uio_resid)
1454                 xfersize = uio->uio_resid;
1455         transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
1456         transp_uio.uio_offset = 0;
1457         transp_uio.uio_segflg = UIO_SYSSPACE;
1458         /*
1459          * Since transp_iov points to data, and td_ma page array
1460          * corresponds to original uio->uio_iov, we need to invert the
1461          * direction of the i/o operation as passed to
1462          * uiomove_fromphys().
1463          */
1464         switch (uio->uio_rw) {
1465         case UIO_WRITE:
1466                 transp_uio.uio_rw = UIO_READ;
1467                 break;
1468         case UIO_READ:
1469                 transp_uio.uio_rw = UIO_WRITE;
1470                 break;
1471         }
1472         transp_uio.uio_td = uio->uio_td;
1473         error = uiomove_fromphys(td->td_ma,
1474             ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
1475             xfersize, &transp_uio);
1476         adv = xfersize - transp_uio.uio_resid;
1477         pgadv =
1478             (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
1479             (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
1480         td->td_ma += pgadv;
1481         KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1482             pgadv));
1483         td->td_ma_cnt -= pgadv;
1484         uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
1485         uio->uio_iov->iov_len -= adv;
1486         uio->uio_resid -= adv;
1487         uio->uio_offset += adv;
1488         return (error);
1489 }
1490
1491 int
1492 vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
1493     struct uio *uio)
1494 {
1495         struct thread *td;
1496         vm_offset_t iov_base;
1497         int cnt, pgadv;
1498
1499         td = curthread;
1500         if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1501             uio->uio_segflg != UIO_USERSPACE)
1502                 return (uiomove_fromphys(ma, offset, xfersize, uio));
1503
1504         KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1505         cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
1506         iov_base = (vm_offset_t)uio->uio_iov->iov_base;
1507         switch (uio->uio_rw) {
1508         case UIO_WRITE:
1509                 pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
1510                     offset, cnt);
1511                 break;
1512         case UIO_READ:
1513                 pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
1514                     cnt);
1515                 break;
1516         }
1517         pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
1518         td->td_ma += pgadv;
1519         KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1520             pgadv));
1521         td->td_ma_cnt -= pgadv;
1522         uio->uio_iov->iov_base = (char *)(iov_base + cnt);
1523         uio->uio_iov->iov_len -= cnt;
1524         uio->uio_resid -= cnt;
1525         uio->uio_offset += cnt;
1526         return (0);
1527 }
1528
1529 /*
1530  * File table truncate routine.
1531  */
1532 static int
1533 vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
1534     struct thread *td)
1535 {
1536         struct mount *mp;
1537         struct vnode *vp;
1538         void *rl_cookie;
1539         int error;
1540
1541         vp = fp->f_vnode;
1542
1543 retry:
1544         /*
1545          * Lock the whole range for truncation.  Otherwise split i/o
1546          * might happen partly before and partly after the truncation.
1547          */
1548         rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1549         error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
1550         if (error)
1551                 goto out1;
1552         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1553         AUDIT_ARG_VNODE1(vp);
1554         if (vp->v_type == VDIR) {
1555                 error = EISDIR;
1556                 goto out;
1557         }
1558 #ifdef MAC
1559         error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
1560         if (error)
1561                 goto out;
1562 #endif
1563         error = vn_truncate_locked(vp, length, (fp->f_flag & O_FSYNC) != 0,
1564             fp->f_cred);
1565 out:
1566         VOP_UNLOCK(vp);
1567         vn_finished_write(mp);
1568 out1:
1569         vn_rangelock_unlock(vp, rl_cookie);
1570         if (error == ERELOOKUP)
1571                 goto retry;
1572         return (error);
1573 }
1574
1575 /*
1576  * Truncate a file that is already locked.
1577  */
1578 int
1579 vn_truncate_locked(struct vnode *vp, off_t length, bool sync,
1580     struct ucred *cred)
1581 {
1582         struct vattr vattr;
1583         int error;
1584
1585         error = VOP_ADD_WRITECOUNT(vp, 1);
1586         if (error == 0) {
1587                 VATTR_NULL(&vattr);
1588                 vattr.va_size = length;
1589                 if (sync)
1590                         vattr.va_vaflags |= VA_SYNC;
1591                 error = VOP_SETATTR(vp, &vattr, cred);
1592                 VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
1593         }
1594         return (error);
1595 }
1596
1597 /*
1598  * File table vnode stat routine.
1599  */
1600 static int
1601 vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred,
1602     struct thread *td)
1603 {
1604         struct vnode *vp = fp->f_vnode;
1605         int error;
1606
1607         vn_lock(vp, LK_SHARED | LK_RETRY);
1608         error = VOP_STAT(vp, sb, active_cred, fp->f_cred, td);
1609         VOP_UNLOCK(vp);
1610
1611         return (error);
1612 }
1613
1614 /*
1615  * File table vnode ioctl routine.
1616  */
1617 static int
1618 vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
1619     struct thread *td)
1620 {
1621         struct vattr vattr;
1622         struct vnode *vp;
1623         struct fiobmap2_arg *bmarg;
1624         int error;
1625
1626         vp = fp->f_vnode;
1627         switch (vp->v_type) {
1628         case VDIR:
1629         case VREG:
1630                 switch (com) {
1631                 case FIONREAD:
1632                         vn_lock(vp, LK_SHARED | LK_RETRY);
1633                         error = VOP_GETATTR(vp, &vattr, active_cred);
1634                         VOP_UNLOCK(vp);
1635                         if (error == 0)
1636                                 *(int *)data = vattr.va_size - fp->f_offset;
1637                         return (error);
1638                 case FIOBMAP2:
1639                         bmarg = (struct fiobmap2_arg *)data;
1640                         vn_lock(vp, LK_SHARED | LK_RETRY);
1641 #ifdef MAC
1642                         error = mac_vnode_check_read(active_cred, fp->f_cred,
1643                             vp);
1644                         if (error == 0)
1645 #endif
1646                                 error = VOP_BMAP(vp, bmarg->bn, NULL,
1647                                     &bmarg->bn, &bmarg->runp, &bmarg->runb);
1648                         VOP_UNLOCK(vp);
1649                         return (error);
1650                 case FIONBIO:
1651                 case FIOASYNC:
1652                         return (0);
1653                 default:
1654                         return (VOP_IOCTL(vp, com, data, fp->f_flag,
1655                             active_cred, td));
1656                 }
1657                 break;
1658         case VCHR:
1659                 return (VOP_IOCTL(vp, com, data, fp->f_flag,
1660                     active_cred, td));
1661         default:
1662                 return (ENOTTY);
1663         }
1664 }
1665
1666 /*
1667  * File table vnode poll routine.
1668  */
1669 static int
1670 vn_poll(struct file *fp, int events, struct ucred *active_cred,
1671     struct thread *td)
1672 {
1673         struct vnode *vp;
1674         int error;
1675
1676         vp = fp->f_vnode;
1677 #if defined(MAC) || defined(AUDIT)
1678         if (AUDITING_TD(td) || mac_vnode_check_poll_enabled()) {
1679                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1680                 AUDIT_ARG_VNODE1(vp);
1681                 error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
1682                 VOP_UNLOCK(vp);
1683                 if (error != 0)
1684                         return (error);
1685         }
1686 #endif
1687         error = VOP_POLL(vp, events, fp->f_cred, td);
1688         return (error);
1689 }
1690
1691 /*
1692  * Acquire the requested lock and then check for validity.  LK_RETRY
1693  * permits vn_lock to return doomed vnodes.
1694  */
1695 static int __noinline
1696 _vn_lock_fallback(struct vnode *vp, int flags, const char *file, int line,
1697     int error)
1698 {
1699
1700         KASSERT((flags & LK_RETRY) == 0 || error == 0,
1701             ("vn_lock: error %d incompatible with flags %#x", error, flags));
1702
1703         if (error == 0)
1704                 VNASSERT(VN_IS_DOOMED(vp), vp, ("vnode not doomed"));
1705
1706         if ((flags & LK_RETRY) == 0) {
1707                 if (error == 0) {
1708                         VOP_UNLOCK(vp);
1709                         error = ENOENT;
1710                 }
1711                 return (error);
1712         }
1713
1714         /*
1715          * LK_RETRY case.
1716          *
1717          * Nothing to do if we got the lock.
1718          */
1719         if (error == 0)
1720                 return (0);
1721
1722         /*
1723          * Interlock was dropped by the call in _vn_lock.
1724          */
1725         flags &= ~LK_INTERLOCK;
1726         do {
1727                 error = VOP_LOCK1(vp, flags, file, line);
1728         } while (error != 0);
1729         return (0);
1730 }
1731
1732 int
1733 _vn_lock(struct vnode *vp, int flags, const char *file, int line)
1734 {
1735         int error;
1736
1737         VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
1738             ("vn_lock: no locktype (%d passed)", flags));
1739         VNPASS(vp->v_holdcnt > 0, vp);
1740         error = VOP_LOCK1(vp, flags, file, line);
1741         if (__predict_false(error != 0 || VN_IS_DOOMED(vp)))
1742                 return (_vn_lock_fallback(vp, flags, file, line, error));
1743         return (0);
1744 }
1745
1746 /*
1747  * File table vnode close routine.
1748  */
1749 static int
1750 vn_closefile(struct file *fp, struct thread *td)
1751 {
1752         struct vnode *vp;
1753         struct flock lf;
1754         int error;
1755         bool ref;
1756
1757         vp = fp->f_vnode;
1758         fp->f_ops = &badfileops;
1759         ref= (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE;
1760
1761         error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref);
1762
1763         if (__predict_false(ref)) {
1764                 lf.l_whence = SEEK_SET;
1765                 lf.l_start = 0;
1766                 lf.l_len = 0;
1767                 lf.l_type = F_UNLCK;
1768                 (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
1769                 vrele(vp);
1770         }
1771         return (error);
1772 }
1773
1774 /*
1775  * Preparing to start a filesystem write operation. If the operation is
1776  * permitted, then we bump the count of operations in progress and
1777  * proceed. If a suspend request is in progress, we wait until the
1778  * suspension is over, and then proceed.
1779  */
1780 static int
1781 vn_start_write_refed(struct mount *mp, int flags, bool mplocked)
1782 {
1783         struct mount_pcpu *mpcpu;
1784         int error, mflags;
1785
1786         if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 &&
1787             vfs_op_thread_enter(mp, mpcpu)) {
1788                 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
1789                 vfs_mp_count_add_pcpu(mpcpu, writeopcount, 1);
1790                 vfs_op_thread_exit(mp, mpcpu);
1791                 return (0);
1792         }
1793
1794         if (mplocked)
1795                 mtx_assert(MNT_MTX(mp), MA_OWNED);
1796         else
1797                 MNT_ILOCK(mp);
1798
1799         error = 0;
1800
1801         /*
1802          * Check on status of suspension.
1803          */
1804         if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
1805             mp->mnt_susp_owner != curthread) {
1806                 mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ?
1807                     (flags & PCATCH) : 0) | (PUSER - 1);
1808                 while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1809                         if (flags & V_NOWAIT) {
1810                                 error = EWOULDBLOCK;
1811                                 goto unlock;
1812                         }
1813                         error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
1814                             "suspfs", 0);
1815                         if (error)
1816                                 goto unlock;
1817                 }
1818         }
1819         if (flags & V_XSLEEP)
1820                 goto unlock;
1821         mp->mnt_writeopcount++;
1822 unlock:
1823         if (error != 0 || (flags & V_XSLEEP) != 0)
1824                 MNT_REL(mp);
1825         MNT_IUNLOCK(mp);
1826         return (error);
1827 }
1828
1829 int
1830 vn_start_write(struct vnode *vp, struct mount **mpp, int flags)
1831 {
1832         struct mount *mp;
1833         int error;
1834
1835         KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
1836             ("V_MNTREF requires mp"));
1837
1838         error = 0;
1839         /*
1840          * If a vnode is provided, get and return the mount point that
1841          * to which it will write.
1842          */
1843         if (vp != NULL) {
1844                 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1845                         *mpp = NULL;
1846                         if (error != EOPNOTSUPP)
1847                                 return (error);
1848                         return (0);
1849                 }
1850         }
1851         if ((mp = *mpp) == NULL)
1852                 return (0);
1853
1854         /*
1855          * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1856          * a vfs_ref().
1857          * As long as a vnode is not provided we need to acquire a
1858          * refcount for the provided mountpoint too, in order to
1859          * emulate a vfs_ref().
1860          */
1861         if (vp == NULL && (flags & V_MNTREF) == 0)
1862                 vfs_ref(mp);
1863
1864         return (vn_start_write_refed(mp, flags, false));
1865 }
1866
/*
 * Secondary suspension. Used by operations such as vop_inactive
 * routines that are needed by the higher level functions. These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero). At that
 * time, these operations are halted until the suspension is over.
 *
 * With a vnode, the mount point is looked up via VOP_GETWRITEMOUNT()
 * and stored in *mpp; without one, *mpp is used as passed in.
 *
 * Returns 0 when the secondary write may proceed (or when there is no
 * mount point to account against), EWOULDBLOCK when V_NOWAIT is set
 * and the filesystem is suspended, or the error from
 * VOP_GETWRITEMOUNT() / msleep().
 */
int
vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags)
{
        struct mount *mp;
        int error;

        KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
            ("V_MNTREF requires mp"));

 retry:
        if (vp != NULL) {
                if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
                        *mpp = NULL;
                        /* EOPNOTSUPP is reported to the caller as success. */
                        if (error != EOPNOTSUPP)
                                return (error);
                        return (0);
                }
        }
        /*
         * If we are not suspended or have not yet reached suspended
         * mode, then let the operation proceed.
         */
        if ((mp = *mpp) == NULL)
                return (0);

        /*
         * VOP_GETWRITEMOUNT() returns with the mp refcount held through
         * a vfs_ref().
         * As long as a vnode is not provided we need to acquire a
         * refcount for the provided mountpoint too, in order to
         * emulate a vfs_ref().
         */
        MNT_ILOCK(mp);
        if (vp == NULL && (flags & V_MNTREF) == 0)
                MNT_REF(mp);
        if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
                /* Count the write; the mount reference stays held. */
                mp->mnt_secondary_writes++;
                mp->mnt_secondary_accwrites++;
                MNT_IUNLOCK(mp);
                return (0);
        }
        if (flags & V_NOWAIT) {
                /* Caller will not wait: give the reference back. */
                MNT_REL(mp);
                MNT_IUNLOCK(mp);
                return (EWOULDBLOCK);
        }
        /*
         * Wait for the suspension to finish.  PDROP is passed, so the
         * mount interlock is not reacquired after the sleep; the
         * reference is dropped with vfs_rel() and the whole sequence
         * starts over if the sleep returned 0.
         */
        error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP |
            ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0),
            "suspfs", 0);
        vfs_rel(mp);
        if (error == 0)
                goto retry;
        return (error);
}
1931
1932 /*
1933  * Filesystem write operation has completed. If we are suspending and this
1934  * operation is the last one, notify the suspender that the suspension is
1935  * now in effect.
1936  */
1937 void
1938 vn_finished_write(struct mount *mp)
1939 {
1940         struct mount_pcpu *mpcpu;
1941         int c;
1942
1943         if (mp == NULL)
1944                 return;
1945
1946         if (vfs_op_thread_enter(mp, mpcpu)) {
1947                 vfs_mp_count_sub_pcpu(mpcpu, writeopcount, 1);
1948                 vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
1949                 vfs_op_thread_exit(mp, mpcpu);
1950                 return;
1951         }
1952
1953         MNT_ILOCK(mp);
1954         vfs_assert_mount_counters(mp);
1955         MNT_REL(mp);
1956         c = --mp->mnt_writeopcount;
1957         if (mp->mnt_vfs_ops == 0) {
1958                 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
1959                 MNT_IUNLOCK(mp);
1960                 return;
1961         }
1962         if (c < 0)
1963                 vfs_dump_mount_counters(mp);
1964         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0)
1965                 wakeup(&mp->mnt_writeopcount);
1966         MNT_IUNLOCK(mp);
1967 }
1968
1969 /*
1970  * Filesystem secondary write operation has completed. If we are
1971  * suspending and this operation is the last one, notify the suspender
1972  * that the suspension is now in effect.
1973  */
1974 void
1975 vn_finished_secondary_write(struct mount *mp)
1976 {
1977         if (mp == NULL)
1978                 return;
1979         MNT_ILOCK(mp);
1980         MNT_REL(mp);
1981         mp->mnt_secondary_writes--;
1982         if (mp->mnt_secondary_writes < 0)
1983                 panic("vn_finished_secondary_write: neg cnt");
1984         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1985             mp->mnt_secondary_writes <= 0)
1986                 wakeup(&mp->mnt_secondary_writes);
1987         MNT_IUNLOCK(mp);
1988 }
1989
/*
 * Request a filesystem to suspend write operations.
 *
 * Returns EALREADY if the calling thread already owns the suspension,
 * EBUSY if VS_SKIP_UNMOUNT was given and the mount is being unmounted,
 * and otherwise the result of VFS_SYNC(mp, MNT_SUSPEND).  If that sync
 * fails, the suspension is undone through vfs_write_resume() before
 * returning.  On the early-return paths the vfs_op_enter() done at
 * the top is undone with vfs_op_exit_locked().
 */
int
vfs_write_suspend(struct mount *mp, int flags)
{
        int error;

        vfs_op_enter(mp);

        MNT_ILOCK(mp);
        vfs_assert_mount_counters(mp);
        /* Recursive request from the thread that owns the suspension. */
        if (mp->mnt_susp_owner == curthread) {
                vfs_op_exit_locked(mp);
                MNT_IUNLOCK(mp);
                return (EALREADY);
        }
        /* Wait until no suspension is in progress. */
        while (mp->mnt_kern_flag & MNTK_SUSPEND)
                msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);

        /*
         * Unmount holds a write reference on the mount point.  If we
         * own busy reference and drain for writers, we deadlock with
         * the reference draining in the unmount path.  Callers of
         * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
         * vfs_busy() reference is owned and caller is not in the
         * unmount context.
         */
        if ((flags & VS_SKIP_UNMOUNT) != 0 &&
            (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
                vfs_op_exit_locked(mp);
                MNT_IUNLOCK(mp);
                return (EBUSY);
        }

        /* Mark the suspension as started and owned by this thread. */
        mp->mnt_kern_flag |= MNTK_SUSPEND;
        mp->mnt_susp_owner = curthread;
        /*
         * If writes are still in progress, sleep on mnt_writeopcount;
         * PDROP is passed so the interlock is not held afterwards.
         * Either branch leaves the interlock released.
         */
        if (mp->mnt_writeopcount > 0)
                (void) msleep(&mp->mnt_writeopcount,
                    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
        else
                MNT_IUNLOCK(mp);
        if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) {
                vfs_write_resume(mp, 0);
                /* vfs_write_resume does vfs_op_exit() for us */
        }
        return (error);
}
2038
2039 /*
2040  * Request a filesystem to resume write operations.
2041  */
2042 void
2043 vfs_write_resume(struct mount *mp, int flags)
2044 {
2045
2046         MNT_ILOCK(mp);
2047         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
2048                 KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
2049                 mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
2050                                        MNTK_SUSPENDED);
2051                 mp->mnt_susp_owner = NULL;
2052                 wakeup(&mp->mnt_writeopcount);
2053                 wakeup(&mp->mnt_flag);
2054                 curthread->td_pflags &= ~TDP_IGNSUSP;
2055                 if ((flags & VR_START_WRITE) != 0) {
2056                         MNT_REF(mp);
2057                         mp->mnt_writeopcount++;
2058                 }
2059                 MNT_IUNLOCK(mp);
2060                 if ((flags & VR_NO_SUSPCLR) == 0)
2061                         VFS_SUSP_CLEAN(mp);
2062                 vfs_op_exit(mp);
2063         } else if ((flags & VR_START_WRITE) != 0) {
2064                 MNT_REF(mp);
2065                 vn_start_write_refed(mp, 0, true);
2066         } else {
2067                 MNT_IUNLOCK(mp);
2068         }
2069 }
2070
2071 /*
2072  * Helper loop around vfs_write_suspend() for filesystem unmount VFS
2073  * methods.
2074  */
2075 int
2076 vfs_write_suspend_umnt(struct mount *mp)
2077 {
2078         int error;
2079
2080         KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
2081             ("vfs_write_suspend_umnt: recursed"));
2082
2083         /* dounmount() already called vn_start_write(). */
2084         for (;;) {
2085                 vn_finished_write(mp);
2086                 error = vfs_write_suspend(mp, 0);
2087                 if (error != 0) {
2088                         vn_start_write(NULL, &mp, V_WAIT);
2089                         return (error);
2090                 }
2091                 MNT_ILOCK(mp);
2092                 if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
2093                         break;
2094                 MNT_IUNLOCK(mp);
2095                 vn_start_write(NULL, &mp, V_WAIT);
2096         }
2097         mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
2098         wakeup(&mp->mnt_flag);
2099         MNT_IUNLOCK(mp);
2100         curthread->td_pflags |= TDP_IGNSUSP;
2101         return (0);
2102 }
2103
2104 /*
2105  * Implement kqueues for files by translating it to vnode operation.
2106  */
2107 static int
2108 vn_kqfilter(struct file *fp, struct knote *kn)
2109 {
2110
2111         return (VOP_KQFILTER(fp->f_vnode, kn));
2112 }
2113
2114 /*
2115  * Simplified in-kernel wrapper calls for extended attribute access.
2116  * Both calls pass in a NULL credential, authorizing as "kernel" access.
2117  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
2118  */
2119 int
2120 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
2121     const char *attrname, int *buflen, char *buf, struct thread *td)
2122 {
2123         struct uio      auio;
2124         struct iovec    iov;
2125         int     error;
2126
2127         iov.iov_len = *buflen;
2128         iov.iov_base = buf;
2129
2130         auio.uio_iov = &iov;
2131         auio.uio_iovcnt = 1;
2132         auio.uio_rw = UIO_READ;
2133         auio.uio_segflg = UIO_SYSSPACE;
2134         auio.uio_td = td;
2135         auio.uio_offset = 0;
2136         auio.uio_resid = *buflen;
2137
2138         if ((ioflg & IO_NODELOCKED) == 0)
2139                 vn_lock(vp, LK_SHARED | LK_RETRY);
2140
2141         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2142
2143         /* authorize attribute retrieval as kernel */
2144         error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
2145             td);
2146
2147         if ((ioflg & IO_NODELOCKED) == 0)
2148                 VOP_UNLOCK(vp);
2149
2150         if (error == 0) {
2151                 *buflen = *buflen - auio.uio_resid;
2152         }
2153
2154         return (error);
2155 }
2156
2157 /*
2158  * XXX failure mode if partially written?
2159  */
2160 int
2161 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
2162     const char *attrname, int buflen, char *buf, struct thread *td)
2163 {
2164         struct uio      auio;
2165         struct iovec    iov;
2166         struct mount    *mp;
2167         int     error;
2168
2169         iov.iov_len = buflen;
2170         iov.iov_base = buf;
2171
2172         auio.uio_iov = &iov;
2173         auio.uio_iovcnt = 1;
2174         auio.uio_rw = UIO_WRITE;
2175         auio.uio_segflg = UIO_SYSSPACE;
2176         auio.uio_td = td;
2177         auio.uio_offset = 0;
2178         auio.uio_resid = buflen;
2179
2180         if ((ioflg & IO_NODELOCKED) == 0) {
2181                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
2182                         return (error);
2183                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2184         }
2185
2186         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2187
2188         /* authorize attribute setting as kernel */
2189         error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
2190
2191         if ((ioflg & IO_NODELOCKED) == 0) {
2192                 vn_finished_write(mp);
2193                 VOP_UNLOCK(vp);
2194         }
2195
2196         return (error);
2197 }
2198
2199 int
2200 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
2201     const char *attrname, struct thread *td)
2202 {
2203         struct mount    *mp;
2204         int     error;
2205
2206         if ((ioflg & IO_NODELOCKED) == 0) {
2207                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
2208                         return (error);
2209                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2210         }
2211
2212         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2213
2214         /* authorize attribute removal as kernel */
2215         error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
2216         if (error == EOPNOTSUPP)
2217                 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
2218                     NULL, td);
2219
2220         if ((ioflg & IO_NODELOCKED) == 0) {
2221                 vn_finished_write(mp);
2222                 VOP_UNLOCK(vp);
2223         }
2224
2225         return (error);
2226 }
2227
/*
 * vn_vget_ino_gen() callback used by vn_vget_ino(): "arg" points at
 * the inode number to fetch with VFS_VGET().
 */
static int
vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags,
    struct vnode **rvp)
{
        ino_t ino;

        ino = *(ino_t *)arg;
        return (VFS_VGET(mp, ino, lkflags, rvp));
}
2235
2236 int
2237 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
2238 {
2239
2240         return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino,
2241             lkflags, rvp));
2242 }
2243
/*
 * Obtain another vnode on vp's filesystem through the "alloc" callback
 * while temporarily giving up the lock on vp.
 *
 * vp must be locked (shared or exclusive) on entry.  The mount is
 * busied for the duration of the callback and vp is unlocked while
 * the callback runs.  Afterwards vp is relocked with its original lock
 * type, unless the callback succeeded and returned vp itself.
 *
 * Returns the callback's error, or ENOENT if the mount could not be
 * busied or vp was found doomed after being relocked; in the doomed
 * case a vnode returned by the callback is released again.
 */
int
vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg,
    int lkflags, struct vnode **rvp)
{
        struct mount *mp;
        int ltype, error;

        ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get");
        mp = vp->v_mount;
        /* Remember the current lock type so it can be restored later. */
        ltype = VOP_ISLOCKED(vp);
        KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
            ("vn_vget_ino: vp not locked"));
        /* First try to busy the mount without sleeping. */
        error = vfs_busy(mp, MBF_NOWAIT);
        if (error != 0) {
                /*
                 * Retry with a sleeping vfs_busy().  vp is unlocked
                 * around that call, and a mount reference is held
                 * across it.
                 */
                vfs_ref(mp);
                VOP_UNLOCK(vp);
                error = vfs_busy(mp, 0);
                vn_lock(vp, ltype | LK_RETRY);
                vfs_rel(mp);
                if (error != 0)
                        return (ENOENT);
                /* vp may have been doomed while it was unlocked. */
                if (VN_IS_DOOMED(vp)) {
                        vfs_unbusy(mp);
                        return (ENOENT);
                }
        }
        VOP_UNLOCK(vp);
        error = alloc(mp, alloc_arg, lkflags, rvp);
        vfs_unbusy(mp);
        /* No relock when the callback succeeded and handed back vp itself. */
        if (error != 0 || *rvp != vp)
                vn_lock(vp, ltype | LK_RETRY);
        if (VN_IS_DOOMED(vp)) {
                /* Undo a successful callback before failing with ENOENT. */
                if (error == 0) {
                        if (*rvp == vp)
                                vunref(vp);
                        else
                                vput(*rvp);
                }
                error = ENOENT;
        }
        return (error);
}
2286
2287 int
2288 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
2289     struct thread *td)
2290 {
2291
2292         if (vp->v_type != VREG || td == NULL)
2293                 return (0);
2294         if ((uoff_t)uio->uio_offset + uio->uio_resid >
2295             lim_cur(td, RLIMIT_FSIZE)) {
2296                 PROC_LOCK(td->td_proc);
2297                 kern_psignal(td->td_proc, SIGXFSZ);
2298                 PROC_UNLOCK(td->td_proc);
2299                 return (EFBIG);
2300         }
2301         return (0);
2302 }
2303
2304 int
2305 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
2306     struct thread *td)
2307 {
2308         struct vnode *vp;
2309
2310         vp = fp->f_vnode;
2311 #ifdef AUDIT
2312         vn_lock(vp, LK_SHARED | LK_RETRY);
2313         AUDIT_ARG_VNODE1(vp);
2314         VOP_UNLOCK(vp);
2315 #endif
2316         return (setfmode(td, active_cred, vp, mode));
2317 }
2318
2319 int
2320 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
2321     struct thread *td)
2322 {
2323         struct vnode *vp;
2324
2325         vp = fp->f_vnode;
2326 #ifdef AUDIT
2327         vn_lock(vp, LK_SHARED | LK_RETRY);
2328         AUDIT_ARG_VNODE1(vp);
2329         VOP_UNLOCK(vp);
2330 #endif
2331         return (setfown(td, active_cred, vp, uid, gid));
2332 }
2333
2334 void
2335 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
2336 {
2337         vm_object_t object;
2338
2339         if ((object = vp->v_object) == NULL)
2340                 return;
2341         VM_OBJECT_WLOCK(object);
2342         vm_object_page_remove(object, start, end, 0);
2343         VM_OBJECT_WUNLOCK(object);
2344 }
2345
2346 int
2347 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
2348 {
2349         struct vattr va;
2350         daddr_t bn, bnp;
2351         uint64_t bsize;
2352         off_t noff;
2353         int error;
2354
2355         KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
2356             ("Wrong command %lu", cmd));
2357
2358         if (vn_lock(vp, LK_SHARED) != 0)
2359                 return (EBADF);
2360         if (vp->v_type != VREG) {
2361                 error = ENOTTY;
2362                 goto unlock;
2363         }
2364         error = VOP_GETATTR(vp, &va, cred);
2365         if (error != 0)
2366                 goto unlock;
2367         noff = *off;
2368         if (noff >= va.va_size) {
2369                 error = ENXIO;
2370                 goto unlock;
2371         }
2372         bsize = vp->v_mount->mnt_stat.f_iosize;
2373         for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize -
2374             noff % bsize) {
2375                 error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
2376                 if (error == EOPNOTSUPP) {
2377                         error = ENOTTY;
2378                         goto unlock;
2379                 }
2380                 if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
2381                     (bnp != -1 && cmd == FIOSEEKDATA)) {
2382                         noff = bn * bsize;
2383                         if (noff < *off)
2384                                 noff = *off;
2385                         goto unlock;
2386                 }
2387         }
2388         if (noff > va.va_size)
2389                 noff = va.va_size;
2390         /* noff == va.va_size. There is an implicit hole at the end of file. */
2391         if (cmd == FIOSEEKDATA)
2392                 error = ENXIO;
2393 unlock:
2394         VOP_UNLOCK(vp);
2395         if (error == 0)
2396                 *off = noff;
2397         return (error);
2398 }
2399
/*
 * Seek on a vnode-backed file.
 *
 * The file offset is locked for the duration of the call.  On success
 * the new offset is stored in the file (through foffset_unlock()) and
 * in td->td_uretoff.tdu_off; on failure the stored offset is left
 * alone (FOF_NOUPDATE).
 *
 * For anything but character devices ("noneg") the resulting offset
 * may not be negative (EINVAL) and the additions are checked against
 * OFF_MAX (EOVERFLOW).  An unknown "whence" yields EINVAL, as does
 * ENOTTY from the SEEK_DATA / SEEK_HOLE ioctls.
 */
int
vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
{
        struct ucred *cred;
        struct vnode *vp;
        struct vattr vattr;
        off_t foffset, size;
        int error, noneg;

        cred = td->td_ucred;
        vp = fp->f_vnode;
        foffset = foffset_lock(fp, 0);
        /* Negative offsets are only tolerated on character devices. */
        noneg = (vp->v_type != VCHR);
        error = 0;
        switch (whence) {
        case L_INCR:
                /* Relative to the current file offset. */
                if (noneg &&
                    (foffset < 0 ||
                    (offset > 0 && foffset > OFF_MAX - offset))) {
                        error = EOVERFLOW;
                        break;
                }
                offset += foffset;
                break;
        case L_XTND:
                /* Relative to the size reported by VOP_GETATTR(). */
                vn_lock(vp, LK_SHARED | LK_RETRY);
                error = VOP_GETATTR(vp, &vattr, cred);
                VOP_UNLOCK(vp);
                if (error)
                        break;

                /*
                 * If the file references a disk device, then fetch
                 * the media size and use that to determine the ending
                 * offset.
                 */
                if (vattr.va_size == 0 && vp->v_type == VCHR &&
                    fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
                        vattr.va_size = size;
                if (noneg &&
                    (vattr.va_size > OFF_MAX ||
                    (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
                        error = EOVERFLOW;
                        break;
                }
                offset += vattr.va_size;
                break;
        case L_SET:
                /* Absolute: "offset" is used as given. */
                break;
        case SEEK_DATA:
                /* The ioctl updates "offset" in place. */
                error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
                if (error == ENOTTY)
                        error = EINVAL;
                break;
        case SEEK_HOLE:
                /* The ioctl updates "offset" in place. */
                error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
                if (error == ENOTTY)
                        error = EINVAL;
                break;
        default:
                error = EINVAL;
        }
        if (error == 0 && noneg && offset < 0)
                error = EINVAL;
        if (error != 0)
                goto drop;
        VFS_KNOTE_UNLOCKED(vp, 0);
        td->td_uretoff.tdu_off = offset;
drop:
        /* On error the file's stored offset is not updated. */
        foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
        return (error);
}
2472
2473 int
2474 vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred,
2475     struct thread *td)
2476 {
2477         int error;
2478
2479         /*
2480          * Grant permission if the caller is the owner of the file, or
2481          * the super-user, or has ACL_WRITE_ATTRIBUTES permission on
2482          * on the file.  If the time pointer is null, then write
2483          * permission on the file is also sufficient.
2484          *
2485          * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
2486          * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
2487          * will be allowed to set the times [..] to the current
2488          * server time.
2489          */
2490         error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td);
2491         if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0)
2492                 error = VOP_ACCESS(vp, VWRITE, cred, td);
2493         return (error);
2494 }
2495
2496 int
2497 vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
2498 {
2499         struct vnode *vp;
2500         int error;
2501
2502         if (fp->f_type == DTYPE_FIFO)
2503                 kif->kf_type = KF_TYPE_FIFO;
2504         else
2505                 kif->kf_type = KF_TYPE_VNODE;
2506         vp = fp->f_vnode;
2507         vref(vp);
2508         FILEDESC_SUNLOCK(fdp);
2509         error = vn_fill_kinfo_vnode(vp, kif);
2510         vrele(vp);
2511         FILEDESC_SLOCK(fdp);
2512         return (error);
2513 }
2514
2515 static inline void
2516 vn_fill_junk(struct kinfo_file *kif)
2517 {
2518         size_t len, olen;
2519
2520         /*
2521          * Simulate vn_fullpath returning changing values for a given
2522          * vp during e.g. coredump.
2523          */
2524         len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1;
2525         olen = strlen(kif->kf_path);
2526         if (len < olen)
2527                 strcpy(&kif->kf_path[len - 1], "$");
2528         else
2529                 for (; olen < len; olen++)
2530                         strcpy(&kif->kf_path[olen], "A");
2531 }
2532
2533 int
2534 vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif)
2535 {
2536         struct vattr va;
2537         char *fullpath, *freepath;
2538         int error;
2539
2540         kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type);
2541         freepath = NULL;
2542         fullpath = "-";
2543         error = vn_fullpath(vp, &fullpath, &freepath);
2544         if (error == 0) {
2545                 strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
2546         }
2547         if (freepath != NULL)
2548                 free(freepath, M_TEMP);
2549
2550         KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path,
2551                 vn_fill_junk(kif);
2552         );
2553
2554         /*
2555          * Retrieve vnode attributes.
2556          */
2557         va.va_fsid = VNOVAL;
2558         va.va_rdev = NODEV;
2559         vn_lock(vp, LK_SHARED | LK_RETRY);
2560         error = VOP_GETATTR(vp, &va, curthread->td_ucred);
2561         VOP_UNLOCK(vp);
2562         if (error != 0)
2563                 return (error);
2564         if (va.va_fsid != VNOVAL)
2565                 kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
2566         else
2567                 kif->kf_un.kf_file.kf_file_fsid =
2568                     vp->v_mount->mnt_stat.f_fsid.val[0];
2569         kif->kf_un.kf_file.kf_file_fsid_freebsd11 =
2570             kif->kf_un.kf_file.kf_file_fsid; /* truncate */
2571         kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
2572         kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
2573         kif->kf_un.kf_file.kf_file_size = va.va_size;
2574         kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
2575         kif->kf_un.kf_file.kf_file_rdev_freebsd11 =
2576             kif->kf_un.kf_file.kf_file_rdev; /* truncate */
2577         return (0);
2578 }
2579
2580 int
2581 vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
2582     vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
2583     struct thread *td)
2584 {
2585 #ifdef HWPMC_HOOKS
2586         struct pmckern_map_in pkm;
2587 #endif
2588         struct mount *mp;
2589         struct vnode *vp;
2590         vm_object_t object;
2591         vm_prot_t maxprot;
2592         boolean_t writecounted;
2593         int error;
2594
2595 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
2596     defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
2597         /*
2598          * POSIX shared-memory objects are defined to have
2599          * kernel persistence, and are not defined to support
2600          * read(2)/write(2) -- or even open(2).  Thus, we can
2601          * use MAP_ASYNC to trade on-disk coherence for speed.
2602          * The shm_open(3) library routine turns on the FPOSIXSHM
2603          * flag to request this behavior.
2604          */
2605         if ((fp->f_flag & FPOSIXSHM) != 0)
2606                 flags |= MAP_NOSYNC;
2607 #endif
2608         vp = fp->f_vnode;
2609
2610         /*
2611          * Ensure that file and memory protections are
2612          * compatible.  Note that we only worry about
2613          * writability if mapping is shared; in this case,
2614          * current and max prot are dictated by the open file.
2615          * XXX use the vnode instead?  Problem is: what
2616          * credentials do we use for determination? What if
2617          * proc does a setuid?
2618          */
2619         mp = vp->v_mount;
2620         if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
2621                 maxprot = VM_PROT_NONE;
2622                 if ((prot & VM_PROT_EXECUTE) != 0)
2623                         return (EACCES);
2624         } else
2625                 maxprot = VM_PROT_EXECUTE;
2626         if ((fp->f_flag & FREAD) != 0)
2627                 maxprot |= VM_PROT_READ;
2628         else if ((prot & VM_PROT_READ) != 0)
2629                 return (EACCES);
2630
2631         /*
2632          * If we are sharing potential changes via MAP_SHARED and we
2633          * are trying to get write permission although we opened it
2634          * without asking for it, bail out.
2635          */
2636         if ((flags & MAP_SHARED) != 0) {
2637                 if ((fp->f_flag & FWRITE) != 0)
2638                         maxprot |= VM_PROT_WRITE;
2639                 else if ((prot & VM_PROT_WRITE) != 0)
2640                         return (EACCES);
2641         } else {
2642                 maxprot |= VM_PROT_WRITE;
2643                 cap_maxprot |= VM_PROT_WRITE;
2644         }
2645         maxprot &= cap_maxprot;
2646
2647         /*
2648          * For regular files and shared memory, POSIX requires that
2649          * the value of foff be a legitimate offset within the data
2650          * object.  In particular, negative offsets are invalid.
2651          * Blocking negative offsets and overflows here avoids
2652          * possible wraparound or user-level access into reserved
2653          * ranges of the data object later.  In contrast, POSIX does
2654          * not dictate how offsets are used by device drivers, so in
2655          * the case of a device mapping a negative offset is passed
2656          * on.
2657          */
2658         if (
2659 #ifdef _LP64
2660             size > OFF_MAX ||
2661 #endif
2662             foff > OFF_MAX - size)
2663                 return (EINVAL);
2664
2665         writecounted = FALSE;
2666         error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp,
2667             &foff, &object, &writecounted);
2668         if (error != 0)
2669                 return (error);
2670         error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
2671             foff, writecounted, td);
2672         if (error != 0) {
2673                 /*
2674                  * If this mapping was accounted for in the vnode's
2675                  * writecount, then undo that now.
2676                  */
2677                 if (writecounted)
2678                         vm_pager_release_writecount(object, 0, size);
2679                 vm_object_deallocate(object);
2680         }
2681 #ifdef HWPMC_HOOKS
2682         /* Inform hwpmc(4) if an executable is being mapped. */
2683         if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) {
2684                 if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) {
2685                         pkm.pm_file = vp;
2686                         pkm.pm_address = (uintptr_t) *addr;
2687                         PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm);
2688                 }
2689         }
2690 #endif
2691         return (error);
2692 }
2693
2694 void
2695 vn_fsid(struct vnode *vp, struct vattr *va)
2696 {
2697         fsid_t *f;
2698
2699         f = &vp->v_mount->mnt_stat.f_fsid;
2700         va->va_fsid = (uint32_t)f->val[1];
2701         va->va_fsid <<= sizeof(f->val[1]) * NBBY;
2702         va->va_fsid += (uint32_t)f->val[0];
2703 }
2704
/*
 * Write out the buffers on the dirty list of vp's bufobj.
 *
 * vp:      vnode whose v_bufobj is flushed.
 * waitfor: with MNT_WAIT the function sleeps for buffers that are locked,
 *          waits for the outstanding writes and rescans while dirty
 *          buffers remain; with any other value, buffers that cannot be
 *          locked immediately are skipped and no final wait is done.
 *
 * Returns 0 on success.  In the MNT_WAIT case, returns the b_error of a
 * buffer still on the dirty list, or EAGAIN when buffers are still dirty
 * after the retry budget has been used up.
 */
int
vn_fsync_buf(struct vnode *vp, int waitfor)
{
        struct buf *bp, *nbp;
        struct bufobj *bo;
        struct mount *mp;
        int error, maxretry;

        error = 0;
        maxretry = 10000;     /* large, arbitrarily chosen */
        /*
         * For a character device, remember the mount point recorded in
         * the device; its mnt_secondary_writes count is consulted below.
         */
        mp = NULL;
        if (vp->v_type == VCHR) {
                VI_LOCK(vp);
                mp = vp->v_rdev->si_mountpt;
                VI_UNLOCK(vp);
        }
        bo = &vp->v_bufobj;
        BO_LOCK(bo);
loop1:
        /*
         * MARK/SCAN initialization to avoid infinite loops.
         * Every dirty buffer has BV_SCANNED and b_error cleared.
         */
        TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
                bp->b_vflags &= ~BV_SCANNED;
                bp->b_error = 0;
        }

        /*
         * Flush all dirty buffers associated with a vnode.
         * The scan restarts from the head of the list (loop2) after each
         * write is issued; BV_SCANNED keeps buffers from being revisited.
         */
loop2:
        TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
                if ((bp->b_vflags & BV_SCANNED) != 0)
                        continue;
                bp->b_vflags |= BV_SCANNED;
                if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
                        /* Busy buffer: only a synchronous flush waits. */
                        if (waitfor != MNT_WAIT)
                                continue;
                        /*
                         * BO_LOCK() is taken again after this call on
                         * both outcomes.
                         * NOTE(review): presumably LK_INTERLOCK releases
                         * the bufobj lock passed in -- confirm.
                         * On failure the whole scan restarts at loop1.
                         */
                        if (BUF_LOCK(bp,
                            LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL,
                            BO_LOCKPTR(bo)) != 0) {
                                BO_LOCK(bo);
                                goto loop1;
                        }
                        BO_LOCK(bo);
                }
                BO_UNLOCK(bo);
                KASSERT(bp->b_bufobj == bo,
                    ("bp %p wrong b_bufobj %p should be %p",
                    bp, bp->b_bufobj, bo));
                if ((bp->b_flags & B_DELWRI) == 0)
                        panic("fsync: not dirty");
                /* Start the write; both branches use an "awrite" call. */
                if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) {
                        vfs_bio_awrite(bp);
                } else {
                        bremfree(bp);
                        bawrite(bp);
                }
                /*
                 * Once more than 9000 of the 10000 retries are used up,
                 * pause briefly after each write that is issued.
                 */
                if (maxretry < 1000)
                        pause("dirty", hz < 1000 ? 1 : hz / 1000);
                BO_LOCK(bo);
                goto loop2;
        }

        /*
         * If synchronous the caller expects us to completely resolve all
         * dirty buffers in the system.  Wait for in-progress I/O to
         * complete (which could include background bitmap writes), then
         * retry if dirty blocks still exist.
         */
        if (waitfor == MNT_WAIT) {
                bufobj_wwait(bo, 0, 0);
                if (bo->bo_dirty.bv_cnt > 0) {
                        /*
                         * If we are unable to write any of these buffers
                         * then we fail now rather than trying endlessly
                         * to write them out.
                         */
                        TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
                                if ((error = bp->b_error) != 0)
                                        break;
                        /*
                         * Rescan while the mount has secondary writes in
                         * progress (regardless of error), or while no
                         * error was seen and retries remain.
                         */
                        if ((mp != NULL && mp->mnt_secondary_writes > 0) ||
                            (error == 0 && --maxretry >= 0))
                                goto loop1;
                        if (error == 0)
                                error = EAGAIN;
                }
        }
        BO_UNLOCK(bo);
        if (error != 0)
                vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error);

        return (error);
}
2799
2800 /*
2801  * Copies a byte range from invp to outvp.  Calls VOP_COPY_FILE_RANGE()
2802  * or vn_generic_copy_file_range() after rangelocking the byte ranges,
2803  * to do the actual copy.
2804  * vn_generic_copy_file_range() is factored out, so it can be called
2805  * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from
2806  * different file systems.
2807  */
2808 int
2809 vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp,
2810     off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred,
2811     struct ucred *outcred, struct thread *fsize_td)
2812 {
2813         int error;
2814         size_t len;
2815         uint64_t uval;
2816
2817         len = *lenp;
2818         *lenp = 0;              /* For error returns. */
2819         error = 0;
2820
2821         /* Do some sanity checks on the arguments. */
2822         if (invp->v_type == VDIR || outvp->v_type == VDIR)
2823                 error = EISDIR;
2824         else if (*inoffp < 0 || *outoffp < 0 ||
2825             invp->v_type != VREG || outvp->v_type != VREG)
2826                 error = EINVAL;
2827         if (error != 0)
2828                 goto out;
2829
2830         /* Ensure offset + len does not wrap around. */
2831         uval = *inoffp;
2832         uval += len;
2833         if (uval > INT64_MAX)
2834                 len = INT64_MAX - *inoffp;
2835         uval = *outoffp;
2836         uval += len;
2837         if (uval > INT64_MAX)
2838                 len = INT64_MAX - *outoffp;
2839         if (len == 0)
2840                 goto out;
2841
2842         /*
2843          * If the two vnode are for the same file system, call
2844          * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range()
2845          * which can handle copies across multiple file systems.
2846          */
2847         *lenp = len;
2848         if (invp->v_mount == outvp->v_mount)
2849                 error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp,
2850                     lenp, flags, incred, outcred, fsize_td);
2851         else
2852                 error = vn_generic_copy_file_range(invp, inoffp, outvp,
2853                     outoffp, lenp, flags, incred, outcred, fsize_td);
2854 out:
2855         return (error);
2856 }
2857
/*
 * Report whether the len bytes starting at dat are all zero.
 * The buffer is examined one unsigned int at a time, and any bytes left
 * over at the end are examined individually.
 * Return true if all bytes are zero (or len <= 0), false otherwise.
 * Expects dat to be well aligned.
 */
static bool
mem_iszero(void *dat, int len)
{
        const unsigned int *wordp;
        const char *tailp;
        int k, ntail, nwords;

        if (len <= 0)
                return (true);

        wordp = dat;
        nwords = len / (int)sizeof(*wordp);
        ntail = len % (int)sizeof(*wordp);

        /* Whole words first. */
        for (k = 0; k < nwords; k++) {
                if (wordp[k] != 0)
                        return (false);
        }

        /* Then the trailing bytes that do not fill a word. */
        tailp = (const char *)(wordp + nwords);
        for (k = 0; k < ntail; k++) {
                if (tailp[k] != '\0')
                        return (false);
        }
        return (true);
}
2883
2884 /*
2885  * Look for a hole in the output file and, if found, adjust *outoffp
2886  * and *xferp to skip past the hole.
2887  * *xferp is the entire hole length to be written and xfer2 is how many bytes
2888  * to be written as 0's upon return.
2889  */
2890 static off_t
2891 vn_skip_hole(struct vnode *outvp, off_t xfer2, off_t *outoffp, off_t *xferp,
2892     off_t *dataoffp, off_t *holeoffp, struct ucred *cred)
2893 {
2894         int error;
2895         off_t delta;
2896
2897         if (*holeoffp == 0 || *holeoffp <= *outoffp) {
2898                 *dataoffp = *outoffp;
2899                 error = VOP_IOCTL(outvp, FIOSEEKDATA, dataoffp, 0, cred,
2900                     curthread);
2901                 if (error == 0) {
2902                         *holeoffp = *dataoffp;
2903                         error = VOP_IOCTL(outvp, FIOSEEKHOLE, holeoffp, 0, cred,
2904                             curthread);
2905                 }
2906                 if (error != 0 || *holeoffp == *dataoffp) {
2907                         /*
2908                          * Since outvp is unlocked, it may be possible for
2909                          * another thread to do a truncate(), lseek(), write()
2910                          * creating a hole at startoff between the above
2911                          * VOP_IOCTL() calls, if the other thread does not do
2912                          * rangelocking.
2913                          * If that happens, *holeoffp == *dataoffp and finding
2914                          * the hole has failed, so disable vn_skip_hole().
2915                          */
2916                         *holeoffp = -1; /* Disable use of vn_skip_hole(). */
2917                         return (xfer2);
2918                 }
2919                 KASSERT(*dataoffp >= *outoffp,
2920                     ("vn_skip_hole: dataoff=%jd < outoff=%jd",
2921                     (intmax_t)*dataoffp, (intmax_t)*outoffp));
2922                 KASSERT(*holeoffp > *dataoffp,
2923                     ("vn_skip_hole: holeoff=%jd <= dataoff=%jd",
2924                     (intmax_t)*holeoffp, (intmax_t)*dataoffp));
2925         }
2926
2927         /*
2928          * If there is a hole before the data starts, advance *outoffp and
2929          * *xferp past the hole.
2930          */
2931         if (*dataoffp > *outoffp) {
2932                 delta = *dataoffp - *outoffp;
2933                 if (delta >= *xferp) {
2934                         /* Entire *xferp is a hole. */
2935                         *outoffp += *xferp;
2936                         *xferp = 0;
2937                         return (0);
2938                 }
2939                 *xferp -= delta;
2940                 *outoffp += delta;
2941                 xfer2 = MIN(xfer2, *xferp);
2942         }
2943
2944         /*
2945          * If a hole starts before the end of this xfer2, reduce this xfer2 so
2946          * that the write ends at the start of the hole.
2947          * *holeoffp should always be greater than *outoffp, but for the
2948          * non-INVARIANTS case, check this to make sure xfer2 remains a sane
2949          * value.
2950          */
2951         if (*holeoffp > *outoffp && *holeoffp < *outoffp + xfer2)
2952                 xfer2 = *holeoffp - *outoffp;
2953         return (xfer2);
2954 }
2955
2956 /*
2957  * Write an xfer sized chunk to outvp in blksize blocks from dat.
2958  * dat is a maximum of blksize in length and can be written repeatedly in
2959  * the chunk.
2960  * If growfile == true, just grow the file via vn_truncate_locked() instead
2961  * of doing actual writes.
2962  * If checkhole == true, a hole is being punched, so skip over any hole
2963  * already in the output file.
2964  */
2965 static int
2966 vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer,
2967     u_long blksize, bool growfile, bool checkhole, struct ucred *cred)
2968 {
2969         struct mount *mp;
2970         off_t dataoff, holeoff, xfer2;
2971         int error, lckf;
2972
2973         /*
2974          * Loop around doing writes of blksize until write has been completed.
2975          * Lock/unlock on each loop iteration so that a bwillwrite() can be
2976          * done for each iteration, since the xfer argument can be very
2977          * large if there is a large hole to punch in the output file.
2978          */
2979         error = 0;
2980         holeoff = 0;
2981         do {
2982                 xfer2 = MIN(xfer, blksize);
2983                 if (checkhole) {
2984                         /*
2985                          * Punching a hole.  Skip writing if there is
2986                          * already a hole in the output file.
2987                          */
2988                         xfer2 = vn_skip_hole(outvp, xfer2, &outoff, &xfer,
2989                             &dataoff, &holeoff, cred);
2990                         if (xfer == 0)
2991                                 break;
2992                         if (holeoff < 0)
2993                                 checkhole = false;
2994                         KASSERT(xfer2 > 0, ("vn_write_outvp: xfer2=%jd",
2995                             (intmax_t)xfer2));
2996                 }
2997                 bwillwrite();
2998                 mp = NULL;
2999                 error = vn_start_write(outvp, &mp, V_WAIT);
3000                 if (error != 0)
3001                         break;
3002                 if (growfile) {
3003                         error = vn_lock(outvp, LK_EXCLUSIVE);
3004                         if (error == 0) {
3005                                 error = vn_truncate_locked(outvp, outoff + xfer,
3006                                     false, cred);
3007                                 VOP_UNLOCK(outvp);
3008                         }
3009                 } else {
3010                         if (MNT_SHARED_WRITES(mp))
3011                                 lckf = LK_SHARED;
3012                         else
3013                                 lckf = LK_EXCLUSIVE;
3014                         error = vn_lock(outvp, lckf);
3015                         if (error == 0) {
3016                                 error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2,
3017                                     outoff, UIO_SYSSPACE, IO_NODELOCKED,
3018                                     curthread->td_ucred, cred, NULL, curthread);
3019                                 outoff += xfer2;
3020                                 xfer -= xfer2;
3021                                 VOP_UNLOCK(outvp);
3022                         }
3023                 }
3024                 if (mp != NULL)
3025                         vn_finished_write(mp);
3026         } while (!growfile && xfer > 0 && error == 0);
3027         return (error);
3028 }
3029
/*
 * Copy a byte range of one file to another.  This function can handle the
 * case where invp and outvp are on different file systems.
 * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there
 * is no better file system specific way to do it.
 *
 * inoffp, outoffp: in: starting offsets; both are advanced by the number
 *                  of bytes processed.
 * lenp:            in: number of bytes to copy; out: number of bytes
 *                  actually processed (which may be fewer if the length
 *                  was clipped to a hole-size boundary, input EOF was
 *                  reached, a signal was noticed or an error occurred).
 * flags:           not referenced in this function.
 * incred, outcred: credentials used for operations on invp and outvp.
 * fsize_td:        when not NULL, vn_rlimit_fsize() is applied to the
 *                  output range for this thread; a failure is returned
 *                  as EFBIG.
 *
 * Holes in the input are located with FIOSEEKDATA/FIOSEEKHOLE when invp
 * reports a minimum hole size, and otherwise by checking each block read
 * for being all zero bytes.
 *
 * Returns 0 or an errno value.
 */
int
vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
    struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags,
    struct ucred *incred, struct ucred *outcred, struct thread *fsize_td)
{
        struct vattr va;
        struct mount *mp;
        struct uio io;
        off_t startoff, endoff, xfer, xfer2;
        u_long blksize;
        int error, interrupted;
        bool cantseek, readzeros, eof, lastblock;
        ssize_t aresid;
        size_t copylen, len, rem, savlen;
        char *dat;
        long holein, holeout;

        holein = holeout = 0;
        savlen = len = *lenp;
        error = 0;
        interrupted = 0;
        dat = NULL;

        /* Ask the input file system for its minimum hole size. */
        error = vn_lock(invp, LK_SHARED);
        if (error != 0)
                goto out;
        if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0)
                holein = 0;
        VOP_UNLOCK(invp);

        mp = NULL;
        error = vn_start_write(outvp, &mp, V_WAIT);
        if (error == 0)
                error = vn_lock(outvp, LK_EXCLUSIVE);
        if (error == 0) {
                /*
                 * If fsize_td != NULL, do a vn_rlimit_fsize() call,
                 * now that outvp is locked.
                 */
                if (fsize_td != NULL) {
                        io.uio_offset = *outoffp;
                        io.uio_resid = len;
                        error = vn_rlimit_fsize(outvp, &io, fsize_td);
                        if (error != 0)
                                error = EFBIG;
                }
                if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0)
                        holeout = 0;
                /*
                 * Holes that are past EOF do not need to be written as a block
                 * of zero bytes.  So, truncate the output file as far as
                 * possible and then use va.va_size to decide if writing 0
                 * bytes is necessary in the loop below.
                 * The truncation is done only when the current EOF lies
                 * inside the range (*outoffp, *outoffp + len].
                 */
                if (error == 0)
                        error = VOP_GETATTR(outvp, &va, outcred);
                if (error == 0 && va.va_size > *outoffp && va.va_size <=
                    *outoffp + len) {
#ifdef MAC
                        error = mac_vnode_check_write(curthread->td_ucred,
                            outcred, outvp);
                        if (error == 0)
#endif
                                error = vn_truncate_locked(outvp, *outoffp,
                                    false, outcred);
                        if (error == 0)
                                va.va_size = *outoffp;
                }
                VOP_UNLOCK(outvp);
        }
        if (mp != NULL)
                vn_finished_write(mp);
        if (error != 0)
                goto out;

        /*
         * Set the blksize to the larger of the hole sizes for invp and outvp.
         * If hole sizes aren't available, set the blksize to the larger
         * f_iosize of invp and outvp.
         * This code expects the hole sizes and f_iosizes to be powers of 2.
         * This value is clipped at 4Kbytes and 1Mbyte.
         */
        blksize = MAX(holein, holeout);

        /*
         * Clip len to end at an exact multiple of hole size.
         * rem is the distance from *inoffp up to the next hole-size
         * boundary.  savlen is lowered together with len, so the bytes
         * clipped off here are not counted as copied.
         */
        if (blksize > 1) {
                rem = *inoffp % blksize;
                if (rem > 0)
                        rem = blksize - rem;
                if (len - rem > blksize)
                        len = savlen = rounddown(len - rem, blksize) + rem;
        }

        if (blksize <= 1)
                blksize = MAX(invp->v_mount->mnt_stat.f_iosize,
                    outvp->v_mount->mnt_stat.f_iosize);
        if (blksize < 4096)
                blksize = 4096;
        else if (blksize > 1024 * 1024)
                blksize = 1024 * 1024;
        /* Transfer buffer of one block; released at "out". */
        dat = malloc(blksize, M_TEMP, M_WAITOK);

        /*
         * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA
         * to find holes.  Otherwise, just scan the read block for all 0s
         * in the inner loop where the data copying is done.
         * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may
         * support holes on the server, but do not support FIOSEEKHOLE.
         */
        eof = false;
        while (len > 0 && error == 0 && !eof && interrupted == 0) {
                endoff = 0;                     /* To shut up compilers. */
                cantseek = true;
                startoff = *inoffp;
                copylen = len;

                /*
                 * Find the next data area.  If there is just a hole to EOF,
                 * FIOSEEKDATA should fail and then we drop down into the
                 * inner loop and create the hole on the outvp file.
                 * (I do not know if any file system will report a hole to
                 *  EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA
                 *  will fail for those file systems.)
                 *
                 * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE,
                 * the code just falls through to the inner copy loop.
                 */
                error = EINVAL;
                if (holein > 0)
                        error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0,
                            incred, curthread);
                if (error == 0) {
                        endoff = startoff;
                        error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0,
                            incred, curthread);
                        /*
                         * Since invp is unlocked, it may be possible for
                         * another thread to do a truncate(), lseek(), write()
                         * creating a hole at startoff between the above
                         * VOP_IOCTL() calls, if the other thread does not do
                         * rangelocking.
                         * If that happens, startoff == endoff and finding
                         * the hole has failed, so set an error.
                         */
                        if (error == 0 && startoff == endoff)
                                error = EINVAL; /* Any error. Reset to 0. */
                }
                if (error == 0) {
                        /* Data range [startoff, endoff) was found. */
                        if (startoff > *inoffp) {
                                /* Found hole before data block. */
                                xfer = MIN(startoff - *inoffp, len);
                                if (*outoffp < va.va_size) {
                                        /* Must write 0s to punch hole. */
                                        xfer2 = MIN(va.va_size - *outoffp,
                                            xfer);
                                        memset(dat, 0, MIN(xfer2, blksize));
                                        error = vn_write_outvp(outvp, dat,
                                            *outoffp, xfer2, blksize, false,
                                            holeout > 0, outcred);
                                }

                                /*
                                 * The hole covers all that is left and
                                 * ends beyond the output EOF: extend the
                                 * file instead of writing (growfile).
                                 */
                                if (error == 0 && *outoffp + xfer >
                                    va.va_size && xfer == len)
                                        /* Grow last block. */
                                        error = vn_write_outvp(outvp, dat,
                                            *outoffp, xfer, blksize, true,
                                            false, outcred);
                                if (error == 0) {
                                        *inoffp += xfer;
                                        *outoffp += xfer;
                                        len -= xfer;
                                        /*
                                         * Signals are looked at only once
                                         * some progress has been made.
                                         */
                                        if (len < savlen)
                                                interrupted = sig_intr();
                                }
                        }
                        copylen = MIN(len, endoff - startoff);
                        cantseek = false;
                } else {
                        /*
                         * Seeking is not available or failed: copy the
                         * rest linearly and look for zero blocks.
                         */
                        cantseek = true;
                        startoff = *inoffp;
                        copylen = len;
                        error = 0;
                }

                xfer = blksize;
                if (cantseek) {
                        /*
                         * Set first xfer to end at a block boundary, so that
                         * holes are more likely detected in the loop below via
                         * the for all bytes 0 method.
                         */
                        xfer -= (*inoffp % blksize);
                }
                /* Loop copying the data block. */
                while (copylen > 0 && error == 0 && !eof && interrupted == 0) {
                        if (copylen < xfer)
                                xfer = copylen;
                        error = vn_lock(invp, LK_SHARED);
                        if (error != 0)
                                goto out;
                        error = vn_rdwr(UIO_READ, invp, dat, xfer,
                            startoff, UIO_SYSSPACE, IO_NODELOCKED,
                            curthread->td_ucred, incred, &aresid,
                            curthread);
                        VOP_UNLOCK(invp);
                        lastblock = false;
                        if (error == 0 && aresid > 0) {
                                /* Stop the copy at EOF on the input file. */
                                xfer -= aresid;
                                eof = true;
                                lastblock = true;
                        }
                        if (error == 0) {
                                /*
                                 * Skip the write for holes past the initial EOF
                                 * of the output file, unless this is the last
                                 * write of the output file at EOF.
                                 * For such a last all-zero block beyond
                                 * the output EOF, growfile is passed as
                                 * true so the file is extended rather
                                 * than written.
                                 */
                                readzeros = cantseek ? mem_iszero(dat, xfer) :
                                    false;
                                if (xfer == len)
                                        lastblock = true;
                                if (!cantseek || *outoffp < va.va_size ||
                                    lastblock || !readzeros)
                                        error = vn_write_outvp(outvp, dat,
                                            *outoffp, xfer, blksize,
                                            readzeros && lastblock &&
                                            *outoffp >= va.va_size, false,
                                            outcred);
                                if (error == 0) {
                                        *inoffp += xfer;
                                        startoff += xfer;
                                        *outoffp += xfer;
                                        copylen -= xfer;
                                        len -= xfer;
                                        if (len < savlen)
                                                interrupted = sig_intr();
                                }
                        }
                        /* Reads after the first are full blocks. */
                        xfer = blksize;
                }
        }
out:
        /* Report how much was processed; dat may still be NULL here. */
        *lenp = savlen - len;
        free(dat, M_TEMP);
        return (error);
}
3283
3284 static int
3285 vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td)
3286 {
3287         struct mount *mp;
3288         struct vnode *vp;
3289         off_t olen, ooffset;
3290         int error;
3291 #ifdef AUDIT
3292         int audited_vnode1 = 0;
3293 #endif
3294
3295         vp = fp->f_vnode;
3296         if (vp->v_type != VREG)
3297                 return (ENODEV);
3298
3299         /* Allocating blocks may take a long time, so iterate. */
3300         for (;;) {
3301                 olen = len;
3302                 ooffset = offset;
3303
3304                 bwillwrite();
3305                 mp = NULL;
3306                 error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
3307                 if (error != 0)
3308                         break;
3309                 error = vn_lock(vp, LK_EXCLUSIVE);
3310                 if (error != 0) {
3311                         vn_finished_write(mp);
3312                         break;
3313                 }
3314 #ifdef AUDIT
3315                 if (!audited_vnode1) {
3316                         AUDIT_ARG_VNODE1(vp);
3317                         audited_vnode1 = 1;
3318                 }
3319 #endif
3320 #ifdef MAC
3321                 error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
3322                 if (error == 0)
3323 #endif
3324                         error = VOP_ALLOCATE(vp, &offset, &len);
3325                 VOP_UNLOCK(vp);
3326                 vn_finished_write(mp);
3327
3328                 if (olen + ooffset != offset + len) {
3329                         panic("offset + len changed from %jx/%jx to %jx/%jx",
3330                             ooffset, olen, offset, len);
3331                 }
3332                 if (error != 0 || len == 0)
3333                         break;
3334                 KASSERT(olen > len, ("Iteration did not make progress?"));
3335                 maybe_yield();
3336         }
3337
3338         return (error);
3339 }
3340
3341 static u_long vn_lock_pair_pause_cnt;
3342 SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD,
3343     &vn_lock_pair_pause_cnt, 0,
3344     "Count of vn_lock_pair deadlocks");
3345
3346 u_int vn_lock_pair_pause_max;
3347 SYSCTL_UINT(_debug, OID_AUTO, vn_lock_pair_pause_max, CTLFLAG_RW,
3348     &vn_lock_pair_pause_max, 0,
3349     "Max ticks for vn_lock_pair deadlock avoidance sleep");
3350
3351 static void
3352 vn_lock_pair_pause(const char *wmesg)
3353 {
3354         atomic_add_long(&vn_lock_pair_pause_cnt, 1);
3355         pause(wmesg, prng32_bounded(vn_lock_pair_pause_max));
3356 }
3357
3358 /*
3359  * Lock pair of vnodes vp1, vp2, avoiding lock order reversal.
3360  * vp1_locked indicates whether vp1 is exclusively locked; if not, vp1
3361  * must be unlocked.  Same for vp2 and vp2_locked.  One of the vnodes
3362  * can be NULL.
3363  *
3364  * The function returns with both vnodes exclusively locked, and
3365  * guarantees that it does not create lock order reversal with other
3366  * threads during its execution.  Both vnodes could be unlocked
3367  * temporary (and reclaimed).
3368  */
3369 void
3370 vn_lock_pair(struct vnode *vp1, bool vp1_locked, struct vnode *vp2,
3371     bool vp2_locked)
3372 {
3373         int error;
3374
3375         if (vp1 == NULL && vp2 == NULL)
3376                 return;
3377         if (vp1 != NULL) {
3378                 if (vp1_locked)
3379                         ASSERT_VOP_ELOCKED(vp1, "vp1");
3380                 else
3381                         ASSERT_VOP_UNLOCKED(vp1, "vp1");
3382         } else {
3383                 vp1_locked = true;
3384         }
3385         if (vp2 != NULL) {
3386                 if (vp2_locked)
3387                         ASSERT_VOP_ELOCKED(vp2, "vp2");
3388                 else
3389                         ASSERT_VOP_UNLOCKED(vp2, "vp2");
3390         } else {
3391                 vp2_locked = true;
3392         }
3393         if (!vp1_locked && !vp2_locked) {
3394                 vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY);
3395                 vp1_locked = true;
3396         }
3397
3398         for (;;) {
3399                 if (vp1_locked && vp2_locked)
3400                         break;
3401                 if (vp1_locked && vp2 != NULL) {
3402                         if (vp1 != NULL) {
3403                                 error = VOP_LOCK1(vp2, LK_EXCLUSIVE | LK_NOWAIT,
3404                                     __FILE__, __LINE__);
3405                                 if (error == 0)
3406                                         break;
3407                                 VOP_UNLOCK(vp1);
3408                                 vp1_locked = false;
3409                                 vn_lock_pair_pause("vlp1");
3410                         }
3411                         vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY);
3412                         vp2_locked = true;
3413                 }
3414                 if (vp2_locked && vp1 != NULL) {
3415                         if (vp2 != NULL) {
3416                                 error = VOP_LOCK1(vp1, LK_EXCLUSIVE | LK_NOWAIT,
3417                                     __FILE__, __LINE__);
3418                                 if (error == 0)
3419                                         break;
3420                                 VOP_UNLOCK(vp2);
3421                                 vp2_locked = false;
3422                                 vn_lock_pair_pause("vlp2");
3423                         }
3424                         vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY);
3425                         vp1_locked = true;
3426                 }
3427         }
3428         if (vp1 != NULL)
3429                 ASSERT_VOP_ELOCKED(vp1, "vp1 ret");
3430         if (vp2 != NULL)
3431                 ASSERT_VOP_ELOCKED(vp2, "vp2 ret");
3432 }