sys/kern/vfs_vnops.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 4. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_mac.h"
  41
  42 #include <sys/param.h>
  43 #include <sys/systm.h>
  44 #include <sys/fcntl.h>
  45 #include <sys/file.h>
  46 #include <sys/kdb.h>
  47 #include <sys/stat.h>
  48 #include <sys/proc.h>
  49 #include <sys/limits.h>
  50 #include <sys/lock.h>
  51 #include <sys/mac.h>
  52 #include <sys/mount.h>
  53 #include <sys/mutex.h>
  54 #include <sys/namei.h>
  55 #include <sys/vnode.h>
  56 #include <sys/bio.h>
  57 #include <sys/buf.h>
  58 #include <sys/filio.h>
  59 #include <sys/sx.h>
  60 #include <sys/ttycom.h>
  61 #include <sys/conf.h>
  62 #include <sys/syslog.h>
  63 #include <sys/unistd.h>
  64
  65 static fo_rdwr_t        vn_read;
  66 static fo_rdwr_t        vn_write;
  67 static fo_ioctl_t       vn_ioctl;
  68 static fo_poll_t        vn_poll;
  69 static fo_kqfilter_t    vn_kqfilter;
  70 static fo_stat_t        vn_statfile;
  71 static fo_close_t       vn_closefile;
  72
  73 struct  fileops vnops = {
             /*
              * File-operations table for vnode-backed files: every entry
              * point is routed to the matching vn_* routine in this file.
              */
  74         .fo_read = vn_read,
  75         .fo_write = vn_write,
  76         .fo_ioctl = vn_ioctl,
  77         .fo_poll = vn_poll,
  78         .fo_kqfilter = vn_kqfilter,
  79         .fo_stat = vn_statfile,
  80         .fo_close = vn_closefile,
             /* Flag bits advertised for files that use this table. */
  81         .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
  82 };
  83
  84 int
  85 vn_open(ndp, flagp, cmode, fdidx)
  86         struct nameidata *ndp;
  87         int *flagp, cmode, fdidx;
  88 {
  89         struct thread *td = ndp->ni_cnd.cn_thread;
  90
  91         return (vn_open_cred(ndp, flagp, cmode, td->td_ucred, fdidx));
  92 }
  93
  94 /*
  95  * Common code for vnode open operations.
  96  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
  97  *
  98  * Note that this does NOT free nameidata for the successful case,
  99  * due to the NDINIT being done elsewhere.
      *
      * ndp   - lookup state; cn_nameiop and cn_flags are overwritten here
      *         before namei() is called.
      * flagp - in: the open flags; out: the flags actually used (O_TRUNC is
      *         cleared when the file was created by this call, O_CREAT is
      *         cleared when the file already existed).
      * cmode - mode stored in va_mode for a newly created file.
      * cred  - credentials passed to the MAC, access, create and open calls.
      * fdidx - passed through to VOP_OPEN.  When it is -1, Giant (if namei()
      *         reported it held) is dropped before a successful return;
      *         for any other value it is not dropped here on success.
      *
      * Returns 0 with ndp->ni_vp locked, or an errno value.  On the "bad"
      * exit the vnode is released with vput() and ndp->ni_vp is set to NULL.
 100  */
 101 int
 102 vn_open_cred(ndp, flagp, cmode, cred, fdidx)
 103         struct nameidata *ndp;
 104         int *flagp, cmode;
 105         struct ucred *cred;
 106         int fdidx;
 107 {
 108         struct vnode *vp;
 109         struct mount *mp;
 110         struct thread *td = ndp->ni_cnd.cn_thread;
 111         struct vattr vat;
 112         struct vattr *vap = &vat;
 113         int mode, fmode, error;
 114         int vfslocked;
 115
             /*
              * Re-entered from the create path after sleeping for a
              * filesystem suspension to end; the lookup is redone.
              */
 116 restart:
 117         vfslocked = 0;
 118         fmode = *flagp;
 119         if (fmode & O_CREAT) {
 120                 ndp->ni_cnd.cn_nameiop = CREATE;
 121                 ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | MPSAFE;
                     /* Follow a trailing symlink only without O_EXCL/O_NOFOLLOW. */
 122                 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
 123                         ndp->ni_cnd.cn_flags |= FOLLOW;
 124                 bwillwrite();
 125                 if ((error = namei(ndp)) != 0)
 126                         return (error);
                     /* Remember whether namei() left Giant held for us. */
 127                 vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
 128                 ndp->ni_cnd.cn_flags &= ~MPSAFE;
 129                 if (ndp->ni_vp == NULL) {
                             /* The name does not exist: create a regular file. */
 130                         VATTR_NULL(vap);
 131                         vap->va_type = VREG;
 132                         vap->va_mode = cmode;
 133                         if (fmode & O_EXCL)
 134                                 vap->va_vaflags |= VA_EXCLUSIVE;
                             /*
                              * If a write cannot be started right now
                              * (V_NOWAIT), drop what the lookup acquired,
                              * wait for the suspension to end without
                              * starting a write (V_XSLEEP), and start over.
                              */
 135                         if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
 136                                 NDFREE(ndp, NDF_ONLY_PNBUF);
 137                                 vput(ndp->ni_dvp);
 138                                 VFS_UNLOCK_GIANT(vfslocked);
 139                                 if ((error = vn_start_write(NULL, &mp,
 140                                     V_XSLEEP | PCATCH)) != 0)
 141                                         return (error);
 142                                 goto restart;
 143                         }
 144 #ifdef MAC
 145                         error = mac_check_vnode_create(cred, ndp->ni_dvp,
 146                             &ndp->ni_cnd, vap);
 147                         if (error == 0) {
 148 #endif
 149                                 VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE);
 150                                 error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
 151                                                    &ndp->ni_cnd, vap);
 152 #ifdef MAC
 153                         }
 154 #endif
                             /* Done with the parent directory and the write. */
 155                         vput(ndp->ni_dvp);
 156                         vn_finished_write(mp);
 157                         if (error) {
 158                                 VFS_UNLOCK_GIANT(vfslocked);
 159                                 NDFREE(ndp, NDF_ONLY_PNBUF);
 160                                 return (error);
 161                         }
 162                         ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create");
 163                         ASSERT_VOP_LOCKED(ndp->ni_vp, "create");
                             /* A file created just now has nothing to truncate. */
 164                         fmode &= ~O_TRUNC;
 165                         vp = ndp->ni_vp;
 166                 } else {
                             /*
                              * The name already exists.  Release the parent;
                              * when parent and target are the same vnode only
                              * vrele() is used for the parent.
                              */
 167                         if (ndp->ni_dvp == ndp->ni_vp)
 168                                 vrele(ndp->ni_dvp);
 169                         else
 170                                 vput(ndp->ni_dvp);
 171                         ndp->ni_dvp = NULL;
 172                         vp = ndp->ni_vp;
 173                         if (fmode & O_EXCL) {
 174                                 error = EEXIST;
 175                                 goto bad;
 176                         }
                             /* From here on this is a plain open of an existing file. */
 177                         fmode &= ~O_CREAT;
 178                 }
 179         } else {
                     /* Plain open: look the name up with a shared leaf lock. */
 180                 ndp->ni_cnd.cn_nameiop = LOOKUP;
 181                 ndp->ni_cnd.cn_flags = ISOPEN |
 182                     ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
 183                     LOCKSHARED | LOCKLEAF | MPSAFE;
 184                 if ((error = namei(ndp)) != 0)
 185                         return (error);
 186                 ndp->ni_cnd.cn_flags &= ~MPSAFE;
 187                 vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
 188                 vp = ndp->ni_vp;
 189         }
             /* Symbolic links and sockets cannot be opened through this path. */
 190         if (vp->v_type == VLNK) {
 191                 error = EMLINK;
 192                 goto bad;
 193         }
 194         if (vp->v_type == VSOCK) {
 195                 error = EOPNOTSUPP;
 196                 goto bad;
 197         }
             /* Translate the open flags into the access mode to be checked. */
 198         mode = 0;
 199         if (fmode & (FWRITE | O_TRUNC)) {
 200                 if (vp->v_type == VDIR) {
 201                         error = EISDIR;
 202                         goto bad;
 203                 }
 204                 mode |= VWRITE;
 205         }
 206         if (fmode & FREAD)
 207                 mode |= VREAD;
 208         if (fmode & O_APPEND)
 209                 mode |= VAPPEND;
 210 #ifdef MAC
 211         error = mac_check_vnode_open(cred, vp, mode);
 212         if (error)
 213                 goto bad;
 214 #endif
             /*
              * O_CREAT is still set only when the file was created by this
              * call; in that case the write and access checks are skipped.
              */
 215         if ((fmode & O_CREAT) == 0) {
 216                 if (mode & VWRITE) {
 217                         error = vn_writechk(vp);
 218                         if (error)
 219                                 goto bad;
 220                 }
 221                 if (mode) {
 222                         error = VOP_ACCESS(vp, mode, cred, td);
 223                         if (error)
 224                                 goto bad;
 225                 }
 226         }
 227         if ((error = VOP_OPEN(vp, fmode, cred, td, fdidx)) != 0)
 228                 goto bad;
 229
             /* Account for the new writer; vn_close() undoes this. */
 230         if (fmode & FWRITE)
 231                 vp->v_writecount++;
 232         *flagp = fmode;
 233         ASSERT_VOP_LOCKED(vp, "vn_open_cred");
             /* Giant is dropped here only when no descriptor index was given. */
 234         if (fdidx == -1)
 235                 VFS_UNLOCK_GIANT(vfslocked);
 236         return (0);
             /* Common failure exit once a vnode (vp) has been obtained. */
 237 bad:
 238         NDFREE(ndp, NDF_ONLY_PNBUF);
 239         vput(vp);
 240         VFS_UNLOCK_GIANT(vfslocked);
 241         *flagp = fmode;
 242         ndp->ni_vp = NULL;
 243         return (error);
 244 }
 245
 246 /*
 247  * Check for write permissions on the specified vnode.
 248  * Prototype text segments cannot be written.
 249  */
 250 int
 251 vn_writechk(vp)
 252         register struct vnode *vp;
 253 {
 254
 255         ASSERT_VOP_LOCKED(vp, "vn_writechk");
 256         /*
 257          * If there's shared text associated with
 258          * the vnode, try to free it up once.  If
 259          * we fail, we can't allow writing.
 260          */
 261         if (vp->v_vflag & VV_TEXT)
 262                 return (ETXTBSY);
 263
 264         return (0);
 265 }
 266
 267 /*
 268  * Vnode close call
 269  */
 270 int
 271 vn_close(vp, flags, file_cred, td)
 272         register struct vnode *vp;
 273         int flags;
 274         struct ucred *file_cred;
 275         struct thread *td;
 276 {
 277         struct mount *mp;
 278         int error;
 279
 280         VFS_ASSERT_GIANT(vp->v_mount);
 281
 282         vn_start_write(vp, &mp, V_WAIT);
 283         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 284         if (flags & FWRITE)
 285                 vp->v_writecount--;
 286         error = VOP_CLOSE(vp, flags, file_cred, td);
 287         vput(vp);
 288         vn_finished_write(mp);
 289         return (error);
 290 }
 291
 292 /*
 293  * Sequential heuristic - detect sequential operation
 294  */
 295 static __inline
 296 int
 297 sequential_heuristic(struct uio *uio, struct file *fp)
 298 {
 299
 300         if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
 301             uio->uio_offset == fp->f_nextoff) {
 302                 /*
 303                  * XXX we assume that the filesystem block size is
 304                  * the default.  Not true, but still gives us a pretty
 305                  * good indicator of how sequential the read operations
 306                  * are.
 307                  */
 308                 fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
 309                 if (fp->f_seqcount > IO_SEQMAX)
 310                         fp->f_seqcount = IO_SEQMAX;
 311                 return(fp->f_seqcount << IO_SEQSHIFT);
 312         }
 313
 314         /*
 315          * Not sequential, quick draw-down of seqcount
 316          */
 317         if (fp->f_seqcount > 1)
 318                 fp->f_seqcount = 1;
 319         else
 320                 fp->f_seqcount = 0;
 321         return(0);
 322 }
 323
 324 /*
 325  * Package up an I/O request on a vnode into a uio and do it.
 326  */
 327 int
 328 vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
 329     aresid, td)
 330         enum uio_rw rw;
 331         struct vnode *vp;
 332         caddr_t base;
 333         int len;
 334         off_t offset;
 335         enum uio_seg segflg;
 336         int ioflg;
 337         struct ucred *active_cred;
 338         struct ucred *file_cred;
 339         int *aresid;
 340         struct thread *td;
 341 {
 342         struct uio auio;
 343         struct iovec aiov;
 344         struct mount *mp;
 345         struct ucred *cred;
 346         int error;
 347
 348         VFS_ASSERT_GIANT(vp->v_mount);
 349
 350         if ((ioflg & IO_NODELOCKED) == 0) {
 351                 mp = NULL;
 352                 if (rw == UIO_WRITE) {
 353                         if (vp->v_type != VCHR &&
 354                             (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
 355                             != 0)
 356                                 return (error);
 357                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 358                 } else {
 359                         /*
 360                          * XXX This should be LK_SHARED but I don't trust VFS
 361                          * enough to leave it like that until it has been
 362                          * reviewed further.
 363                          */
 364                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 365                 }
 366
 367         }
 368         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 369         auio.uio_iov = &aiov;
 370         auio.uio_iovcnt = 1;
 371         aiov.iov_base = base;
 372         aiov.iov_len = len;
 373         auio.uio_resid = len;
 374         auio.uio_offset = offset;
 375         auio.uio_segflg = segflg;
 376         auio.uio_rw = rw;
 377         auio.uio_td = td;
 378         error = 0;
 379 #ifdef MAC
 380         if ((ioflg & IO_NOMACCHECK) == 0) {
 381                 if (rw == UIO_READ)
 382                         error = mac_check_vnode_read(active_cred, file_cred,
 383                             vp);
 384                 else
 385                         error = mac_check_vnode_write(active_cred, file_cred,
 386                             vp);
 387         }
 388 #endif
 389         if (error == 0) {
 390                 if (file_cred)
 391                         cred = file_cred;
 392                 else
 393                         cred = active_cred;
 394                 if (rw == UIO_READ)
 395                         error = VOP_READ(vp, &auio, ioflg, cred);
 396                 else
 397                         error = VOP_WRITE(vp, &auio, ioflg, cred);
 398         }
 399         if (aresid)
 400                 *aresid = auio.uio_resid;
 401         else
 402                 if (auio.uio_resid && error == 0)
 403                         error = EIO;
 404         if ((ioflg & IO_NODELOCKED) == 0) {
 405                 if (rw == UIO_WRITE)
 406                         vn_finished_write(mp);
 407                 VOP_UNLOCK(vp, 0, td);
 408         }
 409         return (error);
 410 }
 411
 412 /*
 413  * Package up an I/O request on a vnode into a uio and do it.  The I/O
 414  * request is split up into smaller chunks and we try to avoid saturating
 415  * the buffer cache while potentially holding a vnode locked, so we
 416  * check bwillwrite() before calling vn_rdwr().  We also call uio_yield()
 417  * to give other processes a chance to lock the vnode (either other processes
 418  * core'ing the same binary, or unrelated processes scanning the directory).
 419  */
 420 int
 421 vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
 422     file_cred, aresid, td)
 423         enum uio_rw rw;
 424         struct vnode *vp;
 425         caddr_t base;
 426         size_t len;
 427         off_t offset;
 428         enum uio_seg segflg;
 429         int ioflg;
 430         struct ucred *active_cred;
 431         struct ucred *file_cred;
 432         size_t *aresid;
 433         struct thread *td;
 434 {
 435         int error = 0;
 436         int iaresid;
 437
 438         VFS_ASSERT_GIANT(vp->v_mount);
 439
 440         do {
 441                 int chunk;
 442
 443                 /*
 444                  * Force `offset' to a multiple of MAXBSIZE except possibly
 445                  * for the first chunk, so that filesystems only need to
 446                  * write full blocks except possibly for the first and last
 447                  * chunks.
 448                  */
 449                 chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
 450
 451                 if (chunk > len)
 452                         chunk = len;
 453                 if (rw != UIO_READ && vp->v_type == VREG)
 454                         bwillwrite();
 455                 iaresid = 0;
 456                 error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
 457                     ioflg, active_cred, file_cred, &iaresid, td);
 458                 len -= chunk;   /* aresid calc already includes length */
 459                 if (error)
 460                         break;
 461                 offset += chunk;
 462                 base += chunk;
 463                 uio_yield();
 464         } while (len);
 465         if (aresid)
 466                 *aresid = len + iaresid;
 467         return (error);
 468 }
 469
 470 /*
 471  * File table vnode read routine.
 472  */
 473 static int
 474 vn_read(fp, uio, active_cred, flags, td)
 475         struct file *fp;
 476         struct uio *uio;
 477         struct ucred *active_cred;
 478         struct thread *td;
 479         int flags;
 480 {
 481         struct vnode *vp;
 482         int error, ioflag;
 483         int vfslocked;
 484
 485         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 486             uio->uio_td, td));
 487         vp = fp->f_vnode;
 488         ioflag = 0;
 489         if (fp->f_flag & FNONBLOCK)
 490                 ioflag |= IO_NDELAY;
 491         if (fp->f_flag & O_DIRECT)
 492                 ioflag |= IO_DIRECT;
 493         vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 494         VOP_LEASE(vp, td, fp->f_cred, LEASE_READ);
 495         /*
 496          * According to McKusick the vn lock is protecting f_offset here.
 497          * Once this field has it's own lock we can acquire this shared.
 498          */
 499         if ((flags & FOF_OFFSET) == 0) {
 500                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 501                 uio->uio_offset = fp->f_offset;
 502         } else
 503                 vn_lock(vp, LK_SHARED | LK_RETRY, td);
 504
 505         ioflag |= sequential_heuristic(uio, fp);
 506
 507 #ifdef MAC
 508         error = mac_check_vnode_read(active_cred, fp->f_cred, vp);
 509         if (error == 0)
 510 #endif
 511                 error = VOP_READ(vp, uio, ioflag, fp->f_cred);
 512         if ((flags & FOF_OFFSET) == 0)
 513                 fp->f_offset = uio->uio_offset;
 514         fp->f_nextoff = uio->uio_offset;
 515         VOP_UNLOCK(vp, 0, td);
 516         VFS_UNLOCK_GIANT(vfslocked);
 517         return (error);
 518 }
 519
 520 /*
 521  * File table vnode write routine.
 522  */
 523 static int
 524 vn_write(fp, uio, active_cred, flags, td)
 525         struct file *fp;
 526         struct uio *uio;
 527         struct ucred *active_cred;
 528         struct thread *td;
 529         int flags;
 530 {
 531         struct vnode *vp;
 532         struct mount *mp;
 533         int error, ioflag;
 534         int vfslocked;
 535
 536         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 537             uio->uio_td, td));
 538         vp = fp->f_vnode;
 539         vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 540         if (vp->v_type == VREG)
 541                 bwillwrite();
 542         ioflag = IO_UNIT;
 543         if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
 544                 ioflag |= IO_APPEND;
 545         if (fp->f_flag & FNONBLOCK)
 546                 ioflag |= IO_NDELAY;
 547         if (fp->f_flag & O_DIRECT)
 548                 ioflag |= IO_DIRECT;
 549         if ((fp->f_flag & O_FSYNC) ||
 550             (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
 551                 ioflag |= IO_SYNC;
 552         mp = NULL;
 553         if (vp->v_type != VCHR &&
 554             (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 555                 goto unlock;
 556         VOP_LEASE(vp, td, fp->f_cred, LEASE_WRITE);
 557         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 558         if ((flags & FOF_OFFSET) == 0)
 559                 uio->uio_offset = fp->f_offset;
 560         ioflag |= sequential_heuristic(uio, fp);
 561 #ifdef MAC
 562         error = mac_check_vnode_write(active_cred, fp->f_cred, vp);
 563         if (error == 0)
 564 #endif
 565                 error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
 566         if ((flags & FOF_OFFSET) == 0)
 567                 fp->f_offset = uio->uio_offset;
 568         fp->f_nextoff = uio->uio_offset;
 569         VOP_UNLOCK(vp, 0, td);
 570         vn_finished_write(mp);
 571 unlock:
 572         VFS_UNLOCK_GIANT(vfslocked);
 573         return (error);
 574 }
 575
 576 /*
 577  * File table vnode stat routine.
 578  */
 579 static int
 580 vn_statfile(fp, sb, active_cred, td)
 581         struct file *fp;
 582         struct stat *sb;
 583         struct ucred *active_cred;
 584         struct thread *td;
 585 {
 586         struct vnode *vp = fp->f_vnode;
 587         int vfslocked;
 588         int error;
 589
 590         vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 591         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 592         error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
 593         VOP_UNLOCK(vp, 0, td);
 594         VFS_UNLOCK_GIANT(vfslocked);
 595
 596         return (error);
 597 }
 598
 599 /*
 600  * Stat a vnode; implementation for the stat syscall
 601  */
 602 int
 603 vn_stat(vp, sb, active_cred, file_cred, td)
 604         struct vnode *vp;
 605         register struct stat *sb;
 606         struct ucred *active_cred;
 607         struct ucred *file_cred;
 608         struct thread *td;
 609 {
 610         struct vattr vattr;
 611         register struct vattr *vap;
 612         int error;
 613         u_short mode;
 614
 615 #ifdef MAC
 616         error = mac_check_vnode_stat(active_cred, file_cred, vp);
 617         if (error)
 618                 return (error);
 619 #endif
 620
 621         vap = &vattr;
 622         error = VOP_GETATTR(vp, vap, active_cred, td);
 623         if (error)
 624                 return (error);
 625
 626         /*
 627          * Zero the spare stat fields
 628          */
 629         bzero(sb, sizeof *sb);
 630
 631         /*
 632          * Copy from vattr table
 633          */
 634         if (vap->va_fsid != VNOVAL)
 635                 sb->st_dev = vap->va_fsid;
 636         else
 637                 sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
 638         sb->st_ino = vap->va_fileid;
 639         mode = vap->va_mode;
 640         switch (vap->va_type) {
 641         case VREG:
 642                 mode |= S_IFREG;
 643                 break;
 644         case VDIR:
 645                 mode |= S_IFDIR;
 646                 break;
 647         case VBLK:
 648                 mode |= S_IFBLK;
 649                 break;
 650         case VCHR:
 651                 mode |= S_IFCHR;
 652                 break;
 653         case VLNK:
 654                 mode |= S_IFLNK;
 655                 /* This is a cosmetic change, symlinks do not have a mode. */
 656                 if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
 657                         sb->st_mode &= ~ACCESSPERMS;    /* 0000 */
 658                 else
 659                         sb->st_mode |= ACCESSPERMS;     /* 0777 */
 660                 break;
 661         case VSOCK:
 662                 mode |= S_IFSOCK;
 663                 break;
 664         case VFIFO:
 665                 mode |= S_IFIFO;
 666                 break;
 667         default:
 668                 return (EBADF);
 669         };
 670         sb->st_mode = mode;
 671         sb->st_nlink = vap->va_nlink;
 672         sb->st_uid = vap->va_uid;
 673         sb->st_gid = vap->va_gid;
 674         sb->st_rdev = vap->va_rdev;
 675         if (vap->va_size > OFF_MAX)
 676                 return (EOVERFLOW);
 677         sb->st_size = vap->va_size;
 678         sb->st_atimespec = vap->va_atime;
 679         sb->st_mtimespec = vap->va_mtime;
 680         sb->st_ctimespec = vap->va_ctime;
 681         sb->st_birthtimespec = vap->va_birthtime;
 682
 683         /*
 684          * According to www.opengroup.org, the meaning of st_blksize is
 685          *   "a filesystem-specific preferred I/O block size for this
 686          *    object.  In some filesystem types, this may vary from file
 687          *    to file"
 688          * Default to PAGE_SIZE after much discussion.
 689          * XXX: min(PAGE_SIZE, vp->v_bufobj.bo_bsize) may be more correct.
 690          */
 691
 692         sb->st_blksize = PAGE_SIZE;
 693
 694         sb->st_flags = vap->va_flags;
 695         if (suser(td))
 696                 sb->st_gen = 0;
 697         else
 698                 sb->st_gen = vap->va_gen;
 699
 700 #if (S_BLKSIZE == 512)
 701         /* Optimize this case */
 702         sb->st_blocks = vap->va_bytes >> 9;
 703 #else
 704         sb->st_blocks = vap->va_bytes / S_BLKSIZE;
 705 #endif
 706         return (0);
 707 }
 708
 709 /*
 710  * File table vnode ioctl routine.
 711  */
 712 static int
 713 vn_ioctl(fp, com, data, active_cred, td)
 714         struct file *fp;
 715         u_long com;
 716         void *data;
 717         struct ucred *active_cred;
 718         struct thread *td;
 719 {
 720         struct vnode *vp = fp->f_vnode;
 721         struct vattr vattr;
 722         int vfslocked;
 723         int error;
 724
 725         vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 726         error = ENOTTY;
 727         switch (vp->v_type) {
 728         case VREG:
 729         case VDIR:
 730                 if (com == FIONREAD) {
 731                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 732                         error = VOP_GETATTR(vp, &vattr, active_cred, td);
 733                         VOP_UNLOCK(vp, 0, td);
 734                         if (!error)
 735                                 *(int *)data = vattr.va_size - fp->f_offset;
 736                 }
 737                 if (com == FIONBIO || com == FIOASYNC)  /* XXX */
 738                         error = 0;
 739                 else
 740                         error = VOP_IOCTL(vp, com, data, fp->f_flag,
 741                             active_cred, td);
 742                 break;
 743
 744         default:
 745                 break;
 746         }
 747         VFS_UNLOCK_GIANT(vfslocked);
 748         return (error);
 749 }
 750
 751 /*
 752  * File table vnode poll routine.
 753  */
 754 static int
 755 vn_poll(fp, events, active_cred, td)
 756         struct file *fp;
 757         int events;
 758         struct ucred *active_cred;
 759         struct thread *td;
 760 {
 761         struct vnode *vp;
 762         int error;
 763
 764         mtx_lock(&Giant);
 765
 766         vp = fp->f_vnode;
 767 #ifdef MAC
 768         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 769         error = mac_check_vnode_poll(active_cred, fp->f_cred, vp);
 770         VOP_UNLOCK(vp, 0, td);
 771         if (!error)
 772 #endif
 773
 774         error = VOP_POLL(vp, events, fp->f_cred, td);
 775         mtx_unlock(&Giant);
 776         return (error);
 777 }
 778
 779 /*
 780  * Check that the vnode is still valid, and if so
 781  * acquire requested lock.
      *
      * vp    - vnode to lock.
      * flags - lock request.  LK_INTERLOCK means the caller already holds
      *         the vnode interlock; otherwise it is taken here.  With no
      *         LK_TYPE_MASK bits set the call only checks validity.
      * td    - calling thread, passed to VOP_LOCK/VOP_UNLOCK.
      *
      * Returns 0 on success.  ENOENT is returned when the vnode is marked
      * VI_DOOMED and either LK_NOWAIT was given, no lock type was
      * requested, or the lock was obtained without LK_RETRY (the lock is
      * dropped again in that last case).  With LK_RETRY the request is
      * repeated for as long as VOP_LOCK fails; without it a VOP_LOCK
      * error is returned directly.
      *
      * When DEBUG_LOCKS is defined the function is built as debug_vn_lock()
      * and records the caller's file name and line number in the vnode.
 782  */
 783 int
 784 #ifndef DEBUG_LOCKS
 785 vn_lock(vp, flags, td)
 786 #else
 787 debug_vn_lock(vp, flags, td, filename, line)
 788 #endif
 789         struct vnode *vp;
 790         int flags;
 791         struct thread *td;
 792 #ifdef  DEBUG_LOCKS
 793         const char *filename;
 794         int line;
 795 #endif
 796 {
 797         int error;
 798
 799         do {
                     /*
                      * LK_INTERLOCK is cleared after the first VOP_LOCK
                      * call below, so every retry takes the interlock here.
                      */
 800                 if ((flags & LK_INTERLOCK) == 0)
 801                         VI_LOCK(vp);
                     /* Non-blocking or validity-only request on a doomed vnode. */
 802                 if ((flags & LK_NOWAIT || (flags & LK_TYPE_MASK) == 0) &&
 803                     vp->v_iflag & VI_DOOMED) {
 804                         VI_UNLOCK(vp);
 805                         return (ENOENT);
 806                 }
 807                 /*
 808                  * Just polling to check validity.
 809                  */
 810                 if ((flags & LK_TYPE_MASK) == 0) {
 811                         VI_UNLOCK(vp);
 812                         return (0);
 813                 }
 814 #ifdef  DEBUG_LOCKS
 815                 vp->filename = filename;
 816                 vp->line = line;
 817 #endif
 818                 /*
 819                  * lockmgr drops interlock before it will return for
 820                  * any reason.  So force the code above to relock it.
 821                  */
 822                 error = VOP_LOCK(vp, flags | LK_INTERLOCK, td);
 823                 flags &= ~LK_INTERLOCK;
 824                 /*
 825                  * Callers specify LK_RETRY if they wish to get dead vnodes.
 826                  * If RETRY is not set, we return ENOENT instead.
 827                  */
 828                 if (error == 0 && vp->v_iflag & VI_DOOMED &&
 829                     (flags & LK_RETRY) == 0) {
 830                         VOP_UNLOCK(vp, 0, td);
 831                         error = ENOENT;
 832                         break;
 833                 }
 834         } while (flags & LK_RETRY && error != 0);
 835         return (error);
 836 }
 837
 838 /*
 839  * File table vnode close routine.
 840  */
 841 static int
 842 vn_closefile(fp, td)
 843         struct file *fp;
 844         struct thread *td;
 845 {
 846         struct vnode *vp;
 847         struct flock lf;
 848         int vfslocked;
 849         int error;
 850
 851         vp = fp->f_vnode;
 852
 853         vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 854         if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
 855                 lf.l_whence = SEEK_SET;
 856                 lf.l_start = 0;
 857                 lf.l_len = 0;
 858                 lf.l_type = F_UNLCK;
 859                 (void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
 860         }
 861
 862         fp->f_ops = &badfileops;
 863
 864         error = vn_close(vp, fp->f_flag, fp->f_cred, td);
 865         VFS_UNLOCK_GIANT(vfslocked);
 866         return (error);
 867 }
 868
 869 /*
 870  * Preparing to start a filesystem write operation. If the operation is
 871  * permitted, then we bump the count of operations in progress and
 872  * proceed. If a suspend request is in progress, we wait until the
 873  * suspension is over, and then proceed.
 874  */
 875 int
 876 vn_start_write(vp, mpp, flags)
 877         struct vnode *vp;
 878         struct mount **mpp;
 879         int flags;
 880 {
 881         struct mount *mp;
 882         int error;
 883
 884         error = 0;
 885         /*
 886          * If a vnode is provided, get and return the mount point that
 887          * to which it will write.
 888          */
 889         if (vp != NULL) {
 890                 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 891                         *mpp = NULL;
 892                         if (error != EOPNOTSUPP)
 893                                 return (error);
 894                         return (0);
 895                 }
 896         }
 897         if ((mp = *mpp) == NULL)
 898                 return (0);
 899         MNT_ILOCK(mp);
 900         /*
 901          * Check on status of suspension.
 902          */
 903         while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 904                 if (flags & V_NOWAIT) {
 905                         error = EWOULDBLOCK;
 906                         goto unlock;
 907                 }
 908                 error = msleep(&mp->mnt_flag, MNT_MTX(mp),
 909                     (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
 910                 if (error)
 911                         goto unlock;
 912         }
 913         if (flags & V_XSLEEP)
 914                 goto unlock;
 915         mp->mnt_writeopcount++;
 916 unlock:
 917         MNT_IUNLOCK(mp);
 918         return (error);
 919 }
 920
 921 /*
 922  * Secondary suspension. Used by operations such as vop_inactive
 923  * routines that are needed by the higher level functions. These
 924  * are allowed to proceed until all the higher level functions have
 925  * completed (indicated by mnt_writeopcount dropping to zero). At that
 926  * time, these operations are halted until the suspension is over.
 927  */
 928 int
 929 vn_write_suspend_wait(vp, mp, flags)
 930         struct vnode *vp;
 931         struct mount *mp;
 932         int flags;
 933 {
 934         int error;
 935
 936         if (vp != NULL) {
 937                 if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
 938                         if (error != EOPNOTSUPP)
 939                                 return (error);
 940                         return (0);
 941                 }
 942         }
 943         /*
 944          * If we are not suspended or have not yet reached suspended
 945          * mode, then let the operation proceed.
 946          */
 947         if (mp == NULL)
 948                 return (0);
 949         MNT_ILOCK(mp);
 950         if ((mp->mnt_kern_flag & MNTK_SUSPENDED) == 0) {
 951                 MNT_IUNLOCK(mp);
 952                 return (0);
 953         }
 954         if (flags & V_NOWAIT) {
 955                 MNT_IUNLOCK(mp);
 956                 return (EWOULDBLOCK);
 957         }
 958         /*
 959          * Wait for the suspension to finish.
 960          */
 961         return (msleep(&mp->mnt_flag, MNT_MTX(mp),
 962             (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0));
 963 }
 964
 965 /*
 966  * Filesystem write operation has completed. If we are suspending and this
 967  * operation is the last one, notify the suspender that the suspension is
 968  * now in effect.
 969  */
 970 void
 971 vn_finished_write(mp)
 972         struct mount *mp;
 973 {
 974         if (mp == NULL)
 975                 return;
 976         MNT_ILOCK(mp);
 977         mp->mnt_writeopcount--;
 978         if (mp->mnt_writeopcount < 0)
 979                 panic("vn_finished_write: neg cnt");
 980         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
 981             mp->mnt_writeopcount <= 0)
 982                 wakeup(&mp->mnt_writeopcount);
 983         MNT_IUNLOCK(mp);
 984 }
 985
 986 /*
 987  * Request a filesystem to suspend write operations.
 988  */
 989 int
 990 vfs_write_suspend(mp)
 991         struct mount *mp;
 992 {
 993         struct thread *td = curthread;
 994         int error;
 995
 996         error = 0;
 997         MNT_ILOCK(mp);
 998         if (mp->mnt_kern_flag & MNTK_SUSPEND)
 999                 goto unlock;
1000         mp->mnt_kern_flag |= MNTK_SUSPEND;
1001         if (mp->mnt_writeopcount > 0)
1002                 (void) msleep(&mp->mnt_writeopcount,
1003                     MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
1004         else
1005                 MNT_IUNLOCK(mp);
1006         if ((error = VFS_SYNC(mp, MNT_WAIT, td)) != 0) {
1007                 vfs_write_resume(mp);
1008                 return (error);
1009         }
1010         MNT_ILOCK(mp);
1011         mp->mnt_kern_flag |= MNTK_SUSPENDED;
1012 unlock:
1013         MNT_IUNLOCK(mp);
1014         return (error);
1015 }
1016
1017 /*
1018  * Request a filesystem to resume write operations.
1019  */
1020 void
1021 vfs_write_resume(mp)
1022         struct mount *mp;
1023 {
1024
1025         MNT_ILOCK(mp);
1026         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1027                 mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED);
1028                 wakeup(&mp->mnt_writeopcount);
1029                 wakeup(&mp->mnt_flag);
1030         }
1031         MNT_IUNLOCK(mp);
1032 }
1033
1034 /*
1035  * Implement kqueues for files by translating it to vnode operation.
1036  */
1037 static int
1038 vn_kqfilter(struct file *fp, struct knote *kn)
1039 {
1040         int error;
1041
1042         mtx_lock(&Giant);
1043         error = VOP_KQFILTER(fp->f_vnode, kn);
1044         mtx_unlock(&Giant);
1045
1046         return error;
1047 }
1048
1049 /*
1050  * Simplified in-kernel wrapper calls for extended attribute access.
1051  * Both calls pass in a NULL credential, authorizing as "kernel" access.
1052  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
1053  */
1054 int
1055 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
1056     const char *attrname, int *buflen, char *buf, struct thread *td)
1057 {
1058         struct uio      auio;
1059         struct iovec    iov;
1060         int     error;
1061
1062         iov.iov_len = *buflen;
1063         iov.iov_base = buf;
1064
1065         auio.uio_iov = &iov;
1066         auio.uio_iovcnt = 1;
1067         auio.uio_rw = UIO_READ;
1068         auio.uio_segflg = UIO_SYSSPACE;
1069         auio.uio_td = td;
1070         auio.uio_offset = 0;
1071         auio.uio_resid = *buflen;
1072
1073         if ((ioflg & IO_NODELOCKED) == 0)
1074                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1075
1076         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1077
1078         /* authorize attribute retrieval as kernel */
1079         error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
1080             td);
1081
1082         if ((ioflg & IO_NODELOCKED) == 0)
1083                 VOP_UNLOCK(vp, 0, td);
1084
1085         if (error == 0) {
1086                 *buflen = *buflen - auio.uio_resid;
1087         }
1088
1089         return (error);
1090 }
1091
1092 /*
1093  * XXX failure mode if partially written?
1094  */
1095 int
1096 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
1097     const char *attrname, int buflen, char *buf, struct thread *td)
1098 {
1099         struct uio      auio;
1100         struct iovec    iov;
1101         struct mount    *mp;
1102         int     error;
1103
1104         iov.iov_len = buflen;
1105         iov.iov_base = buf;
1106
1107         auio.uio_iov = &iov;
1108         auio.uio_iovcnt = 1;
1109         auio.uio_rw = UIO_WRITE;
1110         auio.uio_segflg = UIO_SYSSPACE;
1111         auio.uio_td = td;
1112         auio.uio_offset = 0;
1113         auio.uio_resid = buflen;
1114
1115         if ((ioflg & IO_NODELOCKED) == 0) {
1116                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1117                         return (error);
1118                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1119         }
1120
1121         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1122
1123         /* authorize attribute setting as kernel */
1124         error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
1125
1126         if ((ioflg & IO_NODELOCKED) == 0) {
1127                 vn_finished_write(mp);
1128                 VOP_UNLOCK(vp, 0, td);
1129         }
1130
1131         return (error);
1132 }
1133
1134 int
1135 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
1136     const char *attrname, struct thread *td)
1137 {
1138         struct mount    *mp;
1139         int     error;
1140
1141         if ((ioflg & IO_NODELOCKED) == 0) {
1142                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1143                         return (error);
1144                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1145         }
1146
1147         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1148
1149         /* authorize attribute removal as kernel */
1150         error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
1151         if (error == EOPNOTSUPP)
1152                 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
1153                     NULL, td);
1154
1155         if ((ioflg & IO_NODELOCKED) == 0) {
1156                 vn_finished_write(mp);
1157                 VOP_UNLOCK(vp, 0, td);
1158         }
1159
1160         return (error);
1161 }