sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012 by Delphix. All rights reserved.
  24  */
  25
  26 /* Portions Copyright 2007 Jeremy Teo */
  27 /* Portions Copyright 2010 Robert Milkowski */
  28
  29 #include <sys/types.h>
  30 #include <sys/param.h>
  31 #include <sys/time.h>
  32 #include <sys/systm.h>
  33 #include <sys/sysmacros.h>
  34 #include <sys/resource.h>
  35 #include <sys/resourcevar.h>
  36 #include <sys/vfs.h>
  37 #include <sys/vnode.h>
  38 #include <sys/file.h>
  39 #include <sys/stat.h>
  40 #include <sys/kmem.h>
  41 #include <sys/taskq.h>
  42 #include <sys/uio.h>
  43 #include <sys/atomic.h>
  44 #include <sys/namei.h>
  45 #include <sys/mman.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/errno.h>
  48 #include <sys/unistd.h>
  49 #include <sys/zfs_dir.h>
  50 #include <sys/zfs_ioctl.h>
  51 #include <sys/fs/zfs.h>
  52 #include <sys/dmu.h>
  53 #include <sys/dmu_objset.h>
  54 #include <sys/spa.h>
  55 #include <sys/txg.h>
  56 #include <sys/dbuf.h>
  57 #include <sys/zap.h>
  58 #include <sys/sa.h>
  59 #include <sys/dirent.h>
  60 #include <sys/policy.h>
  61 #include <sys/sunddi.h>
  62 #include <sys/filio.h>
  63 #include <sys/sid.h>
  64 #include <sys/zfs_ctldir.h>
  65 #include <sys/zfs_fuid.h>
  66 #include <sys/zfs_sa.h>
  67 #include <sys/dnlc.h>
  68 #include <sys/zfs_rlock.h>
  69 #include <sys/extdirent.h>
  70 #include <sys/kidmap.h>
  71 #include <sys/bio.h>
  72 #include <sys/buf.h>
  73 #include <sys/sf_buf.h>
  74 #include <sys/sched.h>
  75 #include <sys/acl.h>
  76 #include <vm/vm_pageout.h>
  77
  78 /*
  79  * Programming rules.
  80  *
  81  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  82  * properly lock its in-core state, create a DMU transaction, do the work,
  83  * record this work in the intent log (ZIL), commit the DMU transaction,
  84  * and wait for the intent log to commit if it is a synchronous operation.
  85  * Moreover, the vnode ops must work in both normal and log replay context.
  86  * The ordering of events is important to avoid deadlocks and references
  87  * to freed memory.  The example below illustrates the following Big Rules:
  88  *
  89  *  (1) A check must be made in each zfs thread for a mounted file system.
  90  *      This is done avoiding races using ZFS_ENTER(zfsvfs).
  91  *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
  92  *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
  93  *      can return EIO from the calling function.
  94  *
  95  *  (2) VN_RELE() should always be the last thing except for zil_commit()
  96  *      (if necessary) and ZFS_EXIT(). This is for 3 reasons:
  97  *      First, if it's the last reference, the vnode/znode
  98  *      can be freed, so the zp may point to freed memory.  Second, the last
  99  *      reference will call zfs_zinactive(), which may induce a lot of work --
 100  *      pushing cached pages (which acquires range locks) and syncing out
 101  *      cached atime changes.  Third, zfs_zinactive() may require a new tx,
 102  *      which could deadlock the system if you were already holding one.
 103  *      If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
 104  *
 105  *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 106  *      as they can span dmu_tx_assign() calls.
 107  *
 108  *  (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
 109  *      This is critical because we don't want to block while holding locks.
 110  *      Note, in particular, that if a lock is sometimes acquired before
 111  *      the tx assigns, and sometimes after (e.g. z_lock), then failing to
 112  *      use a non-blocking assign can deadlock the system.  The scenario:
 113  *
 114  *      Thread A has grabbed a lock before calling dmu_tx_assign().
 115  *      Thread B is in an already-assigned tx, and blocks for this lock.
 116  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 117  *      forever, because the previous txg can't quiesce until B's tx commits.
 118  *
 119  *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 120  *      then drop all locks, call dmu_tx_wait(), and try again.
 121  *
 122  *  (5) If the operation succeeded, generate the intent log entry for it
 123  *      before dropping locks.  This ensures that the ordering of events
 124  *      in the intent log matches the order in which they actually occurred.
 125  *      During ZIL replay the zfs_log_* functions will update the sequence
 126  *      number to indicate the zil transaction has replayed.
 127  *
 128  *  (6) At the end of each vnode op, the DMU tx must always commit,
 129  *      regardless of whether there were any errors.
 130  *
 131  *  (7) After dropping all locks, invoke zil_commit(zilog, foid)
 132  *      to ensure that synchronous semantics are provided when necessary.
 133  *
 134  * In general, this is how things should be ordered in each vnode op:
 135  *
 136  *      ZFS_ENTER(zfsvfs);              // exit if unmounted
 137  * top:
 138  *      zfs_dirent_lock(&dl, ...)       // lock directory entry (may VN_HOLD())
 139  *      rw_enter(...);                  // grab any other locks you need
 140  *      tx = dmu_tx_create(...);        // get DMU tx
 141  *      dmu_tx_hold_*();                // hold each object you might modify
 142  *      error = dmu_tx_assign(tx, TXG_NOWAIT);  // try to assign
 143  *      if (error) {
 144  *              rw_exit(...);           // drop locks
 145  *              zfs_dirent_unlock(dl);  // unlock directory entry
 146  *              VN_RELE(...);           // release held vnodes
 147  *              if (error == ERESTART) {
 148  *                      dmu_tx_wait(tx);
 149  *                      dmu_tx_abort(tx);
 150  *                      goto top;
 151  *              }
 152  *              dmu_tx_abort(tx);       // abort DMU tx
 153  *              ZFS_EXIT(zfsvfs);       // finished in zfs
 154  *              return (error);         // really out of space
 155  *      }
 156  *      error = do_real_work();         // do whatever this VOP does
 157  *      if (error == 0)
 158  *              zfs_log_*(...);         // on success, make ZIL entry
 159  *      dmu_tx_commit(tx);              // commit DMU tx -- error or not
 160  *      rw_exit(...);                   // drop locks
 161  *      zfs_dirent_unlock(dl);          // unlock directory entry
 162  *      VN_RELE(...);                   // release held vnodes
 163  *      zil_commit(zilog, foid);        // synchronous when necessary
 164  *      ZFS_EXIT(zfsvfs);               // finished in zfs
 165  *      return (error);                 // done, report error
 166  */
 167
 168 /* ARGSUSED */
 169 static int
 170 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 171 {
 172         znode_t *zp = VTOZ(*vpp);
 173         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 174
 175         ZFS_ENTER(zfsvfs);
 176         ZFS_VERIFY_ZP(zp);
 177
 178         if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
 179             ((flag & FAPPEND) == 0)) {
 180                 ZFS_EXIT(zfsvfs);
 181                 return (EPERM);
 182         }
 183
 184         if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
 185             ZTOV(zp)->v_type == VREG &&
 186             !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
 187                 if (fs_vscan(*vpp, cr, 0) != 0) {
 188                         ZFS_EXIT(zfsvfs);
 189                         return (EACCES);
 190                 }
 191         }
 192
 193         /* Keep a count of the synchronous opens in the znode */
 194         if (flag & (FSYNC | FDSYNC))
 195                 atomic_inc_32(&zp->z_sync_cnt);
 196
 197         ZFS_EXIT(zfsvfs);
 198         return (0);
 199 }
 200
 201 /* ARGSUSED */
 202 static int
 203 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
 204     caller_context_t *ct)
 205 {
 206         znode_t *zp = VTOZ(vp);
 207         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 208
 209         /*
 210          * Clean up any locks held by this process on the vp.
 211          */
 212         cleanlocks(vp, ddi_get_pid(), 0);
 213         cleanshares(vp, ddi_get_pid());
 214
 215         ZFS_ENTER(zfsvfs);
 216         ZFS_VERIFY_ZP(zp);
 217
 218         /* Decrement the synchronous opens in the znode */
 219         if ((flag & (FSYNC | FDSYNC)) && (count == 1))
 220                 atomic_dec_32(&zp->z_sync_cnt);
 221
 222         if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
 223             ZTOV(zp)->v_type == VREG &&
 224             !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
 225                 VERIFY(fs_vscan(vp, cr, 1) == 0);
 226
 227         ZFS_EXIT(zfsvfs);
 228         return (0);
 229 }
 230
 231 /*
 232  * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 233  * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 234  */
 235 static int
 236 zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
 237 {
 238         znode_t *zp = VTOZ(vp);
 239         uint64_t noff = (uint64_t)*off; /* new offset */
 240         uint64_t file_sz;
 241         int error;
 242         boolean_t hole;
 243
 244         file_sz = zp->z_size;
 245         if (noff >= file_sz)  {
 246                 return (ENXIO);
 247         }
 248
 249         if (cmd == _FIO_SEEK_HOLE)
 250                 hole = B_TRUE;
 251         else
 252                 hole = B_FALSE;
 253
 254         error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
 255
 256         /* end of file? */
 257         if ((error == ESRCH) || (noff > file_sz)) {
 258                 /*
 259                  * Handle the virtual hole at the end of file.
 260                  */
 261                 if (hole) {
 262                         *off = file_sz;
 263                         return (0);
 264                 }
 265                 return (ENXIO);
 266         }
 267
 268         if (noff < *off)
 269                 return (error);
 270         *off = noff;
 271         return (error);
 272 }
 273
 274 /* ARGSUSED */
 275 static int
 276 zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
 277     int *rvalp, caller_context_t *ct)
 278 {
 279         offset_t off;
 280         int error;
 281         zfsvfs_t *zfsvfs;
 282         znode_t *zp;
 283
 284         switch (com) {
 285         case _FIOFFS:
 286                 return (0);
 287
 288                 /*
 289                  * The following two ioctls are used by bfu.  Faking out,
 290                  * necessary to avoid bfu errors.
 291                  */
 292         case _FIOGDIO:
 293         case _FIOSDIO:
 294                 return (0);
 295
 296         case _FIO_SEEK_DATA:
 297         case _FIO_SEEK_HOLE:
 298 #ifdef sun
 299                 if (ddi_copyin((void *)data, &off, sizeof (off), flag))
 300                         return (EFAULT);
 301 #else
 302                 off = *(offset_t *)data;
 303 #endif
 304                 zp = VTOZ(vp);
 305                 zfsvfs = zp->z_zfsvfs;
 306                 ZFS_ENTER(zfsvfs);
 307                 ZFS_VERIFY_ZP(zp);
 308
 309                 /* offset parameter is in/out */
 310                 error = zfs_holey(vp, com, &off);
 311                 ZFS_EXIT(zfsvfs);
 312                 if (error)
 313                         return (error);
 314 #ifdef sun
 315                 if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
 316                         return (EFAULT);
 317 #else
 318                 *(offset_t *)data = off;
 319 #endif
 320                 return (0);
 321         }
 322         return (ENOTTY);
 323 }
 324
 325 static vm_page_t
 326 page_lookup(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
 327 {
 328         vm_object_t obj;
 329         vm_page_t pp;
 330
 331         obj = vp->v_object;
 332         VM_OBJECT_LOCK_ASSERT(obj, MA_OWNED);
 333
 334         for (;;) {
 335                 if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 336                     vm_page_is_valid(pp, (vm_offset_t)off, nbytes)) {
 337                         if (vm_page_sleep_if_busy(pp, FALSE, "zfsmwb"))
 338                                 continue;
 339                         vm_page_busy(pp);
 340                         vm_page_lock_queues();
 341                         vm_page_undirty(pp);
 342                         vm_page_unlock_queues();
 343                 } else {
 344                         if (__predict_false(obj->cache != NULL)) {
 345                                 vm_page_cache_free(obj, OFF_TO_IDX(start),
 346                                     OFF_TO_IDX(start) + 1);
 347                         }
 348                         pp = NULL;
 349                 }
 350                 break;
 351         }
 352         return (pp);
 353 }
 354
 355 static void
 356 page_unlock(vm_page_t pp)
 357 {
 358
 359         vm_page_wakeup(pp);
 360 }
 361
 362 static caddr_t
 363 zfs_map_page(vm_page_t pp, struct sf_buf **sfp)
 364 {
 365
 366         *sfp = sf_buf_alloc(pp, 0);
 367         return ((caddr_t)sf_buf_kva(*sfp));
 368 }
 369
 370 static void
 371 zfs_unmap_page(struct sf_buf *sf)
 372 {
 373
 374         sf_buf_free(sf);
 375 }
 376
 377 /*
 378  * When a file is memory mapped, we must keep the IO data synchronized
 379  * between the DMU cache and the memory mapped pages.  What this means:
 380  *
 381  * On Write:    If we find a memory mapped page, we write to *both*
 382  *              the page and the dmu buffer.
 383  */
 384 static void
 385 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
 386     int segflg, dmu_tx_t *tx)
 387 {
 388         vm_object_t obj;
 389         struct sf_buf *sf;
 390         int off;
 391
 392         ASSERT(vp->v_mount != NULL);
 393         obj = vp->v_object;
 394         ASSERT(obj != NULL);
 395
 396         off = start & PAGEOFFSET;
 397         VM_OBJECT_LOCK(obj);
 398         for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 399                 vm_page_t pp;
 400                 int nbytes = MIN(PAGESIZE - off, len);
 401
 402                 if ((pp = page_lookup(vp, start, off, nbytes)) != NULL) {
 403                         caddr_t va;
 404
 405                         VM_OBJECT_UNLOCK(obj);
 406                         va = zfs_map_page(pp, &sf);
 407                         if (segflg == UIO_NOCOPY) {
 408                                 (void) dmu_write(os, oid, start+off, nbytes,
 409                                     va+off, tx);
 410                         } else {
 411                                 (void) dmu_read(os, oid, start+off, nbytes,
 412                                     va+off, DMU_READ_PREFETCH);
 413                         }
 414                         zfs_unmap_page(sf);
 415                         VM_OBJECT_LOCK(obj);
 416                         page_unlock(pp);
 417                 }
 418                 len -= nbytes;
 419                 off = 0;
 420         }
 421         VM_OBJECT_UNLOCK(obj);
 422 }
 423
 424 /*
 425  * Read with UIO_NOCOPY flag means that sendfile(2) requests
 426  * ZFS to populate a range of page cache pages with data.
 427  *
 428  * NOTE: this function could be optimized to pre-allocate
 429  * all pages in advance, drain VPO_BUSY on all of them,
 430  * map them into contiguous KVA region and populate them
 431  * in one single dmu_read() call.
 432  */
 433 static int
 434 mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
 435 {
 436         znode_t *zp = VTOZ(vp);
 437         objset_t *os = zp->z_zfsvfs->z_os;
 438         struct sf_buf *sf;
 439         vm_object_t obj;
 440         vm_page_t pp;
 441         int64_t start;
 442         caddr_t va;
 443         int len = nbytes;
 444         int off;
 445         int error = 0;
 446
 447         ASSERT(uio->uio_segflg == UIO_NOCOPY);
 448         ASSERT(vp->v_mount != NULL);
 449         obj = vp->v_object;
 450         ASSERT(obj != NULL);
 451         ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
 452
 453         VM_OBJECT_LOCK(obj);
 454         for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
 455                 int bytes = MIN(PAGESIZE, len);
 456
 457 again:
 458                 pp = vm_page_lookup(obj, OFF_TO_IDX(start));
 459                 if (pp != NULL && vm_page_sleep_if_busy(pp, FALSE,
 460                     "zfsmrb"))
 461                         goto again;
 462                 if (pp == NULL) {
 463                         pp = vm_page_alloc(obj, OFF_TO_IDX(start),
 464                             VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL);
 465                         if (pp == NULL) {
 466                                 VM_OBJECT_UNLOCK(obj);
 467                                 VM_WAIT;
 468                                 VM_OBJECT_LOCK(obj);
 469                                 goto again;
 470                         }
 471                 }
 472                 if (pp->valid == 0) {
 473                         vm_page_io_start(pp);
 474                         VM_OBJECT_UNLOCK(obj);
 475                         va = zfs_map_page(pp, &sf);
 476                         error = dmu_read(os, zp->z_id, start, bytes, va,
 477                             DMU_READ_PREFETCH);
 478                         if (bytes != PAGESIZE && error == 0)
 479                                 bzero(va + bytes, PAGESIZE - bytes);
 480                         zfs_unmap_page(sf);
 481                         VM_OBJECT_LOCK(obj);
 482                         vm_page_io_finish(pp);
 483                         vm_page_lock_queues();
 484                         if (error) {
 485                                 vm_page_free(pp);
 486                         } else {
 487                                 pp->valid = VM_PAGE_BITS_ALL;
 488                                 vm_page_activate(pp);
 489                         }
 490                         vm_page_unlock_queues();
 491                 }
 492                 if (error)
 493                         break;
 494                 uio->uio_resid -= bytes;
 495                 uio->uio_offset += bytes;
 496                 len -= bytes;
 497         }
 498         VM_OBJECT_UNLOCK(obj);
 499         return (error);
 500 }
 501
 502 /*
 503  * When a file is memory mapped, we must keep the IO data synchronized
 504  * between the DMU cache and the memory mapped pages.  What this means:
 505  *
 506  * On Read:     We "read" preferentially from memory mapped pages,
 507  *              else we default from the dmu buffer.
 508  *
 509  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 510  *      the file is memory mapped.
 511  */
 512 static int
 513 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
 514 {
 515         znode_t *zp = VTOZ(vp);
 516         objset_t *os = zp->z_zfsvfs->z_os;
 517         vm_object_t obj;
 518         int64_t start;
 519         caddr_t va;
 520         int len = nbytes;
 521         int off;
 522         int error = 0;
 523
 524         ASSERT(vp->v_mount != NULL);
 525         obj = vp->v_object;
 526         ASSERT(obj != NULL);
 527
 528         start = uio->uio_loffset;
 529         off = start & PAGEOFFSET;
 530         VM_OBJECT_LOCK(obj);
 531         for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 532                 vm_page_t pp;
 533                 uint64_t bytes = MIN(PAGESIZE - off, len);
 534
 535                 if (pp = page_lookup(vp, start, off, bytes)) {
 536                         struct sf_buf *sf;
 537                         caddr_t va;
 538                         VM_OBJECT_UNLOCK(obj);
 539                         va = zfs_map_page(pp, &sf);
 540                         error = uiomove(va + off, bytes, UIO_READ, uio);
 541                         zfs_unmap_page(sf);
 542                         VM_OBJECT_LOCK(obj);
 543                         page_unlock(pp);
 544                 } else {
 545                         VM_OBJECT_UNLOCK(obj);
 546                         error = dmu_read_uio(os, zp->z_id, uio, bytes);
 547                         VM_OBJECT_LOCK(obj);
 548                 }
 549                 len -= bytes;
 550                 off = 0;
 551                 if (error)
 552                         break;
 553         }
 554         VM_OBJECT_UNLOCK(obj);
 555         return (error);
 556 }
 557
/*
 * Chunk size used by the read loop in zfs_read(): each iteration
 * transfers at most the bytes remaining up to the next multiple of
 * this value.
 */
offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
 559
 560 /*
 561  * Read bytes from specified file into supplied buffer.
 562  *
 563  *      IN:     vp      - vnode of file to be read from.
 564  *              uio     - structure supplying read location, range info,
 565  *                        and return buffer.
 566  *              ioflag  - SYNC flags; used to provide FRSYNC semantics.
 567  *              cr      - credentials of caller.
 568  *              ct      - caller context
 569  *
 570  *      OUT:    uio     - updated offset and range, buffer filled.
 571  *
 572  *      RETURN: 0 if success
 573  *              error code if failure
 574  *
 575  * Side Effects:
 576  *      vp - atime updated if byte count > 0
 577  */
 578 /* ARGSUSED */
 579 static int
 580 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 581 {
 582         znode_t         *zp = VTOZ(vp);
 583         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
 584         objset_t        *os;
 585         ssize_t         n, nbytes;
 586         int             error;
 587         rl_t            *rl;
 588         xuio_t          *xuio = NULL;
 589
 590         ZFS_ENTER(zfsvfs);
 591         ZFS_VERIFY_ZP(zp);
 592         os = zfsvfs->z_os;
 593
 594         if (zp->z_pflags & ZFS_AV_QUARANTINED) {
 595                 ZFS_EXIT(zfsvfs);
 596                 return (EACCES);
 597         }
 598
 599         /*
 600          * Validate file offset
 601          */
 602         if (uio->uio_loffset < (offset_t)0) {
 603                 ZFS_EXIT(zfsvfs);
 604                 return (EINVAL);
 605         }
 606
 607         /*
 608          * Fasttrack empty reads
 609          */
 610         if (uio->uio_resid == 0) {
 611                 ZFS_EXIT(zfsvfs);
 612                 return (0);
 613         }
 614
 615         /*
 616          * Check for mandatory locks
 617          */
 618         if (MANDMODE(zp->z_mode)) {
 619                 if (error = chklock(vp, FREAD,
 620                     uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
 621                         ZFS_EXIT(zfsvfs);
 622                         return (error);
 623                 }
 624         }
 625
 626         /*
 627          * If we're in FRSYNC mode, sync out this znode before reading it.
 628          */
 629         if (zfsvfs->z_log &&
 630             (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
 631                 zil_commit(zfsvfs->z_log, zp->z_id);
 632
 633         /*
 634          * Lock the range against changes.
 635          */
 636         rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
 637
 638         /*
 639          * If we are reading past end-of-file we can skip
 640          * to the end; but we might still need to set atime.
 641          */
 642         if (uio->uio_loffset >= zp->z_size) {
 643                 error = 0;
 644                 goto out;
 645         }
 646
 647         ASSERT(uio->uio_loffset < zp->z_size);
 648         n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
 649
 650 #ifdef sun
 651         if ((uio->uio_extflg == UIO_XUIO) &&
 652             (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
 653                 int nblk;
 654                 int blksz = zp->z_blksz;
 655                 uint64_t offset = uio->uio_loffset;
 656
 657                 xuio = (xuio_t *)uio;
 658                 if ((ISP2(blksz))) {
 659                         nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
 660                             blksz)) / blksz;
 661                 } else {
 662                         ASSERT(offset + n <= blksz);
 663                         nblk = 1;
 664                 }
 665                 (void) dmu_xuio_init(xuio, nblk);
 666
 667                 if (vn_has_cached_data(vp)) {
 668                         /*
 669                          * For simplicity, we always allocate a full buffer
 670                          * even if we only expect to read a portion of a block.
 671                          */
 672                         while (--nblk >= 0) {
 673                                 (void) dmu_xuio_add(xuio,
 674                                     dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 675                                     blksz), 0, blksz);
 676                         }
 677                 }
 678         }
 679 #endif  /* sun */
 680
 681         while (n > 0) {
 682                 nbytes = MIN(n, zfs_read_chunk_size -
 683                     P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 684
 685 #ifdef __FreeBSD__
 686                 if (uio->uio_segflg == UIO_NOCOPY)
 687                         error = mappedread_sf(vp, nbytes, uio);
 688                 else
 689 #endif /* __FreeBSD__ */
 690                 if (vn_has_cached_data(vp))
 691                         error = mappedread(vp, nbytes, uio);
 692                 else
 693                         error = dmu_read_uio(os, zp->z_id, uio, nbytes);
 694                 if (error) {
 695                         /* convert checksum errors into IO errors */
 696                         if (error == ECKSUM)
 697                                 error = EIO;
 698                         break;
 699                 }
 700
 701                 n -= nbytes;
 702         }
 703 out:
 704         zfs_range_unlock(rl);
 705
 706         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 707         ZFS_EXIT(zfsvfs);
 708         return (error);
 709 }
 710
 711 /*
 712  * Write the bytes to a file.
 713  *
 714  *      IN:     vp      - vnode of file to be written to.
 715  *              uio     - structure supplying write location, range info,
 716  *                        and data buffer.
 717  *              ioflag  - FAPPEND flag set if in append mode.
 718  *              cr      - credentials of caller.
 719  *              ct      - caller context (NFS/CIFS fem monitor only)
 720  *
 721  *      OUT:    uio     - updated offset and range.
 722  *
 723  *      RETURN: 0 if success
 724  *              error code if failure
 725  *
 726  * Timestamps:
 727  *      vp - ctime|mtime updated if byte count > 0
 728  */
 729
 730 /* ARGSUSED */
 731 static int
 732 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 733 {
 734         znode_t         *zp = VTOZ(vp);
 735         rlim64_t        limit = MAXOFFSET_T;
 736         ssize_t         start_resid = uio->uio_resid;
 737         ssize_t         tx_bytes;
 738         uint64_t        end_size;
 739         dmu_tx_t        *tx;
 740         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
 741         zilog_t         *zilog;
 742         offset_t        woff;
 743         ssize_t         n, nbytes;
 744         rl_t            *rl;
 745         int             max_blksz = zfsvfs->z_max_blksz;
 746         int             error;
 747         arc_buf_t       *abuf;
 748         iovec_t         *aiov;
 749         xuio_t          *xuio = NULL;
 750         int             i_iov = 0;
 751         int             iovcnt = uio->uio_iovcnt;
 752         iovec_t         *iovp = uio->uio_iov;
 753         int             write_eof;
 754         int             count = 0;
 755         sa_bulk_attr_t  bulk[4];
 756         uint64_t        mtime[2], ctime[2];
 757
 758         /*
 759          * Fasttrack empty write
 760          */
 761         n = start_resid;
 762         if (n == 0)
 763                 return (0);
 764
 765         if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 766                 limit = MAXOFFSET_T;
 767
 768         ZFS_ENTER(zfsvfs);
 769         ZFS_VERIFY_ZP(zp);
 770
 771         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 772         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 773         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
 774             &zp->z_size, 8);
 775         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 776             &zp->z_pflags, 8);
 777
 778         /*
 779          * If immutable or not appending then return EPERM
 780          */
 781         if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
 782             ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
 783             (uio->uio_loffset < zp->z_size))) {
 784                 ZFS_EXIT(zfsvfs);
 785                 return (EPERM);
 786         }
 787
 788         zilog = zfsvfs->z_log;
 789
 790         /*
 791          * Validate file offset
 792          */
 793         woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
 794         if (woff < 0) {
 795                 ZFS_EXIT(zfsvfs);
 796                 return (EINVAL);
 797         }
 798
 799         /*
 800          * Check for mandatory locks before calling zfs_range_lock()
 801          * in order to prevent a deadlock with locks set via fcntl().
 802          */
 803         if (MANDMODE((mode_t)zp->z_mode) &&
 804             (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
 805                 ZFS_EXIT(zfsvfs);
 806                 return (error);
 807         }
 808
 809 #ifdef sun
 810         /*
 811          * Pre-fault the pages to ensure slow (eg NFS) pages
 812          * don't hold up txg.
 813          * Skip this if uio contains loaned arc_buf.
 814          */
 815         if ((uio->uio_extflg == UIO_XUIO) &&
 816             (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
 817                 xuio = (xuio_t *)uio;
 818         else
 819                 uio_prefaultpages(MIN(n, max_blksz), uio);
 820 #endif  /* sun */
 821
 822         /*
 823          * If in append mode, set the io offset pointer to eof.
 824          */
 825         if (ioflag & FAPPEND) {
 826                 /*
 827                  * Obtain an appending range lock to guarantee file append
 828                  * semantics.  We reset the write offset once we have the lock.
 829                  */
 830                 rl = zfs_range_lock(zp, 0, n, RL_APPEND);
 831                 woff = rl->r_off;
 832                 if (rl->r_len == UINT64_MAX) {
 833                         /*
 834                          * We overlocked the file because this write will cause
 835                          * the file block size to increase.
 836                          * Note that zp_size cannot change with this lock held.
 837                          */
 838                         woff = zp->z_size;
 839                 }
 840                 uio->uio_loffset = woff;
 841         } else {
 842                 /*
 843                  * Note that if the file block size will change as a result of
 844                  * this write, then this range lock will lock the entire file
 845                  * so that we can re-write the block safely.
 846                  */
 847                 rl = zfs_range_lock(zp, woff, n, RL_WRITER);
 848         }
 849
 850         if (woff >= limit) {
 851                 zfs_range_unlock(rl);
 852                 ZFS_EXIT(zfsvfs);
 853                 return (EFBIG);
 854         }
 855
 856         if ((woff + n) > limit || woff > (limit - n))
 857                 n = limit - woff;
 858
 859         /* Will this write extend the file length? */
 860         write_eof = (woff + n > zp->z_size);
 861
 862         end_size = MAX(zp->z_size, woff + n);
 863
 864         /*
 865          * Write the file in reasonable size chunks.  Each chunk is written
 866          * in a separate transaction; this keeps the intent log records small
 867          * and allows us to do more fine-grained space accounting.
 868          */
 869         while (n > 0) {
 870                 abuf = NULL;
 871                 woff = uio->uio_loffset;
 872 again:
 873                 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
 874                     zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 875                         if (abuf != NULL)
 876                                 dmu_return_arcbuf(abuf);
 877                         error = EDQUOT;
 878                         break;
 879                 }
 880
 881                 if (xuio && abuf == NULL) {
 882                         ASSERT(i_iov < iovcnt);
 883                         aiov = &iovp[i_iov];
 884                         abuf = dmu_xuio_arcbuf(xuio, i_iov);
 885                         dmu_xuio_clear(xuio, i_iov);
 886                         DTRACE_PROBE3(zfs_cp_write, int, i_iov,
 887                             iovec_t *, aiov, arc_buf_t *, abuf);
 888                         ASSERT((aiov->iov_base == abuf->b_data) ||
 889                             ((char *)aiov->iov_base - (char *)abuf->b_data +
 890                             aiov->iov_len == arc_buf_size(abuf)));
 891                         i_iov++;
 892                 } else if (abuf == NULL && n >= max_blksz &&
 893                     woff >= zp->z_size &&
 894                     P2PHASE(woff, max_blksz) == 0 &&
 895                     zp->z_blksz == max_blksz) {
 896                         /*
 897                          * This write covers a full block.  "Borrow" a buffer
 898                          * from the dmu so that we can fill it before we enter
 899                          * a transaction.  This avoids the possibility of
 900                          * holding up the transaction if the data copy hangs
 901                          * up on a pagefault (e.g., from an NFS server mapping).
 902                          */
 903                         size_t cbytes;
 904
 905                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 906                             max_blksz);
 907                         ASSERT(abuf != NULL);
 908                         ASSERT(arc_buf_size(abuf) == max_blksz);
 909                         if (error = uiocopy(abuf->b_data, max_blksz,
 910                             UIO_WRITE, uio, &cbytes)) {
 911                                 dmu_return_arcbuf(abuf);
 912                                 break;
 913                         }
 914                         ASSERT(cbytes == max_blksz);
 915                 }
 916
 917                 /*
 918                  * Start a transaction.
 919                  */
 920                 tx = dmu_tx_create(zfsvfs->z_os);
 921                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 922                 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 923                 zfs_sa_upgrade_txholds(tx, zp);
 924                 error = dmu_tx_assign(tx, TXG_NOWAIT);
 925                 if (error) {
 926                         if (error == ERESTART) {
 927                                 dmu_tx_wait(tx);
 928                                 dmu_tx_abort(tx);
 929                                 goto again;
 930                         }
 931                         dmu_tx_abort(tx);
 932                         if (abuf != NULL)
 933                                 dmu_return_arcbuf(abuf);
 934                         break;
 935                 }
 936
 937                 /*
 938                  * If zfs_range_lock() over-locked we grow the blocksize
 939                  * and then reduce the lock range.  This will only happen
 940                  * on the first iteration since zfs_range_reduce() will
 941                  * shrink down r_len to the appropriate size.
 942                  */
 943                 if (rl->r_len == UINT64_MAX) {
 944                         uint64_t new_blksz;
 945
 946                         if (zp->z_blksz > max_blksz) {
 947                                 ASSERT(!ISP2(zp->z_blksz));
 948                                 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
 949                         } else {
 950                                 new_blksz = MIN(end_size, max_blksz);
 951                         }
 952                         zfs_grow_blocksize(zp, new_blksz, tx);
 953                         zfs_range_reduce(rl, woff, n);
 954                 }
 955
 956                 /*
 957                  * XXX - should we really limit each write to z_max_blksz?
 958                  * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
 959                  */
 960                 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
 961
 962                 if (woff + nbytes > zp->z_size)
 963                         vnode_pager_setsize(vp, woff + nbytes);
 964
 965                 if (abuf == NULL) {
 966                         tx_bytes = uio->uio_resid;
 967                         error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 968                             uio, nbytes, tx);
 969                         tx_bytes -= uio->uio_resid;
 970                 } else {
 971                         tx_bytes = nbytes;
 972                         ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
 973                         /*
 974                          * If this is not a full block write, but we are
 975                          * extending the file past EOF and this data starts
 976                          * block-aligned, use assign_arcbuf().  Otherwise,
 977                          * write via dmu_write().
 978                          */
 979                         if (tx_bytes < max_blksz && (!write_eof ||
 980                             aiov->iov_base != abuf->b_data)) {
 981                                 ASSERT(xuio);
 982                                 dmu_write(zfsvfs->z_os, zp->z_id, woff,
 983                                     aiov->iov_len, aiov->iov_base, tx);
 984                                 dmu_return_arcbuf(abuf);
 985                                 xuio_stat_wbuf_copied();
 986                         } else {
 987                                 ASSERT(xuio || tx_bytes == max_blksz);
 988                                 dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
 989                                     woff, abuf, tx);
 990                         }
 991                         ASSERT(tx_bytes <= uio->uio_resid);
 992                         uioskip(uio, tx_bytes);
 993                 }
 994                 if (tx_bytes && vn_has_cached_data(vp)) {
 995                         update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
 996                             zp->z_id, uio->uio_segflg, tx);
 997                 }
 998
 999                 /*
1000                  * If we made no progress, we're done.  If we made even
1001                  * partial progress, update the znode and ZIL accordingly.
1002                  */
1003                 if (tx_bytes == 0) {
1004                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1005                             (void *)&zp->z_size, sizeof (uint64_t), tx);
1006                         dmu_tx_commit(tx);
1007                         ASSERT(error != 0);
1008                         break;
1009                 }
1010
1011                 /*
1012                  * Clear Set-UID/Set-GID bits on successful write if not
1013                  * privileged and at least one of the execute bits is set.
1014                  *
1015                  * It would be nice to do this after all writes have
1016                  * been done, but that would still expose the ISUID/ISGID
1017                  * to another app after the partial write is committed.
1018                  *
1019                  * Note: we don't call zfs_fuid_map_id() here because
1020                  * user 0 is not an ephemeral uid.
1021                  */
1022                 mutex_enter(&zp->z_acl_lock);
1023                 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1024                     (S_IXUSR >> 6))) != 0 &&
1025                     (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1026                     secpolicy_vnode_setid_retain(vp, cr,
1027                     (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1028                         uint64_t newmode;
1029                         zp->z_mode &= ~(S_ISUID | S_ISGID);
1030                         newmode = zp->z_mode;
1031                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1032                             (void *)&newmode, sizeof (uint64_t), tx);
1033                 }
1034                 mutex_exit(&zp->z_acl_lock);
1035
1036                 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1037                     B_TRUE);
1038
1039                 /*
1040                  * Update the file size (zp_size) if it has changed;
1041                  * account for possible concurrent updates.
1042                  */
1043                 while ((end_size = zp->z_size) < uio->uio_loffset) {
1044                         (void) atomic_cas_64(&zp->z_size, end_size,
1045                             uio->uio_loffset);
1046                         ASSERT(error == 0);
1047                 }
1048                 /*
1049                  * If we are replaying and eof is non zero then force
1050                  * the file size to the specified eof. Note, there's no
1051                  * concurrency during replay.
1052                  */
1053                 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1054                         zp->z_size = zfsvfs->z_replay_eof;
1055
1056                 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1057
1058                 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1059                 dmu_tx_commit(tx);
1060
1061                 if (error != 0)
1062                         break;
1063                 ASSERT(tx_bytes == nbytes);
1064                 n -= nbytes;
1065
1066 #ifdef sun
1067                 if (!xuio && n > 0)
1068                         uio_prefaultpages(MIN(n, max_blksz), uio);
1069 #endif  /* sun */
1070         }
1071
1072         zfs_range_unlock(rl);
1073
1074         /*
1075          * If we're in replay mode, or we made no progress, return error.
1076          * Otherwise, it's at least a partial write, so it's successful.
1077          */
1078         if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1079                 ZFS_EXIT(zfsvfs);
1080                 return (error);
1081         }
1082
1083         if (ioflag & (FSYNC | FDSYNC) ||
1084             zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1085                 zil_commit(zilog, zp->z_id);
1086
1087         ZFS_EXIT(zfsvfs);
1088         return (0);
1089 }
1090
     /*
      * Finish a zfs_get_data() request: release everything that was acquired
      * while building the TX_WRITE record and free the zgd.
      *
      *      IN:     zgd     - per-request state allocated by zfs_get_data();
      *                        zgd_private holds the znode.
      *              error   - 0 if the request succeeded, otherwise the
      *                        error it failed with.
      *
      * zfs_get_data() either calls this directly or passes it to dmu_sync()
      * as the done callback.  The zgd is freed here, so the caller must not
      * touch it afterwards.
      */
1091 void
1092 zfs_get_done(zgd_t *zgd, int error)
1093 {
1094         znode_t *zp = zgd->zgd_private;
             /* os is read from the znode before the vnode is released below. */
1095         objset_t *os = zp->z_zfsvfs->z_os;
1096         int vfslocked;
1097
             /* Only the indirect-write path of zfs_get_data() sets zgd_db. */
1098         if (zgd->zgd_db)
1099                 dmu_buf_rele(zgd->zgd_db, zgd);
1100
1101         zfs_range_unlock(zgd->zgd_rl);
1102
1103         vfslocked = VFS_LOCK_GIANT(zp->z_zfsvfs->z_vfs);
1104         /*
1105          * Release the vnode asynchronously as we currently have the
1106          * txg stopped from syncing.
1107          */
1108         VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1109
             /* On success, hand the block pointer (if any) to the ZIL. */
1110         if (error == 0 && zgd->zgd_bp)
1111                 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1112
1113         kmem_free(zgd, sizeof (zgd_t));
1114         VFS_UNLOCK_GIANT(vfslocked);
1115 }
1116
1117 #ifdef DEBUG
     /*
      * Debug-only fault injection: when non-zero, the next indirect write
      * handled by zfs_get_data() fails with EIO and the flag is cleared.
      */
1118 static int zil_fault_io = 0;
1119 #endif
1120
1121 /*
1122  * Get data to generate a TX_WRITE intent log record.
      *
      *      IN:     arg     - zfsvfs_t of the file system the record is for.
      *              lr      - write log record; supplies the object, offset
      *                        and length.  For indirect writes its block
      *                        pointer may be filled in and its type may be
      *                        changed to TX_WRITE2.
      *              buf     - destination for the data of an immediate
      *                        write, or NULL to request an indirect write.
      *              zio     - passed to dmu_sync() for indirect writes;
      *                        asserted to be non-NULL.
      *
      *      RETURN: 0 if success
      *              ENOENT if the object cannot be looked up, has been
      *              unlinked, or the record starts at or beyond the current
      *              file size
      *              other error code if failure
      *
      * Once the zgd has been allocated, every path ends in zfs_get_done():
      * it is either called directly at the bottom of this function, or it is
      * the done callback given to dmu_sync() when dmu_sync() returns 0.
1123  */
1124 int
1125 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1126 {
1127         zfsvfs_t *zfsvfs = arg;
1128         objset_t *os = zfsvfs->z_os;
1129         znode_t *zp;
1130         uint64_t object = lr->lr_foid;
1131         uint64_t offset = lr->lr_offset;
1132         uint64_t size = lr->lr_length;
1133         blkptr_t *bp = &lr->lr_blkptr;
1134         dmu_buf_t *db;
1135         zgd_t *zgd;
1136         int error = 0;
1137
1138         ASSERT(zio != NULL);
1139         ASSERT(size != 0);
1140
1141         /*
1142          * Nothing to do if the file has been removed
1143          */
1144         if (zfs_zget(zfsvfs, object, &zp) != 0)
1145                 return (ENOENT);
1146         if (zp->z_unlinked) {
1147                 /*
1148                  * Release the vnode asynchronously as we currently have the
1149                  * txg stopped from syncing.
1150                  */
1151                 VN_RELE_ASYNC(ZTOV(zp),
1152                     dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1153                 return (ENOENT);
1154         }
1155
             /*
              * From here on the znode reference is owned by the zgd and is
              * dropped by zfs_get_done().
              */
1156         zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1157         zgd->zgd_zilog = zfsvfs->z_log;
1158         zgd->zgd_private = zp;
1159
1160         /*
1161          * Write records come in two flavors: immediate and indirect.
1162          * For small writes it's cheaper to store the data with the
1163          * log record (immediate); for large writes it's cheaper to
1164          * sync the data and get a pointer to it (indirect) so that
1165          * we don't have to write the data twice.
1166          */
1167         if (buf != NULL) { /* immediate write */
1168                 zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1169                 /* test for truncation needs to be done while range locked */
1170                 if (offset >= zp->z_size) {
1171                         error = ENOENT;
1172                 } else {
1173                         error = dmu_read(os, object, offset, size, buf,
1174                             DMU_READ_NO_PREFETCH);
1175                 }
1176                 ASSERT(error == 0 || error == ENOENT);
1177         } else { /* indirect write */
1178                 /*
1179                  * Have to lock the whole block to ensure when it's
1180                  * written out and its checksum is being calculated
1181                  * that no one can change the data. We need to re-check
1182                  * blocksize after we get the lock in case it's changed!
1183                  */
1184                 for (;;) {
1185                         uint64_t blkoff;
1186                         size = zp->z_blksz;
                             /*
                              * Round offset down to the start of its block.
                              * When the block size is not a power of two the
                              * whole offset is subtracted, so the locked
                              * range starts at 0.
                              */
1187                         blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1188                         offset -= blkoff;
1189                         zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1190                             RL_READER);
1191                         if (zp->z_blksz == size)
1192                                 break;
                             /*
                              * The block size changed while we were getting
                              * the lock: restore offset, drop the lock and
                              * try again with the new size.
                              */
1193                         offset += blkoff;
1194                         zfs_range_unlock(zgd->zgd_rl);
1195                 }
1196                 /* test for truncation needs to be done while range locked */
1197                 if (lr->lr_offset >= zp->z_size)
1198                         error = ENOENT;
1199 #ifdef DEBUG
                     /* One-shot injected failure; see zil_fault_io. */
1200                 if (zil_fault_io) {
1201                         error = EIO;
1202                         zil_fault_io = 0;
1203                 }
1204 #endif
1205                 if (error == 0)
1206                         error = dmu_buf_hold(os, object, offset, zgd, &db,
1207                             DMU_READ_NO_PREFETCH);
1208
1209                 if (error == 0) {
                             /*
                              * If the dbuf already has a block pointer, copy
                              * it into the log record.
                              */
1210                         blkptr_t *obp = dmu_buf_get_blkptr(db);
1211                         if (obp) {
1212                                 ASSERT(BP_IS_HOLE(bp));
1213                                 *bp = *obp;
1214                         }
1215
                             /* Record what zfs_get_done() has to release. */
1216                         zgd->zgd_db = db;
1217                         zgd->zgd_bp = bp;
1218
1219                         ASSERT(db->db_offset == offset);
1220                         ASSERT(db->db_size == size);
1221
1222                         error = dmu_sync(zio, lr->lr_common.lrc_txg,
1223                             zfs_get_done, zgd);
1224                         ASSERT(error || lr->lr_length <= zp->z_blksz);
1225
1226                         /*
1227                          * On success, we need to wait for the write I/O
1228                          * initiated by dmu_sync() to complete before we can
1229                          * release this dbuf.  We will finish everything up
1230                          * in the zfs_get_done() callback.
1231                          */
1232                         if (error == 0)
1233                                 return (0);
1234
                             /*
                              * EALREADY from dmu_sync() is not reported as a
                              * failure: the record is retyped to TX_WRITE2
                              * and the error is cleared.
                              */
1235                         if (error == EALREADY) {
1236                                 lr->lr_common.lrc_txtype = TX_WRITE2;
1237                                 error = 0;
1238                         }
1239                 }
1240         }
1241
             /*
              * Immediate writes, and indirect writes whose zgd was not handed
              * over to dmu_sync(), are cleaned up synchronously here.
              */
1242         zfs_get_done(zgd, error);
1243
1244         return (error);
1245 }
1246
     /*
      * Check whether the caller may access a file in the requested way.
      *
      *      IN:     vp      - vnode of the file to check.
      *              mode    - access being requested.
      *              flag    - if V_ACE_MASK is set, mode and flag are passed
      *                        to zfs_zaccess(); otherwise they are passed to
      *                        zfs_zaccess_rwx().
      *              cr      - credentials of caller.
      *              ct      - caller context [UNUSED].
      *
      *      RETURN: 0 if success
      *              error code if failure
      */
1247 /*ARGSUSED*/
1248 static int
1249 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1250     caller_context_t *ct)
1251 {
1252         znode_t *zp = VTOZ(vp);
1253         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1254         int error;
1255
1256         ZFS_ENTER(zfsvfs);
1257         ZFS_VERIFY_ZP(zp);
1258
1259         if (flag & V_ACE_MASK)
1260                 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1261         else
1262                 error = zfs_zaccess_rwx(zp, mode, flag, cr);
1263
1264         ZFS_EXIT(zfsvfs);
1265         return (error);
1266 }
1267
1268 /*
1269  * If vnode is for a device return a specfs vnode instead.
      *
      *      IN:     vpp     - pointer to a held vnode.
      *              cr      - credentials of caller.
      *
      *      OUT:    vpp     - unchanged if *vpp is not a device vnode;
      *                        otherwise the vnode returned by specvp(),
      *                        which is NULL on failure.
      *
      *      RETURN: 0 if success
      *              ENOSYS if specvp() returned NULL
      *
      * For a device vnode the reference on the original vnode is released
      * whether or not specvp() succeeds.
1270  */
1271 static int
1272 specvp_check(vnode_t **vpp, cred_t *cr)
1273 {
1274         int error = 0;
1275
1276         if (IS_DEVVP(*vpp)) {
1277                 struct vnode *svp;
1278
1279                 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1280                 VN_RELE(*vpp);
1281                 if (svp == NULL)
1282                         error = ENOSYS;
1283                 *vpp = svp;
1284         }
1285         return (error);
1286 }
1287
1288
1289 /*
1290  * Lookup an entry in a directory, or an extended attribute directory.
1291  * If it exists, return a held vnode reference for it.
1292  *
1293  *      IN:     dvp     - vnode of directory to search.
1294  *              nm      - name of entry to lookup.
1295  *              cnp     - pathname component being looked up; cn_flags is
      *                        read and may get SAVENAME added, cn_lkflags is
      *                        used to lock the vnode that is returned.
1296  *              nameiop - namei operation; CREATE, RENAME and DELETE get
      *                        special treatment on the last component.
1297  *              cr      - credentials of caller.
1298  *              td      - calling thread [UNUSED].
1299  *              flags   - LOOKUP_XATTR set if looking for an attribute.
1300  *
1301  *      (direntflags and realpnp are always passed to zfs_dirlook() as NULL.)
1302  *
1303  *      OUT:    vpp     - vnode of located entry, NULL if not found.
1304  *
1305  *      RETURN: 0 if success
      *              EJUSTRETURN if the last component was not found and
      *              nameiop is CREATE or RENAME
1306  *              error code if failure
1307  *
1308  * Timestamps:
1309  *      NA
1310  */
1311 /* ARGSUSED */
1312 static int
1313 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1314     int nameiop, cred_t *cr, kthread_t *td, int flags)
1315 {
1316         znode_t *zdp = VTOZ(dvp);
1317         zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1318         int     error = 0;
1319         int *direntflags = NULL;
1320         void *realpnp = NULL;
1321
1322         /* fast path */
             /*
              * Only for lookups with neither LOOKUP_XATTR nor FIGNORECASE.
              * Everything in this branch runs before ZFS_ENTER() and returns
              * directly; on a cache miss we fall through to the slow path.
              */
1323         if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1324
1325                 if (dvp->v_type != VDIR) {
1326                         return (ENOTDIR);
1327                 } else if (zdp->z_sa_hdl == NULL) {
1328                         return (EIO);
1329                 }
1330
                     /* An empty name or "." refers to the directory itself. */
1331                 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1332                         error = zfs_fastaccesschk_execute(zdp, cr);
1333                         if (!error) {
1334                                 *vpp = dvp;
1335                                 VN_HOLD(*vpp);
1336                                 return (0);
1337                         }
1338                         return (error);
1339                 } else {
1340                         vnode_t *tvp = dnlc_lookup(dvp, nm);
1341
1342                         if (tvp) {
1343                                 error = zfs_fastaccesschk_execute(zdp, cr);
1344                                 if (error) {
1345                                         VN_RELE(tvp);
1346                                         return (error);
1347                                 }
                                     /* DNLC_NO_VNODE is a cached negative entry. */
1348                                 if (tvp == DNLC_NO_VNODE) {
1349                                         VN_RELE(tvp);
1350                                         return (ENOENT);
1351                                 } else {
1352                                         *vpp = tvp;
1353                                         return (specvp_check(vpp, cr));
1354                                 }
1355                         }
1356                 }
1357         }
1358
1359         DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1360
1361         ZFS_ENTER(zfsvfs);
1362         ZFS_VERIFY_ZP(zdp);
1363
1364         *vpp = NULL;
1365
1366         if (flags & LOOKUP_XATTR) {
1367 #ifdef TODO
1368                 /*
1369                  * If the xattr property is off, refuse the lookup request.
1370                  */
1371                 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1372                         ZFS_EXIT(zfsvfs);
1373                         return (EINVAL);
1374                 }
1375 #endif
1376
1377                 /*
1378                  * We don't allow recursive attributes.
1379                  * Maybe someday we will.
1380                  */
1381                 if (zdp->z_pflags & ZFS_XATTR) {
1382                         ZFS_EXIT(zfsvfs);
1383                         return (EINVAL);
1384                 }
1385
1386                 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1387                         ZFS_EXIT(zfsvfs);
1388                         return (error);
1389                 }
1390
1391                 /*
1392                  * Do we have permission to get into attribute directory?
1393                  */
1394
1395                 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1396                     B_FALSE, cr)) {
1397                         VN_RELE(*vpp);
1398                         *vpp = NULL;
1399                 }
1400
                     /*
                      * The attribute-directory lookup returns here; none of
                      * the code below (name lookup, vnode locking, name cache)
                      * runs for it.
                      */
1401                 ZFS_EXIT(zfsvfs);
1402                 return (error);
1403         }
1404
1405         if (dvp->v_type != VDIR) {
1406                 ZFS_EXIT(zfsvfs);
1407                 return (ENOTDIR);
1408         }
1409
1410         /*
1411          * Check accessibility of directory.
1412          */
1413
1414         if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1415                 ZFS_EXIT(zfsvfs);
1416                 return (error);
1417         }
1418
             /* On a UTF-8-only file system, reject names that do not validate. */
1419         if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1420             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1421                 ZFS_EXIT(zfsvfs);
1422                 return (EILSEQ);
1423         }
1424
1425         error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1426         if (error == 0)
1427                 error = specvp_check(vpp, cr);
1428
1429         /* Translate errors and add SAVENAME when needed. */
             /*
              * Last component only:
              *   CREATE/RENAME + ENOENT  -> EJUSTRETURN, SAVENAME set
              *   CREATE/RENAME/DELETE + success -> SAVENAME set
              * All other combinations leave error and cn_flags untouched.
              */
1430         if (cnp->cn_flags & ISLASTCN) {
1431                 switch (nameiop) {
1432                 case CREATE:
1433                 case RENAME:
1434                         if (error == ENOENT) {
1435                                 error = EJUSTRETURN;
1436                                 cnp->cn_flags |= SAVENAME;
1437                                 break;
1438                         }
1439                         /* FALLTHROUGH */
1440                 case DELETE:
1441                         if (error == 0)
1442                                 cnp->cn_flags |= SAVENAME;
1443                         break;
1444                 }
1445         }
             /*
              * On success, lock the vnode being returned, except when the
              * name is ".".  ZFS_EXIT() is done before taking the vnode lock.
              */
1446         if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
1447                 int ltype = 0;
1448
                     /*
                      * For ".." the lock on dvp is dropped while *vpp is
                      * locked and then re-taken with the lock type it had
                      * before (NOTE(review): presumably to avoid a lock-order
                      * reversal with the parent -- confirm).
                      */
1449                 if (cnp->cn_flags & ISDOTDOT) {
1450                         ltype = VOP_ISLOCKED(dvp);
1451                         VOP_UNLOCK(dvp, 0);
1452                 }
1453                 ZFS_EXIT(zfsvfs);
1454                 error = zfs_vnode_lock(*vpp, cnp->cn_lkflags);
1455                 if (cnp->cn_flags & ISDOTDOT)
1456                         vn_lock(dvp, ltype | LK_RETRY);
1457                 if (error != 0) {
1458                         VN_RELE(*vpp);
1459                         *vpp = NULL;
1460                         return (error);
1461                 }
1462         } else {
1463                 ZFS_EXIT(zfsvfs);
1464         }
1465
1466 #ifdef FREEBSD_NAMECACHE
1467         /*
1468          * Insert name into cache (as non-existent) if appropriate.
1469          */
1470         if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
1471                 cache_enter(dvp, *vpp, cnp);
1472         /*
1473          * Insert name into cache if appropriate.
              * The last component of a DELETE or RENAME is not cached.
1474          */
1475         if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1476                 if (!(cnp->cn_flags & ISLASTCN) ||
1477                     (nameiop != DELETE && nameiop != RENAME)) {
1478                         cache_enter(dvp, *vpp, cnp);
1479                 }
1480         }
1481 #endif
1482
1483         return (error);
1484 }
1485
1486 /*
1487  * Attempt to create a new entry in a directory.  If the entry
1488  * already exists, truncate the file if permissible, else return
1489  * an error.  Return the vp of the created or trunc'd file.
1490  *
1491  *      IN:     dvp     - vnode of directory to put new file entry in.
1492  *              name    - name of new file entry.
1493  *              vap     - attributes of new file.
1494  *              excl    - flag indicating exclusive or non-exclusive mode.
1495  *              mode    - mode to open file with.
1496  *              cr      - credentials of caller.
1497  *              flag    - large file flag [UNUSED].
1498  *              ct      - caller context
1499  *              vsecp   - ACL to be set
1500  *
1501  *      OUT:    vpp     - vnode of created or trunc'd entry.
1502  *
1503  *      RETURN: 0 if success
1504  *              error code if failure
1505  *
1506  * Timestamps:
1507  *      dvp - ctime|mtime updated if new entry created
1508  *       vp - ctime|mtime always, atime if new
1509  */
1510
1511 /* ARGSUSED */
1512 static int
1513 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1514     vnode_t **vpp, cred_t *cr, kthread_t *td)
1515 {
1516         znode_t         *zp, *dzp = VTOZ(dvp);
1517         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1518         zilog_t         *zilog;
1519         objset_t        *os;
1520         zfs_dirlock_t   *dl;
1521         dmu_tx_t        *tx;
1522         int             error;
1523         ksid_t          *ksid;
1524         uid_t           uid;
1525         gid_t           gid = crgetgid(cr);
1526         zfs_acl_ids_t   acl_ids;
1527         boolean_t       fuid_dirtied;
1528         boolean_t       have_acl = B_FALSE;
1529         void            *vsecp = NULL;
1530         int             flag = 0;
1531
1532         /*
1533          * If we have an ephemeral id, ACL, or XVATTR then
1534          * make sure file system is at proper version
1535          */
1536
1537         ksid = crgetsid(cr, KSID_OWNER);
1538         if (ksid)
1539                 uid = ksid_getid(ksid);
1540         else
1541                 uid = crgetuid(cr);
1542
1543         if (zfsvfs->z_use_fuids == B_FALSE &&
1544             (vsecp || (vap->va_mask & AT_XVATTR) ||
1545             IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1546                 return (EINVAL);
1547
1548         ZFS_ENTER(zfsvfs);
1549         ZFS_VERIFY_ZP(dzp);
1550         os = zfsvfs->z_os;
1551         zilog = zfsvfs->z_log;
1552
1553         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1554             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1555                 ZFS_EXIT(zfsvfs);
1556                 return (EILSEQ);
1557         }
1558
1559         if (vap->va_mask & AT_XVATTR) {
1560                 if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1561                     crgetuid(cr), cr, vap->va_type)) != 0) {
1562                         ZFS_EXIT(zfsvfs);
1563                         return (error);
1564                 }
1565         }
1566 top:
1567         *vpp = NULL;
1568
1569         if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1570                 vap->va_mode &= ~S_ISVTX;
1571
1572         if (*name == '\0') {
1573                 /*
1574                  * Null component name refers to the directory itself.
1575                  */
1576                 VN_HOLD(dvp);
1577                 zp = dzp;
1578                 dl = NULL;
1579                 error = 0;
1580         } else {
1581                 /* possible VN_HOLD(zp) */
1582                 int zflg = 0;
1583
1584                 if (flag & FIGNORECASE)
1585                         zflg |= ZCILOOK;
1586
1587                 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1588                     NULL, NULL);
1589                 if (error) {
1590                         if (have_acl)
1591                                 zfs_acl_ids_free(&acl_ids);
1592                         if (strcmp(name, "..") == 0)
1593                                 error = EISDIR;
1594                         ZFS_EXIT(zfsvfs);
1595                         return (error);
1596                 }
1597         }
1598
1599         if (zp == NULL) {
1600                 uint64_t txtype;
1601
1602                 /*
1603                  * Create a new file object and update the directory
1604                  * to reference it.
1605                  */
1606                 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1607                         if (have_acl)
1608                                 zfs_acl_ids_free(&acl_ids);
1609                         goto out;
1610                 }
1611
1612                 /*
1613                  * We only support the creation of regular files in
1614                  * extended attribute directories.
1615                  */
1616
1617                 if ((dzp->z_pflags & ZFS_XATTR) &&
1618                     (vap->va_type != VREG)) {
1619                         if (have_acl)
1620                                 zfs_acl_ids_free(&acl_ids);
1621                         error = EINVAL;
1622                         goto out;
1623                 }
1624
1625                 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1626                     cr, vsecp, &acl_ids)) != 0)
1627                         goto out;
1628                 have_acl = B_TRUE;
1629
1630                 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1631                         zfs_acl_ids_free(&acl_ids);
1632                         error = EDQUOT;
1633                         goto out;
1634                 }
1635
1636                 tx = dmu_tx_create(os);
1637
1638                 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1639                     ZFS_SA_BASE_ATTR_SIZE);
1640
1641                 fuid_dirtied = zfsvfs->z_fuid_dirty;
1642                 if (fuid_dirtied)
1643                         zfs_fuid_txhold(zfsvfs, tx);
1644                 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1645                 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1646                 if (!zfsvfs->z_use_sa &&
1647                     acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1648                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1649                             0, acl_ids.z_aclp->z_acl_bytes);
1650                 }
1651                 error = dmu_tx_assign(tx, TXG_NOWAIT);
1652                 if (error) {
1653                         zfs_dirent_unlock(dl);
1654                         if (error == ERESTART) {
1655                                 dmu_tx_wait(tx);
1656                                 dmu_tx_abort(tx);
1657                                 goto top;
1658                         }
1659                         zfs_acl_ids_free(&acl_ids);
1660                         dmu_tx_abort(tx);
1661                         ZFS_EXIT(zfsvfs);
1662                         return (error);
1663                 }
1664                 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1665
1666                 if (fuid_dirtied)
1667                         zfs_fuid_sync(zfsvfs, tx);
1668
1669                 (void) zfs_link_create(dl, zp, tx, ZNEW);
1670                 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1671                 if (flag & FIGNORECASE)
1672                         txtype |= TX_CI;
1673                 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1674                     vsecp, acl_ids.z_fuidp, vap);
1675                 zfs_acl_ids_free(&acl_ids);
1676                 dmu_tx_commit(tx);
1677         } else {
1678                 int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1679
1680                 if (have_acl)
1681                         zfs_acl_ids_free(&acl_ids);
1682                 have_acl = B_FALSE;
1683
1684                 /*
1685                  * A directory entry already exists for this name.
1686                  */
1687                 /*
1688                  * Can't truncate an existing file if in exclusive mode.
1689                  */
1690                 if (excl == EXCL) {
1691                         error = EEXIST;
1692                         goto out;
1693                 }
1694                 /*
1695                  * Can't open a directory for writing.
1696                  */
1697                 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1698                         error = EISDIR;
1699                         goto out;
1700                 }
1701                 /*
1702                  * Verify requested access to file.
1703                  */
1704                 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1705                         goto out;
1706                 }
1707
1708                 mutex_enter(&dzp->z_lock);
1709                 dzp->z_seq++;
1710                 mutex_exit(&dzp->z_lock);
1711
1712                 /*
1713                  * Truncate regular files if requested.
1714                  */
1715                 if ((ZTOV(zp)->v_type == VREG) &&
1716                     (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1717                         /* we can't hold any locks when calling zfs_freesp() */
1718                         zfs_dirent_unlock(dl);
1719                         dl = NULL;
1720                         error = zfs_freesp(zp, 0, 0, mode, TRUE);
1721                         if (error == 0) {
1722                                 vnevent_create(ZTOV(zp), ct);
1723                         }
1724                 }
1725         }
1726 out:
1727         if (dl)
1728                 zfs_dirent_unlock(dl);
1729
1730         if (error) {
1731                 if (zp)
1732                         VN_RELE(ZTOV(zp));
1733         } else {
1734                 *vpp = ZTOV(zp);
1735                 error = specvp_check(vpp, cr);
1736         }
1737
1738         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1739                 zil_commit(zilog, 0);
1740
1741         ZFS_EXIT(zfsvfs);
1742         return (error);
1743 }
1744
1745 /*
1746  * Remove an entry from a directory.
1747  *
1748  *      IN:     dvp     - vnode of directory to remove entry from.
1749  *              name    - name of entry to remove.
1750  *              cr      - credentials of caller.
1751  *              ct      - caller context
1752  *              flags   - case flags
1753  *
1754  *      RETURN: 0 if success
1755  *              error code if failure
1756  *
1757  * Timestamps:
1758  *      dvp - ctime|mtime
1759  *       vp - ctime (if nlink > 0)
      *
      * Notes:
      *      - Directories are refused with EPERM; rmdir must be used instead.
      *      - With FIGNORECASE set, the lookup is done with ZCILOOK and
      *        dnlc_remove() is called with realnm's buffer instead of name.
      *      - If dmu_tx_assign() returns ERESTART, the dirent lock and the
      *        vnode holds are dropped, the tx is waited on and aborted, and
      *        the whole operation is retried from the "top" label.
      *      - On the paths that reach the "out" label, the ZIL is committed
      *        before returning when os_sync is ZFS_SYNC_ALWAYS.
1760  */
1761
     /*
      * Zero value written to the SA_ZPL_XATTR attribute by zfs_remove()
      * when the znode is not in SA format (!zp->z_is_sa).
      */
1762 uint64_t null_xattr = 0;
1763
1764 /*ARGSUSED*/
1765 static int
1766 zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1767     int flags)
1768 {
1769         znode_t         *zp, *dzp = VTOZ(dvp);
1770         znode_t         *xzp;
1771         vnode_t         *vp;
1772         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1773         zilog_t         *zilog;
1774         uint64_t        acl_obj, xattr_obj;
1775         uint64_t        xattr_obj_unlinked = 0;
1776         uint64_t        obj = 0;
1777         zfs_dirlock_t   *dl;
1778         dmu_tx_t        *tx;
1779         boolean_t       may_delete_now, delete_now = FALSE;
1780         boolean_t       unlinked, toobig = FALSE;
1781         uint64_t        txtype;
1782         pathname_t      *realnmp = NULL;
1783         pathname_t      realnm;
1784         int             error;
1785         int             zflg = ZEXISTS;
1786
1787         ZFS_ENTER(zfsvfs);
1788         ZFS_VERIFY_ZP(dzp);
1789         zilog = zfsvfs->z_log;
1790
             /*
              * Case-insensitive request: look the name up with ZCILOOK and
              * allocate realnm, which is handed to zfs_dirent_lock() below.
              * Once allocated it is pn_free()d on every exit path.
              */
1791         if (flags & FIGNORECASE) {
1792                 zflg |= ZCILOOK;
1793                 pn_alloc(&realnm);
1794                 realnmp = &realnm;
1795         }
1796
1797 top:
             /* Retry point after ERESTART; per-attempt state is reset here. */
1798         xattr_obj = 0;
1799         xzp = NULL;
1800         /*
1801          * Attempt to lock directory; fail if entry doesn't exist.
1802          */
1803         if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1804             NULL, realnmp)) {
1805                 if (realnmp)
1806                         pn_free(realnmp);
1807                 ZFS_EXIT(zfsvfs);
1808                 return (error);
1809         }
1810
1811         vp = ZTOV(zp);
1812
1813         if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1814                 goto out;
1815         }
1816
1817         /*
1818          * Need to use rmdir for removing directories.
1819          */
1820         if (vp->v_type == VDIR) {
1821                 error = EPERM;
1822                 goto out;
1823         }
1824
1825         vnevent_remove(vp, dvp, name, ct);
1826
1827         if (realnmp)
1828                 dnlc_remove(dvp, realnmp->pn_buf);
1829         else
1830                 dnlc_remove(dvp, name);
1831
             /*
              * Sample, under VI_LOCK, whether ours is the only reference to
              * the vnode and it has no cached data.  The same conditions are
              * re-checked after the link is destroyed, before delete_now is
              * set.
              */
1832         VI_LOCK(vp);
1833         may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
1834         VI_UNLOCK(vp);
1835
1836         /*
1837          * We may delete the znode now, or we may put it in the unlinked set;
1838          * it depends on whether we're the last link, and on whether there are
1839          * other holds on the vnode.  So we dmu_tx_hold() the right things to
1840          * allow for either case.
1841          */
1842         obj = zp->z_id;
1843         tx = dmu_tx_create(zfsvfs->z_os);
1844         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1845         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1846         zfs_sa_upgrade_txholds(tx, zp);
1847         zfs_sa_upgrade_txholds(tx, dzp);
1848         if (may_delete_now) {
1849                 toobig =
1850                     zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1851                 /* if the file is too big, only hold_free a token amount */
1852                 dmu_tx_hold_free(tx, zp->z_id, 0,
1853                     (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1854         }
1855
1856         /* are there any extended attributes? */
1857         error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1858             &xattr_obj, sizeof (xattr_obj));
1859         if (error == 0 && xattr_obj) {
                     /*
                      * NOTE(review): the zfs_zget() result is only checked
                      * by ASSERT0(); xzp is dereferenced just below, so this
                      * relies on the lookup not failing when assertions are
                      * compiled out -- confirm.
                      */
1860                 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1861                 ASSERT0(error);
1862                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1863                 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1864         }
1865
1866         mutex_enter(&zp->z_lock);
1867         if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1868                 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1869         mutex_exit(&zp->z_lock);
1870
1871         /* charge as an update -- would be nice not to charge at all */
1872         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1873
1874         error = dmu_tx_assign(tx, TXG_NOWAIT);
1875         if (error) {
                     /*
                      * Could not assign the tx: release the dirent lock and
                      * both vnode holds.  On ERESTART wait on the tx, abort
                      * it and start over from "top"; otherwise free realnm,
                      * abort the tx and fail.
                      */
1876                 zfs_dirent_unlock(dl);
1877                 VN_RELE(vp);
1878                 if (xzp)
1879                         VN_RELE(ZTOV(xzp));
1880                 if (error == ERESTART) {
1881                         dmu_tx_wait(tx);
1882                         dmu_tx_abort(tx);
1883                         goto top;
1884                 }
1885                 if (realnmp)
1886                         pn_free(realnmp);
1887                 dmu_tx_abort(tx);
1888                 ZFS_EXIT(zfsvfs);
1889                 return (error);
1890         }
1891
1892         /*
1893          * Remove the directory entry.
1894          */
1895         error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1896
1897         if (error) {
                     /* The tx is already assigned; commit it before bailing out. */
1898                 dmu_tx_commit(tx);
1899                 goto out;
1900         }
1901
1902         if (unlinked) {
1903
1904                 /*
1905                  * Hold z_lock so that we can make sure that the ACL obj
1906                  * hasn't changed.  Could have been deleted due to
1907                  * zfs_sa_upgrade().
1908                  */
1909                 mutex_enter(&zp->z_lock);
1910                 VI_LOCK(vp);
1911                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1912                     &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1913                 delete_now = may_delete_now && !toobig &&
1914                     vp->v_count == 1 && !vn_has_cached_data(vp) &&
1915                     xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
1916                     acl_obj;
1917                 VI_UNLOCK(vp);
1918         }
1919
             /*
              * If unlinked was set, zp->z_lock is still held here; each of
              * the two branches below releases it.
              */
1920         if (delete_now) {
                     /* On FreeBSD reaching this branch is fatal: it panics. */
1921 #ifdef __FreeBSD__
1922                 panic("zfs_remove: delete_now branch taken");
1923 #endif
1924                 if (xattr_obj_unlinked) {
1925                         ASSERT3U(xzp->z_links, ==, 2);
1926                         mutex_enter(&xzp->z_lock);
1927                         xzp->z_unlinked = 1;
1928                         xzp->z_links = 0;
1929                         error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1930                             &xzp->z_links, sizeof (xzp->z_links), tx);
1931                         ASSERT3U(error,  ==,  0);
1932                         mutex_exit(&xzp->z_lock);
1933                         zfs_unlinked_add(xzp, tx);
1934
1935                         if (zp->z_is_sa)
1936                                 error = sa_remove(zp->z_sa_hdl,
1937                                     SA_ZPL_XATTR(zfsvfs), tx);
1938                         else
1939                                 error = sa_update(zp->z_sa_hdl,
1940                                     SA_ZPL_XATTR(zfsvfs), &null_xattr,
1941                                     sizeof (uint64_t), tx);
1942                         ASSERT0(error);
1943                 }
                     /*
                      * Drop our reference by hand instead of VN_RELE(); the
                      * "out" path skips VN_RELE(vp) when delete_now is set.
                      */
1944                 VI_LOCK(vp);
1945                 vp->v_count--;
1946                 ASSERT0(vp->v_count);
1947                 VI_UNLOCK(vp);
1948                 mutex_exit(&zp->z_lock);
1949                 zfs_znode_delete(zp, tx);
1950         } else if (unlinked) {
1951                 mutex_exit(&zp->z_lock);
1952                 zfs_unlinked_add(zp, tx);
1953 #ifdef __FreeBSD__
1954                 vp->v_vflag |= VV_NOSYNC;
1955 #endif
1956         }
1957
1958         txtype = TX_REMOVE;
1959         if (flags & FIGNORECASE)
1960                 txtype |= TX_CI;
1961         zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
1962
1963         dmu_tx_commit(tx);
1964 out:
             /* Common exit: free realnm, drop the dirent lock and the holds. */
1965         if (realnmp)
1966                 pn_free(realnmp);
1967
1968         zfs_dirent_unlock(dl);
1969
1970         if (!delete_now)
1971                 VN_RELE(vp);
1972         if (xzp)
1973                 VN_RELE(ZTOV(xzp));
1974
1975         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1976                 zil_commit(zilog, 0);
1977
1978         ZFS_EXIT(zfsvfs);
1979         return (error);
1980 }
1981
1982 /*
1983  * Create a new directory and insert it into dvp using the name
1984  * provided.  Return a pointer to the inserted directory.
1985  *
1986  *      IN:     dvp     - vnode of directory to add subdir to.
1987  *              dirname - name of new directory.
1988  *              vap     - attributes of new directory.
1989  *              cr      - credentials of caller.
1990  *              ct      - caller context
      *              flags   - case flags
1991  *              vsecp   - ACL to be set
1992  *
1993  *      OUT:    vpp     - vnode of created directory.
1994  *
1995  *      RETURN: 0 if success
1996  *              error code if failure
1997  *
1998  * Timestamps:
1999  *      dvp - ctime|mtime updated
2000  *       vp - ctime|mtime|atime updated
      *
      * Notes:
      *      - *vpp is set to NULL at the "top" label and only set to the
      *        new vnode after the node has been created and linked.
      *      - If dmu_tx_assign() returns ERESTART, the dirent lock is
      *        dropped, the tx is waited on and aborted, and the operation
      *        is retried from "top" with the same acl_ids.
2001  */
2002 /*ARGSUSED*/
2003 static int
2004 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
2005     caller_context_t *ct, int flags, vsecattr_t *vsecp)
2006 {
2007         znode_t         *zp, *dzp = VTOZ(dvp);
2008         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
2009         zilog_t         *zilog;
2010         zfs_dirlock_t   *dl;
2011         uint64_t        txtype;
2012         dmu_tx_t        *tx;
2013         int             error;
2014         int             zf = ZNEW;
2015         ksid_t          *ksid;
2016         uid_t           uid;
2017         gid_t           gid = crgetgid(cr);
2018         zfs_acl_ids_t   acl_ids;
2019         boolean_t       fuid_dirtied;
2020
2021         ASSERT(vap->va_type == VDIR);
2022
2023         /*
2024          * If we have an ephemeral id, ACL, or XVATTR then
2025          * make sure file system is at proper version
2026          */
2027
2028         ksid = crgetsid(cr, KSID_OWNER);
2029         if (ksid)
2030                 uid = ksid_getid(ksid);
2031         else
2032                 uid = crgetuid(cr);
             /*
              * This check runs before ZFS_ENTER(), so the early return does
              * not need a matching ZFS_EXIT().
              */
2033         if (zfsvfs->z_use_fuids == B_FALSE &&
2034             (vsecp || (vap->va_mask & AT_XVATTR) ||
2035             IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2036                 return (EINVAL);
2037
2038         ZFS_ENTER(zfsvfs);
2039         ZFS_VERIFY_ZP(dzp);
2040         zilog = zfsvfs->z_log;
2041
             /* Refuse to create a directory when the parent has ZFS_XATTR set. */
2042         if (dzp->z_pflags & ZFS_XATTR) {
2043                 ZFS_EXIT(zfsvfs);
2044                 return (EINVAL);
2045         }
2046
             /*
              * On UTF-8-only file systems reject names that fail validation.
              * error is only used as u8_validate()'s output argument here;
              * the caller always gets EILSEQ.
              */
2047         if (zfsvfs->z_utf8 && u8_validate(dirname,
2048             strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2049                 ZFS_EXIT(zfsvfs);
2050                 return (EILSEQ);
2051         }
2052         if (flags & FIGNORECASE)
2053                 zf |= ZCILOOK;
2054
2055         if (vap->va_mask & AT_XVATTR) {
2056                 if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2057                     crgetuid(cr), cr, vap->va_type)) != 0) {
2058                         ZFS_EXIT(zfsvfs);
2059                         return (error);
2060                 }
2061         }
2062
             /*
              * acl_ids is built once, before the retry label, and reused on
              * every retry.  Every exit path below frees it.
              */
2063         if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2064             vsecp, &acl_ids)) != 0) {
2065                 ZFS_EXIT(zfsvfs);
2066                 return (error);
2067         }
2068         /*
2069          * First make sure the new directory doesn't exist.
2070          *
2071          * Existence is checked first to make sure we don't return
2072          * EACCES instead of EEXIST which can cause some applications
2073          * to fail.
2074          */
2075 top:
2076         *vpp = NULL;
2077
2078         if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
2079             NULL, NULL)) {
2080                 zfs_acl_ids_free(&acl_ids);
2081                 ZFS_EXIT(zfsvfs);
2082                 return (error);
2083         }
2084
2085         if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2086                 zfs_acl_ids_free(&acl_ids);
2087                 zfs_dirent_unlock(dl);
2088                 ZFS_EXIT(zfsvfs);
2089                 return (error);
2090         }
2091
2092         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2093                 zfs_acl_ids_free(&acl_ids);
2094                 zfs_dirent_unlock(dl);
2095                 ZFS_EXIT(zfsvfs);
2096                 return (EDQUOT);
2097         }
2098
2099         /*
2100          * Add a new entry to the directory.
2101          */
2102         tx = dmu_tx_create(zfsvfs->z_os);
2103         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2104         dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2105         fuid_dirtied = zfsvfs->z_fuid_dirty;
2106         if (fuid_dirtied)
2107                 zfs_fuid_txhold(zfsvfs, tx);
             /*
              * Without SA, an ACL larger than ZFS_ACE_SPACE needs an extra
              * write hold on a new object.
              */
2108         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2109                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2110                     acl_ids.z_aclp->z_acl_bytes);
2111         }
2112
2113         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2114             ZFS_SA_BASE_ATTR_SIZE);
2115
2116         error = dmu_tx_assign(tx, TXG_NOWAIT);
2117         if (error) {
                     /*
                      * Could not assign the tx: drop the dirent lock.  On
                      * ERESTART wait on the tx, abort it and retry from
                      * "top" (acl_ids is kept); otherwise free acl_ids,
                      * abort the tx and fail.
                      */
2118                 zfs_dirent_unlock(dl);
2119                 if (error == ERESTART) {
2120                         dmu_tx_wait(tx);
2121                         dmu_tx_abort(tx);
2122                         goto top;
2123                 }
2124                 zfs_acl_ids_free(&acl_ids);
2125                 dmu_tx_abort(tx);
2126                 ZFS_EXIT(zfsvfs);
2127                 return (error);
2128         }
2129
2130         /*
2131          * Create new node.
2132          */
2133         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2134
2135         if (fuid_dirtied)
2136                 zfs_fuid_sync(zfsvfs, tx);
2137
2138         /*
2139          * Now put new name in parent dir.
2140          */
             /* The return value of zfs_link_create() is deliberately ignored. */
2141         (void) zfs_link_create(dl, zp, tx, ZNEW);
2142
2143         *vpp = ZTOV(zp);
2144
2145         txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
2146         if (flags & FIGNORECASE)
2147                 txtype |= TX_CI;
2148         zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
2149             acl_ids.z_fuidp, vap);
2150
2151         zfs_acl_ids_free(&acl_ids);
2152
2153         dmu_tx_commit(tx);
2154
2155         zfs_dirent_unlock(dl);
2156
2157         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2158                 zil_commit(zilog, 0);
2159
2160         ZFS_EXIT(zfsvfs);
2161         return (0);
2162 }
2163
2164 /*
2165  * Remove a directory subdir entry.  If the current working
2166  * directory is the same as the subdir to be removed, the
2167  * remove will fail.
2168  *
2169  *      IN:     dvp     - vnode of directory to remove from.
2170  *              name    - name of directory to be removed.
2171  *              cwd     - vnode of current working directory.
2172  *              cr      - credentials of caller.
2173  *              ct      - caller context
2174  *              flags   - case flags
2175  *
2176  *      RETURN: 0 if success
2177  *              error code if failure
2178  *
2179  * Timestamps:
2180  *      dvp - ctime|mtime updated
      *
      * Notes:
      *      - Fails with ENOTDIR if the entry is not a directory and with
      *        EINVAL if its vnode is the cwd vnode passed in.
      *      - If dmu_tx_assign() returns ERESTART, all locks and the vnode
      *        hold are dropped, the tx is waited on and aborted, and the
      *        operation is retried from the "top" label.
2181  */
2182 /*ARGSUSED*/
2183 static int
2184 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
2185     caller_context_t *ct, int flags)
2186 {
2187         znode_t         *dzp = VTOZ(dvp);
2188         znode_t         *zp;
2189         vnode_t         *vp;
2190         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
2191         zilog_t         *zilog;
2192         zfs_dirlock_t   *dl;
2193         dmu_tx_t        *tx;
2194         int             error;
2195         int             zflg = ZEXISTS;
2196
2197         ZFS_ENTER(zfsvfs);
2198         ZFS_VERIFY_ZP(dzp);
2199         zilog = zfsvfs->z_log;
2200
2201         if (flags & FIGNORECASE)
2202                 zflg |= ZCILOOK;
2203 top:
             /* Retry point after ERESTART. */
2204         zp = NULL;
2205
2206         /*
2207          * Attempt to lock directory; fail if entry doesn't exist.
2208          */
2209         if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
2210             NULL, NULL)) {
2211                 ZFS_EXIT(zfsvfs);
2212                 return (error);
2213         }
2214
2215         vp = ZTOV(zp);
2216
2217         if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2218                 goto out;
2219         }
2220
             /* Only directories may be removed here. */
2221         if (vp->v_type != VDIR) {
2222                 error = ENOTDIR;
2223                 goto out;
2224         }
2225
             /* Refuse to remove the directory passed in as cwd. */
2226         if (vp == cwd) {
2227                 error = EINVAL;
2228                 goto out;
2229         }
2230
2231         vnevent_rmdir(vp, dvp, name, ct);
2232
2233         /*
2234          * Grab a lock on the directory to make sure that no one is
2235          * trying to add (or lookup) entries while we are removing it.
2236          */
2237         rw_enter(&zp->z_name_lock, RW_WRITER);
2238
2239         /*
2240          * Grab a lock on the parent pointer to make sure we play well
2241          * with the treewalk and directory rename code.
2242          */
2243         rw_enter(&zp->z_parent_lock, RW_WRITER);
2244
2245         tx = dmu_tx_create(zfsvfs->z_os);
2246         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2247         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2248         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2249         zfs_sa_upgrade_txholds(tx, zp);
2250         zfs_sa_upgrade_txholds(tx, dzp);
2251         error = dmu_tx_assign(tx, TXG_NOWAIT);
2252         if (error) {
                     /*
                      * Could not assign the tx: release everything in the
                      * reverse order of acquisition.  On ERESTART wait on
                      * the tx, abort it and retry from "top"; otherwise
                      * abort the tx and fail.
                      */
2253                 rw_exit(&zp->z_parent_lock);
2254                 rw_exit(&zp->z_name_lock);
2255                 zfs_dirent_unlock(dl);
2256                 VN_RELE(vp);
2257                 if (error == ERESTART) {
2258                         dmu_tx_wait(tx);
2259                         dmu_tx_abort(tx);
2260                         goto top;
2261                 }
2262                 dmu_tx_abort(tx);
2263                 ZFS_EXIT(zfsvfs);
2264                 return (error);
2265         }
2266
2267 #ifdef FREEBSD_NAMECACHE
2268         cache_purge(dvp);
2269 #endif
2270
2271         error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2272
             /*
              * The removal is logged only if the link was destroyed; the
              * tx is committed in either case.
              */
2273         if (error == 0) {
2274                 uint64_t txtype = TX_RMDIR;
2275                 if (flags & FIGNORECASE)
2276                         txtype |= TX_CI;
2277                 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2278         }
2279
2280         dmu_tx_commit(tx);
2281
2282         rw_exit(&zp->z_parent_lock);
2283         rw_exit(&zp->z_name_lock);
2284 #ifdef FREEBSD_NAMECACHE
2285         cache_purge(vp);
2286 #endif
2287 out:
             /* Common exit: drop the dirent lock and our hold on vp. */
2288         zfs_dirent_unlock(dl);
2289
2290         VN_RELE(vp);
2291
2292         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2293                 zil_commit(zilog, 0);
2294
2295         ZFS_EXIT(zfsvfs);
2296         return (error);
2297 }
2298
2299 /*
2300  * Read as many directory entries as will fit into the provided
2301  * buffer from the given directory cursor position (specified in
2302  * the uio structure).
2303  *
2304  *      IN:     vp      - vnode of directory to read.
2305  *              uio     - structure supplying read location, range info,
2306  *                        and return buffer.
2307  *              cr      - credentials of caller.
2308  *              ct      - caller context
2309  *              flags   - case flags
2310  *
2311  *      OUT:    uio     - updated offset and range, buffer filled.
2312  *              eofp    - set to true if end-of-file detected.
2313  *
2314  *      RETURN: 0 if success
2315  *              error code if failure
2316  *
2317  * Timestamps:
2318  *      vp - atime updated
2319  *
2320  * Note that the low 4 bits of the cookie returned by zap is always zero.
2321  * This allows us to use the low range for "special" directory entries:
2322  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2323  * we use the offset 2 for the '.zfs' directory.
2324  */
2325 /* ARGSUSED */
2326 static int
2327 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2328 {
2329         znode_t         *zp = VTOZ(vp);
2330         iovec_t         *iovp;
2331         edirent_t       *eodp;
2332         dirent64_t      *odp;
2333         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
2334         objset_t        *os;
2335         caddr_t         outbuf;
2336         size_t          bufsize;
2337         zap_cursor_t    zc;
2338         zap_attribute_t zap;
2339         uint_t          bytes_wanted;
2340         uint64_t        offset; /* must be unsigned; checks for < 1 */
2341         uint64_t        parent;
2342         int             local_eof;
2343         int             outcount;
2344         int             error;
2345         uint8_t         prefetch;
2346         boolean_t       check_sysattrs;
2347         uint8_t         type;
2348         int             ncooks;
2349         u_long          *cooks = NULL;
2350         int             flags = 0;
2351
2352         ZFS_ENTER(zfsvfs);
2353         ZFS_VERIFY_ZP(zp);
2354
2355         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2356             &parent, sizeof (parent))) != 0) {
2357                 ZFS_EXIT(zfsvfs);
2358                 return (error);
2359         }
2360
2361         /*
2362          * If we are not given an eof variable,
2363          * use a local one.
2364          */
2365         if (eofp == NULL)
2366                 eofp = &local_eof;
2367
2368         /*
2369          * Check for valid iov_len.
2370          */
2371         if (uio->uio_iov->iov_len <= 0) {
2372                 ZFS_EXIT(zfsvfs);
2373                 return (EINVAL);
2374         }
2375
2376         /*
2377          * Quit if directory has been removed (posix)
2378          */
2379         if ((*eofp = zp->z_unlinked) != 0) {
2380                 ZFS_EXIT(zfsvfs);
2381                 return (0);
2382         }
2383
2384         error = 0;
2385         os = zfsvfs->z_os;
2386         offset = uio->uio_loffset;
2387         prefetch = zp->z_zn_prefetch;
2388
2389         /*
2390          * Initialize the iterator cursor.
2391          */
2392         if (offset <= 3) {
2393                 /*
2394                  * Start iteration from the beginning of the directory.
2395                  */
2396                 zap_cursor_init(&zc, os, zp->z_id);
2397         } else {
2398                 /*
2399                  * The offset is a serialized cursor.
2400                  */
2401                 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2402         }
2403
2404         /*
2405          * Get space to change directory entries into fs independent format.
2406          */
2407         iovp = uio->uio_iov;
2408         bytes_wanted = iovp->iov_len;
2409         if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2410                 bufsize = bytes_wanted;
2411                 outbuf = kmem_alloc(bufsize, KM_SLEEP);
2412                 odp = (struct dirent64 *)outbuf;
2413         } else {
2414                 bufsize = bytes_wanted;
2415                 odp = (struct dirent64 *)iovp->iov_base;
2416         }
2417         eodp = (struct edirent *)odp;
2418
2419         if (ncookies != NULL) {
2420                 /*
2421                  * Minimum entry size is dirent size and 1 byte for a file name.
2422                  */
2423                 ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2424                 cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2425                 *cookies = cooks;
2426                 *ncookies = ncooks;
2427         }
2428         /*
2429          * If this VFS supports the system attribute view interface; and
2430          * we're looking at an extended attribute directory; and we care
2431          * about normalization conflicts on this vfs; then we must check
2432          * for normalization conflicts with the sysattr name space.
2433          */
2434 #ifdef TODO
2435         check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2436             (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2437             (flags & V_RDDIR_ENTFLAGS);
2438 #else
2439         check_sysattrs = 0;
2440 #endif
2441
2442         /*
2443          * Transform to file-system independent format
2444          */
2445         outcount = 0;
2446         while (outcount < bytes_wanted) {
2447                 ino64_t objnum;
2448                 ushort_t reclen;
2449                 off64_t *next = NULL;
2450
2451                 /*
2452                  * Special case `.', `..', and `.zfs'.
2453                  */
2454                 if (offset == 0) {
2455                         (void) strcpy(zap.za_name, ".");
2456                         zap.za_normalization_conflict = 0;
2457                         objnum = zp->z_id;
2458                         type = DT_DIR;
2459                 } else if (offset == 1) {
2460                         (void) strcpy(zap.za_name, "..");
2461                         zap.za_normalization_conflict = 0;
2462                         objnum = parent;
2463                         type = DT_DIR;
2464                 } else if (offset == 2 && zfs_show_ctldir(zp)) {
2465                         (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2466                         zap.za_normalization_conflict = 0;
2467                         objnum = ZFSCTL_INO_ROOT;
2468                         type = DT_DIR;
2469                 } else {
2470                         /*
2471                          * Grab next entry.
2472                          */
2473                         if (error = zap_cursor_retrieve(&zc, &zap)) {
2474                                 if ((*eofp = (error == ENOENT)) != 0)
2475                                         break;
2476                                 else
2477                                         goto update;
2478                         }
2479
2480                         if (zap.za_integer_length != 8 ||
2481                             zap.za_num_integers != 1) {
2482                                 cmn_err(CE_WARN, "zap_readdir: bad directory "
2483                                     "entry, obj = %lld, offset = %lld\n",
2484                                     (u_longlong_t)zp->z_id,
2485                                     (u_longlong_t)offset);
2486                                 error = ENXIO;
2487                                 goto update;
2488                         }
2489
2490                         objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2491                         /*
2492                          * MacOS X can extract the object type here such as:
2493                          * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2494                          */
2495                         type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2496
2497                         if (check_sysattrs && !zap.za_normalization_conflict) {
2498 #ifdef TODO
2499                                 zap.za_normalization_conflict =
2500                                     xattr_sysattr_casechk(zap.za_name);
2501 #else
2502                                 panic("%s:%u: TODO", __func__, __LINE__);
2503 #endif
2504                         }
2505                 }
2506
2507                 if (flags & V_RDDIR_ACCFILTER) {
2508                         /*
2509                          * If we have no access at all, don't include
2510                          * this entry in the returned information
2511                          */
2512                         znode_t *ezp;
2513                         if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2514                                 goto skip_entry;
2515                         if (!zfs_has_access(ezp, cr)) {
2516                                 VN_RELE(ZTOV(ezp));
2517                                 goto skip_entry;
2518                         }
2519                         VN_RELE(ZTOV(ezp));
2520                 }
2521
2522                 if (flags & V_RDDIR_ENTFLAGS)
2523                         reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2524                 else
2525                         reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2526
2527                 /*
2528                  * Will this entry fit in the buffer?
2529                  */
2530                 if (outcount + reclen > bufsize) {
2531                         /*
2532                          * Did we manage to fit anything in the buffer?
2533                          */
2534                         if (!outcount) {
2535                                 error = EINVAL;
2536                                 goto update;
2537                         }
2538                         break;
2539                 }
2540                 if (flags & V_RDDIR_ENTFLAGS) {
2541                         /*
2542                          * Add extended flag entry:
2543                          */
2544                         eodp->ed_ino = objnum;
2545                         eodp->ed_reclen = reclen;
2546                         /* NOTE: ed_off is the offset for the *next* entry */
2547                         next = &(eodp->ed_off);
2548                         eodp->ed_eflags = zap.za_normalization_conflict ?
2549                             ED_CASE_CONFLICT : 0;
2550                         (void) strncpy(eodp->ed_name, zap.za_name,
2551                             EDIRENT_NAMELEN(reclen));
2552                         eodp = (edirent_t *)((intptr_t)eodp + reclen);
2553                 } else {
2554                         /*
2555                          * Add normal entry:
2556                          */
2557                         odp->d_ino = objnum;
2558                         odp->d_reclen = reclen;
2559                         odp->d_namlen = strlen(zap.za_name);
2560                         (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2561                         odp->d_type = type;
2562                         odp = (dirent64_t *)((intptr_t)odp + reclen);
2563                 }
2564                 outcount += reclen;
2565
2566                 ASSERT(outcount <= bufsize);
2567
2568                 /* Prefetch znode */
2569                 if (prefetch)
2570                         dmu_prefetch(os, objnum, 0, 0);
2571
2572         skip_entry:
2573                 /*
2574                  * Move to the next entry, fill in the previous offset.
2575                  */
2576                 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2577                         zap_cursor_advance(&zc);
2578                         offset = zap_cursor_serialize(&zc);
2579                 } else {
2580                         offset += 1;
2581                 }
2582
2583                 if (cooks != NULL) {
2584                         *cooks++ = offset;
2585                         ncooks--;
2586                         KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2587                 }
2588         }
2589         zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2590
2591         /* Subtract unused cookies */
2592         if (ncookies != NULL)
2593                 *ncookies -= ncooks;
2594
2595         if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2596                 iovp->iov_base += outcount;
2597                 iovp->iov_len -= outcount;
2598                 uio->uio_resid -= outcount;
2599         } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2600                 /*
2601                  * Reset the pointer.
2602                  */
2603                 offset = uio->uio_loffset;
2604         }
2605
2606 update:
2607         zap_cursor_fini(&zc);
2608         if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2609                 kmem_free(outbuf, bufsize);
2610
2611         if (error == ENOENT)
2612                 error = 0;
2613
2614         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2615
2616         uio->uio_loffset = offset;
2617         ZFS_EXIT(zfsvfs);
2618         if (error != 0 && cookies != NULL) {
2619                 free(*cookies, M_TEMP);
2620                 *cookies = NULL;
2621                 *ncookies = 0;
2622         }
2623         return (error);
2624 }
2625
2626 ulong_t zfs_fsync_sync_cnt = 4;
2627
2628 static int
2629 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2630 {
2631         znode_t *zp = VTOZ(vp);
2632         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2633
2634         (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2635
2636         if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2637                 ZFS_ENTER(zfsvfs);
2638                 ZFS_VERIFY_ZP(zp);
2639                 zil_commit(zfsvfs->z_log, zp->z_id);
2640                 ZFS_EXIT(zfsvfs);
2641         }
2642         return (0);
2643 }
2644
2645
2646 /*
2647  * Get the requested file attributes and place them in the provided
2648  * vattr structure.
2649  *
2650  *      IN:     vp      - vnode of file.
2651  *              vap     - va_mask identifies requested attributes.
2652  *                        If AT_XVATTR set, then optional attrs are requested
2653  *              flags   - ATTR_NOACLCHECK (CIFS server context)
2654  *              cr      - credentials of caller.
2655  *              ct      - caller context
2656  *
2657  *      OUT:    vap     - attribute values.
2658  *
2659  *      RETURN: 0 (always succeeds)
2660  */
2661 /* ARGSUSED */
2662 static int
2663 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2664     caller_context_t *ct)
2665 {
2666         znode_t *zp = VTOZ(vp);
2667         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2668         int     error = 0;
2669         uint32_t blksize;
2670         u_longlong_t nblocks;
2671         uint64_t links;
2672         uint64_t mtime[2], ctime[2], crtime[2], rdev;
2673         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
2674         xoptattr_t *xoap = NULL;
2675         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2676         sa_bulk_attr_t bulk[4];
2677         int count = 0;
2678
2679         ZFS_ENTER(zfsvfs);
2680         ZFS_VERIFY_ZP(zp);
2681
2682         zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2683
2684         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2685         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2686         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &crtime, 16);
2687         if (vp->v_type == VBLK || vp->v_type == VCHR)
2688                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
2689                     &rdev, 8);
2690
2691         if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2692                 ZFS_EXIT(zfsvfs);
2693                 return (error);
2694         }
2695
2696         /*
2697          * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2698          * Also, if we are the owner don't bother, since owner should
2699          * always be allowed to read basic attributes of file.
2700          */
2701         if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2702             (vap->va_uid != crgetuid(cr))) {
2703                 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2704                     skipaclchk, cr)) {
2705                         ZFS_EXIT(zfsvfs);
2706                         return (error);
2707                 }
2708         }
2709
2710         /*
2711          * Return all attributes.  It's cheaper to provide the answer
2712          * than to determine whether we were asked the question.
2713          */
2714
2715         mutex_enter(&zp->z_lock);
2716         vap->va_type = IFTOVT(zp->z_mode);
2717         vap->va_mode = zp->z_mode & ~S_IFMT;
2718 #ifdef sun
2719         vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2720 #else
2721         vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2722 #endif
2723         vap->va_nodeid = zp->z_id;
2724         if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2725                 links = zp->z_links + 1;
2726         else
2727                 links = zp->z_links;
2728         vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
2729         vap->va_size = zp->z_size;
2730 #ifdef sun
2731         vap->va_rdev = vp->v_rdev;
2732 #else
2733         if (vp->v_type == VBLK || vp->v_type == VCHR)
2734                 vap->va_rdev = zfs_cmpldev(rdev);
2735 #endif
2736         vap->va_seq = zp->z_seq;
2737         vap->va_flags = 0;      /* FreeBSD: Reset chflags(2) flags. */
2738
2739         /*
2740          * Add in any requested optional attributes and the create time.
2741          * Also set the corresponding bits in the returned attribute bitmap.
2742          */
2743         if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2744                 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2745                         xoap->xoa_archive =
2746                             ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2747                         XVA_SET_RTN(xvap, XAT_ARCHIVE);
2748                 }
2749
2750                 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2751                         xoap->xoa_readonly =
2752                             ((zp->z_pflags & ZFS_READONLY) != 0);
2753                         XVA_SET_RTN(xvap, XAT_READONLY);
2754                 }
2755
2756                 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2757                         xoap->xoa_system =
2758                             ((zp->z_pflags & ZFS_SYSTEM) != 0);
2759                         XVA_SET_RTN(xvap, XAT_SYSTEM);
2760                 }
2761
2762                 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2763                         xoap->xoa_hidden =
2764                             ((zp->z_pflags & ZFS_HIDDEN) != 0);
2765                         XVA_SET_RTN(xvap, XAT_HIDDEN);
2766                 }
2767
2768                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2769                         xoap->xoa_nounlink =
2770                             ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2771                         XVA_SET_RTN(xvap, XAT_NOUNLINK);
2772                 }
2773
2774                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2775                         xoap->xoa_immutable =
2776                             ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2777                         XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2778                 }
2779
2780                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2781                         xoap->xoa_appendonly =
2782                             ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2783                         XVA_SET_RTN(xvap, XAT_APPENDONLY);
2784                 }
2785
2786                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2787                         xoap->xoa_nodump =
2788                             ((zp->z_pflags & ZFS_NODUMP) != 0);
2789                         XVA_SET_RTN(xvap, XAT_NODUMP);
2790                 }
2791
2792                 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2793                         xoap->xoa_opaque =
2794                             ((zp->z_pflags & ZFS_OPAQUE) != 0);
2795                         XVA_SET_RTN(xvap, XAT_OPAQUE);
2796                 }
2797
2798                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2799                         xoap->xoa_av_quarantined =
2800                             ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2801                         XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2802                 }
2803
2804                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2805                         xoap->xoa_av_modified =
2806                             ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2807                         XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2808                 }
2809
2810                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2811                     vp->v_type == VREG) {
2812                         zfs_sa_get_scanstamp(zp, xvap);
2813                 }
2814
2815                 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2816                         uint64_t times[2];
2817
2818                         (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2819                             times, sizeof (times));
2820                         ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2821                         XVA_SET_RTN(xvap, XAT_CREATETIME);
2822                 }
2823
2824                 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2825                         xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2826                         XVA_SET_RTN(xvap, XAT_REPARSE);
2827                 }
2828                 if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2829                         xoap->xoa_generation = zp->z_gen;
2830                         XVA_SET_RTN(xvap, XAT_GEN);
2831                 }
2832
2833                 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2834                         xoap->xoa_offline =
2835                             ((zp->z_pflags & ZFS_OFFLINE) != 0);
2836                         XVA_SET_RTN(xvap, XAT_OFFLINE);
2837                 }
2838
2839                 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2840                         xoap->xoa_sparse =
2841                             ((zp->z_pflags & ZFS_SPARSE) != 0);
2842                         XVA_SET_RTN(xvap, XAT_SPARSE);
2843                 }
2844         }
2845
2846         ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2847         ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2848         ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2849         ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2850
2851         mutex_exit(&zp->z_lock);
2852
2853         sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2854         vap->va_blksize = blksize;
2855         vap->va_bytes = nblocks << 9;   /* nblocks * 512 */
2856
2857         if (zp->z_blksz == 0) {
2858                 /*
2859                  * Block size hasn't been set; suggest maximal I/O transfers.
2860                  */
2861                 vap->va_blksize = zfsvfs->z_max_blksz;
2862         }
2863
2864         ZFS_EXIT(zfsvfs);
2865         return (0);
2866 }
2867
2868 /*
2869  * Set the file attributes to the values contained in the
2870  * vattr structure.
2871  *
2872  *      IN:     vp      - vnode of file to be modified.
2873  *              vap     - new attribute values.
2874  *                        If AT_XVATTR set, then optional attrs are being set
2875  *              flags   - ATTR_UTIME set if non-default time values provided.
2876  *                      - ATTR_NOACLCHECK (CIFS context only).
2877  *              cr      - credentials of caller.
2878  *              ct      - caller context
2879  *
2880  *      RETURN: 0 if success
2881  *              error code if failure
2882  *
2883  * Timestamps:
2884  *      vp - ctime updated, mtime updated if size changed.
2885  */
2886 /* ARGSUSED */
2887 static int
2888 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2889         caller_context_t *ct)
2890 {
2891         znode_t         *zp = VTOZ(vp);
2892         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
2893         zilog_t         *zilog;
2894         dmu_tx_t        *tx;
2895         vattr_t         oldva;
2896         xvattr_t        tmpxvattr;
2897         uint_t          mask = vap->va_mask;
2898         uint_t          saved_mask;
2899         uint64_t        saved_mode;
2900         int             trim_mask = 0;
2901         uint64_t        new_mode;
2902         uint64_t        new_uid, new_gid;
2903         uint64_t        xattr_obj;
2904         uint64_t        mtime[2], ctime[2];
2905         znode_t         *attrzp;
2906         int             need_policy = FALSE;
2907         int             err, err2;
2908         zfs_fuid_info_t *fuidp = NULL;
2909         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
2910         xoptattr_t      *xoap;
2911         zfs_acl_t       *aclp;
2912         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2913         boolean_t       fuid_dirtied = B_FALSE;
2914         sa_bulk_attr_t  bulk[7], xattr_bulk[7];
2915         int             count = 0, xattr_count = 0;
2916
2917         if (mask == 0)
2918                 return (0);
2919
2920         if (mask & AT_NOSET)
2921                 return (EINVAL);
2922
2923         ZFS_ENTER(zfsvfs);
2924         ZFS_VERIFY_ZP(zp);
2925
2926         zilog = zfsvfs->z_log;
2927
2928         /*
2929          * Make sure that if we have ephemeral uid/gid or xvattr specified
2930          * that file system is at proper version level
2931          */
2932
2933         if (zfsvfs->z_use_fuids == B_FALSE &&
2934             (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2935             ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2936             (mask & AT_XVATTR))) {
2937                 ZFS_EXIT(zfsvfs);
2938                 return (EINVAL);
2939         }
2940
2941         if (mask & AT_SIZE && vp->v_type == VDIR) {
2942                 ZFS_EXIT(zfsvfs);
2943                 return (EISDIR);
2944         }
2945
2946         if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2947                 ZFS_EXIT(zfsvfs);
2948                 return (EINVAL);
2949         }
2950
2951         /*
2952          * If this is an xvattr_t, then get a pointer to the structure of
2953          * optional attributes.  If this is NULL, then we have a vattr_t.
2954          */
2955         xoap = xva_getxoptattr(xvap);
2956
2957         xva_init(&tmpxvattr);
2958
2959         /*
2960          * Immutable files can only alter immutable bit and atime
2961          */
2962         if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2963             ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2964             ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2965                 ZFS_EXIT(zfsvfs);
2966                 return (EPERM);
2967         }
2968
2969         if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2970                 ZFS_EXIT(zfsvfs);
2971                 return (EPERM);
2972         }
2973
2974         /*
2975          * Verify timestamps doesn't overflow 32 bits.
2976          * ZFS can handle large timestamps, but 32bit syscalls can't
2977          * handle times greater than 2039.  This check should be removed
2978          * once large timestamps are fully supported.
2979          */
2980         if (mask & (AT_ATIME | AT_MTIME)) {
2981                 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2982                     ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2983                         ZFS_EXIT(zfsvfs);
2984                         return (EOVERFLOW);
2985                 }
2986         }
2987
2988 top:
2989         attrzp = NULL;
2990         aclp = NULL;
2991
2992         /* Can this be moved to before the top label? */
2993         if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2994                 ZFS_EXIT(zfsvfs);
2995                 return (EROFS);
2996         }
2997
2998         /*
2999          * First validate permissions
3000          */
3001
3002         if (mask & AT_SIZE) {
3003                 /*
3004                  * XXX - Note, we are not providing any open
3005                  * mode flags here (like FNDELAY), so we may
3006                  * block if there are locks present... this
3007                  * should be addressed in openat().
3008                  */
3009                 /* XXX - would it be OK to generate a log record here? */
3010                 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
3011                 if (err) {
3012                         ZFS_EXIT(zfsvfs);
3013                         return (err);
3014                 }
3015         }
3016
3017         if (mask & (AT_ATIME|AT_MTIME) ||
3018             ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
3019             XVA_ISSET_REQ(xvap, XAT_READONLY) ||
3020             XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
3021             XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
3022             XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
3023             XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
3024             XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
3025                 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
3026                     skipaclchk, cr);
3027         }
3028
3029         if (mask & (AT_UID|AT_GID)) {
3030                 int     idmask = (mask & (AT_UID|AT_GID));
3031                 int     take_owner;
3032                 int     take_group;
3033
3034                 /*
3035                  * NOTE: even if a new mode is being set,
3036                  * we may clear S_ISUID/S_ISGID bits.
3037                  */
3038
3039                 if (!(mask & AT_MODE))
3040                         vap->va_mode = zp->z_mode;
3041
3042                 /*
3043                  * Take ownership or chgrp to group we are a member of
3044                  */
3045
3046                 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3047                 take_group = (mask & AT_GID) &&
3048                     zfs_groupmember(zfsvfs, vap->va_gid, cr);
3049
3050                 /*
3051                  * If both AT_UID and AT_GID are set then take_owner and
3052                  * take_group must both be set in order to allow taking
3053                  * ownership.
3054                  *
3055                  * Otherwise, send the check through secpolicy_vnode_setattr()
3056                  *
3057                  */
3058
3059                 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3060                     ((idmask == AT_UID) && take_owner) ||
3061                     ((idmask == AT_GID) && take_group)) {
3062                         if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3063                             skipaclchk, cr) == 0) {
3064                                 /*
3065                                  * Remove setuid/setgid for non-privileged users
3066                                  */
3067                                 secpolicy_setid_clear(vap, vp, cr);
3068                                 trim_mask = (mask & (AT_UID|AT_GID));
3069                         } else {
3070                                 need_policy =  TRUE;
3071                         }
3072                 } else {
3073                         need_policy =  TRUE;
3074                 }
3075         }
3076
3077         mutex_enter(&zp->z_lock);
3078         oldva.va_mode = zp->z_mode;
3079         zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3080         if (mask & AT_XVATTR) {
3081                 /*
3082                  * Update xvattr mask to include only those attributes
3083                  * that are actually changing.
3084                  *
3085                  * the bits will be restored prior to actually setting
3086                  * the attributes so the caller thinks they were set.
3087                  */
3088                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3089                         if (xoap->xoa_appendonly !=
3090                             ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3091                                 need_policy = TRUE;
3092                         } else {
3093                                 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3094                                 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3095                         }
3096                 }
3097
3098                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3099                         if (xoap->xoa_nounlink !=
3100                             ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3101                                 need_policy = TRUE;
3102                         } else {
3103                                 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3104                                 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3105                         }
3106                 }
3107
3108                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3109                         if (xoap->xoa_immutable !=
3110                             ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3111                                 need_policy = TRUE;
3112                         } else {
3113                                 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3114                                 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3115                         }
3116                 }
3117
3118                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3119                         if (xoap->xoa_nodump !=
3120                             ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3121                                 need_policy = TRUE;
3122                         } else {
3123                                 XVA_CLR_REQ(xvap, XAT_NODUMP);
3124                                 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3125                         }
3126                 }
3127
3128                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3129                         if (xoap->xoa_av_modified !=
3130                             ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3131                                 need_policy = TRUE;
3132                         } else {
3133                                 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3134                                 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3135                         }
3136                 }
3137
3138                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3139                         if ((vp->v_type != VREG &&
3140                             xoap->xoa_av_quarantined) ||
3141                             xoap->xoa_av_quarantined !=
3142                             ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3143                                 need_policy = TRUE;
3144                         } else {
3145                                 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3146                                 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3147                         }
3148                 }
3149
3150                 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3151                         mutex_exit(&zp->z_lock);
3152                         ZFS_EXIT(zfsvfs);
3153                         return (EPERM);
3154                 }
3155
3156                 if (need_policy == FALSE &&
3157                     (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3158                     XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3159                         need_policy = TRUE;
3160                 }
3161         }
3162
3163         mutex_exit(&zp->z_lock);
3164
3165         if (mask & AT_MODE) {
3166                 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3167                         err = secpolicy_setid_setsticky_clear(vp, vap,
3168                             &oldva, cr);
3169                         if (err) {
3170                                 ZFS_EXIT(zfsvfs);
3171                                 return (err);
3172                         }
3173                         trim_mask |= AT_MODE;
3174                 } else {
3175                         need_policy = TRUE;
3176                 }
3177         }
3178
3179         if (need_policy) {
3180                 /*
3181                  * If trim_mask is set then take ownership
3182                  * has been granted or write_acl is present and user
3183                  * has the ability to modify mode.  In that case remove
3184                  * UID|GID and or MODE from mask so that
3185                  * secpolicy_vnode_setattr() doesn't revoke it.
3186                  */
3187
3188                 if (trim_mask) {
3189                         saved_mask = vap->va_mask;
3190                         vap->va_mask &= ~trim_mask;
3191                         if (trim_mask & AT_MODE) {
3192                                 /*
3193                                  * Save the mode, as secpolicy_vnode_setattr()
3194                                  * will overwrite it with ova.va_mode.
3195                                  */
3196                                 saved_mode = vap->va_mode;
3197                         }
3198                 }
3199                 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3200                     (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3201                 if (err) {
3202                         ZFS_EXIT(zfsvfs);
3203                         return (err);
3204                 }
3205
3206                 if (trim_mask) {
3207                         vap->va_mask |= saved_mask;
3208                         if (trim_mask & AT_MODE) {
3209                                 /*
3210                                  * Recover the mode after
3211                                  * secpolicy_vnode_setattr().
3212                                  */
3213                                 vap->va_mode = saved_mode;
3214                         }
3215                 }
3216         }
3217
3218         /*
3219          * secpolicy_vnode_setattr, or take ownership may have
3220          * changed va_mask
3221          */
3222         mask = vap->va_mask;
3223
3224         if ((mask & (AT_UID | AT_GID))) {
3225                 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3226                     &xattr_obj, sizeof (xattr_obj));
3227
3228                 if (err == 0 && xattr_obj) {
3229                         err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3230                         if (err)
3231                                 goto out2;
3232                 }
3233                 if (mask & AT_UID) {
3234                         new_uid = zfs_fuid_create(zfsvfs,
3235                             (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3236                         if (new_uid != zp->z_uid &&
3237                             zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3238                                 if (attrzp)
3239                                         VN_RELE(ZTOV(attrzp));
3240                                 err = EDQUOT;
3241                                 goto out2;
3242                         }
3243                 }
3244
3245                 if (mask & AT_GID) {
3246                         new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3247                             cr, ZFS_GROUP, &fuidp);
3248                         if (new_gid != zp->z_gid &&
3249                             zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3250                                 if (attrzp)
3251                                         VN_RELE(ZTOV(attrzp));
3252                                 err = EDQUOT;
3253                                 goto out2;
3254                         }
3255                 }
3256         }
3257         tx = dmu_tx_create(zfsvfs->z_os);
3258
3259         if (mask & AT_MODE) {
3260                 uint64_t pmode = zp->z_mode;
3261                 uint64_t acl_obj;
3262                 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3263
3264                 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3265                     !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3266                         err = EPERM;
3267                         goto out;
3268                 }
3269
3270                 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3271                         goto out;
3272
3273                 mutex_enter(&zp->z_lock);
3274                 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3275                         /*
3276                          * Are we upgrading ACL from old V0 format
3277                          * to V1 format?
3278                          */
3279                         if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3280                             zfs_znode_acl_version(zp) ==
3281                             ZFS_ACL_VERSION_INITIAL) {
3282                                 dmu_tx_hold_free(tx, acl_obj, 0,
3283                                     DMU_OBJECT_END);
3284                                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3285                                     0, aclp->z_acl_bytes);
3286                         } else {
3287                                 dmu_tx_hold_write(tx, acl_obj, 0,
3288                                     aclp->z_acl_bytes);
3289                         }
3290                 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3291                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3292                             0, aclp->z_acl_bytes);
3293                 }
3294                 mutex_exit(&zp->z_lock);
3295                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3296         } else {
3297                 if ((mask & AT_XVATTR) &&
3298                     XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3299                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3300                 else
3301                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3302         }
3303
3304         if (attrzp) {
3305                 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3306         }
3307
3308         fuid_dirtied = zfsvfs->z_fuid_dirty;
3309         if (fuid_dirtied)
3310                 zfs_fuid_txhold(zfsvfs, tx);
3311
3312         zfs_sa_upgrade_txholds(tx, zp);
3313
3314         err = dmu_tx_assign(tx, TXG_NOWAIT);
3315         if (err) {
3316                 if (err == ERESTART)
3317                         dmu_tx_wait(tx);
3318                 goto out;
3319         }
3320
3321         count = 0;
3322         /*
3323          * Set each attribute requested.
3324          * We group settings according to the locks they need to acquire.
3325          *
3326          * Note: you cannot set ctime directly, although it will be
3327          * updated as a side-effect of calling this function.
3328          */
3329
3330
3331         if (mask & (AT_UID|AT_GID|AT_MODE))
3332                 mutex_enter(&zp->z_acl_lock);
3333         mutex_enter(&zp->z_lock);
3334
3335         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3336             &zp->z_pflags, sizeof (zp->z_pflags));
3337
3338         if (attrzp) {
3339                 if (mask & (AT_UID|AT_GID|AT_MODE))
3340                         mutex_enter(&attrzp->z_acl_lock);
3341                 mutex_enter(&attrzp->z_lock);
3342                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3343                     SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3344                     sizeof (attrzp->z_pflags));
3345         }
3346
3347         if (mask & (AT_UID|AT_GID)) {
3348
3349                 if (mask & AT_UID) {
3350                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3351                             &new_uid, sizeof (new_uid));
3352                         zp->z_uid = new_uid;
3353                         if (attrzp) {
3354                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3355                                     SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3356                                     sizeof (new_uid));
3357                                 attrzp->z_uid = new_uid;
3358                         }
3359                 }
3360
3361                 if (mask & AT_GID) {
3362                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3363                             NULL, &new_gid, sizeof (new_gid));
3364                         zp->z_gid = new_gid;
3365                         if (attrzp) {
3366                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3367                                     SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3368                                     sizeof (new_gid));
3369                                 attrzp->z_gid = new_gid;
3370                         }
3371                 }
3372                 if (!(mask & AT_MODE)) {
3373                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3374                             NULL, &new_mode, sizeof (new_mode));
3375                         new_mode = zp->z_mode;
3376                 }
3377                 err = zfs_acl_chown_setattr(zp);
3378                 ASSERT(err == 0);
3379                 if (attrzp) {
3380                         err = zfs_acl_chown_setattr(attrzp);
3381                         ASSERT(err == 0);
3382                 }
3383         }
3384
3385         if (mask & AT_MODE) {
3386                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3387                     &new_mode, sizeof (new_mode));
3388                 zp->z_mode = new_mode;
3389                 ASSERT3U((uintptr_t)aclp, !=, 0);
3390                 err = zfs_aclset_common(zp, aclp, cr, tx);
3391                 ASSERT0(err);
3392                 if (zp->z_acl_cached)
3393                         zfs_acl_free(zp->z_acl_cached);
3394                 zp->z_acl_cached = aclp;
3395                 aclp = NULL;
3396         }
3397
3398
3399         if (mask & AT_ATIME) {
3400                 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3401                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3402                     &zp->z_atime, sizeof (zp->z_atime));
3403         }
3404
3405         if (mask & AT_MTIME) {
3406                 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3407                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3408                     mtime, sizeof (mtime));
3409         }
3410
3411         /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3412         if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3413                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3414                     NULL, mtime, sizeof (mtime));
3415                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3416                     &ctime, sizeof (ctime));
3417                 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3418                     B_TRUE);
3419         } else if (mask != 0) {
3420                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3421                     &ctime, sizeof (ctime));
3422                 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3423                     B_TRUE);
3424                 if (attrzp) {
3425                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3426                             SA_ZPL_CTIME(zfsvfs), NULL,
3427                             &ctime, sizeof (ctime));
3428                         zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3429                             mtime, ctime, B_TRUE);
3430                 }
3431         }
3432         /*
3433          * Do this after setting timestamps to prevent timestamp
3434          * update from toggling bit
3435          */
3436
3437         if (xoap && (mask & AT_XVATTR)) {
3438
3439                 /*
3440                  * restore trimmed off masks
3441                  * so that return masks can be set for caller.
3442                  */
3443
3444                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3445                         XVA_SET_REQ(xvap, XAT_APPENDONLY);
3446                 }
3447                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3448                         XVA_SET_REQ(xvap, XAT_NOUNLINK);
3449                 }
3450                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3451                         XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3452                 }
3453                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3454                         XVA_SET_REQ(xvap, XAT_NODUMP);
3455                 }
3456                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3457                         XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3458                 }
3459                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3460                         XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3461                 }
3462
3463                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3464                         ASSERT(vp->v_type == VREG);
3465
3466                 zfs_xvattr_set(zp, xvap, tx);
3467         }
3468
3469         if (fuid_dirtied)
3470                 zfs_fuid_sync(zfsvfs, tx);
3471
3472         if (mask != 0)
3473                 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3474
3475         mutex_exit(&zp->z_lock);
3476         if (mask & (AT_UID|AT_GID|AT_MODE))
3477                 mutex_exit(&zp->z_acl_lock);
3478
3479         if (attrzp) {
3480                 if (mask & (AT_UID|AT_GID|AT_MODE))
3481                         mutex_exit(&attrzp->z_acl_lock);
3482                 mutex_exit(&attrzp->z_lock);
3483         }
3484 out:
3485         if (err == 0 && attrzp) {
3486                 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3487                     xattr_count, tx);
3488                 ASSERT(err2 == 0);
3489         }
3490
3491         if (attrzp)
3492                 VN_RELE(ZTOV(attrzp));
3493         if (aclp)
3494                 zfs_acl_free(aclp);
3495
3496         if (fuidp) {
3497                 zfs_fuid_info_free(fuidp);
3498                 fuidp = NULL;
3499         }
3500
3501         if (err) {
3502                 dmu_tx_abort(tx);
3503                 if (err == ERESTART)
3504                         goto top;
3505         } else {
3506                 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3507                 dmu_tx_commit(tx);
3508         }
3509
3510 out2:
3511         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3512                 zil_commit(zilog, 0);
3513
3514         ZFS_EXIT(zfsvfs);
3515         return (err);
3516 }
3517
3518 typedef struct zfs_zlock {
3519         krwlock_t       *zl_rwlock;     /* lock we acquired */
3520         znode_t         *zl_znode;      /* znode we held */
3521         struct zfs_zlock *zl_next;      /* next in list */
3522 } zfs_zlock_t;
3523
3524 /*
3525  * Drop locks and release vnodes that were held by zfs_rename_lock().
3526  */
3527 static void
3528 zfs_rename_unlock(zfs_zlock_t **zlpp)
3529 {
3530         zfs_zlock_t *zl;
3531
3532         while ((zl = *zlpp) != NULL) {
3533                 if (zl->zl_znode != NULL)
3534                         VN_RELE(ZTOV(zl->zl_znode));
3535                 rw_exit(zl->zl_rwlock);
3536                 *zlpp = zl->zl_next;
3537                 kmem_free(zl, sizeof (*zl));
3538         }
3539 }
3540
3541 /*
3542  * Search back through the directory tree, using the ".." entries.
3543  * Lock each directory in the chain to prevent concurrent renames.
3544  * Fail any attempt to move a directory into one of its own descendants.
3545  * XXX - z_parent_lock can overlap with map or grow locks
3546  */
3547 static int
3548 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3549 {
3550         zfs_zlock_t     *zl;
3551         znode_t         *zp = tdzp;
3552         uint64_t        rootid = zp->z_zfsvfs->z_root;
3553         uint64_t        oidp = zp->z_id;
3554         krwlock_t       *rwlp = &szp->z_parent_lock;
3555         krw_t           rw = RW_WRITER;
3556
3557         /*
3558          * First pass write-locks szp and compares to zp->z_id.
3559          * Later passes read-lock zp and compare to zp->z_parent.
3560          */
3561         do {
3562                 if (!rw_tryenter(rwlp, rw)) {
3563                         /*
3564                          * Another thread is renaming in this path.
3565                          * Note that if we are a WRITER, we don't have any
3566                          * parent_locks held yet.
3567                          */
3568                         if (rw == RW_READER && zp->z_id > szp->z_id) {
3569                                 /*
3570                                  * Drop our locks and restart
3571                                  */
3572                                 zfs_rename_unlock(&zl);
3573                                 *zlpp = NULL;
3574                                 zp = tdzp;
3575                                 oidp = zp->z_id;
3576                                 rwlp = &szp->z_parent_lock;
3577                                 rw = RW_WRITER;
3578                                 continue;
3579                         } else {
3580                                 /*
3581                                  * Wait for other thread to drop its locks
3582                                  */
3583                                 rw_enter(rwlp, rw);
3584                         }
3585                 }
3586
3587                 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3588                 zl->zl_rwlock = rwlp;
3589                 zl->zl_znode = NULL;
3590                 zl->zl_next = *zlpp;
3591                 *zlpp = zl;
3592
3593                 if (oidp == szp->z_id)          /* We're a descendant of szp */
3594                         return (EINVAL);
3595
3596                 if (oidp == rootid)             /* We've hit the top */
3597                         return (0);
3598
3599                 if (rw == RW_READER) {          /* i.e. not the first pass */
3600                         int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
3601                         if (error)
3602                                 return (error);
3603                         zl->zl_znode = zp;
3604                 }
3605                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
3606                     &oidp, sizeof (oidp));
3607                 rwlp = &zp->z_parent_lock;
3608                 rw = RW_READER;
3609
3610         } while (zp->z_id != sdzp->z_id);
3611
3612         return (0);
3613 }
3614
3615 /*
3616  * Move an entry from the provided source directory to the target
3617  * directory.  Change the entry name as indicated.
3618  *
3619  *      IN:     sdvp    - Source directory containing the "old entry".
3620  *              snm     - Old entry name.
3621  *              tdvp    - Target directory to contain the "new entry".
3622  *              tnm     - New entry name.
3623  *              cr      - credentials of caller.
3624  *              ct      - caller context
3625  *              flags   - case flags (FIGNORECASE is the only bit tested)
3626  *
3627  *      RETURN: 0 if success
3628  *              error code if failure
      *
      *      Errors produced directly by this function (callees may return
      *      others):
      *              EXDEV   - tdvp is on a different vfs than sdvp, or is a
      *                        zfsctl node.
      *              EILSEQ  - z_utf8 is set and tnm fails u8_validate().
      *              EINVAL  - the two directories disagree on ZFS_XATTR;
      *                        the source lookup failed and snm is "." or
      *                        ".."; the target lookup failed and tnm is
      *                        "..".
      *              ENOTDIR - source is a directory, existing target is not.
      *              EISDIR  - existing target is a directory, source is not.
      *
      *      0 is returned without changing anything when both directories
      *      have the same object id and the names compare equal, or when
      *      the source and an existing target are the same object.
      *
      *      ERESTART from dmu_tx_assign() is not returned to the caller;
      *      all locks and holds are dropped and the operation restarts
      *      from the "top" label.
3629  *
3630  * Timestamps:
3631  *      sdvp,tdvp - ctime|mtime updated
3632  */
3633 /*ARGSUSED*/
3634 static int
3635 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3636     caller_context_t *ct, int flags)
3637 {
3638         znode_t         *tdzp, *szp, *tzp;
3639         znode_t         *sdzp = VTOZ(sdvp);
3640         zfsvfs_t        *zfsvfs = sdzp->z_zfsvfs;
3641         zilog_t         *zilog;
3642         vnode_t         *realvp;
3643         zfs_dirlock_t   *sdl, *tdl;     /* source / target dirent locks */
3644         dmu_tx_t        *tx;
3645         zfs_zlock_t     *zl;            /* chain from zfs_rename_lock() */
3646         int             cmp, serr, terr; /* lock order; dirent_lock results */
3647         int             error = 0;
3648         int             zflg = 0;       /* flags for zfs_dirent_lock() */
3649
3650         ZFS_ENTER(zfsvfs);
3651         ZFS_VERIFY_ZP(sdzp);
3652         zilog = zfsvfs->z_log;
3653
3654         /*
3655          * Make sure we have the real vp for the target directory.
3656          */
3657         if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3658                 tdvp = realvp;
3659
             /*
              * The target directory must be on the same vfs as the source
              * directory and must not be a zfsctl node.
              */
3660         if (tdvp->v_vfsp != sdvp->v_vfsp || zfsctl_is_node(tdvp)) {
3661                 ZFS_EXIT(zfsvfs);
3662                 return (EXDEV);
3663         }
3664
3665         tdzp = VTOZ(tdvp);
3666         ZFS_VERIFY_ZP(tdzp);
             /*
              * When z_utf8 is set, refuse a new name that u8_validate()
              * does not accept.
              */
3667         if (zfsvfs->z_utf8 && u8_validate(tnm,
3668             strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3669                 ZFS_EXIT(zfsvfs);
3670                 return (EILSEQ);
3671         }
3672
3673         if (flags & FIGNORECASE)
3674                 zflg |= ZCILOOK;
3675
3676 top:
             /*
              * Retry point.  The ERESTART path below releases every lock
              * and hold before jumping back here, so the per-attempt
              * pointers are reset.
              */
3677         szp = NULL;
3678         tzp = NULL;
3679         zl = NULL;
3680
3681         /*
3682          * This is to prevent the creation of links into attribute space
3683          * by renaming a linked file into/out of an attribute directory.
3684          * See the comment in zfs_link() for why this is considered bad.
3685          */
3686         if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3687                 ZFS_EXIT(zfsvfs);
3688                 return (EINVAL);
3689         }
3690
3691         /*
3692          * Lock source and target directory entries.  To prevent deadlock,
3693          * a lock ordering must be defined.  We lock the directory with
3694          * the smallest object id first, or if it's a tie, the one with
3695          * the lexically first name.
3696          */
3697         if (sdzp->z_id < tdzp->z_id) {
3698                 cmp = -1;
3699         } else if (sdzp->z_id > tdzp->z_id) {
3700                 cmp = 1;
3701         } else {
3702                 /*
3703                  * First compare the two name arguments without
3704                  * considering any case folding.
3705                  */
3706                 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3707
3708                 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3709                 ASSERT(error == 0 || !zfsvfs->z_utf8);
3710                 if (cmp == 0) {
3711                         /*
3712                          * POSIX: "If the old argument and the new argument
3713                          * both refer to links to the same existing file,
3714                          * the rename() function shall return successfully
3715                          * and perform no other action."
3716                          */
3717                         ZFS_EXIT(zfsvfs);
3718                         return (0);
3719                 }
3720                 /*
3721                  * If the file system is case-folding, then we may
3722                  * have some more checking to do.  A case-folding file
3723                  * system is either supporting mixed case sensitivity
3724                  * access or is completely case-insensitive.  Note
3725                  * that the file system is always case preserving.
3726                  *
3727                  * In mixed sensitivity mode case sensitive behavior
3728                  * is the default.  FIGNORECASE must be used to
3729                  * explicitly request case insensitive behavior.
3730                  *
3731                  * If the source and target names provided differ only
3732                  * by case (e.g., a request to rename 'tim' to 'Tim'),
3733                  * we will treat this as a special case in the
3734                  * case-insensitive mode: as long as the source name
3735                  * is an exact match, we will allow this to proceed as
3736                  * a name-change request.
3737                  */
3738                 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3739                     (zfsvfs->z_case == ZFS_CASE_MIXED &&
3740                     flags & FIGNORECASE)) &&
3741                     u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3742                     &error) == 0) {
3743                         /*
3744                          * case preserving rename request, require exact
3745                          * name matches
3746                          */
3747                         zflg |= ZCIEXACT;
3748                         zflg &= ~ZCILOOK;
3749                 }
3750         }
3751
3752         /*
3753          * If the source and destination directories are the same, we should
3754          * grab the z_name_lock of that directory only once.
3755          */
3756         if (sdzp == tdzp) {
3757                 zflg |= ZHAVELOCK;
3758                 rw_enter(&sdzp->z_name_lock, RW_READER);
3759         }
3760
             /*
              * Take the two dirent locks in the order decided above.  The
              * source lookup always carries ZEXISTS; whichever entry is
              * locked second additionally carries ZRENAMING.
              */
3761         if (cmp < 0) {
3762                 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3763                     ZEXISTS | zflg, NULL, NULL);
3764                 terr = zfs_dirent_lock(&tdl,
3765                     tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3766         } else {
3767                 terr = zfs_dirent_lock(&tdl,
3768                     tdzp, tnm, &tzp, zflg, NULL, NULL);
3769                 serr = zfs_dirent_lock(&sdl,
3770                     sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3771                     NULL, NULL);
3772         }
3773
3774         if (serr) {
3775                 /*
3776                  * Source entry invalid or not there.
3777                  */
3778                 if (!terr) {
3779                         zfs_dirent_unlock(tdl);
3780                         if (tzp)
3781                                 VN_RELE(ZTOV(tzp));
3782                 }
3783
3784                 if (sdzp == tdzp)
3785                         rw_exit(&sdzp->z_name_lock);
3786
3787                 /*
3788                  * FreeBSD: In OpenSolaris they only check if rename source is
3789                  * ".." here, because "." is handled in their lookup. This is
3790                  * not the case for FreeBSD, so we check for "." explicitly.
3791                  */
3792                 if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
3793                         serr = EINVAL;
3794                 ZFS_EXIT(zfsvfs);
3795                 return (serr);
3796         }
3797         if (terr) {
                     /*
                      * Target lookup failed after the source succeeded:
                      * release the source dirent lock and its vnode hold.
                      */
3798                 zfs_dirent_unlock(sdl);
3799                 VN_RELE(ZTOV(szp));
3800
3801                 if (sdzp == tdzp)
3802                         rw_exit(&sdzp->z_name_lock);
3803
                     /* A target name of ".." is reported as EINVAL. */
3804                 if (strcmp(tnm, "..") == 0)
3805                         terr = EINVAL;
3806                 ZFS_EXIT(zfsvfs);
3807                 return (terr);
3808         }
3809
3810         /*
3811          * Must have write access at the source to remove the old entry
3812          * and write access at the target to create the new entry.
3813          * Note that if target and source are the same, this can be
3814          * done in a single check.
3815          */
3816
3817         if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3818                 goto out;
3819
3820         if (ZTOV(szp)->v_type == VDIR) {
3821                 /*
3822                  * Check to make sure rename is valid.
3823                  * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3824                  */
3825                 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3826                         goto out;
3827         }
3828
3829         /*
3830          * Does target exist?
3831          */
3832         if (tzp) {
3833                 /*
3834                  * Source and target must be the same type.
3835                  */
3836                 if (ZTOV(szp)->v_type == VDIR) {
3837                         if (ZTOV(tzp)->v_type != VDIR) {
3838                                 error = ENOTDIR;
3839                                 goto out;
3840                         }
3841                 } else {
3842                         if (ZTOV(tzp)->v_type == VDIR) {
3843                                 error = EISDIR;
3844                                 goto out;
3845                         }
3846                 }
3847                 /*
3848                  * POSIX dictates that when the source and target
3849                  * entries refer to the same file object, rename
3850                  * must do nothing and exit without error.
3851                  */
3852                 if (szp->z_id == tzp->z_id) {
3853                         error = 0;
3854                         goto out;
3855                 }
3856         }
3857
             /*
              * Post rename vnevents for the source and, if present, the
              * existing target.
              */
3858         vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
3859         if (tzp)
3860                 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3861
3862         /*
3863          * notify the target directory if it is not the same
3864          * as source directory.
3865          */
3866         if (tdvp != sdvp) {
3867                 vnevent_rename_dest_dir(tdvp, ct);
3868         }
3869
             /*
              * Build the transaction: SA holds on the renamed object, the
              * source directory, the target directory (when distinct) and
              * any existing target; ZAP holds for snm in the source
              * directory, tnm in the target directory, and z_unlinkedobj.
              * NOTE(review): the z_unlinkedobj hold is presumably needed
              * because destroying an existing target can add it there --
              * confirm against zfs_link_destroy().
              */
3870         tx = dmu_tx_create(zfsvfs->z_os);
3871         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3872         dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3873         dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3874         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3875         if (sdzp != tdzp) {
3876                 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3877                 zfs_sa_upgrade_txholds(tx, tdzp);
3878         }
3879         if (tzp) {
3880                 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3881                 zfs_sa_upgrade_txholds(tx, tzp);
3882         }
3883
3884         zfs_sa_upgrade_txholds(tx, szp);
3885         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3886         error = dmu_tx_assign(tx, TXG_NOWAIT);
3887         if (error) {
                     /*
                      * The tx could not be assigned.  Release everything
                      * acquired during this attempt; on ERESTART wait with
                      * dmu_tx_wait(), abort the tx and retry from top,
                      * otherwise abort and fail with the assign error.
                      */
3888                 if (zl != NULL)
3889                         zfs_rename_unlock(&zl);
3890                 zfs_dirent_unlock(sdl);
3891                 zfs_dirent_unlock(tdl);
3892
3893                 if (sdzp == tdzp)
3894                         rw_exit(&sdzp->z_name_lock);
3895
3896                 VN_RELE(ZTOV(szp));
3897                 if (tzp)
3898                         VN_RELE(ZTOV(tzp));
3899                 if (error == ERESTART) {
3900                         dmu_tx_wait(tx);
3901                         dmu_tx_abort(tx);
3902                         goto top;
3903                 }
3904                 dmu_tx_abort(tx);
3905                 ZFS_EXIT(zfsvfs);
3906                 return (error);
3907         }
3908
3909         if (tzp)        /* Attempt to remove the existing target */
3910                 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3911
3912         if (error == 0) {
3913                 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3914                 if (error == 0) {
                             /*
                              * Set ZFS_AV_MODIFIED on the renamed object and
                              * write its flags word out in this tx.
                              */
3915                         szp->z_pflags |= ZFS_AV_MODIFIED;
3916
3917                         error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3918                             (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3919                         ASSERT0(error);
3920
3921                         error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3922                         if (error == 0) {
3923                                 zfs_log_rename(zilog, tx, TX_RENAME |
3924                                     (flags & FIGNORECASE ? TX_CI : 0), sdzp,
3925                                     sdl->dl_name, tdzp, tdl->dl_name, szp);
3926
3927                                 /*
3928                                  * Update path information for the target vnode
3929                                  */
3930                                 vn_renamepath(tdvp, ZTOV(szp), tnm,
3931                                     strlen(tnm));
3932                         } else {
3933                                 /*
3934                                  * At this point, we have successfully created
3935                                  * the target name, but have failed to remove
3936                                  * the source name.  Since the create was done
3937                                  * with the ZRENAMING flag, there are
3938                                  * complications; for one, the link count is
3939                                  * wrong.  The easiest way to deal with this
3940                                  * is to remove the newly created target, and
3941                                  * return the original error.  This must
3942                                  * succeed; fortunately, it is very unlikely to
3943                                  * fail, since we just created it.
3944                                  */
3945                                 VERIFY3U(zfs_link_destroy(tdl, szp, tx,
3946                                     ZRENAMING, NULL), ==, 0);
3947                         }
3948                 }
3949 #ifdef FREEBSD_NAMECACHE
                     /*
                      * On success, call cache_purge() on both directory
                      * vnodes (only when built with FREEBSD_NAMECACHE).
                      */
3950                 if (error == 0) {
3951                         cache_purge(sdvp);
3952                         cache_purge(tdvp);
3953                 }
3954 #endif
3955         }
3956
             /*
              * The tx is committed whether or not the link operations above
              * succeeded; error is still returned to the caller.
              */
3957         dmu_tx_commit(tx);
3958 out:
             /*
              * Common exit, also reached by the validation and no-op paths
              * above before any tx exists: release the parent-lock chain,
              * both dirent locks, the shared name lock and the vnode holds.
              */
3959         if (zl != NULL)
3960                 zfs_rename_unlock(&zl);
3961
3962         zfs_dirent_unlock(sdl);
3963         zfs_dirent_unlock(tdl);
3964
3965         if (sdzp == tdzp)
3966                 rw_exit(&sdzp->z_name_lock);
3967
3968
3969         VN_RELE(ZTOV(szp));
3970         if (tzp)
3971                 VN_RELE(ZTOV(tzp));
3972
             /*
              * When the objset is configured ZFS_SYNC_ALWAYS, zil_commit()
              * is called before returning.
              */
3973         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3974                 zil_commit(zilog, 0);
3975
3976         ZFS_EXIT(zfsvfs);
3977
3978         return (error);
3979 }
3980
3981 /*
3982  * Insert the indicated symbolic reference entry into the directory.
3983  *
3984  *      IN:     dvp     - Directory to contain new symbolic link.
3985  *              name    - Name for new symlink entry.
3986  *              vap     - Attributes of new entry.
3987  *              link    - Target path of new symlink.
3988  *              cr      - credentials of caller.
3989  *              ct      - caller context
3990  *              flags   - case flags
3991  *
3992  *      RETURN: 0 if success
3993  *              error code if failure
3994  *
3995  * Timestamps:
3996  *      dvp - ctime|mtime updated
3997  */
3998 /*ARGSUSED*/
3999 static int
4000 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
4001     cred_t *cr, kthread_t *td)
4002 {
4003         znode_t         *zp, *dzp = VTOZ(dvp);
4004         zfs_dirlock_t   *dl;
4005         dmu_tx_t        *tx;
4006         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
4007         zilog_t         *zilog;
4008         uint64_t        len = strlen(link);
4009         int             error;
4010         int             zflg = ZNEW;
4011         zfs_acl_ids_t   acl_ids;
4012         boolean_t       fuid_dirtied;
4013         uint64_t        txtype = TX_SYMLINK;
4014         int             flags = 0;
4015
4016         ASSERT(vap->va_type == VLNK);
4017
4018         ZFS_ENTER(zfsvfs);
4019         ZFS_VERIFY_ZP(dzp);
4020         zilog = zfsvfs->z_log;
4021
4022         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4023             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4024                 ZFS_EXIT(zfsvfs);
4025                 return (EILSEQ);
4026         }
4027         if (flags & FIGNORECASE)
4028                 zflg |= ZCILOOK;
4029
4030         if (len > MAXPATHLEN) {
4031                 ZFS_EXIT(zfsvfs);
4032                 return (ENAMETOOLONG);
4033         }
4034
4035         if ((error = zfs_acl_ids_create(dzp, 0,
4036             vap, cr, NULL, &acl_ids)) != 0) {
4037                 ZFS_EXIT(zfsvfs);
4038                 return (error);
4039         }
4040 top:
4041         /*
4042          * Attempt to lock directory; fail if entry already exists.
4043          */
4044         error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
4045         if (error) {
4046                 zfs_acl_ids_free(&acl_ids);
4047                 ZFS_EXIT(zfsvfs);
4048                 return (error);
4049         }
4050
4051         if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4052                 zfs_acl_ids_free(&acl_ids);
4053                 zfs_dirent_unlock(dl);
4054                 ZFS_EXIT(zfsvfs);
4055                 return (error);
4056         }
4057
4058         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4059                 zfs_acl_ids_free(&acl_ids);
4060                 zfs_dirent_unlock(dl);
4061                 ZFS_EXIT(zfsvfs);
4062                 return (EDQUOT);
4063         }
4064         tx = dmu_tx_create(zfsvfs->z_os);
4065         fuid_dirtied = zfsvfs->z_fuid_dirty;
4066         dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4067         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4068         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4069             ZFS_SA_BASE_ATTR_SIZE + len);
4070         dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4071         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4072                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4073                     acl_ids.z_aclp->z_acl_bytes);
4074         }
4075         if (fuid_dirtied)
4076                 zfs_fuid_txhold(zfsvfs, tx);
4077         error = dmu_tx_assign(tx, TXG_NOWAIT);
4078         if (error) {
4079                 zfs_dirent_unlock(dl);
4080                 if (error == ERESTART) {
4081                         dmu_tx_wait(tx);
4082                         dmu_tx_abort(tx);
4083                         goto top;
4084                 }
4085                 zfs_acl_ids_free(&acl_ids);
4086                 dmu_tx_abort(tx);
4087                 ZFS_EXIT(zfsvfs);
4088                 return (error);
4089         }
4090
4091         /*
4092          * Create a new object for the symlink.
4093          * for version 4 ZPL datsets the symlink will be an SA attribute
4094          */
4095         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4096
4097         if (fuid_dirtied)
4098                 zfs_fuid_sync(zfsvfs, tx);
4099
4100         mutex_enter(&zp->z_lock);
4101         if (zp->z_is_sa)
4102                 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4103                     link, len, tx);
4104         else
4105                 zfs_sa_symlink(zp, link, len, tx);
4106         mutex_exit(&zp->z_lock);
4107
4108         zp->z_size = len;
4109         (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4110             &zp->z_size, sizeof (zp->z_size), tx);
4111         /*
4112          * Insert the new object into the directory.
4113          */
4114         (void) zfs_link_create(dl, zp, tx, ZNEW);
4115
4116         if (flags & FIGNORECASE)
4117                 txtype |= TX_CI;
4118         zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4119         *vpp = ZTOV(zp);
4120
4121         zfs_acl_ids_free(&acl_ids);
4122
4123         dmu_tx_commit(tx);
4124
4125         zfs_dirent_unlock(dl);
4126
4127         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4128                 zil_commit(zilog, 0);
4129
4130         ZFS_EXIT(zfsvfs);
4131         return (error);
4132 }
4133
4134 /*
4135  * Return, in the buffer contained in the provided uio structure,
4136  * the symbolic path referred to by vp.
4137  *
4138  *      IN:     vp      - vnode of symbolic link.
4139  *              uoip    - structure to contain the link path.
4140  *              cr      - credentials of caller.
4141  *              ct      - caller context
4142  *
4143  *      OUT:    uio     - structure to contain the link path.
4144  *
4145  *      RETURN: 0 if success
4146  *              error code if failure
4147  *
4148  * Timestamps:
4149  *      vp - atime updated
4150  */
4151 /* ARGSUSED */
4152 static int
4153 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4154 {
4155         znode_t         *zp = VTOZ(vp);
4156         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
4157         int             error;
4158
4159         ZFS_ENTER(zfsvfs);
4160         ZFS_VERIFY_ZP(zp);
4161
4162         mutex_enter(&zp->z_lock);
4163         if (zp->z_is_sa)
4164                 error = sa_lookup_uio(zp->z_sa_hdl,
4165                     SA_ZPL_SYMLINK(zfsvfs), uio);
4166         else
4167                 error = zfs_sa_readlink(zp, uio);
4168         mutex_exit(&zp->z_lock);
4169
4170         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4171
4172         ZFS_EXIT(zfsvfs);
4173         return (error);
4174 }
4175
/*
 * Insert a new entry into directory tdvp referencing svp.
 *
 *      IN:     tdvp    - Directory to contain new entry.
 *              svp     - vnode of new entry.
 *              name    - name of new entry.
 *              cr      - credentials of caller.
 *              ct      - caller context
 *              flags   - case flags (FIGNORECASE selects a
 *                        case-insensitive lookup and log record)
 *
 *      RETURN: 0 if success
 *              EPERM if svp is a directory, lives under the shares
 *                  directory, or the caller may not link to it
 *              EXDEV if svp is on another file system or is a
 *                  control (.zfs) node
 *              EILSEQ if name is not valid UTF-8 on a utf8-only fs
 *              EINVAL if exactly one of svp/tdvp is an xattr object
 *              other error code if failure
 *
 * Timestamps:
 *      tdvp - ctime|mtime updated
 *       svp - ctime updated
 */
/* ARGSUSED */
static int
zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
    caller_context_t *ct, int flags)
{
        znode_t         *dzp = VTOZ(tdvp);
        znode_t         *tzp, *szp;
        zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
        zilog_t         *zilog;
        zfs_dirlock_t   *dl;
        dmu_tx_t        *tx;
        vnode_t         *realvp;
        int             error;
        int             zf = ZNEW;
        uint64_t        parent;
        uid_t           owner;

        ASSERT(tdvp->v_type == VDIR);

        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(dzp);
        zilog = zfsvfs->z_log;

        /* Operate on the underlying vnode if svp has one. */
        if (VOP_REALVP(svp, &realvp, ct) == 0)
                svp = realvp;

        /*
         * POSIX dictates that we return EPERM here.
         * Better choices include ENOTSUP or EISDIR.
         */
        if (svp->v_type == VDIR) {
                ZFS_EXIT(zfsvfs);
                return (EPERM);
        }

        /* Hard links cannot cross file systems or target control nodes. */
        if (svp->v_vfsp != tdvp->v_vfsp || zfsctl_is_node(svp)) {
                ZFS_EXIT(zfsvfs);
                return (EXDEV);
        }

        szp = VTOZ(svp);
        ZFS_VERIFY_ZP(szp);

        /* Prevent links to .zfs/shares files */

        if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
            &parent, sizeof (uint64_t))) != 0) {
                ZFS_EXIT(zfsvfs);
                return (error);
        }
        if (parent == zfsvfs->z_shares_dir) {
                ZFS_EXIT(zfsvfs);
                return (EPERM);
        }

        /* Reject entry names that are not valid UTF-8 when required. */
        if (zfsvfs->z_utf8 && u8_validate(name,
            strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
                ZFS_EXIT(zfsvfs);
                return (EILSEQ);
        }
        if (flags & FIGNORECASE)
                zf |= ZCILOOK;

        /*
         * We do not support links between attributes and non-attributes
         * because of the potential security risk of creating links
         * into "normal" file space in order to circumvent restrictions
         * imposed in attribute space.
         */
        if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
                ZFS_EXIT(zfsvfs);
                return (EINVAL);
        }


        /*
         * A caller that does not own the source file must pass the
         * basic-link security policy check.
         */
        owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
        if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
                ZFS_EXIT(zfsvfs);
                return (EPERM);
        }

        /* The caller needs permission to add a file to the directory. */
        if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
                ZFS_EXIT(zfsvfs);
                return (error);
        }

top:
        /*
         * Attempt to lock directory; fail if entry already exists.
         * tzp is only an output slot for zfs_dirent_lock(); it is not
         * used afterwards in this function.
         */
        error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
        if (error) {
                ZFS_EXIT(zfsvfs);
                return (error);
        }

        /* Hold the source SA and the directory ZAP entry for this tx. */
        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
        dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
        zfs_sa_upgrade_txholds(tx, szp);
        zfs_sa_upgrade_txholds(tx, dzp);
        error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
                /* Drop the directory lock before waiting or bailing out. */
                zfs_dirent_unlock(dl);
                if (error == ERESTART) {
                        /* Wait, then retry from the directory lock. */
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
                }
                dmu_tx_abort(tx);
                ZFS_EXIT(zfsvfs);
                return (error);
        }

        error = zfs_link_create(dl, szp, tx, 0);

        /* Only a successful link is recorded in the intent log. */
        if (error == 0) {
                uint64_t txtype = TX_LINK;
                if (flags & FIGNORECASE)
                        txtype |= TX_CI;
                zfs_log_link(zilog, tx, txtype, dzp, szp, name);
        }

        /* The tx is committed whether or not the link was created. */
        dmu_tx_commit(tx);

        zfs_dirent_unlock(dl);

        if (error == 0) {
                vnevent_link(svp, ct);
        }

        /* Honor sync=always by pushing the intent log out immediately. */
        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
                zil_commit(zilog, 0);

        ZFS_EXIT(zfsvfs);
        return (error);
}
4329
4330 #ifdef sun
4331 /*
4332  * zfs_null_putapage() is used when the file system has been force
4333  * unmounted. It just drops the pages.
4334  */
4335 /* ARGSUSED */
4336 static int
4337 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4338                 size_t *lenp, int flags, cred_t *cr)
4339 {
4340         pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
4341         return (0);
4342 }
4343
4344 /*
4345  * Push a page out to disk, klustering if possible.
4346  *
4347  *      IN:     vp      - file to push page to.
4348  *              pp      - page to push.
4349  *              flags   - additional flags.
4350  *              cr      - credentials of caller.
4351  *
4352  *      OUT:    offp    - start of range pushed.
4353  *              lenp    - len of range pushed.
4354  *
4355  *      RETURN: 0 if success
4356  *              error code if failure
4357  *
4358  * NOTE: callers must have locked the page to be pushed.  On
4359  * exit, the page (and all other pages in the kluster) must be
4360  * unlocked.
4361  */
4362 /* ARGSUSED */
4363 static int
4364 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4365                 size_t *lenp, int flags, cred_t *cr)
4366 {
4367         znode_t         *zp = VTOZ(vp);
4368         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
4369         dmu_tx_t        *tx;
4370         u_offset_t      off, koff;
4371         size_t          len, klen;
4372         int             err;
4373
4374         off = pp->p_offset;
4375         len = PAGESIZE;
4376         /*
4377          * If our blocksize is bigger than the page size, try to kluster
4378          * multiple pages so that we write a full block (thus avoiding
4379          * a read-modify-write).
4380          */
4381         if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4382                 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
4383                 koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4384                 ASSERT(koff <= zp->z_size);
4385                 if (koff + klen > zp->z_size)
4386                         klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4387                 pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4388         }
4389         ASSERT3U(btop(len), ==, btopr(len));
4390
4391         /*
4392          * Can't push pages past end-of-file.
4393          */
4394         if (off >= zp->z_size) {
4395                 /* ignore all pages */
4396                 err = 0;
4397                 goto out;
4398         } else if (off + len > zp->z_size) {
4399                 int npages = btopr(zp->z_size - off);
4400                 page_t *trunc;
4401
4402                 page_list_break(&pp, &trunc, npages);
4403                 /* ignore pages past end of file */
4404                 if (trunc)
4405                         pvn_write_done(trunc, flags);
4406                 len = zp->z_size - off;
4407         }
4408
4409         if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4410             zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4411                 err = EDQUOT;
4412                 goto out;
4413         }
4414 top:
4415         tx = dmu_tx_create(zfsvfs->z_os);
4416         dmu_tx_hold_write(tx, zp->z_id, off, len);
4417
4418         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4419         zfs_sa_upgrade_txholds(tx, zp);
4420         err = dmu_tx_assign(tx, TXG_NOWAIT);
4421         if (err != 0) {
4422                 if (err == ERESTART) {
4423                         dmu_tx_wait(tx);
4424                         dmu_tx_abort(tx);
4425                         goto top;
4426                 }
4427                 dmu_tx_abort(tx);
4428                 goto out;
4429         }
4430
4431         if (zp->z_blksz <= PAGESIZE) {
4432                 caddr_t va = zfs_map_page(pp, S_READ);
4433                 ASSERT3U(len, <=, PAGESIZE);
4434                 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4435                 zfs_unmap_page(pp, va);
4436         } else {
4437                 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4438         }
4439
4440         if (err == 0) {
4441                 uint64_t mtime[2], ctime[2];
4442                 sa_bulk_attr_t bulk[3];
4443                 int count = 0;
4444
4445                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4446                     &mtime, 16);
4447                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4448                     &ctime, 16);
4449                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4450                     &zp->z_pflags, 8);
4451                 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4452                     B_TRUE);
4453                 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4454         }
4455         dmu_tx_commit(tx);
4456
4457 out:
4458         pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4459         if (offp)
4460                 *offp = off;
4461         if (lenp)
4462                 *lenp = len;
4463
4464         return (err);
4465 }
4466
4467 /*
4468  * Copy the portion of the file indicated from pages into the file.
4469  * The pages are stored in a page list attached to the files vnode.
4470  *
4471  *      IN:     vp      - vnode of file to push page data to.
4472  *              off     - position in file to put data.
4473  *              len     - amount of data to write.
4474  *              flags   - flags to control the operation.
4475  *              cr      - credentials of caller.
4476  *              ct      - caller context.
4477  *
4478  *      RETURN: 0 if success
4479  *              error code if failure
4480  *
4481  * Timestamps:
4482  *      vp - ctime|mtime updated
4483  */
4484 /*ARGSUSED*/
4485 static int
4486 zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4487     caller_context_t *ct)
4488 {
4489         znode_t         *zp = VTOZ(vp);
4490         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
4491         page_t          *pp;
4492         size_t          io_len;
4493         u_offset_t      io_off;
4494         uint_t          blksz;
4495         rl_t            *rl;
4496         int             error = 0;
4497
4498         ZFS_ENTER(zfsvfs);
4499         ZFS_VERIFY_ZP(zp);
4500
4501         /*
4502          * Align this request to the file block size in case we kluster.
4503          * XXX - this can result in pretty aggresive locking, which can
4504          * impact simultanious read/write access.  One option might be
4505          * to break up long requests (len == 0) into block-by-block
4506          * operations to get narrower locking.
4507          */
4508         blksz = zp->z_blksz;
4509         if (ISP2(blksz))
4510                 io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
4511         else
4512                 io_off = 0;
4513         if (len > 0 && ISP2(blksz))
4514                 io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
4515         else
4516                 io_len = 0;
4517
4518         if (io_len == 0) {
4519                 /*
4520                  * Search the entire vp list for pages >= io_off.
4521                  */
4522                 rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
4523                 error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
4524                 goto out;
4525         }
4526         rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
4527
4528         if (off > zp->z_size) {
4529                 /* past end of file */
4530                 zfs_range_unlock(rl);
4531                 ZFS_EXIT(zfsvfs);
4532                 return (0);
4533         }
4534
4535         len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
4536
4537         for (off = io_off; io_off < off + len; io_off += io_len) {
4538                 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4539                         pp = page_lookup(vp, io_off,
4540                             (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
4541                 } else {
4542                         pp = page_lookup_nowait(vp, io_off,
4543                             (flags & B_FREE) ? SE_EXCL : SE_SHARED);
4544                 }
4545
4546                 if (pp != NULL && pvn_getdirty(pp, flags)) {
4547                         int err;
4548
4549                         /*
4550                          * Found a dirty page to push
4551                          */
4552                         err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
4553                         if (err)
4554                                 error = err;
4555                 } else {
4556                         io_len = PAGESIZE;
4557                 }
4558         }
4559 out:
4560         zfs_range_unlock(rl);
4561         if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4562                 zil_commit(zfsvfs->z_log, zp->z_id);
4563         ZFS_EXIT(zfsvfs);
4564         return (error);
4565 }
4566 #endif  /* sun */
4567
4568 /*ARGSUSED*/
4569 void
4570 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4571 {
4572         znode_t *zp = VTOZ(vp);
4573         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4574         int error;
4575
4576         rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4577         if (zp->z_sa_hdl == NULL) {
4578                 /*
4579                  * The fs has been unmounted, or we did a
4580                  * suspend/resume and this file no longer exists.
4581                  */
4582                 VI_LOCK(vp);
4583                 ASSERT(vp->v_count <= 1);
4584                 vp->v_count = 0;
4585                 VI_UNLOCK(vp);
4586                 vrecycle(vp, curthread);
4587                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4588                 return;
4589         }
4590
4591         if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4592                 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4593
4594                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4595                 zfs_sa_upgrade_txholds(tx, zp);
4596                 error = dmu_tx_assign(tx, TXG_WAIT);
4597                 if (error) {
4598                         dmu_tx_abort(tx);
4599                 } else {
4600                         mutex_enter(&zp->z_lock);
4601                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4602                             (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4603                         zp->z_atime_dirty = 0;
4604                         mutex_exit(&zp->z_lock);
4605                         dmu_tx_commit(tx);
4606                 }
4607         }
4608
4609         zfs_zinactive(zp);
4610         rw_exit(&zfsvfs->z_teardown_inactive_lock);
4611 }
4612
4613 #ifdef sun
4614 /*
4615  * Bounds-check the seek operation.
4616  *
4617  *      IN:     vp      - vnode seeking within
4618  *              ooff    - old file offset
4619  *              noffp   - pointer to new file offset
4620  *              ct      - caller context
4621  *
4622  *      RETURN: 0 if success
4623  *              EINVAL if new offset invalid
4624  */
4625 /* ARGSUSED */
4626 static int
4627 zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4628     caller_context_t *ct)
4629 {
4630         if (vp->v_type == VDIR)
4631                 return (0);
4632         return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4633 }
4634
4635 /*
4636  * Pre-filter the generic locking function to trap attempts to place
4637  * a mandatory lock on a memory mapped file.
4638  */
4639 static int
4640 zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4641     flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4642 {
4643         znode_t *zp = VTOZ(vp);
4644         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4645
4646         ZFS_ENTER(zfsvfs);
4647         ZFS_VERIFY_ZP(zp);
4648
4649         /*
4650          * We are following the UFS semantics with respect to mapcnt
4651          * here: If we see that the file is mapped already, then we will
4652          * return an error, but we don't worry about races between this
4653          * function and zfs_map().
4654          */
4655         if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4656                 ZFS_EXIT(zfsvfs);
4657                 return (EAGAIN);
4658         }
4659         ZFS_EXIT(zfsvfs);
4660         return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4661 }
4662
4663 /*
4664  * If we can't find a page in the cache, we will create a new page
4665  * and fill it with file data.  For efficiency, we may try to fill
4666  * multiple pages at once (klustering) to fill up the supplied page
4667  * list.  Note that the pages to be filled are held with an exclusive
4668  * lock to prevent access by other threads while they are being filled.
4669  */
4670 static int
4671 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
4672     caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
4673 {
4674         znode_t *zp = VTOZ(vp);
4675         page_t *pp, *cur_pp;
4676         objset_t *os = zp->z_zfsvfs->z_os;
4677         u_offset_t io_off, total;
4678         size_t io_len;
4679         int err;
4680
4681         if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
4682                 /*
4683                  * We only have a single page, don't bother klustering
4684                  */
4685                 io_off = off;
4686                 io_len = PAGESIZE;
4687                 pp = page_create_va(vp, io_off, io_len,
4688                     PG_EXCL | PG_WAIT, seg, addr);
4689         } else {
4690                 /*
4691                  * Try to find enough pages to fill the page list
4692                  */
4693                 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4694                     &io_len, off, plsz, 0);
4695         }
4696         if (pp == NULL) {
4697                 /*
4698                  * The page already exists, nothing to do here.
4699                  */
4700                 *pl = NULL;
4701                 return (0);
4702         }
4703
4704         /*
4705          * Fill the pages in the kluster.
4706          */
4707         cur_pp = pp;
4708         for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4709                 caddr_t va;
4710
4711                 ASSERT3U(io_off, ==, cur_pp->p_offset);
4712                 va = zfs_map_page(cur_pp, S_WRITE);
4713                 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4714                     DMU_READ_PREFETCH);
4715                 zfs_unmap_page(cur_pp, va);
4716                 if (err) {
4717                         /* On error, toss the entire kluster */
4718                         pvn_read_done(pp, B_ERROR);
4719                         /* convert checksum errors into IO errors */
4720                         if (err == ECKSUM)
4721                                 err = EIO;
4722                         return (err);
4723                 }
4724                 cur_pp = cur_pp->p_next;
4725         }
4726
4727         /*
4728          * Fill in the page list array from the kluster starting
4729          * from the desired offset `off'.
4730          * NOTE: the page list will always be null terminated.
4731          */
4732         pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4733         ASSERT(pl == NULL || (*pl)->p_offset == off);
4734
4735         return (0);
4736 }
4737
/*
 * Return pointers to the pages for the file region [off, off + len)
 * in the pl array.  If plsz is greater than len, this function may
 * also return page pointers from after the specified region
 * (i.e. the region [off, off + plsz)).  These additional pages are
 * only returned if they are already in the cache, or were created as
 * part of a klustered read.
 *
 *      IN:     vp      - vnode of file to get data from.
 *              off     - position in file to get data from.
 *              len     - amount of data to retrieve.
 *              plsz    - length of provided page list.
 *              seg     - segment to obtain pages for.
 *              addr    - virtual address of fault.
 *              rw      - mode of created pages.
 *              cr      - credentials of caller.
 *              ct      - caller context.
 *
 *      OUT:    protp   - protection mode of created pages
 *                        (set to PROT_ALL when non-NULL).
 *              pl      - list of pages created (NULL terminated).
 *
 *      RETURN: 0 if success
 *              error code if failure
 *
 * Timestamps:
 *      vp - atime updated
 */
/* ARGSUSED */
static int
zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
        page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
        enum seg_rw rw, cred_t *cr, caller_context_t *ct)
{
        znode_t         *zp = VTOZ(vp);
        zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
        /* remember the start of the list for the error unwind below */
        page_t          **pl0 = pl;
        int             err = 0;

        /* we do our own caching, faultahead is unnecessary */
        if (pl == NULL)
                return (0);
        else if (len > plsz)
                len = plsz;
        else
                len = P2ROUNDUP(len, PAGESIZE);
        ASSERT(plsz >= len);

        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);

        if (protp)
                *protp = PROT_ALL;

        /*
         * Loop through the requested range [off, off + len) looking
         * for pages.  If we don't find a page, we will need to create
         * a new page and fill it with data from the file.
         */
        while (len > 0) {
                /*
                 * A cached page is stored and the list terminated right
                 * after it.  Otherwise zfs_fillpage() populates pl; it
                 * stores NULL in *pl when it created no page, in which
                 * case the inner loop is skipped and the lookup retried.
                 */
                if (*pl = page_lookup(vp, off, SE_SHARED))
                        *(pl+1) = NULL;
                else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
                        goto out;
                /* Step past every page just added to the list. */
                while (*pl) {
                        ASSERT3U((*pl)->p_offset, ==, off);
                        off += PAGESIZE;
                        addr += PAGESIZE;
                        /* a kluster may extend past the requested len */
                        if (len > 0) {
                                ASSERT3U(len, >=, PAGESIZE);
                                len -= PAGESIZE;
                        }
                        ASSERT3U(plsz, >=, PAGESIZE);
                        plsz -= PAGESIZE;
                        pl++;
                }
        }

        /*
         * Fill out the page array with any pages already in the cache.
         * Note that pl is advanced even when the lookup returns NULL.
         */
        while (plsz > 0 &&
            (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
                        off += PAGESIZE;
                        plsz -= PAGESIZE;
        }
out:
        if (err) {
                /*
                 * Release any pages we have previously locked.
                 */
                while (pl > pl0)
                        page_unlock(*--pl);
        } else {
                ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
        }

        /*
         * Terminate the list: at its start after an error unwind,
         * otherwise at the current position.
         */
        *pl = NULL;

        ZFS_EXIT(zfsvfs);
        return (err);
}
4839
4840 /*
4841  * Request a memory map for a section of a file.  This code interacts
4842  * with common code and the VM system as follows:
4843  *
4844  *      common code calls mmap(), which ends up in smmap_common()
4845  *
4846  *      this calls VOP_MAP(), which takes you into (say) zfs
4847  *
4848  *      zfs_map() calls as_map(), passing segvn_create() as the callback
4849  *
4850  *      segvn_create() creates the new segment and calls VOP_ADDMAP()
4851  *
4852  *      zfs_addmap() updates z_mapcnt
4853  */
4854 /*ARGSUSED*/
4855 static int
4856 zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4857     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4858     caller_context_t *ct)
4859 {
4860         znode_t *zp = VTOZ(vp);
4861         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4862         segvn_crargs_t  vn_a;
4863         int             error;
4864
4865         ZFS_ENTER(zfsvfs);
4866         ZFS_VERIFY_ZP(zp);
4867
4868         if ((prot & PROT_WRITE) && (zp->z_pflags &
4869             (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4870                 ZFS_EXIT(zfsvfs);
4871                 return (EPERM);
4872         }
4873
4874         if ((prot & (PROT_READ | PROT_EXEC)) &&
4875             (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4876                 ZFS_EXIT(zfsvfs);
4877                 return (EACCES);
4878         }
4879
4880         if (vp->v_flag & VNOMAP) {
4881                 ZFS_EXIT(zfsvfs);
4882                 return (ENOSYS);
4883         }
4884
4885         if (off < 0 || len > MAXOFFSET_T - off) {
4886                 ZFS_EXIT(zfsvfs);
4887                 return (ENXIO);
4888         }
4889
4890         if (vp->v_type != VREG) {
4891                 ZFS_EXIT(zfsvfs);
4892                 return (ENODEV);
4893         }
4894
4895         /*
4896          * If file is locked, disallow mapping.
4897          */
4898         if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
4899                 ZFS_EXIT(zfsvfs);
4900                 return (EAGAIN);
4901         }
4902
4903         as_rangelock(as);
4904         error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4905         if (error != 0) {
4906                 as_rangeunlock(as);
4907                 ZFS_EXIT(zfsvfs);
4908                 return (error);
4909         }
4910
4911         vn_a.vp = vp;
4912         vn_a.offset = (u_offset_t)off;
4913         vn_a.type = flags & MAP_TYPE;
4914         vn_a.prot = prot;
4915         vn_a.maxprot = maxprot;
4916         vn_a.cred = cr;
4917         vn_a.amp = NULL;
4918         vn_a.flags = flags & ~MAP_TYPE;
4919         vn_a.szc = 0;
4920         vn_a.lgrp_mem_policy_flags = 0;
4921
4922         error = as_map(as, *addrp, len, segvn_create, &vn_a);
4923
4924         as_rangeunlock(as);
4925         ZFS_EXIT(zfsvfs);
4926         return (error);
4927 }
4928
4929 /* ARGSUSED */
4930 static int
4931 zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4932     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4933     caller_context_t *ct)
4934 {
4935         uint64_t pages = btopr(len);
4936
4937         atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
4938         return (0);
4939 }
4940
4941 /*
4942  * The reason we push dirty pages as part of zfs_delmap() is so that we get a
4943  * more accurate mtime for the associated file.  Since we don't have a way of
4944  * detecting when the data was actually modified, we have to resort to
4945  * heuristics.  If an explicit msync() is done, then we mark the mtime when the
4946  * last page is pushed.  The problem occurs when the msync() call is omitted,
4947  * which by far the most common case:
4948  *
4949  *      open()
4950  *      mmap()
4951  *      <modify memory>
4952  *      munmap()
4953  *      close()
4954  *      <time lapse>
4955  *      putpage() via fsflush
4956  *
4957  * If we wait until fsflush to come along, we can have a modification time that
4958  * is some arbitrary point in the future.  In order to prevent this in the
4959  * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
4960  * torn down.
4961  */
4962 /* ARGSUSED */
4963 static int
4964 zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4965     size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4966     caller_context_t *ct)
4967 {
4968         uint64_t pages = btopr(len);
4969
4970         ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
4971         atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
4972
4973         if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
4974             vn_has_cached_data(vp))
4975                 (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
4976
4977         return (0);
4978 }
4979
4980 /*
4981  * Free or allocate space in a file.  Currently, this function only
4982  * supports the `F_FREESP' command.  However, this command is somewhat
4983  * misnamed, as its functionality includes the ability to allocate as
4984  * well as free space.
4985  *
4986  *      IN:     vp      - vnode of file to free data in.
4987  *              cmd     - action to take (only F_FREESP supported).
4988  *              bfp     - section of file to free/alloc.
4989  *              flag    - current file open mode flags.
4990  *              offset  - current file offset.
4991  *              cr      - credentials of caller [UNUSED].
4992  *              ct      - caller context.
4993  *
4994  *      RETURN: 0 if success
4995  *              error code if failure
4996  *
4997  * Timestamps:
4998  *      vp - ctime|mtime updated
4999  */
5000 /* ARGSUSED */
5001 static int
5002 zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
5003     offset_t offset, cred_t *cr, caller_context_t *ct)
5004 {
5005         znode_t         *zp = VTOZ(vp);
5006         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
5007         uint64_t        off, len;
5008         int             error;
5009
5010         ZFS_ENTER(zfsvfs);
5011         ZFS_VERIFY_ZP(zp);
5012
5013         if (cmd != F_FREESP) {
5014                 ZFS_EXIT(zfsvfs);
5015                 return (EINVAL);
5016         }
5017
5018         if (error = convoff(vp, bfp, 0, offset)) {
5019                 ZFS_EXIT(zfsvfs);
5020                 return (error);
5021         }
5022
5023         if (bfp->l_len < 0) {
5024                 ZFS_EXIT(zfsvfs);
5025                 return (EINVAL);
5026         }
5027
5028         off = bfp->l_start;
5029         len = bfp->l_len; /* 0 means from off to end of file */
5030
5031         error = zfs_freesp(zp, off, len, flag, TRUE);
5032
5033         ZFS_EXIT(zfsvfs);
5034         return (error);
5035 }
5036 #endif  /* sun */
5037
5038 CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
5039 CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
5040
5041 /*ARGSUSED*/
5042 static int
5043 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
5044 {
5045         znode_t         *zp = VTOZ(vp);
5046         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
5047         uint32_t        gen;
5048         uint64_t        gen64;
5049         uint64_t        object = zp->z_id;
5050         zfid_short_t    *zfid;
5051         int             size, i, error;
5052
5053         ZFS_ENTER(zfsvfs);
5054         ZFS_VERIFY_ZP(zp);
5055
5056         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
5057             &gen64, sizeof (uint64_t))) != 0) {
5058                 ZFS_EXIT(zfsvfs);
5059                 return (error);
5060         }
5061
5062         gen = (uint32_t)gen64;
5063
5064         size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
5065         fidp->fid_len = size;
5066
5067         zfid = (zfid_short_t *)fidp;
5068
5069         zfid->zf_len = size;
5070
5071         for (i = 0; i < sizeof (zfid->zf_object); i++)
5072                 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
5073
5074         /* Must have a non-zero generation number to distinguish from .zfs */
5075         if (gen == 0)
5076                 gen = 1;
5077         for (i = 0; i < sizeof (zfid->zf_gen); i++)
5078                 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
5079
5080         if (size == LONG_FID_LEN) {
5081                 uint64_t        objsetid = dmu_objset_id(zfsvfs->z_os);
5082                 zfid_long_t     *zlfid;
5083
5084                 zlfid = (zfid_long_t *)fidp;
5085
5086                 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
5087                         zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
5088
5089                 /* XXX - this should be the generation number for the objset */
5090                 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
5091                         zlfid->zf_setgen[i] = 0;
5092         }
5093
5094         ZFS_EXIT(zfsvfs);
5095         return (0);
5096 }
5097
5098 static int
5099 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
5100     caller_context_t *ct)
5101 {
5102         znode_t         *zp, *xzp;
5103         zfsvfs_t        *zfsvfs;
5104         zfs_dirlock_t   *dl;
5105         int             error;
5106
5107         switch (cmd) {
5108         case _PC_LINK_MAX:
5109                 *valp = INT_MAX;
5110                 return (0);
5111
5112         case _PC_FILESIZEBITS:
5113                 *valp = 64;
5114                 return (0);
5115 #ifdef sun
5116         case _PC_XATTR_EXISTS:
5117                 zp = VTOZ(vp);
5118                 zfsvfs = zp->z_zfsvfs;
5119                 ZFS_ENTER(zfsvfs);
5120                 ZFS_VERIFY_ZP(zp);
5121                 *valp = 0;
5122                 error = zfs_dirent_lock(&dl, zp, "", &xzp,
5123                     ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
5124                 if (error == 0) {
5125                         zfs_dirent_unlock(dl);
5126                         if (!zfs_dirempty(xzp))
5127                                 *valp = 1;
5128                         VN_RELE(ZTOV(xzp));
5129                 } else if (error == ENOENT) {
5130                         /*
5131                          * If there aren't extended attributes, it's the
5132                          * same as having zero of them.
5133                          */
5134                         error = 0;
5135                 }
5136                 ZFS_EXIT(zfsvfs);
5137                 return (error);
5138
5139         case _PC_SATTR_ENABLED:
5140         case _PC_SATTR_EXISTS:
5141                 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
5142                     (vp->v_type == VREG || vp->v_type == VDIR);
5143                 return (0);
5144
5145         case _PC_ACCESS_FILTERING:
5146                 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
5147                     vp->v_type == VDIR;
5148                 return (0);
5149
5150         case _PC_ACL_ENABLED:
5151                 *valp = _ACL_ACE_ENABLED;
5152                 return (0);
5153 #endif  /* sun */
5154         case _PC_MIN_HOLE_SIZE:
5155                 *valp = (int)SPA_MINBLOCKSIZE;
5156                 return (0);
5157 #ifdef sun
5158         case _PC_TIMESTAMP_RESOLUTION:
5159                 /* nanosecond timestamp resolution */
5160                 *valp = 1L;
5161                 return (0);
5162 #endif  /* sun */
5163         case _PC_ACL_EXTENDED:
5164                 *valp = 0;
5165                 return (0);
5166
5167         case _PC_ACL_NFS4:
5168                 *valp = 1;
5169                 return (0);
5170
5171         case _PC_ACL_PATH_MAX:
5172                 *valp = ACL_MAX_ENTRIES;
5173                 return (0);
5174
5175         default:
5176                 return (EOPNOTSUPP);
5177         }
5178 }
5179
5180 /*ARGSUSED*/
5181 static int
5182 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5183     caller_context_t *ct)
5184 {
5185         znode_t *zp = VTOZ(vp);
5186         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5187         int error;
5188         boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5189
5190         ZFS_ENTER(zfsvfs);
5191         ZFS_VERIFY_ZP(zp);
5192         error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5193         ZFS_EXIT(zfsvfs);
5194
5195         return (error);
5196 }
5197
5198 /*ARGSUSED*/
5199 static int
5200 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5201     caller_context_t *ct)
5202 {
5203         znode_t *zp = VTOZ(vp);
5204         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5205         int error;
5206         boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5207         zilog_t *zilog = zfsvfs->z_log;
5208
5209         ZFS_ENTER(zfsvfs);
5210         ZFS_VERIFY_ZP(zp);
5211
5212         error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5213
5214         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5215                 zil_commit(zilog, 0);
5216
5217         ZFS_EXIT(zfsvfs);
5218         return (error);
5219 }
5220
5221 #ifdef sun
/*
 * Tunables consulted by the UIO_READ path of zfs_reqzcbuf().  Both must
 * be powers of 2.
 *
 * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
 * zcr_blksz_max: if set to less than the file block size, allow loaning
 *		  out of an arcbuf for a partial block read
 */
int zcr_blksz_min = 1024;		/* 1K */
int zcr_blksz_max = 128 * 1024;		/* 128K */
5231
5232 /*ARGSUSED*/
5233 static int
5234 zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
5235     caller_context_t *ct)
5236 {
5237         znode_t *zp = VTOZ(vp);
5238         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5239         int max_blksz = zfsvfs->z_max_blksz;
5240         uio_t *uio = &xuio->xu_uio;
5241         ssize_t size = uio->uio_resid;
5242         offset_t offset = uio->uio_loffset;
5243         int blksz;
5244         int fullblk, i;
5245         arc_buf_t *abuf;
5246         ssize_t maxsize;
5247         int preamble, postamble;
5248
5249         if (xuio->xu_type != UIOTYPE_ZEROCOPY)
5250                 return (EINVAL);
5251
5252         ZFS_ENTER(zfsvfs);
5253         ZFS_VERIFY_ZP(zp);
5254         switch (ioflag) {
5255         case UIO_WRITE:
5256                 /*
5257                  * Loan out an arc_buf for write if write size is bigger than
5258                  * max_blksz, and the file's block size is also max_blksz.
5259                  */
5260                 blksz = max_blksz;
5261                 if (size < blksz || zp->z_blksz != blksz) {
5262                         ZFS_EXIT(zfsvfs);
5263                         return (EINVAL);
5264                 }
5265                 /*
5266                  * Caller requests buffers for write before knowing where the
5267                  * write offset might be (e.g. NFS TCP write).
5268                  */
5269                 if (offset == -1) {
5270                         preamble = 0;
5271                 } else {
5272                         preamble = P2PHASE(offset, blksz);
5273                         if (preamble) {
5274                                 preamble = blksz - preamble;
5275                                 size -= preamble;
5276                         }
5277                 }
5278
5279                 postamble = P2PHASE(size, blksz);
5280                 size -= postamble;
5281
5282                 fullblk = size / blksz;
5283                 (void) dmu_xuio_init(xuio,
5284                     (preamble != 0) + fullblk + (postamble != 0));
5285                 DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
5286                     int, postamble, int,
5287                     (preamble != 0) + fullblk + (postamble != 0));
5288
5289                 /*
5290                  * Have to fix iov base/len for partial buffers.  They
5291                  * currently represent full arc_buf's.
5292                  */
5293                 if (preamble) {
5294                         /* data begins in the middle of the arc_buf */
5295                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5296                             blksz);
5297                         ASSERT(abuf);
5298                         (void) dmu_xuio_add(xuio, abuf,
5299                             blksz - preamble, preamble);
5300                 }
5301
5302                 for (i = 0; i < fullblk; i++) {
5303                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5304                             blksz);
5305                         ASSERT(abuf);
5306                         (void) dmu_xuio_add(xuio, abuf, 0, blksz);
5307                 }
5308
5309                 if (postamble) {
5310                         /* data ends in the middle of the arc_buf */
5311                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5312                             blksz);
5313                         ASSERT(abuf);
5314                         (void) dmu_xuio_add(xuio, abuf, 0, postamble);
5315                 }
5316                 break;
5317         case UIO_READ:
5318                 /*
5319                  * Loan out an arc_buf for read if the read size is larger than
5320                  * the current file block size.  Block alignment is not
5321                  * considered.  Partial arc_buf will be loaned out for read.
5322                  */
5323                 blksz = zp->z_blksz;
5324                 if (blksz < zcr_blksz_min)
5325                         blksz = zcr_blksz_min;
5326                 if (blksz > zcr_blksz_max)
5327                         blksz = zcr_blksz_max;
5328                 /* avoid potential complexity of dealing with it */
5329                 if (blksz > max_blksz) {
5330                         ZFS_EXIT(zfsvfs);
5331                         return (EINVAL);
5332                 }
5333
5334                 maxsize = zp->z_size - uio->uio_loffset;
5335                 if (size > maxsize)
5336                         size = maxsize;
5337
5338                 if (size < blksz || vn_has_cached_data(vp)) {
5339                         ZFS_EXIT(zfsvfs);
5340                         return (EINVAL);
5341                 }
5342                 break;
5343         default:
5344                 ZFS_EXIT(zfsvfs);
5345                 return (EINVAL);
5346         }
5347
5348         uio->uio_extflg = UIO_XUIO;
5349         XUIO_XUZC_RW(xuio) = ioflag;
5350         ZFS_EXIT(zfsvfs);
5351         return (0);
5352 }
5353
5354 /*ARGSUSED*/
5355 static int
5356 zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5357 {
5358         int i;
5359         arc_buf_t *abuf;
5360         int ioflag = XUIO_XUZC_RW(xuio);
5361
5362         ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5363
5364         i = dmu_xuio_cnt(xuio);
5365         while (i-- > 0) {
5366                 abuf = dmu_xuio_arcbuf(xuio, i);
5367                 /*
5368                  * if abuf == NULL, it must be a write buffer
5369                  * that has been returned in zfs_write().
5370                  */
5371                 if (abuf)
5372                         dmu_return_arcbuf(abuf);
5373                 ASSERT(abuf || ioflag == UIO_WRITE);
5374         }
5375
5376         dmu_xuio_fini(xuio);
5377         return (0);
5378 }
5379
5380 /*
5381  * Predeclare these here so that the compiler assumes that
5382  * this is an "old style" function declaration that does
5383  * not include arguments => we won't get type mismatch errors
5384  * in the initializations that follow.
5385  */
5386 static int zfs_inval();
5387 static int zfs_isdir();
5388
5389 static int
5390 zfs_inval()
5391 {
5392         return (EINVAL);
5393 }
5394
5395 static int
5396 zfs_isdir()
5397 {
5398         return (EISDIR);
5399 }
5400 /*
5401  * Directory vnode operations template
5402  */
5403 vnodeops_t *zfs_dvnodeops;
5404 const fs_operation_def_t zfs_dvnodeops_template[] = {
5405         VOPNAME_OPEN,           { .vop_open = zfs_open },
5406         VOPNAME_CLOSE,          { .vop_close = zfs_close },
5407         VOPNAME_READ,           { .error = zfs_isdir },
5408         VOPNAME_WRITE,          { .error = zfs_isdir },
5409         VOPNAME_IOCTL,          { .vop_ioctl = zfs_ioctl },
5410         VOPNAME_GETATTR,        { .vop_getattr = zfs_getattr },
5411         VOPNAME_SETATTR,        { .vop_setattr = zfs_setattr },
5412         VOPNAME_ACCESS,         { .vop_access = zfs_access },
5413         VOPNAME_LOOKUP,         { .vop_lookup = zfs_lookup },
5414         VOPNAME_CREATE,         { .vop_create = zfs_create },
5415         VOPNAME_REMOVE,         { .vop_remove = zfs_remove },
5416         VOPNAME_LINK,           { .vop_link = zfs_link },
5417         VOPNAME_RENAME,         { .vop_rename = zfs_rename },
5418         VOPNAME_MKDIR,          { .vop_mkdir = zfs_mkdir },
5419         VOPNAME_RMDIR,          { .vop_rmdir = zfs_rmdir },
5420         VOPNAME_READDIR,        { .vop_readdir = zfs_readdir },
5421         VOPNAME_SYMLINK,        { .vop_symlink = zfs_symlink },
5422         VOPNAME_FSYNC,          { .vop_fsync = zfs_fsync },
5423         VOPNAME_INACTIVE,       { .vop_inactive = zfs_inactive },
5424         VOPNAME_FID,            { .vop_fid = zfs_fid },
5425         VOPNAME_SEEK,           { .vop_seek = zfs_seek },
5426         VOPNAME_PATHCONF,       { .vop_pathconf = zfs_pathconf },
5427         VOPNAME_GETSECATTR,     { .vop_getsecattr = zfs_getsecattr },
5428         VOPNAME_SETSECATTR,     { .vop_setsecattr = zfs_setsecattr },
5429         VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
5430         NULL,                   NULL
5431 };
5432
5433 /*
5434  * Regular file vnode operations template
5435  */
5436 vnodeops_t *zfs_fvnodeops;
5437 const fs_operation_def_t zfs_fvnodeops_template[] = {
5438         VOPNAME_OPEN,           { .vop_open = zfs_open },
5439         VOPNAME_CLOSE,          { .vop_close = zfs_close },
5440         VOPNAME_READ,           { .vop_read = zfs_read },
5441         VOPNAME_WRITE,          { .vop_write = zfs_write },
5442         VOPNAME_IOCTL,          { .vop_ioctl = zfs_ioctl },
5443         VOPNAME_GETATTR,        { .vop_getattr = zfs_getattr },
5444         VOPNAME_SETATTR,        { .vop_setattr = zfs_setattr },
5445         VOPNAME_ACCESS,         { .vop_access = zfs_access },
5446         VOPNAME_LOOKUP,         { .vop_lookup = zfs_lookup },
5447         VOPNAME_RENAME,         { .vop_rename = zfs_rename },
5448         VOPNAME_FSYNC,          { .vop_fsync = zfs_fsync },
5449         VOPNAME_INACTIVE,       { .vop_inactive = zfs_inactive },
5450         VOPNAME_FID,            { .vop_fid = zfs_fid },
5451         VOPNAME_SEEK,           { .vop_seek = zfs_seek },
5452         VOPNAME_FRLOCK,         { .vop_frlock = zfs_frlock },
5453         VOPNAME_SPACE,          { .vop_space = zfs_space },
5454         VOPNAME_GETPAGE,        { .vop_getpage = zfs_getpage },
5455         VOPNAME_PUTPAGE,        { .vop_putpage = zfs_putpage },
5456         VOPNAME_MAP,            { .vop_map = zfs_map },
5457         VOPNAME_ADDMAP,         { .vop_addmap = zfs_addmap },
5458         VOPNAME_DELMAP,         { .vop_delmap = zfs_delmap },
5459         VOPNAME_PATHCONF,       { .vop_pathconf = zfs_pathconf },
5460         VOPNAME_GETSECATTR,     { .vop_getsecattr = zfs_getsecattr },
5461         VOPNAME_SETSECATTR,     { .vop_setsecattr = zfs_setsecattr },
5462         VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
5463         VOPNAME_REQZCBUF,       { .vop_reqzcbuf = zfs_reqzcbuf },
5464         VOPNAME_RETZCBUF,       { .vop_retzcbuf = zfs_retzcbuf },
5465         NULL,                   NULL
5466 };
5467
5468 /*
5469  * Symbolic link vnode operations template
5470  */
5471 vnodeops_t *zfs_symvnodeops;
5472 const fs_operation_def_t zfs_symvnodeops_template[] = {
5473         VOPNAME_GETATTR,        { .vop_getattr = zfs_getattr },
5474         VOPNAME_SETATTR,        { .vop_setattr = zfs_setattr },
5475         VOPNAME_ACCESS,         { .vop_access = zfs_access },
5476         VOPNAME_RENAME,         { .vop_rename = zfs_rename },
5477         VOPNAME_READLINK,       { .vop_readlink = zfs_readlink },
5478         VOPNAME_INACTIVE,       { .vop_inactive = zfs_inactive },
5479         VOPNAME_FID,            { .vop_fid = zfs_fid },
5480         VOPNAME_PATHCONF,       { .vop_pathconf = zfs_pathconf },
5481         VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
5482         NULL,                   NULL
5483 };
5484
5485 /*
5486  * special share hidden files vnode operations template
5487  */
5488 vnodeops_t *zfs_sharevnodeops;
5489 const fs_operation_def_t zfs_sharevnodeops_template[] = {
5490         VOPNAME_GETATTR,        { .vop_getattr = zfs_getattr },
5491         VOPNAME_ACCESS,         { .vop_access = zfs_access },
5492         VOPNAME_INACTIVE,       { .vop_inactive = zfs_inactive },
5493         VOPNAME_FID,            { .vop_fid = zfs_fid },
5494         VOPNAME_PATHCONF,       { .vop_pathconf = zfs_pathconf },
5495         VOPNAME_GETSECATTR,     { .vop_getsecattr = zfs_getsecattr },
5496         VOPNAME_SETSECATTR,     { .vop_setsecattr = zfs_setsecattr },
5497         VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
5498         NULL,                   NULL
5499 };
5500
5501 /*
5502  * Extended attribute directory vnode operations template
5503  *      This template is identical to the directory vnodes
5504  *      operation template except for restricted operations:
5505  *              VOP_MKDIR()
5506  *              VOP_SYMLINK()
5507  * Note that there are other restrictions embedded in:
5508  *      zfs_create()    - restrict type to VREG
5509  *      zfs_link()      - no links into/out of attribute space
5510  *      zfs_rename()    - no moves into/out of attribute space
5511  */
5512 vnodeops_t *zfs_xdvnodeops;
5513 const fs_operation_def_t zfs_xdvnodeops_template[] = {
5514         VOPNAME_OPEN,           { .vop_open = zfs_open },
5515         VOPNAME_CLOSE,          { .vop_close = zfs_close },
5516         VOPNAME_IOCTL,          { .vop_ioctl = zfs_ioctl },
5517         VOPNAME_GETATTR,        { .vop_getattr = zfs_getattr },
5518         VOPNAME_SETATTR,        { .vop_setattr = zfs_setattr },
5519         VOPNAME_ACCESS,         { .vop_access = zfs_access },
5520         VOPNAME_LOOKUP,         { .vop_lookup = zfs_lookup },
5521         VOPNAME_CREATE,         { .vop_create = zfs_create },
5522         VOPNAME_REMOVE,         { .vop_remove = zfs_remove },
5523         VOPNAME_LINK,           { .vop_link = zfs_link },
5524         VOPNAME_RENAME,         { .vop_rename = zfs_rename },
5525         VOPNAME_MKDIR,          { .error = zfs_inval },
5526         VOPNAME_RMDIR,          { .vop_rmdir = zfs_rmdir },
5527         VOPNAME_READDIR,        { .vop_readdir = zfs_readdir },
5528         VOPNAME_SYMLINK,        { .error = zfs_inval },
5529         VOPNAME_FSYNC,          { .vop_fsync = zfs_fsync },
5530         VOPNAME_INACTIVE,       { .vop_inactive = zfs_inactive },
5531         VOPNAME_FID,            { .vop_fid = zfs_fid },
5532         VOPNAME_SEEK,           { .vop_seek = zfs_seek },
5533         VOPNAME_PATHCONF,       { .vop_pathconf = zfs_pathconf },
5534         VOPNAME_GETSECATTR,     { .vop_getsecattr = zfs_getsecattr },
5535         VOPNAME_SETSECATTR,     { .vop_setsecattr = zfs_setsecattr },
5536         VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
5537         NULL,                   NULL
5538 };
5539
5540 /*
5541  * Error vnode operations template
5542  */
5543 vnodeops_t *zfs_evnodeops;
5544 const fs_operation_def_t zfs_evnodeops_template[] = {
5545         VOPNAME_INACTIVE,       { .vop_inactive = zfs_inactive },
5546         VOPNAME_PATHCONF,       { .vop_pathconf = zfs_pathconf },
5547         NULL,                   NULL
5548 };
5549 #endif  /* sun */
5550
5551 static int
5552 ioflags(int ioflags)
5553 {
5554         int flags = 0;
5555
5556         if (ioflags & IO_APPEND)
5557                 flags |= FAPPEND;
5558         if (ioflags & IO_NDELAY)
5559                 flags |= FNONBLOCK;
5560         if (ioflags & IO_SYNC)
5561                 flags |= (FSYNC | FDSYNC | FRSYNC);
5562
5563         return (flags);
5564 }
5565
5566 static int
5567 zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
5568 {
5569         znode_t *zp = VTOZ(vp);
5570         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5571         objset_t *os = zp->z_zfsvfs->z_os;
5572         vm_page_t mreq;
5573         vm_object_t object;
5574         caddr_t va;
5575         struct sf_buf *sf;
5576         int i, error;
5577         int pcount, size;
5578
5579         ZFS_ENTER(zfsvfs);
5580         ZFS_VERIFY_ZP(zp);
5581
5582         pcount = round_page(count) / PAGE_SIZE;
5583         mreq = m[reqpage];
5584         object = mreq->object;
5585         error = 0;
5586
5587         KASSERT(vp->v_object == object, ("mismatching object"));
5588
5589         VM_OBJECT_LOCK(object);
5590         vm_page_lock_queues();
5591         for (i = 0; i < pcount; i++) {
5592                 if (i != reqpage) {
5593                         vm_page_free(m[i]);
5594                 }
5595         }
5596         vm_page_unlock_queues();
5597
5598         if (mreq->valid) {
5599                 if (mreq->valid != VM_PAGE_BITS_ALL)
5600                         vm_page_zero_invalid(mreq, TRUE);
5601                 VM_OBJECT_UNLOCK(object);
5602                 ZFS_EXIT(zfsvfs);
5603                 return (VM_PAGER_OK);
5604         }
5605
5606         PCPU_INC(cnt.v_vnodein);
5607         PCPU_INC(cnt.v_vnodepgsin);
5608
5609         if (IDX_TO_OFF(mreq->pindex) >= object->un_pager.vnp.vnp_size) {
5610                 VM_OBJECT_UNLOCK(object);
5611                 ZFS_EXIT(zfsvfs);
5612                 return (VM_PAGER_BAD);
5613         }
5614
5615         size = PAGE_SIZE;
5616         if (IDX_TO_OFF(mreq->pindex) + size > object->un_pager.vnp.vnp_size)
5617                 size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mreq->pindex);
5618
5619         VM_OBJECT_UNLOCK(object);
5620         va = zfs_map_page(mreq, &sf);
5621         error = dmu_read(os, zp->z_id, IDX_TO_OFF(mreq->pindex),
5622             size, va, DMU_READ_PREFETCH);
5623         if (size != PAGE_SIZE)
5624                 bzero(va + size, PAGE_SIZE - size);
5625         zfs_unmap_page(sf);
5626         VM_OBJECT_LOCK(object);
5627
5628         if (!error)
5629                 mreq->valid = VM_PAGE_BITS_ALL;
5630         KASSERT(mreq->dirty == 0, ("zfs_getpages: page %p is dirty", mreq));
5631
5632         VM_OBJECT_UNLOCK(object);
5633
5634         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
5635         ZFS_EXIT(zfsvfs);
5636         return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
5637 }
5638
5639 static int
5640 zfs_freebsd_getpages(ap)
5641         struct vop_getpages_args /* {
5642                 struct vnode *a_vp;
5643                 vm_page_t *a_m;
5644                 int a_count;
5645                 int a_reqpage;
5646                 vm_ooffset_t a_offset;
5647         } */ *ap;
5648 {
5649
5650         return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage));
5651 }
5652
5653 static int
5654 zfs_freebsd_open(ap)
5655         struct vop_open_args /* {
5656                 struct vnode *a_vp;
5657                 int a_mode;
5658                 struct ucred *a_cred;
5659                 struct thread *a_td;
5660         } */ *ap;
5661 {
5662         vnode_t *vp = ap->a_vp;
5663         znode_t *zp = VTOZ(vp);
5664         int error;
5665
5666         error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
5667         if (error == 0)
5668                 vnode_create_vobject(vp, zp->z_size, ap->a_td);
5669         return (error);
5670 }
5671
5672 static int
5673 zfs_freebsd_close(ap)
5674         struct vop_close_args /* {
5675                 struct vnode *a_vp;
5676                 int  a_fflag;
5677                 struct ucred *a_cred;
5678                 struct thread *a_td;
5679         } */ *ap;
5680 {
5681
5682         return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
5683 }
5684
5685 static int
5686 zfs_freebsd_ioctl(ap)
5687         struct vop_ioctl_args /* {
5688                 struct vnode *a_vp;
5689                 u_long a_command;
5690                 caddr_t a_data;
5691                 int a_fflag;
5692                 struct ucred *cred;
5693                 struct thread *td;
5694         } */ *ap;
5695 {
5696
5697         return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
5698             ap->a_fflag, ap->a_cred, NULL, NULL));
5699 }
5700
5701 static int
5702 zfs_freebsd_read(ap)
5703         struct vop_read_args /* {
5704                 struct vnode *a_vp;
5705                 struct uio *a_uio;
5706                 int a_ioflag;
5707                 struct ucred *a_cred;
5708         } */ *ap;
5709 {
5710
5711         return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
5712             ap->a_cred, NULL));
5713 }
5714
5715 static int
5716 zfs_freebsd_write(ap)
5717         struct vop_write_args /* {
5718                 struct vnode *a_vp;
5719                 struct uio *a_uio;
5720                 int a_ioflag;
5721                 struct ucred *a_cred;
5722         } */ *ap;
5723 {
5724
5725         if (vn_rlimit_fsize(ap->a_vp, ap->a_uio, ap->a_uio->uio_td))
5726                 return (EFBIG);
5727
5728         return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
5729             ap->a_cred, NULL));
5730 }
5731
5732 static int
5733 zfs_freebsd_access(ap)
5734         struct vop_access_args /* {
5735                 struct vnode *a_vp;
5736                 accmode_t a_accmode;
5737                 struct ucred *a_cred;
5738                 struct thread *a_td;
5739         } */ *ap;
5740 {
5741         accmode_t accmode;
5742         int error = 0;
5743
5744         /*
5745          * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
5746          */
5747         accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
5748         if (accmode != 0)
5749                 error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
5750
5751         /*
5752          * VADMIN has to be handled by vaccess().
5753          */
5754         if (error == 0) {
5755                 accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
5756                 if (accmode != 0) {
5757                         vnode_t *vp = ap->a_vp;
5758                         znode_t *zp = VTOZ(vp);
5759
5760                         error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
5761                             zp->z_gid, accmode, ap->a_cred, NULL);
5762                 }
5763         }
5764
5765         return (error);
5766 }
5767
5768 static int
5769 zfs_freebsd_lookup(ap)
5770         struct vop_lookup_args /* {
5771                 struct vnode *a_dvp;
5772                 struct vnode **a_vpp;
5773                 struct componentname *a_cnp;
5774         } */ *ap;
5775 {
5776         struct componentname *cnp = ap->a_cnp;
5777         char nm[NAME_MAX + 1];
5778
5779         ASSERT(cnp->cn_namelen < sizeof(nm));
5780         strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
5781
5782         return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
5783             cnp->cn_cred, cnp->cn_thread, 0));
5784 }
5785
5786 static int
5787 zfs_freebsd_create(ap)
5788         struct vop_create_args /* {
5789                 struct vnode *a_dvp;
5790                 struct vnode **a_vpp;
5791                 struct componentname *a_cnp;
5792                 struct vattr *a_vap;
5793         } */ *ap;
5794 {
5795         struct componentname *cnp = ap->a_cnp;
5796         vattr_t *vap = ap->a_vap;
5797         int mode;
5798
5799         ASSERT(cnp->cn_flags & SAVENAME);
5800
5801         vattr_init_mask(vap);
5802         mode = vap->va_mode & ALLPERMS;
5803
5804         return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
5805             ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
5806 }
5807
5808 static int
5809 zfs_freebsd_remove(ap)
5810         struct vop_remove_args /* {
5811                 struct vnode *a_dvp;
5812                 struct vnode *a_vp;
5813                 struct componentname *a_cnp;
5814         } */ *ap;
5815 {
5816
5817         ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5818
5819         return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
5820             ap->a_cnp->cn_cred, NULL, 0));
5821 }
5822
5823 static int
5824 zfs_freebsd_mkdir(ap)
5825         struct vop_mkdir_args /* {
5826                 struct vnode *a_dvp;
5827                 struct vnode **a_vpp;
5828                 struct componentname *a_cnp;
5829                 struct vattr *a_vap;
5830         } */ *ap;
5831 {
5832         vattr_t *vap = ap->a_vap;
5833
5834         ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5835
5836         vattr_init_mask(vap);
5837
5838         return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
5839             ap->a_cnp->cn_cred, NULL, 0, NULL));
5840 }
5841
5842 static int
5843 zfs_freebsd_rmdir(ap)
5844         struct vop_rmdir_args /* {
5845                 struct vnode *a_dvp;
5846                 struct vnode *a_vp;
5847                 struct componentname *a_cnp;
5848         } */ *ap;
5849 {
5850         struct componentname *cnp = ap->a_cnp;
5851
5852         ASSERT(cnp->cn_flags & SAVENAME);
5853
5854         return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
5855 }
5856
5857 static int
5858 zfs_freebsd_readdir(ap)
5859         struct vop_readdir_args /* {
5860                 struct vnode *a_vp;
5861                 struct uio *a_uio;
5862                 struct ucred *a_cred;
5863                 int *a_eofflag;
5864                 int *a_ncookies;
5865                 u_long **a_cookies;
5866         } */ *ap;
5867 {
5868
5869         return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5870             ap->a_ncookies, ap->a_cookies));
5871 }
5872
5873 static int
5874 zfs_freebsd_fsync(ap)
5875         struct vop_fsync_args /* {
5876                 struct vnode *a_vp;
5877                 int a_waitfor;
5878                 struct thread *a_td;
5879         } */ *ap;
5880 {
5881
5882         vop_stdfsync(ap);
5883         return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
5884 }
5885
5886 static int
5887 zfs_freebsd_getattr(ap)
5888         struct vop_getattr_args /* {
5889                 struct vnode *a_vp;
5890                 struct vattr *a_vap;
5891                 struct ucred *a_cred;
5892         } */ *ap;
5893 {
5894         vattr_t *vap = ap->a_vap;
5895         xvattr_t xvap;
5896         u_long fflags = 0;
5897         int error;
5898
5899         xva_init(&xvap);
5900         xvap.xva_vattr = *vap;
5901         xvap.xva_vattr.va_mask |= AT_XVATTR;
5902
5903         /* Convert chflags into ZFS-type flags. */
5904         /* XXX: what about SF_SETTABLE?. */
5905         XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
5906         XVA_SET_REQ(&xvap, XAT_APPENDONLY);
5907         XVA_SET_REQ(&xvap, XAT_NOUNLINK);
5908         XVA_SET_REQ(&xvap, XAT_NODUMP);
5909         error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
5910         if (error != 0)
5911                 return (error);
5912
5913         /* Convert ZFS xattr into chflags. */
5914 #define FLAG_CHECK(fflag, xflag, xfield)        do {                    \
5915         if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)             \
5916                 fflags |= (fflag);                                      \
5917 } while (0)
5918         FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
5919             xvap.xva_xoptattrs.xoa_immutable);
5920         FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
5921             xvap.xva_xoptattrs.xoa_appendonly);
5922         FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
5923             xvap.xva_xoptattrs.xoa_nounlink);
5924         FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
5925             xvap.xva_xoptattrs.xoa_nodump);
5926 #undef  FLAG_CHECK
5927         *vap = xvap.xva_vattr;
5928         vap->va_flags = fflags;
5929         return (0);
5930 }
5931
5932 static int
5933 zfs_freebsd_setattr(ap)
5934         struct vop_setattr_args /* {
5935                 struct vnode *a_vp;
5936                 struct vattr *a_vap;
5937                 struct ucred *a_cred;
5938         } */ *ap;
5939 {
5940         vnode_t *vp = ap->a_vp;
5941         vattr_t *vap = ap->a_vap;
5942         cred_t *cred = ap->a_cred;
5943         xvattr_t xvap;
5944         u_long fflags;
5945         uint64_t zflags;
5946
5947         vattr_init_mask(vap);
5948         vap->va_mask &= ~AT_NOSET;
5949
5950         xva_init(&xvap);
5951         xvap.xva_vattr = *vap;
5952
5953         zflags = VTOZ(vp)->z_pflags;
5954
5955         if (vap->va_flags != VNOVAL) {
5956                 zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
5957                 int error;
5958
5959                 if (zfsvfs->z_use_fuids == B_FALSE)
5960                         return (EOPNOTSUPP);
5961
5962                 fflags = vap->va_flags;
5963                 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
5964                         return (EOPNOTSUPP);
5965                 /*
5966                  * Unprivileged processes are not permitted to unset system
5967                  * flags, or modify flags if any system flags are set.
5968                  * Privileged non-jail processes may not modify system flags
5969                  * if securelevel > 0 and any existing system flags are set.
5970                  * Privileged jail processes behave like privileged non-jail
5971                  * processes if the security.jail.chflags_allowed sysctl is
5972                  * is non-zero; otherwise, they behave like unprivileged
5973                  * processes.
5974                  */
5975                 if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
5976                     priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
5977                         if (zflags &
5978                             (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5979                                 error = securelevel_gt(cred, 0);
5980                                 if (error != 0)
5981                                         return (error);
5982                         }
5983                 } else {
5984                         /*
5985                          * Callers may only modify the file flags on objects they
5986                          * have VADMIN rights for.
5987                          */
5988                         if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
5989                                 return (error);
5990                         if (zflags &
5991                             (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5992                                 return (EPERM);
5993                         }
5994                         if (fflags &
5995                             (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
5996                                 return (EPERM);
5997                         }
5998                 }
5999
6000 #define FLAG_CHANGE(fflag, zflag, xflag, xfield)        do {            \
6001         if (((fflags & (fflag)) && !(zflags & (zflag))) ||              \
6002             ((zflags & (zflag)) && !(fflags & (fflag)))) {              \
6003                 XVA_SET_REQ(&xvap, (xflag));                            \
6004                 (xfield) = ((fflags & (fflag)) != 0);                   \
6005         }                                                               \
6006 } while (0)
6007                 /* Convert chflags into ZFS-type flags. */
6008                 /* XXX: what about SF_SETTABLE?. */
6009                 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
6010                     xvap.xva_xoptattrs.xoa_immutable);
6011                 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
6012                     xvap.xva_xoptattrs.xoa_appendonly);
6013                 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
6014                     xvap.xva_xoptattrs.xoa_nounlink);
6015                 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
6016                     xvap.xva_xoptattrs.xoa_nodump);
6017 #undef  FLAG_CHANGE
6018         }
6019         return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
6020 }
6021
6022 static int
6023 zfs_freebsd_rename(ap)
6024         struct vop_rename_args  /* {
6025                 struct vnode *a_fdvp;
6026                 struct vnode *a_fvp;
6027                 struct componentname *a_fcnp;
6028                 struct vnode *a_tdvp;
6029                 struct vnode *a_tvp;
6030                 struct componentname *a_tcnp;
6031         } */ *ap;
6032 {
6033         vnode_t *fdvp = ap->a_fdvp;
6034         vnode_t *fvp = ap->a_fvp;
6035         vnode_t *tdvp = ap->a_tdvp;
6036         vnode_t *tvp = ap->a_tvp;
6037         int error;
6038
6039         ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
6040         ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
6041
6042         error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
6043             ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
6044
6045         if (tdvp == tvp)
6046                 VN_RELE(tdvp);
6047         else
6048                 VN_URELE(tdvp);
6049         if (tvp)
6050                 VN_URELE(tvp);
6051         VN_RELE(fdvp);
6052         VN_RELE(fvp);
6053
6054         return (error);
6055 }
6056
6057 static int
6058 zfs_freebsd_symlink(ap)
6059         struct vop_symlink_args /* {
6060                 struct vnode *a_dvp;
6061                 struct vnode **a_vpp;
6062                 struct componentname *a_cnp;
6063                 struct vattr *a_vap;
6064                 char *a_target;
6065         } */ *ap;
6066 {
6067         struct componentname *cnp = ap->a_cnp;
6068         vattr_t *vap = ap->a_vap;
6069
6070         ASSERT(cnp->cn_flags & SAVENAME);
6071
6072         vap->va_type = VLNK;    /* FreeBSD: Syscall only sets va_mode. */
6073         vattr_init_mask(vap);
6074
6075         return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
6076             ap->a_target, cnp->cn_cred, cnp->cn_thread));
6077 }
6078
6079 static int
6080 zfs_freebsd_readlink(ap)
6081         struct vop_readlink_args /* {
6082                 struct vnode *a_vp;
6083                 struct uio *a_uio;
6084                 struct ucred *a_cred;
6085         } */ *ap;
6086 {
6087
6088         return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
6089 }
6090
6091 static int
6092 zfs_freebsd_link(ap)
6093         struct vop_link_args /* {
6094                 struct vnode *a_tdvp;
6095                 struct vnode *a_vp;
6096                 struct componentname *a_cnp;
6097         } */ *ap;
6098 {
6099         struct componentname *cnp = ap->a_cnp;
6100
6101         ASSERT(cnp->cn_flags & SAVENAME);
6102
6103         return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
6104 }
6105
6106 static int
6107 zfs_freebsd_inactive(ap)
6108         struct vop_inactive_args /* {
6109                 struct vnode *a_vp;
6110                 struct thread *a_td;
6111         } */ *ap;
6112 {
6113         vnode_t *vp = ap->a_vp;
6114
6115         zfs_inactive(vp, ap->a_td->td_ucred, NULL);
6116         return (0);
6117 }
6118
6119 static void
6120 zfs_reclaim_complete(void *arg, int pending)
6121 {
6122         znode_t *zp = arg;
6123         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
6124
6125         rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
6126         if (zp->z_sa_hdl != NULL) {
6127                 ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
6128                 zfs_znode_dmu_fini(zp);
6129                 ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
6130         }
6131         zfs_znode_free(zp);
6132         rw_exit(&zfsvfs->z_teardown_inactive_lock);
6133         /*
6134          * If the file system is being unmounted, there is a process waiting
6135          * for us, wake it up.
6136          */
6137         if (zfsvfs->z_unmounted)
6138                 wakeup_one(zfsvfs);
6139 }
6140
6141 static int
6142 zfs_freebsd_reclaim(ap)
6143         struct vop_reclaim_args /* {
6144                 struct vnode *a_vp;
6145                 struct thread *a_td;
6146         } */ *ap;
6147 {
6148         vnode_t *vp = ap->a_vp;
6149         znode_t *zp = VTOZ(vp);
6150         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
6151         boolean_t rlocked;
6152
6153         rlocked = rw_tryenter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
6154
6155         ASSERT(zp != NULL);
6156
6157         /*
6158          * Destroy the vm object and flush associated pages.
6159          */
6160         vnode_destroy_vobject(vp);
6161
6162         mutex_enter(&zp->z_lock);
6163         zp->z_vnode = NULL;
6164         mutex_exit(&zp->z_lock);
6165
6166         if (zp->z_unlinked) {
6167                 ;       /* Do nothing. */
6168         } else if (!rlocked) {
6169                 TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp);
6170                 taskqueue_enqueue(taskqueue_thread, &zp->z_task);
6171         } else if (zp->z_sa_hdl == NULL) {
6172                 zfs_znode_free(zp);
6173         } else /* if (!zp->z_unlinked && zp->z_dbuf != NULL) */ {
6174                 int locked;
6175
6176                 locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 :
6177                     ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id);
6178                 if (locked == 0) {
6179                         /*
6180                          * Lock can't be obtained due to deadlock possibility,
6181                          * so defer znode destruction.
6182                          */
6183                         TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp);
6184                         taskqueue_enqueue(taskqueue_thread, &zp->z_task);
6185                 } else {
6186                         zfs_znode_dmu_fini(zp);
6187                         if (locked == 1)
6188                                 ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
6189                         zfs_znode_free(zp);
6190                 }
6191         }
6192         VI_LOCK(vp);
6193         vp->v_data = NULL;
6194         ASSERT(vp->v_holdcnt >= 1);
6195         VI_UNLOCK(vp);
6196         if (rlocked)
6197                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
6198         return (0);
6199 }
6200
6201 static int
6202 zfs_freebsd_fid(ap)
6203         struct vop_fid_args /* {
6204                 struct vnode *a_vp;
6205                 struct fid *a_fid;
6206         } */ *ap;
6207 {
6208
6209         return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
6210 }
6211
6212 static int
6213 zfs_freebsd_pathconf(ap)
6214         struct vop_pathconf_args /* {
6215                 struct vnode *a_vp;
6216                 int a_name;
6217                 register_t *a_retval;
6218         } */ *ap;
6219 {
6220         ulong_t val;
6221         int error;
6222
6223         error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
6224         if (error == 0)
6225                 *ap->a_retval = val;
6226         else if (error == EOPNOTSUPP)
6227                 error = vop_stdpathconf(ap);
6228         return (error);
6229 }
6230
6231 static int
6232 zfs_freebsd_fifo_pathconf(ap)
6233         struct vop_pathconf_args /* {
6234                 struct vnode *a_vp;
6235                 int a_name;
6236                 register_t *a_retval;
6237         } */ *ap;
6238 {
6239
6240         switch (ap->a_name) {
6241         case _PC_ACL_EXTENDED:
6242         case _PC_ACL_NFS4:
6243         case _PC_ACL_PATH_MAX:
6244         case _PC_MAC_PRESENT:
6245                 return (zfs_freebsd_pathconf(ap));
6246         default:
6247                 return (fifo_specops.vop_pathconf(ap));
6248         }
6249 }
6250
6251 /*
6252  * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
6253  * extended attribute name:
6254  *
6255  *      NAMESPACE       PREFIX
6256  *      system          freebsd:system:
6257  *      user            (none, can be used to access ZFS fsattr(5) attributes
6258  *                      created on Solaris)
6259  */
6260 static int
6261 zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
6262     size_t size)
6263 {
6264         const char *namespace, *prefix, *suffix;
6265
6266         /* We don't allow '/' character in attribute name. */
6267         if (strchr(name, '/') != NULL)
6268                 return (EINVAL);
6269         /* We don't allow attribute names that start with "freebsd:" string. */
6270         if (strncmp(name, "freebsd:", 8) == 0)
6271                 return (EINVAL);
6272
6273         bzero(attrname, size);
6274
6275         switch (attrnamespace) {
6276         case EXTATTR_NAMESPACE_USER:
6277 #if 0
6278                 prefix = "freebsd:";
6279                 namespace = EXTATTR_NAMESPACE_USER_STRING;
6280                 suffix = ":";
6281 #else
6282                 /*
6283                  * This is the default namespace by which we can access all
6284                  * attributes created on Solaris.
6285                  */
6286                 prefix = namespace = suffix = "";
6287 #endif
6288                 break;
6289         case EXTATTR_NAMESPACE_SYSTEM:
6290                 prefix = "freebsd:";
6291                 namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
6292                 suffix = ":";
6293                 break;
6294         case EXTATTR_NAMESPACE_EMPTY:
6295         default:
6296                 return (EINVAL);
6297         }
6298         if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
6299             name) >= size) {
6300                 return (ENAMETOOLONG);
6301         }
6302         return (0);
6303 }
6304
6305 /*
6306  * Vnode operating to retrieve a named extended attribute.
6307  */
6308 static int
6309 zfs_getextattr(struct vop_getextattr_args *ap)
6310 /*
6311 vop_getextattr {
6312         IN struct vnode *a_vp;
6313         IN int a_attrnamespace;
6314         IN const char *a_name;
6315         INOUT struct uio *a_uio;
6316         OUT size_t *a_size;
6317         IN struct ucred *a_cred;
6318         IN struct thread *a_td;
6319 };
6320 */
6321 {
6322         zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6323         struct thread *td = ap->a_td;
6324         struct nameidata nd;
6325         char attrname[255];
6326         struct vattr va;
6327         vnode_t *xvp = NULL, *vp;
6328         int error, flags;
6329
6330         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6331             ap->a_cred, ap->a_td, VREAD);
6332         if (error != 0)
6333                 return (error);
6334
6335         error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6336             sizeof(attrname));
6337         if (error != 0)
6338                 return (error);
6339
6340         ZFS_ENTER(zfsvfs);
6341
6342         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6343             LOOKUP_XATTR);
6344         if (error != 0) {
6345                 ZFS_EXIT(zfsvfs);
6346                 return (error);
6347         }
6348
6349         flags = FREAD;
6350         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
6351             xvp, td);
6352         error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
6353         vp = nd.ni_vp;
6354         NDFREE(&nd, NDF_ONLY_PNBUF);
6355         if (error != 0) {
6356                 ZFS_EXIT(zfsvfs);
6357                 if (error == ENOENT)
6358                         error = ENOATTR;
6359                 return (error);
6360         }
6361
6362         if (ap->a_size != NULL) {
6363                 error = VOP_GETATTR(vp, &va, ap->a_cred);
6364                 if (error == 0)
6365                         *ap->a_size = (size_t)va.va_size;
6366         } else if (ap->a_uio != NULL)
6367                 error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
6368
6369         VOP_UNLOCK(vp, 0);
6370         vn_close(vp, flags, ap->a_cred, td);
6371         ZFS_EXIT(zfsvfs);
6372
6373         return (error);
6374 }
6375
6376 /*
6377  * Vnode operation to remove a named attribute.
6378  */
6379 int
6380 zfs_deleteextattr(struct vop_deleteextattr_args *ap)
6381 /*
6382 vop_deleteextattr {
6383         IN struct vnode *a_vp;
6384         IN int a_attrnamespace;
6385         IN const char *a_name;
6386         IN struct ucred *a_cred;
6387         IN struct thread *a_td;
6388 };
6389 */
6390 {
6391         zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6392         struct thread *td = ap->a_td;
6393         struct nameidata nd;
6394         char attrname[255];
6395         struct vattr va;
6396         vnode_t *xvp = NULL, *vp;
6397         int error, flags;
6398
6399         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6400             ap->a_cred, ap->a_td, VWRITE);
6401         if (error != 0)
6402                 return (error);
6403
6404         error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6405             sizeof(attrname));
6406         if (error != 0)
6407                 return (error);
6408
6409         ZFS_ENTER(zfsvfs);
6410
6411         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6412             LOOKUP_XATTR);
6413         if (error != 0) {
6414                 ZFS_EXIT(zfsvfs);
6415                 return (error);
6416         }
6417
6418         NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF | MPSAFE,
6419             UIO_SYSSPACE, attrname, xvp, td);
6420         error = namei(&nd);
6421         vp = nd.ni_vp;
6422         NDFREE(&nd, NDF_ONLY_PNBUF);
6423         if (error != 0) {
6424                 ZFS_EXIT(zfsvfs);
6425                 if (error == ENOENT)
6426                         error = ENOATTR;
6427                 return (error);
6428         }
6429         error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
6430
6431         vput(nd.ni_dvp);
6432         if (vp == nd.ni_dvp)
6433                 vrele(vp);
6434         else
6435                 vput(vp);
6436         ZFS_EXIT(zfsvfs);
6437
6438         return (error);
6439 }
6440
6441 /*
6442  * Vnode operation to set a named attribute.
6443  */
6444 static int
6445 zfs_setextattr(struct vop_setextattr_args *ap)
6446 /*
6447 vop_setextattr {
6448         IN struct vnode *a_vp;
6449         IN int a_attrnamespace;
6450         IN const char *a_name;
6451         INOUT struct uio *a_uio;
6452         IN struct ucred *a_cred;
6453         IN struct thread *a_td;
6454 };
6455 */
6456 {
6457         zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6458         struct thread *td = ap->a_td;
6459         struct nameidata nd;
6460         char attrname[255];
6461         struct vattr va;
6462         vnode_t *xvp = NULL, *vp;
6463         int error, flags;
6464
6465         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6466             ap->a_cred, ap->a_td, VWRITE);
6467         if (error != 0)
6468                 return (error);
6469
6470         error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6471             sizeof(attrname));
6472         if (error != 0)
6473                 return (error);
6474
6475         ZFS_ENTER(zfsvfs);
6476
6477         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6478             LOOKUP_XATTR | CREATE_XATTR_DIR);
6479         if (error != 0) {
6480                 ZFS_EXIT(zfsvfs);
6481                 return (error);
6482         }
6483
6484         flags = FFLAGS(O_WRONLY | O_CREAT);
6485         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
6486             xvp, td);
6487         error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
6488         vp = nd.ni_vp;
6489         NDFREE(&nd, NDF_ONLY_PNBUF);
6490         if (error != 0) {
6491                 ZFS_EXIT(zfsvfs);
6492                 return (error);
6493         }
6494
6495         VATTR_NULL(&va);
6496         va.va_size = 0;
6497         error = VOP_SETATTR(vp, &va, ap->a_cred);
6498         if (error == 0)
6499                 VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);
6500
6501         VOP_UNLOCK(vp, 0);
6502         vn_close(vp, flags, ap->a_cred, td);
6503         ZFS_EXIT(zfsvfs);
6504
6505         return (error);
6506 }
6507
6508 /*
6509  * Vnode operation to retrieve extended attributes on a vnode.
6510  */
6511 static int
6512 zfs_listextattr(struct vop_listextattr_args *ap)
6513 /*
6514 vop_listextattr {
6515         IN struct vnode *a_vp;
6516         IN int a_attrnamespace;
6517         INOUT struct uio *a_uio;
6518         OUT size_t *a_size;
6519         IN struct ucred *a_cred;
6520         IN struct thread *a_td;
6521 };
6522 */
6523 {
6524         zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6525         struct thread *td = ap->a_td;
6526         struct nameidata nd;
6527         char attrprefix[16];
6528         u_char dirbuf[sizeof(struct dirent)];
6529         struct dirent *dp;
6530         struct iovec aiov;
6531         struct uio auio, *uio = ap->a_uio;
6532         size_t *sizep = ap->a_size;
6533         size_t plen;
6534         vnode_t *xvp = NULL, *vp;
6535         int done, error, eof, pos;
6536
6537         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6538             ap->a_cred, ap->a_td, VREAD);
6539         if (error != 0)
6540                 return (error);
6541
6542         error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
6543             sizeof(attrprefix));
6544         if (error != 0)
6545                 return (error);
6546         plen = strlen(attrprefix);
6547
6548         ZFS_ENTER(zfsvfs);
6549
6550         if (sizep != NULL)
6551                 *sizep = 0;
6552
6553         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6554             LOOKUP_XATTR);
6555         if (error != 0) {
6556                 ZFS_EXIT(zfsvfs);
6557                 /*
6558                  * ENOATTR means that the EA directory does not yet exist,
6559                  * i.e. there are no extended attributes there.
6560                  */
6561                 if (error == ENOATTR)
6562                         error = 0;
6563                 return (error);
6564         }
6565
6566         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE,
6567             UIO_SYSSPACE, ".", xvp, td);
6568         error = namei(&nd);
6569         vp = nd.ni_vp;
6570         NDFREE(&nd, NDF_ONLY_PNBUF);
6571         if (error != 0) {
6572                 ZFS_EXIT(zfsvfs);
6573                 return (error);
6574         }
6575
6576         auio.uio_iov = &aiov;
6577         auio.uio_iovcnt = 1;
6578         auio.uio_segflg = UIO_SYSSPACE;
6579         auio.uio_td = td;
6580         auio.uio_rw = UIO_READ;
6581         auio.uio_offset = 0;
6582
6583         do {
6584                 u_char nlen;
6585
6586                 aiov.iov_base = (void *)dirbuf;
6587                 aiov.iov_len = sizeof(dirbuf);
6588                 auio.uio_resid = sizeof(dirbuf);
6589                 error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
6590                 done = sizeof(dirbuf) - auio.uio_resid;
6591                 if (error != 0)
6592                         break;
6593                 for (pos = 0; pos < done;) {
6594                         dp = (struct dirent *)(dirbuf + pos);
6595                         pos += dp->d_reclen;
6596                         /*
6597                          * XXX: Temporarily we also accept DT_UNKNOWN, as this
6598                          * is what we get when attribute was created on Solaris.
6599                          */
6600                         if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
6601                                 continue;
6602                         if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
6603                                 continue;
6604                         else if (strncmp(dp->d_name, attrprefix, plen) != 0)
6605                                 continue;
6606                         nlen = dp->d_namlen - plen;
6607                         if (sizep != NULL)
6608                                 *sizep += 1 + nlen;
6609                         else if (uio != NULL) {
6610                                 /*
6611                                  * Format of extattr name entry is one byte for
6612                                  * length and the rest for name.
6613                                  */
6614                                 error = uiomove(&nlen, 1, uio->uio_rw, uio);
6615                                 if (error == 0) {
6616                                         error = uiomove(dp->d_name + plen, nlen,
6617                                             uio->uio_rw, uio);
6618                                 }
6619                                 if (error != 0)
6620                                         break;
6621                         }
6622                 }
6623         } while (!eof && error == 0);
6624
6625         vput(vp);
6626         ZFS_EXIT(zfsvfs);
6627
6628         return (error);
6629 }
6630
6631 int
6632 zfs_freebsd_getacl(ap)
6633         struct vop_getacl_args /* {
6634                 struct vnode *vp;
6635                 acl_type_t type;
6636                 struct acl *aclp;
6637                 struct ucred *cred;
6638                 struct thread *td;
6639         } */ *ap;
6640 {
6641         int             error;
6642         vsecattr_t      vsecattr;
6643
6644         if (ap->a_type != ACL_TYPE_NFS4)
6645                 return (EINVAL);
6646
6647         vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
6648         if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
6649                 return (error);
6650
6651         error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
6652         if (vsecattr.vsa_aclentp != NULL)
6653                 kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
6654
6655         return (error);
6656 }
6657
6658 int
6659 zfs_freebsd_setacl(ap)
6660         struct vop_setacl_args /* {
6661                 struct vnode *vp;
6662                 acl_type_t type;
6663                 struct acl *aclp;
6664                 struct ucred *cred;
6665                 struct thread *td;
6666         } */ *ap;
6667 {
6668         int             error;
6669         vsecattr_t      vsecattr;
6670         int             aclbsize;       /* size of acl list in bytes */
6671         aclent_t        *aaclp;
6672
6673         if (ap->a_type != ACL_TYPE_NFS4)
6674                 return (EINVAL);
6675
6676         if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
6677                 return (EINVAL);
6678
6679         /*
6680          * With NFSv4 ACLs, chmod(2) may need to add additional entries,
6681          * splitting every entry into two and appending "canonical six"
6682          * entries at the end.  Don't allow for setting an ACL that would
6683          * cause chmod(2) to run out of ACL entries.
6684          */
6685         if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
6686                 return (ENOSPC);
6687
6688         error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
6689         if (error != 0)
6690                 return (error);
6691
6692         vsecattr.vsa_mask = VSA_ACE;
6693         aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
6694         vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
6695         aaclp = vsecattr.vsa_aclentp;
6696         vsecattr.vsa_aclentsz = aclbsize;
6697
6698         aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
6699         error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
6700         kmem_free(aaclp, aclbsize);
6701
6702         return (error);
6703 }
6704
/*
 * VOP_ACLCHECK handler.
 *
 * The argument block (vnode, ACL type, ACL, credentials, thread) is not
 * examined; the function unconditionally returns EOPNOTSUPP.
 */
int
zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
{

        return (EOPNOTSUPP);
}
6718
6719 struct vop_vector zfs_vnodeops;
6720 struct vop_vector zfs_fifoops;
6721 struct vop_vector zfs_shareops;
6722
6723 struct vop_vector zfs_vnodeops = {
6724         .vop_default =          &default_vnodeops,
6725         .vop_inactive =         zfs_freebsd_inactive,
6726         .vop_reclaim =          zfs_freebsd_reclaim,
6727         .vop_access =           zfs_freebsd_access,
6728 #ifdef FREEBSD_NAMECACHE
6729         .vop_lookup =           vfs_cache_lookup,
6730         .vop_cachedlookup =     zfs_freebsd_lookup,
6731 #else
6732         .vop_lookup =           zfs_freebsd_lookup,
6733 #endif
6734         .vop_getattr =          zfs_freebsd_getattr,
6735         .vop_setattr =          zfs_freebsd_setattr,
6736         .vop_create =           zfs_freebsd_create,
6737         .vop_mknod =            zfs_freebsd_create,
6738         .vop_mkdir =            zfs_freebsd_mkdir,
6739         .vop_readdir =          zfs_freebsd_readdir,
6740         .vop_fsync =            zfs_freebsd_fsync,
6741         .vop_open =             zfs_freebsd_open,
6742         .vop_close =            zfs_freebsd_close,
6743         .vop_rmdir =            zfs_freebsd_rmdir,
6744         .vop_ioctl =            zfs_freebsd_ioctl,
6745         .vop_link =             zfs_freebsd_link,
6746         .vop_symlink =          zfs_freebsd_symlink,
6747         .vop_readlink =         zfs_freebsd_readlink,
6748         .vop_read =             zfs_freebsd_read,
6749         .vop_write =            zfs_freebsd_write,
6750         .vop_remove =           zfs_freebsd_remove,
6751         .vop_rename =           zfs_freebsd_rename,
6752         .vop_pathconf =         zfs_freebsd_pathconf,
6753         .vop_bmap =             VOP_EOPNOTSUPP,
6754         .vop_fid =              zfs_freebsd_fid,
6755         .vop_getextattr =       zfs_getextattr,
6756         .vop_deleteextattr =    zfs_deleteextattr,
6757         .vop_setextattr =       zfs_setextattr,
6758         .vop_listextattr =      zfs_listextattr,
6759         .vop_getacl =           zfs_freebsd_getacl,
6760         .vop_setacl =           zfs_freebsd_setacl,
6761         .vop_aclcheck =         zfs_freebsd_aclcheck,
6762         .vop_getpages =         zfs_freebsd_getpages,
6763 };
6764
6765 struct vop_vector zfs_fifoops = {
6766         .vop_default =          &fifo_specops,
6767         .vop_fsync =            zfs_freebsd_fsync,
6768         .vop_access =           zfs_freebsd_access,
6769         .vop_getattr =          zfs_freebsd_getattr,
6770         .vop_inactive =         zfs_freebsd_inactive,
6771         .vop_read =             VOP_PANIC,
6772         .vop_reclaim =          zfs_freebsd_reclaim,
6773         .vop_setattr =          zfs_freebsd_setattr,
6774         .vop_write =            VOP_PANIC,
6775         .vop_pathconf =         zfs_freebsd_fifo_pathconf,
6776         .vop_fid =              zfs_freebsd_fid,
6777         .vop_getacl =           zfs_freebsd_getacl,
6778         .vop_setacl =           zfs_freebsd_setacl,
6779         .vop_aclcheck =         zfs_freebsd_aclcheck,
6780 };
6781
6782 /*
6783  * special share hidden files vnode operations template
6784  */
6785 struct vop_vector zfs_shareops = {
6786         .vop_default =          &default_vnodeops,
6787         .vop_access =           zfs_freebsd_access,
6788         .vop_inactive =         zfs_freebsd_inactive,
6789         .vop_reclaim =          zfs_freebsd_reclaim,
6790         .vop_fid =              zfs_freebsd_fid,
6791         .vop_pathconf =         zfs_freebsd_pathconf,
6792 };