sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /* Portions Copyright 2007 Jeremy Teo */
  27
  28 #include <sys/types.h>
  29 #include <sys/param.h>
  30 #include <sys/time.h>
  31 #include <sys/systm.h>
  32 #include <sys/sysmacros.h>
  33 #include <sys/resource.h>
  34 #include <sys/vfs.h>
  35 #include <sys/vnode.h>
  36 #include <sys/file.h>
  37 #include <sys/stat.h>
  38 #include <sys/kmem.h>
  39 #include <sys/taskq.h>
  40 #include <sys/uio.h>
  41 #include <sys/atomic.h>
  42 #include <sys/namei.h>
  43 #include <sys/mman.h>
  44 #include <sys/cmn_err.h>
  45 #include <sys/errno.h>
  46 #include <sys/unistd.h>
  47 #include <sys/zfs_dir.h>
  48 #include <sys/zfs_ioctl.h>
  49 #include <sys/fs/zfs.h>
  50 #include <sys/dmu.h>
  51 #include <sys/spa.h>
  52 #include <sys/txg.h>
  53 #include <sys/dbuf.h>
  54 #include <sys/zap.h>
  55 #include <sys/dirent.h>
  56 #include <sys/policy.h>
  57 #include <sys/sunddi.h>
  58 #include <sys/filio.h>
  59 #include <sys/sid.h>
  60 #include <sys/zfs_ctldir.h>
  61 #include <sys/zfs_fuid.h>
  62 #include <sys/dnlc.h>
  63 #include <sys/zfs_rlock.h>
  64 #include <sys/extdirent.h>
  65 #include <sys/kidmap.h>
  66 #include <sys/bio.h>
  67 #include <sys/buf.h>
  68 #include <sys/sf_buf.h>
  69 #include <sys/sched.h>
  70 #include <sys/acl.h>
  71
  72 /*
  73  * Programming rules.
  74  *
  75  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  76  * properly lock its in-core state, create a DMU transaction, do the work,
  77  * record this work in the intent log (ZIL), commit the DMU transaction,
  78  * and wait for the intent log to commit if it is a synchronous operation.
  79  * Moreover, the vnode ops must work in both normal and log replay context.
  80  * The ordering of events is important to avoid deadlocks and references
  81  * to freed memory.  The example below illustrates the following Big Rules:
  82  *
  83  *  (1) A check must be made in each zfs thread for a mounted file system.
  84  *      This is done avoiding races using ZFS_ENTER(zfsvfs).
  85  *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
  86  *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
  87  *      can return EIO from the calling function.
  88  *
  89  *  (2) VN_RELE() should always be the last thing except for zil_commit()
  90  *      (if necessary) and ZFS_EXIT(). This is for 3 reasons:
  91  *      First, if it's the last reference, the vnode/znode
  92  *      can be freed, so the zp may point to freed memory.  Second, the last
  93  *      reference will call zfs_zinactive(), which may induce a lot of work --
  94  *      pushing cached pages (which acquires range locks) and syncing out
  95  *      cached atime changes.  Third, zfs_zinactive() may require a new tx,
  96  *      which could deadlock the system if you were already holding one.
  97  *      If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
  98  *
  99  *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 100  *      as they can span dmu_tx_assign() calls.
 101  *
 102  *  (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
 103  *      This is critical because we don't want to block while holding locks.
 104  *      Note, in particular, that if a lock is sometimes acquired before
 105  *      the tx assigns, and sometimes after (e.g. z_lock), then failing to
 106  *      use a non-blocking assign can deadlock the system.  The scenario:
 107  *
 108  *      Thread A has grabbed a lock before calling dmu_tx_assign().
 109  *      Thread B is in an already-assigned tx, and blocks for this lock.
 110  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 111  *      forever, because the previous txg can't quiesce until B's tx commits.
 112  *
 113  *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 114  *      then drop all locks, call dmu_tx_wait(), and try again.
 115  *
 116  *  (5) If the operation succeeded, generate the intent log entry for it
 117  *      before dropping locks.  This ensures that the ordering of events
 118  *      in the intent log matches the order in which they actually occurred.
 119  *      During ZIL replay the zfs_log_* functions will update the sequence
 120  *      number to indicate the zil transaction has replayed.
 121  *
 122  *  (6) At the end of each vnode op, the DMU tx must always commit,
 123  *      regardless of whether there were any errors.
 124  *
 125  *  (7) After dropping all locks, invoke zil_commit(zilog, seq, foid)
 126  *      to ensure that synchronous semantics are provided when necessary.
 127  *
 128  * In general, this is how things should be ordered in each vnode op:
 129  *
 130  *      ZFS_ENTER(zfsvfs);              // exit if unmounted
 131  * top:
 132  *      zfs_dirent_lock(&dl, ...)       // lock directory entry (may VN_HOLD())
 133  *      rw_enter(...);                  // grab any other locks you need
 134  *      tx = dmu_tx_create(...);        // get DMU tx
 135  *      dmu_tx_hold_*();                // hold each object you might modify
 136  *      error = dmu_tx_assign(tx, TXG_NOWAIT);  // try to assign
 137  *      if (error) {
 138  *              rw_exit(...);           // drop locks
 139  *              zfs_dirent_unlock(dl);  // unlock directory entry
 140  *              VN_RELE(...);           // release held vnodes
 141  *              if (error == ERESTART) {
 142  *                      dmu_tx_wait(tx);
 143  *                      dmu_tx_abort(tx);
 144  *                      goto top;
 145  *              }
 146  *              dmu_tx_abort(tx);       // abort DMU tx
 147  *              ZFS_EXIT(zfsvfs);       // finished in zfs
 148  *              return (error);         // really out of space
 149  *      }
 150  *      error = do_real_work();         // do whatever this VOP does
 151  *      if (error == 0)
 152  *              zfs_log_*(...);         // on success, make ZIL entry
 153  *      dmu_tx_commit(tx);              // commit DMU tx -- error or not
 154  *      rw_exit(...);                   // drop locks
 155  *      zfs_dirent_unlock(dl);          // unlock directory entry
 156  *      VN_RELE(...);                   // release held vnodes
 157  *      zil_commit(zilog, seq, foid);   // synchronous when necessary
 158  *      ZFS_EXIT(zfsvfs);               // finished in zfs
 159  *      return (error);                 // done, report error
 160  */
 161
 162 /* ARGSUSED */
 163 static int
 164 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 165 {
 166         znode_t *zp = VTOZ(*vpp);
 167         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 168
 169         ZFS_ENTER(zfsvfs);
 170         ZFS_VERIFY_ZP(zp);
 171
 172         if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
 173             ((flag & FAPPEND) == 0)) {
 174                 ZFS_EXIT(zfsvfs);
 175                 return (EPERM);
 176         }
 177
 178         if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
 179             ZTOV(zp)->v_type == VREG &&
 180             !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
 181             zp->z_phys->zp_size > 0) {
 182                 if (fs_vscan(*vpp, cr, 0) != 0) {
 183                         ZFS_EXIT(zfsvfs);
 184                         return (EACCES);
 185                 }
 186         }
 187
 188         /* Keep a count of the synchronous opens in the znode */
 189         if (flag & (FSYNC | FDSYNC))
 190                 atomic_inc_32(&zp->z_sync_cnt);
 191
 192         ZFS_EXIT(zfsvfs);
 193         return (0);
 194 }
 195
 196 /* ARGSUSED */
 197 static int
 198 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
 199     caller_context_t *ct)
 200 {
 201         znode_t *zp = VTOZ(vp);
 202         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 203
 204         ZFS_ENTER(zfsvfs);
 205         ZFS_VERIFY_ZP(zp);
 206
 207         /* Decrement the synchronous opens in the znode */
 208         if ((flag & (FSYNC | FDSYNC)) && (count == 1))
 209                 atomic_dec_32(&zp->z_sync_cnt);
 210
 211         /*
 212          * Clean up any locks held by this process on the vp.
 213          */
 214         cleanlocks(vp, ddi_get_pid(), 0);
 215         cleanshares(vp, ddi_get_pid());
 216
 217         if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
 218             ZTOV(zp)->v_type == VREG &&
 219             !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
 220             zp->z_phys->zp_size > 0)
 221                 VERIFY(fs_vscan(vp, cr, 1) == 0);
 222
 223         ZFS_EXIT(zfsvfs);
 224         return (0);
 225 }
 226
 227 /*
 228  * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 229  * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 230  */
 231 static int
 232 zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
 233 {
 234         znode_t *zp = VTOZ(vp);
 235         uint64_t noff = (uint64_t)*off; /* new offset */
 236         uint64_t file_sz;
 237         int error;
 238         boolean_t hole;
 239
 240         file_sz = zp->z_phys->zp_size;
 241         if (noff >= file_sz)  {
 242                 return (ENXIO);
 243         }
 244
 245         if (cmd == _FIO_SEEK_HOLE)
 246                 hole = B_TRUE;
 247         else
 248                 hole = B_FALSE;
 249
 250         error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
 251
 252         /* end of file? */
 253         if ((error == ESRCH) || (noff > file_sz)) {
 254                 /*
 255                  * Handle the virtual hole at the end of file.
 256                  */
 257                 if (hole) {
 258                         *off = file_sz;
 259                         return (0);
 260                 }
 261                 return (ENXIO);
 262         }
 263
 264         if (noff < *off)
 265                 return (error);
 266         *off = noff;
 267         return (error);
 268 }
 269
 270 /* ARGSUSED */
 271 static int
 272 zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
 273     int *rvalp, caller_context_t *ct)
 274 {
 275         offset_t off;
 276         int error;
 277         zfsvfs_t *zfsvfs;
 278         znode_t *zp;
 279
 280         switch (com) {
 281         case _FIOFFS:
 282                 return (0);
 283
 284                 /*
 285                  * The following two ioctls are used by bfu.  Faking out,
 286                  * necessary to avoid bfu errors.
 287                  */
 288         case _FIOGDIO:
 289         case _FIOSDIO:
 290                 return (0);
 291
 292         case _FIO_SEEK_DATA:
 293         case _FIO_SEEK_HOLE:
 294                 if (ddi_copyin((void *)data, &off, sizeof (off), flag))
 295                         return (EFAULT);
 296
 297                 zp = VTOZ(vp);
 298                 zfsvfs = zp->z_zfsvfs;
 299                 ZFS_ENTER(zfsvfs);
 300                 ZFS_VERIFY_ZP(zp);
 301
 302                 /* offset parameter is in/out */
 303                 error = zfs_holey(vp, com, &off);
 304                 ZFS_EXIT(zfsvfs);
 305                 if (error)
 306                         return (error);
 307                 if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
 308                         return (EFAULT);
 309                 return (0);
 310         }
 311         return (ENOTTY);
 312 }
 313
 314 static vm_page_t
 315 page_lookup(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
 316 {
 317         vm_object_t obj;
 318         vm_page_t pp;
 319
 320         obj = vp->v_object;
 321         VM_OBJECT_LOCK_ASSERT(obj, MA_OWNED);
 322
 323         for (;;) {
 324                 if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 325                     vm_page_is_valid(pp, (vm_offset_t)off, nbytes)) {
 326                         if (vm_page_sleep_if_busy(pp, FALSE, "zfsmwb"))
 327                                 continue;
 328                         vm_page_busy(pp);
 329                         vm_page_lock_queues();
 330                         vm_page_undirty(pp);
 331                         vm_page_unlock_queues();
 332                 } else {
 333                         if (__predict_false(obj->cache != NULL)) {
 334                                 vm_page_cache_free(obj, OFF_TO_IDX(start),
 335                                     OFF_TO_IDX(start) + 1);
 336                         }
 337                         pp = NULL;
 338                 }
 339                 break;
 340         }
 341         return (pp);
 342 }
 343
 344 static void
 345 page_unlock(vm_page_t pp)
 346 {
 347
 348         vm_page_wakeup(pp);
 349 }
 350
 351 static caddr_t
 352 zfs_map_page(vm_page_t pp, struct sf_buf **sfp)
 353 {
 354
 355         sched_pin();
 356         *sfp = sf_buf_alloc(pp, SFB_CPUPRIVATE);
 357         return ((caddr_t)sf_buf_kva(*sfp));
 358 }
 359
 360 static void
 361 zfs_unmap_page(struct sf_buf *sf)
 362 {
 363
 364         sf_buf_free(sf);
 365         sched_unpin();
 366 }
 367
 368
 369 /*
 370  * When a file is memory mapped, we must keep the IO data synchronized
 371  * between the DMU cache and the memory mapped pages.  What this means:
 372  *
 373  * On Write:    If we find a memory mapped page, we write to *both*
 374  *              the page and the dmu buffer.
 375  */
 376
 377 static void
 378 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
 379     int segflg, dmu_tx_t *tx)
 380 {
 381         vm_object_t obj;
 382         struct sf_buf *sf;
 383         int64_t off;
 384
 385         ASSERT(vp->v_mount != NULL);
 386         obj = vp->v_object;
 387         ASSERT(obj != NULL);
 388
 389         off = start & PAGEOFFSET;
 390         VM_OBJECT_LOCK(obj);
 391         for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 392                 vm_page_t pp;
 393                 uint64_t nbytes = MIN(PAGESIZE - off, len);
 394
 395                 if ((pp = page_lookup(vp, start, off, nbytes)) != NULL) {
 396                         caddr_t va;
 397
 398                         VM_OBJECT_UNLOCK(obj);
 399                         va = zfs_map_page(pp, &sf);
 400                         if (segflg == UIO_NOCOPY) {
 401                                 (void) dmu_write(os, oid, start+off, nbytes,
 402                                     va+off, tx);
 403                         } else {
 404                                 (void) dmu_read(os, oid, start+off, nbytes,
 405                                     va+off, DMU_READ_PREFETCH);;
 406                         }
 407                         zfs_unmap_page(sf);
 408                         VM_OBJECT_LOCK(obj);
 409                         page_unlock(pp);
 410
 411                 }
 412                 len -= nbytes;
 413                 off = 0;
 414         }
 415         VM_OBJECT_UNLOCK(obj);
 416 }
 417
 418 /*
 419  * When a file is memory mapped, we must keep the IO data synchronized
 420  * between the DMU cache and the memory mapped pages.  What this means:
 421  *
 422  * On Read:     We "read" preferentially from memory mapped pages,
 423  *              else we default from the dmu buffer.
 424  *
 425  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 426  *      the file is memory mapped.
 427  */
 428 static int
 429 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
 430 {
 431         znode_t *zp = VTOZ(vp);
 432         objset_t *os = zp->z_zfsvfs->z_os;
 433         vm_object_t obj;
 434         vm_page_t m;
 435         struct sf_buf *sf;
 436         int64_t start, off;
 437         caddr_t va;
 438         int len = nbytes;
 439         int error = 0;
 440         uint64_t dirbytes;
 441
 442         ASSERT(vp->v_mount != NULL);
 443         obj = vp->v_object;
 444         ASSERT(obj != NULL);
 445
 446         start = uio->uio_loffset;
 447         off = start & PAGEOFFSET;
 448         dirbytes = 0;
 449         VM_OBJECT_LOCK(obj);
 450         for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 451                 uint64_t bytes = MIN(PAGESIZE - off, len);
 452
 453 again:
 454                 if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 455                     vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
 456                         if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
 457                                 goto again;
 458                         vm_page_busy(m);
 459                         VM_OBJECT_UNLOCK(obj);
 460                         if (dirbytes > 0) {
 461                                 error = dmu_read_uio(os, zp->z_id, uio,
 462                                     dirbytes);
 463                                 dirbytes = 0;
 464                         }
 465                         if (error == 0) {
 466                                 sched_pin();
 467                                 sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
 468                                 va = (caddr_t)sf_buf_kva(sf);
 469                                 error = uiomove(va + off, bytes, UIO_READ, uio);
 470                                 sf_buf_free(sf);
 471                                 sched_unpin();
 472                         }
 473                         VM_OBJECT_LOCK(obj);
 474                         vm_page_wakeup(m);
 475                 } else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
 476                         /*
 477                          * The code below is here to make sendfile(2) work
 478                          * correctly with ZFS. As pointed out by ups@
 479                          * sendfile(2) should be changed to use VOP_GETPAGES(),
 480                          * but it pessimize performance of sendfile/UFS, that's
 481                          * why I handle this special case in ZFS code.
 482                          */
 483                         if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
 484                                 goto again;
 485                         vm_page_busy(m);
 486                         VM_OBJECT_UNLOCK(obj);
 487                         if (dirbytes > 0) {
 488                                 error = dmu_read_uio(os, zp->z_id, uio,
 489                                     dirbytes);
 490                                 dirbytes = 0;
 491                         }
 492                         if (error == 0) {
 493                                 sched_pin();
 494                                 sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
 495                                 va = (caddr_t)sf_buf_kva(sf);
 496                                 error = dmu_read(os, zp->z_id, start + off,
 497                                     bytes, (void *)(va + off),
 498                                     DMU_READ_PREFETCH);
 499                                 sf_buf_free(sf);
 500                                 sched_unpin();
 501                         }
 502                         VM_OBJECT_LOCK(obj);
 503                         vm_page_wakeup(m);
 504                         if (error == 0)
 505                                 uio->uio_resid -= bytes;
 506                 } else {
 507                         dirbytes += bytes;
 508                 }
 509                 len -= bytes;
 510                 off = 0;
 511                 if (error)
 512                         break;
 513         }
 514         VM_OBJECT_UNLOCK(obj);
 515         if (error == 0 && dirbytes > 0)
 516                 error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
 517         return (error);
 518 }
 519
 520 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
 521
 522 /*
 523  * Read bytes from specified file into supplied buffer.
 524  *
 525  *      IN:     vp      - vnode of file to be read from.
 526  *              uio     - structure supplying read location, range info,
 527  *                        and return buffer.
 528  *              ioflag  - SYNC flags; used to provide FRSYNC semantics.
 529  *              cr      - credentials of caller.
 530  *              ct      - caller context
 531  *
 532  *      OUT:    uio     - updated offset and range, buffer filled.
 533  *
 534  *      RETURN: 0 if success
 535  *              error code if failure
 536  *
 537  * Side Effects:
 538  *      vp - atime updated if byte count > 0
 539  */
 540 /* ARGSUSED */
 541 static int
 542 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 543 {
 544         znode_t         *zp = VTOZ(vp);
 545         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
 546         objset_t        *os;
 547         ssize_t         n, nbytes;
 548         int             error;
 549         rl_t            *rl;
 550
 551         ZFS_ENTER(zfsvfs);
 552         ZFS_VERIFY_ZP(zp);
 553         os = zfsvfs->z_os;
 554
 555         if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
 556                 ZFS_EXIT(zfsvfs);
 557                 return (EACCES);
 558         }
 559
 560         /*
 561          * Validate file offset
 562          */
 563         if (uio->uio_loffset < (offset_t)0) {
 564                 ZFS_EXIT(zfsvfs);
 565                 return (EINVAL);
 566         }
 567
 568         /*
 569          * Fasttrack empty reads
 570          */
 571         if (uio->uio_resid == 0) {
 572                 ZFS_EXIT(zfsvfs);
 573                 return (0);
 574         }
 575
 576         /*
 577          * Check for mandatory locks
 578          */
 579         if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
 580                 if (error = chklock(vp, FREAD,
 581                     uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
 582                         ZFS_EXIT(zfsvfs);
 583                         return (error);
 584                 }
 585         }
 586
 587         /*
 588          * If we're in FRSYNC mode, sync out this znode before reading it.
 589          */
 590         if (ioflag & FRSYNC)
 591                 zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
 592
 593         /*
 594          * Lock the range against changes.
 595          */
 596         rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
 597
 598         /*
 599          * If we are reading past end-of-file we can skip
 600          * to the end; but we might still need to set atime.
 601          */
 602         if (uio->uio_loffset >= zp->z_phys->zp_size) {
 603                 error = 0;
 604                 goto out;
 605         }
 606
 607         ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
 608         n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
 609
 610         while (n > 0) {
 611                 nbytes = MIN(n, zfs_read_chunk_size -
 612                     P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 613
 614                 if (vn_has_cached_data(vp))
 615                         error = mappedread(vp, nbytes, uio);
 616                 else
 617                         error = dmu_read_uio(os, zp->z_id, uio, nbytes);
 618                 if (error) {
 619                         /* convert checksum errors into IO errors */
 620                         if (error == ECKSUM)
 621                                 error = EIO;
 622                         break;
 623                 }
 624
 625                 n -= nbytes;
 626         }
 627
 628 out:
 629         zfs_range_unlock(rl);
 630
 631         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 632         ZFS_EXIT(zfsvfs);
 633         return (error);
 634 }
 635
 636 /*
 637  * Fault in the pages of the first n bytes specified by the uio structure.
 638  * 1 byte in each page is touched and the uio struct is unmodified.
 639  * Any error will exit this routine as this is only a best
 640  * attempt to get the pages resident. This is a copy of ufs_trans_touch().
 641  */
 642 static void
 643 zfs_prefault_write(ssize_t n, struct uio *uio)
 644 {
 645         struct iovec *iov;
 646         ulong_t cnt, incr;
 647         caddr_t p;
 648
 649         if (uio->uio_segflg != UIO_USERSPACE)
 650                 return;
 651
 652         iov = uio->uio_iov;
 653
 654         while (n) {
 655                 cnt = MIN(iov->iov_len, n);
 656                 if (cnt == 0) {
 657                         /* empty iov entry */
 658                         iov++;
 659                         continue;
 660                 }
 661                 n -= cnt;
 662                 /*
 663                  * touch each page in this segment.
 664                  */
 665                 p = iov->iov_base;
 666                 while (cnt) {
 667                         if (fubyte(p) == -1)
 668                                 return;
 669                         incr = MIN(cnt, PAGESIZE);
 670                         p += incr;
 671                         cnt -= incr;
 672                 }
 673                 /*
 674                  * touch the last byte in case it straddles a page.
 675                  */
 676                 p--;
 677                 if (fubyte(p) == -1)
 678                         return;
 679                 iov++;
 680         }
 681 }
 682
 683 /*
 684  * Write the bytes to a file.
 685  *
 686  *      IN:     vp      - vnode of file to be written to.
 687  *              uio     - structure supplying write location, range info,
 688  *                        and data buffer.
 689  *              ioflag  - IO_APPEND flag set if in append mode.
 690  *              cr      - credentials of caller.
 691  *              ct      - caller context (NFS/CIFS fem monitor only)
 692  *
 693  *      OUT:    uio     - updated offset and range.
 694  *
 695  *      RETURN: 0 if success
 696  *              error code if failure
 697  *
 698  * Timestamps:
 699  *      vp - ctime|mtime updated if byte count > 0
 700  */
 701 /* ARGSUSED */
 702 static int
 703 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 704 {
 705         znode_t         *zp = VTOZ(vp);
 706         rlim64_t        limit = MAXOFFSET_T;
 707         ssize_t         start_resid = uio->uio_resid;
 708         ssize_t         tx_bytes;
 709         uint64_t        end_size;
 710         dmu_tx_t        *tx;
 711         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
 712         zilog_t         *zilog;
 713         offset_t        woff;
 714         ssize_t         n, nbytes;
 715         rl_t            *rl;
 716         int             max_blksz = zfsvfs->z_max_blksz;
 717         uint64_t        pflags;
 718         int             error;
 719         arc_buf_t       *abuf;
 720
 721         /*
 722          * Fasttrack empty write
 723          */
 724         n = start_resid;
 725         if (n == 0)
 726                 return (0);
 727
 728         if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 729                 limit = MAXOFFSET_T;
 730
 731         ZFS_ENTER(zfsvfs);
 732         ZFS_VERIFY_ZP(zp);
 733
 734         /*
 735          * If immutable or not appending then return EPERM
 736          */
 737         pflags = zp->z_phys->zp_flags;
 738         if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
 739             ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
 740             (uio->uio_loffset < zp->z_phys->zp_size))) {
 741                 ZFS_EXIT(zfsvfs);
 742                 return (EPERM);
 743         }
 744
 745         zilog = zfsvfs->z_log;
 746
 747         /*
 748          * Pre-fault the pages to ensure slow (eg NFS) pages
 749          * don't hold up txg.
 750          */
 751         zfs_prefault_write(n, uio);
 752
 753         /*
 754          * If in append mode, set the io offset pointer to eof.
 755          */
 756         if (ioflag & IO_APPEND) {
 757                 /*
 758                  * Range lock for a file append:
 759                  * The value for the start of range will be determined by
 760                  * zfs_range_lock() (to guarantee append semantics).
 761                  * If this write will cause the block size to increase,
 762                  * zfs_range_lock() will lock the entire file, so we must
 763                  * later reduce the range after we grow the block size.
 764                  */
 765                 rl = zfs_range_lock(zp, 0, n, RL_APPEND);
 766                 if (rl->r_len == UINT64_MAX) {
 767                         /* overlocked, zp_size can't change */
 768                         woff = uio->uio_loffset = zp->z_phys->zp_size;
 769                 } else {
 770                         woff = uio->uio_loffset = rl->r_off;
 771                 }
 772         } else {
 773                 woff = uio->uio_loffset;
 774                 /*
 775                  * Validate file offset
 776                  */
 777                 if (woff < 0) {
 778                         ZFS_EXIT(zfsvfs);
 779                         return (EINVAL);
 780                 }
 781
 782                 /*
 783                  * If we need to grow the block size then zfs_range_lock()
 784                  * will lock a wider range than we request here.
 785                  * Later after growing the block size we reduce the range.
 786                  */
 787                 rl = zfs_range_lock(zp, woff, n, RL_WRITER);
 788         }
 789
 790         if (woff >= limit) {
 791                 zfs_range_unlock(rl);
 792                 ZFS_EXIT(zfsvfs);
 793                 return (EFBIG);
 794         }
 795
 796         if ((woff + n) > limit || woff > (limit - n))
 797                 n = limit - woff;
 798
 799         /*
 800          * Check for mandatory locks
 801          */
 802         if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
 803             (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
 804                 zfs_range_unlock(rl);
 805                 ZFS_EXIT(zfsvfs);
 806                 return (error);
 807         }
 808         end_size = MAX(zp->z_phys->zp_size, woff + n);
 809
 810         /*
 811          * Write the file in reasonable size chunks.  Each chunk is written
 812          * in a separate transaction; this keeps the intent log records small
 813          * and allows us to do more fine-grained space accounting.
 814          */
 815         while (n > 0) {
 816                 abuf = NULL;
 817                 woff = uio->uio_loffset;
 818
 819 again:
 820                 if (zfs_usergroup_overquota(zfsvfs,
 821                     B_FALSE, zp->z_phys->zp_uid) ||
 822                     zfs_usergroup_overquota(zfsvfs,
 823                     B_TRUE, zp->z_phys->zp_gid)) {
 824                         if (abuf != NULL)
 825                                 dmu_return_arcbuf(abuf);
 826                         error = EDQUOT;
 827                         break;
 828                 }
 829
 830                 /*
 831                  * If dmu_assign_arcbuf() is expected to execute with minimum
 832                  * overhead loan an arc buffer and copy user data to it before
 833                  * we enter a txg.  This avoids holding a txg forever while we
 834                  * pagefault on a hanging NFS server mapping.
 835                  */
 836                 if (abuf == NULL && n >= max_blksz &&
 837                     woff >= zp->z_phys->zp_size &&
 838                     P2PHASE(woff, max_blksz) == 0 &&
 839                     zp->z_blksz == max_blksz) {
 840                         size_t cbytes;
 841
 842                         abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz);
 843                         ASSERT(abuf != NULL);
 844                         ASSERT(arc_buf_size(abuf) == max_blksz);
 845                         if (error = uiocopy(abuf->b_data, max_blksz,
 846                             UIO_WRITE, uio, &cbytes)) {
 847                                 dmu_return_arcbuf(abuf);
 848                                 break;
 849                         }
 850                         ASSERT(cbytes == max_blksz);
 851                 }
 852
 853                 /*
 854                  * Start a transaction.
 855                  */
 856                 tx = dmu_tx_create(zfsvfs->z_os);
 857                 dmu_tx_hold_bonus(tx, zp->z_id);
 858                 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 859                 error = dmu_tx_assign(tx, TXG_NOWAIT);
 860                 if (error) {
 861                         if (error == ERESTART) {
 862                                 dmu_tx_wait(tx);
 863                                 dmu_tx_abort(tx);
 864                                 goto again;
 865                         }
 866                         dmu_tx_abort(tx);
 867                         if (abuf != NULL)
 868                                 dmu_return_arcbuf(abuf);
 869                         break;
 870                 }
 871
 872                 /*
 873                  * If zfs_range_lock() over-locked we grow the blocksize
 874                  * and then reduce the lock range.  This will only happen
 875                  * on the first iteration since zfs_range_reduce() will
 876                  * shrink down r_len to the appropriate size.
 877                  */
 878                 if (rl->r_len == UINT64_MAX) {
 879                         uint64_t new_blksz;
 880
 881                         if (zp->z_blksz > max_blksz) {
 882                                 ASSERT(!ISP2(zp->z_blksz));
 883                                 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
 884                         } else {
 885                                 new_blksz = MIN(end_size, max_blksz);
 886                         }
 887                         zfs_grow_blocksize(zp, new_blksz, tx);
 888                         zfs_range_reduce(rl, woff, n);
 889                 }
 890
 891                 /*
 892                  * XXX - should we really limit each write to z_max_blksz?
 893                  * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
 894                  */
 895                 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
 896
 897                 if (woff + nbytes > zp->z_phys->zp_size)
 898                         vnode_pager_setsize(vp, woff + nbytes);
 899
 900                 if (abuf == NULL) {
 901                         tx_bytes = uio->uio_resid;
 902                         error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio,
 903                             nbytes, tx);
 904                         tx_bytes -= uio->uio_resid;
 905                 } else {
 906                         tx_bytes = nbytes;
 907                         ASSERT(tx_bytes == max_blksz);
 908                         dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
 909                         ASSERT(tx_bytes <= uio->uio_resid);
 910                         uioskip(uio, tx_bytes);
 911                 }
 912
 913                 /*
 914                  * XXXPJD: There are some cases (triggered by fsx) where
 915                  *         vn_has_cached_data(vp) returns false when it should
 916                  *         return true. This should be investigated.
 917                  */
 918 #if 0
 919                 if (tx_bytes && vn_has_cached_data(vp))
 920 #else
 921                 if (tx_bytes && vp->v_object != NULL)
 922 #endif
 923                 {
 924                         update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
 925                             zp->z_id, uio->uio_segflg, tx);
 926                 }
 927
 928                 /*
 929                  * If we made no progress, we're done.  If we made even
 930                  * partial progress, update the znode and ZIL accordingly.
 931                  */
 932                 if (tx_bytes == 0) {
 933                         dmu_tx_commit(tx);
 934                         ASSERT(error != 0);
 935                         break;
 936                 }
 937
 938                 /*
  939                  * Clear Set-UID/Set-GID bits on successful write if not
  940                  * privileged and at least one of the execute bits is set.
  941                  *
  942                  * It would be nice to do this after all writes have
 943                  * been done, but that would still expose the ISUID/ISGID
 944                  * to another app after the partial write is committed.
 945                  *
 946                  * Note: we don't call zfs_fuid_map_id() here because
 947                  * user 0 is not an ephemeral uid.
 948                  */
 949                 mutex_enter(&zp->z_acl_lock);
 950                 if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
 951                     (S_IXUSR >> 6))) != 0 &&
 952                     (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
 953                     secpolicy_vnode_setid_retain(vp, cr,
 954                     (zp->z_phys->zp_mode & S_ISUID) != 0 &&
 955                     zp->z_phys->zp_uid == 0) != 0) {
 956                         zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
 957                 }
 958                 mutex_exit(&zp->z_acl_lock);
 959
 960                 /*
 961                  * Update time stamp.  NOTE: This marks the bonus buffer as
 962                  * dirty, so we don't have to do it again for zp_size.
 963                  */
 964                 zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
 965
 966                 /*
 967                  * Update the file size (zp_size) if it has changed;
 968                  * account for possible concurrent updates.
 969                  */
 970                 while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
 971                         (void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
 972                             uio->uio_loffset);
 973                 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
 974                 dmu_tx_commit(tx);
 975
 976                 if (error != 0)
 977                         break;
 978                 ASSERT(tx_bytes == nbytes);
 979                 n -= nbytes;
 980         }
 981
 982         zfs_range_unlock(rl);
 983
 984         /*
 985          * If we're in replay mode, or we made no progress, return error.
 986          * Otherwise, it's at least a partial write, so it's successful.
 987          */
 988         if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
 989                 ZFS_EXIT(zfsvfs);
 990                 return (error);
 991         }
 992
 993         if (ioflag & (FSYNC | FDSYNC))
 994                 zil_commit(zilog, zp->z_last_itx, zp->z_id);
 995
 996         ZFS_EXIT(zfsvfs);
 997         return (0);
 998 }
 999
1000 void
1001 zfs_get_done(dmu_buf_t *db, void *vzgd)
1002 {
1003         zgd_t *zgd = (zgd_t *)vzgd;
1004         rl_t *rl = zgd->zgd_rl;
1005         vnode_t *vp = ZTOV(rl->r_zp);
1006         objset_t *os = rl->r_zp->z_zfsvfs->z_os;
1007         int vfslocked;
1008
1009         vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
1010         dmu_buf_rele(db, vzgd);
1011         zfs_range_unlock(rl);
1012         /*
1013          * Release the vnode asynchronously as we currently have the
1014          * txg stopped from syncing.
1015          */
1016         VN_RELE_ASYNC(vp, dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1017         zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1018         kmem_free(zgd, sizeof (zgd_t));
1019         VFS_UNLOCK_GIANT(vfslocked);
1020 }
1021
/*
 * Get data to generate a TX_WRITE intent log record.
 *
 *      IN:     arg     - the zfsvfs_t of the file system being logged.
 *              lr      - write log record; supplies the object id, offset,
 *                        length and txg, and for indirect writes receives
 *                        lr_blkoff and lr_blkptr (and may be retyped to
 *                        TX_WRITE2).
 *              buf     - destination for the data of an immediate write,
 *                        or NULL to request an indirect write.
 *              zio     - zio handed to dmu_sync() for indirect writes.
 *
 *      RETURN: 0 on success; this includes the case where dmu_sync()
 *                returned EINPROGRESS and zfs_get_done() will do the
 *                remaining cleanup.
 *              ENOENT if the znode cannot be obtained, has been unlinked,
 *                or the offset is at or beyond the current file size.
 *              otherwise the error returned by dmu_sync().
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
        zfsvfs_t *zfsvfs = arg;
        objset_t *os = zfsvfs->z_os;
        znode_t *zp;
        uint64_t off = lr->lr_offset;
        dmu_buf_t *db;
        rl_t *rl;
        zgd_t *zgd;
        int dlen = lr->lr_length;               /* length of user data */
        int error = 0;

        ASSERT(zio);
        ASSERT(dlen != 0);

        /*
         * Nothing to do if the file has been removed
         */
        if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
                return (ENOENT);
        if (zp->z_unlinked) {
                /*
                 * Release the vnode asynchronously as we currently have the
                 * txg stopped from syncing.
                 */
                VN_RELE_ASYNC(ZTOV(zp),
                    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
                return (ENOENT);
        }

        /*
         * Write records come in two flavors: immediate and indirect.
         * For small writes it's cheaper to store the data with the
         * log record (immediate); for large writes it's cheaper to
         * sync the data and get a pointer to it (indirect) so that
         * we don't have to write the data twice.
         */
        if (buf != NULL) { /* immediate write */
                rl = zfs_range_lock(zp, off, dlen, RL_READER);
                /* test for truncation needs to be done while range locked */
                if (off >= zp->z_phys->zp_size) {
                        error = ENOENT;
                        goto out;
                }
                /* Copy the data straight into the caller's buffer. */
                VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf,
                    DMU_READ_NO_PREFETCH));
        } else { /* indirect write */
                uint64_t boff; /* block starting offset */

                /*
                 * Have to lock the whole block to ensure when it's
                 * written out and its checksum is being calculated
                 * that no one can change the data. We need to re-check
                 * blocksize after we get the lock in case it's changed!
                 */
                for (;;) {
                        if (ISP2(zp->z_blksz)) {
                                boff = P2ALIGN_TYPED(off, zp->z_blksz,
                                    uint64_t);
                        } else {
                                /* Non-power-of-2 block size: lock from 0. */
                                boff = 0;
                        }
                        dlen = zp->z_blksz;
                        rl = zfs_range_lock(zp, boff, dlen, RL_READER);
                        /* Block size unchanged while we waited: done. */
                        if (zp->z_blksz == dlen)
                                break;
                        zfs_range_unlock(rl);
                }
                /* test for truncation needs to be done while range locked */
                if (off >= zp->z_phys->zp_size) {
                        error = ENOENT;
                        goto out;
                }
                /*
                 * Bundle what zfs_get_done() needs to finish the job:
                 * the range lock, the log and the record's block pointer.
                 */
                zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
                zgd->zgd_rl = rl;
                zgd->zgd_zilog = zfsvfs->z_log;
                zgd->zgd_bp = &lr->lr_blkptr;
                /* The zgd doubles as the hold tag for the dbuf. */
                VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
                ASSERT(boff == db->db_offset);
                lr->lr_blkoff = off - boff;
                error = dmu_sync(zio, db, &lr->lr_blkptr,
                    lr->lr_common.lrc_txg, zfs_get_done, zgd);
                ASSERT((error && error != EINPROGRESS) ||
                    lr->lr_length <= zp->z_blksz);
                if (error == 0) {
                        /*
                         * dmu_sync() can compress a block of zeros to a null
                         * blkptr but the block size still needs to be passed
                         * through to replay.
                         */
                        BP_SET_LSIZE(&lr->lr_blkptr, db->db_size);
                        zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
                }

                /*
                 * If we get EINPROGRESS, then we need to wait for a
                 * write IO initiated by dmu_sync() to complete before
                 * we can release this dbuf.  We will finish everything
                 * up in the zfs_get_done() callback.
                 */
                if (error == EINPROGRESS) {
                        /*
                         * Deliberately return without releasing the dbuf,
                         * the range lock, the vnode or the zgd: the
                         * callback releases all of them.
                         */
                        return (0);
                } else if (error == EALREADY) {
                        /* Retype the record as TX_WRITE2 and report success. */
                        lr->lr_common.lrc_txtype = TX_WRITE2;
                        error = 0;
                }
                /* No callback will run on this path; clean up here. */
                dmu_buf_rele(db, zgd);
                kmem_free(zgd, sizeof (zgd_t));
        }
out:
        zfs_range_unlock(rl);
        /*
         * Release the vnode asynchronously as we currently have the
         * txg stopped from syncing.
         */
        VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
        return (error);
}
1144
1145 /*ARGSUSED*/
1146 static int
1147 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1148     caller_context_t *ct)
1149 {
1150         znode_t *zp = VTOZ(vp);
1151         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1152         int error;
1153
1154         ZFS_ENTER(zfsvfs);
1155         ZFS_VERIFY_ZP(zp);
1156
1157         if (flag & V_ACE_MASK)
1158                 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1159         else
1160                 error = zfs_zaccess_rwx(zp, mode, flag, cr);
1161
1162         ZFS_EXIT(zfsvfs);
1163         return (error);
1164 }
1165
1166 /*
1167  * If vnode is for a device return a specfs vnode instead.
1168  */
1169 static int
1170 specvp_check(vnode_t **vpp, cred_t *cr)
1171 {
1172         int error = 0;
1173
1174         if (IS_DEVVP(*vpp)) {
1175                 struct vnode *svp;
1176
1177                 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1178                 VN_RELE(*vpp);
1179                 if (svp == NULL)
1180                         error = ENOSYS;
1181                 *vpp = svp;
1182         }
1183         return (error);
1184 }
1185
1186
/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 *      IN:     dvp     - vnode of directory to search.
 *              nm      - name of entry to lookup.
 *              cnp     - component name state from the caller; its cn_flags
 *                        (ISLASTCN, ISDOTDOT, MAKEENTRY) and cn_lkflags are
 *                        consulted and SAVENAME may be set in cn_flags.
 *              nameiop - lookup operation (CREATE, RENAME and DELETE are
 *                        treated specially on the last component).
 *              cr      - credentials of caller.
 *              td      - calling thread [UNUSED].
 *              flags   - LOOKUP_XATTR set if looking for an attribute.
 *
 *      OUT:    vpp     - vnode of located entry, NULL if not found.
 *
 *      RETURN: 0 if success
 *              EJUSTRETURN if the last component does not exist and the
 *                operation is CREATE or RENAME
 *              error code if failure
 *
 * Timestamps:
 *      NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
    int nameiop, cred_t *cr, kthread_t *td, int flags)
{
        znode_t *zdp = VTOZ(dvp);
        zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
        int     error = 0;
        /* Always NULL in this function; passed through to zfs_dirlook(). */
        int *direntflags = NULL;
        void *realpnp = NULL;

        /*
         * fast path: runs without ZFS_ENTER() and returns directly, so
         * none of these early returns need a ZFS_EXIT().
         */
        if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {

                if (dvp->v_type != VDIR) {
                        return (ENOTDIR);
                } else if (zdp->z_dbuf == NULL) {
                        return (EIO);
                }

                /* An empty name or "." refers to the directory itself. */
                if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
                        error = zfs_fastaccesschk_execute(zdp, cr);
                        if (!error) {
                                *vpp = dvp;
                                VN_HOLD(*vpp);
                                return (0);
                        }
                        return (error);
                } else {
                        vnode_t *tvp = dnlc_lookup(dvp, nm);

                        if (tvp) {
                                error = zfs_fastaccesschk_execute(zdp, cr);
                                if (error) {
                                        VN_RELE(tvp);
                                        return (error);
                                }
                                /* Cached negative entry. */
                                if (tvp == DNLC_NO_VNODE) {
                                        VN_RELE(tvp);
                                        return (ENOENT);
                                } else {
                                        *vpp = tvp;
                                        return (specvp_check(vpp, cr));
                                }
                        }
                }
        }

        DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zdp);

        *vpp = NULL;

        if (flags & LOOKUP_XATTR) {
#ifdef TODO
                /*
                 * If the xattr property is off, refuse the lookup request.
                 */
                if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
                        ZFS_EXIT(zfsvfs);
                        return (EINVAL);
                }
#endif

                /*
                 * We don't allow recursive attributes..
                 * Maybe someday we will.
                 */
                if (zdp->z_phys->zp_flags & ZFS_XATTR) {
                        ZFS_EXIT(zfsvfs);
                        return (EINVAL);
                }

                if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
                        ZFS_EXIT(zfsvfs);
                        return (error);
                }

                /*
                 * Do we have permission to get into attribute directory?
                 */

                if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
                    B_FALSE, cr)) {
                        VN_RELE(*vpp);
                        *vpp = NULL;
                }

                /*
                 * The xattr path returns here; the error translation,
                 * vnode locking and name cache steps below are skipped.
                 */
                ZFS_EXIT(zfsvfs);
                return (error);
        }

        if (dvp->v_type != VDIR) {
                ZFS_EXIT(zfsvfs);
                return (ENOTDIR);
        }

        /*
         * Check accessibility of directory.
         */

        if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
                ZFS_EXIT(zfsvfs);
                return (error);
        }

        /* On a UTF-8-only file system reject names that fail validation. */
        if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
            NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
                ZFS_EXIT(zfsvfs);
                return (EILSEQ);
        }

        error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
        if (error == 0)
                error = specvp_check(vpp, cr);

        /* Translate errors and add SAVENAME when needed. */
        if (cnp->cn_flags & ISLASTCN) {
                switch (nameiop) {
                case CREATE:
                case RENAME:
                        /* A missing target is not an error for these ops. */
                        if (error == ENOENT) {
                                error = EJUSTRETURN;
                                cnp->cn_flags |= SAVENAME;
                                break;
                        }
                        /* FALLTHROUGH */
                case DELETE:
                        if (error == 0)
                                cnp->cn_flags |= SAVENAME;
                        break;
                }
        }
        /* Lock the result unless the name was "." (result is dvp itself). */
        if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
                int ltype = 0;

                /*
                 * For ".." the directory lock is dropped before the result
                 * is locked and re-taken afterwards with the saved lock
                 * type.  NOTE(review): presumably this avoids a
                 * child-before-parent lock order reversal -- confirm.
                 */
                if (cnp->cn_flags & ISDOTDOT) {
                        ltype = VOP_ISLOCKED(dvp);
                        VOP_UNLOCK(dvp, 0);
                }
                ZFS_EXIT(zfsvfs);
                error = vn_lock(*vpp, cnp->cn_lkflags);
                if (cnp->cn_flags & ISDOTDOT)
                        vn_lock(dvp, ltype | LK_RETRY);
                if (error != 0) {
                        VN_RELE(*vpp);
                        *vpp = NULL;
                        return (error);
                }
        } else {
                ZFS_EXIT(zfsvfs);
        }

#ifdef FREEBSD_NAMECACHE
        /*
         * Insert name into cache (as non-existent) if appropriate.
         */
        if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
                cache_enter(dvp, *vpp, cnp);
        /*
         * Insert name into cache if appropriate.
         */
        if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
                if (!(cnp->cn_flags & ISLASTCN) ||
                    (nameiop != DELETE && nameiop != RENAME)) {
                        cache_enter(dvp, *vpp, cnp);
                }
        }
#endif

        return (error);
}
1383
1384 /*
1385  * Attempt to create a new entry in a directory.  If the entry
1386  * already exists, truncate the file if permissible, else return
1387  * an error.  Return the vp of the created or trunc'd file.
1388  *
1389  *      IN:     dvp     - vnode of directory to put new file entry in.
1390  *              name    - name of new file entry.
1391  *              vap     - attributes of new file.
1392  *              excl    - flag indicating exclusive or non-exclusive mode.
1393  *              mode    - mode to open file with.
1394  *              cr      - credentials of caller.
1395  *              flag    - large file flag [UNUSED].
1396  *              ct      - caller context
1397  *              vsecp   - ACL to be set
1398  *
1399  *      OUT:    vpp     - vnode of created or trunc'd entry.
1400  *
1401  *      RETURN: 0 if success
1402  *              error code if failure
1403  *
1404  * Timestamps:
1405  *      dvp - ctime|mtime updated if new entry created
1406  *       vp - ctime|mtime always, atime if new
1407  */
1408
1409 /* ARGSUSED */
1410 static int
1411 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1412     vnode_t **vpp, cred_t *cr, kthread_t *td)
1413 {
1414         znode_t         *zp, *dzp = VTOZ(dvp);
1415         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1416         zilog_t         *zilog;
1417         objset_t        *os;
1418         zfs_dirlock_t   *dl;
1419         dmu_tx_t        *tx;
1420         int             error;
1421         ksid_t          *ksid;
1422         uid_t           uid;
1423         gid_t           gid = crgetgid(cr);
1424         zfs_acl_ids_t   acl_ids;
1425         boolean_t       fuid_dirtied;
1426         void            *vsecp = NULL;
1427         int             flag = 0;
1428
1429         /*
1430          * If we have an ephemeral id, ACL, or XVATTR then
1431          * make sure file system is at proper version
1432          */
1433
1434         ksid = crgetsid(cr, KSID_OWNER);
1435         if (ksid)
1436                 uid = ksid_getid(ksid);
1437         else
1438                 uid = crgetuid(cr);
1439         if (zfsvfs->z_use_fuids == B_FALSE &&
1440             (vsecp || (vap->va_mask & AT_XVATTR) ||
1441             IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))))
1442                 return (EINVAL);
1443
1444         ZFS_ENTER(zfsvfs);
1445         ZFS_VERIFY_ZP(dzp);
1446         os = zfsvfs->z_os;
1447         zilog = zfsvfs->z_log;
1448
1449         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1450             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1451                 ZFS_EXIT(zfsvfs);
1452                 return (EILSEQ);
1453         }
1454
1455         if (vap->va_mask & AT_XVATTR) {
1456                 if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1457                     crgetuid(cr), cr, vap->va_type)) != 0) {
1458                         ZFS_EXIT(zfsvfs);
1459                         return (error);
1460                 }
1461         }
1462 top:
1463         *vpp = NULL;
1464
1465         if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1466                 vap->va_mode &= ~S_ISVTX;
1467
1468         if (*name == '\0') {
1469                 /*
1470                  * Null component name refers to the directory itself.
1471                  */
1472                 VN_HOLD(dvp);
1473                 zp = dzp;
1474                 dl = NULL;
1475                 error = 0;
1476         } else {
1477                 /* possible VN_HOLD(zp) */
1478                 int zflg = 0;
1479
1480                 if (flag & FIGNORECASE)
1481                         zflg |= ZCILOOK;
1482
1483                 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1484                     NULL, NULL);
1485                 if (error) {
1486                         if (strcmp(name, "..") == 0)
1487                                 error = EISDIR;
1488                         ZFS_EXIT(zfsvfs);
1489                         return (error);
1490                 }
1491         }
1492         if (zp == NULL) {
1493                 uint64_t txtype;
1494
1495                 /*
1496                  * Create a new file object and update the directory
1497                  * to reference it.
1498                  */
1499                 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1500                         goto out;
1501                 }
1502
1503                 /*
1504                  * We only support the creation of regular files in
1505                  * extended attribute directories.
1506                  */
1507                 if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
1508                     (vap->va_type != VREG)) {
1509                         error = EINVAL;
1510                         goto out;
1511                 }
1512
1513
1514                 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
1515                     &acl_ids)) != 0)
1516                         goto out;
1517                 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1518                         zfs_acl_ids_free(&acl_ids);
1519                         error = EDQUOT;
1520                         goto out;
1521                 }
1522
1523                 tx = dmu_tx_create(os);
1524                 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1525                 fuid_dirtied = zfsvfs->z_fuid_dirty;
1526                 if (fuid_dirtied)
1527                         zfs_fuid_txhold(zfsvfs, tx);
1528                 dmu_tx_hold_bonus(tx, dzp->z_id);
1529                 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1530                 if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1531                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1532                             0, SPA_MAXBLOCKSIZE);
1533                 }
1534                 error = dmu_tx_assign(tx, TXG_NOWAIT);
1535                 if (error) {
1536                         zfs_acl_ids_free(&acl_ids);
1537                         zfs_dirent_unlock(dl);
1538                         if (error == ERESTART) {
1539                                 dmu_tx_wait(tx);
1540                                 dmu_tx_abort(tx);
1541                                 goto top;
1542                         }
1543                         dmu_tx_abort(tx);
1544                         ZFS_EXIT(zfsvfs);
1545                         return (error);
1546                 }
1547                 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
1548
1549                 if (fuid_dirtied)
1550                         zfs_fuid_sync(zfsvfs, tx);
1551
1552                 (void) zfs_link_create(dl, zp, tx, ZNEW);
1553
1554                 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1555                 if (flag & FIGNORECASE)
1556                         txtype |= TX_CI;
1557                 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1558                     vsecp, acl_ids.z_fuidp, vap);
1559                 zfs_acl_ids_free(&acl_ids);
1560                 dmu_tx_commit(tx);
1561         } else {
1562                 int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1563
1564                 /*
1565                  * A directory entry already exists for this name.
1566                  */
1567                 /*
1568                  * Can't truncate an existing file if in exclusive mode.
1569                  */
1570                 if (excl == EXCL) {
1571                         error = EEXIST;
1572                         goto out;
1573                 }
1574                 /*
1575                  * Can't open a directory for writing.
1576                  */
1577                 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1578                         error = EISDIR;
1579                         goto out;
1580                 }
1581                 /*
1582                  * Verify requested access to file.
1583                  */
1584                 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1585                         goto out;
1586                 }
1587
1588                 mutex_enter(&dzp->z_lock);
1589                 dzp->z_seq++;
1590                 mutex_exit(&dzp->z_lock);
1591
1592                 /*
1593                  * Truncate regular files if requested.
1594                  */
1595                 if ((ZTOV(zp)->v_type == VREG) &&
1596                     (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1597                         /* we can't hold any locks when calling zfs_freesp() */
1598                         zfs_dirent_unlock(dl);
1599                         dl = NULL;
1600                         error = zfs_freesp(zp, 0, 0, mode, TRUE);
1601                         if (error == 0) {
1602                                 vnevent_create(ZTOV(zp), ct);
1603                         }
1604                 }
1605         }
1606 out:
1607         if (dl)
1608                 zfs_dirent_unlock(dl);
1609
1610         if (error) {
1611                 if (zp)
1612                         VN_RELE(ZTOV(zp));
1613         } else {
1614                 *vpp = ZTOV(zp);
1615                 error = specvp_check(vpp, cr);
1616         }
1617
1618         ZFS_EXIT(zfsvfs);
1619         return (error);
1620 }
1621
1622 /*
1623  * Remove an entry from a directory.
1624  *
1625  *      IN:     dvp     - vnode of directory to remove entry from.
1626  *              name    - name of entry to remove.
1627  *              cr      - credentials of caller.
1628  *              ct      - caller context
1629  *              flags   - case flags
1630  *
1631  *      RETURN: 0 if success
1632  *              error code if failure
1633  *
1634  * Timestamps:
1635  *      dvp - ctime|mtime
1636  *       vp - ctime (if nlink > 0)
      *
      * Notes:
      *      Directories are rejected with EPERM; rmdir must be used for them.
      *
      *      The "delete the znode right now" path is disabled in this
      *      version: may_delete_now is forced to FALSE and the only block
      *      that could set delete_now is guarded by "if (0 && ...)".  When
      *      zfs_link_destroy() reports the znode as unlinked it is therefore
      *      always handed to zfs_unlinked_add().
      *
      *      If the transaction cannot be assigned (ERESTART), everything
      *      from the directory entry lock onwards is redone from "top".
1637  */
1638 /*ARGSUSED*/
1639 static int
1640 zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1641     int flags)
1642 {
1643         znode_t         *zp, *dzp = VTOZ(dvp);
1644         znode_t         *xzp = NULL;
1645         vnode_t         *vp;
1646         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1647         zilog_t         *zilog;
1648         uint64_t        acl_obj, xattr_obj;
1649         zfs_dirlock_t   *dl;
1650         dmu_tx_t        *tx;
1651         boolean_t       may_delete_now, delete_now = FALSE;
1652         boolean_t       unlinked, toobig = FALSE;
1653         uint64_t        txtype;
1654         pathname_t      *realnmp = NULL;
1655         pathname_t      realnm;
1656         int             error;
1657         int             zflg = ZEXISTS;
1658
1659         ZFS_ENTER(zfsvfs);
1660         ZFS_VERIFY_ZP(dzp);
1661         zilog = zfsvfs->z_log;
1662
             /*
              * Case-insensitive request: add ZCILOOK to the lookup flags and
              * pass a pathname buffer to zfs_dirent_lock(), presumably to
              * receive the name as actually stored (its pn_buf is what gets
              * purged from the DNLC below).  realnmp stays NULL otherwise.
              */
1663         if (flags & FIGNORECASE) {
1664                 zflg |= ZCILOOK;
1665                 pn_alloc(&realnm);
1666                 realnmp = &realnm;
1667         }
1668
1669 top:
1670         /*
1671          * Attempt to lock directory; fail if entry doesn't exist.
1672          */
1673         if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1674             NULL, realnmp)) {
1675                 if (realnmp)
1676                         pn_free(realnmp);
1677                 ZFS_EXIT(zfsvfs);
1678                 return (error);
1679         }
1680
1681         vp = ZTOV(zp);
             /*
              * NOTE(review): the VN_RELE(vp) on every exit path below implies
              * that zfs_dirent_lock() returned zp with a hold -- confirm.
              */
1682
1683         if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1684                 goto out;
1685         }
1686
1687         /*
1688          * Need to use rmdir for removing directories.
1689          */
1690         if (vp->v_type == VDIR) {
1691                 error = EPERM;
1692                 goto out;
1693         }
1694
             /*
              * Post the remove event and drop the name from the DNLC, using
              * the name in realnmp for case-insensitive requests.  Both are
              * repeated if the transaction is retried from "top".
              */
1695         vnevent_remove(vp, dvp, name, ct);
1696
1697         if (realnmp)
1698                 dnlc_remove(dvp, realnmp->pn_buf);
1699         else
1700                 dnlc_remove(dvp, name);
1701
             /*
              * Immediate deletion is switched off here, so the holds below
              * that are conditional on may_delete_now are never taken.
              */
1702         may_delete_now = FALSE;
1703
1704         /*
1705          * We may delete the znode now, or we may put it in the unlinked set;
1706          * it depends on whether we're the last link, and on whether there are
1707          * other holds on the vnode.  So we dmu_tx_hold() the right things to
1708          * allow for either case.
1709          */
1710         tx = dmu_tx_create(zfsvfs->z_os);
1711         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1712         dmu_tx_hold_bonus(tx, zp->z_id);
1713         if (may_delete_now) {
1714                 toobig =
1715                     zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1716                 /* if the file is too big, only hold_free a token amount */
1717                 dmu_tx_hold_free(tx, zp->z_id, 0,
1718                     (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1719         }
1720
1721         /* are there any extended attributes? */
1722         if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
1723                 /* XXX - do we need this if we are deleting? */
1724                 dmu_tx_hold_bonus(tx, xattr_obj);
1725         }
1726
1727         /* are there any additional acls */
1728         if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
1729             may_delete_now)
1730                 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1731
1732         /* charge as an update -- would be nice not to charge at all */
1733         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1734
1735         error = dmu_tx_assign(tx, TXG_NOWAIT);
1736         if (error) {
                     /*
                      * Assignment failed.  Drop the directory entry lock and
                      * the vnode hold first; for ERESTART call dmu_tx_wait(),
                      * abort this tx and redo the lookup from "top" (realnmp
                      * is kept for the retry).  Any other error is returned.
                      */
1737                 zfs_dirent_unlock(dl);
1738                 VN_RELE(vp);
1739                 if (error == ERESTART) {
1740                         dmu_tx_wait(tx);
1741                         dmu_tx_abort(tx);
1742                         goto top;
1743                 }
1744                 if (realnmp)
1745                         pn_free(realnmp);
1746                 dmu_tx_abort(tx);
1747                 ZFS_EXIT(zfsvfs);
1748                 return (error);
1749         }
1750
1751         /*
1752          * Remove the directory entry.
1753          */
1754         error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1755
1756         if (error) {
                     /* the assigned tx must still be committed before bailing out */
1757                 dmu_tx_commit(tx);
1758                 goto out;
1759         }
1760
             /*
              * Disabled: the condition is constant false, so delete_now keeps
              * its initial FALSE value.
              */
1761         if (0 && unlinked) {
1762                 VI_LOCK(vp);
1763                 delete_now = may_delete_now && !toobig &&
1764                     vp->v_count == 1 && !vn_has_cached_data(vp) &&
1765                     zp->z_phys->zp_xattr == xattr_obj &&
1766                     zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
1767                 VI_UNLOCK(vp);
1768         }
1769
             /*
              * Not reached while delete_now stays FALSE; the "else if" arm is
              * the one that runs when the znode has become unlinked.
              */
1770         if (delete_now) {
1771                 if (zp->z_phys->zp_xattr) {
1772                         error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
1773                         ASSERT3U(error, ==, 0);
1774                         ASSERT3U(xzp->z_phys->zp_links, ==, 2);
1775                         dmu_buf_will_dirty(xzp->z_dbuf, tx);
1776                         mutex_enter(&xzp->z_lock);
1777                         xzp->z_unlinked = 1;
1778                         xzp->z_phys->zp_links = 0;
1779                         mutex_exit(&xzp->z_lock);
1780                         zfs_unlinked_add(xzp, tx);
1781                         zp->z_phys->zp_xattr = 0; /* probably unnecessary */
1782                 }
1783                 mutex_enter(&zp->z_lock);
1784                 VI_LOCK(vp);
1785                 vp->v_count--;
1786                 ASSERT3U(vp->v_count, ==, 0);
1787                 VI_UNLOCK(vp);
1788                 mutex_exit(&zp->z_lock);
1789                 zfs_znode_delete(zp, tx);
1790         } else if (unlinked) {
1791                 zfs_unlinked_add(zp, tx);
1792         }
1793
             /* log the removal, marking it case-insensitive when requested */
1794         txtype = TX_REMOVE;
1795         if (flags & FIGNORECASE)
1796                 txtype |= TX_CI;
1797         zfs_log_remove(zilog, tx, txtype, dzp, name);
1798
1799         dmu_tx_commit(tx);
1800 out:
             /*
              * Common exit.  Reached either before a tx was created or after
              * it has been committed, with the directory entry lock still held.
              */
1801         if (realnmp)
1802                 pn_free(realnmp);
1803
1804         zfs_dirent_unlock(dl);
1805
             /* delete_now is always FALSE in this version, so vp is always released */
1806         if (!delete_now) {
1807                 VN_RELE(vp);
1808         } else if (xzp) {
1809                 /* this rele is delayed to prevent nesting transactions */
1810                 VN_RELE(ZTOV(xzp));
1811         }
1812
1813         ZFS_EXIT(zfsvfs);
1814         return (error);
1815 }
1816
1817 /*
1818  * Create a new directory and insert it into dvp using the name
1819  * provided.  Return a pointer to the inserted directory.
1820  *
1821  *      IN:     dvp     - vnode of directory to add subdir to.
1822  *              dirname - name of new directory.
1823  *              vap     - attributes of new directory.
1824  *              cr      - credentials of caller.
1825  *              ct      - caller context
      *              flags   - case flags (FIGNORECASE is the only bit tested)
1826  *              vsecp   - ACL to be set
1827  *
1828  *      OUT:    vpp     - vnode of created directory.
1829  *
1830  *      RETURN: 0 if success
1831  *              error code if failure
1832  *
1833  * Timestamps:
1834  *      dvp - ctime|mtime updated
1835  *       vp - ctime|mtime|atime updated
      *
      * Errors produced directly by this function: EINVAL (file system not
      * using FUIDs but an ACL, XVATTR or ephemeral id was supplied, or the
      * parent has ZFS_XATTR set), EILSEQ (name fails u8_validate() when
      * z_utf8 is set) and EDQUOT (zfs_acl_ids_overquota()).  Other errors
      * are passed through from the helpers called below.
1836  */
1837 /*ARGSUSED*/
1838 static int
1839 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
1840     caller_context_t *ct, int flags, vsecattr_t *vsecp)
1841 {
1842         znode_t         *zp, *dzp = VTOZ(dvp);
1843         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1844         zilog_t         *zilog;
1845         zfs_dirlock_t   *dl;
1846         uint64_t        txtype;
1847         dmu_tx_t        *tx;
1848         int             error;
1849         int             zf = ZNEW;
1850         ksid_t          *ksid;
1851         uid_t           uid;
1852         gid_t           gid = crgetgid(cr);
1853         zfs_acl_ids_t   acl_ids;
1854         boolean_t       fuid_dirtied;
1855
1856         ASSERT(vap->va_type == VDIR);
1857
1858         /*
1859          * If we have an ephemeral id, ACL, or XVATTR then
1860          * make sure file system is at proper version
1861          */
1862
             /*
              * NOTE(review): uid (set here) and gid (set in its declaration)
              * are not read again anywhere in this function; the check below
              * calls crgetuid()/crgetgid() directly.
              */
1863         ksid = crgetsid(cr, KSID_OWNER);
1864         if (ksid)
1865                 uid = ksid_getid(ksid);
1866         else
1867                 uid = crgetuid(cr);
             /* this return happens before ZFS_ENTER(), so no ZFS_EXIT() is paired with it */
1868         if (zfsvfs->z_use_fuids == B_FALSE &&
1869             (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))||
1870             IS_EPHEMERAL(crgetgid(cr))))
1871                 return (EINVAL);
1872
1873         ZFS_ENTER(zfsvfs);
1874         ZFS_VERIFY_ZP(dzp);
1875         zilog = zfsvfs->z_log;
1876
             /*
              * Refuse to create a subdirectory when the parent has ZFS_XATTR
              * set (presumably an extended attribute directory).
              */
1877         if (dzp->z_phys->zp_flags & ZFS_XATTR) {
1878                 ZFS_EXIT(zfsvfs);
1879                 return (EINVAL);
1880         }
1881
             /* when z_utf8 is set, names that fail u8_validate() are rejected */
1882         if (zfsvfs->z_utf8 && u8_validate(dirname,
1883             strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1884                 ZFS_EXIT(zfsvfs);
1885                 return (EILSEQ);
1886         }
1887         if (flags & FIGNORECASE)
1888                 zf |= ZCILOOK;
1889
1890         if (vap->va_mask & AT_XVATTR)
1891                 if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1892                     crgetuid(cr), cr, vap->va_type)) != 0) {
1893                         ZFS_EXIT(zfsvfs);
1894                         return (error);
1895                 }
1896
1897         /*
1898          * First make sure the new directory doesn't exist.
1899          */
1900 top:
             /* retry point for ERESTART; *vpp is cleared on every attempt */
1901         *vpp = NULL;
1902
1903         if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1904             NULL, NULL)) {
1905                 ZFS_EXIT(zfsvfs);
1906                 return (error);
1907         }
1908
1909         if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
1910                 zfs_dirent_unlock(dl);
1911                 ZFS_EXIT(zfsvfs);
1912                 return (error);
1913         }
1914
             /*
              * From here on acl_ids is live: every exit path (and the retry)
              * must call zfs_acl_ids_free() on it.
              */
1915         if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
1916             &acl_ids)) != 0) {
1917                 zfs_dirent_unlock(dl);
1918                 ZFS_EXIT(zfsvfs);
1919                 return (error);
1920         }
1921         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1922                 zfs_acl_ids_free(&acl_ids);
1923                 zfs_dirent_unlock(dl);
1924                 ZFS_EXIT(zfsvfs);
1925                 return (EDQUOT);
1926         }
1927
1928         /*
1929          * Add a new entry to the directory.
1930          */
1931         tx = dmu_tx_create(zfsvfs->z_os);
1932         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1933         dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
             /*
              * z_fuid_dirty is sampled once; the same snapshot decides both
              * the tx hold here and the zfs_fuid_sync() after zfs_mknode().
              */
1934         fuid_dirtied = zfsvfs->z_fuid_dirty;
1935         if (fuid_dirtied)
1936                 zfs_fuid_txhold(zfsvfs, tx);
             /* an ACL larger than ZFS_ACE_SPACE needs an extra write hold */
1937         if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
1938                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1939                     0, SPA_MAXBLOCKSIZE);
1940         error = dmu_tx_assign(tx, TXG_NOWAIT);
1941         if (error) {
                     /*
                      * acl_ids and the directory entry lock are released
                      * before waiting; both are re-created after the jump
                      * back to "top".
                      */
1942                 zfs_acl_ids_free(&acl_ids);
1943                 zfs_dirent_unlock(dl);
1944                 if (error == ERESTART) {
1945                         dmu_tx_wait(tx);
1946                         dmu_tx_abort(tx);
1947                         goto top;
1948                 }
1949                 dmu_tx_abort(tx);
1950                 ZFS_EXIT(zfsvfs);
1951                 return (error);
1952         }
1953
1954         /*
1955          * Create new node.
1956          */
1957         zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
1958
1959         if (fuid_dirtied)
1960                 zfs_fuid_sync(zfsvfs, tx);
1961         /*
1962          * Now put new name in parent dir.
1963          */
             /* the result of zfs_link_create() is deliberately discarded */
1964         (void) zfs_link_create(dl, zp, tx, ZNEW);
1965
1966         *vpp = ZTOV(zp);
1967
1968         txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1969         if (flags & FIGNORECASE)
1970                 txtype |= TX_CI;
1971         zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
1972             acl_ids.z_fuidp, vap);
1973
             /* acl_ids is still read by zfs_log_create() above, so free it only now */
1974         zfs_acl_ids_free(&acl_ids);
1975         dmu_tx_commit(tx);
1976
1977         zfs_dirent_unlock(dl);
1978
1979         ZFS_EXIT(zfsvfs);
1980         return (0);
1981 }
1982
1983 /*
1984  * Remove a directory subdir entry.  If the current working
1985  * directory is the same as the subdir to be removed, the
1986  * remove will fail.
1987  *
1988  *      IN:     dvp     - vnode of directory to remove from.
1989  *              name    - name of directory to be removed.
1990  *              cwd     - vnode of current working directory.
1991  *              cr      - credentials of caller.
1992  *              ct      - caller context
1993  *              flags   - case flags
1994  *
1995  *      RETURN: 0 if success
1996  *              error code if failure
      *              (ENOTDIR if the entry is not a directory, EINVAL if it
      *              is the caller's cwd; other errors come from the helpers)
1997  *
1998  * Timestamps:
1999  *      dvp - ctime|mtime updated
2000  */
2001 /*ARGSUSED*/
2002 static int
2003 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
2004     caller_context_t *ct, int flags)
2005 {
2006         znode_t         *dzp = VTOZ(dvp);
2007         znode_t         *zp;
2008         vnode_t         *vp;
2009         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
2010         zilog_t         *zilog;
2011         zfs_dirlock_t   *dl;
2012         dmu_tx_t        *tx;
2013         int             error;
2014         int             zflg = ZEXISTS;
2015
2016         ZFS_ENTER(zfsvfs);
2017         ZFS_VERIFY_ZP(dzp);
2018         zilog = zfsvfs->z_log;
2019
2020         if (flags & FIGNORECASE)
2021                 zflg |= ZCILOOK;
2022 top:
             /* retry point for ERESTART; zp is reset on every attempt */
2023         zp = NULL;
2024
2025         /*
2026          * Attempt to lock directory; fail if entry doesn't exist.
2027          */
2028         if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
2029             NULL, NULL)) {
2030                 ZFS_EXIT(zfsvfs);
2031                 return (error);
2032         }
2033
2034         vp = ZTOV(zp);
2035
             /*
              * The three checks below all exit through "out" before
              * z_name_lock and z_parent_lock have been taken.
              */
2036         if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2037                 goto out;
2038         }
2039
2040         if (vp->v_type != VDIR) {
2041                 error = ENOTDIR;
2042                 goto out;
2043         }
2044
2045         if (vp == cwd) {
2046                 error = EINVAL;
2047                 goto out;
2048         }
2049
2050         vnevent_rmdir(vp, dvp, name, ct);
2051
2052         /*
2053          * Grab a lock on the directory to make sure that no one is
2054          * trying to add (or lookup) entries while we are removing it.
2055          */
2056         rw_enter(&zp->z_name_lock, RW_WRITER);
2057
2058         /*
2059          * Grab a lock on the parent pointer to make sure we play well
2060          * with the treewalk and directory rename code.
2061          */
2062         rw_enter(&zp->z_parent_lock, RW_WRITER);
2063
2064         tx = dmu_tx_create(zfsvfs->z_os);
2065         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2066         dmu_tx_hold_bonus(tx, zp->z_id);
2067         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2068         error = dmu_tx_assign(tx, TXG_NOWAIT);
2069         if (error) {
                     /*
                      * Release everything in reverse order of acquisition
                      * before waiting; on ERESTART the whole sequence is
                      * redone from "top".
                      */
2070                 rw_exit(&zp->z_parent_lock);
2071                 rw_exit(&zp->z_name_lock);
2072                 zfs_dirent_unlock(dl);
2073                 VN_RELE(vp);
2074                 if (error == ERESTART) {
2075                         dmu_tx_wait(tx);
2076                         dmu_tx_abort(tx);
2077                         goto top;
2078                 }
2079                 dmu_tx_abort(tx);
2080                 ZFS_EXIT(zfsvfs);
2081                 return (error);
2082         }
2083
             /* FreeBSD name cache: purge the parent before the link is destroyed */
2084 #ifdef FREEBSD_NAMECACHE
2085         cache_purge(dvp);
2086 #endif
2087
2088         error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2089
             /* only a successful removal is logged; the tx is committed either way */
2090         if (error == 0) {
2091                 uint64_t txtype = TX_RMDIR;
2092                 if (flags & FIGNORECASE)
2093                         txtype |= TX_CI;
2094                 zfs_log_remove(zilog, tx, txtype, dzp, name);
2095         }
2096
2097         dmu_tx_commit(tx);
2098
2099         rw_exit(&zp->z_parent_lock);
2100         rw_exit(&zp->z_name_lock);
             /* ... and purge the removed directory itself once its locks are dropped */
2101 #ifdef FREEBSD_NAMECACHE
2102         cache_purge(vp);
2103 #endif
2104 out:
             /*
              * Common exit: only the directory entry lock and the hold on vp
              * remain to be released at this point.
              */
2105         zfs_dirent_unlock(dl);
2106
2107         VN_RELE(vp);
2108
2109         ZFS_EXIT(zfsvfs);
2110         return (error);
2111 }
2112
2113 /*
2114  * Read as many directory entries as will fit into the provided
2115  * buffer from the given directory cursor position (specified in
2116  * the uio structure).
2117  *
2118  *      IN:     vp      - vnode of directory to read.
2119  *              uio     - structure supplying read location, range info,
2120  *                        and return buffer.
2121  *              cr      - credentials of caller.
2122  *              ncookies - if non-NULL, per-entry cookies are returned
2123  *                        through ncookies/cookies (see OUT).
2124  *
2125  *      OUT:    uio     - updated offset and range, buffer filled.
2126  *              eofp    - set to true if end-of-file detected.
      *              cookies - malloc(M_TEMP) array holding, for each entry
      *                        processed, the offset of the entry after it;
      *                        freed and set to NULL here when an error is
      *                        returned.
      *              ncookies - number of cookies stored in that array.
2127  *
2128  *      RETURN: 0 if success
2129  *              error code if failure
2130  *
2131  * Timestamps:
2132  *      vp - atime updated
2133  *
2134  * Note that the low 4 bits of the cookie returned by zap is always zero.
2135  * This allows us to use the low range for "special" directory entries:
2136  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2137  * we use the offset 2 for the '.zfs' directory.
      *
      * The local variable "flags" is initialized to 0 and never modified, so
      * the V_RDDIR_ACCFILTER and V_RDDIR_ENTFLAGS branches below are not
      * taken in this version; entries are always emitted as dirent64_t.
2138  */
2139 /* ARGSUSED */
2140 static int
2141 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2142 {
2143         znode_t         *zp = VTOZ(vp);
2144         iovec_t         *iovp;
2145         edirent_t       *eodp;
2146         dirent64_t      *odp;
2147         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
2148         objset_t        *os;
2149         caddr_t         outbuf;
2150         size_t          bufsize;
2151         zap_cursor_t    zc;
2152         zap_attribute_t zap;
2153         uint_t          bytes_wanted;
2154         uint64_t        offset; /* must be unsigned; checks for < 1 */
2155         int             local_eof;
2156         int             outcount;
2157         int             error;
2158         uint8_t         prefetch;
2159         boolean_t       check_sysattrs;
2160         uint8_t         type;
2161         int             ncooks;
2162         u_long          *cooks = NULL;
2163         int             flags = 0;
2164
2165         ZFS_ENTER(zfsvfs);
2166         ZFS_VERIFY_ZP(zp);
2167
2168         /*
2169          * If we are not given an eof variable,
2170          * use a local one.
2171          */
2172         if (eofp == NULL)
2173                 eofp = &local_eof;
2174
2175         /*
2176          * Check for valid iov_len.
2177          */
2178         if (uio->uio_iov->iov_len <= 0) {
2179                 ZFS_EXIT(zfsvfs);
2180                 return (EINVAL);
2181         }
2182
2183         /*
2184          * Quit if directory has been removed (posix)
2185          */
             /* *eofp is assigned from z_unlinked as a side effect of the test */
2186         if ((*eofp = zp->z_unlinked) != 0) {
2187                 ZFS_EXIT(zfsvfs);
2188                 return (0);
2189         }
2190
2191         error = 0;
2192         os = zfsvfs->z_os;
2193         offset = uio->uio_loffset;
2194         prefetch = zp->z_zn_prefetch;
2195
2196         /*
2197          * Initialize the iterator cursor.
2198          */
             /*
              * Offsets 0-3 are the small values used for (or immediately
              * after) the synthetic '.', '..' and '.zfs' entries, so they do
              * not encode a zap cursor position.
              */
2199         if (offset <= 3) {
2200                 /*
2201                  * Start iteration from the beginning of the directory.
2202                  */
2203                 zap_cursor_init(&zc, os, zp->z_id);
2204         } else {
2205                 /*
2206                  * The offset is a serialized cursor.
2207                  */
2208                 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2209         }
2210
2211         /*
2212          * Get space to change directory entries into fs independent format.
2213          */
             /*
              * Unless the request is a single UIO_SYSSPACE iovec, entries are
              * built in a temporary buffer (outbuf) and copied out with
              * uiomove() after the loop; otherwise the caller's buffer is
              * filled in place.  outbuf is valid only in the first case.
              */
2214         iovp = uio->uio_iov;
2215         bytes_wanted = iovp->iov_len;
2216         if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2217                 bufsize = bytes_wanted;
2218                 outbuf = kmem_alloc(bufsize, KM_SLEEP);
2219                 odp = (struct dirent64 *)outbuf;
2220         } else {
2221                 bufsize = bytes_wanted;
2222                 odp = (struct dirent64 *)iovp->iov_base;
2223         }
2224         eodp = (struct edirent *)odp;
2225
             /*
              * NOTE(review): this tests ncookies but stores through cookies;
              * it assumes the two are either both NULL or both non-NULL --
              * confirm against callers.  ncooks is initialized only here.
              */
2226         if (ncookies != NULL) {
2227                 /*
2228                  * Minimum entry size is dirent size and 1 byte for a file name.
2229                  */
2230                 ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2231                 cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2232                 *cookies = cooks;
2233                 *ncookies = ncooks;
2234         }
2235         /*
2236          * If this VFS supports the system attribute view interface; and
2237          * we're looking at an extended attribute directory; and we care
2238          * about normalization conflicts on this vfs; then we must check
2239          * for normalization conflicts with the sysattr name space.
2240          */
2241 #ifdef TODO
2242         check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2243             (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2244             (flags & V_RDDIR_ENTFLAGS);
2245 #else
2246         check_sysattrs = 0;
2247 #endif
2248
2249         /*
2250          * Transform to file-system independent format
2251          */
2252         outcount = 0;
2253         while (outcount < bytes_wanted) {
2254                 ino64_t objnum;
2255                 ushort_t reclen;
2256                 off64_t *next;
2257
2258                 /*
2259                  * Special case `.', `..', and `.zfs'.
2260                  */
2261                 if (offset == 0) {
2262                         (void) strcpy(zap.za_name, ".");
2263                         zap.za_normalization_conflict = 0;
2264                         objnum = zp->z_id;
2265                         type = DT_DIR;
2266                 } else if (offset == 1) {
2267                         (void) strcpy(zap.za_name, "..");
2268                         zap.za_normalization_conflict = 0;
2269                         objnum = zp->z_phys->zp_parent;
2270                         type = DT_DIR;
2271                 } else if (offset == 2 && zfs_show_ctldir(zp)) {
2272                         (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2273                         zap.za_normalization_conflict = 0;
2274                         objnum = ZFSCTL_INO_ROOT;
2275                         type = DT_DIR;
2276                 } else {
2277                         /*
2278                          * Grab next entry.
2279                          */
                             /*
                              * ENOENT from the cursor means end of directory:
                              * set *eofp and leave the loop normally.  Any
                              * other error skips the copy-out via "update".
                              */
2280                         if (error = zap_cursor_retrieve(&zc, &zap)) {
2281                                 if ((*eofp = (error == ENOENT)) != 0)
2282                                         break;
2283                                 else
2284                                         goto update;
2285                         }
2286
2287                         if (zap.za_integer_length != 8 ||
2288                             zap.za_num_integers != 1) {
2289                                 cmn_err(CE_WARN, "zap_readdir: bad directory "
2290                                     "entry, obj = %lld, offset = %lld\n",
2291                                     (u_longlong_t)zp->z_id,
2292                                     (u_longlong_t)offset);
2293                                 error = ENXIO;
2294                                 goto update;
2295                         }
2296
2297                         objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2298                         /*
2299                          * MacOS X can extract the object type here such as:
2300                          * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2301                          */
2302                         type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2303
2304                         if (check_sysattrs && !zap.za_normalization_conflict) {
2305 #ifdef TODO
2306                                 zap.za_normalization_conflict =
2307                                     xattr_sysattr_casechk(zap.za_name);
2308 #else
2309                                 panic("%s:%u: TODO", __func__, __LINE__);
2310 #endif
2311                         }
2312                 }
2313
                     /* not taken: flags is always 0 in this function */
2314                 if (flags & V_RDDIR_ACCFILTER) {
2315                         /*
2316                          * If we have no access at all, don't include
2317                          * this entry in the returned information
2318                          */
2319                         znode_t *ezp;
2320                         if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2321                                 goto skip_entry;
2322                         if (!zfs_has_access(ezp, cr)) {
2323                                 VN_RELE(ZTOV(ezp));
2324                                 goto skip_entry;
2325                         }
2326                         VN_RELE(ZTOV(ezp));
2327                 }
2328
                     /* with flags == 0 the DIRENT64_RECLEN() arm is always used */
2329                 if (flags & V_RDDIR_ENTFLAGS)
2330                         reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2331                 else
2332                         reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2333
2334                 /*
2335                  * Will this entry fit in the buffer?
2336                  */
2337                 if (outcount + reclen > bufsize) {
2338                         /*
2339                          * Did we manage to fit anything in the buffer?
2340                          */
2341                         if (!outcount) {
2342                                 error = EINVAL;
2343                                 goto update;
2344                         }
2345                         break;
2346                 }
2347                 if (flags & V_RDDIR_ENTFLAGS) {
2348                         /*
2349                          * Add extended flag entry:
2350                          */
2351                         eodp->ed_ino = objnum;
2352                         eodp->ed_reclen = reclen;
2353                         /* NOTE: ed_off is the offset for the *next* entry */
2354                         next = &(eodp->ed_off);
2355                         eodp->ed_eflags = zap.za_normalization_conflict ?
2356                             ED_CASE_CONFLICT : 0;
2357                         (void) strncpy(eodp->ed_name, zap.za_name,
2358                             EDIRENT_NAMELEN(reclen));
2359                         eodp = (edirent_t *)((intptr_t)eodp + reclen);
2360                 } else {
2361                         /*
2362                          * Add normal entry:
2363                          */
2364                         odp->d_ino = objnum;
2365                         odp->d_reclen = reclen;
2366                         odp->d_namlen = strlen(zap.za_name);
2367                         (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2368                         odp->d_type = type;
2369                         odp = (dirent64_t *)((intptr_t)odp + reclen);
2370                 }
2371                 outcount += reclen;
2372
2373                 ASSERT(outcount <= bufsize);
2374
2375                 /* Prefetch znode */
2376                 if (prefetch)
2377                         dmu_prefetch(os, objnum, 0, 0);
2378
2379         skip_entry:
2380                 /*
2381                  * Move to the next entry, fill in the previous offset.
2382                  */
                     /*
                      * Synthetic entries just count up by one; real entries
                      * advance the zap cursor and use its serialized form.
                      */
2383                 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2384                         zap_cursor_advance(&zc);
2385                         offset = zap_cursor_serialize(&zc);
2386                 } else {
2387                         offset += 1;
2388                 }
2389
                     /* offset was advanced above, so the cookie names the next entry */
2390                 if (cooks != NULL) {
2391                         *cooks++ = offset;
2392                         ncooks--;
2393                         KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2394                 }
2395         }
2396         zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2397
2398         /* Subtract unused cookies */
2399         if (ncookies != NULL)
2400                 *ncookies -= ncooks;
2401
             /*
              * In-place case: only the uio bookkeeping needs adjusting.
              * Otherwise copy the temporary buffer out with uiomove().
              */
2402         if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2403                 iovp->iov_base += outcount;
2404                 iovp->iov_len -= outcount;
2405                 uio->uio_resid -= outcount;
2406         } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2407                 /*
2408                  * Reset the pointer.
2409                  */
2410                 offset = uio->uio_loffset;
2411         }
2412
2413 update:
             /* same condition as the kmem_alloc() above, so outbuf is valid here */
2414         zap_cursor_fini(&zc);
2415         if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2416                 kmem_free(outbuf, bufsize);
2417
             /* running off the end of the directory is not an error for the caller */
2418         if (error == ENOENT)
2419                 error = 0;
2420
2421         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2422
2423         uio->uio_loffset = offset;
2424         ZFS_EXIT(zfsvfs);
             /*
              * On failure the cookie array is not handed back.
              * NOTE(review): this tests cookies but also writes *ncookies;
              * it assumes both are non-NULL together -- confirm.
              */
2425         if (error != 0 && cookies != NULL) {
2426                 free(*cookies, M_TEMP);
2427                 *cookies = NULL;
2428                 *ncookies = 0;
2429         }
2430         return (error);
2431 }
2432
2433 ulong_t zfs_fsync_sync_cnt = 4;     /* stored under zfs_fsyncer_key by zfs_fsync() */
2434
     /*
      * Call zil_commit() on the file system's intent log (z_log) for the
      * znode behind vp, passing the znode's z_last_itx and object id.
      *
      *      IN:     vp       - vnode of the file to sync.
      *              syncflag - not referenced in this function.
      *              cr       - not referenced in this function.
      *              ct       - not referenced in this function.
      *
      *      RETURN: 0 after zil_commit() has been called.
      *              NOTE(review): ZFS_ENTER() and ZFS_VERIFY_ZP() are macros
      *              defined elsewhere and may return early -- confirm.
      */
2435 static int
2436 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2437 {
2438         znode_t *znode = VTOZ(vp);
2439         zfsvfs_t *zfsvfs = znode->z_zfsvfs;
2440
             /* record zfs_fsync_sync_cnt in this thread's zfs_fsyncer_key slot first */
2441         (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2442
2443         ZFS_ENTER(zfsvfs);
2444         ZFS_VERIFY_ZP(znode);
2445         zil_commit(zfsvfs->z_log, znode->z_last_itx, znode->z_id);
2446         ZFS_EXIT(zfsvfs);
2447         return (0);
2448 }
2449
2450
2451 /*
2452  * Get the requested file attributes and place them in the provided
2453  * vattr structure.
2454  *
2455  *      IN:     vp      - vnode of file.
2456  *              vap     - va_mask identifies requested attributes.
2457  *                        If AT_XVATTR set, then optional attrs are requested
2458  *              flags   - ATTR_NOACLCHECK (CIFS server context)
2459  *              cr      - credentials of caller.
2460  *              ct      - caller context
2461  *
2462  *      OUT:    vap     - attribute values.
2463  *
2464  *      RETURN: 0 (always succeeds)
2465  */
2466 /* ARGSUSED */
2467 static int
2468 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2469     caller_context_t *ct)
2470 {
2471         znode_t *zp = VTOZ(vp);
2472         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2473         znode_phys_t *pzp;
2474         int     error = 0;
2475         uint32_t blksize;
2476         u_longlong_t nblocks;
2477         uint64_t links;
2478         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
2479         xoptattr_t *xoap = NULL;
2480         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2481
2482         ZFS_ENTER(zfsvfs);
2483         ZFS_VERIFY_ZP(zp);
2484         pzp = zp->z_phys;
2485
2486         /*
2487          * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2488          * Also, if we are the owner don't bother, since owner should
2489          * always be allowed to read basic attributes of file.
2490          */
2491         if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
2492             (pzp->zp_uid != crgetuid(cr))) {
2493                 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2494                     skipaclchk, cr)) {
2495                         ZFS_EXIT(zfsvfs);
2496                         return (error);
2497                 }
2498         }
2499
2500         /*
2501          * Return all attributes.  It's cheaper to provide the answer
2502          * than to determine whether we were asked the question.
2503          */
2504
2505         mutex_enter(&zp->z_lock);
2506         vap->va_type = IFTOVT(pzp->zp_mode);
2507         vap->va_mode = pzp->zp_mode & ~S_IFMT;
2508         zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2509 //      vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2510         vap->va_nodeid = zp->z_id;
2511         if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2512                 links = pzp->zp_links + 1;
2513         else
2514                 links = pzp->zp_links;
2515         vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
2516         vap->va_size = pzp->zp_size;
2517         vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2518         vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
2519         vap->va_seq = zp->z_seq;
2520         vap->va_flags = 0;      /* FreeBSD: Reset chflags(2) flags. */
2521
2522         /*
2523          * Add in any requested optional attributes and the create time.
2524          * Also set the corresponding bits in the returned attribute bitmap.
2525          */
2526         if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2527                 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2528                         xoap->xoa_archive =
2529                             ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
2530                         XVA_SET_RTN(xvap, XAT_ARCHIVE);
2531                 }
2532
2533                 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2534                         xoap->xoa_readonly =
2535                             ((pzp->zp_flags & ZFS_READONLY) != 0);
2536                         XVA_SET_RTN(xvap, XAT_READONLY);
2537                 }
2538
2539                 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2540                         xoap->xoa_system =
2541                             ((pzp->zp_flags & ZFS_SYSTEM) != 0);
2542                         XVA_SET_RTN(xvap, XAT_SYSTEM);
2543                 }
2544
2545                 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2546                         xoap->xoa_hidden =
2547                             ((pzp->zp_flags & ZFS_HIDDEN) != 0);
2548                         XVA_SET_RTN(xvap, XAT_HIDDEN);
2549                 }
2550
2551                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2552                         xoap->xoa_nounlink =
2553                             ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
2554                         XVA_SET_RTN(xvap, XAT_NOUNLINK);
2555                 }
2556
2557                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2558                         xoap->xoa_immutable =
2559                             ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
2560                         XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2561                 }
2562
2563                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2564                         xoap->xoa_appendonly =
2565                             ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
2566                         XVA_SET_RTN(xvap, XAT_APPENDONLY);
2567                 }
2568
2569                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2570                         xoap->xoa_nodump =
2571                             ((pzp->zp_flags & ZFS_NODUMP) != 0);
2572                         XVA_SET_RTN(xvap, XAT_NODUMP);
2573                 }
2574
2575                 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2576                         xoap->xoa_opaque =
2577                             ((pzp->zp_flags & ZFS_OPAQUE) != 0);
2578                         XVA_SET_RTN(xvap, XAT_OPAQUE);
2579                 }
2580
2581                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2582                         xoap->xoa_av_quarantined =
2583                             ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
2584                         XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2585                 }
2586
2587                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2588                         xoap->xoa_av_modified =
2589                             ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
2590                         XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2591                 }
2592
2593                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2594                     vp->v_type == VREG &&
2595                     (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
2596                         size_t len;
2597                         dmu_object_info_t doi;
2598
2599                         /*
2600                          * Only VREG files have anti-virus scanstamps, so we
2601                          * won't conflict with symlinks in the bonus buffer.
2602                          */
2603                         dmu_object_info_from_db(zp->z_dbuf, &doi);
2604                         len = sizeof (xoap->xoa_av_scanstamp) +
2605                             sizeof (znode_phys_t);
2606                         if (len <= doi.doi_bonus_size) {
2607                                 /*
2608                                  * pzp points to the start of the
2609                                  * znode_phys_t. pzp + 1 points to the
2610                                  * first byte after the znode_phys_t.
2611                                  */
2612                                 (void) memcpy(xoap->xoa_av_scanstamp,
2613                                     pzp + 1,
2614                                     sizeof (xoap->xoa_av_scanstamp));
2615                                 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
2616                         }
2617                 }
2618
2619                 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2620                         ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
2621                         XVA_SET_RTN(xvap, XAT_CREATETIME);
2622                 }
2623         }
2624
2625         ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
2626         ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
2627         ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
2628         ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);
2629
2630         mutex_exit(&zp->z_lock);
2631
2632         dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
2633         vap->va_blksize = blksize;
2634         vap->va_bytes = nblocks << 9;   /* nblocks * 512 */
2635
2636         if (zp->z_blksz == 0) {
2637                 /*
2638                  * Block size hasn't been set; suggest maximal I/O transfers.
2639                  */
2640                 vap->va_blksize = zfsvfs->z_max_blksz;
2641         }
2642
2643         ZFS_EXIT(zfsvfs);
2644         return (0);
2645 }
2646
2647 /*
2648  * Set the file attributes to the values contained in the
2649  * vattr structure.
2650  *
2651  *      IN:     vp      - vnode of file to be modified.
2652  *              vap     - new attribute values.
2653  *                        If AT_XVATTR set, then optional attrs are being set
2654  *              flags   - ATTR_UTIME set if non-default time values provided.
2655  *                      - ATTR_NOACLCHECK (CIFS context only).
2656  *              cr      - credentials of caller.
2657  *              ct      - caller context
2658  *
2659  *      RETURN: 0 if success
2660  *              error code if failure
2661  *
2662  * Timestamps:
2663  *      vp - ctime updated, mtime updated if size changed.
2664  */
2665 /* ARGSUSED */
2666 static int
2667 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2668         caller_context_t *ct)
2669 {
2670         znode_t         *zp = VTOZ(vp);
2671         znode_phys_t    *pzp;
2672         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
2673         zilog_t         *zilog;
2674         dmu_tx_t        *tx;
2675         vattr_t         oldva;
2676         xvattr_t        tmpxvattr;
2677         uint_t          mask = vap->va_mask;
2678         uint_t          saved_mask;
2679         uint64_t        saved_mode;
2680         int             trim_mask = 0;
2681         uint64_t        new_mode;
2682         uint64_t        new_uid, new_gid;
2683         znode_t         *attrzp;
2684         int             need_policy = FALSE;
2685         int             err;
2686         zfs_fuid_info_t *fuidp = NULL;
2687         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
2688         xoptattr_t      *xoap;
2689         zfs_acl_t       *aclp = NULL;
2690         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2691         boolean_t fuid_dirtied = B_FALSE;
2692
2693         if (mask == 0)
2694                 return (0);
2695
2696         if (mask & AT_NOSET)
2697                 return (EINVAL);
2698
2699         ZFS_ENTER(zfsvfs);
2700         ZFS_VERIFY_ZP(zp);
2701
2702         pzp = zp->z_phys;
2703         zilog = zfsvfs->z_log;
2704
2705         /*
2706          * Make sure that if we have ephemeral uid/gid or xvattr specified
2707          * that file system is at proper version level
2708          */
2709
2710         if (zfsvfs->z_use_fuids == B_FALSE &&
2711             (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2712             ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2713             (mask & AT_XVATTR))) {
2714                 ZFS_EXIT(zfsvfs);
2715                 return (EINVAL);
2716         }
2717
2718         if (mask & AT_SIZE && vp->v_type == VDIR) {
2719                 ZFS_EXIT(zfsvfs);
2720                 return (EISDIR);
2721         }
2722
2723         if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2724                 ZFS_EXIT(zfsvfs);
2725                 return (EINVAL);
2726         }
2727
2728         /*
2729          * If this is an xvattr_t, then get a pointer to the structure of
2730          * optional attributes.  If this is NULL, then we have a vattr_t.
2731          */
2732         xoap = xva_getxoptattr(xvap);
2733
2734         xva_init(&tmpxvattr);
2735
2736         /*
2737          * Immutable files can only alter immutable bit and atime
2738          */
2739         if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
2740             ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2741             ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2742                 ZFS_EXIT(zfsvfs);
2743                 return (EPERM);
2744         }
2745
2746         if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
2747                 ZFS_EXIT(zfsvfs);
2748                 return (EPERM);
2749         }
2750
2751         /*
2752          * Verify timestamps doesn't overflow 32 bits.
2753          * ZFS can handle large timestamps, but 32bit syscalls can't
2754          * handle times greater than 2039.  This check should be removed
2755          * once large timestamps are fully supported.
2756          */
2757         if (mask & (AT_ATIME | AT_MTIME)) {
2758                 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2759                     ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2760                         ZFS_EXIT(zfsvfs);
2761                         return (EOVERFLOW);
2762                 }
2763         }
2764
2765 top:
2766         attrzp = NULL;
2767
2768         /* Can this be moved to before the top label? */
2769         if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2770                 ZFS_EXIT(zfsvfs);
2771                 return (EROFS);
2772         }
2773
2774         /*
2775          * First validate permissions
2776          */
2777
2778         if (mask & AT_SIZE) {
2779                 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2780                 if (err) {
2781                         ZFS_EXIT(zfsvfs);
2782                         return (err);
2783                 }
2784                 /*
2785                  * XXX - Note, we are not providing any open
2786                  * mode flags here (like FNDELAY), so we may
2787                  * block if there are locks present... this
2788                  * should be addressed in openat().
2789                  */
2790                 /* XXX - would it be OK to generate a log record here? */
2791                 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2792                 if (err) {
2793                         ZFS_EXIT(zfsvfs);
2794                         return (err);
2795                 }
2796         }
2797
2798         if (mask & (AT_ATIME|AT_MTIME) ||
2799             ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2800             XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2801             XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2802             XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2803             XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
2804                 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2805                     skipaclchk, cr);
2806
2807         if (mask & (AT_UID|AT_GID)) {
2808                 int     idmask = (mask & (AT_UID|AT_GID));
2809                 int     take_owner;
2810                 int     take_group;
2811
2812                 /*
2813                  * NOTE: even if a new mode is being set,
2814                  * we may clear S_ISUID/S_ISGID bits.
2815                  */
2816
2817                 if (!(mask & AT_MODE))
2818                         vap->va_mode = pzp->zp_mode;
2819
2820                 /*
2821                  * Take ownership or chgrp to group we are a member of
2822                  */
2823
2824                 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2825                 take_group = (mask & AT_GID) &&
2826                     zfs_groupmember(zfsvfs, vap->va_gid, cr);
2827
2828                 /*
2829                  * If both AT_UID and AT_GID are set then take_owner and
2830                  * take_group must both be set in order to allow taking
2831                  * ownership.
2832                  *
2833                  * Otherwise, send the check through secpolicy_vnode_setattr()
2834                  *
2835                  */
2836
2837                 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2838                     ((idmask == AT_UID) && take_owner) ||
2839                     ((idmask == AT_GID) && take_group)) {
2840                         if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2841                             skipaclchk, cr) == 0) {
2842                                 /*
2843                                  * Remove setuid/setgid for non-privileged users
2844                                  */
2845                                 secpolicy_setid_clear(vap, vp, cr);
2846                                 trim_mask = (mask & (AT_UID|AT_GID));
2847                         } else {
2848                                 need_policy =  TRUE;
2849                         }
2850                 } else {
2851                         need_policy =  TRUE;
2852                 }
2853         }
2854
2855         mutex_enter(&zp->z_lock);
2856         oldva.va_mode = pzp->zp_mode;
2857         zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2858         if (mask & AT_XVATTR) {
2859                 /*
2860                  * Update xvattr mask to include only those attributes
2861                  * that are actually changing.
2862                  *
2863                  * the bits will be restored prior to actually setting
2864                  * the attributes so the caller thinks they were set.
2865                  */
2866                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2867                         if (xoap->xoa_appendonly !=
2868                             ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) {
2869                                 need_policy = TRUE;
2870                         } else {
2871                                 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2872                                 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
2873                         }
2874                 }
2875
2876                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2877                         if (xoap->xoa_nounlink !=
2878                             ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) {
2879                                 need_policy = TRUE;
2880                         } else {
2881                                 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2882                                 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
2883                         }
2884                 }
2885
2886                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2887                         if (xoap->xoa_immutable !=
2888                             ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) {
2889                                 need_policy = TRUE;
2890                         } else {
2891                                 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2892                                 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
2893                         }
2894                 }
2895
2896                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2897                         if (xoap->xoa_nodump !=
2898                             ((pzp->zp_flags & ZFS_NODUMP) != 0)) {
2899                                 need_policy = TRUE;
2900                         } else {
2901                                 XVA_CLR_REQ(xvap, XAT_NODUMP);
2902                                 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
2903                         }
2904                 }
2905
2906                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2907                         if (xoap->xoa_av_modified !=
2908                             ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) {
2909                                 need_policy = TRUE;
2910                         } else {
2911                                 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2912                                 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
2913                         }
2914                 }
2915
2916                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2917                         if ((vp->v_type != VREG &&
2918                             xoap->xoa_av_quarantined) ||
2919                             xoap->xoa_av_quarantined !=
2920                             ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) {
2921                                 need_policy = TRUE;
2922                         } else {
2923                                 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2924                                 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
2925                         }
2926                 }
2927
2928                 if (need_policy == FALSE &&
2929                     (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2930                     XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2931                         need_policy = TRUE;
2932                 }
2933         }
2934
2935         mutex_exit(&zp->z_lock);
2936
2937         if (mask & AT_MODE) {
2938                 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2939                         err = secpolicy_setid_setsticky_clear(vp, vap,
2940                             &oldva, cr);
2941                         if (err) {
2942                                 ZFS_EXIT(zfsvfs);
2943                                 return (err);
2944                         }
2945                         trim_mask |= AT_MODE;
2946                 } else {
2947                         need_policy = TRUE;
2948                 }
2949         }
2950
2951         if (need_policy) {
2952                 /*
2953                  * If trim_mask is set then take ownership
2954                  * has been granted or write_acl is present and user
2955                  * has the ability to modify mode.  In that case remove
2956                  * UID|GID and or MODE from mask so that
2957                  * secpolicy_vnode_setattr() doesn't revoke it.
2958                  */
2959
2960                 if (trim_mask) {
2961                         saved_mask = vap->va_mask;
2962                         vap->va_mask &= ~trim_mask;
2963                         if (trim_mask & AT_MODE) {
2964                                 /*
2965                                  * Save the mode, as secpolicy_vnode_setattr()
2966                                  * will overwrite it with ova.va_mode.
2967                                  */
2968                                 saved_mode = vap->va_mode;
2969                         }
2970                 }
2971                 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2972                     (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2973                 if (err) {
2974                         ZFS_EXIT(zfsvfs);
2975                         return (err);
2976                 }
2977
2978                 if (trim_mask) {
2979                         vap->va_mask |= saved_mask;
2980                         if (trim_mask & AT_MODE) {
2981                                 /*
2982                                  * Recover the mode after
2983                                  * secpolicy_vnode_setattr().
2984                                  */
2985                                 vap->va_mode = saved_mode;
2986                         }
2987                 }
2988         }
2989
2990         /*
2991          * secpolicy_vnode_setattr, or take ownership may have
2992          * changed va_mask
2993          */
2994         mask = vap->va_mask;
2995
2996         tx = dmu_tx_create(zfsvfs->z_os);
2997         dmu_tx_hold_bonus(tx, zp->z_id);
2998
2999         if (mask & AT_MODE) {
3000                 uint64_t pmode = pzp->zp_mode;
3001
3002                 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3003
3004                 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3005                         goto out;
3006                 if (pzp->zp_acl.z_acl_extern_obj) {
3007                         /* Are we upgrading ACL from old V0 format to new V1 */
3008                         if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
3009                             pzp->zp_acl.z_acl_version ==
3010                             ZFS_ACL_VERSION_INITIAL) {
3011                                 dmu_tx_hold_free(tx,
3012                                     pzp->zp_acl.z_acl_extern_obj, 0,
3013                                     DMU_OBJECT_END);
3014                                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3015                                     0, aclp->z_acl_bytes);
3016                         } else {
3017                                 dmu_tx_hold_write(tx,
3018                                     pzp->zp_acl.z_acl_extern_obj, 0,
3019                                     aclp->z_acl_bytes);
3020                         }
3021                 } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3022                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3023                             0, aclp->z_acl_bytes);
3024                 }
3025         }
3026
3027         if (mask & (AT_UID | AT_GID)) {
3028                 if (pzp->zp_xattr) {
3029                         err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
3030                         if (err)
3031                                 goto out;
3032                         dmu_tx_hold_bonus(tx, attrzp->z_id);
3033                 }
3034                 if (mask & AT_UID) {
3035                         new_uid = zfs_fuid_create(zfsvfs,
3036                             (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3037                         if (new_uid != pzp->zp_uid &&
3038                             zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) {
3039                                 err = EDQUOT;
3040                                 goto out;
3041                         }
3042                 }
3043
3044                 if (mask & AT_GID) {
3045                         new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3046                             cr, ZFS_GROUP, &fuidp);
3047                         if (new_gid != pzp->zp_gid &&
3048                             zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) {
3049                                 err = EDQUOT;
3050                                 goto out;
3051                         }
3052                 }
3053                 fuid_dirtied = zfsvfs->z_fuid_dirty;
3054                 if (fuid_dirtied) {
3055                         if (zfsvfs->z_fuid_obj == 0) {
3056                                 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
3057                                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3058                                     FUID_SIZE_ESTIMATE(zfsvfs));
3059                                 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
3060                                     FALSE, NULL);
3061                         } else {
3062                                 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
3063                                 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
3064                                     FUID_SIZE_ESTIMATE(zfsvfs));
3065                         }
3066                 }
3067         }
3068
3069         err = dmu_tx_assign(tx, TXG_NOWAIT);
3070         if (err) {
3071                 if (err == ERESTART)
3072                         dmu_tx_wait(tx);
3073                 goto out;
3074         }
3075
3076         dmu_buf_will_dirty(zp->z_dbuf, tx);
3077
3078         /*
3079          * Set each attribute requested.
3080          * We group settings according to the locks they need to acquire.
3081          *
3082          * Note: you cannot set ctime directly, although it will be
3083          * updated as a side-effect of calling this function.
3084          */
3085
3086         mutex_enter(&zp->z_lock);
3087
3088         if (mask & AT_MODE) {
3089                 mutex_enter(&zp->z_acl_lock);
3090                 zp->z_phys->zp_mode = new_mode;
3091                 err = zfs_aclset_common(zp, aclp, cr, tx);
3092                 ASSERT3U(err, ==, 0);
3093                 zp->z_acl_cached = aclp;
3094                 aclp = NULL;
3095                 mutex_exit(&zp->z_acl_lock);
3096         }
3097
3098         if (attrzp)
3099                 mutex_enter(&attrzp->z_lock);
3100
3101         if (mask & AT_UID) {
3102                 pzp->zp_uid = new_uid;
3103                 if (attrzp)
3104                         attrzp->z_phys->zp_uid = new_uid;
3105         }
3106
3107         if (mask & AT_GID) {
3108                 pzp->zp_gid = new_gid;
3109                 if (attrzp)
3110                         attrzp->z_phys->zp_gid = new_gid;
3111         }
3112
3113         if (attrzp)
3114                 mutex_exit(&attrzp->z_lock);
3115
3116         if (mask & AT_ATIME)
3117                 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
3118
3119         if (mask & AT_MTIME)
3120                 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
3121
3122         /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3123         if (mask & AT_SIZE)
3124                 zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
3125         else if (mask != 0)
3126                 zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
3127         /*
3128          * Do this after setting timestamps to prevent timestamp
3129          * update from toggling bit
3130          */
3131
3132         if (xoap && (mask & AT_XVATTR)) {
3133
3134                 /*
3135                  * restore trimmed off masks
3136                  * so that return masks can be set for caller.
3137                  */
3138
3139                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3140                         XVA_SET_REQ(xvap, XAT_APPENDONLY);
3141                 }
3142                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3143                         XVA_SET_REQ(xvap, XAT_NOUNLINK);
3144                 }
3145                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3146                         XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3147                 }
3148                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3149                         XVA_SET_REQ(xvap, XAT_NODUMP);
3150                 }
3151                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3152                         XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3153                 }
3154                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3155                         XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3156                 }
3157
3158                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
3159                         size_t len;
3160                         dmu_object_info_t doi;
3161
3162                         ASSERT(vp->v_type == VREG);
3163
3164                         /* Grow the bonus buffer if necessary. */
3165                         dmu_object_info_from_db(zp->z_dbuf, &doi);
3166                         len = sizeof (xoap->xoa_av_scanstamp) +
3167                             sizeof (znode_phys_t);
3168                         if (len > doi.doi_bonus_size)
3169                                 VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
3170                 }
3171                 zfs_xvattr_set(zp, xvap);
3172         }
3173
3174         if (fuid_dirtied)
3175                 zfs_fuid_sync(zfsvfs, tx);
3176
3177         if (mask != 0)
3178                 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3179
3180         mutex_exit(&zp->z_lock);
3181
3182 out:
3183         if (attrzp)
3184                 VN_RELE(ZTOV(attrzp));
3185
3186         if (aclp)
3187                 zfs_acl_free(aclp);
3188
3189         if (fuidp) {
3190                 zfs_fuid_info_free(fuidp);
3191                 fuidp = NULL;
3192         }
3193
3194         if (err)
3195                 dmu_tx_abort(tx);
3196         else
3197                 dmu_tx_commit(tx);
3198
3199         if (err == ERESTART)
3200                 goto top;
3201
3202         ZFS_EXIT(zfsvfs);
3203         return (err);
3204 }
3205
3206 typedef struct zfs_zlock {
3207         krwlock_t       *zl_rwlock;     /* lock we acquired */
3208         znode_t         *zl_znode;      /* znode we held */
3209         struct zfs_zlock *zl_next;      /* next in list */
3210 } zfs_zlock_t;
3211
3212 /*
3213  * Drop locks and release vnodes that were held by zfs_rename_lock().
3214  */
3215 static void
3216 zfs_rename_unlock(zfs_zlock_t **zlpp)
3217 {
3218         zfs_zlock_t *zl;
3219
3220         while ((zl = *zlpp) != NULL) {
3221                 if (zl->zl_znode != NULL)
3222                         VN_RELE(ZTOV(zl->zl_znode));
3223                 rw_exit(zl->zl_rwlock);
3224                 *zlpp = zl->zl_next;
3225                 kmem_free(zl, sizeof (*zl));
3226         }
3227 }
3228
3229 /*
3230  * Search back through the directory tree, using the ".." entries.
3231  * Lock each directory in the chain to prevent concurrent renames.
3232  * Fail any attempt to move a directory into one of its own descendants.
3233  * XXX - z_parent_lock can overlap with map or grow locks
3234  */
3235 static int
3236 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3237 {
3238         zfs_zlock_t     *zl;
3239         znode_t         *zp = tdzp;
3240         uint64_t        rootid = zp->z_zfsvfs->z_root;
3241         uint64_t        *oidp = &zp->z_id;
3242         krwlock_t       *rwlp = &szp->z_parent_lock;
3243         krw_t           rw = RW_WRITER;
3244
3245         /*
3246          * First pass write-locks szp and compares to zp->z_id.
3247          * Later passes read-lock zp and compare to zp->z_parent.
3248          */
3249         do {
3250                 if (!rw_tryenter(rwlp, rw)) {
3251                         /*
3252                          * Another thread is renaming in this path.
3253                          * Note that if we are a WRITER, we don't have any
3254                          * parent_locks held yet.
3255                          */
3256                         if (rw == RW_READER && zp->z_id > szp->z_id) {
3257                                 /*
3258                                  * Drop our locks and restart
3259                                  */
3260                                 zfs_rename_unlock(&zl);
3261                                 *zlpp = NULL;
3262                                 zp = tdzp;
3263                                 oidp = &zp->z_id;
3264                                 rwlp = &szp->z_parent_lock;
3265                                 rw = RW_WRITER;
3266                                 continue;
3267                         } else {
3268                                 /*
3269                                  * Wait for other thread to drop its locks
3270                                  */
3271                                 rw_enter(rwlp, rw);
3272                         }
3273                 }
3274
3275                 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3276                 zl->zl_rwlock = rwlp;
3277                 zl->zl_znode = NULL;
3278                 zl->zl_next = *zlpp;
3279                 *zlpp = zl;
3280
3281                 if (*oidp == szp->z_id)         /* We're a descendant of szp */
3282                         return (EINVAL);
3283
3284                 if (*oidp == rootid)            /* We've hit the top */
3285                         return (0);
3286
3287                 if (rw == RW_READER) {          /* i.e. not the first pass */
3288                         int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
3289                         if (error)
3290                                 return (error);
3291                         zl->zl_znode = zp;
3292                 }
3293                 oidp = &zp->z_phys->zp_parent;
3294                 rwlp = &zp->z_parent_lock;
3295                 rw = RW_READER;
3296
3297         } while (zp->z_id != sdzp->z_id);
3298
3299         return (0);
3300 }
3301
3302 /*
3303  * Move an entry from the provided source directory to the target
3304  * directory.  Change the entry name as indicated.
3305  *
3306  *      IN:     sdvp    - Source directory containing the "old entry".
3307  *              snm     - Old entry name.
3308  *              tdvp    - Target directory to contain the "new entry".
3309  *              tnm     - New entry name.
3310  *              cr      - credentials of caller.
3311  *              ct      - caller context
3312  *              flags   - case flags
3313  *
3314  *      RETURN: 0 if success
3315  *              error code if failure
3316  *
3317  * Timestamps:
3318  *      sdvp,tdvp - ctime|mtime updated
3319  */
3320 /*ARGSUSED*/
3321 static int
3322 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3323     caller_context_t *ct, int flags)
3324 {
3325         znode_t         *tdzp, *szp, *tzp;
3326         znode_t         *sdzp = VTOZ(sdvp);
3327         zfsvfs_t        *zfsvfs = sdzp->z_zfsvfs;
3328         zilog_t         *zilog;
3329         vnode_t         *realvp;
3330         zfs_dirlock_t   *sdl, *tdl;
3331         dmu_tx_t        *tx;
3332         zfs_zlock_t     *zl;
3333         int             cmp, serr, terr;
3334         int             error = 0;
3335         int             zflg = 0;
3336
3337         ZFS_ENTER(zfsvfs);
3338         ZFS_VERIFY_ZP(sdzp);
3339         zilog = zfsvfs->z_log;
3340
3341         /*
3342          * Make sure we have the real vp for the target directory.
3343          */
3344         if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3345                 tdvp = realvp;
3346
3347         if (tdvp->v_vfsp != sdvp->v_vfsp) {
3348                 ZFS_EXIT(zfsvfs);
3349                 return (EXDEV);
3350         }
3351
3352         tdzp = VTOZ(tdvp);
3353         ZFS_VERIFY_ZP(tdzp);
3354         if (zfsvfs->z_utf8 && u8_validate(tnm,
3355             strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3356                 ZFS_EXIT(zfsvfs);
3357                 return (EILSEQ);
3358         }
3359
3360         if (flags & FIGNORECASE)
3361                 zflg |= ZCILOOK;
3362
3363 top:
3364         szp = NULL;
3365         tzp = NULL;
3366         zl = NULL;
3367
3368         /*
3369          * This is to prevent the creation of links into attribute space
3370          * by renaming a linked file into/outof an attribute directory.
3371          * See the comment in zfs_link() for why this is considered bad.
3372          */
3373         if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
3374             (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
3375                 ZFS_EXIT(zfsvfs);
3376                 return (EINVAL);
3377         }
3378
3379         /*
3380          * Lock source and target directory entries.  To prevent deadlock,
3381          * a lock ordering must be defined.  We lock the directory with
3382          * the smallest object id first, or if it's a tie, the one with
3383          * the lexically first name.
3384          */
3385         if (sdzp->z_id < tdzp->z_id) {
3386                 cmp = -1;
3387         } else if (sdzp->z_id > tdzp->z_id) {
3388                 cmp = 1;
3389         } else {
3390                 /*
3391                  * First compare the two name arguments without
3392                  * considering any case folding.
3393                  */
3394                 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3395
3396                 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3397                 ASSERT(error == 0 || !zfsvfs->z_utf8);
3398                 if (cmp == 0) {
3399                         /*
3400                          * POSIX: "If the old argument and the new argument
3401                          * both refer to links to the same existing file,
3402                          * the rename() function shall return successfully
3403                          * and perform no other action."
3404                          */
3405                         ZFS_EXIT(zfsvfs);
3406                         return (0);
3407                 }
3408                 /*
3409                  * If the file system is case-folding, then we may
3410                  * have some more checking to do.  A case-folding file
3411                  * system is either supporting mixed case sensitivity
3412                  * access or is completely case-insensitive.  Note
3413                  * that the file system is always case preserving.
3414                  *
3415                  * In mixed sensitivity mode case sensitive behavior
3416                  * is the default.  FIGNORECASE must be used to
3417                  * explicitly request case insensitive behavior.
3418                  *
3419                  * If the source and target names provided differ only
3420                  * by case (e.g., a request to rename 'tim' to 'Tim'),
3421                  * we will treat this as a special case in the
3422                  * case-insensitive mode: as long as the source name
3423                  * is an exact match, we will allow this to proceed as
3424                  * a name-change request.
3425                  */
3426                 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3427                     (zfsvfs->z_case == ZFS_CASE_MIXED &&
3428                     flags & FIGNORECASE)) &&
3429                     u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3430                     &error) == 0) {
3431                         /*
3432                          * case preserving rename request, require exact
3433                          * name matches
3434                          */
3435                         zflg |= ZCIEXACT;
3436                         zflg &= ~ZCILOOK;
3437                 }
3438         }
3439
3440         /*
3441          * If the source and destination directories are the same, we should
3442          * grab the z_name_lock of that directory only once.
3443          */
3444         if (sdzp == tdzp) {
3445                 zflg |= ZHAVELOCK;
3446                 rw_enter(&sdzp->z_name_lock, RW_READER);
3447         }
3448
3449         if (cmp < 0) {
3450                 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3451                     ZEXISTS | zflg, NULL, NULL);
3452                 terr = zfs_dirent_lock(&tdl,
3453                     tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3454         } else {
3455                 terr = zfs_dirent_lock(&tdl,
3456                     tdzp, tnm, &tzp, zflg, NULL, NULL);
3457                 serr = zfs_dirent_lock(&sdl,
3458                     sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3459                     NULL, NULL);
3460         }
3461
3462         if (serr) {
3463                 /*
3464                  * Source entry invalid or not there.
3465                  */
3466                 if (!terr) {
3467                         zfs_dirent_unlock(tdl);
3468                         if (tzp)
3469                                 VN_RELE(ZTOV(tzp));
3470                 }
3471
3472                 if (sdzp == tdzp)
3473                         rw_exit(&sdzp->z_name_lock);
3474
3475                 if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
3476                         serr = EINVAL;
3477                 ZFS_EXIT(zfsvfs);
3478                 return (serr);
3479         }
3480         if (terr) {
3481                 zfs_dirent_unlock(sdl);
3482                 VN_RELE(ZTOV(szp));
3483
3484                 if (sdzp == tdzp)
3485                         rw_exit(&sdzp->z_name_lock);
3486
3487                 if (strcmp(tnm, "..") == 0)
3488                         terr = EINVAL;
3489                 ZFS_EXIT(zfsvfs);
3490                 return (terr);
3491         }
3492
3493         /*
3494          * Must have write access at the source to remove the old entry
3495          * and write access at the target to create the new entry.
3496          * Note that if target and source are the same, this can be
3497          * done in a single check.
3498          */
3499
3500         if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3501                 goto out;
3502
3503         if (ZTOV(szp)->v_type == VDIR) {
3504                 /*
3505                  * Check to make sure rename is valid.
3506                  * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3507                  */
3508                 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3509                         goto out;
3510         }
3511
3512         /*
3513          * Does target exist?
3514          */
3515         if (tzp) {
3516                 /*
3517                  * Source and target must be the same type.
3518                  */
3519                 if (ZTOV(szp)->v_type == VDIR) {
3520                         if (ZTOV(tzp)->v_type != VDIR) {
3521                                 error = ENOTDIR;
3522                                 goto out;
3523                         }
3524                 } else {
3525                         if (ZTOV(tzp)->v_type == VDIR) {
3526                                 error = EISDIR;
3527                                 goto out;
3528                         }
3529                 }
3530                 /*
3531                  * POSIX dictates that when the source and target
3532                  * entries refer to the same file object, rename
3533                  * must do nothing and exit without error.
3534                  */
3535                 if (szp->z_id == tzp->z_id) {
3536                         error = 0;
3537                         goto out;
3538                 }
3539         }
3540
3541         vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
3542         if (tzp)
3543                 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3544
3545         /*
3546          * notify the target directory if it is not the same
3547          * as source directory.
3548          */
3549         if (tdvp != sdvp) {
3550                 vnevent_rename_dest_dir(tdvp, ct);
3551         }
3552
3553         tx = dmu_tx_create(zfsvfs->z_os);
3554         dmu_tx_hold_bonus(tx, szp->z_id);       /* nlink changes */
3555         dmu_tx_hold_bonus(tx, sdzp->z_id);      /* nlink changes */
3556         dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3557         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3558         if (sdzp != tdzp)
3559                 dmu_tx_hold_bonus(tx, tdzp->z_id);      /* nlink changes */
3560         if (tzp)
3561                 dmu_tx_hold_bonus(tx, tzp->z_id);       /* parent changes */
3562         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3563         error = dmu_tx_assign(tx, TXG_NOWAIT);
3564         if (error) {
3565                 if (zl != NULL)
3566                         zfs_rename_unlock(&zl);
3567                 zfs_dirent_unlock(sdl);
3568                 zfs_dirent_unlock(tdl);
3569
3570                 if (sdzp == tdzp)
3571                         rw_exit(&sdzp->z_name_lock);
3572
3573                 VN_RELE(ZTOV(szp));
3574                 if (tzp)
3575                         VN_RELE(ZTOV(tzp));
3576                 if (error == ERESTART) {
3577                         dmu_tx_wait(tx);
3578                         dmu_tx_abort(tx);
3579                         goto top;
3580                 }
3581                 dmu_tx_abort(tx);
3582                 ZFS_EXIT(zfsvfs);
3583                 return (error);
3584         }
3585
3586         if (tzp)        /* Attempt to remove the existing target */
3587                 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3588
3589         if (error == 0) {
3590                 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3591                 if (error == 0) {
3592                         szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;
3593
3594                         error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3595                         ASSERT(error == 0);
3596
3597                         zfs_log_rename(zilog, tx,
3598                             TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
3599                             sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
3600
3601                         /* Update path information for the target vnode */
3602                         vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm));
3603                 }
3604 #ifdef FREEBSD_NAMECACHE
3605                 if (error == 0) {
3606                         cache_purge(sdvp);
3607                         cache_purge(tdvp);
3608                 }
3609 #endif
3610         }
3611
3612         dmu_tx_commit(tx);
3613 out:
3614         if (zl != NULL)
3615                 zfs_rename_unlock(&zl);
3616
3617         zfs_dirent_unlock(sdl);
3618         zfs_dirent_unlock(tdl);
3619
3620         if (sdzp == tdzp)
3621                 rw_exit(&sdzp->z_name_lock);
3622
3623         VN_RELE(ZTOV(szp));
3624         if (tzp)
3625                 VN_RELE(ZTOV(tzp));
3626
3627         ZFS_EXIT(zfsvfs);
3628
3629         return (error);
3630 }
3631
3632 /*
3633  * Insert the indicated symbolic reference entry into the directory.
3634  *
3635  *      IN:     dvp     - Directory to contain new symbolic link.
3636  *              link    - Name for new symlink entry.
3637  *              vap     - Attributes of new entry.
3638  *              target  - Target path of new symlink.
3639  *              cr      - credentials of caller.
3640  *              ct      - caller context
3641  *              flags   - case flags
3642  *
3643  *      RETURN: 0 if success
3644  *              error code if failure
3645  *
3646  * Timestamps:
3647  *      dvp - ctime|mtime updated
3648  */
3649 /*ARGSUSED*/
3650 static int
3651 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
3652     cred_t *cr, kthread_t *td)
3653 {
3654         znode_t         *zp, *dzp = VTOZ(dvp);
3655         zfs_dirlock_t   *dl;
3656         dmu_tx_t        *tx;
3657         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
3658         zilog_t         *zilog;
3659         int             len = strlen(link);
3660         int             error;
3661         int             zflg = ZNEW;
3662         zfs_acl_ids_t   acl_ids;
3663         boolean_t       fuid_dirtied;
3664         int             flags = 0;
3665
3666         ASSERT(vap->va_type == VLNK);
3667
3668         ZFS_ENTER(zfsvfs);
3669         ZFS_VERIFY_ZP(dzp);
3670         zilog = zfsvfs->z_log;
3671
3672         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3673             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3674                 ZFS_EXIT(zfsvfs);
3675                 return (EILSEQ);
3676         }
3677         if (flags & FIGNORECASE)
3678                 zflg |= ZCILOOK;
3679 top:
3680         if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3681                 ZFS_EXIT(zfsvfs);
3682                 return (error);
3683         }
3684
3685         if (len > MAXPATHLEN) {
3686                 ZFS_EXIT(zfsvfs);
3687                 return (ENAMETOOLONG);
3688         }
3689
3690         /*
3691          * Attempt to lock directory; fail if entry already exists.
3692          */
3693         error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3694         if (error) {
3695                 ZFS_EXIT(zfsvfs);
3696                 return (error);
3697         }
3698
3699         VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids));
3700         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
3701                 zfs_acl_ids_free(&acl_ids);
3702                 zfs_dirent_unlock(dl);
3703                 ZFS_EXIT(zfsvfs);
3704                 return (EDQUOT);
3705         }
3706         tx = dmu_tx_create(zfsvfs->z_os);
3707         fuid_dirtied = zfsvfs->z_fuid_dirty;
3708         dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3709         dmu_tx_hold_bonus(tx, dzp->z_id);
3710         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3711         if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
3712                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
3713         if (fuid_dirtied)
3714                 zfs_fuid_txhold(zfsvfs, tx);
3715         error = dmu_tx_assign(tx, TXG_NOWAIT);
3716         if (error) {
3717                 zfs_acl_ids_free(&acl_ids);
3718                 zfs_dirent_unlock(dl);
3719                 if (error == ERESTART) {
3720                         dmu_tx_wait(tx);
3721                         dmu_tx_abort(tx);
3722                         goto top;
3723                 }
3724                 dmu_tx_abort(tx);
3725                 ZFS_EXIT(zfsvfs);
3726                 return (error);
3727         }
3728
3729         dmu_buf_will_dirty(dzp->z_dbuf, tx);
3730
3731         /*
3732          * Create a new object for the symlink.
3733          * Put the link content into bonus buffer if it will fit;
3734          * otherwise, store it just like any other file data.
3735          */
3736         if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
3737                 zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids);
3738                 if (len != 0)
3739                         bcopy(link, zp->z_phys + 1, len);
3740         } else {
3741                 dmu_buf_t *dbp;
3742
3743                 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
3744
3745                 if (fuid_dirtied)
3746                         zfs_fuid_sync(zfsvfs, tx);
3747                 /*
3748                  * Nothing can access the znode yet so no locking needed
3749                  * for growing the znode's blocksize.
3750                  */
3751                 zfs_grow_blocksize(zp, len, tx);
3752
3753                 VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
3754                     zp->z_id, 0, FTAG, &dbp));
3755                 dmu_buf_will_dirty(dbp, tx);
3756
3757                 ASSERT3U(len, <=, dbp->db_size);
3758                 bcopy(link, dbp->db_data, len);
3759                 dmu_buf_rele(dbp, FTAG);
3760         }
3761         zp->z_phys->zp_size = len;
3762
3763         /*
3764          * Insert the new object into the directory.
3765          */
3766         (void) zfs_link_create(dl, zp, tx, ZNEW);
3767         if (error == 0) {
3768                 uint64_t txtype = TX_SYMLINK;
3769                 if (flags & FIGNORECASE)
3770                         txtype |= TX_CI;
3771                 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3772                 *vpp = ZTOV(zp);
3773         }
3774
3775         zfs_acl_ids_free(&acl_ids);
3776
3777         dmu_tx_commit(tx);
3778
3779         zfs_dirent_unlock(dl);
3780
3781         ZFS_EXIT(zfsvfs);
3782         return (error);
3783 }
3784
3785 /*
3786  * Return, in the buffer contained in the provided uio structure,
3787  * the symbolic path referred to by vp.
3788  *
3789  *      IN:     vp      - vnode of symbolic link.
3790  *              uoip    - structure to contain the link path.
3791  *              cr      - credentials of caller.
3792  *              ct      - caller context
3793  *
3794  *      OUT:    uio     - structure to contain the link path.
3795  *
3796  *      RETURN: 0 if success
3797  *              error code if failure
3798  *
3799  * Timestamps:
3800  *      vp - atime updated
3801  */
3802 /* ARGSUSED */
3803 static int
3804 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
3805 {
3806         znode_t         *zp = VTOZ(vp);
3807         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
3808         size_t          bufsz;
3809         int             error;
3810
3811         ZFS_ENTER(zfsvfs);
3812         ZFS_VERIFY_ZP(zp);
3813
3814         bufsz = (size_t)zp->z_phys->zp_size;
3815         if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
3816                 error = uiomove(zp->z_phys + 1,
3817                     MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
3818         } else {
3819                 dmu_buf_t *dbp;
3820                 error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
3821                 if (error) {
3822                         ZFS_EXIT(zfsvfs);
3823                         return (error);
3824                 }
3825                 error = uiomove(dbp->db_data,
3826                     MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
3827                 dmu_buf_rele(dbp, FTAG);
3828         }
3829
3830         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3831         ZFS_EXIT(zfsvfs);
3832         return (error);
3833 }
3834
3835 /*
3836  * Insert a new entry into directory tdvp referencing svp.
3837  *
3838  *      IN:     tdvp    - Directory to contain new entry.
3839  *              svp     - vnode of new entry.
3840  *              name    - name of new entry.
3841  *              cr      - credentials of caller.
3842  *              ct      - caller context
3843  *
3844  *      RETURN: 0 if success
3845  *              error code if failure
3846  *
3847  * Timestamps:
3848  *      tdvp - ctime|mtime updated
3849  *       svp - ctime updated
3850  */
3851 /* ARGSUSED */
3852 static int
3853 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
3854     caller_context_t *ct, int flags)
3855 {
3856         znode_t         *dzp = VTOZ(tdvp);
3857         znode_t         *tzp, *szp;
3858         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
3859         zilog_t         *zilog;
3860         zfs_dirlock_t   *dl;
3861         dmu_tx_t        *tx;
3862         vnode_t         *realvp;
3863         int             error;
3864         int             zf = ZNEW;
3865         uid_t           owner;
3866
3867         ASSERT(tdvp->v_type == VDIR);
3868
3869         ZFS_ENTER(zfsvfs);
3870         ZFS_VERIFY_ZP(dzp);
3871         zilog = zfsvfs->z_log;
3872
3873         if (VOP_REALVP(svp, &realvp, ct) == 0)
3874                 svp = realvp;
3875
3876         if (svp->v_vfsp != tdvp->v_vfsp) {
3877                 ZFS_EXIT(zfsvfs);
3878                 return (EXDEV);
3879         }
3880         szp = VTOZ(svp);
3881         ZFS_VERIFY_ZP(szp);
3882
3883         if (zfsvfs->z_utf8 && u8_validate(name,
3884             strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3885                 ZFS_EXIT(zfsvfs);
3886                 return (EILSEQ);
3887         }
3888         if (flags & FIGNORECASE)
3889                 zf |= ZCILOOK;
3890
3891 top:
3892         /*
3893          * We do not support links between attributes and non-attributes
3894          * because of the potential security risk of creating links
3895          * into "normal" file space in order to circumvent restrictions
3896          * imposed in attribute space.
3897          */
3898         if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
3899             (dzp->z_phys->zp_flags & ZFS_XATTR)) {
3900                 ZFS_EXIT(zfsvfs);
3901                 return (EINVAL);
3902         }
3903
3904         /*
3905          * POSIX dictates that we return EPERM here.
3906          * Better choices include ENOTSUP or EISDIR.
3907          */
3908         if (svp->v_type == VDIR) {
3909                 ZFS_EXIT(zfsvfs);
3910                 return (EPERM);
3911         }
3912
3913         owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER);
3914         if (owner != crgetuid(cr) &&
3915             secpolicy_basic_link(svp, cr) != 0) {
3916                 ZFS_EXIT(zfsvfs);
3917                 return (EPERM);
3918         }
3919
3920         if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3921                 ZFS_EXIT(zfsvfs);
3922                 return (error);
3923         }
3924
3925         /*
3926          * Attempt to lock directory; fail if entry already exists.
3927          */
3928         error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
3929         if (error) {
3930                 ZFS_EXIT(zfsvfs);
3931                 return (error);
3932         }
3933
3934         tx = dmu_tx_create(zfsvfs->z_os);
3935         dmu_tx_hold_bonus(tx, szp->z_id);
3936         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3937         error = dmu_tx_assign(tx, TXG_NOWAIT);
3938         if (error) {
3939                 zfs_dirent_unlock(dl);
3940                 if (error == ERESTART) {
3941                         dmu_tx_wait(tx);
3942                         dmu_tx_abort(tx);
3943                         goto top;
3944                 }
3945                 dmu_tx_abort(tx);
3946                 ZFS_EXIT(zfsvfs);
3947                 return (error);
3948         }
3949
3950         error = zfs_link_create(dl, szp, tx, 0);
3951
3952         if (error == 0) {
3953                 uint64_t txtype = TX_LINK;
3954                 if (flags & FIGNORECASE)
3955                         txtype |= TX_CI;
3956                 zfs_log_link(zilog, tx, txtype, dzp, szp, name);
3957         }
3958
3959         dmu_tx_commit(tx);
3960
3961         zfs_dirent_unlock(dl);
3962
3963         if (error == 0) {
3964                 vnevent_link(svp, ct);
3965         }
3966
3967         ZFS_EXIT(zfsvfs);
3968         return (error);
3969 }
3970
3971 /*ARGSUSED*/
3972 void
3973 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
3974 {
3975         znode_t *zp = VTOZ(vp);
3976         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3977         int error;
3978
3979         rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
3980         if (zp->z_dbuf == NULL) {
3981                 /*
3982                  * The fs has been unmounted, or we did a
3983                  * suspend/resume and this file no longer exists.
3984                  */
3985                 VI_LOCK(vp);
3986                 vp->v_count = 0; /* count arrives as 1 */
3987                 VI_UNLOCK(vp);
3988                 vrecycle(vp, curthread);
3989                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
3990                 return;
3991         }
3992
3993         if (zp->z_atime_dirty && zp->z_unlinked == 0) {
3994                 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
3995
3996                 dmu_tx_hold_bonus(tx, zp->z_id);
3997                 error = dmu_tx_assign(tx, TXG_WAIT);
3998                 if (error) {
3999                         dmu_tx_abort(tx);
4000                 } else {
4001                         dmu_buf_will_dirty(zp->z_dbuf, tx);
4002                         mutex_enter(&zp->z_lock);
4003                         zp->z_atime_dirty = 0;
4004                         mutex_exit(&zp->z_lock);
4005                         dmu_tx_commit(tx);
4006                 }
4007         }
4008
4009         zfs_zinactive(zp);
4010         rw_exit(&zfsvfs->z_teardown_inactive_lock);
4011 }
4012
4013 CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
4014 CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
4015
4016 /*ARGSUSED*/
4017 static int
4018 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4019 {
4020         znode_t         *zp = VTOZ(vp);
4021         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
4022         uint32_t        gen;
4023         uint64_t        object = zp->z_id;
4024         zfid_short_t    *zfid;
4025         int             size, i;
4026
4027         ZFS_ENTER(zfsvfs);
4028         ZFS_VERIFY_ZP(zp);
4029         gen = (uint32_t)zp->z_gen;
4030
4031         size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4032         fidp->fid_len = size;
4033
4034         zfid = (zfid_short_t *)fidp;
4035
4036         zfid->zf_len = size;
4037
4038         for (i = 0; i < sizeof (zfid->zf_object); i++)
4039                 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4040
4041         /* Must have a non-zero generation number to distinguish from .zfs */
4042         if (gen == 0)
4043                 gen = 1;
4044         for (i = 0; i < sizeof (zfid->zf_gen); i++)
4045                 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4046
4047         if (size == LONG_FID_LEN) {
4048                 uint64_t        objsetid = dmu_objset_id(zfsvfs->z_os);
4049                 zfid_long_t     *zlfid;
4050
4051                 zlfid = (zfid_long_t *)fidp;
4052
4053                 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4054                         zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4055
4056                 /* XXX - this should be the generation number for the objset */
4057                 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4058                         zlfid->zf_setgen[i] = 0;
4059         }
4060
4061         ZFS_EXIT(zfsvfs);
4062         return (0);
4063 }
4064
4065 static int
4066 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4067     caller_context_t *ct)
4068 {
4069         znode_t         *zp, *xzp;
4070         zfsvfs_t        *zfsvfs;
4071         zfs_dirlock_t   *dl;
4072         int             error;
4073
4074         switch (cmd) {
4075         case _PC_LINK_MAX:
4076                 *valp = INT_MAX;
4077                 return (0);
4078
4079         case _PC_FILESIZEBITS:
4080                 *valp = 64;
4081                 return (0);
4082
4083 #if 0
4084         case _PC_XATTR_EXISTS:
4085                 zp = VTOZ(vp);
4086                 zfsvfs = zp->z_zfsvfs;
4087                 ZFS_ENTER(zfsvfs);
4088                 ZFS_VERIFY_ZP(zp);
4089                 *valp = 0;
4090                 error = zfs_dirent_lock(&dl, zp, "", &xzp,
4091                     ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
4092                 if (error == 0) {
4093                         zfs_dirent_unlock(dl);
4094                         if (!zfs_dirempty(xzp))
4095                                 *valp = 1;
4096                         VN_RELE(ZTOV(xzp));
4097                 } else if (error == ENOENT) {
4098                         /*
4099                          * If there aren't extended attributes, it's the
4100                          * same as having zero of them.
4101                          */
4102                         error = 0;
4103                 }
4104                 ZFS_EXIT(zfsvfs);
4105                 return (error);
4106 #endif
4107
4108         case _PC_ACL_EXTENDED:
4109                 *valp = 0;
4110                 return (0);
4111
4112         case _PC_ACL_NFS4:
4113                 *valp = 1;
4114                 return (0);
4115
4116         case _PC_ACL_PATH_MAX:
4117                 *valp = ACL_MAX_ENTRIES;
4118                 return (0);
4119
4120         case _PC_MIN_HOLE_SIZE:
4121                 *valp = (int)SPA_MINBLOCKSIZE;
4122                 return (0);
4123
4124         default:
4125                 return (EOPNOTSUPP);
4126         }
4127 }
4128
4129 /*ARGSUSED*/
4130 static int
4131 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4132     caller_context_t *ct)
4133 {
4134         znode_t *zp = VTOZ(vp);
4135         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4136         int error;
4137         boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4138
4139         ZFS_ENTER(zfsvfs);
4140         ZFS_VERIFY_ZP(zp);
4141         error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4142         ZFS_EXIT(zfsvfs);
4143
4144         return (error);
4145 }
4146
4147 /*ARGSUSED*/
4148 static int
4149 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4150     caller_context_t *ct)
4151 {
4152         znode_t *zp = VTOZ(vp);
4153         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4154         int error;
4155         boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4156
4157         ZFS_ENTER(zfsvfs);
4158         ZFS_VERIFY_ZP(zp);
4159         error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4160         ZFS_EXIT(zfsvfs);
4161         return (error);
4162 }
4163
4164 static int
4165 zfs_freebsd_open(ap)
4166         struct vop_open_args /* {
4167                 struct vnode *a_vp;
4168                 int a_mode;
4169                 struct ucred *a_cred;
4170                 struct thread *a_td;
4171         } */ *ap;
4172 {
4173         vnode_t *vp = ap->a_vp;
4174         znode_t *zp = VTOZ(vp);
4175         int error;
4176
4177         error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
4178         if (error == 0)
4179                 vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td);
4180         return (error);
4181 }
4182
4183 static int
4184 zfs_freebsd_close(ap)
4185         struct vop_close_args /* {
4186                 struct vnode *a_vp;
4187                 int  a_fflag;
4188                 struct ucred *a_cred;
4189                 struct thread *a_td;
4190         } */ *ap;
4191 {
4192
4193         return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL));
4194 }
4195
4196 static int
4197 zfs_freebsd_ioctl(ap)
4198         struct vop_ioctl_args /* {
4199                 struct vnode *a_vp;
4200                 u_long a_command;
4201                 caddr_t a_data;
4202                 int a_fflag;
4203                 struct ucred *cred;
4204                 struct thread *td;
4205         } */ *ap;
4206 {
4207
4208         return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4209             ap->a_fflag, ap->a_cred, NULL, NULL));
4210 }
4211
4212 static int
4213 zfs_freebsd_read(ap)
4214         struct vop_read_args /* {
4215                 struct vnode *a_vp;
4216                 struct uio *a_uio;
4217                 int a_ioflag;
4218                 struct ucred *a_cred;
4219         } */ *ap;
4220 {
4221
4222         return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
4223 }
4224
4225 static int
4226 zfs_freebsd_write(ap)
4227         struct vop_write_args /* {
4228                 struct vnode *a_vp;
4229                 struct uio *a_uio;
4230                 int a_ioflag;
4231                 struct ucred *a_cred;
4232         } */ *ap;
4233 {
4234
4235         return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
4236 }
4237
4238 static int
4239 zfs_freebsd_access(ap)
4240         struct vop_access_args /* {
4241                 struct vnode *a_vp;
4242                 accmode_t a_accmode;
4243                 struct ucred *a_cred;
4244                 struct thread *a_td;
4245         } */ *ap;
4246 {
4247         accmode_t accmode;
4248         int error = 0;
4249
4250         /*
4251          * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
4252          */
4253         accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
4254         if (accmode != 0)
4255                 error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
4256
4257         /*
4258          * VADMIN has to be handled by vaccess().
4259          */
4260         if (error == 0) {
4261                 accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
4262                 if (accmode != 0) {
4263                         vnode_t *vp = ap->a_vp;
4264                         znode_t *zp = VTOZ(vp);
4265                         znode_phys_t *zphys = zp->z_phys;
4266
4267                         error = vaccess(vp->v_type, zphys->zp_mode,
4268                             zphys->zp_uid, zphys->zp_gid, accmode, ap->a_cred,
4269                             NULL);
4270                 }
4271         }
4272
4273         return (error);
4274 }
4275
4276 static int
4277 zfs_freebsd_lookup(ap)
4278         struct vop_lookup_args /* {
4279                 struct vnode *a_dvp;
4280                 struct vnode **a_vpp;
4281                 struct componentname *a_cnp;
4282         } */ *ap;
4283 {
4284         struct componentname *cnp = ap->a_cnp;
4285         char nm[NAME_MAX + 1];
4286
4287         ASSERT(cnp->cn_namelen < sizeof(nm));
4288         strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
4289
4290         return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
4291             cnp->cn_cred, cnp->cn_thread, 0));
4292 }
4293
4294 static int
4295 zfs_freebsd_create(ap)
4296         struct vop_create_args /* {
4297                 struct vnode *a_dvp;
4298                 struct vnode **a_vpp;
4299                 struct componentname *a_cnp;
4300                 struct vattr *a_vap;
4301         } */ *ap;
4302 {
4303         struct componentname *cnp = ap->a_cnp;
4304         vattr_t *vap = ap->a_vap;
4305         int mode;
4306
4307         ASSERT(cnp->cn_flags & SAVENAME);
4308
4309         vattr_init_mask(vap);
4310         mode = vap->va_mode & ALLPERMS;
4311
4312         return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
4313             ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
4314 }
4315
4316 static int
4317 zfs_freebsd_remove(ap)
4318         struct vop_remove_args /* {
4319                 struct vnode *a_dvp;
4320                 struct vnode *a_vp;
4321                 struct componentname *a_cnp;
4322         } */ *ap;
4323 {
4324
4325         ASSERT(ap->a_cnp->cn_flags & SAVENAME);
4326
4327         return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
4328             ap->a_cnp->cn_cred, NULL, 0));
4329 }
4330
4331 static int
4332 zfs_freebsd_mkdir(ap)
4333         struct vop_mkdir_args /* {
4334                 struct vnode *a_dvp;
4335                 struct vnode **a_vpp;
4336                 struct componentname *a_cnp;
4337                 struct vattr *a_vap;
4338         } */ *ap;
4339 {
4340         vattr_t *vap = ap->a_vap;
4341
4342         ASSERT(ap->a_cnp->cn_flags & SAVENAME);
4343
4344         vattr_init_mask(vap);
4345
4346         return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
4347             ap->a_cnp->cn_cred, NULL, 0, NULL));
4348 }
4349
4350 static int
4351 zfs_freebsd_rmdir(ap)
4352         struct vop_rmdir_args /* {
4353                 struct vnode *a_dvp;
4354                 struct vnode *a_vp;
4355                 struct componentname *a_cnp;
4356         } */ *ap;
4357 {
4358         struct componentname *cnp = ap->a_cnp;
4359
4360         ASSERT(cnp->cn_flags & SAVENAME);
4361
4362         return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
4363 }
4364
4365 static int
4366 zfs_freebsd_readdir(ap)
4367         struct vop_readdir_args /* {
4368                 struct vnode *a_vp;
4369                 struct uio *a_uio;
4370                 struct ucred *a_cred;
4371                 int *a_eofflag;
4372                 int *a_ncookies;
4373                 u_long **a_cookies;
4374         } */ *ap;
4375 {
4376
4377         return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
4378             ap->a_ncookies, ap->a_cookies));
4379 }
4380
4381 static int
4382 zfs_freebsd_fsync(ap)
4383         struct vop_fsync_args /* {
4384                 struct vnode *a_vp;
4385                 int a_waitfor;
4386                 struct thread *a_td;
4387         } */ *ap;
4388 {
4389
4390         vop_stdfsync(ap);
4391         return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
4392 }
4393
4394 static int
4395 zfs_freebsd_getattr(ap)
4396         struct vop_getattr_args /* {
4397                 struct vnode *a_vp;
4398                 struct vattr *a_vap;
4399                 struct ucred *a_cred;
4400                 struct thread *a_td;
4401         } */ *ap;
4402 {
4403         vattr_t *vap = ap->a_vap;
4404         xvattr_t xvap;
4405         u_long fflags = 0;
4406         int error;
4407
4408         xva_init(&xvap);
4409         xvap.xva_vattr = *vap;
4410         xvap.xva_vattr.va_mask |= AT_XVATTR;
4411
4412         /* Convert chflags into ZFS-type flags. */
4413         /* XXX: what about SF_SETTABLE?. */
4414         XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
4415         XVA_SET_REQ(&xvap, XAT_APPENDONLY);
4416         XVA_SET_REQ(&xvap, XAT_NOUNLINK);
4417         XVA_SET_REQ(&xvap, XAT_NODUMP);
4418         error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
4419         if (error != 0)
4420                 return (error);
4421
4422         /* Convert ZFS xattr into chflags. */
4423 #define FLAG_CHECK(fflag, xflag, xfield)        do {                    \
4424         if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)             \
4425                 fflags |= (fflag);                                      \
4426 } while (0)
4427         FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
4428             xvap.xva_xoptattrs.xoa_immutable);
4429         FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
4430             xvap.xva_xoptattrs.xoa_appendonly);
4431         FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
4432             xvap.xva_xoptattrs.xoa_nounlink);
4433         FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
4434             xvap.xva_xoptattrs.xoa_nodump);
4435 #undef  FLAG_CHECK
4436         *vap = xvap.xva_vattr;
4437         vap->va_flags = fflags;
4438         return (0);
4439 }
4440
4441 static int
4442 zfs_freebsd_setattr(ap)
4443         struct vop_setattr_args /* {
4444                 struct vnode *a_vp;
4445                 struct vattr *a_vap;
4446                 struct ucred *a_cred;
4447                 struct thread *a_td;
4448         } */ *ap;
4449 {
4450         vnode_t *vp = ap->a_vp;
4451         vattr_t *vap = ap->a_vap;
4452         cred_t *cred = ap->a_cred;
4453         xvattr_t xvap;
4454         u_long fflags;
4455         uint64_t zflags;
4456
4457         vattr_init_mask(vap);
4458         vap->va_mask &= ~AT_NOSET;
4459
4460         xva_init(&xvap);
4461         xvap.xva_vattr = *vap;
4462
4463         zflags = VTOZ(vp)->z_phys->zp_flags;
4464
4465         if (vap->va_flags != VNOVAL) {
4466                 zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
4467                 int error;
4468
4469                 if (zfsvfs->z_use_fuids == B_FALSE)
4470                         return (EOPNOTSUPP);
4471
4472                 fflags = vap->va_flags;
4473                 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
4474                         return (EOPNOTSUPP);
4475                 /*
4476                  * Unprivileged processes are not permitted to unset system
4477                  * flags, or modify flags if any system flags are set.
4478                  * Privileged non-jail processes may not modify system flags
4479                  * if securelevel > 0 and any existing system flags are set.
4480                  * Privileged jail processes behave like privileged non-jail
4481                  * processes if the security.jail.chflags_allowed sysctl is
4482                  * is non-zero; otherwise, they behave like unprivileged
4483                  * processes.
4484                  */
4485                 if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
4486                     priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
4487                         if (zflags &
4488                             (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
4489                                 error = securelevel_gt(cred, 0);
4490                                 if (error != 0)
4491                                         return (error);
4492                         }
4493                 } else {
4494                         /*
4495                          * Callers may only modify the file flags on objects they
4496                          * have VADMIN rights for.
4497                          */
4498                         if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
4499                                 return (error);
4500                         if (zflags &
4501                             (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
4502                                 return (EPERM);
4503                         }
4504                         if (fflags &
4505                             (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
4506                                 return (EPERM);
4507                         }
4508                 }
4509
4510 #define FLAG_CHANGE(fflag, zflag, xflag, xfield)        do {            \
4511         if (((fflags & (fflag)) && !(zflags & (zflag))) ||              \
4512             ((zflags & (zflag)) && !(fflags & (fflag)))) {              \
4513                 XVA_SET_REQ(&xvap, (xflag));                            \
4514                 (xfield) = ((fflags & (fflag)) != 0);                   \
4515         }                                                               \
4516 } while (0)
4517                 /* Convert chflags into ZFS-type flags. */
4518                 /* XXX: what about SF_SETTABLE?. */
4519                 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
4520                     xvap.xva_xoptattrs.xoa_immutable);
4521                 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
4522                     xvap.xva_xoptattrs.xoa_appendonly);
4523                 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
4524                     xvap.xva_xoptattrs.xoa_nounlink);
4525                 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
4526                     xvap.xva_xoptattrs.xoa_nodump);
4527 #undef  FLAG_CHANGE
4528         }
4529         return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
4530 }
4531
4532 static int
4533 zfs_freebsd_rename(ap)
4534         struct vop_rename_args  /* {
4535                 struct vnode *a_fdvp;
4536                 struct vnode *a_fvp;
4537                 struct componentname *a_fcnp;
4538                 struct vnode *a_tdvp;
4539                 struct vnode *a_tvp;
4540                 struct componentname *a_tcnp;
4541         } */ *ap;
4542 {
4543         vnode_t *fdvp = ap->a_fdvp;
4544         vnode_t *fvp = ap->a_fvp;
4545         vnode_t *tdvp = ap->a_tdvp;
4546         vnode_t *tvp = ap->a_tvp;
4547         int error;
4548
4549         ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
4550         ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
4551
4552         error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
4553             ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
4554
4555         if (tdvp == tvp)
4556                 VN_RELE(tdvp);
4557         else
4558                 VN_URELE(tdvp);
4559         if (tvp)
4560                 VN_URELE(tvp);
4561         VN_RELE(fdvp);
4562         VN_RELE(fvp);
4563
4564         return (error);
4565 }
4566
4567 static int
4568 zfs_freebsd_symlink(ap)
4569         struct vop_symlink_args /* {
4570                 struct vnode *a_dvp;
4571                 struct vnode **a_vpp;
4572                 struct componentname *a_cnp;
4573                 struct vattr *a_vap;
4574                 char *a_target;
4575         } */ *ap;
4576 {
4577         struct componentname *cnp = ap->a_cnp;
4578         vattr_t *vap = ap->a_vap;
4579
4580         ASSERT(cnp->cn_flags & SAVENAME);
4581
4582         vap->va_type = VLNK;    /* FreeBSD: Syscall only sets va_mode. */
4583         vattr_init_mask(vap);
4584
4585         return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
4586             ap->a_target, cnp->cn_cred, cnp->cn_thread));
4587 }
4588
4589 static int
4590 zfs_freebsd_readlink(ap)
4591         struct vop_readlink_args /* {
4592                 struct vnode *a_vp;
4593                 struct uio *a_uio;
4594                 struct ucred *a_cred;
4595         } */ *ap;
4596 {
4597
4598         return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
4599 }
4600
4601 static int
4602 zfs_freebsd_link(ap)
4603         struct vop_link_args /* {
4604                 struct vnode *a_tdvp;
4605                 struct vnode *a_vp;
4606                 struct componentname *a_cnp;
4607         } */ *ap;
4608 {
4609         struct componentname *cnp = ap->a_cnp;
4610
4611         ASSERT(cnp->cn_flags & SAVENAME);
4612
4613         return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
4614 }
4615
4616 static int
4617 zfs_freebsd_inactive(ap)
4618         struct vop_inactive_args /* {
4619                 struct vnode *a_vp;
4620                 struct thread *a_td;
4621         } */ *ap;
4622 {
4623         vnode_t *vp = ap->a_vp;
4624
4625         zfs_inactive(vp, ap->a_td->td_ucred, NULL);
4626         return (0);
4627 }
4628
4629 static void
4630 zfs_reclaim_complete(void *arg, int pending)
4631 {
4632         znode_t *zp = arg;
4633         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4634
4635         rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4636         if (zp->z_dbuf != NULL) {
4637                 ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
4638                 zfs_znode_dmu_fini(zp);
4639                 ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
4640         }
4641         zfs_znode_free(zp);
4642         rw_exit(&zfsvfs->z_teardown_inactive_lock);
4643         /*
4644          * If the file system is being unmounted, there is a process waiting
4645          * for us, wake it up.
4646          */
4647         if (zfsvfs->z_unmounted)
4648                 wakeup_one(zfsvfs);
4649 }
4650
4651 static int
4652 zfs_freebsd_reclaim(ap)
4653         struct vop_reclaim_args /* {
4654                 struct vnode *a_vp;
4655                 struct thread *a_td;
4656         } */ *ap;
4657 {
4658         vnode_t *vp = ap->a_vp;
4659         znode_t *zp = VTOZ(vp);
4660         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4661
4662         rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4663
4664         ASSERT(zp != NULL);
4665
4666         /*
4667          * Destroy the vm object and flush associated pages.
4668          */
4669         vnode_destroy_vobject(vp);
4670
4671         mutex_enter(&zp->z_lock);
4672         ASSERT(zp->z_phys != NULL);
4673         zp->z_vnode = NULL;
4674         mutex_exit(&zp->z_lock);
4675
4676         if (zp->z_unlinked)
4677                 ;       /* Do nothing. */
4678         else if (zp->z_dbuf == NULL)
4679                 zfs_znode_free(zp);
4680         else /* if (!zp->z_unlinked && zp->z_dbuf != NULL) */ {
4681                 int locked;
4682
4683                 locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 :
4684                     ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id);
4685                 if (locked == 0) {
4686                         /*
4687                          * Lock can't be obtained due to deadlock possibility,
4688                          * so defer znode destruction.
4689                          */
4690                         TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp);
4691                         taskqueue_enqueue(taskqueue_thread, &zp->z_task);
4692                 } else {
4693                         zfs_znode_dmu_fini(zp);
4694                         if (locked == 1)
4695                                 ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
4696                         zfs_znode_free(zp);
4697                 }
4698         }
4699         VI_LOCK(vp);
4700         vp->v_data = NULL;
4701         ASSERT(vp->v_holdcnt >= 1);
4702         VI_UNLOCK(vp);
4703         rw_exit(&zfsvfs->z_teardown_inactive_lock);
4704         return (0);
4705 }
4706
4707 static int
4708 zfs_freebsd_fid(ap)
4709         struct vop_fid_args /* {
4710                 struct vnode *a_vp;
4711                 struct fid *a_fid;
4712         } */ *ap;
4713 {
4714
4715         return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
4716 }
4717
4718 static int
4719 zfs_freebsd_pathconf(ap)
4720         struct vop_pathconf_args /* {
4721                 struct vnode *a_vp;
4722                 int a_name;
4723                 register_t *a_retval;
4724         } */ *ap;
4725 {
4726         ulong_t val;
4727         int error;
4728
4729         error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
4730         if (error == 0)
4731                 *ap->a_retval = val;
4732         else if (error == EOPNOTSUPP)
4733                 error = vop_stdpathconf(ap);
4734         return (error);
4735 }
4736
4737 static int
4738 zfs_freebsd_fifo_pathconf(ap)
4739         struct vop_pathconf_args /* {
4740                 struct vnode *a_vp;
4741                 int a_name;
4742                 register_t *a_retval;
4743         } */ *ap;
4744 {
4745
4746         switch (ap->a_name) {
4747         case _PC_ACL_EXTENDED:
4748         case _PC_ACL_NFS4:
4749         case _PC_ACL_PATH_MAX:
4750         case _PC_MAC_PRESENT:
4751                 return (zfs_freebsd_pathconf(ap));
4752         default:
4753                 return (fifo_specops.vop_pathconf(ap));
4754         }
4755 }
4756
4757 /*
4758  * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
4759  * extended attribute name:
4760  *
4761  *      NAMESPACE       PREFIX
4762  *      system          freebsd:system:
4763  *      user            (none, can be used to access ZFS fsattr(5) attributes
4764  *                      created on Solaris)
4765  */
4766 static int
4767 zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
4768     size_t size)
4769 {
4770         const char *namespace, *prefix, *suffix;
4771
4772         /* We don't allow '/' character in attribute name. */
4773         if (strchr(name, '/') != NULL)
4774                 return (EINVAL);
4775         /* We don't allow attribute names that start with "freebsd:" string. */
4776         if (strncmp(name, "freebsd:", 8) == 0)
4777                 return (EINVAL);
4778
4779         bzero(attrname, size);
4780
4781         switch (attrnamespace) {
4782         case EXTATTR_NAMESPACE_USER:
4783 #if 0
4784                 prefix = "freebsd:";
4785                 namespace = EXTATTR_NAMESPACE_USER_STRING;
4786                 suffix = ":";
4787 #else
4788                 /*
4789                  * This is the default namespace by which we can access all
4790                  * attributes created on Solaris.
4791                  */
4792                 prefix = namespace = suffix = "";
4793 #endif
4794                 break;
4795         case EXTATTR_NAMESPACE_SYSTEM:
4796                 prefix = "freebsd:";
4797                 namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
4798                 suffix = ":";
4799                 break;
4800         case EXTATTR_NAMESPACE_EMPTY:
4801         default:
4802                 return (EINVAL);
4803         }
4804         if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
4805             name) >= size) {
4806                 return (ENAMETOOLONG);
4807         }
4808         return (0);
4809 }
4810
4811 /*
4812  * Vnode operating to retrieve a named extended attribute.
4813  */
4814 static int
4815 zfs_getextattr(struct vop_getextattr_args *ap)
4816 /*
4817 vop_getextattr {
4818         IN struct vnode *a_vp;
4819         IN int a_attrnamespace;
4820         IN const char *a_name;
4821         INOUT struct uio *a_uio;
4822         OUT size_t *a_size;
4823         IN struct ucred *a_cred;
4824         IN struct thread *a_td;
4825 };
4826 */
4827 {
4828         zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
4829         struct thread *td = ap->a_td;
4830         struct nameidata nd;
4831         char attrname[255];
4832         struct vattr va;
4833         vnode_t *xvp = NULL, *vp;
4834         int error, flags;
4835
4836         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
4837             ap->a_cred, ap->a_td, VREAD);
4838         if (error != 0)
4839                 return (error);
4840
4841         error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
4842             sizeof(attrname));
4843         if (error != 0)
4844                 return (error);
4845
4846         ZFS_ENTER(zfsvfs);
4847
4848         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
4849             LOOKUP_XATTR);
4850         if (error != 0) {
4851                 ZFS_EXIT(zfsvfs);
4852                 return (error);
4853         }
4854
4855         flags = FREAD;
4856         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
4857             xvp, td);
4858         error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
4859         vp = nd.ni_vp;
4860         NDFREE(&nd, NDF_ONLY_PNBUF);
4861         if (error != 0) {
4862                 ZFS_EXIT(zfsvfs);
4863                 if (error == ENOENT)
4864                         error = ENOATTR;
4865                 return (error);
4866         }
4867
4868         if (ap->a_size != NULL) {
4869                 error = VOP_GETATTR(vp, &va, ap->a_cred);
4870                 if (error == 0)
4871                         *ap->a_size = (size_t)va.va_size;
4872         } else if (ap->a_uio != NULL)
4873                 error = VOP_READ(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);
4874
4875         VOP_UNLOCK(vp, 0);
4876         vn_close(vp, flags, ap->a_cred, td);
4877         ZFS_EXIT(zfsvfs);
4878
4879         return (error);
4880 }
4881
4882 /*
4883  * Vnode operation to remove a named attribute.
4884  */
4885 int
4886 zfs_deleteextattr(struct vop_deleteextattr_args *ap)
4887 /*
4888 vop_deleteextattr {
4889         IN struct vnode *a_vp;
4890         IN int a_attrnamespace;
4891         IN const char *a_name;
4892         IN struct ucred *a_cred;
4893         IN struct thread *a_td;
4894 };
4895 */
4896 {
4897         zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
4898         struct thread *td = ap->a_td;
4899         struct nameidata nd;
4900         char attrname[255];
4901         struct vattr va;
4902         vnode_t *xvp = NULL, *vp;
4903         int error, flags;
4904
4905         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
4906             ap->a_cred, ap->a_td, VWRITE);
4907         if (error != 0)
4908                 return (error);
4909
4910         error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
4911             sizeof(attrname));
4912         if (error != 0)
4913                 return (error);
4914
4915         ZFS_ENTER(zfsvfs);
4916
4917         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
4918             LOOKUP_XATTR);
4919         if (error != 0) {
4920                 ZFS_EXIT(zfsvfs);
4921                 return (error);
4922         }
4923
4924         NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF | MPSAFE,
4925             UIO_SYSSPACE, attrname, xvp, td);
4926         error = namei(&nd);
4927         vp = nd.ni_vp;
4928         NDFREE(&nd, NDF_ONLY_PNBUF);
4929         if (error != 0) {
4930                 ZFS_EXIT(zfsvfs);
4931                 if (error == ENOENT)
4932                         error = ENOATTR;
4933                 return (error);
4934         }
4935         error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
4936
4937         vput(nd.ni_dvp);
4938         if (vp == nd.ni_dvp)
4939                 vrele(vp);
4940         else
4941                 vput(vp);
4942         ZFS_EXIT(zfsvfs);
4943
4944         return (error);
4945 }
4946
4947 /*
4948  * Vnode operation to set a named attribute.
4949  */
4950 static int
4951 zfs_setextattr(struct vop_setextattr_args *ap)
4952 /*
4953 vop_setextattr {
4954         IN struct vnode *a_vp;
4955         IN int a_attrnamespace;
4956         IN const char *a_name;
4957         INOUT struct uio *a_uio;
4958         IN struct ucred *a_cred;
4959         IN struct thread *a_td;
4960 };
4961 */
4962 {
4963         zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
4964         struct thread *td = ap->a_td;
4965         struct nameidata nd;
4966         char attrname[255];
4967         struct vattr va;
4968         vnode_t *xvp = NULL, *vp;
4969         int error, flags;
4970
4971         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
4972             ap->a_cred, ap->a_td, VWRITE);
4973         if (error != 0)
4974                 return (error);
4975
4976         error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
4977             sizeof(attrname));
4978         if (error != 0)
4979                 return (error);
4980
4981         ZFS_ENTER(zfsvfs);
4982
4983         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
4984             LOOKUP_XATTR | CREATE_XATTR_DIR);
4985         if (error != 0) {
4986                 ZFS_EXIT(zfsvfs);
4987                 return (error);
4988         }
4989
4990         flags = FFLAGS(O_WRONLY | O_CREAT);
4991         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
4992             xvp, td);
4993         error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
4994         vp = nd.ni_vp;
4995         NDFREE(&nd, NDF_ONLY_PNBUF);
4996         if (error != 0) {
4997                 ZFS_EXIT(zfsvfs);
4998                 return (error);
4999         }
5000
5001         VATTR_NULL(&va);
5002         va.va_size = 0;
5003         error = VOP_SETATTR(vp, &va, ap->a_cred);
5004         if (error == 0)
5005                 VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);
5006
5007         VOP_UNLOCK(vp, 0);
5008         vn_close(vp, flags, ap->a_cred, td);
5009         ZFS_EXIT(zfsvfs);
5010
5011         return (error);
5012 }
5013
5014 /*
5015  * Vnode operation to retrieve extended attributes on a vnode.
5016  */
5017 static int
5018 zfs_listextattr(struct vop_listextattr_args *ap)
5019 /*
5020 vop_listextattr {
5021         IN struct vnode *a_vp;
5022         IN int a_attrnamespace;
5023         INOUT struct uio *a_uio;
5024         OUT size_t *a_size;
5025         IN struct ucred *a_cred;
5026         IN struct thread *a_td;
5027 };
5028 */
5029 {
5030         zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5031         struct thread *td = ap->a_td;
5032         struct nameidata nd;
5033         char attrprefix[16];
5034         u_char dirbuf[sizeof(struct dirent)];
5035         struct dirent *dp;
5036         struct iovec aiov;
5037         struct uio auio, *uio = ap->a_uio;
5038         size_t *sizep = ap->a_size;
5039         size_t plen;
5040         vnode_t *xvp = NULL, *vp;
5041         int done, error, eof, pos;
5042
5043         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5044             ap->a_cred, ap->a_td, VREAD);
5045         if (error != 0)
5046                 return (error);
5047
5048         error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
5049             sizeof(attrprefix));
5050         if (error != 0)
5051                 return (error);
5052         plen = strlen(attrprefix);
5053
5054         ZFS_ENTER(zfsvfs);
5055
5056         if (sizep != NULL)
5057                 *sizep = 0;
5058
5059         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5060             LOOKUP_XATTR);
5061         if (error != 0) {
5062                 ZFS_EXIT(zfsvfs);
5063                 /*
5064                  * ENOATTR means that the EA directory does not yet exist,
5065                  * i.e. there are no extended attributes there.
5066                  */
5067                 if (error == ENOATTR)
5068                         error = 0;
5069                 return (error);
5070         }
5071
5072         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE,
5073             UIO_SYSSPACE, ".", xvp, td);
5074         error = namei(&nd);
5075         vp = nd.ni_vp;
5076         NDFREE(&nd, NDF_ONLY_PNBUF);
5077         if (error != 0) {
5078                 ZFS_EXIT(zfsvfs);
5079                 return (error);
5080         }
5081
5082         auio.uio_iov = &aiov;
5083         auio.uio_iovcnt = 1;
5084         auio.uio_segflg = UIO_SYSSPACE;
5085         auio.uio_td = td;
5086         auio.uio_rw = UIO_READ;
5087         auio.uio_offset = 0;
5088
5089         do {
5090                 u_char nlen;
5091
5092                 aiov.iov_base = (void *)dirbuf;
5093                 aiov.iov_len = sizeof(dirbuf);
5094                 auio.uio_resid = sizeof(dirbuf);
5095                 error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
5096                 done = sizeof(dirbuf) - auio.uio_resid;
5097                 if (error != 0)
5098                         break;
5099                 for (pos = 0; pos < done;) {
5100                         dp = (struct dirent *)(dirbuf + pos);
5101                         pos += dp->d_reclen;
5102                         /*
5103                          * XXX: Temporarily we also accept DT_UNKNOWN, as this
5104                          * is what we get when attribute was created on Solaris.
5105                          */
5106                         if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
5107                                 continue;
5108                         if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
5109                                 continue;
5110                         else if (strncmp(dp->d_name, attrprefix, plen) != 0)
5111                                 continue;
5112                         nlen = dp->d_namlen - plen;
5113                         if (sizep != NULL)
5114                                 *sizep += 1 + nlen;
5115                         else if (uio != NULL) {
5116                                 /*
5117                                  * Format of extattr name entry is one byte for
5118                                  * length and the rest for name.
5119                                  */
5120                                 error = uiomove(&nlen, 1, uio->uio_rw, uio);
5121                                 if (error == 0) {
5122                                         error = uiomove(dp->d_name + plen, nlen,
5123                                             uio->uio_rw, uio);
5124                                 }
5125                                 if (error != 0)
5126                                         break;
5127                         }
5128                 }
5129         } while (!eof && error == 0);
5130
5131         vput(vp);
5132         ZFS_EXIT(zfsvfs);
5133
5134         return (error);
5135 }
5136
5137 int
5138 zfs_freebsd_getacl(ap)
5139         struct vop_getacl_args /* {
5140                 struct vnode *vp;
5141                 acl_type_t type;
5142                 struct acl *aclp;
5143                 struct ucred *cred;
5144                 struct thread *td;
5145         } */ *ap;
5146 {
5147         int             error;
5148         vsecattr_t      vsecattr;
5149
5150         if (ap->a_type != ACL_TYPE_NFS4)
5151                 return (EINVAL);
5152
5153         vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
5154         if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
5155                 return (error);
5156
5157         error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
5158         if (vsecattr.vsa_aclentp != NULL)
5159                 kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
5160
5161         return (error);
5162 }
5163
5164 int
5165 zfs_freebsd_setacl(ap)
5166         struct vop_setacl_args /* {
5167                 struct vnode *vp;
5168                 acl_type_t type;
5169                 struct acl *aclp;
5170                 struct ucred *cred;
5171                 struct thread *td;
5172         } */ *ap;
5173 {
5174         int             error;
5175         vsecattr_t      vsecattr;
5176         int             aclbsize;       /* size of acl list in bytes */
5177         aclent_t        *aaclp;
5178
5179         if (ap->a_type != ACL_TYPE_NFS4)
5180                 return (EINVAL);
5181
5182         if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
5183                 return (EINVAL);
5184
5185         /*
5186          * With NFSv4 ACLs, chmod(2) may need to add additional entries,
5187          * splitting every entry into two and appending "canonical six"
5188          * entries at the end.  Don't allow for setting an ACL that would
5189          * cause chmod(2) to run out of ACL entries.
5190          */
5191         if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
5192                 return (ENOSPC);
5193
5194         error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
5195         if (error != 0)
5196                 return (error);
5197
5198         vsecattr.vsa_mask = VSA_ACE;
5199         aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
5200         vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
5201         aaclp = vsecattr.vsa_aclentp;
5202         vsecattr.vsa_aclentsz = aclbsize;
5203
5204         aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
5205         error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
5206         kmem_free(aaclp, aclbsize);
5207
5208         return (error);
5209 }
5210
/*
 * Vnode operation to check an ACL before it is set.
 *
 * This implementation performs no checking of its own: it ignores its
 * arguments and always returns EOPNOTSUPP.
 */
int
zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
{

        return (EOPNOTSUPP);
}
5224
5225 struct vop_vector zfs_vnodeops;
5226 struct vop_vector zfs_fifoops;
5227 struct vop_vector zfs_shareops;
5228
5229 struct vop_vector zfs_vnodeops = {
5230         .vop_default =          &default_vnodeops,
5231         .vop_inactive =         zfs_freebsd_inactive,
5232         .vop_reclaim =          zfs_freebsd_reclaim,
5233         .vop_access =           zfs_freebsd_access,
5234 #ifdef FREEBSD_NAMECACHE
5235         .vop_lookup =           vfs_cache_lookup,
5236         .vop_cachedlookup =     zfs_freebsd_lookup,
5237 #else
5238         .vop_lookup =           zfs_freebsd_lookup,
5239 #endif
5240         .vop_getattr =          zfs_freebsd_getattr,
5241         .vop_setattr =          zfs_freebsd_setattr,
5242         .vop_create =           zfs_freebsd_create,
5243         .vop_mknod =            zfs_freebsd_create,
5244         .vop_mkdir =            zfs_freebsd_mkdir,
5245         .vop_readdir =          zfs_freebsd_readdir,
5246         .vop_fsync =            zfs_freebsd_fsync,
5247         .vop_open =             zfs_freebsd_open,
5248         .vop_close =            zfs_freebsd_close,
5249         .vop_rmdir =            zfs_freebsd_rmdir,
5250         .vop_ioctl =            zfs_freebsd_ioctl,
5251         .vop_link =             zfs_freebsd_link,
5252         .vop_symlink =          zfs_freebsd_symlink,
5253         .vop_readlink =         zfs_freebsd_readlink,
5254         .vop_read =             zfs_freebsd_read,
5255         .vop_write =            zfs_freebsd_write,
5256         .vop_remove =           zfs_freebsd_remove,
5257         .vop_rename =           zfs_freebsd_rename,
5258         .vop_pathconf =         zfs_freebsd_pathconf,
5259         .vop_bmap =             VOP_EOPNOTSUPP,
5260         .vop_fid =              zfs_freebsd_fid,
5261         .vop_getextattr =       zfs_getextattr,
5262         .vop_deleteextattr =    zfs_deleteextattr,
5263         .vop_setextattr =       zfs_setextattr,
5264         .vop_listextattr =      zfs_listextattr,
5265         .vop_getacl =           zfs_freebsd_getacl,
5266         .vop_setacl =           zfs_freebsd_setacl,
5267         .vop_aclcheck =         zfs_freebsd_aclcheck,
5268 };
5269
5270 struct vop_vector zfs_fifoops = {
5271         .vop_default =          &fifo_specops,
5272         .vop_fsync =            zfs_freebsd_fsync,
5273         .vop_access =           zfs_freebsd_access,
5274         .vop_getattr =          zfs_freebsd_getattr,
5275         .vop_inactive =         zfs_freebsd_inactive,
5276         .vop_read =             VOP_PANIC,
5277         .vop_reclaim =          zfs_freebsd_reclaim,
5278         .vop_setattr =          zfs_freebsd_setattr,
5279         .vop_write =            VOP_PANIC,
5280         .vop_pathconf =         zfs_freebsd_fifo_pathconf,
5281         .vop_fid =              zfs_freebsd_fid,
5282         .vop_getacl =           zfs_freebsd_getacl,
5283         .vop_setacl =           zfs_freebsd_setacl,
5284         .vop_aclcheck =         zfs_freebsd_aclcheck,
5285 };
5286
5287 /*
5288  * special share hidden files vnode operations template
5289  */
5290 struct vop_vector zfs_shareops = {
5291         .vop_default =          &default_vnodeops,
5292         .vop_access =           zfs_freebsd_access,
5293         .vop_inactive =         zfs_freebsd_inactive,
5294         .vop_reclaim =          zfs_freebsd_reclaim,
5295         .vop_fid =              zfs_freebsd_fid,
5296         .vop_pathconf =         zfs_freebsd_pathconf,
5297 };