sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /* Portions Copyright 2007 Jeremy Teo */
  27
  28 #include <sys/types.h>
  29 #include <sys/param.h>
  30 #include <sys/time.h>
  31 #include <sys/systm.h>
  32 #include <sys/sysmacros.h>
  33 #include <sys/resource.h>
  34 #include <sys/vfs.h>
  35 #include <sys/vnode.h>
  36 #include <sys/file.h>
  37 #include <sys/stat.h>
  38 #include <sys/kmem.h>
  39 #include <sys/taskq.h>
  40 #include <sys/uio.h>
  41 #include <sys/atomic.h>
  42 #include <sys/namei.h>
  43 #include <sys/mman.h>
  44 #include <sys/cmn_err.h>
  45 #include <sys/errno.h>
  46 #include <sys/unistd.h>
  47 #include <sys/zfs_dir.h>
  48 #include <sys/zfs_ioctl.h>
  49 #include <sys/fs/zfs.h>
  50 #include <sys/dmu.h>
  51 #include <sys/spa.h>
  52 #include <sys/txg.h>
  53 #include <sys/dbuf.h>
  54 #include <sys/zap.h>
  55 #include <sys/dirent.h>
  56 #include <sys/policy.h>
  57 #include <sys/sunddi.h>
  58 #include <sys/filio.h>
  59 #include <sys/sid.h>
  60 #include <sys/zfs_ctldir.h>
  61 #include <sys/zfs_fuid.h>
  62 #include <sys/dnlc.h>
  63 #include <sys/zfs_rlock.h>
  64 #include <sys/extdirent.h>
  65 #include <sys/kidmap.h>
  66 #include <sys/bio.h>
  67 #include <sys/buf.h>
  68 #include <sys/sf_buf.h>
  69 #include <sys/sched.h>
  70 #include <sys/acl.h>
  71
  72 /*
  73  * Programming rules.
  74  *
  75  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  76  * properly lock its in-core state, create a DMU transaction, do the work,
  77  * record this work in the intent log (ZIL), commit the DMU transaction,
  78  * and wait for the intent log to commit if it is a synchronous operation.
  79  * Moreover, the vnode ops must work in both normal and log replay context.
  80  * The ordering of events is important to avoid deadlocks and references
  81  * to freed memory.  The example below illustrates the following Big Rules:
  82  *
  83  *  (1) A check must be made in each zfs thread for a mounted file system.
  84  *      This is done avoiding races using ZFS_ENTER(zfsvfs).
  85  *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
  86  *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
  87  *      can return EIO from the calling function.
  88  *
  89  *  (2) VN_RELE() should always be the last thing except for zil_commit()
  90  *      (if necessary) and ZFS_EXIT(). This is for 3 reasons:
  91  *      First, if it's the last reference, the vnode/znode
  92  *      can be freed, so the zp may point to freed memory.  Second, the last
  93  *      reference will call zfs_zinactive(), which may induce a lot of work --
  94  *      pushing cached pages (which acquires range locks) and syncing out
  95  *      cached atime changes.  Third, zfs_zinactive() may require a new tx,
  96  *      which could deadlock the system if you were already holding one.
  97  *      If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
  98  *
  99  *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 100  *      as they can span dmu_tx_assign() calls.
 101  *
 102  *  (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
 103  *      This is critical because we don't want to block while holding locks.
 104  *      Note, in particular, that if a lock is sometimes acquired before
 105  *      the tx assigns, and sometimes after (e.g. z_lock), then failing to
 106  *      use a non-blocking assign can deadlock the system.  The scenario:
 107  *
 108  *      Thread A has grabbed a lock before calling dmu_tx_assign().
 109  *      Thread B is in an already-assigned tx, and blocks for this lock.
 110  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 111  *      forever, because the previous txg can't quiesce until B's tx commits.
 112  *
 113  *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 114  *      then drop all locks, call dmu_tx_wait(), and try again.
 115  *
 116  *  (5) If the operation succeeded, generate the intent log entry for it
 117  *      before dropping locks.  This ensures that the ordering of events
 118  *      in the intent log matches the order in which they actually occurred.
 119  *      During ZIL replay the zfs_log_* functions will update the sequence
 120  *      number to indicate the zil transaction has replayed.
 121  *
 122  *  (6) At the end of each vnode op, the DMU tx must always commit,
 123  *      regardless of whether there were any errors.
 124  *
 125  *  (7) After dropping all locks, invoke zil_commit(zilog, seq, foid)
 126  *      to ensure that synchronous semantics are provided when necessary.
 127  *
 128  * In general, this is how things should be ordered in each vnode op:
 129  *
 130  *      ZFS_ENTER(zfsvfs);              // exit if unmounted
 131  * top:
 132  *      zfs_dirent_lock(&dl, ...)       // lock directory entry (may VN_HOLD())
 133  *      rw_enter(...);                  // grab any other locks you need
 134  *      tx = dmu_tx_create(...);        // get DMU tx
 135  *      dmu_tx_hold_*();                // hold each object you might modify
 136  *      error = dmu_tx_assign(tx, TXG_NOWAIT);  // try to assign
 137  *      if (error) {
 138  *              rw_exit(...);           // drop locks
 139  *              zfs_dirent_unlock(dl);  // unlock directory entry
 140  *              VN_RELE(...);           // release held vnodes
 141  *              if (error == ERESTART) {
 142  *                      dmu_tx_wait(tx);
 143  *                      dmu_tx_abort(tx);
 144  *                      goto top;
 145  *              }
 146  *              dmu_tx_abort(tx);       // abort DMU tx
 147  *              ZFS_EXIT(zfsvfs);       // finished in zfs
 148  *              return (error);         // really out of space
 149  *      }
 150  *      error = do_real_work();         // do whatever this VOP does
 151  *      if (error == 0)
 152  *              zfs_log_*(...);         // on success, make ZIL entry
 153  *      dmu_tx_commit(tx);              // commit DMU tx -- error or not
 154  *      rw_exit(...);                   // drop locks
 155  *      zfs_dirent_unlock(dl);          // unlock directory entry
 156  *      VN_RELE(...);                   // release held vnodes
 157  *      zil_commit(zilog, seq, foid);   // synchronous when necessary
 158  *      ZFS_EXIT(zfsvfs);               // finished in zfs
 159  *      return (error);                 // done, report error
 160  */
 161
 162 /* ARGSUSED */
 163 static int
 164 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 165 {
 166         znode_t *zp = VTOZ(*vpp);
 167         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 168
 169         ZFS_ENTER(zfsvfs);
 170         ZFS_VERIFY_ZP(zp);
 171
 172         if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
 173             ((flag & FAPPEND) == 0)) {
 174                 ZFS_EXIT(zfsvfs);
 175                 return (EPERM);
 176         }
 177
 178         if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
 179             ZTOV(zp)->v_type == VREG &&
 180             !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
 181             zp->z_phys->zp_size > 0) {
 182                 if (fs_vscan(*vpp, cr, 0) != 0) {
 183                         ZFS_EXIT(zfsvfs);
 184                         return (EACCES);
 185                 }
 186         }
 187
 188         /* Keep a count of the synchronous opens in the znode */
 189         if (flag & (FSYNC | FDSYNC))
 190                 atomic_inc_32(&zp->z_sync_cnt);
 191
 192         ZFS_EXIT(zfsvfs);
 193         return (0);
 194 }
 195
 196 /* ARGSUSED */
 197 static int
 198 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
 199     caller_context_t *ct)
 200 {
 201         znode_t *zp = VTOZ(vp);
 202         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 203
 204         ZFS_ENTER(zfsvfs);
 205         ZFS_VERIFY_ZP(zp);
 206
 207         /* Decrement the synchronous opens in the znode */
 208         if ((flag & (FSYNC | FDSYNC)) && (count == 1))
 209                 atomic_dec_32(&zp->z_sync_cnt);
 210
 211         /*
 212          * Clean up any locks held by this process on the vp.
 213          */
 214         cleanlocks(vp, ddi_get_pid(), 0);
 215         cleanshares(vp, ddi_get_pid());
 216
 217         if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
 218             ZTOV(zp)->v_type == VREG &&
 219             !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
 220             zp->z_phys->zp_size > 0)
 221                 VERIFY(fs_vscan(vp, cr, 1) == 0);
 222
 223         ZFS_EXIT(zfsvfs);
 224         return (0);
 225 }
 226
 227 /*
 228  * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 229  * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 230  */
 231 static int
 232 zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
 233 {
 234         znode_t *zp = VTOZ(vp);
 235         uint64_t noff = (uint64_t)*off; /* new offset */
 236         uint64_t file_sz;
 237         int error;
 238         boolean_t hole;
 239
 240         file_sz = zp->z_phys->zp_size;
 241         if (noff >= file_sz)  {
 242                 return (ENXIO);
 243         }
 244
 245         if (cmd == _FIO_SEEK_HOLE)
 246                 hole = B_TRUE;
 247         else
 248                 hole = B_FALSE;
 249
 250         error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
 251
 252         /* end of file? */
 253         if ((error == ESRCH) || (noff > file_sz)) {
 254                 /*
 255                  * Handle the virtual hole at the end of file.
 256                  */
 257                 if (hole) {
 258                         *off = file_sz;
 259                         return (0);
 260                 }
 261                 return (ENXIO);
 262         }
 263
 264         if (noff < *off)
 265                 return (error);
 266         *off = noff;
 267         return (error);
 268 }
 269
 270 /* ARGSUSED */
 271 static int
 272 zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
 273     int *rvalp, caller_context_t *ct)
 274 {
 275         offset_t off;
 276         int error;
 277         zfsvfs_t *zfsvfs;
 278         znode_t *zp;
 279
 280         switch (com) {
 281         case _FIOFFS:
 282                 return (0);
 283
 284                 /*
 285                  * The following two ioctls are used by bfu.  Faking out,
 286                  * necessary to avoid bfu errors.
 287                  */
 288         case _FIOGDIO:
 289         case _FIOSDIO:
 290                 return (0);
 291
 292         case _FIO_SEEK_DATA:
 293         case _FIO_SEEK_HOLE:
 294                 if (ddi_copyin((void *)data, &off, sizeof (off), flag))
 295                         return (EFAULT);
 296
 297                 zp = VTOZ(vp);
 298                 zfsvfs = zp->z_zfsvfs;
 299                 ZFS_ENTER(zfsvfs);
 300                 ZFS_VERIFY_ZP(zp);
 301
 302                 /* offset parameter is in/out */
 303                 error = zfs_holey(vp, com, &off);
 304                 ZFS_EXIT(zfsvfs);
 305                 if (error)
 306                         return (error);
 307                 if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
 308                         return (EFAULT);
 309                 return (0);
 310         }
 311         return (ENOTTY);
 312 }
 313
 314 static vm_page_t
 315 page_lookup(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
 316 {
 317         vm_object_t obj;
 318         vm_page_t pp;
 319
 320         obj = vp->v_object;
 321         VM_OBJECT_LOCK_ASSERT(obj, MA_OWNED);
 322
 323         for (;;) {
 324                 if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 325                     vm_page_is_valid(pp, (vm_offset_t)off, nbytes)) {
 326                         if (vm_page_sleep_if_busy(pp, FALSE, "zfsmwb"))
 327                                 continue;
 328                         vm_page_busy(pp);
 329                         vm_page_lock_queues();
 330                         vm_page_undirty(pp);
 331                         vm_page_unlock_queues();
 332                 } else {
 333                         if (__predict_false(obj->cache != NULL)) {
 334                                 vm_page_cache_free(obj, OFF_TO_IDX(start),
 335                                     OFF_TO_IDX(start) + 1);
 336                         }
 337                         pp = NULL;
 338                 }
 339                 break;
 340         }
 341         return (pp);
 342 }
 343
 344 static void
 345 page_unlock(vm_page_t pp)
 346 {
 347
 348         vm_page_wakeup(pp);
 349 }
 350
 351 static caddr_t
 352 zfs_map_page(vm_page_t pp, struct sf_buf **sfp)
 353 {
 354
 355         sched_pin();
 356         *sfp = sf_buf_alloc(pp, SFB_CPUPRIVATE);
 357         return ((caddr_t)sf_buf_kva(*sfp));
 358 }
 359
 360 static void
 361 zfs_unmap_page(struct sf_buf *sf)
 362 {
 363
 364         sf_buf_free(sf);
 365         sched_unpin();
 366 }
 367
 368
 369 /*
 370  * When a file is memory mapped, we must keep the IO data synchronized
 371  * between the DMU cache and the memory mapped pages.  What this means:
 372  *
 373  * On Write:    If we find a memory mapped page, we write to *both*
 374  *              the page and the dmu buffer.
 375  */
 376
 377 static void
 378 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
 379     int segflg, dmu_tx_t *tx)
 380 {
 381         vm_object_t obj;
 382         struct sf_buf *sf;
 383         int64_t off;
 384
 385         ASSERT(vp->v_mount != NULL);
 386         obj = vp->v_object;
 387         ASSERT(obj != NULL);
 388
 389         off = start & PAGEOFFSET;
 390         VM_OBJECT_LOCK(obj);
 391         for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 392                 vm_page_t pp;
 393                 uint64_t nbytes = MIN(PAGESIZE - off, len);
 394
 395                 if ((pp = page_lookup(vp, start, off, nbytes)) != NULL) {
 396                         caddr_t va;
 397
 398                         VM_OBJECT_UNLOCK(obj);
 399                         va = zfs_map_page(pp, &sf);
 400                         if (segflg == UIO_NOCOPY) {
 401                                 (void) dmu_write(os, oid, start+off, nbytes,
 402                                     va+off, tx);
 403                         } else {
 404                                 (void) dmu_read(os, oid, start+off, nbytes,
 405                                     va+off, DMU_READ_PREFETCH);;
 406                         }
 407                         zfs_unmap_page(sf);
 408                         VM_OBJECT_LOCK(obj);
 409                         page_unlock(pp);
 410
 411                 }
 412                 len -= nbytes;
 413                 off = 0;
 414         }
 415         VM_OBJECT_UNLOCK(obj);
 416 }
 417
 418 /*
 419  * When a file is memory mapped, we must keep the IO data synchronized
 420  * between the DMU cache and the memory mapped pages.  What this means:
 421  *
 422  * On Read:     We "read" preferentially from memory mapped pages,
 423  *              else we default from the dmu buffer.
 424  *
 425  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 426  *      the file is memory mapped.
 427  */
 428 static int
 429 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
 430 {
 431         znode_t *zp = VTOZ(vp);
 432         objset_t *os = zp->z_zfsvfs->z_os;
 433         vm_object_t obj;
 434         vm_page_t m;
 435         struct sf_buf *sf;
 436         int64_t start, off;
 437         caddr_t va;
 438         int len = nbytes;
 439         int error = 0;
 440         uint64_t dirbytes;
 441
 442         ASSERT(vp->v_mount != NULL);
 443         obj = vp->v_object;
 444         ASSERT(obj != NULL);
 445
 446         start = uio->uio_loffset;
 447         off = start & PAGEOFFSET;
 448         dirbytes = 0;
 449         VM_OBJECT_LOCK(obj);
 450         for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 451                 uint64_t bytes = MIN(PAGESIZE - off, len);
 452
 453 again:
 454                 if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 455                     vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
 456                         if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
 457                                 goto again;
 458                         vm_page_busy(m);
 459                         VM_OBJECT_UNLOCK(obj);
 460                         if (dirbytes > 0) {
 461                                 error = dmu_read_uio(os, zp->z_id, uio,
 462                                     dirbytes);
 463                                 dirbytes = 0;
 464                         }
 465                         if (error == 0) {
 466                                 sched_pin();
 467                                 sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
 468                                 va = (caddr_t)sf_buf_kva(sf);
 469                                 error = uiomove(va + off, bytes, UIO_READ, uio);
 470                                 sf_buf_free(sf);
 471                                 sched_unpin();
 472                         }
 473                         VM_OBJECT_LOCK(obj);
 474                         vm_page_wakeup(m);
 475                 } else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
 476                         /*
 477                          * The code below is here to make sendfile(2) work
 478                          * correctly with ZFS. As pointed out by ups@
 479                          * sendfile(2) should be changed to use VOP_GETPAGES(),
 480                          * but it pessimize performance of sendfile/UFS, that's
 481                          * why I handle this special case in ZFS code.
 482                          */
 483                         if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
 484                                 goto again;
 485                         vm_page_busy(m);
 486                         VM_OBJECT_UNLOCK(obj);
 487                         if (dirbytes > 0) {
 488                                 error = dmu_read_uio(os, zp->z_id, uio,
 489                                     dirbytes);
 490                                 dirbytes = 0;
 491                         }
 492                         if (error == 0) {
 493                                 sched_pin();
 494                                 sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
 495                                 va = (caddr_t)sf_buf_kva(sf);
 496                                 error = dmu_read(os, zp->z_id, start + off,
 497                                     bytes, (void *)(va + off),
 498                                     DMU_READ_PREFETCH);
 499                                 sf_buf_free(sf);
 500                                 sched_unpin();
 501                         }
 502                         VM_OBJECT_LOCK(obj);
 503                         vm_page_wakeup(m);
 504                         if (error == 0)
 505                                 uio->uio_resid -= bytes;
 506                 } else {
 507                         dirbytes += bytes;
 508                 }
 509                 len -= bytes;
 510                 off = 0;
 511                 if (error)
 512                         break;
 513         }
 514         VM_OBJECT_UNLOCK(obj);
 515         if (error == 0 && dirbytes > 0)
 516                 error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
 517         return (error);
 518 }
 519
 520 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
 521
 522 /*
 523  * Read bytes from specified file into supplied buffer.
 524  *
 525  *      IN:     vp      - vnode of file to be read from.
 526  *              uio     - structure supplying read location, range info,
 527  *                        and return buffer.
 528  *              ioflag  - SYNC flags; used to provide FRSYNC semantics.
 529  *              cr      - credentials of caller.
 530  *              ct      - caller context
 531  *
 532  *      OUT:    uio     - updated offset and range, buffer filled.
 533  *
 534  *      RETURN: 0 if success
 535  *              error code if failure
 536  *
 537  * Side Effects:
 538  *      vp - atime updated if byte count > 0
 539  */
 540 /* ARGSUSED */
 541 static int
 542 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 543 {
 544         znode_t         *zp = VTOZ(vp);
 545         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
 546         objset_t        *os;
 547         ssize_t         n, nbytes;
 548         int             error;
 549         rl_t            *rl;
 550
 551         ZFS_ENTER(zfsvfs);
 552         ZFS_VERIFY_ZP(zp);
 553         os = zfsvfs->z_os;
 554
 555         if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
 556                 ZFS_EXIT(zfsvfs);
 557                 return (EACCES);
 558         }
 559
 560         /*
 561          * Validate file offset
 562          */
 563         if (uio->uio_loffset < (offset_t)0) {
 564                 ZFS_EXIT(zfsvfs);
 565                 return (EINVAL);
 566         }
 567
 568         /*
 569          * Fasttrack empty reads
 570          */
 571         if (uio->uio_resid == 0) {
 572                 ZFS_EXIT(zfsvfs);
 573                 return (0);
 574         }
 575
 576         /*
 577          * Check for mandatory locks
 578          */
 579         if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
 580                 if (error = chklock(vp, FREAD,
 581                     uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
 582                         ZFS_EXIT(zfsvfs);
 583                         return (error);
 584                 }
 585         }
 586
 587         /*
 588          * If we're in FRSYNC mode, sync out this znode before reading it.
 589          */
 590         if (ioflag & FRSYNC)
 591                 zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
 592
 593         /*
 594          * Lock the range against changes.
 595          */
 596         rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
 597
 598         /*
 599          * If we are reading past end-of-file we can skip
 600          * to the end; but we might still need to set atime.
 601          */
 602         if (uio->uio_loffset >= zp->z_phys->zp_size) {
 603                 error = 0;
 604                 goto out;
 605         }
 606
 607         ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
 608         n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
 609
 610         while (n > 0) {
 611                 nbytes = MIN(n, zfs_read_chunk_size -
 612                     P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 613
 614                 if (vn_has_cached_data(vp))
 615                         error = mappedread(vp, nbytes, uio);
 616                 else
 617                         error = dmu_read_uio(os, zp->z_id, uio, nbytes);
 618                 if (error) {
 619                         /* convert checksum errors into IO errors */
 620                         if (error == ECKSUM)
 621                                 error = EIO;
 622                         break;
 623                 }
 624
 625                 n -= nbytes;
 626         }
 627
 628 out:
 629         zfs_range_unlock(rl);
 630
 631         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 632         ZFS_EXIT(zfsvfs);
 633         return (error);
 634 }
 635
 636 /*
 637  * Fault in the pages of the first n bytes specified by the uio structure.
 638  * 1 byte in each page is touched and the uio struct is unmodified.
 639  * Any error will exit this routine as this is only a best
 640  * attempt to get the pages resident. This is a copy of ufs_trans_touch().
 641  */
 642 static void
 643 zfs_prefault_write(ssize_t n, struct uio *uio)
 644 {
 645         struct iovec *iov;
 646         ulong_t cnt, incr;
 647         caddr_t p;
 648
 649         if (uio->uio_segflg != UIO_USERSPACE)
 650                 return;
 651
 652         iov = uio->uio_iov;
 653
 654         while (n) {
 655                 cnt = MIN(iov->iov_len, n);
 656                 if (cnt == 0) {
 657                         /* empty iov entry */
 658                         iov++;
 659                         continue;
 660                 }
 661                 n -= cnt;
 662                 /*
 663                  * touch each page in this segment.
 664                  */
 665                 p = iov->iov_base;
 666                 while (cnt) {
 667                         if (fubyte(p) == -1)
 668                                 return;
 669                         incr = MIN(cnt, PAGESIZE);
 670                         p += incr;
 671                         cnt -= incr;
 672                 }
 673                 /*
 674                  * touch the last byte in case it straddles a page.
 675                  */
 676                 p--;
 677                 if (fubyte(p) == -1)
 678                         return;
 679                 iov++;
 680         }
 681 }
 682
 683 /*
 684  * Write the bytes to a file.
 685  *
 686  *      IN:     vp      - vnode of file to be written to.
 687  *              uio     - structure supplying write location, range info,
 688  *                        and data buffer.
 689  *              ioflag  - IO_APPEND flag set if in append mode.
 690  *              cr      - credentials of caller.
 691  *              ct      - caller context (NFS/CIFS fem monitor only)
 692  *
 693  *      OUT:    uio     - updated offset and range.
 694  *
 695  *      RETURN: 0 if success
 696  *              error code if failure
 697  *
 698  * Timestamps:
 699  *      vp - ctime|mtime updated if byte count > 0
 700  */
 701 /* ARGSUSED */
 702 static int
 703 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 704 {
 705         znode_t         *zp = VTOZ(vp);
 706         rlim64_t        limit = MAXOFFSET_T;
 707         ssize_t         start_resid = uio->uio_resid;
 708         ssize_t         tx_bytes;
 709         uint64_t        end_size;
 710         dmu_tx_t        *tx;
 711         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
 712         zilog_t         *zilog;
 713         offset_t        woff;
 714         ssize_t         n, nbytes;
 715         rl_t            *rl;
 716         int             max_blksz = zfsvfs->z_max_blksz;
 717         uint64_t        pflags;
 718         int             error;
 719         arc_buf_t       *abuf;
 720
 721         /*
 722          * Fasttrack empty write
 723          */
 724         n = start_resid;
 725         if (n == 0)
 726                 return (0);
 727
 728         if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 729                 limit = MAXOFFSET_T;
 730
 731         ZFS_ENTER(zfsvfs);
 732         ZFS_VERIFY_ZP(zp);
 733
 734         /*
 735          * If immutable or not appending then return EPERM
 736          */
 737         pflags = zp->z_phys->zp_flags;
 738         if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
 739             ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
 740             (uio->uio_loffset < zp->z_phys->zp_size))) {
 741                 ZFS_EXIT(zfsvfs);
 742                 return (EPERM);
 743         }
 744
 745         zilog = zfsvfs->z_log;
 746
 747         /*
 748          * Pre-fault the pages to ensure slow (eg NFS) pages
 749          * don't hold up txg.
 750          */
 751         zfs_prefault_write(n, uio);
 752
 753         /*
 754          * If in append mode, set the io offset pointer to eof.
 755          */
 756         if (ioflag & IO_APPEND) {
 757                 /*
 758                  * Range lock for a file append:
 759                  * The value for the start of range will be determined by
 760                  * zfs_range_lock() (to guarantee append semantics).
 761                  * If this write will cause the block size to increase,
 762                  * zfs_range_lock() will lock the entire file, so we must
 763                  * later reduce the range after we grow the block size.
 764                  */
 765                 rl = zfs_range_lock(zp, 0, n, RL_APPEND);
 766                 if (rl->r_len == UINT64_MAX) {
 767                         /* overlocked, zp_size can't change */
 768                         woff = uio->uio_loffset = zp->z_phys->zp_size;
 769                 } else {
 770                         woff = uio->uio_loffset = rl->r_off;
 771                 }
 772         } else {
 773                 woff = uio->uio_loffset;
 774                 /*
 775                  * Validate file offset
 776                  */
 777                 if (woff < 0) {
 778                         ZFS_EXIT(zfsvfs);
 779                         return (EINVAL);
 780                 }
 781
 782                 /*
 783                  * If we need to grow the block size then zfs_range_lock()
 784                  * will lock a wider range than we request here.
 785                  * Later after growing the block size we reduce the range.
 786                  */
 787                 rl = zfs_range_lock(zp, woff, n, RL_WRITER);
 788         }
 789
 790         if (woff >= limit) {
 791                 zfs_range_unlock(rl);
 792                 ZFS_EXIT(zfsvfs);
 793                 return (EFBIG);
 794         }
 795
 796         if ((woff + n) > limit || woff > (limit - n))
 797                 n = limit - woff;
 798
 799         /*
 800          * Check for mandatory locks
 801          */
 802         if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
 803             (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
 804                 zfs_range_unlock(rl);
 805                 ZFS_EXIT(zfsvfs);
 806                 return (error);
 807         }
 808         end_size = MAX(zp->z_phys->zp_size, woff + n);
 809
 810         /*
 811          * Write the file in reasonable size chunks.  Each chunk is written
 812          * in a separate transaction; this keeps the intent log records small
 813          * and allows us to do more fine-grained space accounting.
 814          */
 815         while (n > 0) {
 816                 abuf = NULL;
 817                 woff = uio->uio_loffset;
 818
 819 again:
 820                 if (zfs_usergroup_overquota(zfsvfs,
 821                     B_FALSE, zp->z_phys->zp_uid) ||
 822                     zfs_usergroup_overquota(zfsvfs,
 823                     B_TRUE, zp->z_phys->zp_gid)) {
 824                         if (abuf != NULL)
 825                                 dmu_return_arcbuf(abuf);
 826                         error = EDQUOT;
 827                         break;
 828                 }
 829
 830                 /*
 831                  * If dmu_assign_arcbuf() is expected to execute with minimum
 832                  * overhead loan an arc buffer and copy user data to it before
 833                  * we enter a txg.  This avoids holding a txg forever while we
 834                  * pagefault on a hanging NFS server mapping.
 835                  */
 836                 if (abuf == NULL && n >= max_blksz &&
 837                     woff >= zp->z_phys->zp_size &&
 838                     P2PHASE(woff, max_blksz) == 0 &&
 839                     zp->z_blksz == max_blksz) {
 840                         size_t cbytes;
 841
 842                         abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz);
 843                         ASSERT(abuf != NULL);
 844                         ASSERT(arc_buf_size(abuf) == max_blksz);
 845                         if (error = uiocopy(abuf->b_data, max_blksz,
 846                             UIO_WRITE, uio, &cbytes)) {
 847                                 dmu_return_arcbuf(abuf);
 848                                 break;
 849                         }
 850                         ASSERT(cbytes == max_blksz);
 851                 }
 852
 853                 /*
 854                  * Start a transaction.
 855                  */
 856                 tx = dmu_tx_create(zfsvfs->z_os);
 857                 dmu_tx_hold_bonus(tx, zp->z_id);
 858                 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 859                 error = dmu_tx_assign(tx, TXG_NOWAIT);
 860                 if (error) {
 861                         if (error == ERESTART) {
 862                                 dmu_tx_wait(tx);
 863                                 dmu_tx_abort(tx);
 864                                 goto again;
 865                         }
 866                         dmu_tx_abort(tx);
 867                         if (abuf != NULL)
 868                                 dmu_return_arcbuf(abuf);
 869                         break;
 870                 }
 871
 872                 /*
 873                  * If zfs_range_lock() over-locked we grow the blocksize
 874                  * and then reduce the lock range.  This will only happen
 875                  * on the first iteration since zfs_range_reduce() will
 876                  * shrink down r_len to the appropriate size.
 877                  */
 878                 if (rl->r_len == UINT64_MAX) {
 879                         uint64_t new_blksz;
 880
 881                         if (zp->z_blksz > max_blksz) {
 882                                 ASSERT(!ISP2(zp->z_blksz));
 883                                 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
 884                         } else {
 885                                 new_blksz = MIN(end_size, max_blksz);
 886                         }
 887                         zfs_grow_blocksize(zp, new_blksz, tx);
 888                         zfs_range_reduce(rl, woff, n);
 889                 }
 890
 891                 /*
 892                  * XXX - should we really limit each write to z_max_blksz?
 893                  * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
 894                  */
 895                 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
 896
 897                 if (woff + nbytes > zp->z_phys->zp_size)
 898                         vnode_pager_setsize(vp, woff + nbytes);
 899
 900                 if (abuf == NULL) {
 901                         tx_bytes = uio->uio_resid;
 902                         error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio,
 903                             nbytes, tx);
 904                         tx_bytes -= uio->uio_resid;
 905                 } else {
 906                         tx_bytes = nbytes;
 907                         ASSERT(tx_bytes == max_blksz);
 908                         dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
 909                         ASSERT(tx_bytes <= uio->uio_resid);
 910                         uioskip(uio, tx_bytes);
 911                 }
 912
 913                 /*
 914                  * XXXPJD: There are some cases (triggered by fsx) where
 915                  *         vn_has_cached_data(vp) returns false when it should
 916                  *         return true. This should be investigated.
 917                  */
 918 #if 0
 919                 if (tx_bytes && vn_has_cached_data(vp))
 920 #else
 921                 if (tx_bytes && vp->v_object != NULL)
 922 #endif
 923                 {
 924                         update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
 925                             zp->z_id, uio->uio_segflg, tx);
 926                 }
 927
 928                 /*
 929                  * If we made no progress, we're done.  If we made even
 930                  * partial progress, update the znode and ZIL accordingly.
 931                  */
 932                 if (tx_bytes == 0) {
 933                         dmu_tx_commit(tx);
 934                         ASSERT(error != 0);
 935                         break;
 936                 }
 937
 938                 /*
 939                  * Clear Set-UID/Set-GID bits on successful write if not
 940                  * privileged and at least one of the execute bits is set.
 941                  *
 942                  * It would be nice to do this after all writes have
 943                  * been done, but that would still expose the ISUID/ISGID
 944                  * to another app after the partial write is committed.
 945                  *
 946                  * Note: we don't call zfs_fuid_map_id() here because
 947                  * user 0 is not an ephemeral uid.
 948                  */
 949                 mutex_enter(&zp->z_acl_lock);
 950                 if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
 951                     (S_IXUSR >> 6))) != 0 &&
 952                     (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
 953                     secpolicy_vnode_setid_retain(vp, cr,
 954                     (zp->z_phys->zp_mode & S_ISUID) != 0 &&
 955                     zp->z_phys->zp_uid == 0) != 0) {
 956                         zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
 957                 }
 958                 mutex_exit(&zp->z_acl_lock);
 959
 960                 /*
 961                  * Update time stamp.  NOTE: This marks the bonus buffer as
 962                  * dirty, so we don't have to do it again for zp_size.
 963                  */
 964                 zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
 965
 966                 /*
 967                  * Update the file size (zp_size) if it has changed;
 968                  * account for possible concurrent updates.
 969                  */
 970                 while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
 971                         (void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
 972                             uio->uio_loffset);
 973                 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
 974                 dmu_tx_commit(tx);
 975
 976                 if (error != 0)
 977                         break;
 978                 ASSERT(tx_bytes == nbytes);
 979                 n -= nbytes;
 980         }
 981
 982         zfs_range_unlock(rl);
 983
 984         /*
 985          * If we're in replay mode, or we made no progress, return error.
 986          * Otherwise, it's at least a partial write, so it's successful.
 987          */
 988         if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
 989                 ZFS_EXIT(zfsvfs);
 990                 return (error);
 991         }
 992
 993         if (ioflag & (FSYNC | FDSYNC))
 994                 zil_commit(zilog, zp->z_last_itx, zp->z_id);
 995
 996         ZFS_EXIT(zfsvfs);
 997         return (0);
 998 }
 999
1000 void
1001 zfs_get_done(dmu_buf_t *db, void *vzgd)
1002 {
1003         zgd_t *state = (zgd_t *)vzgd;   /* set up by zfs_get_data() */
1004         rl_t *range = state->zgd_rl;
1005         znode_t *zp = range->r_zp;
1006         vnode_t *vp = ZTOV(zp);
1007         objset_t *os = zp->z_zfsvfs->z_os;
1008         int giant = VFS_LOCK_GIANT(vp->v_vfsp);
1009
1010         /* dmu_sync() callback: drop the dbuf hold and the range lock. */
1011         dmu_buf_rele(db, vzgd);
1012         zfs_range_unlock(range);
1013
1014         /* The txg is held off from syncing, so release the vnode async. */
1015         VN_RELE_ASYNC(vp, dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1016
1017         zil_add_block(state->zgd_zilog, state->zgd_bp);
1018         kmem_free(state, sizeof (zgd_t));
1019         VFS_UNLOCK_GIANT(giant);
1020 }
1021
1022 /*
1023  * Get data to generate a TX_WRITE intent log record.
      *
      *      IN:     arg     - zfsvfs_t of the file system the record belongs to.
      *              lr      - write record; lr_foid, lr_offset and lr_length
      *                        select the data.  For indirect writes lr_blkoff
      *                        and lr_blkptr are filled in here.
      *              buf     - where to copy the data for an immediate write,
      *                        or NULL to request an indirect write.
      *              zio     - passed to dmu_sync() for an indirect write.
      *
      *      RETURN: 0 if success (also when dmu_sync() returns EINPROGRESS
      *                and the cleanup is left to zfs_get_done())
      *              ENOENT if the object cannot be obtained, is unlinked, or
      *                lr_offset is at or beyond the current file size
      *              otherwise the error returned by dmu_sync()
1024  */
1025 int
1026 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1027 {
1028         zfsvfs_t *zfsvfs = arg;
1029         objset_t *os = zfsvfs->z_os;
1030         znode_t *zp;
1031         uint64_t off = lr->lr_offset;
1032         dmu_buf_t *db;
1033         rl_t *rl;
1034         zgd_t *zgd;
1035         int dlen = lr->lr_length;               /* length of user data */
1036         int error = 0;
1037
1038         ASSERT(zio);
1039         ASSERT(dlen != 0);
1040
1041         /*
1042          * Nothing to do if the file has been removed
1043          */
1044         if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
1045                 return (ENOENT);
1046         if (zp->z_unlinked) {
1047                 /*
1048                  * Release the vnode asynchronously as we currently have the
1049                  * txg stopped from syncing.
1050                  */
1051                 VN_RELE_ASYNC(ZTOV(zp),
1052                     dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1053                 return (ENOENT);
1054         }
1055
1056         /*
1057          * Write records come in two flavors: immediate and indirect.
1058          * For small writes it's cheaper to store the data with the
1059          * log record (immediate); for large writes it's cheaper to
1060          * sync the data and get a pointer to it (indirect) so that
1061          * we don't have to write the data twice.
1062          */
1063         if (buf != NULL) { /* immediate write */
1064                 rl = zfs_range_lock(zp, off, dlen, RL_READER);
1065                 /* test for truncation needs to be done while range locked */
1066                 if (off >= zp->z_phys->zp_size) {
1067                         error = ENOENT;
1068                         goto out;
1069                 }
                     /* copy the logged range straight into the caller's buffer */
1070                 VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf,
1071                     DMU_READ_NO_PREFETCH));
1072         } else { /* indirect write */
1073                 uint64_t boff; /* block starting offset */
1074
1075                 /*
1076                  * Have to lock the whole block to ensure when it's
1077                  * written out and its checksum is being calculated
1078                  * that no one can change the data. We need to re-check
1079                  * blocksize after we get the lock in case it's changed!
1080                  */
1081                 for (;;) {
1082                         if (ISP2(zp->z_blksz)) {
1083                                 boff = P2ALIGN_TYPED(off, zp->z_blksz,
1084                                     uint64_t);
1085                         } else {
1086                                 boff = 0;
1087                         }
1088                         dlen = zp->z_blksz;
1089                         rl = zfs_range_lock(zp, boff, dlen, RL_READER);
1090                         if (zp->z_blksz == dlen)
1091                                 break;
1092                         zfs_range_unlock(rl);
1093                 }
1094                 /* test for truncation needs to be done while range locked */
1095                 if (off >= zp->z_phys->zp_size) {
1096                         error = ENOENT;
1097                         goto out;
1098                 }
                     /*
                      * Callback state handed to dmu_sync() below together with
                      * zfs_get_done(); it is also the tag for the dbuf hold.
                      */
1099                 zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
1100                 zgd->zgd_rl = rl;
1101                 zgd->zgd_zilog = zfsvfs->z_log;
1102                 zgd->zgd_bp = &lr->lr_blkptr;
1103                 VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
1104                 ASSERT(boff == db->db_offset);
                     /* position of the logged data inside the block being synced */
1105                 lr->lr_blkoff = off - boff;
1106                 error = dmu_sync(zio, db, &lr->lr_blkptr,
1107                     lr->lr_common.lrc_txg, zfs_get_done, zgd);
1108                 ASSERT((error && error != EINPROGRESS) ||
1109                     lr->lr_length <= zp->z_blksz);
1110                 if (error == 0) {
1111                         /*
1112                          * dmu_sync() can compress a block of zeros to a null
1113                          * blkptr but the block size still needs to be passed
1114                          * through to replay.
1115                          */
1116                         BP_SET_LSIZE(&lr->lr_blkptr, db->db_size);
1117                         zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
1118                 }
1119
1120                 /*
1121                  * If we get EINPROGRESS, then we need to wait for a
1122                  * write IO initiated by dmu_sync() to complete before
1123                  * we can release this dbuf.  We will finish everything
1124                  * up in the zfs_get_done() callback.
1125                  */
1126                 if (error == EINPROGRESS) {
1127                         return (0);
1128                 } else if (error == EALREADY) {
                             /* not a failure: relabel the record as TX_WRITE2 */
1129                         lr->lr_common.lrc_txtype = TX_WRITE2;
1130                         error = 0;
1131                 }
                     /* Not deferred to zfs_get_done() on this path; clean up here. */
1132                 dmu_buf_rele(db, zgd);
1133                 kmem_free(zgd, sizeof (zgd_t));
1134         }
1135 out:
             /* Both write flavors arrive here with rl held and zp referenced. */
1136         zfs_range_unlock(rl);
1137         /*
1138          * Release the vnode asynchronously as we currently have the
1139          * txg stopped from syncing.
1140          */
1141         VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1142         return (error);
1143 }
1144
1145 /*ARGSUSED*/
1146 static int
1147 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1148     caller_context_t *ct)
1149 {
1150         znode_t *node = VTOZ(vp);
1151         zfsvfs_t *zfsvfs = node->z_zfsvfs;
1152         int rc;
1153
1154         ZFS_ENTER(zfsvfs);
1155         ZFS_VERIFY_ZP(node);
1156
1157         /* V_ACE_MASK selects the ACE-mask check; otherwise use the rwx one. */
1158         rc = (flag & V_ACE_MASK) ?
1159             zfs_zaccess(node, mode, flag, B_FALSE, cr) :
1160             zfs_zaccess_rwx(node, mode, flag, cr);
1161
1162         ZFS_EXIT(zfsvfs);
1163         return (rc);
1164 }
1165
1166 /*
1167  * Lookup an entry in a directory, or an extended attribute directory.
1168  * If it exists, return a held vnode reference for it.
1169  *
1170  *      IN:     dvp     - vnode of directory to search.
1171  *              nm      - name of entry to lookup.
1172  *              pnp     - full pathname to lookup [UNUSED].
1173  *              flags   - LOOKUP_XATTR set if looking for an attribute.
1174  *              rdir    - root directory vnode [UNUSED].
1175  *              cr      - credentials of caller.
1176  *              ct      - caller context
1177  *              direntflags - directory lookup flags
1178  *              realpnp - returned pathname.
1179  *
1180  *      OUT:    vpp     - vnode of located entry, NULL if not found.
1181  *
1182  *      RETURN: 0 if success
1183  *              error code if failure
1184  *
1185  * Timestamps:
1186  *      NA
1187  */
1188 /* ARGSUSED */
1189 static int
1190 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1191     int nameiop, cred_t *cr, kthread_t *td, int flags)
1192 {
1193         znode_t *zdp = VTOZ(dvp);
1194         zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1195         int     error;
1196         int *direntflags = NULL;
1197         void *realpnp = NULL;
1198
1199         ZFS_ENTER(zfsvfs);
1200         ZFS_VERIFY_ZP(zdp);
1201
1202         *vpp = NULL;
1203
1204         if (flags & LOOKUP_XATTR) {
1205 #ifdef TODO
1206                 /*
1207                  * If the xattr property is off, refuse the lookup request.
1208                  */
1209                 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1210                         ZFS_EXIT(zfsvfs);
1211                         return (EINVAL);
1212                 }
1213 #endif
1214
1215                 /*
1216                  * We don't allow recursive attributes..
1217                  * Maybe someday we will.
1218                  */
1219                 if (zdp->z_phys->zp_flags & ZFS_XATTR) {
1220                         ZFS_EXIT(zfsvfs);
1221                         return (EINVAL);
1222                 }
1223
1224                 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1225                         ZFS_EXIT(zfsvfs);
1226                         return (error);
1227                 }
1228
1229                 /*
1230                  * Do we have permission to get into attribute directory?
1231                  */
1232
1233                 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1234                     B_FALSE, cr)) {
1235                         VN_RELE(*vpp);
1236                         *vpp = NULL;
1237                 }
1238
1239                 ZFS_EXIT(zfsvfs);
1240                 return (error);
1241         }
1242
1243         if (dvp->v_type != VDIR) {
1244                 ZFS_EXIT(zfsvfs);
1245                 return (ENOTDIR);
1246         }
1247
1248         /*
1249          * Check accessibility of directory.
1250          */
1251
1252         if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1253                 ZFS_EXIT(zfsvfs);
1254                 return (error);
1255         }
1256
1257         if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1258             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1259                 ZFS_EXIT(zfsvfs);
1260                 return (EILSEQ);
1261         }
1262
1263         error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1264         if (error == 0) {
1265                 /*
1266                  * Convert device special files
1267                  */
1268                 if (IS_DEVVP(*vpp)) {
1269                         vnode_t *svp;
1270
1271                         svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1272                         VN_RELE(*vpp);
1273                         if (svp == NULL)
1274                                 error = ENOSYS;
1275                         else
1276                                 *vpp = svp;
1277                 }
1278         }
1279
1280         /* Translate errors and add SAVENAME when needed. */
1281         if (cnp->cn_flags & ISLASTCN) {
1282                 switch (nameiop) {
1283                 case CREATE:
1284                 case RENAME:
1285                         if (error == ENOENT) {
1286                                 error = EJUSTRETURN;
1287                                 cnp->cn_flags |= SAVENAME;
1288                                 break;
1289                         }
1290                         /* FALLTHROUGH */
1291                 case DELETE:
1292                         if (error == 0)
1293                                 cnp->cn_flags |= SAVENAME;
1294                         break;
1295                 }
1296         }
1297         if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
1298                 int ltype = 0;
1299
1300                 if (cnp->cn_flags & ISDOTDOT) {
1301                         ltype = VOP_ISLOCKED(dvp);
1302                         VOP_UNLOCK(dvp, 0);
1303                 }
1304                 ZFS_EXIT(zfsvfs);
1305                 error = vn_lock(*vpp, cnp->cn_lkflags);
1306                 if (cnp->cn_flags & ISDOTDOT)
1307                         vn_lock(dvp, ltype | LK_RETRY);
1308                 if (error != 0) {
1309                         VN_RELE(*vpp);
1310                         *vpp = NULL;
1311                         return (error);
1312                 }
1313         } else {
1314                 ZFS_EXIT(zfsvfs);
1315         }
1316
1317 #ifdef FREEBSD_NAMECACHE
1318         /*
1319          * Insert name into cache (as non-existent) if appropriate.
1320          */
1321         if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
1322                 cache_enter(dvp, *vpp, cnp);
1323         /*
1324          * Insert name into cache if appropriate.
1325          */
1326         if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1327                 if (!(cnp->cn_flags & ISLASTCN) ||
1328                     (nameiop != DELETE && nameiop != RENAME)) {
1329                         cache_enter(dvp, *vpp, cnp);
1330                 }
1331         }
1332 #endif
1333
1334         return (error);
1335 }
1336
1337 /*
1338  * Attempt to create a new entry in a directory.  If the entry
1339  * already exists, truncate the file if permissible, else return
1340  * an error.  Return the vp of the created or trunc'd file.
1341  *
1342  *      IN:     dvp     - vnode of directory to put new file entry in.
1343  *              name    - name of new file entry.
1344  *              vap     - attributes of new file.
1345  *              excl    - flag indicating exclusive or non-exclusive mode.
1346  *              mode    - mode to open file with.
1347  *              cr      - credentials of caller.
1348  *              td      - calling thread [UNUSED].
1349  *
1350  *      NOTE:   flag and vsecp are locals fixed at 0 and NULL here.
1351  *
1352  *      OUT:    vpp     - vnode of created or trunc'd entry.
1353  *
1354  *      RETURN: 0 if success
1355  *              error code if failure
1356  *
1357  * Timestamps:
1358  *      dvp - ctime|mtime updated if new entry created
1359  *       vp - ctime|mtime always, atime if new
1360  */
1361
1362 /* ARGSUSED */
1363 static int
1364 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1365     vnode_t **vpp, cred_t *cr, kthread_t *td)
1366 {
1367         znode_t         *zp, *dzp = VTOZ(dvp);
1368         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1369         zilog_t         *zilog;
1370         objset_t        *os;
1371         zfs_dirlock_t   *dl;
1372         dmu_tx_t        *tx;
1373         int             error;
1374         ksid_t          *ksid;
1375         uid_t           uid;
1376         gid_t           gid = crgetgid(cr);
1377         zfs_acl_ids_t   acl_ids;
1378         boolean_t       fuid_dirtied;
1379         void            *vsecp = NULL;
1380         int             flag = 0;
1381
1382         /*
1383          * If we have an ephemeral id, ACL, or XVATTR then
1384          * make sure file system is at proper version
1385          */
1386
1387         ksid = crgetsid(cr, KSID_OWNER);
1388         if (ksid)
1389                 uid = ksid_getid(ksid);
1390         else
1391                 uid = crgetuid(cr);
1392         if (zfsvfs->z_use_fuids == B_FALSE &&
1393             (vsecp || (vap->va_mask & AT_XVATTR) ||
1394             IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))))
1395                 return (EINVAL);
1396
1397         ZFS_ENTER(zfsvfs);
1398         ZFS_VERIFY_ZP(dzp);
1399         os = zfsvfs->z_os;
1400         zilog = zfsvfs->z_log;
1401
1402         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1403             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1404                 ZFS_EXIT(zfsvfs);
1405                 return (EILSEQ);
1406         }
1407
1408         if (vap->va_mask & AT_XVATTR) {
1409                 if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1410                     crgetuid(cr), cr, vap->va_type)) != 0) {
1411                         ZFS_EXIT(zfsvfs);
1412                         return (error);
1413                 }
1414         }
1415 top:
1416         *vpp = NULL;
1417
1418         if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1419                 vap->va_mode &= ~S_ISVTX;
1420
1421         if (*name == '\0') {
1422                 /*
1423                  * Null component name refers to the directory itself.
1424                  */
1425                 VN_HOLD(dvp);
1426                 zp = dzp;
1427                 dl = NULL;
1428                 error = 0;
1429         } else {
1430                 /* possible VN_HOLD(zp) */
1431                 int zflg = 0;
1432
1433                 if (flag & FIGNORECASE)
1434                         zflg |= ZCILOOK;
1435
1436                 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1437                     NULL, NULL);
1438                 if (error) {
1439                         if (strcmp(name, "..") == 0)
1440                                 error = EISDIR;
1441                         ZFS_EXIT(zfsvfs);
1442                         return (error);
1443                 }
1444         }
1445         if (zp == NULL) {
1446                 uint64_t txtype;
1447
1448                 /*
1449                  * Create a new file object and update the directory
1450                  * to reference it.
1451                  */
1452                 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1453                         goto out;
1454                 }
1455
1456                 /*
1457                  * We only support the creation of regular files in
1458                  * extended attribute directories.
1459                  */
1460                 if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
1461                     (vap->va_type != VREG)) {
1462                         error = EINVAL;
1463                         goto out;
1464                 }
1465
1466                 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
1467                     &acl_ids)) != 0)
1468                         goto out;
1469                 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1470                         zfs_acl_ids_free(&acl_ids);     /* don't leak */
1471                         error = EDQUOT;
1472                         goto out;
1473                 }
1474
1475                 tx = dmu_tx_create(os);
1476                 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1477                 fuid_dirtied = zfsvfs->z_fuid_dirty;
1478                 if (fuid_dirtied)
1479                         zfs_fuid_txhold(zfsvfs, tx);
1480                 dmu_tx_hold_bonus(tx, dzp->z_id);
1481                 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1482                 if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1483                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1484                             0, SPA_MAXBLOCKSIZE);
1485                 }
1486                 error = dmu_tx_assign(tx, TXG_NOWAIT);
1487                 if (error) {
1488                         zfs_acl_ids_free(&acl_ids);
1489                         zfs_dirent_unlock(dl);
1490                         if (error == ERESTART) {
1491                                 dmu_tx_wait(tx);
1492                                 dmu_tx_abort(tx);
1493                                 goto top;
1494                         }
1495                         dmu_tx_abort(tx);
1496                         ZFS_EXIT(zfsvfs);
1497                         return (error);
1498                 }
1499                 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
1500
1501                 if (fuid_dirtied)
1502                         zfs_fuid_sync(zfsvfs, tx);
1503
1504                 (void) zfs_link_create(dl, zp, tx, ZNEW);
1505
1506                 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1507                 if (flag & FIGNORECASE)
1508                         txtype |= TX_CI;
1509                 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1510                     vsecp, acl_ids.z_fuidp, vap);
1511                 zfs_acl_ids_free(&acl_ids);
1512                 dmu_tx_commit(tx);
1513         } else {
1514                 int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1515
1516                 /*
1517                  * A directory entry already exists for this name.
1518                  */
1519                 /*
1520                  * Can't truncate an existing file if in exclusive mode.
1521                  */
1522                 if (excl == EXCL) {
1523                         error = EEXIST;
1524                         goto out;
1525                 }
1526                 /*
1527                  * Can't open a directory for writing.
1528                  */
1529                 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1530                         error = EISDIR;
1531                         goto out;
1532                 }
1533                 /*
1534                  * Verify requested access to file.
1535                  */
1536                 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1537                         goto out;
1538                 }
1539
1540                 mutex_enter(&dzp->z_lock);
1541                 dzp->z_seq++;
1542                 mutex_exit(&dzp->z_lock);
1543
1544                 /*
1545                  * Truncate regular files if requested.
1546                  */
1547                 if ((ZTOV(zp)->v_type == VREG) &&
1548                     (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1549                         /* we can't hold any locks when calling zfs_freesp() */
1550                         zfs_dirent_unlock(dl);
1551                         dl = NULL;
1552                         error = zfs_freesp(zp, 0, 0, mode, TRUE);
1553                         if (error == 0) {
1554                                 vnevent_create(ZTOV(zp), ct);
1555                         }
1556                 }
1557         }
1558 out:
1559         if (dl)
1560                 zfs_dirent_unlock(dl);
1561
1562         if (error) {
1563                 if (zp)
1564                         VN_RELE(ZTOV(zp));
1565         } else {
1566                 *vpp = ZTOV(zp);
1567                 /*
1568                  * If vnode is for a device return a specfs vnode instead.
1569                  */
1570                 if (IS_DEVVP(*vpp)) {
1571                         struct vnode *svp;
1572
1573                         svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1574                         VN_RELE(*vpp);
1575                         if (svp == NULL) {
1576                                 error = ENOSYS;
1577                         }
1578                         *vpp = svp;
1579                 }
1580         }
1581
1582         ZFS_EXIT(zfsvfs);
1583         return (error);
1584 }
1585
/*
 * Remove an entry from a directory.
 *
 *      IN:     dvp     - vnode of directory to remove entry from.
 *              name    - name of entry to remove.
 *              cr      - credentials of caller.
 *              ct      - caller context
 *              flags   - case flags
 *
 *      RETURN: 0 if success
 *              error code if failure
 *
 * Timestamps:
 *      dvp - ctime|mtime
 *       vp - ctime (if nlink > 0)
 *
 * Notes on this version of the function:
 *      - Directories are rejected with EPERM; they must go through rmdir.
 *      - The "delete the znode right now" path is switched off:
 *        may_delete_now is unconditionally set to FALSE and the test that
 *        would compute delete_now is guarded by "0 &&".  As a result
 *        delete_now is always FALSE, and a znode whose last link went away
 *        is always handed to zfs_unlinked_add().
 *      - If the transaction cannot be assigned and the error is ERESTART,
 *        everything acquired so far is dropped and the operation restarts
 *        from the "top" label.
 */
/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
    int flags)
{
        znode_t         *zp, *dzp = VTOZ(dvp);
        znode_t         *xzp = NULL;
        vnode_t         *vp;
        zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
        zilog_t         *zilog;
        uint64_t        acl_obj, xattr_obj;
        zfs_dirlock_t   *dl;
        dmu_tx_t        *tx;
        boolean_t       may_delete_now, delete_now = FALSE;
        boolean_t       unlinked, toobig = FALSE;
        uint64_t        txtype;
        pathname_t      *realnmp = NULL;
        pathname_t      realnm;
        int             error;
        int             zflg = ZEXISTS;

        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(dzp);
        zilog = zfsvfs->z_log;

        /*
         * For a case-insensitive request, allocate a pathname buffer that
         * is passed to zfs_dirent_lock() and later used for dnlc_remove().
         * NOTE(review): presumably it receives the entry's real (stored)
         * name -- confirm against zfs_dirent_lock().
         */
        if (flags & FIGNORECASE) {
                zflg |= ZCILOOK;
                pn_alloc(&realnm);
                realnmp = &realnm;
        }

top:
        /*
         * Attempt to lock directory; fail if entry doesn't exist.
         */
        if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
            NULL, realnmp)) {
                if (realnmp)
                        pn_free(realnmp);
                ZFS_EXIT(zfsvfs);
                return (error);
        }

        /* From here on, every exit must unlock dl and VN_RELE(vp). */
        vp = ZTOV(zp);

        if (error = zfs_zaccess_delete(dzp, zp, cr)) {
                goto out;
        }

        /*
         * Need to use rmdir for removing directories.
         */
        if (vp->v_type == VDIR) {
                error = EPERM;
                goto out;
        }

        vnevent_remove(vp, dvp, name, ct);

        /* Drop the name-cache entry, using the looked-up name if we have one. */
        if (realnmp)
                dnlc_remove(dvp, realnmp->pn_buf);
        else
                dnlc_remove(dvp, name);

        /*
         * Immediate deletion is disabled here; with this set to FALSE the
         * dmu_tx_hold_free() calls guarded by may_delete_now below are
         * never issued.
         */
        may_delete_now = FALSE;

        /*
         * We may delete the znode now, or we may put it in the unlinked set;
         * it depends on whether we're the last link, and on whether there are
         * other holds on the vnode.  So we dmu_tx_hold() the right things to
         * allow for either case.
         */
        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
        dmu_tx_hold_bonus(tx, zp->z_id);
        if (may_delete_now) {
                toobig =
                    zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
                /* if the file is too big, only hold_free a token amount */
                dmu_tx_hold_free(tx, zp->z_id, 0,
                    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
        }

        /* are there any extended attributes? */
        if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
                /* XXX - do we need this if we are deleting? */
                dmu_tx_hold_bonus(tx, xattr_obj);
        }

        /* are there any additional acls */
        if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
            may_delete_now)
                dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);

        /* charge as an update -- would be nice not to charge at all */
        dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

        error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
                /*
                 * Could not get into a transaction group.  Release the
                 * directory lock and the vnode first; on ERESTART wait for
                 * the tx, abort it and redo the whole lookup, otherwise
                 * abort and fail.
                 */
                zfs_dirent_unlock(dl);
                VN_RELE(vp);
                if (error == ERESTART) {
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
                }
                if (realnmp)
                        pn_free(realnmp);
                dmu_tx_abort(tx);
                ZFS_EXIT(zfsvfs);
                return (error);
        }

        /*
         * Remove the directory entry.
         */
        error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);

        if (error) {
                /* The assigned tx still has to be committed before bailing. */
                dmu_tx_commit(tx);
                goto out;
        }

        /*
         * The leading "0 &&" makes this condition always false, so
         * delete_now keeps its initial value of FALSE.
         */
        if (0 && unlinked) {
                VI_LOCK(vp);
                delete_now = may_delete_now && !toobig &&
                    vp->v_count == 1 && !vn_has_cached_data(vp) &&
                    zp->z_phys->zp_xattr == xattr_obj &&
                    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
                VI_UNLOCK(vp);
        }

        /*
         * With delete_now always FALSE the first branch is not executed;
         * an unlinked znode is added to the unlinked set instead.
         */
        if (delete_now) {
                if (zp->z_phys->zp_xattr) {
                        error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
                        ASSERT3U(error, ==, 0);
                        ASSERT3U(xzp->z_phys->zp_links, ==, 2);
                        dmu_buf_will_dirty(xzp->z_dbuf, tx);
                        mutex_enter(&xzp->z_lock);
                        xzp->z_unlinked = 1;
                        xzp->z_phys->zp_links = 0;
                        mutex_exit(&xzp->z_lock);
                        zfs_unlinked_add(xzp, tx);
                        zp->z_phys->zp_xattr = 0; /* probably unnecessary */
                }
                mutex_enter(&zp->z_lock);
                VI_LOCK(vp);
                vp->v_count--;
                ASSERT3U(vp->v_count, ==, 0);
                VI_UNLOCK(vp);
                mutex_exit(&zp->z_lock);
                zfs_znode_delete(zp, tx);
        } else if (unlinked) {
                zfs_unlinked_add(zp, tx);
        }

        /* Log the removal; TX_CI marks a case-insensitive request. */
        txtype = TX_REMOVE;
        if (flags & FIGNORECASE)
                txtype |= TX_CI;
        zfs_log_remove(zilog, tx, txtype, dzp, name);

        dmu_tx_commit(tx);
out:
        /* Common exit: free the name buffer, unlock the entry, drop holds. */
        if (realnmp)
                pn_free(realnmp);

        zfs_dirent_unlock(dl);

        if (!delete_now) {
                VN_RELE(vp);
        } else if (xzp) {
                /* this rele is delayed to prevent nesting transactions */
                VN_RELE(ZTOV(xzp));
        }

        ZFS_EXIT(zfsvfs);
        return (error);
}
1780
1781 /*
1782  * Create a new directory and insert it into dvp using the name
1783  * provided.  Return a pointer to the inserted directory.
1784  *
1785  *      IN:     dvp     - vnode of directory to add subdir to.
1786  *              dirname - name of new directory.
1787  *              vap     - attributes of new directory.
1788  *              cr      - credentials of caller.
1789  *              ct      - caller context
1790  *              vsecp   - ACL to be set
1791  *
1792  *      OUT:    vpp     - vnode of created directory.
1793  *
1794  *      RETURN: 0 if success
1795  *              error code if failure
1796  *
1797  * Timestamps:
1798  *      dvp - ctime|mtime updated
1799  *       vp - ctime|mtime|atime updated
1800  */
1801 /*ARGSUSED*/
1802 static int
1803 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
1804     caller_context_t *ct, int flags, vsecattr_t *vsecp)
1805 {
1806         znode_t         *zp, *dzp = VTOZ(dvp);
1807         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1808         zilog_t         *zilog;
1809         zfs_dirlock_t   *dl;
1810         uint64_t        txtype;
1811         dmu_tx_t        *tx;
1812         int             error;
1813         int             zf = ZNEW;
1814         ksid_t          *ksid;
1815         uid_t           uid;
1816         gid_t           gid = crgetgid(cr);
1817         zfs_acl_ids_t   acl_ids;
1818         boolean_t       fuid_dirtied;
1819
1820         ASSERT(vap->va_type == VDIR);
1821
1822         /*
1823          * If we have an ephemeral id, ACL, or XVATTR then
1824          * make sure file system is at proper version
1825          */
1826
1827         ksid = crgetsid(cr, KSID_OWNER);
1828         if (ksid)
1829                 uid = ksid_getid(ksid);
1830         else
1831                 uid = crgetuid(cr);
1832         if (zfsvfs->z_use_fuids == B_FALSE &&
1833             (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))||
1834             IS_EPHEMERAL(crgetgid(cr))))
1835                 return (EINVAL);
1836
1837         ZFS_ENTER(zfsvfs);
1838         ZFS_VERIFY_ZP(dzp);
1839         zilog = zfsvfs->z_log;
1840
1841         if (dzp->z_phys->zp_flags & ZFS_XATTR) {
1842                 ZFS_EXIT(zfsvfs);
1843                 return (EINVAL);
1844         }
1845
1846         if (zfsvfs->z_utf8 && u8_validate(dirname,
1847             strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1848                 ZFS_EXIT(zfsvfs);
1849                 return (EILSEQ);
1850         }
1851         if (flags & FIGNORECASE)
1852                 zf |= ZCILOOK;
1853
1854         if (vap->va_mask & AT_XVATTR)
1855                 if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1856                     crgetuid(cr), cr, vap->va_type)) != 0) {
1857                         ZFS_EXIT(zfsvfs);
1858                         return (error);
1859                 }
1860
1861         /*
1862          * First make sure the new directory doesn't exist.
1863          */
1864 top:
1865         *vpp = NULL;
1866
1867         if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1868             NULL, NULL)) {
1869                 ZFS_EXIT(zfsvfs);
1870                 return (error);
1871         }
1872
1873         if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
1874                 zfs_dirent_unlock(dl);
1875                 ZFS_EXIT(zfsvfs);
1876                 return (error);
1877         }
1878
1879         if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
1880             &acl_ids)) != 0) {
1881                 zfs_dirent_unlock(dl);
1882                 ZFS_EXIT(zfsvfs);
1883                 return (error);
1884         }
1885         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1886                 zfs_dirent_unlock(dl);
1887                 ZFS_EXIT(zfsvfs);
1888                 return (EDQUOT);
1889         }
1890
1891         /*
1892          * Add a new entry to the directory.
1893          */
1894         tx = dmu_tx_create(zfsvfs->z_os);
1895         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1896         dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1897         fuid_dirtied = zfsvfs->z_fuid_dirty;
1898         if (fuid_dirtied)
1899                 zfs_fuid_txhold(zfsvfs, tx);
1900         if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
1901                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1902                     0, SPA_MAXBLOCKSIZE);
1903         error = dmu_tx_assign(tx, TXG_NOWAIT);
1904         if (error) {
1905                 zfs_acl_ids_free(&acl_ids);
1906                 zfs_dirent_unlock(dl);
1907                 if (error == ERESTART) {
1908                         dmu_tx_wait(tx);
1909                         dmu_tx_abort(tx);
1910                         goto top;
1911                 }
1912                 dmu_tx_abort(tx);
1913                 ZFS_EXIT(zfsvfs);
1914                 return (error);
1915         }
1916
1917         /*
1918          * Create new node.
1919          */
1920         zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
1921
1922         if (fuid_dirtied)
1923                 zfs_fuid_sync(zfsvfs, tx);
1924         /*
1925          * Now put new name in parent dir.
1926          */
1927         (void) zfs_link_create(dl, zp, tx, ZNEW);
1928
1929         *vpp = ZTOV(zp);
1930
1931         txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1932         if (flags & FIGNORECASE)
1933                 txtype |= TX_CI;
1934         zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
1935             acl_ids.z_fuidp, vap);
1936
1937         zfs_acl_ids_free(&acl_ids);
1938         dmu_tx_commit(tx);
1939
1940         zfs_dirent_unlock(dl);
1941
1942         ZFS_EXIT(zfsvfs);
1943         return (0);
1944 }
1945
1946 /*
1947  * Remove a directory subdir entry.  If the current working
1948  * directory is the same as the subdir to be removed, the
1949  * remove will fail.
1950  *
1951  *      IN:     dvp     - vnode of directory to remove from.
1952  *              name    - name of directory to be removed.
1953  *              cwd     - vnode of current working directory.
1954  *              cr      - credentials of caller.
1955  *              ct      - caller context
1956  *              flags   - case flags
1957  *
1958  *      RETURN: 0 if success
1959  *              error code if failure
1960  *
1961  * Timestamps:
1962  *      dvp - ctime|mtime updated
1963  */
1964 /*ARGSUSED*/
1965 static int
1966 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
1967     caller_context_t *ct, int flags)
1968 {
1969         znode_t         *dzp = VTOZ(dvp);
1970         znode_t         *zp;
1971         vnode_t         *vp;
1972         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1973         zilog_t         *zilog;
1974         zfs_dirlock_t   *dl;
1975         dmu_tx_t        *tx;
1976         int             error;
1977         int             zflg = ZEXISTS;
1978
1979         ZFS_ENTER(zfsvfs);
1980         ZFS_VERIFY_ZP(dzp);
1981         zilog = zfsvfs->z_log;
1982
1983         if (flags & FIGNORECASE)
1984                 zflg |= ZCILOOK;
1985 top:
1986         zp = NULL;
1987
1988         /*
1989          * Attempt to lock directory; fail if entry doesn't exist.
1990          */
1991         if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1992             NULL, NULL)) {
1993                 ZFS_EXIT(zfsvfs);
1994                 return (error);
1995         }
1996
1997         vp = ZTOV(zp);
1998
1999         if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2000                 goto out;
2001         }
2002
2003         if (vp->v_type != VDIR) {
2004                 error = ENOTDIR;
2005                 goto out;
2006         }
2007
2008         if (vp == cwd) {
2009                 error = EINVAL;
2010                 goto out;
2011         }
2012
2013         vnevent_rmdir(vp, dvp, name, ct);
2014
2015         /*
2016          * Grab a lock on the directory to make sure that noone is
2017          * trying to add (or lookup) entries while we are removing it.
2018          */
2019         rw_enter(&zp->z_name_lock, RW_WRITER);
2020
2021         /*
2022          * Grab a lock on the parent pointer to make sure we play well
2023          * with the treewalk and directory rename code.
2024          */
2025         rw_enter(&zp->z_parent_lock, RW_WRITER);
2026
2027         tx = dmu_tx_create(zfsvfs->z_os);
2028         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2029         dmu_tx_hold_bonus(tx, zp->z_id);
2030         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2031         error = dmu_tx_assign(tx, TXG_NOWAIT);
2032         if (error) {
2033                 rw_exit(&zp->z_parent_lock);
2034                 rw_exit(&zp->z_name_lock);
2035                 zfs_dirent_unlock(dl);
2036                 VN_RELE(vp);
2037                 if (error == ERESTART) {
2038                         dmu_tx_wait(tx);
2039                         dmu_tx_abort(tx);
2040                         goto top;
2041                 }
2042                 dmu_tx_abort(tx);
2043                 ZFS_EXIT(zfsvfs);
2044                 return (error);
2045         }
2046
2047 #ifdef FREEBSD_NAMECACHE
2048         cache_purge(dvp);
2049 #endif
2050
2051         error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2052
2053         if (error == 0) {
2054                 uint64_t txtype = TX_RMDIR;
2055                 if (flags & FIGNORECASE)
2056                         txtype |= TX_CI;
2057                 zfs_log_remove(zilog, tx, txtype, dzp, name);
2058         }
2059
2060         dmu_tx_commit(tx);
2061
2062         rw_exit(&zp->z_parent_lock);
2063         rw_exit(&zp->z_name_lock);
2064 #ifdef FREEBSD_NAMECACHE
2065         cache_purge(vp);
2066 #endif
2067 out:
2068         zfs_dirent_unlock(dl);
2069
2070         VN_RELE(vp);
2071
2072         ZFS_EXIT(zfsvfs);
2073         return (error);
2074 }
2075
/*
 * Read as many directory entries as will fit into the provided
 * buffer from the given directory cursor position (specified in
 * the uio structure).
 *
 *      IN:     vp      - vnode of directory to read.
 *              uio     - structure supplying read location, range info,
 *                        and return buffer.
 *              cr      - credentials of caller.
 *              ncookies - if non-NULL, the caller wants per-entry
 *                        offsets ("cookies") returned as well.
 *
 *      OUT:    uio     - updated offset and range, buffer filled.
 *              eofp    - set to true if end-of-file detected
 *                        (may be NULL, in which case a local is used).
 *              ncookies - number of cookies actually filled in.
 *              cookies - array allocated here with malloc(M_TEMP) holding
 *                        the offset following each returned entry; on
 *                        error it is freed and *cookies is set to NULL.
 *
 *      RETURN: 0 if success
 *              error code if failure
 *
 * Timestamps:
 *      vp - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap is always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
 *
 * In this version "flags" is a local that is initialized to 0 and never
 * modified, so the V_RDDIR_ENTFLAGS (edirent) branches below are not
 * taken, and check_sysattrs is 0 unless TODO is defined.
 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
{
        znode_t         *zp = VTOZ(vp);
        iovec_t         *iovp;
        edirent_t       *eodp;
        dirent64_t      *odp;
        zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
        objset_t        *os;
        caddr_t         outbuf;
        size_t          bufsize;
        zap_cursor_t    zc;
        zap_attribute_t zap;
        uint_t          bytes_wanted;
        uint64_t        offset; /* must be unsigned; checks for < 1 */
        int             local_eof;
        int             outcount;
        int             error;
        uint8_t         prefetch;
        boolean_t       check_sysattrs;
        uint8_t         type;
        int             ncooks;
        u_long          *cooks = NULL;
        int             flags = 0;

        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);

        /*
         * If we are not given an eof variable,
         * use a local one.
         */
        if (eofp == NULL)
                eofp = &local_eof;

        /*
         * Check for valid iov_len.
         */
        if (uio->uio_iov->iov_len <= 0) {
                ZFS_EXIT(zfsvfs);
                return (EINVAL);
        }

        /*
         * Quit if directory has been removed (posix)
         */
        if ((*eofp = zp->z_unlinked) != 0) {
                ZFS_EXIT(zfsvfs);
                return (0);
        }

        error = 0;
        os = zfsvfs->z_os;
        offset = uio->uio_loffset;
        prefetch = zp->z_zn_prefetch;

        /*
         * Initialize the iterator cursor.
         */
        if (offset <= 3) {
                /*
                 * Start iteration from the beginning of the directory.
                 */
                zap_cursor_init(&zc, os, zp->z_id);
        } else {
                /*
                 * The offset is a serialized cursor.
                 */
                zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
        }

        /*
         * Get space to change directory entries into fs independent format.
         * A single kernel-space iovec is filled in place; anything else is
         * staged in a temporary buffer (outbuf) and copied out with
         * uiomove() after the loop.
         */
        iovp = uio->uio_iov;
        bytes_wanted = iovp->iov_len;
        if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
                bufsize = bytes_wanted;
                outbuf = kmem_alloc(bufsize, KM_SLEEP);
                odp = (struct dirent64 *)outbuf;
        } else {
                bufsize = bytes_wanted;
                odp = (struct dirent64 *)iovp->iov_base;
        }
        eodp = (struct edirent *)odp;

        if (ncookies != NULL) {
                /*
                 * Minimum entry size is dirent size and 1 byte for a file name.
                 * Dividing the residual by that minimum gives an upper bound
                 * on the number of entries, which sizes the cookie array.
                 */
                ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
                cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
                *cookies = cooks;
                *ncookies = ncooks;
        }
        /*
         * If this VFS supports the system attribute view interface; and
         * we're looking at an extended attribute directory; and we care
         * about normalization conflicts on this vfs; then we must check
         * for normalization conflicts with the sysattr name space.
         */
#ifdef TODO
        check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
            (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
            (flags & V_RDDIR_ENTFLAGS);
#else
        check_sysattrs = 0;
#endif

        /*
         * Transform to file-system independent format
         */
        outcount = 0;
        while (outcount < bytes_wanted) {
                ino64_t objnum;
                ushort_t reclen;
                off64_t *next;

                /*
                 * Special case `.', `..', and `.zfs'.
                 */
                if (offset == 0) {
                        (void) strcpy(zap.za_name, ".");
                        zap.za_normalization_conflict = 0;
                        objnum = zp->z_id;
                        type = DT_DIR;
                } else if (offset == 1) {
                        (void) strcpy(zap.za_name, "..");
                        zap.za_normalization_conflict = 0;
                        objnum = zp->z_phys->zp_parent;
                        type = DT_DIR;
                } else if (offset == 2 && zfs_show_ctldir(zp)) {
                        (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
                        zap.za_normalization_conflict = 0;
                        objnum = ZFSCTL_INO_ROOT;
                        type = DT_DIR;
                } else {
                        /*
                         * Grab next entry.
                         * ENOENT from the cursor means end of directory:
                         * set *eofp and leave the loop normally.  Any other
                         * error skips the copy-out via the update label.
                         */
                        if (error = zap_cursor_retrieve(&zc, &zap)) {
                                if ((*eofp = (error == ENOENT)) != 0)
                                        break;
                                else
                                        goto update;
                        }

                        /* A directory entry must be a single 8-byte integer. */
                        if (zap.za_integer_length != 8 ||
                            zap.za_num_integers != 1) {
                                cmn_err(CE_WARN, "zap_readdir: bad directory "
                                    "entry, obj = %lld, offset = %lld\n",
                                    (u_longlong_t)zp->z_id,
                                    (u_longlong_t)offset);
                                error = ENXIO;
                                goto update;
                        }

                        objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
                        /*
                         * MacOS X can extract the object type here such as:
                         * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
                         */
                        type = ZFS_DIRENT_TYPE(zap.za_first_integer);

                        if (check_sysattrs && !zap.za_normalization_conflict) {
#ifdef TODO
                                zap.za_normalization_conflict =
                                    xattr_sysattr_casechk(zap.za_name);
#else
                                panic("%s:%u: TODO", __func__, __LINE__);
#endif
                        }
                }

                if (flags & V_RDDIR_ENTFLAGS)
                        reclen = EDIRENT_RECLEN(strlen(zap.za_name));
                else
                        reclen = DIRENT64_RECLEN(strlen(zap.za_name));

                /*
                 * Will this entry fit in the buffer?
                 */
                if (outcount + reclen > bufsize) {
                        /*
                         * Did we manage to fit anything in the buffer?
                         */
                        if (!outcount) {
                                error = EINVAL;
                                goto update;
                        }
                        break;
                }
                if (flags & V_RDDIR_ENTFLAGS) {
                        /*
                         * Add extended flag entry:
                         */
                        eodp->ed_ino = objnum;
                        eodp->ed_reclen = reclen;
                        /* NOTE: ed_off is the offset for the *next* entry */
                        next = &(eodp->ed_off);
                        eodp->ed_eflags = zap.za_normalization_conflict ?
                            ED_CASE_CONFLICT : 0;
                        (void) strncpy(eodp->ed_name, zap.za_name,
                            EDIRENT_NAMELEN(reclen));
                        eodp = (edirent_t *)((intptr_t)eodp + reclen);
                } else {
                        /*
                         * Add normal entry:
                         */
                        odp->d_ino = objnum;
                        odp->d_reclen = reclen;
                        odp->d_namlen = strlen(zap.za_name);
                        (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
                        odp->d_type = type;
                        odp = (dirent64_t *)((intptr_t)odp + reclen);
                }
                outcount += reclen;

                ASSERT(outcount <= bufsize);

                /* Prefetch znode */
                if (prefetch)
                        dmu_prefetch(os, objnum, 0, 0);

                /*
                 * Move to the next entry, fill in the previous offset.
                 * Real zap entries advance the cursor; the synthetic
                 * entries at offsets 0..2 simply step to the next offset.
                 */
                if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
                        zap_cursor_advance(&zc);
                        offset = zap_cursor_serialize(&zc);
                } else {
                        offset += 1;
                }

                /* Record the offset that follows the entry just emitted. */
                if (cooks != NULL) {
                        *cooks++ = offset;
                        ncooks--;
                        KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
                }
        }
        zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

        /* Subtract unused cookies */
        if (ncookies != NULL)
                *ncookies -= ncooks;

        /*
         * Hand the data to the caller: either account for what was written
         * directly into the kernel iovec, or copy the staging buffer out.
         */
        if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
                iovp->iov_base += outcount;
                iovp->iov_len -= outcount;
                uio->uio_resid -= outcount;
        } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
                /*
                 * Reset the pointer.
                 */
                offset = uio->uio_loffset;
        }

update:
        /*
         * Common cleanup.  Paths that jump here directly skip the prefetch
         * reset, the cookie-count adjustment and the copy-out above.
         */
        zap_cursor_fini(&zc);
        if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
                kmem_free(outbuf, bufsize);

        /* End of directory (ENOENT from the cursor) is not an error. */
        if (error == ENOENT)
                error = 0;

        ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

        uio->uio_loffset = offset;
        ZFS_EXIT(zfsvfs);
        /* On failure the caller gets no cookie array back. */
        if (error != 0 && cookies != NULL) {
                free(*cookies, M_TEMP);
                *cookies = NULL;
                *ncookies = 0;
        }
        return (error);
}
2379
2380 ulong_t zfs_fsync_sync_cnt = 4;
2381
2382 static int
2383 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2384 {
2385         znode_t *zp = VTOZ(vp);
2386         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2387
2388         (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2389
2390         ZFS_ENTER(zfsvfs);
2391         ZFS_VERIFY_ZP(zp);
2392         zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
2393         ZFS_EXIT(zfsvfs);
2394         return (0);
2395 }
2396
2397
2398 /*
2399  * Get the requested file attributes and place them in the provided
2400  * vattr structure.
2401  *
2402  *      IN:     vp      - vnode of file.
2403  *              vap     - va_mask identifies requested attributes.
2404  *                        If AT_XVATTR set, then optional attrs are requested
2405  *              flags   - ATTR_NOACLCHECK (CIFS server context)
2406  *              cr      - credentials of caller.
2407  *              ct      - caller context
2408  *
2409  *      OUT:    vap     - attribute values.
2410  *
2411  *      RETURN: 0 (always succeeds)
2412  */
2413 /* ARGSUSED */
2414 static int
2415 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2416     caller_context_t *ct)
2417 {
2418         znode_t *zp = VTOZ(vp);
2419         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2420         znode_phys_t *pzp;
2421         int     error = 0;
2422         uint32_t blksize;
2423         u_longlong_t nblocks;
2424         uint64_t links;
2425         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
2426         xoptattr_t *xoap = NULL;
2427         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2428
2429         ZFS_ENTER(zfsvfs);
2430         ZFS_VERIFY_ZP(zp);
2431         pzp = zp->z_phys;
2432
2433         /*
2434          * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2435          * Also, if we are the owner don't bother, since owner should
2436          * always be allowed to read basic attributes of file.
2437          */
2438         if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
2439             (pzp->zp_uid != crgetuid(cr))) {
2440                 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2441                     skipaclchk, cr)) {
2442                         ZFS_EXIT(zfsvfs);
2443                         return (error);
2444                 }
2445         }
2446
2447         /*
2448          * Return all attributes.  It's cheaper to provide the answer
2449          * than to determine whether we were asked the question.
2450          */
2451
2452         mutex_enter(&zp->z_lock);
2453         vap->va_type = IFTOVT(pzp->zp_mode);
2454         vap->va_mode = pzp->zp_mode & ~S_IFMT;
2455         zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2456 //      vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2457         vap->va_nodeid = zp->z_id;
2458         if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2459                 links = pzp->zp_links + 1;
2460         else
2461                 links = pzp->zp_links;
2462         vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
2463         vap->va_size = pzp->zp_size;
2464         vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2465         vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
2466         vap->va_seq = zp->z_seq;
2467         vap->va_flags = 0;      /* FreeBSD: Reset chflags(2) flags. */
2468
2469         /*
2470          * Add in any requested optional attributes and the create time.
2471          * Also set the corresponding bits in the returned attribute bitmap.
2472          */
2473         if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2474                 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2475                         xoap->xoa_archive =
2476                             ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
2477                         XVA_SET_RTN(xvap, XAT_ARCHIVE);
2478                 }
2479
2480                 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2481                         xoap->xoa_readonly =
2482                             ((pzp->zp_flags & ZFS_READONLY) != 0);
2483                         XVA_SET_RTN(xvap, XAT_READONLY);
2484                 }
2485
2486                 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2487                         xoap->xoa_system =
2488                             ((pzp->zp_flags & ZFS_SYSTEM) != 0);
2489                         XVA_SET_RTN(xvap, XAT_SYSTEM);
2490                 }
2491
2492                 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2493                         xoap->xoa_hidden =
2494                             ((pzp->zp_flags & ZFS_HIDDEN) != 0);
2495                         XVA_SET_RTN(xvap, XAT_HIDDEN);
2496                 }
2497
2498                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2499                         xoap->xoa_nounlink =
2500                             ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
2501                         XVA_SET_RTN(xvap, XAT_NOUNLINK);
2502                 }
2503
2504                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2505                         xoap->xoa_immutable =
2506                             ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
2507                         XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2508                 }
2509
2510                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2511                         xoap->xoa_appendonly =
2512                             ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
2513                         XVA_SET_RTN(xvap, XAT_APPENDONLY);
2514                 }
2515
2516                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2517                         xoap->xoa_nodump =
2518                             ((pzp->zp_flags & ZFS_NODUMP) != 0);
2519                         XVA_SET_RTN(xvap, XAT_NODUMP);
2520                 }
2521
2522                 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2523                         xoap->xoa_opaque =
2524                             ((pzp->zp_flags & ZFS_OPAQUE) != 0);
2525                         XVA_SET_RTN(xvap, XAT_OPAQUE);
2526                 }
2527
2528                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2529                         xoap->xoa_av_quarantined =
2530                             ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
2531                         XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2532                 }
2533
2534                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2535                         xoap->xoa_av_modified =
2536                             ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
2537                         XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2538                 }
2539
2540                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2541                     vp->v_type == VREG &&
2542                     (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
2543                         size_t len;
2544                         dmu_object_info_t doi;
2545
2546                         /*
2547                          * Only VREG files have anti-virus scanstamps, so we
2548                          * won't conflict with symlinks in the bonus buffer.
2549                          */
2550                         dmu_object_info_from_db(zp->z_dbuf, &doi);
2551                         len = sizeof (xoap->xoa_av_scanstamp) +
2552                             sizeof (znode_phys_t);
2553                         if (len <= doi.doi_bonus_size) {
2554                                 /*
2555                                  * pzp points to the start of the
2556                                  * znode_phys_t. pzp + 1 points to the
2557                                  * first byte after the znode_phys_t.
2558                                  */
2559                                 (void) memcpy(xoap->xoa_av_scanstamp,
2560                                     pzp + 1,
2561                                     sizeof (xoap->xoa_av_scanstamp));
2562                                 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
2563                         }
2564                 }
2565
2566                 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2567                         ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
2568                         XVA_SET_RTN(xvap, XAT_CREATETIME);
2569                 }
2570         }
2571
2572         ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
2573         ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
2574         ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
2575         ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);
2576
2577         mutex_exit(&zp->z_lock);
2578
2579         dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
2580         vap->va_blksize = blksize;
2581         vap->va_bytes = nblocks << 9;   /* nblocks * 512 */
2582
2583         if (zp->z_blksz == 0) {
2584                 /*
2585                  * Block size hasn't been set; suggest maximal I/O transfers.
2586                  */
2587                 vap->va_blksize = zfsvfs->z_max_blksz;
2588         }
2589
2590         ZFS_EXIT(zfsvfs);
2591         return (0);
2592 }
2593
2594 /*
2595  * Set the file attributes to the values contained in the vattr
2596  * structure; the whole update is retried if dmu_tx_assign() says ERESTART.
2597  *
2598  *      IN:     vp      - vnode of file to be modified.
2599  *              vap     - new attribute values; va_mask selects them.
2600  *                        If AT_XVATTR set, then optional attrs are being set
2601  *              flags   - ATTR_UTIME set if non-default time values provided.
2602  *                      - ATTR_NOACLCHECK (CIFS context only).
2603  *              cr      - credentials of caller.
2604  *              ct      - caller context (not referenced by this function)
2605  *
2606  *      RETURN: 0 if success (including when va_mask is 0)
2607  *              error code if failure, e.g. EINVAL, EISDIR, EPERM,
2608  *              EOVERFLOW, EROFS, EDQUOT, or an error from a callee
2609  * Timestamps:
2610  *      vp - ctime updated, mtime updated if size changed.
2611  */
2612 /* ARGSUSED */
2613 static int
2614 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2615         caller_context_t *ct)
2616 {
2617         znode_t         *zp = VTOZ(vp);
2618         znode_phys_t    *pzp;
2619         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
2620         zilog_t         *zilog;
2621         dmu_tx_t        *tx;
2622         vattr_t         oldva;          /* current mode, uid and gid */
2623         xvattr_t        tmpxvattr;      /* xvattr REQ bits cleared from xvap */
2624         uint_t          mask = vap->va_mask;
2625         uint_t          saved_mask;     /* va_mask before trim_mask removal */
2626         uint64_t        saved_mode;     /* va_mode across the policy call */
2627         int             trim_mask = 0;  /* bits already granted by zfs_zaccess */
2628         uint64_t        new_mode;
2629         uint64_t        new_uid, new_gid;
2630         znode_t         *attrzp;        /* znode for pzp->zp_xattr, if any */
2631         int             need_policy = FALSE;    /* call secpolicy below? */
2632         int             err;
2633         zfs_fuid_info_t *fuidp = NULL;
2634         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
2635         xoptattr_t      *xoap;
2636         zfs_acl_t       *aclp = NULL;
2637         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2638         boolean_t fuid_dirtied = B_FALSE;       /* zfs_fuid_sync() needed */
2639
2640         if (mask == 0)                  /* no attributes requested */
2641                 return (0);
2642
2643         if (mask & AT_NOSET)
2644                 return (EINVAL);
2645
2646         ZFS_ENTER(zfsvfs);
2647         ZFS_VERIFY_ZP(zp);
2648
2649         pzp = zp->z_phys;
2650         zilog = zfsvfs->z_log;
2651
2652         /*
2653          * Make sure that if we have ephemeral uid/gid or xvattr specified
2654          * that file system is at proper version level
2655          */
2656
2657         if (zfsvfs->z_use_fuids == B_FALSE &&
2658             (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2659             ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2660             (mask & AT_XVATTR))) {
2661                 ZFS_EXIT(zfsvfs);
2662                 return (EINVAL);
2663         }
2664
2665         if (mask & AT_SIZE && vp->v_type == VDIR) {
2666                 ZFS_EXIT(zfsvfs);
2667                 return (EISDIR);
2668         }
2669
2670         if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2671                 ZFS_EXIT(zfsvfs);
2672                 return (EINVAL);
2673         }
2674
2675         /*
2676          * If this is an xvattr_t, then get a pointer to the structure of
2677          * optional attributes.  If this is NULL, then we have a vattr_t.
2678          */
2679         xoap = xva_getxoptattr(xvap);
2680
2681         xva_init(&tmpxvattr);
2682
2683         /*
2684          * Immutable files can only alter immutable bit and atime
2685          */
2686         if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
2687             ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2688             ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2689                 ZFS_EXIT(zfsvfs);
2690                 return (EPERM);
2691         }
2692
2693         if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
2694                 ZFS_EXIT(zfsvfs);
2695                 return (EPERM);
2696         }
2697
2698         /*
2699          * Verify timestamps don't overflow 32 bits.
2700          * ZFS can handle large timestamps, but 32bit syscalls can't
2701          * handle times greater than 2039.  This check should be removed
2702          * once large timestamps are fully supported.
2703          */
2704         if (mask & (AT_ATIME | AT_MTIME)) {
2705                 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2706                     ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2707                         ZFS_EXIT(zfsvfs);
2708                         return (EOVERFLOW);
2709                 }
2710         }
2711
2712 top:    /* re-entered from out: when dmu_tx_assign() returned ERESTART */
2713         attrzp = NULL;
2714
2715         if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2716                 ZFS_EXIT(zfsvfs);
2717                 return (EROFS);
2718         }
2719
2720         /*
2721          * First validate permissions
2722          */
2723
2724         if (mask & AT_SIZE) {
2725                 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2726                 if (err) {
2727                         ZFS_EXIT(zfsvfs);
2728                         return (err);
2729                 }
2730                 /*
2731                  * XXX - Note, we are not providing any open
2732                  * mode flags here (like FNDELAY), so we may
2733                  * block if there are locks present... this
2734                  * should be addressed in openat().
2735                  */
2736                 /* XXX - would it be OK to generate a log record here? */
2737                 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2738                 if (err) {
2739                         ZFS_EXIT(zfsvfs);
2740                         return (err);
2741                 }
2742         }
2743
2744         if (mask & (AT_ATIME|AT_MTIME) ||
2745             ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2746             XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2747             XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2748             XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2749             XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
2750                 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2751                     skipaclchk, cr);    /* nonzero result => need policy */
2752
2753         if (mask & (AT_UID|AT_GID)) {
2754                 int     idmask = (mask & (AT_UID|AT_GID));
2755                 int     take_owner;
2756                 int     take_group;
2757
2758                 /*
2759                  * NOTE: even if a new mode is being set,
2760                  * we may clear S_ISUID/S_ISGID bits.
2761                  */
2762
2763                 if (!(mask & AT_MODE))
2764                         vap->va_mode = pzp->zp_mode;
2765
2766                 /*
2767                  * Take ownership or chgrp to group we are a member of
2768                  */
2769
2770                 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2771                 take_group = (mask & AT_GID) &&
2772                     zfs_groupmember(zfsvfs, vap->va_gid, cr);
2773
2774                 /*
2775                  * If both AT_UID and AT_GID are set then take_owner and
2776                  * take_group must both be set in order to allow taking
2777                  * ownership.
2778                  *
2779                  * Otherwise, send the check through secpolicy_vnode_setattr()
2780                  *
2781                  */
2782
2783                 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2784                     ((idmask == AT_UID) && take_owner) ||
2785                     ((idmask == AT_GID) && take_group)) {
2786                         if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2787                             skipaclchk, cr) == 0) {
2788                                 /*
2789                                  * Remove setuid/setgid for non-privileged users
2790                                  */
2791                                 secpolicy_setid_clear(vap, vp, cr);
2792                                 trim_mask = (mask & (AT_UID|AT_GID));
2793                         } else {
2794                                 need_policy =  TRUE;
2795                         }
2796                 } else {
2797                         need_policy =  TRUE;
2798                 }
2799         }
2800
2801         mutex_enter(&zp->z_lock);       /* read current state under z_lock */
2802         oldva.va_mode = pzp->zp_mode;
2803         zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2804         if (mask & AT_XVATTR) {
2805                 /*
2806                  * Update xvattr mask to include only those attributes
2807                  * that are actually changing.
2808                  *
2809                  * the bits will be restored prior to actually setting
2810                  * the attributes so the caller thinks they were set.
2811                  */
2812                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2813                         if (xoap->xoa_appendonly !=
2814                             ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) {
2815                                 need_policy = TRUE;
2816                         } else {
2817                                 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2818                                 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
2819                         }
2820                 }
2821
2822                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2823                         if (xoap->xoa_nounlink !=
2824                             ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) {
2825                                 need_policy = TRUE;
2826                         } else {
2827                                 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2828                                 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
2829                         }
2830                 }
2831
2832                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2833                         if (xoap->xoa_immutable !=
2834                             ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) {
2835                                 need_policy = TRUE;
2836                         } else {
2837                                 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2838                                 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
2839                         }
2840                 }
2841
2842                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2843                         if (xoap->xoa_nodump !=
2844                             ((pzp->zp_flags & ZFS_NODUMP) != 0)) {
2845                                 need_policy = TRUE;
2846                         } else {
2847                                 XVA_CLR_REQ(xvap, XAT_NODUMP);
2848                                 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
2849                         }
2850                 }
2851
2852                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2853                         if (xoap->xoa_av_modified !=
2854                             ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) {
2855                                 need_policy = TRUE;
2856                         } else {
2857                                 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2858                                 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
2859                         }
2860                 }
2861
2862                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2863                         if ((vp->v_type != VREG &&
2864                             xoap->xoa_av_quarantined) ||
2865                             xoap->xoa_av_quarantined !=
2866                             ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) {
2867                                 need_policy = TRUE;
2868                         } else {
2869                                 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2870                                 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
2871                         }
2872                 }
2873
2874                 if (need_policy == FALSE &&
2875                     (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2876                     XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2877                         need_policy = TRUE;
2878                 }
2879         }
2880
2881         mutex_exit(&zp->z_lock);
2882
2883         if (mask & AT_MODE) {
2884                 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2885                         err = secpolicy_setid_setsticky_clear(vp, vap,
2886                             &oldva, cr);
2887                         if (err) {
2888                                 ZFS_EXIT(zfsvfs);
2889                                 return (err);
2890                         }
2891                         trim_mask |= AT_MODE;
2892                 } else {
2893                         need_policy = TRUE;
2894                 }
2895         }
2896
2897         if (need_policy) {
2898                 /*
2899                  * If trim_mask is set then take ownership
2900                  * has been granted or write_acl is present and user
2901                  * has the ability to modify mode.  In that case remove
2902                  * UID|GID and or MODE from mask so that
2903                  * secpolicy_vnode_setattr() doesn't revoke it.
2904                  */
2905
2906                 if (trim_mask) {
2907                         saved_mask = vap->va_mask;
2908                         vap->va_mask &= ~trim_mask;
2909                         if (trim_mask & AT_MODE) {
2910                                 /*
2911                                  * Save the mode, as secpolicy_vnode_setattr()
2912                                  * will overwrite it with ova.va_mode.
2913                                  */
2914                                 saved_mode = vap->va_mode;
2915                         }
2916                 }
2917                 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2918                     (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2919                 if (err) {
2920                         ZFS_EXIT(zfsvfs);
2921                         return (err);
2922                 }
2923
2924                 if (trim_mask) {
2925                         vap->va_mask |= saved_mask;     /* put trimmed bits back */
2926                         if (trim_mask & AT_MODE) {
2927                                 /*
2928                                  * Recover the mode after
2929                                  * secpolicy_vnode_setattr().
2930                                  */
2931                                 vap->va_mode = saved_mode;
2932                         }
2933                 }
2934         }
2935
2936         /*
2937          * secpolicy_vnode_setattr, or take ownership may have
2938          * changed va_mask
2939          */
2940         mask = vap->va_mask;
2941
2942         tx = dmu_tx_create(zfsvfs->z_os);       /* ended (abort/commit) at out: */
2943         dmu_tx_hold_bonus(tx, zp->z_id);
2944
2945         if (mask & AT_MODE) {
2946                 uint64_t pmode = pzp->zp_mode;
2947
2948                 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2949
2950                 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
2951                         goto out;
2952                 if (pzp->zp_acl.z_acl_extern_obj) {
2953                         /* Are we upgrading ACL from old V0 format to new V1 */
2954                         if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
2955                             pzp->zp_acl.z_acl_version ==
2956                             ZFS_ACL_VERSION_INITIAL) {
2957                                 dmu_tx_hold_free(tx,
2958                                     pzp->zp_acl.z_acl_extern_obj, 0,
2959                                     DMU_OBJECT_END);
2960                                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2961                                     0, aclp->z_acl_bytes);
2962                         } else {
2963                                 dmu_tx_hold_write(tx,
2964                                     pzp->zp_acl.z_acl_extern_obj, 0,
2965                                     aclp->z_acl_bytes);
2966                         }
2967                 } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2968                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2969                             0, aclp->z_acl_bytes);
2970                 }
2971         }
2972
2973         if (mask & (AT_UID | AT_GID)) {
2974                 if (pzp->zp_xattr) {
2975                         err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
2976                         if (err)
2977                                 goto out;
2978                         dmu_tx_hold_bonus(tx, attrzp->z_id);
2979                 }
2980                 if (mask & AT_UID) {
2981                         new_uid = zfs_fuid_create(zfsvfs,
2982                             (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2983                         if (new_uid != pzp->zp_uid &&
2984                             zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) {
2985                                 err = EDQUOT;
2986                                 goto out;
2987                         }
2988                 }
2989
2990                 if (mask & AT_GID) {
2991                         new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
2992                             cr, ZFS_GROUP, &fuidp);
2993                         if (new_gid != pzp->zp_gid &&
2994                             zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) {
2995                                 err = EDQUOT;
2996                                 goto out;
2997                         }
2998                 }
2999                 fuid_dirtied = zfsvfs->z_fuid_dirty;    /* sync FUIDs in this tx */
3000                 if (fuid_dirtied) {
3001                         if (zfsvfs->z_fuid_obj == 0) {  /* hold a new object */
3002                                 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
3003                                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3004                                     FUID_SIZE_ESTIMATE(zfsvfs));
3005                                 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
3006                                     FALSE, NULL);
3007                         } else {
3008                                 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
3009                                 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
3010                                     FUID_SIZE_ESTIMATE(zfsvfs));
3011                         }
3012                 }
3013         }
3014
3015         err = dmu_tx_assign(tx, TXG_NOWAIT);
3016         if (err) {
3017                 if (err == ERESTART)    /* wait here, then out: jumps to top */
3018                         dmu_tx_wait(tx);
3019                 goto out;
3020         }
3021
3022         dmu_buf_will_dirty(zp->z_dbuf, tx);
3023
3024         /*
3025          * Set each attribute requested.
3026          * We group settings according to the locks they need to acquire.
3027          *
3028          * Note: you cannot set ctime directly, although it will be
3029          * updated as a side-effect of calling this function.
3030          */
3031
3032         mutex_enter(&zp->z_lock);
3033
3034         if (mask & AT_MODE) {
3035                 mutex_enter(&zp->z_acl_lock);
3036                 zp->z_phys->zp_mode = new_mode;
3037                 err = zfs_aclset_common(zp, aclp, cr, tx);
3038                 ASSERT3U(err, ==, 0);
3039                 mutex_exit(&zp->z_acl_lock);
3040         }
3041
3042         if (attrzp)     /* xattr znode gets the same uid/gid as zp below */
3043                 mutex_enter(&attrzp->z_lock);
3044
3045         if (mask & AT_UID) {
3046                 pzp->zp_uid = new_uid;
3047                 if (attrzp)
3048                         attrzp->z_phys->zp_uid = new_uid;
3049         }
3050
3051         if (mask & AT_GID) {
3052                 pzp->zp_gid = new_gid;
3053                 if (attrzp)
3054                         attrzp->z_phys->zp_gid = new_gid;
3055         }
3056
3057         if (attrzp)
3058                 mutex_exit(&attrzp->z_lock);
3059
3060         if (mask & AT_ATIME)
3061                 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
3062
3063         if (mask & AT_MTIME)
3064                 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
3065
3066         /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3067         if (mask & AT_SIZE)
3068                 zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
3069         else if (mask != 0)
3070                 zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
3071         /*
3072          * Do this after setting timestamps to prevent timestamp
3073          * update from toggling bit
3074          */
3075
3076         if (xoap && (mask & AT_XVATTR)) {
3077
3078                 /*
3079                  * restore trimmed off masks
3080                  * so that return masks can be set for caller.
3081                  */
3082
3083                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3084                         XVA_SET_REQ(xvap, XAT_APPENDONLY);
3085                 }
3086                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3087                         XVA_SET_REQ(xvap, XAT_NOUNLINK);
3088                 }
3089                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3090                         XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3091                 }
3092                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3093                         XVA_SET_REQ(xvap, XAT_NODUMP);
3094                 }
3095                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3096                         XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3097                 }
3098                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3099                         XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3100                 }
3101
3102                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
3103                         size_t len;
3104                         dmu_object_info_t doi;
3105
3106                         ASSERT(vp->v_type == VREG);
3107
3108                         /* Grow the bonus buffer if necessary. */
3109                         dmu_object_info_from_db(zp->z_dbuf, &doi);
3110                         len = sizeof (xoap->xoa_av_scanstamp) +
3111                             sizeof (znode_phys_t);
3112                         if (len > doi.doi_bonus_size)
3113                                 VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
3114                 }
3115                 zfs_xvattr_set(zp, xvap);
3116         }
3117
3118         if (fuid_dirtied)
3119                 zfs_fuid_sync(zfsvfs, tx);
3120
3121         if (mask != 0)
3122                 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3123
3124         mutex_exit(&zp->z_lock);
3125
3126 out:    /* common exit: drop per-attempt resources, then end the tx */
3127         if (attrzp)
3128                 VN_RELE(ZTOV(attrzp));
3129
3130         if (aclp) {
3131                 zfs_acl_free(aclp);
3132                 aclp = NULL;
3133         }
3134
3135         if (fuidp) {
3136                 zfs_fuid_info_free(fuidp);
3137                 fuidp = NULL;
3138         }
3139
3140         if (err)
3141                 dmu_tx_abort(tx);
3142         else
3143                 dmu_tx_commit(tx);
3144
3145         if (err == ERESTART)    /* tx was aborted just above; start over */
3146                 goto top;
3147
3148         ZFS_EXIT(zfsvfs);
3149         return (err);
3150 }
3151
3152 typedef struct zfs_zlock {      /* list entry made by zfs_rename_lock() */
3153         krwlock_t       *zl_rwlock;     /* lock we acquired */
3154         znode_t         *zl_znode;      /* znode we held, or NULL */
3155         struct zfs_zlock *zl_next;      /* next in list (pushed earlier) */
3156 } zfs_zlock_t;
3157
3158 /*
3159  * Unwind the list built by zfs_rename_lock(): rele znodes, drop the locks.
3160  */
3161 static void
3162 zfs_rename_unlock(zfs_zlock_t **zlpp)
3163 {
3164         zfs_zlock_t *entry = *zlpp;
3165
3166         for (; entry != NULL; entry = *zlpp) {
3167                 if (entry->zl_znode != NULL)
3168                         VN_RELE(ZTOV(entry->zl_znode));
3169                 rw_exit(entry->zl_rwlock);
3170                 *zlpp = entry->zl_next;         /* unlink before freeing */
3171                 kmem_free(entry, sizeof (zfs_zlock_t));
3172         }
3173 }
3174
3175 /*
3176  * Search back through the directory tree, using the ".." entries, and
3177  * lock each directory in the chain to prevent concurrent renames.  Fail
3178  * (EINVAL) any attempt to move a directory into one of its own descendants.
3179  * XXX - z_parent_lock can overlap with map or grow locks
3180  */
3181 static int
3182 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3183 {
3184         zfs_zlock_t     *zl;            /* entry for the lock just taken */
3185         znode_t         *zp = tdzp;     /* znode the walk is currently at */
3186         uint64_t        rootid = zp->z_zfsvfs->z_root;
3187         uint64_t        *oidp = &zp->z_id;      /* id checked on this pass */
3188         krwlock_t       *rwlp = &szp->z_parent_lock;    /* lock for this pass */
3189         krw_t           rw = RW_WRITER; /* RW_READER after the first pass */
3190
3191         /*
3192          * First pass write-locks szp and compares to zp->z_id.
3193          * Later passes read-lock zp and compare to zp->z_parent.
3194          */
3195         do {
3196                 if (!rw_tryenter(rwlp, rw)) {
3197                         /*
3198                          * Another thread is renaming in this path.
3199                          * Note that if we are a WRITER, we don't have any
3200                          * parent_locks held yet.
3201                          */
3202                         if (rw == RW_READER && zp->z_id > szp->z_id) {
3203                                 /*
3204                                  * Drop our locks and restart
3205                                  */
3206                                 zfs_rename_unlock(&zl); /* zl == *zlpp here */
3207                                 *zlpp = NULL;
3208                                 zp = tdzp;
3209                                 oidp = &zp->z_id;
3210                                 rwlp = &szp->z_parent_lock;
3211                                 rw = RW_WRITER;
3212                                 continue;       /* re-tests the while condition */
3213                         } else {
3214                                 /*
3215                                  * Wait for other thread to drop its locks
3216                                  */
3217                                 rw_enter(rwlp, rw);
3218                         }
3219                 }
3220
3221                 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3222                 zl->zl_rwlock = rwlp;
3223                 zl->zl_znode = NULL;
3224                 zl->zl_next = *zlpp;
3225                 *zlpp = zl;             /* pushed before any return below */
3226
3227                 if (*oidp == szp->z_id)         /* We're a descendant of szp */
3228                         return (EINVAL);
3229
3230                 if (*oidp == rootid)            /* We've hit the top */
3231                         return (0);
3232
3233                 if (rw == RW_READER) {          /* i.e. not the first pass */
3234                         int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
3235                         if (error)
3236                                 return (error);
3237                         zl->zl_znode = zp;      /* VN_RELE'd by zfs_rename_unlock */
3238                 }
3239                 oidp = &zp->z_phys->zp_parent;  /* next pass checks the parent */
3240                 rwlp = &zp->z_parent_lock;
3241                 rw = RW_READER;
3242
3243         } while (zp->z_id != sdzp->z_id);
3244
3245         return (0);
3246 }
3247
/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 *      IN:     sdvp    - Source directory containing the "old entry".
 *              snm     - Old entry name.
 *              tdvp    - Target directory to contain the "new entry".
 *              tnm     - New entry name.
 *              cr      - credentials of caller.
 *              ct      - caller context
 *              flags   - case flags
 *
 *      RETURN: 0 if success
 *              error code if failure
 *
 * Timestamps:
 *      sdvp,tdvp - ctime|mtime updated
 *
 * Errors produced directly by this function:
 *      EXDEV   - sdvp and tdvp live on different file systems.
 *      EILSEQ  - tnm fails UTF-8 validation on a z_utf8 file system.
 *      EINVAL  - exactly one of the two directories is flagged ZFS_XATTR;
 *                the source lookup failed and snm is "." or "..";
 *                or the target lookup failed and tnm is "..".
 *      ENOTDIR - source is a directory but the existing target is not.
 *      EISDIR  - source is not a directory but the existing target is.
 * Failures reported by the helpers called below are handed back unchanged.
 *
 * If dmu_tx_assign() reports ERESTART, every lock and hold taken so far
 * is dropped and the whole operation is retried from the "top" label.
 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
        znode_t         *tdzp, *szp, *tzp;
        znode_t         *sdzp = VTOZ(sdvp);
        zfsvfs_t        *zfsvfs = sdzp->z_zfsvfs;
        zilog_t         *zilog;
        vnode_t         *realvp;
        zfs_dirlock_t   *sdl, *tdl;
        dmu_tx_t        *tx;
        zfs_zlock_t     *zl;
        int             cmp, serr, terr;
        int             error = 0;
        int             zflg = 0;

        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(sdzp);
        zilog = zfsvfs->z_log;

        /*
         * Make sure we have the real vp for the target directory.
         */
        if (VOP_REALVP(tdvp, &realvp, ct) == 0)
                tdvp = realvp;

        /* Renames never cross file system boundaries. */
        if (tdvp->v_vfsp != sdvp->v_vfsp) {
                ZFS_EXIT(zfsvfs);
                return (EXDEV);
        }

        tdzp = VTOZ(tdvp);
        ZFS_VERIFY_ZP(tdzp);
        /* Only the new name is validated here; snm must already exist. */
        if (zfsvfs->z_utf8 && u8_validate(tnm,
            strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
                ZFS_EXIT(zfsvfs);
                return (EILSEQ);
        }

        if (flags & FIGNORECASE)
                zflg |= ZCILOOK;

        /*
         * Retry point: reached again after an ERESTART from dmu_tx_assign().
         * The znode and lock-list pointers are reset so the cleanup code
         * never sees values left over from the previous attempt.
         */
top:
        szp = NULL;
        tzp = NULL;
        zl = NULL;

        /*
         * This is to prevent the creation of links into attribute space
         * by renaming a linked file into/outof an attribute directory.
         * See the comment in zfs_link() for why this is considered bad.
         */
        if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
            (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
                ZFS_EXIT(zfsvfs);
                return (EINVAL);
        }

        /*
         * Lock source and target directory entries.  To prevent deadlock,
         * a lock ordering must be defined.  We lock the directory with
         * the smallest object id first, or if it's a tie, the one with
         * the lexically first name.
         *
         * cmp < 0 means "lock the source entry first"; otherwise the
         * target entry is locked first.
         */
        if (sdzp->z_id < tdzp->z_id) {
                cmp = -1;
        } else if (sdzp->z_id > tdzp->z_id) {
                cmp = 1;
        } else {
                /*
                 * First compare the two name arguments without
                 * considering any case folding.
                 */
                int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);

                cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
                ASSERT(error == 0 || !zfsvfs->z_utf8);
                if (cmp == 0) {
                        /*
                         * POSIX: "If the old argument and the new argument
                         * both refer to links to the same existing file,
                         * the rename() function shall return successfully
                         * and perform no other action."
                         */
                        ZFS_EXIT(zfsvfs);
                        return (0);
                }
                /*
                 * If the file system is case-folding, then we may
                 * have some more checking to do.  A case-folding file
                 * system is either supporting mixed case sensitivity
                 * access or is completely case-insensitive.  Note
                 * that the file system is always case preserving.
                 *
                 * In mixed sensitivity mode case sensitive behavior
                 * is the default.  FIGNORECASE must be used to
                 * explicitly request case insensitive behavior.
                 *
                 * If the source and target names provided differ only
                 * by case (e.g., a request to rename 'tim' to 'Tim'),
                 * we will treat this as a special case in the
                 * case-insensitive mode: as long as the source name
                 * is an exact match, we will allow this to proceed as
                 * a name-change request.
                 */
                if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
                    (zfsvfs->z_case == ZFS_CASE_MIXED &&
                    flags & FIGNORECASE)) &&
                    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
                    &error) == 0) {
                        /*
                         * case preserving rename request, require exact
                         * name matches
                         */
                        zflg |= ZCIEXACT;
                        zflg &= ~ZCILOOK;
                }
        }

        /*
         * If the source and destination directories are the same, we should
         * grab the z_name_lock of that directory only once.
         *
         * ZHAVELOCK is passed to both zfs_dirent_lock() calls below; every
         * exit path that follows must drop z_name_lock itself when
         * sdzp == tdzp.
         */
        if (sdzp == tdzp) {
                zflg |= ZHAVELOCK;
                rw_enter(&sdzp->z_name_lock, RW_READER);
        }

        /*
         * Take the two directory-entry locks in the order chosen above.
         * Both calls are always made; failures are sorted out afterwards.
         * ZRENAMING is added to whichever lock is requested second.
         */
        if (cmp < 0) {
                serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
                    ZEXISTS | zflg, NULL, NULL);
                terr = zfs_dirent_lock(&tdl,
                    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
        } else {
                terr = zfs_dirent_lock(&tdl,
                    tdzp, tnm, &tzp, zflg, NULL, NULL);
                serr = zfs_dirent_lock(&sdl,
                    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
                    NULL, NULL);
        }

        if (serr) {
                /*
                 * Source entry invalid or not there.
                 */
                if (!terr) {
                        /* Undo the target lock that did succeed. */
                        zfs_dirent_unlock(tdl);
                        if (tzp)
                                VN_RELE(ZTOV(tzp));
                }

                if (sdzp == tdzp)
                        rw_exit(&sdzp->z_name_lock);

                /* "." and ".." as a source are reported as EINVAL. */
                if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
                        serr = EINVAL;
                ZFS_EXIT(zfsvfs);
                return (serr);
        }
        if (terr) {
                /* Source lock succeeded but target did not: undo source. */
                zfs_dirent_unlock(sdl);
                VN_RELE(ZTOV(szp));

                if (sdzp == tdzp)
                        rw_exit(&sdzp->z_name_lock);

                /* ".." as a target is reported as EINVAL. */
                if (strcmp(tnm, "..") == 0)
                        terr = EINVAL;
                ZFS_EXIT(zfsvfs);
                return (terr);
        }

        /*
         * Must have write access at the source to remove the old entry
         * and write access at the target to create the new entry.
         * Note that if target and source are the same, this can be
         * done in a single check.
         */

        if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
                goto out;

        if (ZTOV(szp)->v_type == VDIR) {
                /*
                 * Check to make sure rename is valid.
                 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
                 *
                 * Any locks collected in zl are released at "out" or in
                 * the dmu_tx_assign() failure path.
                 */
                if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
                        goto out;
        }

        /*
         * Does target exist?
         */
        if (tzp) {
                /*
                 * Source and target must be the same type.
                 */
                if (ZTOV(szp)->v_type == VDIR) {
                        if (ZTOV(tzp)->v_type != VDIR) {
                                error = ENOTDIR;
                                goto out;
                        }
                } else {
                        if (ZTOV(tzp)->v_type == VDIR) {
                                error = EISDIR;
                                goto out;
                        }
                }
                /*
                 * POSIX dictates that when the source and target
                 * entries refer to the same file object, rename
                 * must do nothing and exit without error.
                 */
                if (szp->z_id == tzp->z_id) {
                        error = 0;
                        goto out;
                }
        }

        /*
         * Event notifications are issued before the transaction is
         * assigned, so they are issued again if the operation restarts.
         */
        vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
        if (tzp)
                vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);

        /*
         * notify the target directory if it is not the same
         * as source directory.
         */
        if (tdvp != sdvp) {
                vnevent_rename_dest_dir(tdvp, ct);
        }

        /*
         * Declare everything the transaction may touch: the bonus buffers
         * of the moved object and both directories, both directory ZAPs,
         * the displaced target (if any), and the unlinked-object ZAP.
         */
        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_bonus(tx, szp->z_id);       /* nlink changes */
        dmu_tx_hold_bonus(tx, sdzp->z_id);      /* nlink changes */
        dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
        dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
        if (sdzp != tdzp)
                dmu_tx_hold_bonus(tx, tdzp->z_id);      /* nlink changes */
        if (tzp)
                dmu_tx_hold_bonus(tx, tzp->z_id);       /* parent changes */
        dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
        error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
                /*
                 * Could not get into a transaction group.  Release every
                 * lock and vnode hold before either waiting and retrying
                 * (ERESTART) or giving up.
                 */
                if (zl != NULL)
                        zfs_rename_unlock(&zl);
                zfs_dirent_unlock(sdl);
                zfs_dirent_unlock(tdl);

                if (sdzp == tdzp)
                        rw_exit(&sdzp->z_name_lock);

                VN_RELE(ZTOV(szp));
                if (tzp)
                        VN_RELE(ZTOV(tzp));
                if (error == ERESTART) {
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
                }
                dmu_tx_abort(tx);
                ZFS_EXIT(zfsvfs);
                return (error);
        }

        if (tzp)        /* Attempt to remove the existing target */
                error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);

        if (error == 0) {
                /* Create the new name first, then remove the old one. */
                error = zfs_link_create(tdl, szp, tx, ZRENAMING);
                if (error == 0) {
                        szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;

                        error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
                        ASSERT(error == 0);

                        /* Log using the names recorded in the dirlocks. */
                        zfs_log_rename(zilog, tx,
                            TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
                            sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);

                        /* Update path information for the target vnode */
                        vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm));
                }
#ifdef FREEBSD_NAMECACHE
                if (error == 0) {
                        cache_purge(sdvp);
                        cache_purge(tdvp);
                }
#endif
        }

        /* The transaction is committed even if a link operation failed. */
        dmu_tx_commit(tx);
        /*
         * Common exit: both dirent locks are held and szp is referenced on
         * every path that reaches this label.
         */
out:
        if (zl != NULL)
                zfs_rename_unlock(&zl);

        zfs_dirent_unlock(sdl);
        zfs_dirent_unlock(tdl);

        if (sdzp == tdzp)
                rw_exit(&sdzp->z_name_lock);

        VN_RELE(ZTOV(szp));
        if (tzp)
                VN_RELE(ZTOV(tzp));

        ZFS_EXIT(zfsvfs);

        return (error);
}
3577
3578 /*
3579  * Insert the indicated symbolic reference entry into the directory.
3580  *
3581  *      IN:     dvp     - Directory to contain new symbolic link.
3582  *              link    - Name for new symlink entry.
3583  *              vap     - Attributes of new entry.
3584  *              target  - Target path of new symlink.
3585  *              cr      - credentials of caller.
3586  *              ct      - caller context
3587  *              flags   - case flags
3588  *
3589  *      RETURN: 0 if success
3590  *              error code if failure
3591  *
3592  * Timestamps:
3593  *      dvp - ctime|mtime updated
3594  */
3595 /*ARGSUSED*/
3596 static int
3597 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
3598     cred_t *cr, kthread_t *td)
3599 {
3600         znode_t         *zp, *dzp = VTOZ(dvp);
3601         zfs_dirlock_t   *dl;
3602         dmu_tx_t        *tx;
3603         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
3604         zilog_t         *zilog;
3605         int             len = strlen(link);
3606         int             error;
3607         int             zflg = ZNEW;
3608         zfs_acl_ids_t   acl_ids;
3609         boolean_t       fuid_dirtied;
3610         int             flags = 0;
3611
3612         ASSERT(vap->va_type == VLNK);
3613
3614         ZFS_ENTER(zfsvfs);
3615         ZFS_VERIFY_ZP(dzp);
3616         zilog = zfsvfs->z_log;
3617
3618         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3619             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3620                 ZFS_EXIT(zfsvfs);
3621                 return (EILSEQ);
3622         }
3623         if (flags & FIGNORECASE)
3624                 zflg |= ZCILOOK;
3625 top:
3626         if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3627                 ZFS_EXIT(zfsvfs);
3628                 return (error);
3629         }
3630
3631         if (len > MAXPATHLEN) {
3632                 ZFS_EXIT(zfsvfs);
3633                 return (ENAMETOOLONG);
3634         }
3635
3636         /*
3637          * Attempt to lock directory; fail if entry already exists.
3638          */
3639         error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3640         if (error) {
3641                 ZFS_EXIT(zfsvfs);
3642                 return (error);
3643         }
3644
3645         VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids));
3646         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
3647                 zfs_acl_ids_free(&acl_ids);
3648                 zfs_dirent_unlock(dl);
3649                 ZFS_EXIT(zfsvfs);
3650                 return (EDQUOT);
3651         }
3652         tx = dmu_tx_create(zfsvfs->z_os);
3653         fuid_dirtied = zfsvfs->z_fuid_dirty;
3654         dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3655         dmu_tx_hold_bonus(tx, dzp->z_id);
3656         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3657         if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
3658                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
3659         if (fuid_dirtied)
3660                 zfs_fuid_txhold(zfsvfs, tx);
3661         error = dmu_tx_assign(tx, TXG_NOWAIT);
3662         if (error) {
3663                 zfs_acl_ids_free(&acl_ids);
3664                 zfs_dirent_unlock(dl);
3665                 if (error == ERESTART) {
3666                         dmu_tx_wait(tx);
3667                         dmu_tx_abort(tx);
3668                         goto top;
3669                 }
3670                 dmu_tx_abort(tx);
3671                 ZFS_EXIT(zfsvfs);
3672                 return (error);
3673         }
3674
3675         dmu_buf_will_dirty(dzp->z_dbuf, tx);
3676
3677         /*
3678          * Create a new object for the symlink.
3679          * Put the link content into bonus buffer if it will fit;
3680          * otherwise, store it just like any other file data.
3681          */
3682         if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
3683                 zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids);
3684                 if (len != 0)
3685                         bcopy(link, zp->z_phys + 1, len);
3686         } else {
3687                 dmu_buf_t *dbp;
3688
3689                 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
3690
3691                 if (fuid_dirtied)
3692                         zfs_fuid_sync(zfsvfs, tx);
3693                 /*
3694                  * Nothing can access the znode yet so no locking needed
3695                  * for growing the znode's blocksize.
3696                  */
3697                 zfs_grow_blocksize(zp, len, tx);
3698
3699                 VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
3700                     zp->z_id, 0, FTAG, &dbp));
3701                 dmu_buf_will_dirty(dbp, tx);
3702
3703                 ASSERT3U(len, <=, dbp->db_size);
3704                 bcopy(link, dbp->db_data, len);
3705                 dmu_buf_rele(dbp, FTAG);
3706         }
3707         zp->z_phys->zp_size = len;
3708
3709         /*
3710          * Insert the new object into the directory.
3711          */
3712         (void) zfs_link_create(dl, zp, tx, ZNEW);
3713         if (error == 0) {
3714                 uint64_t txtype = TX_SYMLINK;
3715                 if (flags & FIGNORECASE)
3716                         txtype |= TX_CI;
3717                 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3718                 *vpp = ZTOV(zp);
3719         }
3720
3721         zfs_acl_ids_free(&acl_ids);
3722
3723         dmu_tx_commit(tx);
3724
3725         zfs_dirent_unlock(dl);
3726
3727         ZFS_EXIT(zfsvfs);
3728         return (error);
3729 }
3730
3731 /*
3732  * Return, in the buffer contained in the provided uio structure,
3733  * the symbolic path referred to by vp.
3734  *
3735  *      IN:     vp      - vnode of symbolic link.
3736  *              uoip    - structure to contain the link path.
3737  *              cr      - credentials of caller.
3738  *              ct      - caller context
3739  *
3740  *      OUT:    uio     - structure to contain the link path.
3741  *
3742  *      RETURN: 0 if success
3743  *              error code if failure
3744  *
3745  * Timestamps:
3746  *      vp - atime updated
3747  */
3748 /* ARGSUSED */
3749 static int
3750 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
3751 {
3752         znode_t         *zp = VTOZ(vp);
3753         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
3754         size_t          bufsz;
3755         int             error;
3756
3757         ZFS_ENTER(zfsvfs);
3758         ZFS_VERIFY_ZP(zp);
3759
3760         bufsz = (size_t)zp->z_phys->zp_size;
3761         if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
3762                 error = uiomove(zp->z_phys + 1,
3763                     MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
3764         } else {
3765                 dmu_buf_t *dbp;
3766                 error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
3767                 if (error) {
3768                         ZFS_EXIT(zfsvfs);
3769                         return (error);
3770                 }
3771                 error = uiomove(dbp->db_data,
3772                     MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
3773                 dmu_buf_rele(dbp, FTAG);
3774         }
3775
3776         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3777         ZFS_EXIT(zfsvfs);
3778         return (error);
3779 }
3780
3781 /*
3782  * Insert a new entry into directory tdvp referencing svp.
3783  *
3784  *      IN:     tdvp    - Directory to contain new entry.
3785  *              svp     - vnode of new entry.
3786  *              name    - name of new entry.
3787  *              cr      - credentials of caller.
3788  *              ct      - caller context
3789  *
3790  *      RETURN: 0 if success
3791  *              error code if failure
3792  *
3793  * Timestamps:
3794  *      tdvp - ctime|mtime updated
3795  *       svp - ctime updated
3796  */
3797 /* ARGSUSED */
3798 static int
3799 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
3800     caller_context_t *ct, int flags)
3801 {
3802         znode_t         *dzp = VTOZ(tdvp);
3803         znode_t         *tzp, *szp;
3804         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
3805         zilog_t         *zilog;
3806         zfs_dirlock_t   *dl;
3807         dmu_tx_t        *tx;
3808         vnode_t         *realvp;
3809         int             error;
3810         int             zf = ZNEW;
3811         uid_t           owner;
3812
3813         ASSERT(tdvp->v_type == VDIR);
3814
3815         ZFS_ENTER(zfsvfs);
3816         ZFS_VERIFY_ZP(dzp);
3817         zilog = zfsvfs->z_log;
3818
3819         if (VOP_REALVP(svp, &realvp, ct) == 0)
3820                 svp = realvp;
3821
3822         if (svp->v_vfsp != tdvp->v_vfsp) {
3823                 ZFS_EXIT(zfsvfs);
3824                 return (EXDEV);
3825         }
3826         szp = VTOZ(svp);
3827         ZFS_VERIFY_ZP(szp);
3828
3829         if (zfsvfs->z_utf8 && u8_validate(name,
3830             strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3831                 ZFS_EXIT(zfsvfs);
3832                 return (EILSEQ);
3833         }
3834         if (flags & FIGNORECASE)
3835                 zf |= ZCILOOK;
3836
3837 top:
3838         /*
3839          * We do not support links between attributes and non-attributes
3840          * because of the potential security risk of creating links
3841          * into "normal" file space in order to circumvent restrictions
3842          * imposed in attribute space.
3843          */
3844         if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
3845             (dzp->z_phys->zp_flags & ZFS_XATTR)) {
3846                 ZFS_EXIT(zfsvfs);
3847                 return (EINVAL);
3848         }
3849
3850         /*
3851          * POSIX dictates that we return EPERM here.
3852          * Better choices include ENOTSUP or EISDIR.
3853          */
3854         if (svp->v_type == VDIR) {
3855                 ZFS_EXIT(zfsvfs);
3856                 return (EPERM);
3857         }
3858
3859         owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER);
3860         if (owner != crgetuid(cr) &&
3861             secpolicy_basic_link(svp, cr) != 0) {
3862                 ZFS_EXIT(zfsvfs);
3863                 return (EPERM);
3864         }
3865
3866         if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3867                 ZFS_EXIT(zfsvfs);
3868                 return (error);
3869         }
3870
3871         /*
3872          * Attempt to lock directory; fail if entry already exists.
3873          */
3874         error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
3875         if (error) {
3876                 ZFS_EXIT(zfsvfs);
3877                 return (error);
3878         }
3879
3880         tx = dmu_tx_create(zfsvfs->z_os);
3881         dmu_tx_hold_bonus(tx, szp->z_id);
3882         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3883         error = dmu_tx_assign(tx, TXG_NOWAIT);
3884         if (error) {
3885                 zfs_dirent_unlock(dl);
3886                 if (error == ERESTART) {
3887                         dmu_tx_wait(tx);
3888                         dmu_tx_abort(tx);
3889                         goto top;
3890                 }
3891                 dmu_tx_abort(tx);
3892                 ZFS_EXIT(zfsvfs);
3893                 return (error);
3894         }
3895
3896         error = zfs_link_create(dl, szp, tx, 0);
3897
3898         if (error == 0) {
3899                 uint64_t txtype = TX_LINK;
3900                 if (flags & FIGNORECASE)
3901                         txtype |= TX_CI;
3902                 zfs_log_link(zilog, tx, txtype, dzp, szp, name);
3903         }
3904
3905         dmu_tx_commit(tx);
3906
3907         zfs_dirent_unlock(dl);
3908
3909         if (error == 0) {
3910                 vnevent_link(svp, ct);
3911         }
3912
3913         ZFS_EXIT(zfsvfs);
3914         return (error);
3915 }
3916
3917 /*ARGSUSED*/
3918 void
3919 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
3920 {
3921         znode_t *zp = VTOZ(vp);
3922         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3923         int error;
3924
3925         rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
3926         if (zp->z_dbuf == NULL) {
3927                 /*
3928                  * The fs has been unmounted, or we did a
3929                  * suspend/resume and this file no longer exists.
3930                  */
3931                 VI_LOCK(vp);
3932                 vp->v_count = 0; /* count arrives as 1 */
3933                 VI_UNLOCK(vp);
3934                 vrecycle(vp, curthread);
3935                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
3936                 return;
3937         }
3938
3939         if (zp->z_atime_dirty && zp->z_unlinked == 0) {
3940                 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
3941
3942                 dmu_tx_hold_bonus(tx, zp->z_id);
3943                 error = dmu_tx_assign(tx, TXG_WAIT);
3944                 if (error) {
3945                         dmu_tx_abort(tx);
3946                 } else {
3947                         dmu_buf_will_dirty(zp->z_dbuf, tx);
3948                         mutex_enter(&zp->z_lock);
3949                         zp->z_atime_dirty = 0;
3950                         mutex_exit(&zp->z_lock);
3951                         dmu_tx_commit(tx);
3952                 }
3953         }
3954
3955         zfs_zinactive(zp);
3956         rw_exit(&zfsvfs->z_teardown_inactive_lock);
3957 }
3958
3959 CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
3960 CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
3961
3962 /*ARGSUSED*/
3963 static int
3964 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
3965 {
3966         znode_t         *zp = VTOZ(vp);
3967         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
3968         uint32_t        gen;
3969         uint64_t        object = zp->z_id;
3970         zfid_short_t    *zfid;
3971         int             size, i;
3972
3973         ZFS_ENTER(zfsvfs);
3974         ZFS_VERIFY_ZP(zp);
3975         gen = (uint32_t)zp->z_gen;
3976
3977         size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
3978         fidp->fid_len = size;
3979
3980         zfid = (zfid_short_t *)fidp;
3981
3982         zfid->zf_len = size;
3983
3984         for (i = 0; i < sizeof (zfid->zf_object); i++)
3985                 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
3986
3987         /* Must have a non-zero generation number to distinguish from .zfs */
3988         if (gen == 0)
3989                 gen = 1;
3990         for (i = 0; i < sizeof (zfid->zf_gen); i++)
3991                 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
3992
3993         if (size == LONG_FID_LEN) {
3994                 uint64_t        objsetid = dmu_objset_id(zfsvfs->z_os);
3995                 zfid_long_t     *zlfid;
3996
3997                 zlfid = (zfid_long_t *)fidp;
3998
3999                 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4000                         zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4001
4002                 /* XXX - this should be the generation number for the objset */
4003                 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4004                         zlfid->zf_setgen[i] = 0;
4005         }
4006
4007         ZFS_EXIT(zfsvfs);
4008         return (0);
4009 }
4010
4011 static int
4012 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4013     caller_context_t *ct)
4014 {
4015         znode_t         *zp, *xzp;
4016         zfsvfs_t        *zfsvfs;
4017         zfs_dirlock_t   *dl;
4018         int             error;
4019
4020         switch (cmd) {
4021         case _PC_LINK_MAX:
4022                 *valp = INT_MAX;
4023                 return (0);
4024
4025         case _PC_FILESIZEBITS:
4026                 *valp = 64;
4027                 return (0);
4028
4029 #if 0
4030         case _PC_XATTR_EXISTS:
4031                 zp = VTOZ(vp);
4032                 zfsvfs = zp->z_zfsvfs;
4033                 ZFS_ENTER(zfsvfs);
4034                 ZFS_VERIFY_ZP(zp);
4035                 *valp = 0;
4036                 error = zfs_dirent_lock(&dl, zp, "", &xzp,
4037                     ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
4038                 if (error == 0) {
4039                         zfs_dirent_unlock(dl);
4040                         if (!zfs_dirempty(xzp))
4041                                 *valp = 1;
4042                         VN_RELE(ZTOV(xzp));
4043                 } else if (error == ENOENT) {
4044                         /*
4045                          * If there aren't extended attributes, it's the
4046                          * same as having zero of them.
4047                          */
4048                         error = 0;
4049                 }
4050                 ZFS_EXIT(zfsvfs);
4051                 return (error);
4052 #endif
4053
4054         case _PC_ACL_EXTENDED:
4055                 *valp = 0;
4056                 return (0);
4057
4058         case _PC_ACL_NFS4:
4059                 *valp = 1;
4060                 return (0);
4061
4062         case _PC_ACL_PATH_MAX:
4063                 *valp = ACL_MAX_ENTRIES;
4064                 return (0);
4065
4066         case _PC_MIN_HOLE_SIZE:
4067                 *valp = (int)SPA_MINBLOCKSIZE;
4068                 return (0);
4069
4070         default:
4071                 return (EOPNOTSUPP);
4072         }
4073 }
4074
4075 /*ARGSUSED*/
4076 static int
4077 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4078     caller_context_t *ct)
4079 {
4080         znode_t *zp = VTOZ(vp);
4081         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4082         int error;
4083         boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4084
4085         ZFS_ENTER(zfsvfs);
4086         ZFS_VERIFY_ZP(zp);
4087         error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4088         ZFS_EXIT(zfsvfs);
4089
4090         return (error);
4091 }
4092
4093 /*ARGSUSED*/
4094 static int
4095 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4096     caller_context_t *ct)
4097 {
4098         znode_t *zp = VTOZ(vp);
4099         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4100         int error;
4101         boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4102
4103         ZFS_ENTER(zfsvfs);
4104         ZFS_VERIFY_ZP(zp);
4105         error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4106         ZFS_EXIT(zfsvfs);
4107         return (error);
4108 }
4109
4110 static int
4111 zfs_freebsd_open(ap)
4112         struct vop_open_args /* {
4113                 struct vnode *a_vp;
4114                 int a_mode;
4115                 struct ucred *a_cred;
4116                 struct thread *a_td;
4117         } */ *ap;
4118 {
4119         vnode_t *vp = ap->a_vp;
4120         znode_t *zp = VTOZ(vp);
4121         int error;
4122
4123         error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
4124         if (error == 0)
4125                 vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td);
4126         return (error);
4127 }
4128
4129 static int
4130 zfs_freebsd_close(ap)
4131         struct vop_close_args /* {
4132                 struct vnode *a_vp;
4133                 int  a_fflag;
4134                 struct ucred *a_cred;
4135                 struct thread *a_td;
4136         } */ *ap;
4137 {
4138
4139         return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL));
4140 }
4141
4142 static int
4143 zfs_freebsd_ioctl(ap)
4144         struct vop_ioctl_args /* {
4145                 struct vnode *a_vp;
4146                 u_long a_command;
4147                 caddr_t a_data;
4148                 int a_fflag;
4149                 struct ucred *cred;
4150                 struct thread *td;
4151         } */ *ap;
4152 {
4153
4154         return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4155             ap->a_fflag, ap->a_cred, NULL, NULL));
4156 }
4157
4158 static int
4159 zfs_freebsd_read(ap)
4160         struct vop_read_args /* {
4161                 struct vnode *a_vp;
4162                 struct uio *a_uio;
4163                 int a_ioflag;
4164                 struct ucred *a_cred;
4165         } */ *ap;
4166 {
4167
4168         return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
4169 }
4170
4171 static int
4172 zfs_freebsd_write(ap)
4173         struct vop_write_args /* {
4174                 struct vnode *a_vp;
4175                 struct uio *a_uio;
4176                 int a_ioflag;
4177                 struct ucred *a_cred;
4178         } */ *ap;
4179 {
4180
4181         return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
4182 }
4183
4184 static int
4185 zfs_freebsd_access(ap)
4186         struct vop_access_args /* {
4187                 struct vnode *a_vp;
4188                 accmode_t a_accmode;
4189                 struct ucred *a_cred;
4190                 struct thread *a_td;
4191         } */ *ap;
4192 {
4193         accmode_t accmode;
4194         int error = 0;
4195
4196         /*
4197          * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
4198          */
4199         accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
4200         if (accmode != 0)
4201                 error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
4202
4203         /*
4204          * VADMIN has to be handled by vaccess().
4205          */
4206         if (error == 0) {
4207                 accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
4208                 if (accmode != 0) {
4209                         vnode_t *vp = ap->a_vp;
4210                         znode_t *zp = VTOZ(vp);
4211                         znode_phys_t *zphys = zp->z_phys;
4212
4213                         error = vaccess(vp->v_type, zphys->zp_mode,
4214                             zphys->zp_uid, zphys->zp_gid, accmode, ap->a_cred,
4215                             NULL);
4216                 }
4217         }
4218
4219         return (error);
4220 }
4221
4222 static int
4223 zfs_freebsd_lookup(ap)
4224         struct vop_lookup_args /* {
4225                 struct vnode *a_dvp;
4226                 struct vnode **a_vpp;
4227                 struct componentname *a_cnp;
4228         } */ *ap;
4229 {
4230         struct componentname *cnp = ap->a_cnp;
4231         char nm[NAME_MAX + 1];
4232
4233         ASSERT(cnp->cn_namelen < sizeof(nm));
4234         strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
4235
4236         return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
4237             cnp->cn_cred, cnp->cn_thread, 0));
4238 }
4239
4240 static int
4241 zfs_freebsd_create(ap)
4242         struct vop_create_args /* {
4243                 struct vnode *a_dvp;
4244                 struct vnode **a_vpp;
4245                 struct componentname *a_cnp;
4246                 struct vattr *a_vap;
4247         } */ *ap;
4248 {
4249         struct componentname *cnp = ap->a_cnp;
4250         vattr_t *vap = ap->a_vap;
4251         int mode;
4252
4253         ASSERT(cnp->cn_flags & SAVENAME);
4254
4255         vattr_init_mask(vap);
4256         mode = vap->va_mode & ALLPERMS;
4257
4258         return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
4259             ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
4260 }
4261
4262 static int
4263 zfs_freebsd_remove(ap)
4264         struct vop_remove_args /* {
4265                 struct vnode *a_dvp;
4266                 struct vnode *a_vp;
4267                 struct componentname *a_cnp;
4268         } */ *ap;
4269 {
4270
4271         ASSERT(ap->a_cnp->cn_flags & SAVENAME);
4272
4273         return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
4274             ap->a_cnp->cn_cred, NULL, 0));
4275 }
4276
4277 static int
4278 zfs_freebsd_mkdir(ap)
4279         struct vop_mkdir_args /* {
4280                 struct vnode *a_dvp;
4281                 struct vnode **a_vpp;
4282                 struct componentname *a_cnp;
4283                 struct vattr *a_vap;
4284         } */ *ap;
4285 {
4286         vattr_t *vap = ap->a_vap;
4287
4288         ASSERT(ap->a_cnp->cn_flags & SAVENAME);
4289
4290         vattr_init_mask(vap);
4291
4292         return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
4293             ap->a_cnp->cn_cred, NULL, 0, NULL));
4294 }
4295
4296 static int
4297 zfs_freebsd_rmdir(ap)
4298         struct vop_rmdir_args /* {
4299                 struct vnode *a_dvp;
4300                 struct vnode *a_vp;
4301                 struct componentname *a_cnp;
4302         } */ *ap;
4303 {
4304         struct componentname *cnp = ap->a_cnp;
4305
4306         ASSERT(cnp->cn_flags & SAVENAME);
4307
4308         return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
4309 }
4310
4311 static int
4312 zfs_freebsd_readdir(ap)
4313         struct vop_readdir_args /* {
4314                 struct vnode *a_vp;
4315                 struct uio *a_uio;
4316                 struct ucred *a_cred;
4317                 int *a_eofflag;
4318                 int *a_ncookies;
4319                 u_long **a_cookies;
4320         } */ *ap;
4321 {
4322
4323         return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
4324             ap->a_ncookies, ap->a_cookies));
4325 }
4326
4327 static int
4328 zfs_freebsd_fsync(ap)
4329         struct vop_fsync_args /* {
4330                 struct vnode *a_vp;
4331                 int a_waitfor;
4332                 struct thread *a_td;
4333         } */ *ap;
4334 {
4335
4336         vop_stdfsync(ap);
4337         return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
4338 }
4339
4340 static int
4341 zfs_freebsd_getattr(ap)
4342         struct vop_getattr_args /* {
4343                 struct vnode *a_vp;
4344                 struct vattr *a_vap;
4345                 struct ucred *a_cred;
4346                 struct thread *a_td;
4347         } */ *ap;
4348 {
4349         vattr_t *vap = ap->a_vap;
4350         xvattr_t xvap;
4351         u_long fflags = 0;
4352         int error;
4353
4354         xva_init(&xvap);
4355         xvap.xva_vattr = *vap;
4356         xvap.xva_vattr.va_mask |= AT_XVATTR;
4357
4358         /* Convert chflags into ZFS-type flags. */
4359         /* XXX: what about SF_SETTABLE?. */
4360         XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
4361         XVA_SET_REQ(&xvap, XAT_APPENDONLY);
4362         XVA_SET_REQ(&xvap, XAT_NOUNLINK);
4363         XVA_SET_REQ(&xvap, XAT_NODUMP);
4364         error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
4365         if (error != 0)
4366                 return (error);
4367
4368         /* Convert ZFS xattr into chflags. */
4369 #define FLAG_CHECK(fflag, xflag, xfield)        do {                    \
4370         if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)             \
4371                 fflags |= (fflag);                                      \
4372 } while (0)
4373         FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
4374             xvap.xva_xoptattrs.xoa_immutable);
4375         FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
4376             xvap.xva_xoptattrs.xoa_appendonly);
4377         FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
4378             xvap.xva_xoptattrs.xoa_nounlink);
4379         FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
4380             xvap.xva_xoptattrs.xoa_nodump);
4381 #undef  FLAG_CHECK
4382         *vap = xvap.xva_vattr;
4383         vap->va_flags = fflags;
4384         return (0);
4385 }
4386
4387 static int
4388 zfs_freebsd_setattr(ap)
4389         struct vop_setattr_args /* {
4390                 struct vnode *a_vp;
4391                 struct vattr *a_vap;
4392                 struct ucred *a_cred;
4393                 struct thread *a_td;
4394         } */ *ap;
4395 {
4396         vnode_t *vp = ap->a_vp;
4397         vattr_t *vap = ap->a_vap;
4398         cred_t *cred = ap->a_cred;
4399         xvattr_t xvap;
4400         u_long fflags;
4401         uint64_t zflags;
4402
4403         vattr_init_mask(vap);
4404         vap->va_mask &= ~AT_NOSET;
4405
4406         xva_init(&xvap);
4407         xvap.xva_vattr = *vap;
4408
4409         zflags = VTOZ(vp)->z_phys->zp_flags;
4410
4411         if (vap->va_flags != VNOVAL) {
4412                 zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
4413                 int error;
4414
4415                 if (zfsvfs->z_use_fuids == B_FALSE)
4416                         return (EOPNOTSUPP);
4417
4418                 fflags = vap->va_flags;
4419                 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
4420                         return (EOPNOTSUPP);
4421                 /*
4422                  * Unprivileged processes are not permitted to unset system
4423                  * flags, or modify flags if any system flags are set.
4424                  * Privileged non-jail processes may not modify system flags
4425                  * if securelevel > 0 and any existing system flags are set.
4426                  * Privileged jail processes behave like privileged non-jail
4427                  * processes if the security.jail.chflags_allowed sysctl is
4428                  * is non-zero; otherwise, they behave like unprivileged
4429                  * processes.
4430                  */
4431                 if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
4432                     priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
4433                         if (zflags &
4434                             (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
4435                                 error = securelevel_gt(cred, 0);
4436                                 if (error != 0)
4437                                         return (error);
4438                         }
4439                 } else {
4440                         /*
4441                          * Callers may only modify the file flags on objects they
4442                          * have VADMIN rights for.
4443                          */
4444                         if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
4445                                 return (error);
4446                         if (zflags &
4447                             (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
4448                                 return (EPERM);
4449                         }
4450                         if (fflags &
4451                             (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
4452                                 return (EPERM);
4453                         }
4454                 }
4455
4456 #define FLAG_CHANGE(fflag, zflag, xflag, xfield)        do {            \
4457         if (((fflags & (fflag)) && !(zflags & (zflag))) ||              \
4458             ((zflags & (zflag)) && !(fflags & (fflag)))) {              \
4459                 XVA_SET_REQ(&xvap, (xflag));                            \
4460                 (xfield) = ((fflags & (fflag)) != 0);                   \
4461         }                                                               \
4462 } while (0)
4463                 /* Convert chflags into ZFS-type flags. */
4464                 /* XXX: what about SF_SETTABLE?. */
4465                 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
4466                     xvap.xva_xoptattrs.xoa_immutable);
4467                 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
4468                     xvap.xva_xoptattrs.xoa_appendonly);
4469                 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
4470                     xvap.xva_xoptattrs.xoa_nounlink);
4471                 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
4472                     xvap.xva_xoptattrs.xoa_nodump);
4473 #undef  FLAG_CHANGE
4474         }
4475         return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
4476 }
4477
4478 static int
4479 zfs_freebsd_rename(ap)
4480         struct vop_rename_args  /* {
4481                 struct vnode *a_fdvp;
4482                 struct vnode *a_fvp;
4483                 struct componentname *a_fcnp;
4484                 struct vnode *a_tdvp;
4485                 struct vnode *a_tvp;
4486                 struct componentname *a_tcnp;
4487         } */ *ap;
4488 {
4489         vnode_t *fdvp = ap->a_fdvp;
4490         vnode_t *fvp = ap->a_fvp;
4491         vnode_t *tdvp = ap->a_tdvp;
4492         vnode_t *tvp = ap->a_tvp;
4493         int error;
4494
4495         ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
4496         ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
4497
4498         error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
4499             ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
4500
4501         if (tdvp == tvp)
4502                 VN_RELE(tdvp);
4503         else
4504                 VN_URELE(tdvp);
4505         if (tvp)
4506                 VN_URELE(tvp);
4507         VN_RELE(fdvp);
4508         VN_RELE(fvp);
4509
4510         return (error);
4511 }
4512
4513 static int
4514 zfs_freebsd_symlink(ap)
4515         struct vop_symlink_args /* {
4516                 struct vnode *a_dvp;
4517                 struct vnode **a_vpp;
4518                 struct componentname *a_cnp;
4519                 struct vattr *a_vap;
4520                 char *a_target;
4521         } */ *ap;
4522 {
4523         struct componentname *cnp = ap->a_cnp;
4524         vattr_t *vap = ap->a_vap;
4525
4526         ASSERT(cnp->cn_flags & SAVENAME);
4527
4528         vap->va_type = VLNK;    /* FreeBSD: Syscall only sets va_mode. */
4529         vattr_init_mask(vap);
4530
4531         return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
4532             ap->a_target, cnp->cn_cred, cnp->cn_thread));
4533 }
4534
4535 static int
4536 zfs_freebsd_readlink(ap)
4537         struct vop_readlink_args /* {
4538                 struct vnode *a_vp;
4539                 struct uio *a_uio;
4540                 struct ucred *a_cred;
4541         } */ *ap;
4542 {
4543
4544         return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
4545 }
4546
4547 static int
4548 zfs_freebsd_link(ap)
4549         struct vop_link_args /* {
4550                 struct vnode *a_tdvp;
4551                 struct vnode *a_vp;
4552                 struct componentname *a_cnp;
4553         } */ *ap;
4554 {
4555         struct componentname *cnp = ap->a_cnp;
4556
4557         ASSERT(cnp->cn_flags & SAVENAME);
4558
4559         return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
4560 }
4561
4562 static int
4563 zfs_freebsd_inactive(ap)
4564         struct vop_inactive_args /* {
4565                 struct vnode *a_vp;
4566                 struct thread *a_td;
4567         } */ *ap;
4568 {
4569         vnode_t *vp = ap->a_vp;
4570
4571         zfs_inactive(vp, ap->a_td->td_ucred, NULL);
4572         return (0);
4573 }
4574
4575 static void
4576 zfs_reclaim_complete(void *arg, int pending)
4577 {
4578         znode_t *zp = arg;
4579         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4580
4581         rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4582         if (zp->z_dbuf != NULL) {
4583                 ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
4584                 zfs_znode_dmu_fini(zp);
4585                 ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
4586         }
4587         zfs_znode_free(zp);
4588         rw_exit(&zfsvfs->z_teardown_inactive_lock);
4589         /*
4590          * If the file system is being unmounted, there is a process waiting
4591          * for us, wake it up.
4592          */
4593         if (zfsvfs->z_unmounted)
4594                 wakeup_one(zfsvfs);
4595 }
4596
4597 static int
4598 zfs_freebsd_reclaim(ap)
4599         struct vop_reclaim_args /* {
4600                 struct vnode *a_vp;
4601                 struct thread *a_td;
4602         } */ *ap;
4603 {
4604         vnode_t *vp = ap->a_vp;
4605         znode_t *zp = VTOZ(vp);
4606         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4607
4608         rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4609
4610         ASSERT(zp != NULL);
4611
4612         /*
4613          * Destroy the vm object and flush associated pages.
4614          */
4615         vnode_destroy_vobject(vp);
4616
4617         mutex_enter(&zp->z_lock);
4618         ASSERT(zp->z_phys != NULL);
4619         zp->z_vnode = NULL;
4620         mutex_exit(&zp->z_lock);
4621
4622         if (zp->z_unlinked)
4623                 ;       /* Do nothing. */
4624         else if (zp->z_dbuf == NULL)
4625                 zfs_znode_free(zp);
4626         else /* if (!zp->z_unlinked && zp->z_dbuf != NULL) */ {
4627                 int locked;
4628
4629                 locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 :
4630                     ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id);
4631                 if (locked == 0) {
4632                         /*
4633                          * Lock can't be obtained due to deadlock possibility,
4634                          * so defer znode destruction.
4635                          */
4636                         TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp);
4637                         taskqueue_enqueue(taskqueue_thread, &zp->z_task);
4638                 } else {
4639                         zfs_znode_dmu_fini(zp);
4640                         if (locked == 1)
4641                                 ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
4642                         zfs_znode_free(zp);
4643                 }
4644         }
4645         VI_LOCK(vp);
4646         vp->v_data = NULL;
4647         ASSERT(vp->v_holdcnt >= 1);
4648         VI_UNLOCK(vp);
4649         rw_exit(&zfsvfs->z_teardown_inactive_lock);
4650         return (0);
4651 }
4652
4653 static int
4654 zfs_freebsd_fid(ap)
4655         struct vop_fid_args /* {
4656                 struct vnode *a_vp;
4657                 struct fid *a_fid;
4658         } */ *ap;
4659 {
4660
4661         return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
4662 }
4663
4664 static int
4665 zfs_freebsd_pathconf(ap)
4666         struct vop_pathconf_args /* {
4667                 struct vnode *a_vp;
4668                 int a_name;
4669                 register_t *a_retval;
4670         } */ *ap;
4671 {
4672         ulong_t val;
4673         int error;
4674
4675         error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
4676         if (error == 0)
4677                 *ap->a_retval = val;
4678         else if (error == EOPNOTSUPP)
4679                 error = vop_stdpathconf(ap);
4680         return (error);
4681 }
4682
4683 static int
4684 zfs_freebsd_fifo_pathconf(ap)
4685         struct vop_pathconf_args /* {
4686                 struct vnode *a_vp;
4687                 int a_name;
4688                 register_t *a_retval;
4689         } */ *ap;
4690 {
4691
4692         switch (ap->a_name) {
4693         case _PC_ACL_EXTENDED:
4694         case _PC_ACL_NFS4:
4695         case _PC_ACL_PATH_MAX:
4696         case _PC_MAC_PRESENT:
4697                 return (zfs_freebsd_pathconf(ap));
4698         default:
4699                 return (fifo_specops.vop_pathconf(ap));
4700         }
4701 }
4702
4703 /*
4704  * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
4705  * extended attribute name:
4706  *
4707  *      NAMESPACE       PREFIX
4708  *      system          freebsd:system:
4709  *      user            (none, can be used to access ZFS fsattr(5) attributes
4710  *                      created on Solaris)
4711  */
4712 static int
4713 zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
4714     size_t size)
4715 {
4716         const char *namespace, *prefix, *suffix;
4717
4718         /* We don't allow '/' character in attribute name. */
4719         if (strchr(name, '/') != NULL)
4720                 return (EINVAL);
4721         /* We don't allow attribute names that start with "freebsd:" string. */
4722         if (strncmp(name, "freebsd:", 8) == 0)
4723                 return (EINVAL);
4724
4725         bzero(attrname, size);
4726
4727         switch (attrnamespace) {
4728         case EXTATTR_NAMESPACE_USER:
4729 #if 0
4730                 prefix = "freebsd:";
4731                 namespace = EXTATTR_NAMESPACE_USER_STRING;
4732                 suffix = ":";
4733 #else
4734                 /*
4735                  * This is the default namespace by which we can access all
4736                  * attributes created on Solaris.
4737                  */
4738                 prefix = namespace = suffix = "";
4739 #endif
4740                 break;
4741         case EXTATTR_NAMESPACE_SYSTEM:
4742                 prefix = "freebsd:";
4743                 namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
4744                 suffix = ":";
4745                 break;
4746         case EXTATTR_NAMESPACE_EMPTY:
4747         default:
4748                 return (EINVAL);
4749         }
4750         if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
4751             name) >= size) {
4752                 return (ENAMETOOLONG);
4753         }
4754         return (0);
4755 }
4756
4757 /*
4758  * Vnode operating to retrieve a named extended attribute.
4759  */
4760 static int
4761 zfs_getextattr(struct vop_getextattr_args *ap)
4762 /*
4763 vop_getextattr {
4764         IN struct vnode *a_vp;
4765         IN int a_attrnamespace;
4766         IN const char *a_name;
4767         INOUT struct uio *a_uio;
4768         OUT size_t *a_size;
4769         IN struct ucred *a_cred;
4770         IN struct thread *a_td;
4771 };
4772 */
4773 {
4774         zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
4775         struct thread *td = ap->a_td;
4776         struct nameidata nd;
4777         char attrname[255];
4778         struct vattr va;
4779         vnode_t *xvp = NULL, *vp;
4780         int error, flags;
4781
4782         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
4783             ap->a_cred, ap->a_td, VREAD);
4784         if (error != 0)
4785                 return (error);
4786
4787         error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
4788             sizeof(attrname));
4789         if (error != 0)
4790                 return (error);
4791
4792         ZFS_ENTER(zfsvfs);
4793
4794         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
4795             LOOKUP_XATTR);
4796         if (error != 0) {
4797                 ZFS_EXIT(zfsvfs);
4798                 return (error);
4799         }
4800
4801         flags = FREAD;
4802         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
4803             xvp, td);
4804         error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
4805         vp = nd.ni_vp;
4806         NDFREE(&nd, NDF_ONLY_PNBUF);
4807         if (error != 0) {
4808                 ZFS_EXIT(zfsvfs);
4809                 if (error == ENOENT)
4810                         error = ENOATTR;
4811                 return (error);
4812         }
4813
4814         if (ap->a_size != NULL) {
4815                 error = VOP_GETATTR(vp, &va, ap->a_cred);
4816                 if (error == 0)
4817                         *ap->a_size = (size_t)va.va_size;
4818         } else if (ap->a_uio != NULL)
4819                 error = VOP_READ(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);
4820
4821         VOP_UNLOCK(vp, 0);
4822         vn_close(vp, flags, ap->a_cred, td);
4823         ZFS_EXIT(zfsvfs);
4824
4825         return (error);
4826 }
4827
4828 /*
4829  * Vnode operation to remove a named attribute.
4830  */
4831 int
4832 zfs_deleteextattr(struct vop_deleteextattr_args *ap)
4833 /*
4834 vop_deleteextattr {
4835         IN struct vnode *a_vp;
4836         IN int a_attrnamespace;
4837         IN const char *a_name;
4838         IN struct ucred *a_cred;
4839         IN struct thread *a_td;
4840 };
4841 */
4842 {
4843         zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
4844         struct thread *td = ap->a_td;
4845         struct nameidata nd;
4846         char attrname[255];
4847         struct vattr va;
4848         vnode_t *xvp = NULL, *vp;
4849         int error, flags;
4850
4851         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
4852             ap->a_cred, ap->a_td, VWRITE);
4853         if (error != 0)
4854                 return (error);
4855
4856         error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
4857             sizeof(attrname));
4858         if (error != 0)
4859                 return (error);
4860
4861         ZFS_ENTER(zfsvfs);
4862
4863         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
4864             LOOKUP_XATTR);
4865         if (error != 0) {
4866                 ZFS_EXIT(zfsvfs);
4867                 return (error);
4868         }
4869
4870         NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF | MPSAFE,
4871             UIO_SYSSPACE, attrname, xvp, td);
4872         error = namei(&nd);
4873         vp = nd.ni_vp;
4874         NDFREE(&nd, NDF_ONLY_PNBUF);
4875         if (error != 0) {
4876                 ZFS_EXIT(zfsvfs);
4877                 if (error == ENOENT)
4878                         error = ENOATTR;
4879                 return (error);
4880         }
4881         error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
4882
4883         vput(nd.ni_dvp);
4884         if (vp == nd.ni_dvp)
4885                 vrele(vp);
4886         else
4887                 vput(vp);
4888         ZFS_EXIT(zfsvfs);
4889
4890         return (error);
4891 }
4892
4893 /*
4894  * Vnode operation to set a named attribute.
4895  */
4896 static int
4897 zfs_setextattr(struct vop_setextattr_args *ap)
4898 /*
4899 vop_setextattr {
4900         IN struct vnode *a_vp;
4901         IN int a_attrnamespace;
4902         IN const char *a_name;
4903         INOUT struct uio *a_uio;
4904         IN struct ucred *a_cred;
4905         IN struct thread *a_td;
4906 };
4907 */
4908 {
4909         zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
4910         struct thread *td = ap->a_td;
4911         struct nameidata nd;
4912         char attrname[255];
4913         struct vattr va;
4914         vnode_t *xvp = NULL, *vp;
4915         int error, flags;
4916
4917         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
4918             ap->a_cred, ap->a_td, VWRITE);
4919         if (error != 0)
4920                 return (error);
4921
4922         error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
4923             sizeof(attrname));
4924         if (error != 0)
4925                 return (error);
4926
4927         ZFS_ENTER(zfsvfs);
4928
4929         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
4930             LOOKUP_XATTR | CREATE_XATTR_DIR);
4931         if (error != 0) {
4932                 ZFS_EXIT(zfsvfs);
4933                 return (error);
4934         }
4935
4936         flags = FFLAGS(O_WRONLY | O_CREAT);
4937         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
4938             xvp, td);
4939         error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
4940         vp = nd.ni_vp;
4941         NDFREE(&nd, NDF_ONLY_PNBUF);
4942         if (error != 0) {
4943                 ZFS_EXIT(zfsvfs);
4944                 return (error);
4945         }
4946
4947         VATTR_NULL(&va);
4948         va.va_size = 0;
4949         error = VOP_SETATTR(vp, &va, ap->a_cred);
4950         if (error == 0)
4951                 VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);
4952
4953         VOP_UNLOCK(vp, 0);
4954         vn_close(vp, flags, ap->a_cred, td);
4955         ZFS_EXIT(zfsvfs);
4956
4957         return (error);
4958 }
4959
4960 /*
4961  * Vnode operation to retrieve extended attributes on a vnode.
4962  */
4963 static int
4964 zfs_listextattr(struct vop_listextattr_args *ap)
4965 /*
4966 vop_listextattr {
4967         IN struct vnode *a_vp;
4968         IN int a_attrnamespace;
4969         INOUT struct uio *a_uio;
4970         OUT size_t *a_size;
4971         IN struct ucred *a_cred;
4972         IN struct thread *a_td;
4973 };
4974 */
4975 {
4976         zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
4977         struct thread *td = ap->a_td;
4978         struct nameidata nd;
4979         char attrprefix[16];
4980         u_char dirbuf[sizeof(struct dirent)];
4981         struct dirent *dp;
4982         struct iovec aiov;
4983         struct uio auio, *uio = ap->a_uio;
4984         size_t *sizep = ap->a_size;
4985         size_t plen;
4986         vnode_t *xvp = NULL, *vp;
4987         int done, error, eof, pos;
4988
4989         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
4990             ap->a_cred, ap->a_td, VREAD);
4991         if (error != 0)
4992                 return (error);
4993
4994         error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
4995             sizeof(attrprefix));
4996         if (error != 0)
4997                 return (error);
4998         plen = strlen(attrprefix);
4999
5000         ZFS_ENTER(zfsvfs);
5001
5002         if (sizep != NULL)
5003                 *sizep = 0;
5004
5005         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5006             LOOKUP_XATTR);
5007         if (error != 0) {
5008                 ZFS_EXIT(zfsvfs);
5009                 /*
5010                  * ENOATTR means that the EA directory does not yet exist,
5011                  * i.e. there are no extended attributes there.
5012                  */
5013                 if (error == ENOATTR)
5014                         error = 0;
5015                 return (error);
5016         }
5017
5018         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE,
5019             UIO_SYSSPACE, ".", xvp, td);
5020         error = namei(&nd);
5021         vp = nd.ni_vp;
5022         NDFREE(&nd, NDF_ONLY_PNBUF);
5023         if (error != 0) {
5024                 ZFS_EXIT(zfsvfs);
5025                 return (error);
5026         }
5027
5028         auio.uio_iov = &aiov;
5029         auio.uio_iovcnt = 1;
5030         auio.uio_segflg = UIO_SYSSPACE;
5031         auio.uio_td = td;
5032         auio.uio_rw = UIO_READ;
5033         auio.uio_offset = 0;
5034
5035         do {
5036                 u_char nlen;
5037
5038                 aiov.iov_base = (void *)dirbuf;
5039                 aiov.iov_len = sizeof(dirbuf);
5040                 auio.uio_resid = sizeof(dirbuf);
5041                 error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
5042                 done = sizeof(dirbuf) - auio.uio_resid;
5043                 if (error != 0)
5044                         break;
5045                 for (pos = 0; pos < done;) {
5046                         dp = (struct dirent *)(dirbuf + pos);
5047                         pos += dp->d_reclen;
5048                         /*
5049                          * XXX: Temporarily we also accept DT_UNKNOWN, as this
5050                          * is what we get when attribute was created on Solaris.
5051                          */
5052                         if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
5053                                 continue;
5054                         if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
5055                                 continue;
5056                         else if (strncmp(dp->d_name, attrprefix, plen) != 0)
5057                                 continue;
5058                         nlen = dp->d_namlen - plen;
5059                         if (sizep != NULL)
5060                                 *sizep += 1 + nlen;
5061                         else if (uio != NULL) {
5062                                 /*
5063                                  * Format of extattr name entry is one byte for
5064                                  * length and the rest for name.
5065                                  */
5066                                 error = uiomove(&nlen, 1, uio->uio_rw, uio);
5067                                 if (error == 0) {
5068                                         error = uiomove(dp->d_name + plen, nlen,
5069                                             uio->uio_rw, uio);
5070                                 }
5071                                 if (error != 0)
5072                                         break;
5073                         }
5074                 }
5075         } while (!eof && error == 0);
5076
5077         vput(vp);
5078         ZFS_EXIT(zfsvfs);
5079
5080         return (error);
5081 }
5082
5083 int
5084 zfs_freebsd_getacl(ap)
5085         struct vop_getacl_args /* {
5086                 struct vnode *vp;
5087                 acl_type_t type;
5088                 struct acl *aclp;
5089                 struct ucred *cred;
5090                 struct thread *td;
5091         } */ *ap;
5092 {
5093         int             error;
5094         vsecattr_t      vsecattr;
5095
5096         if (ap->a_type != ACL_TYPE_NFS4)
5097                 return (EINVAL);
5098
5099         vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
5100         if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
5101                 return (error);
5102
5103         error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
5104         if (vsecattr.vsa_aclentp != NULL)
5105                 kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
5106
5107         return (error);
5108 }
5109
5110 int
5111 zfs_freebsd_setacl(ap)
5112         struct vop_setacl_args /* {
5113                 struct vnode *vp;
5114                 acl_type_t type;
5115                 struct acl *aclp;
5116                 struct ucred *cred;
5117                 struct thread *td;
5118         } */ *ap;
5119 {
5120         int             error;
5121         vsecattr_t      vsecattr;
5122         int             aclbsize;       /* size of acl list in bytes */
5123         aclent_t        *aaclp;
5124
5125         if (ap->a_type != ACL_TYPE_NFS4)
5126                 return (EINVAL);
5127
5128         if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
5129                 return (EINVAL);
5130
5131         /*
5132          * With NFSv4 ACLs, chmod(2) may need to add additional entries,
5133          * splitting every entry into two and appending "canonical six"
5134          * entries at the end.  Don't allow for setting an ACL that would
5135          * cause chmod(2) to run out of ACL entries.
5136          */
5137         if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
5138                 return (ENOSPC);
5139
5140         error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
5141         if (error != 0)
5142                 return (error);
5143
5144         vsecattr.vsa_mask = VSA_ACE;
5145         aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
5146         vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
5147         aaclp = vsecattr.vsa_aclentp;
5148         vsecattr.vsa_aclentsz = aclbsize;
5149
5150         aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
5151         error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
5152         kmem_free(aaclp, aclbsize);
5153
5154         return (error);
5155 }
5156
/*
 * FreeBSD VOP_ACLCHECK entry point.
 *
 * The argument structure is not examined; every request is answered with
 * EOPNOTSUPP.
 */
int
zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
{

        return (EOPNOTSUPP);
}
5170
5171 struct vop_vector zfs_vnodeops;
5172 struct vop_vector zfs_fifoops;
5173 struct vop_vector zfs_shareops;
5174
5175 struct vop_vector zfs_vnodeops = {
5176         .vop_default =          &default_vnodeops,
5177         .vop_inactive =         zfs_freebsd_inactive,
5178         .vop_reclaim =          zfs_freebsd_reclaim,
5179         .vop_access =           zfs_freebsd_access,
5180 #ifdef FREEBSD_NAMECACHE
5181         .vop_lookup =           vfs_cache_lookup,
5182         .vop_cachedlookup =     zfs_freebsd_lookup,
5183 #else
5184         .vop_lookup =           zfs_freebsd_lookup,
5185 #endif
5186         .vop_getattr =          zfs_freebsd_getattr,
5187         .vop_setattr =          zfs_freebsd_setattr,
5188         .vop_create =           zfs_freebsd_create,
5189         .vop_mknod =            zfs_freebsd_create,
5190         .vop_mkdir =            zfs_freebsd_mkdir,
5191         .vop_readdir =          zfs_freebsd_readdir,
5192         .vop_fsync =            zfs_freebsd_fsync,
5193         .vop_open =             zfs_freebsd_open,
5194         .vop_close =            zfs_freebsd_close,
5195         .vop_rmdir =            zfs_freebsd_rmdir,
5196         .vop_ioctl =            zfs_freebsd_ioctl,
5197         .vop_link =             zfs_freebsd_link,
5198         .vop_symlink =          zfs_freebsd_symlink,
5199         .vop_readlink =         zfs_freebsd_readlink,
5200         .vop_read =             zfs_freebsd_read,
5201         .vop_write =            zfs_freebsd_write,
5202         .vop_remove =           zfs_freebsd_remove,
5203         .vop_rename =           zfs_freebsd_rename,
5204         .vop_pathconf =         zfs_freebsd_pathconf,
5205         .vop_bmap =             VOP_EOPNOTSUPP,
5206         .vop_fid =              zfs_freebsd_fid,
5207         .vop_getextattr =       zfs_getextattr,
5208         .vop_deleteextattr =    zfs_deleteextattr,
5209         .vop_setextattr =       zfs_setextattr,
5210         .vop_listextattr =      zfs_listextattr,
5211         .vop_getacl =           zfs_freebsd_getacl,
5212         .vop_setacl =           zfs_freebsd_setacl,
5213         .vop_aclcheck =         zfs_freebsd_aclcheck,
5214 };
5215
5216 struct vop_vector zfs_fifoops = {
5217         .vop_default =          &fifo_specops,
5218         .vop_fsync =            zfs_freebsd_fsync,
5219         .vop_access =           zfs_freebsd_access,
5220         .vop_getattr =          zfs_freebsd_getattr,
5221         .vop_inactive =         zfs_freebsd_inactive,
5222         .vop_read =             VOP_PANIC,
5223         .vop_reclaim =          zfs_freebsd_reclaim,
5224         .vop_setattr =          zfs_freebsd_setattr,
5225         .vop_write =            VOP_PANIC,
5226         .vop_pathconf =         zfs_freebsd_fifo_pathconf,
5227         .vop_fid =              zfs_freebsd_fid,
5228         .vop_getacl =           zfs_freebsd_getacl,
5229         .vop_setacl =           zfs_freebsd_setacl,
5230         .vop_aclcheck =         zfs_freebsd_aclcheck,
5231 };
5232
5233 /*
5234  * special share hidden files vnode operations template
5235  */
5236 struct vop_vector zfs_shareops = {
5237         .vop_default =          &default_vnodeops,
5238         .vop_access =           zfs_freebsd_access,
5239         .vop_inactive =         zfs_freebsd_inactive,
5240         .vop_reclaim =          zfs_freebsd_reclaim,
5241         .vop_fid =              zfs_freebsd_fid,
5242         .vop_pathconf =         zfs_freebsd_pathconf,
5243 };