module/os/linux/zfs/zfs_vnops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  25  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  26  * Copyright 2017 Nexenta Systems, Inc.
  27  */
  28
  29 /* Portions Copyright 2007 Jeremy Teo */
  30 /* Portions Copyright 2010 Robert Milkowski */
  31
  32
  33 #include <sys/types.h>
  34 #include <sys/param.h>
  35 #include <sys/time.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/vfs.h>
  38 #include <sys/file.h>
  39 #include <sys/stat.h>
  40 #include <sys/kmem.h>
  41 #include <sys/taskq.h>
  42 #include <sys/uio.h>
  43 #include <sys/vmsystm.h>
  44 #include <sys/atomic.h>
  45 #include <sys/pathname.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/errno.h>
  48 #include <sys/zfs_dir.h>
  49 #include <sys/zfs_acl.h>
  50 #include <sys/zfs_ioctl.h>
  51 #include <sys/fs/zfs.h>
  52 #include <sys/dmu.h>
  53 #include <sys/dmu_objset.h>
  54 #include <sys/spa.h>
  55 #include <sys/txg.h>
  56 #include <sys/dbuf.h>
  57 #include <sys/zap.h>
  58 #include <sys/sa.h>
  59 #include <sys/policy.h>
  60 #include <sys/sunddi.h>
  61 #include <sys/sid.h>
  62 #include <sys/zfs_ctldir.h>
  63 #include <sys/zfs_fuid.h>
  64 #include <sys/zfs_quota.h>
  65 #include <sys/zfs_sa.h>
  66 #include <sys/zfs_vnops.h>
  67 #include <sys/zfs_rlock.h>
  68 #include <sys/cred.h>
  69 #include <sys/zpl.h>
  70 #include <sys/zil.h>
  71 #include <sys/sa_impl.h>
  72
  73 /*
  74  * Programming rules.
  75  *
  76  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  77  * properly lock its in-core state, create a DMU transaction, do the work,
  78  * record this work in the intent log (ZIL), commit the DMU transaction,
  79  * and wait for the intent log to commit if it is a synchronous operation.
  80  * Moreover, the vnode ops must work in both normal and log replay context.
  81  * The ordering of events is important to avoid deadlocks and references
  82  * to freed memory.  The example below illustrates the following Big Rules:
  83  *
  84  *  (1) A check must be made in each zfs thread for a mounted file system.
  85  *      This is done avoiding races using ZFS_ENTER(zfsvfs).
  86  *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
  87  *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
  88  *      can return EIO from the calling function.
  89  *
  90  *  (2) zrele() should always be the last thing except for zil_commit()
  91  *      (if necessary) and ZFS_EXIT(). This is for 3 reasons:
  92  *      First, if it's the last reference, the vnode/znode
  93  *      can be freed, so the zp may point to freed memory.  Second, the last
  94  *      reference will call zfs_zinactive(), which may induce a lot of work --
  95  *      pushing cached pages (which acquires range locks) and syncing out
  96  *      cached atime changes.  Third, zfs_zinactive() may require a new tx,
  97  *      which could deadlock the system if you were already holding one.
  98  *      If you must call zrele() within a tx then use zfs_zrele_async().
  99  *
 100  *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 101  *      as they can span dmu_tx_assign() calls.
 102  *
 103  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 104  *      dmu_tx_assign().  This is critical because we don't want to block
 105  *      while holding locks.
 106  *
 107  *      If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
 108  *      reduces lock contention and CPU usage when we must wait (note that if
 109  *      throughput is constrained by the storage, nearly every transaction
 110  *      must wait).
 111  *
 112  *      Note, in particular, that if a lock is sometimes acquired before
 113  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
 114  *      to use a non-blocking assign can deadlock the system.  The scenario:
 115  *
 116  *      Thread A has grabbed a lock before calling dmu_tx_assign().
 117  *      Thread B is in an already-assigned tx, and blocks for this lock.
 118  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 119  *      forever, because the previous txg can't quiesce until B's tx commits.
 120  *
 121  *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 122  *      then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 123  *      calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
 124  *      to indicate that this operation has already called dmu_tx_wait().
 125  *      This will ensure that we don't retry forever, waiting a short bit
 126  *      each time.
 127  *
 128  *  (5) If the operation succeeded, generate the intent log entry for it
 129  *      before dropping locks.  This ensures that the ordering of events
 130  *      in the intent log matches the order in which they actually occurred.
 131  *      During ZIL replay the zfs_log_* functions will update the sequence
 132  *      number to indicate the zil transaction has replayed.
 133  *
 134  *  (6) At the end of each vnode op, the DMU tx must always commit,
 135  *      regardless of whether there were any errors.
 136  *
 137  *  (7) After dropping all locks, invoke zil_commit(zilog, foid)
 138  *      to ensure that synchronous semantics are provided when necessary.
 139  *
 140  * In general, this is how things should be ordered in each vnode op:
 141  *
 142  *      ZFS_ENTER(zfsvfs);              // exit if unmounted
 143  * top:
 144  *      zfs_dirent_lock(&dl, ...)       // lock directory entry (may igrab())
 145  *      rw_enter(...);                  // grab any other locks you need
 146  *      tx = dmu_tx_create(...);        // get DMU tx
 147  *      dmu_tx_hold_*();                // hold each object you might modify
 148  *      error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 149  *      if (error) {
 150  *              rw_exit(...);           // drop locks
 151  *              zfs_dirent_unlock(dl);  // unlock directory entry
 152  *              zrele(...);             // release held znodes
 153  *              if (error == ERESTART) {
 154  *                      waited = B_TRUE;
 155  *                      dmu_tx_wait(tx);
 156  *                      dmu_tx_abort(tx);
 157  *                      goto top;
 158  *              }
 159  *              dmu_tx_abort(tx);       // abort DMU tx
 160  *              ZFS_EXIT(zfsvfs);       // finished in zfs
 161  *              return (error);         // really out of space
 162  *      }
 163  *      error = do_real_work();         // do whatever this VOP does
 164  *      if (error == 0)
 165  *              zfs_log_*(...);         // on success, make ZIL entry
 166  *      dmu_tx_commit(tx);              // commit DMU tx -- error or not
 167  *      rw_exit(...);                   // drop locks
 168  *      zfs_dirent_unlock(dl);          // unlock directory entry
 169  *      zrele(...);                     // release held znodes
 170  *      zil_commit(zilog, foid);        // synchronous when necessary
 171  *      ZFS_EXIT(zfsvfs);               // finished in zfs
 172  *      return (error);                 // done, report error
 173  */
 174
 175 /*
 176  * Virus scanning is unsupported.  It would be possible to add a hook
 177  * here to perform the required virus scan.  This could be done
 178  * entirely in the kernel or potentially as an update to invoke a
 179  * scanning utility.  Until then this stub always returns 0.
 180  */
 181 static int
 182 zfs_vscan(struct inode *ip, cred_t *cr, int async)
 183 {
 184         return (0);     /* ip, cr and async are ignored by this stub */
 185 }
 186
 187 /*
 188  * Open-time checks for a ZFS inode.  A write open of an append-only file
 189  * without O_APPEND fails with EPERM, a failed virus scan of an eligible
 190  * file fails with EACCES, and O_SYNC opens are counted in the znode.
 191  */
 192 /* ARGSUSED */
 193 int
 194 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
 195 {
 196         zfsvfs_t *zfsvfs = ITOZSB(ip);
 197         znode_t *zp = ITOZ(ip);
 198         int error = 0;
 199
 200         ZFS_ENTER(zfsvfs);
 201         ZFS_VERIFY_ZP(zp);
 202
 203         if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
 204             !(flag & O_APPEND)) {
 205                 /* Append-only files accept writers only with O_APPEND. */
 206                 error = SET_ERROR(EPERM);
 207         } else if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan &&
 208             S_ISREG(ip->i_mode) && !(zp->z_pflags & ZFS_AV_QUARANTINED) &&
 209             zp->z_size > 0 && zfs_vscan(ip, cr, 0) != 0) {
 210                 /* The open-time virus scan rejected the file. */
 211                 error = SET_ERROR(EACCES);
 212         } else if (flag & O_SYNC) {
 213                 /* Keep a count of the synchronous opens in the znode. */
 214                 atomic_inc_32(&zp->z_sync_cnt);
 215         }
 216
 217         ZFS_EXIT(zfsvfs);
 218         return (error);
 219 }
 220
 221 /* ARGSUSED */
 222 int
 223 zfs_close(struct inode *ip, int flag, cred_t *cr)
 224 {
 225         zfsvfs_t *zfsvfs = ITOZSB(ip);
 226         znode_t *zp = ITOZ(ip);
 227
 228         ZFS_ENTER(zfsvfs);
 229         ZFS_VERIFY_ZP(zp);
 230
 231         /* Give back the O_SYNC open count, then run the close-time scan. */
 232         if ((flag & O_SYNC) != 0)
 233                 atomic_dec_32(&zp->z_sync_cnt);
 234         if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan) {
 235                 if (S_ISREG(ip->i_mode) &&
 236                     !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
 237                         VERIFY(zfs_vscan(ip, cr, 1) == 0);
 238         }
 239         ZFS_EXIT(zfsvfs);
 240         return (0);
 241 }
 242
 243 #if defined(SEEK_HOLE) && defined(SEEK_DATA)
 244 /*
 245  * Lseek worker shared by SEEK_HOLE and SEEK_DATA.
 246  *
 247  *      IN:     ip      - inode of the file to search.
 248  *              cmd     - SEEK_HOLE finds a hole; anything else finds data.
 249  *      IN/OUT: off     - offset to search from; may be updated in place.
 250  *
 251  *      RETURN: 0 on success; ENXIO if *off is at/after EOF or the DMU
 252  *              lookup returns ESRCH; otherwise the DMU lookup's error.
 253  */
 254 static int
 255 zfs_holey_common(struct inode *ip, int cmd, loff_t *off)
 256 {
 257         znode_t *zp = ITOZ(ip);
 258         const uint64_t file_sz = zp->z_size;
 259         const boolean_t hole = (cmd == SEEK_HOLE) ? B_TRUE : B_FALSE;
 260         uint64_t noff = (uint64_t)*off;
 261         int error;
 262
 263         if (noff >= file_sz)
 264                 return (SET_ERROR(ENXIO));
 265
 266         error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
 267         switch (error) {
 268         case ESRCH:
 269                 return (SET_ERROR(ENXIO));
 270         case EBUSY:
 271                 /*
 272                  * The file is dirty: report EOF for a hole search and
 273                  * leave *off alone for a data search.
 274                  */
 275                 if (hole)
 276                         *off = file_sz;
 277                 return (0);
 278         default:
 279                 break;
 280         }
 281
 282         /*
 283          * dmu_offset_next() only works on whole blocks, so it can report a
 284          * hole past the logical EOF when EOF falls mid-block.  Pull that
 285          * "virtual hole" back to the logical EOF.
 286          */
 287         if (noff > file_sz) {
 288                 ASSERT(hole);
 289                 noff = file_sz;
 290         }
 291
 292         /* Never move the caller's offset backward. */
 293         if (noff >= *off)
 294                 *off = noff;
 295         return (error);
 296 }
 297
 298 int
 299 zfs_holey(struct inode *ip, int cmd, loff_t *off)
 300 {
 301         zfsvfs_t *zfsvfs = ITOZSB(ip);
 302         znode_t *zp = ITOZ(ip);
 303
 304         /* SEEK_HOLE/SEEK_DATA entry point; these macros may return EIO. */
 305         ZFS_ENTER(zfsvfs);
 306         ZFS_VERIFY_ZP(zp);
 307
 308         int error = zfs_holey_common(ip, cmd, off);
 309
 310         ZFS_EXIT(zfsvfs);
 311         return (error);
 312 }
 313 #endif /* SEEK_HOLE && SEEK_DATA */
 314
 315 #if defined(_KERNEL)
 316 /*
 317  * When a file is memory mapped, we must keep the IO data synchronized
 318  * between the DMU cache and the memory mapped pages.  On write, for each
 319  * page of [start, start + len) present in ip's page cache, re-read those
 320  * bytes from DMU object "oid" in "os" into the page and mark it up to date.
 321  * Uncached pages are skipped and dmu_read() failures are ignored.
 322  */
 323 static void
 324 update_pages(struct inode *ip, int64_t start, int len,
 325     objset_t *os, uint64_t oid)
 326 {
 327         struct address_space *mp = ip->i_mapping;
 328         struct page *pp;
 329         uint64_t nbytes;
 330         int64_t off;
 331         void *pb;
 332         /* off: byte offset inside the first page; start: page-aligned base */
 333         off = start & (PAGE_SIZE-1);
 334         for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 335                 nbytes = MIN(PAGE_SIZE - off, len);
 336                 /* Look up (and lock) the cached page, if there is one. */
 337                 pp = find_lock_page(mp, start >> PAGE_SHIFT);
 338                 if (pp) {
 339                         if (mapping_writably_mapped(mp))
 340                                 flush_dcache_page(pp);
 341                         /* Map the page and refresh this span from the DMU. */
 342                         pb = kmap(pp);
 343                         (void) dmu_read(os, oid, start+off, nbytes, pb+off,
 344                             DMU_READ_PREFETCH);
 345                         kunmap(pp);
 346                         /* Flush again now that the page was written. */
 347                         if (mapping_writably_mapped(mp))
 348                                 flush_dcache_page(pp);
 349                         /* Mark the page valid, then unlock and release it. */
 350                         mark_page_accessed(pp);
 351                         SetPageUptodate(pp);
 352                         ClearPageError(pp);
 353                         unlock_page(pp);
 354                         put_page(pp);
 355                 }
 356                 /* Only the first page can start mid-page. */
 357                 len -= nbytes;
 358                 off = 0;
 359         }
 360 }
 361
 362 /*
 363  * When a file is memory mapped, we must keep the IO data synchronized
 364  * between the DMU cache and the memory mapped pages.  What this means:
 365  *
 366  * On Read:     We "read" preferentially from memory mapped pages,
 367  *              otherwise we fall back to the dmu buffer.
 368  *
 369  * NOTE: The IO is always broken up into page-sized (or smaller) moves.
 370  *       Returns 0, or the first error from uiomove()/dmu_read_uio_dbuf().
 371  */
 372 static int
 373 mappedread(struct inode *ip, int nbytes, uio_t *uio)
 374 {
 375         struct address_space *mp = ip->i_mapping;
 376         struct page *pp;
 377         znode_t *zp = ITOZ(ip);
 378         int64_t start, off;
 379         uint64_t bytes;
 380         int len = nbytes;
 381         int error = 0;
 382         void *pb;
 383         /* Work page by page from the uio's current file offset. */
 384         start = uio->uio_loffset;
 385         off = start & (PAGE_SIZE-1);
 386         for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 387                 bytes = MIN(PAGE_SIZE - off, len);
 388                 /* Prefer a cached page; otherwise read the span via the DMU. */
 389                 pp = find_lock_page(mp, start >> PAGE_SHIFT);
 390                 if (pp) {
 391                         ASSERT(PageUptodate(pp));
 392                         unlock_page(pp);
 393                         /* Copy out after unlocking; pp is released below. */
 394                         pb = kmap(pp);
 395                         error = uiomove(pb + off, bytes, UIO_READ, uio);
 396                         kunmap(pp);
 397
 398                         if (mapping_writably_mapped(mp))
 399                                 flush_dcache_page(pp);
 400
 401                         mark_page_accessed(pp);
 402                         put_page(pp);
 403                 } else {
 404                         error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 405                             uio, bytes);
 406                 }
 407                 /* Advance even on error, then stop at the first failure. */
 408                 len -= bytes;
 409                 off = 0;
 410                 if (error)
 411                         break;
 412         }
 413         return (error);
 414 }
 415 #endif /* _KERNEL */
 416 /* zfs_read() moves at most one aligned chunk of this many bytes per pass. */
 417 unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */
 418 unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
 419
 420 /*
 421  * Read bytes from specified file into supplied buffer.
 422  *
 423  *      IN:     ip      - inode of file to be read from.
 424  *              uio     - structure supplying read location, range info,
 425  *                        and return buffer.
 426  *              ioflag  - O_SYNC flags; used to provide FRSYNC semantics.
 427  *                        O_DIRECT flag; used to bypass page cache.
 428  *              cr      - credentials of caller.
 429  *
 430  *      OUT:    uio     - updated offset and range, buffer filled.
 431  *
 432  *      RETURN: 0 on success (including a read at or past EOF), error code
 433  *              on failure; ECKSUM from the DMU is reported as EIO.
 434  * Side Effects:
 435  *      inode - atime updated if byte count > 0
 436  */
 437 /* ARGSUSED */
 438 int
 439 zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 440 {
 441         int error = 0;
 442         boolean_t frsync = B_FALSE;
 443
 444         znode_t *zp = ITOZ(ip);
 445         zfsvfs_t *zfsvfs = ITOZSB(ip);
 446         ZFS_ENTER(zfsvfs);
 447         ZFS_VERIFY_ZP(zp);
 448         /* Quarantined files may not be read. */
 449         if (zp->z_pflags & ZFS_AV_QUARANTINED) {
 450                 ZFS_EXIT(zfsvfs);
 451                 return (SET_ERROR(EACCES));
 452         }
 453
 454         /*
 455          * Validate file offset
 456          */
 457         if (uio->uio_loffset < (offset_t)0) {
 458                 ZFS_EXIT(zfsvfs);
 459                 return (SET_ERROR(EINVAL));
 460         }
 461
 462         /*
 463          * Fasttrack empty reads
 464          */
 465         if (uio->uio_resid == 0) {
 466                 ZFS_EXIT(zfsvfs);
 467                 return (0);
 468         }
 469
 470 #ifdef FRSYNC
 471         /*
 472          * If we're in FRSYNC mode, sync out this znode before reading it.
 473          * Only do this for non-snapshots.
 474          *
 475          * Some platforms do not support FRSYNC and instead map it
 476          * to O_SYNC, which results in unnecessary calls to zil_commit. We
 477          * only honor FRSYNC requests on platforms which support it.
 478          */
 479         frsync = !!(ioflag & FRSYNC);
 480 #endif
 481         if (zfsvfs->z_log &&
 482             (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
 483                 zil_commit(zfsvfs->z_log, zp->z_id);
 484
 485         /*
 486          * Lock the range against changes.
 487          */
 488         zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
 489             uio->uio_loffset, uio->uio_resid, RL_READER);
 490
 491         /*
 492          * If we are reading past end-of-file we can skip
 493          * to the end; but we might still need to set atime.
 494          */
 495         if (uio->uio_loffset >= zp->z_size) {
 496                 error = 0;
 497                 goto out;
 498         }
 499         /* Clamp the transfer to the bytes that exist before EOF. */
 500         ASSERT(uio->uio_loffset < zp->z_size);
 501         ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
 502         ssize_t start_resid = n;
 503
 504 #ifdef HAVE_UIO_ZEROCOPY
 505         xuio_t *xuio = NULL;
 506         if ((uio->uio_extflg == UIO_XUIO) &&
 507             (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
 508                 int nblk;
 509                 int blksz = zp->z_blksz;
 510                 uint64_t offset = uio->uio_loffset;
 511
 512                 xuio = (xuio_t *)uio;
 513                 if ((ISP2(blksz))) {
 514                         nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
 515                             blksz)) / blksz;
 516                 } else {
 517                         ASSERT(offset + n <= blksz);
 518                         nblk = 1;
 519                 }
 520                 (void) dmu_xuio_init(xuio, nblk);
 521
 522                 if (vn_has_cached_data(ip)) {
 523                         /*
 524                          * For simplicity, we always allocate a full buffer
 525                          * even if we only expect to read a portion of a block.
 526                          */
 527                         while (--nblk >= 0) {
 528                                 (void) dmu_xuio_add(xuio,
 529                                     dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 530                                     blksz), 0, blksz);
 531                         }
 532                 }
 533         }
 534 #endif /* HAVE_UIO_ZEROCOPY */
 535         /* Copy out in pieces that end on zfs_read_chunk_size bounds. */
 536         while (n > 0) {
 537                 ssize_t nbytes = MIN(n, zfs_read_chunk_size -
 538                     P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 539                 /* Mapped files are read via mappedread(), unless O_DIRECT. */
 540                 if (zp->z_is_mapped && !(ioflag & O_DIRECT)) {
 541                         error = mappedread(ip, nbytes, uio);
 542                 } else {
 543                         error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 544                             uio, nbytes);
 545                 }
 546
 547                 if (error) {
 548                         /* convert checksum errors into IO errors */
 549                         if (error == ECKSUM)
 550                                 error = SET_ERROR(EIO);
 551                         break;
 552                 }
 553
 554                 n -= nbytes;
 555         }
 556         /* Feed the byte count to the dataset kstats and task accounting. */
 557         int64_t nread = start_resid - n;
 558         dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
 559         task_io_account_read(nread);
 560 out:
 561         zfs_rangelock_exit(lr);
 562
 563         ZFS_EXIT(zfsvfs);
 564         return (error);
 565 }
 566
 567 /*
 568  * Write the bytes to a file.
 569  *
 570  *      IN:     ip      - inode of file to be written to.
 571  *              uio     - structure supplying write location, range info,
 572  *                        and data buffer.
 573  *              ioflag  - O_APPEND flag set if in append mode.
 574  *                        O_DIRECT flag; used to bypass page cache.
 575  *              cr      - credentials of caller.
 576  *
 577  *      OUT:    uio     - updated offset and range.
 578  *
 579  *      RETURN: 0 if success
 580  *              error code if failure
 581  *
 582  * Timestamps:
 583  *      ip - ctime|mtime updated if byte count > 0
 584  */
 585
 586 /* ARGSUSED */
 587 int
 588 zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 589 {
 590         int error = 0;
 591         ssize_t start_resid = uio->uio_resid;
 592
 593         /*
 594          * Fasttrack empty write
 595          */
 596         ssize_t n = start_resid;
 597         if (n == 0)
 598                 return (0);
 599
 600         rlim64_t limit = uio->uio_limit;
 601         if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 602                 limit = MAXOFFSET_T;
 603
 604         znode_t *zp = ITOZ(ip);
 605         zfsvfs_t *zfsvfs = ZTOZSB(zp);
 606         ZFS_ENTER(zfsvfs);
 607         ZFS_VERIFY_ZP(zp);
 608
 609         sa_bulk_attr_t bulk[4];
 610         int count = 0;
 611         uint64_t mtime[2], ctime[2];
 612         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 613         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 614         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
 615             &zp->z_size, 8);
 616         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 617             &zp->z_pflags, 8);
 618
 619         /*
 620          * Callers might not be able to detect properly that we are read-only,
 621          * so check it explicitly here.
 622          */
 623         if (zfs_is_readonly(zfsvfs)) {
 624                 ZFS_EXIT(zfsvfs);
 625                 return (SET_ERROR(EROFS));
 626         }
 627
 628         /*
 629          * If immutable or not appending then return EPERM
 630          */
 631         if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
 632             ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) &&
 633             (uio->uio_loffset < zp->z_size))) {
 634                 ZFS_EXIT(zfsvfs);
 635                 return (SET_ERROR(EPERM));
 636         }
 637
 638         /*
 639          * Validate file offset
 640          */
 641         offset_t woff = ioflag & O_APPEND ? zp->z_size : uio->uio_loffset;
 642         if (woff < 0) {
 643                 ZFS_EXIT(zfsvfs);
 644                 return (SET_ERROR(EINVAL));
 645         }
 646
 647         int max_blksz = zfsvfs->z_max_blksz;
 648         xuio_t *xuio = NULL;
 649
 650         /*
 651          * Pre-fault the pages to ensure slow (eg NFS) pages
 652          * don't hold up txg.
 653          * Skip this if uio contains loaned arc_buf.
 654          */
 655 #ifdef HAVE_UIO_ZEROCOPY
 656         if ((uio->uio_extflg == UIO_XUIO) &&
 657             (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
 658                 xuio = (xuio_t *)uio;
 659         else
 660 #endif
 661                 if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
 662                         ZFS_EXIT(zfsvfs);
 663                         return (SET_ERROR(EFAULT));
 664                 }
 665
 666         /*
 667          * If in append mode, set the io offset pointer to eof.
 668          */
 669         zfs_locked_range_t *lr;
 670         if (ioflag & O_APPEND) {
 671                 /*
 672                  * Obtain an appending range lock to guarantee file append
 673                  * semantics.  We reset the write offset once we have the lock.
 674                  */
 675                 lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
 676                 woff = lr->lr_offset;
 677                 if (lr->lr_length == UINT64_MAX) {
 678                         /*
 679                          * We overlocked the file because this write will cause
 680                          * the file block size to increase.
 681                          * Note that zp_size cannot change with this lock held.
 682                          */
 683                         woff = zp->z_size;
 684                 }
 685                 uio->uio_loffset = woff;
 686         } else {
 687                 /*
 688                  * Note that if the file block size will change as a result of
 689                  * this write, then this range lock will lock the entire file
 690                  * so that we can re-write the block safely.
 691                  */
 692                 lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
 693         }
 694
 695         if (woff >= limit) {
 696                 zfs_rangelock_exit(lr);
 697                 ZFS_EXIT(zfsvfs);
 698                 return (SET_ERROR(EFBIG));
 699         }
 700
 701         if ((woff + n) > limit || woff > (limit - n))
 702                 n = limit - woff;
 703
 704         /* Will this write extend the file length? */
 705         int write_eof = (woff + n > zp->z_size);
 706
 707         uint64_t end_size = MAX(zp->z_size, woff + n);
 708         zilog_t *zilog = zfsvfs->z_log;
 709 #ifdef HAVE_UIO_ZEROCOPY
 710         int i_iov = 0;
 711         const iovec_t *iovp = uio->uio_iov;
 712         int iovcnt __maybe_unused = uio->uio_iovcnt;
 713 #endif
 714
 715
 716         /*
 717          * Write the file in reasonable size chunks.  Each chunk is written
 718          * in a separate transaction; this keeps the intent log records small
 719          * and allows us to do more fine-grained space accounting.
 720          */
 721         while (n > 0) {
 722                 woff = uio->uio_loffset;
 723
 724                 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
 725                     KUID_TO_SUID(ip->i_uid)) ||
 726                     zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
 727                     KGID_TO_SGID(ip->i_gid)) ||
 728                     (zp->z_projid != ZFS_DEFAULT_PROJID &&
 729                     zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
 730                     zp->z_projid))) {
 731                         error = SET_ERROR(EDQUOT);
 732                         break;
 733                 }
 734
 735                 arc_buf_t *abuf = NULL;
 736                 const iovec_t *aiov = NULL;
 737                 if (xuio) {
 738 #ifdef HAVE_UIO_ZEROCOPY
 739                         ASSERT(i_iov < iovcnt);
 740                         ASSERT3U(uio->uio_segflg, !=, UIO_BVEC);
 741                         aiov = &iovp[i_iov];
 742                         abuf = dmu_xuio_arcbuf(xuio, i_iov);
 743                         dmu_xuio_clear(xuio, i_iov);
 744                         ASSERT((aiov->iov_base == abuf->b_data) ||
 745                             ((char *)aiov->iov_base - (char *)abuf->b_data +
 746                             aiov->iov_len == arc_buf_size(abuf)));
 747                         i_iov++;
 748 #endif
 749                 } else if (n >= max_blksz && woff >= zp->z_size &&
 750                     P2PHASE(woff, max_blksz) == 0 &&
 751                     zp->z_blksz == max_blksz) {
 752                         /*
 753                          * This write covers a full block.  "Borrow" a buffer
 754                          * from the dmu so that we can fill it before we enter
 755                          * a transaction.  This avoids the possibility of
 756                          * holding up the transaction if the data copy hangs
 757                          * up on a pagefault (e.g., from an NFS server mapping).
 758                          */
 759                         size_t cbytes;
 760
 761                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 762                             max_blksz);
 763                         ASSERT(abuf != NULL);
 764                         ASSERT(arc_buf_size(abuf) == max_blksz);
 765                         if ((error = uiocopy(abuf->b_data, max_blksz,
 766                             UIO_WRITE, uio, &cbytes))) {
 767                                 dmu_return_arcbuf(abuf);
 768                                 break;
 769                         }
 770                         ASSERT(cbytes == max_blksz);
 771                 }
 772
 773                 /*
 774                  * Start a transaction.
 775                  */
 776                 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 777                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 778                 dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
 779                 DB_DNODE_ENTER(db);
 780                 dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
 781                     MIN(n, max_blksz));
 782                 DB_DNODE_EXIT(db);
 783                 zfs_sa_upgrade_txholds(tx, zp);
 784                 error = dmu_tx_assign(tx, TXG_WAIT);
 785                 if (error) {
 786                         dmu_tx_abort(tx);
 787                         if (abuf != NULL)
 788                                 dmu_return_arcbuf(abuf);
 789                         break;
 790                 }
 791
 792                 /*
 793                  * If rangelock_enter() over-locked we grow the blocksize
 794                  * and then reduce the lock range.  This will only happen
 795                  * on the first iteration since rangelock_reduce() will
 796                  * shrink down lr_length to the appropriate size.
 797                  */
 798                 if (lr->lr_length == UINT64_MAX) {
 799                         uint64_t new_blksz;
 800
 801                         if (zp->z_blksz > max_blksz) {
 802                                 /*
 803                                  * File's blocksize is already larger than the
 804                                  * "recordsize" property.  Only let it grow to
 805                                  * the next power of 2.
 806                                  */
 807                                 ASSERT(!ISP2(zp->z_blksz));
 808                                 new_blksz = MIN(end_size,
 809                                     1 << highbit64(zp->z_blksz));
 810                         } else {
 811                                 new_blksz = MIN(end_size, max_blksz);
 812                         }
 813                         zfs_grow_blocksize(zp, new_blksz, tx);
 814                         zfs_rangelock_reduce(lr, woff, n);
 815                 }
 816
 817                 /*
 818                  * XXX - should we really limit each write to z_max_blksz?
 819                  * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
 820                  */
 821                 ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
 822
 823                 ssize_t tx_bytes;
 824                 if (abuf == NULL) {
 825                         tx_bytes = uio->uio_resid;
 826                         uio->uio_fault_disable = B_TRUE;
 827                         error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 828                             uio, nbytes, tx);
 829                         uio->uio_fault_disable = B_FALSE;
 830                         if (error == EFAULT) {
 831                                 dmu_tx_commit(tx);
 832                                 /*
 833                                  * Account for partial writes before
 834                                  * continuing the loop.
 835                                  * Update needs to occur before the next
 836                                  * uio_prefaultpages, or prefaultpages may
 837                                  * error, and we may break the loop early.
 838                                  */
 839                                 if (tx_bytes != uio->uio_resid)
 840                                         n -= tx_bytes - uio->uio_resid;
 841                                 if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
 842                                         break;
 843                                 }
 844                                 continue;
 845                         } else if (error != 0) {
 846                                 dmu_tx_commit(tx);
 847                                 break;
 848                         }
 849                         tx_bytes -= uio->uio_resid;
 850                 } else {
 851                         tx_bytes = nbytes;
 852                         ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
 853                         /*
 854                          * If this is not a full block write, but we are
 855                          * extending the file past EOF and this data starts
 856                          * block-aligned, use assign_arcbuf().  Otherwise,
 857                          * write via dmu_write().
 858                          */
 859                         if (tx_bytes < max_blksz && (!write_eof ||
 860                             aiov->iov_base != abuf->b_data)) {
 861                                 ASSERT(xuio);
 862                                 dmu_write(zfsvfs->z_os, zp->z_id, woff,
 863                                     /* cppcheck-suppress nullPointer */
 864                                     aiov->iov_len, aiov->iov_base, tx);
 865                                 dmu_return_arcbuf(abuf);
 866                                 xuio_stat_wbuf_copied();
 867                         } else {
 868                                 ASSERT(xuio || tx_bytes == max_blksz);
 869                                 error = dmu_assign_arcbuf_by_dbuf(
 870                                     sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
 871                                 if (error != 0) {
 872                                         dmu_return_arcbuf(abuf);
 873                                         dmu_tx_commit(tx);
 874                                         break;
 875                                 }
 876                         }
 877                         ASSERT(tx_bytes <= uio->uio_resid);
 878                         uioskip(uio, tx_bytes);
 879                 }
 880                 if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) {
 881                         update_pages(ip, woff,
 882                             tx_bytes, zfsvfs->z_os, zp->z_id);
 883                 }
 884
 885                 /*
 886                  * If we made no progress, we're done.  If we made even
 887                  * partial progress, update the znode and ZIL accordingly.
 888                  */
 889                 if (tx_bytes == 0) {
 890                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 891                             (void *)&zp->z_size, sizeof (uint64_t), tx);
 892                         dmu_tx_commit(tx);
 893                         ASSERT(error != 0);
 894                         break;
 895                 }
 896
 897                 /*
 898                  * Clear Set-UID/Set-GID bits on successful write if not
 899                  * privileged and at least one of the execute bits is set.
 900                  *
 901                  * It would be nice to do this after all writes have
 902                  * been done, but that would still expose the ISUID/ISGID
 903                  * to another app after the partial write is committed.
 904                  *
 905                  * Note: we don't call zfs_fuid_map_id() here because
 906                  * user 0 is not an ephemeral uid.
 907                  */
 908                 mutex_enter(&zp->z_acl_lock);
 909                 uint32_t uid = KUID_TO_SUID(ip->i_uid);
 910                 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
 911                     (S_IXUSR >> 6))) != 0 &&
 912                     (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
 913                     secpolicy_vnode_setid_retain(cr,
 914                     ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
 915                         uint64_t newmode;
 916                         zp->z_mode &= ~(S_ISUID | S_ISGID);
 917                         ip->i_mode = newmode = zp->z_mode;
 918                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
 919                             (void *)&newmode, sizeof (uint64_t), tx);
 920                 }
 921                 mutex_exit(&zp->z_acl_lock);
 922
 923                 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
 924
 925                 /*
 926                  * Update the file size (zp_size) if it has changed;
 927                  * account for possible concurrent updates.
 928                  */
 929                 while ((end_size = zp->z_size) < uio->uio_loffset) {
 930                         (void) atomic_cas_64(&zp->z_size, end_size,
 931                             uio->uio_loffset);
 932                         ASSERT(error == 0);
 933                 }
 934                 /*
 935                  * If we are replaying and eof is non zero then force
 936                  * the file size to the specified eof. Note, there's no
 937                  * concurrency during replay.
 938                  */
 939                 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
 940                         zp->z_size = zfsvfs->z_replay_eof;
 941
 942                 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 943
 944                 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
 945                     NULL, NULL);
 946                 dmu_tx_commit(tx);
 947
 948                 if (error != 0)
 949                         break;
 950                 ASSERT(tx_bytes == nbytes);
 951                 n -= nbytes;
 952
 953                 if (!xuio && n > 0) {
 954                         if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
 955                                 error = EFAULT;
 956                                 break;
 957                         }
 958                 }
 959         }
 960
 961         zfs_inode_update(zp);
 962         zfs_rangelock_exit(lr);
 963
 964         /*
 965          * If we're in replay mode, or we made no progress, return error.
 966          * Otherwise, it's at least a partial write, so it's successful.
 967          */
 968         if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
 969                 ZFS_EXIT(zfsvfs);
 970                 return (error);
 971         }
 972
 973         if (ioflag & (O_SYNC | O_DSYNC) ||
 974             zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 975                 zil_commit(zilog, zp->z_id);
 976
 977         int64_t nwritten = start_resid - uio->uio_resid;
 978         dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
 979         task_io_account_write(nwritten);
 980
 981         ZFS_EXIT(zfsvfs);
 982         return (0);
 983 }
 984
 985 /*
 986  * Write the bytes to a file.
 987  *
 988  *      IN:     zp      - znode of file to be written to
 989  *              data    - bytes to write
 990  *              len     - number of bytes to write
 991  *              pos     - offset to start writing at
 992  *
 993  *      OUT:    resid   - remaining bytes to write
 994  *
 995  *      RETURN: 0 if success
 996  *              positive error code if failure
 997  *
 998  * Timestamps:
 999  *      zp - ctime|mtime updated if byte count > 0
1000  */
1001 int
1002 zfs_write_simple(znode_t *zp, const void *data, size_t len,
1003     loff_t pos, size_t *resid)
1004 {
1005         ssize_t written;
1006         int error = 0;
1007
1008         written = zpl_write_common(ZTOI(zp), data, len, &pos,
1009             UIO_SYSSPACE, 0, kcred);
1010         if (written < 0) {
1011                 error = -written;
1012         } else if (resid == NULL) {
1013                 if (written < len)
1014                         error = SET_ERROR(EIO); /* short write */
1015         } else {
1016                 *resid = len - written;
1017         }
1018         return (error);
1019 }
1020
1021 /*
1022  * Drop a reference on the passed inode asynchronously. This ensures
1023  * that the caller will never drop the last reference on an inode in
1024  * the current context. Doing so while holding open a tx could result
1025  * in a deadlock if iput_final() re-enters the filesystem code.
1026  */
1027 void
1028 zfs_zrele_async(znode_t *zp)
1029 {
1030         struct inode *ip = ZTOI(zp);
1031         objset_t *os = ITOZSB(ip)->z_os;
1032
1033         ASSERT(atomic_read(&ip->i_count) > 0);
1034         ASSERT(os != NULL);
1035
1036         if (atomic_read(&ip->i_count) == 1)
1037                 VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
1038                     (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID);
1039         else
1040                 zrele(zp);
1041 }
1042
1043 /* ARGSUSED */
1044 static void
1045 zfs_get_done(zgd_t *zgd, int error)
1046 {
1047         znode_t *zp = zgd->zgd_private;
1048
1049         if (zgd->zgd_db)
1050                 dmu_buf_rele(zgd->zgd_db, zgd);
1051
1052         zfs_rangelock_exit(zgd->zgd_lr);
1053
1054         /*
1055          * Release the vnode asynchronously as we currently have the
1056          * txg stopped from syncing.
1057          */
1058         zfs_zrele_async(zp);
1059
1060         kmem_free(zgd, sizeof (zgd_t));
1061 }
1062
1063 #ifdef ZFS_DEBUG
1064 static int zil_fault_io = 0;
1065 #endif
1066
1067 /*
1068  * Get data to generate a TX_WRITE intent log record.
1069  */
1070 int
1071 zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
1072 {
1073         zfsvfs_t *zfsvfs = arg;
1074         objset_t *os = zfsvfs->z_os;
1075         znode_t *zp;
1076         uint64_t object = lr->lr_foid;
1077         uint64_t offset = lr->lr_offset;
1078         uint64_t size = lr->lr_length;
1079         dmu_buf_t *db;
1080         zgd_t *zgd;
1081         int error = 0;
1082
1083         ASSERT3P(lwb, !=, NULL);
1084         ASSERT3P(zio, !=, NULL);
1085         ASSERT3U(size, !=, 0);
1086
1087         /*
1088          * Nothing to do if the file has been removed
1089          */
1090         if (zfs_zget(zfsvfs, object, &zp) != 0)
1091                 return (SET_ERROR(ENOENT));
1092         if (zp->z_unlinked) {
1093                 /*
1094                  * Release the vnode asynchronously as we currently have the
1095                  * txg stopped from syncing.
1096                  */
1097                 zfs_zrele_async(zp);
1098                 return (SET_ERROR(ENOENT));
1099         }
1100
1101         zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1102         zgd->zgd_lwb = lwb;
1103         zgd->zgd_private = zp;
1104
1105         /*
1106          * Write records come in two flavors: immediate and indirect.
1107          * For small writes it's cheaper to store the data with the
1108          * log record (immediate); for large writes it's cheaper to
1109          * sync the data and get a pointer to it (indirect) so that
1110          * we don't have to write the data twice.
1111          */
1112         if (buf != NULL) { /* immediate write */
1113                 zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
1114                     offset, size, RL_READER);
1115                 /* test for truncation needs to be done while range locked */
1116                 if (offset >= zp->z_size) {
1117                         error = SET_ERROR(ENOENT);
1118                 } else {
1119                         error = dmu_read(os, object, offset, size, buf,
1120                             DMU_READ_NO_PREFETCH);
1121                 }
1122                 ASSERT(error == 0 || error == ENOENT);
1123         } else { /* indirect write */
1124                 /*
1125                  * Have to lock the whole block to ensure when it's
1126                  * written out and its checksum is being calculated
1127                  * that no one can change the data. We need to re-check
1128                  * blocksize after we get the lock in case it's changed!
1129                  */
1130                 for (;;) {
1131                         uint64_t blkoff;
1132                         size = zp->z_blksz;
1133                         blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1134                         offset -= blkoff;
1135                         zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
1136                             offset, size, RL_READER);
1137                         if (zp->z_blksz == size)
1138                                 break;
1139                         offset += blkoff;
1140                         zfs_rangelock_exit(zgd->zgd_lr);
1141                 }
1142                 /* test for truncation needs to be done while range locked */
1143                 if (lr->lr_offset >= zp->z_size)
1144                         error = SET_ERROR(ENOENT);
1145 #ifdef ZFS_DEBUG
1146                 if (zil_fault_io) {
1147                         error = SET_ERROR(EIO);
1148                         zil_fault_io = 0;
1149                 }
1150 #endif
1151                 if (error == 0)
1152                         error = dmu_buf_hold(os, object, offset, zgd, &db,
1153                             DMU_READ_NO_PREFETCH);
1154
1155                 if (error == 0) {
1156                         blkptr_t *bp = &lr->lr_blkptr;
1157
1158                         zgd->zgd_db = db;
1159                         zgd->zgd_bp = bp;
1160
1161                         ASSERT(db->db_offset == offset);
1162                         ASSERT(db->db_size == size);
1163
1164                         error = dmu_sync(zio, lr->lr_common.lrc_txg,
1165                             zfs_get_done, zgd);
1166                         ASSERT(error || lr->lr_length <= size);
1167
1168                         /*
1169                          * On success, we need to wait for the write I/O
1170                          * initiated by dmu_sync() to complete before we can
1171                          * release this dbuf.  We will finish everything up
1172                          * in the zfs_get_done() callback.
1173                          */
1174                         if (error == 0)
1175                                 return (0);
1176
1177                         if (error == EALREADY) {
1178                                 lr->lr_common.lrc_txtype = TX_WRITE2;
1179                                 /*
1180                                  * TX_WRITE2 relies on the data previously
1181                                  * written by the TX_WRITE that caused
1182                                  * EALREADY.  We zero out the BP because
1183                                  * it is the old, currently-on-disk BP.
1184                                  */
1185                                 zgd->zgd_bp = NULL;
1186                                 BP_ZERO(bp);
1187                                 error = 0;
1188                         }
1189                 }
1190         }
1191
1192         zfs_get_done(zgd, error);
1193
1194         return (error);
1195 }
1196
1197 /*ARGSUSED*/
1198 int
1199 zfs_access(struct inode *ip, int mode, int flag, cred_t *cr)
1200 {
1201         znode_t *zp = ITOZ(ip);
1202         zfsvfs_t *zfsvfs = ITOZSB(ip);
1203         int error;
1204
1205         ZFS_ENTER(zfsvfs);
1206         ZFS_VERIFY_ZP(zp);
1207
1208         if (flag & V_ACE_MASK)
1209                 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1210         else
1211                 error = zfs_zaccess_rwx(zp, mode, flag, cr);
1212
1213         ZFS_EXIT(zfsvfs);
1214         return (error);
1215 }
1216
1217 /*
1218  * Lookup an entry in a directory, or an extended attribute directory.
1219  * If it exists, return a held inode reference for it.
1220  *
1221  *      IN:     zdp     - znode of directory to search.
1222  *              nm      - name of entry to lookup.
1223  *              flags   - LOOKUP_XATTR set if looking for an attribute.
1224  *              cr      - credentials of caller.
1225  *              direntflags - directory lookup flags
1226  *              realpnp - returned pathname.
1227  *
1228  *      OUT:    zpp     - znode of located entry, NULL if not found.
1229  *
1230  *      RETURN: 0 on success, error code on failure.
1231  *
1232  * Timestamps:
1233  *      NA
1234  */
1235 /* ARGSUSED */
1236 int
1237 zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
1238     int *direntflags, pathname_t *realpnp)
1239 {
1240         zfsvfs_t *zfsvfs = ZTOZSB(zdp);
1241         int error = 0;
1242
1243         /*
1244          * Fast path lookup, however we must skip DNLC lookup
1245          * for case folding or normalizing lookups because the
1246          * DNLC code only stores the passed in name.  This means
1247          * creating 'a' and removing 'A' on a case insensitive
1248          * file system would work, but DNLC still thinks 'a'
1249          * exists and won't let you create it again on the next
1250          * pass through fast path.
1251          */
1252         if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1253
1254                 if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
1255                         return (SET_ERROR(ENOTDIR));
1256                 } else if (zdp->z_sa_hdl == NULL) {
1257                         return (SET_ERROR(EIO));
1258                 }
1259
1260                 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1261                         error = zfs_fastaccesschk_execute(zdp, cr);
1262                         if (!error) {
1263                                 *zpp = zdp;
1264                                 zhold(*zpp);
1265                                 return (0);
1266                         }
1267                         return (error);
1268                 }
1269         }
1270
1271         ZFS_ENTER(zfsvfs);
1272         ZFS_VERIFY_ZP(zdp);
1273
1274         *zpp = NULL;
1275
1276         if (flags & LOOKUP_XATTR) {
1277                 /*
1278                  * We don't allow recursive attributes..
1279                  * Maybe someday we will.
1280                  */
1281                 if (zdp->z_pflags & ZFS_XATTR) {
1282                         ZFS_EXIT(zfsvfs);
1283                         return (SET_ERROR(EINVAL));
1284                 }
1285
1286                 if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
1287                         ZFS_EXIT(zfsvfs);
1288                         return (error);
1289                 }
1290
1291                 /*
1292                  * Do we have permission to get into attribute directory?
1293                  */
1294
1295                 if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
1296                     B_FALSE, cr))) {
1297                         zrele(*zpp);
1298                         *zpp = NULL;
1299                 }
1300
1301                 ZFS_EXIT(zfsvfs);
1302                 return (error);
1303         }
1304
1305         if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
1306                 ZFS_EXIT(zfsvfs);
1307                 return (SET_ERROR(ENOTDIR));
1308         }
1309
1310         /*
1311          * Check accessibility of directory.
1312          */
1313
1314         if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
1315                 ZFS_EXIT(zfsvfs);
1316                 return (error);
1317         }
1318
1319         if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1320             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1321                 ZFS_EXIT(zfsvfs);
1322                 return (SET_ERROR(EILSEQ));
1323         }
1324
1325         error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
1326         if ((error == 0) && (*zpp))
1327                 zfs_inode_update(*zpp);
1328
1329         ZFS_EXIT(zfsvfs);
1330         return (error);
1331 }
1332
1333 /*
1334  * Attempt to create a new entry in a directory.  If the entry
1335  * already exists, truncate the file if permissible, else return
1336  * an error.  Return the ip of the created or trunc'd file.
1337  *
1338  *      IN:     dzp     - znode of directory to put new file entry in.
1339  *              name    - name of new file entry.
1340  *              vap     - attributes of new file.
1341  *              excl    - flag indicating exclusive or non-exclusive mode.
1342  *              mode    - mode to open file with.
1343  *              cr      - credentials of caller.
1344  *              flag    - file flag.
1345  *              vsecp   - ACL to be set
1346  *
1347  *      OUT:    zpp     - znode of created or trunc'd entry.
1348  *
1349  *      RETURN: 0 on success, error code on failure.
1350  *
1351  * Timestamps:
1352  *      dzp - ctime|mtime updated if new entry created
1353  *       zp - ctime|mtime always, atime if new
1354  */
1355
1356 /* ARGSUSED */
1357 int
1358 zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
1359     int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp)
1360 {
1361         znode_t         *zp;
1362         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1363         zilog_t         *zilog;
1364         objset_t        *os;
1365         zfs_dirlock_t   *dl;
1366         dmu_tx_t        *tx;
1367         int             error;
1368         uid_t           uid;
1369         gid_t           gid;
1370         zfs_acl_ids_t   acl_ids;
1371         boolean_t       fuid_dirtied;
1372         boolean_t       have_acl = B_FALSE;
1373         boolean_t       waited = B_FALSE;
1374
1375         /*
1376          * If we have an ephemeral id, ACL, or XVATTR then
1377          * make sure file system is at proper version
1378          */
1379
1380         gid = crgetgid(cr);
1381         uid = crgetuid(cr);
1382
1383         if (zfsvfs->z_use_fuids == B_FALSE &&
1384             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1385                 return (SET_ERROR(EINVAL));
1386
1387         if (name == NULL)
1388                 return (SET_ERROR(EINVAL));
1389
1390         ZFS_ENTER(zfsvfs);
1391         ZFS_VERIFY_ZP(dzp);
1392         os = zfsvfs->z_os;
1393         zilog = zfsvfs->z_log;
1394
1395         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1396             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1397                 ZFS_EXIT(zfsvfs);
1398                 return (SET_ERROR(EILSEQ));
1399         }
1400
1401         if (vap->va_mask & ATTR_XVATTR) {
1402                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1403                     crgetuid(cr), cr, vap->va_mode)) != 0) {
1404                         ZFS_EXIT(zfsvfs);
1405                         return (error);
1406                 }
1407         }
1408
1409 top:
1410         *zpp = NULL;
1411         if (*name == '\0') {
1412                 /*
1413                  * Null component name refers to the directory itself.
1414                  */
1415                 zhold(dzp);
1416                 zp = dzp;
1417                 dl = NULL;
1418                 error = 0;
1419         } else {
1420                 /* possible igrab(zp) */
1421                 int zflg = 0;
1422
1423                 if (flag & FIGNORECASE)
1424                         zflg |= ZCILOOK;
1425
1426                 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1427                     NULL, NULL);
1428                 if (error) {
1429                         if (have_acl)
1430                                 zfs_acl_ids_free(&acl_ids);
1431                         if (strcmp(name, "..") == 0)
1432                                 error = SET_ERROR(EISDIR);
1433                         ZFS_EXIT(zfsvfs);
1434                         return (error);
1435                 }
1436         }
1437
1438         if (zp == NULL) {
1439                 uint64_t txtype;
1440                 uint64_t projid = ZFS_DEFAULT_PROJID;
1441
1442                 /*
1443                  * Create a new file object and update the directory
1444                  * to reference it.
1445                  */
1446                 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
1447                         if (have_acl)
1448                                 zfs_acl_ids_free(&acl_ids);
1449                         goto out;
1450                 }
1451
1452                 /*
1453                  * We only support the creation of regular files in
1454                  * extended attribute directories.
1455                  */
1456
1457                 if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
1458                         if (have_acl)
1459                                 zfs_acl_ids_free(&acl_ids);
1460                         error = SET_ERROR(EINVAL);
1461                         goto out;
1462                 }
1463
1464                 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1465                     cr, vsecp, &acl_ids)) != 0)
1466                         goto out;
1467                 have_acl = B_TRUE;
1468
1469                 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
1470                         projid = zfs_inherit_projid(dzp);
1471                 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
1472                         zfs_acl_ids_free(&acl_ids);
1473                         error = SET_ERROR(EDQUOT);
1474                         goto out;
1475                 }
1476
1477                 tx = dmu_tx_create(os);
1478
1479                 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1480                     ZFS_SA_BASE_ATTR_SIZE);
1481
1482                 fuid_dirtied = zfsvfs->z_fuid_dirty;
1483                 if (fuid_dirtied)
1484                         zfs_fuid_txhold(zfsvfs, tx);
1485                 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1486                 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1487                 if (!zfsvfs->z_use_sa &&
1488                     acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1489                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1490                             0, acl_ids.z_aclp->z_acl_bytes);
1491                 }
1492
1493                 error = dmu_tx_assign(tx,
1494                     (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1495                 if (error) {
1496                         zfs_dirent_unlock(dl);
1497                         if (error == ERESTART) {
1498                                 waited = B_TRUE;
1499                                 dmu_tx_wait(tx);
1500                                 dmu_tx_abort(tx);
1501                                 goto top;
1502                         }
1503                         zfs_acl_ids_free(&acl_ids);
1504                         dmu_tx_abort(tx);
1505                         ZFS_EXIT(zfsvfs);
1506                         return (error);
1507                 }
1508                 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1509
1510                 error = zfs_link_create(dl, zp, tx, ZNEW);
1511                 if (error != 0) {
1512                         /*
1513                          * Since, we failed to add the directory entry for it,
1514                          * delete the newly created dnode.
1515                          */
1516                         zfs_znode_delete(zp, tx);
1517                         remove_inode_hash(ZTOI(zp));
1518                         zfs_acl_ids_free(&acl_ids);
1519                         dmu_tx_commit(tx);
1520                         goto out;
1521                 }
1522
1523                 if (fuid_dirtied)
1524                         zfs_fuid_sync(zfsvfs, tx);
1525
1526                 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1527                 if (flag & FIGNORECASE)
1528                         txtype |= TX_CI;
1529                 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1530                     vsecp, acl_ids.z_fuidp, vap);
1531                 zfs_acl_ids_free(&acl_ids);
1532                 dmu_tx_commit(tx);
1533         } else {
1534                 int aflags = (flag & O_APPEND) ? V_APPEND : 0;
1535
1536                 if (have_acl)
1537                         zfs_acl_ids_free(&acl_ids);
1538                 have_acl = B_FALSE;
1539
1540                 /*
1541                  * A directory entry already exists for this name.
1542                  */
1543                 /*
1544                  * Can't truncate an existing file if in exclusive mode.
1545                  */
1546                 if (excl) {
1547                         error = SET_ERROR(EEXIST);
1548                         goto out;
1549                 }
1550                 /*
1551                  * Can't open a directory for writing.
1552                  */
1553                 if (S_ISDIR(ZTOI(zp)->i_mode)) {
1554                         error = SET_ERROR(EISDIR);
1555                         goto out;
1556                 }
1557                 /*
1558                  * Verify requested access to file.
1559                  */
1560                 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1561                         goto out;
1562                 }
1563
1564                 mutex_enter(&dzp->z_lock);
1565                 dzp->z_seq++;
1566                 mutex_exit(&dzp->z_lock);
1567
1568                 /*
1569                  * Truncate regular files if requested.
1570                  */
1571                 if (S_ISREG(ZTOI(zp)->i_mode) &&
1572                     (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
1573                         /* we can't hold any locks when calling zfs_freesp() */
1574                         if (dl) {
1575                                 zfs_dirent_unlock(dl);
1576                                 dl = NULL;
1577                         }
1578                         error = zfs_freesp(zp, 0, 0, mode, TRUE);
1579                 }
1580         }
1581 out:
1582
1583         if (dl)
1584                 zfs_dirent_unlock(dl);
1585
1586         if (error) {
1587                 if (zp)
1588                         zrele(zp);
1589         } else {
1590                 zfs_inode_update(dzp);
1591                 zfs_inode_update(zp);
1592                 *zpp = zp;
1593         }
1594
1595         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1596                 zil_commit(zilog, 0);
1597
1598         ZFS_EXIT(zfsvfs);
1599         return (error);
1600 }
1601
1602 /* ARGSUSED */
1603 int
1604 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
1605     int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
1606 {
1607         znode_t         *zp = NULL, *dzp = ITOZ(dip);
1608         zfsvfs_t        *zfsvfs = ITOZSB(dip);
1609         objset_t        *os;
1610         dmu_tx_t        *tx;
1611         int             error;
1612         uid_t           uid;
1613         gid_t           gid;
1614         zfs_acl_ids_t   acl_ids;
1615         uint64_t        projid = ZFS_DEFAULT_PROJID;
1616         boolean_t       fuid_dirtied;
1617         boolean_t       have_acl = B_FALSE;
1618         boolean_t       waited = B_FALSE;
1619
1620         /*
1621          * If we have an ephemeral id, ACL, or XVATTR then
1622          * make sure file system is at proper version
1623          */
1624
1625         gid = crgetgid(cr);
1626         uid = crgetuid(cr);
1627
1628         if (zfsvfs->z_use_fuids == B_FALSE &&
1629             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1630                 return (SET_ERROR(EINVAL));
1631
1632         ZFS_ENTER(zfsvfs);
1633         ZFS_VERIFY_ZP(dzp);
1634         os = zfsvfs->z_os;
1635
1636         if (vap->va_mask & ATTR_XVATTR) {
1637                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1638                     crgetuid(cr), cr, vap->va_mode)) != 0) {
1639                         ZFS_EXIT(zfsvfs);
1640                         return (error);
1641                 }
1642         }
1643
1644 top:
1645         *ipp = NULL;
1646
1647         /*
1648          * Create a new file object and update the directory
1649          * to reference it.
1650          */
1651         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
1652                 if (have_acl)
1653                         zfs_acl_ids_free(&acl_ids);
1654                 goto out;
1655         }
1656
1657         if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1658             cr, vsecp, &acl_ids)) != 0)
1659                 goto out;
1660         have_acl = B_TRUE;
1661
1662         if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
1663                 projid = zfs_inherit_projid(dzp);
1664         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
1665                 zfs_acl_ids_free(&acl_ids);
1666                 error = SET_ERROR(EDQUOT);
1667                 goto out;
1668         }
1669
1670         tx = dmu_tx_create(os);
1671
1672         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1673             ZFS_SA_BASE_ATTR_SIZE);
1674         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1675
1676         fuid_dirtied = zfsvfs->z_fuid_dirty;
1677         if (fuid_dirtied)
1678                 zfs_fuid_txhold(zfsvfs, tx);
1679         if (!zfsvfs->z_use_sa &&
1680             acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1681                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1682                     0, acl_ids.z_aclp->z_acl_bytes);
1683         }
1684         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1685         if (error) {
1686                 if (error == ERESTART) {
1687                         waited = B_TRUE;
1688                         dmu_tx_wait(tx);
1689                         dmu_tx_abort(tx);
1690                         goto top;
1691                 }
1692                 zfs_acl_ids_free(&acl_ids);
1693                 dmu_tx_abort(tx);
1694                 ZFS_EXIT(zfsvfs);
1695                 return (error);
1696         }
1697         zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
1698
1699         if (fuid_dirtied)
1700                 zfs_fuid_sync(zfsvfs, tx);
1701
1702         /* Add to unlinked set */
1703         zp->z_unlinked = B_TRUE;
1704         zfs_unlinked_add(zp, tx);
1705         zfs_acl_ids_free(&acl_ids);
1706         dmu_tx_commit(tx);
1707 out:
1708
1709         if (error) {
1710                 if (zp)
1711                         zrele(zp);
1712         } else {
1713                 zfs_inode_update(dzp);
1714                 zfs_inode_update(zp);
1715                 *ipp = ZTOI(zp);
1716         }
1717
1718         ZFS_EXIT(zfsvfs);
1719         return (error);
1720 }
1721
1722 /*
1723  * Remove an entry from a directory.
1724  *
1725  *      IN:     dzp     - znode of directory to remove entry from.
1726  *              name    - name of entry to remove.
1727  *              cr      - credentials of caller.
1728  *              flags   - case flags.
1729  *
1730  *      RETURN: 0 if success
1731  *              error code if failure
1732  *
1733  * Timestamps:
1734  *      dzp - ctime|mtime
1735  *       ip - ctime (if nlink > 0)
1736  */
1737
1738 uint64_t null_xattr = 0;
1739
1740 /*ARGSUSED*/
1741 int
1742 zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
1743 {
1744         znode_t         *zp;
1745         znode_t         *xzp;
1746         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1747         zilog_t         *zilog;
1748         uint64_t        acl_obj, xattr_obj;
1749         uint64_t        xattr_obj_unlinked = 0;
1750         uint64_t        obj = 0;
1751         uint64_t        links;
1752         zfs_dirlock_t   *dl;
1753         dmu_tx_t        *tx;
1754         boolean_t       may_delete_now, delete_now = FALSE;
1755         boolean_t       unlinked, toobig = FALSE;
1756         uint64_t        txtype;
1757         pathname_t      *realnmp = NULL;
1758         pathname_t      realnm;
1759         int             error;
1760         int             zflg = ZEXISTS;
1761         boolean_t       waited = B_FALSE;
1762
1763         if (name == NULL)
1764                 return (SET_ERROR(EINVAL));
1765
1766         ZFS_ENTER(zfsvfs);
1767         ZFS_VERIFY_ZP(dzp);
1768         zilog = zfsvfs->z_log;
1769
1770         if (flags & FIGNORECASE) {
1771                 zflg |= ZCILOOK;
1772                 pn_alloc(&realnm);
1773                 realnmp = &realnm;
1774         }
1775
1776 top:
1777         xattr_obj = 0;
1778         xzp = NULL;
1779         /*
1780          * Attempt to lock directory; fail if entry doesn't exist.
1781          */
1782         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1783             NULL, realnmp))) {
1784                 if (realnmp)
1785                         pn_free(realnmp);
1786                 ZFS_EXIT(zfsvfs);
1787                 return (error);
1788         }
1789
1790         if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
1791                 goto out;
1792         }
1793
1794         /*
1795          * Need to use rmdir for removing directories.
1796          */
1797         if (S_ISDIR(ZTOI(zp)->i_mode)) {
1798                 error = SET_ERROR(EPERM);
1799                 goto out;
1800         }
1801
1802         mutex_enter(&zp->z_lock);
1803         may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
1804             !(zp->z_is_mapped);
1805         mutex_exit(&zp->z_lock);
1806
1807         /*
1808          * We may delete the znode now, or we may put it in the unlinked set;
1809          * it depends on whether we're the last link, and on whether there are
1810          * other holds on the inode.  So we dmu_tx_hold() the right things to
1811          * allow for either case.
1812          */
1813         obj = zp->z_id;
1814         tx = dmu_tx_create(zfsvfs->z_os);
1815         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1816         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1817         zfs_sa_upgrade_txholds(tx, zp);
1818         zfs_sa_upgrade_txholds(tx, dzp);
1819         if (may_delete_now) {
1820                 toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
1821                 /* if the file is too big, only hold_free a token amount */
1822                 dmu_tx_hold_free(tx, zp->z_id, 0,
1823                     (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1824         }
1825
1826         /* are there any extended attributes? */
1827         error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1828             &xattr_obj, sizeof (xattr_obj));
1829         if (error == 0 && xattr_obj) {
1830                 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1831                 ASSERT0(error);
1832                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1833                 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1834         }
1835
1836         mutex_enter(&zp->z_lock);
1837         if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1838                 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1839         mutex_exit(&zp->z_lock);
1840
1841         /* charge as an update -- would be nice not to charge at all */
1842         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1843
1844         /*
1845          * Mark this transaction as typically resulting in a net free of space
1846          */
1847         dmu_tx_mark_netfree(tx);
1848
1849         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1850         if (error) {
1851                 zfs_dirent_unlock(dl);
1852                 if (error == ERESTART) {
1853                         waited = B_TRUE;
1854                         dmu_tx_wait(tx);
1855                         dmu_tx_abort(tx);
1856                         zrele(zp);
1857                         if (xzp)
1858                                 zrele(xzp);
1859                         goto top;
1860                 }
1861                 if (realnmp)
1862                         pn_free(realnmp);
1863                 dmu_tx_abort(tx);
1864                 zrele(zp);
1865                 if (xzp)
1866                         zrele(xzp);
1867                 ZFS_EXIT(zfsvfs);
1868                 return (error);
1869         }
1870
1871         /*
1872          * Remove the directory entry.
1873          */
1874         error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1875
1876         if (error) {
1877                 dmu_tx_commit(tx);
1878                 goto out;
1879         }
1880
1881         if (unlinked) {
1882                 /*
1883                  * Hold z_lock so that we can make sure that the ACL obj
1884                  * hasn't changed.  Could have been deleted due to
1885                  * zfs_sa_upgrade().
1886                  */
1887                 mutex_enter(&zp->z_lock);
1888                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1889                     &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1890                 delete_now = may_delete_now && !toobig &&
1891                     atomic_read(&ZTOI(zp)->i_count) == 1 &&
1892                     !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked &&
1893                     zfs_external_acl(zp) == acl_obj;
1894         }
1895
1896         if (delete_now) {
1897                 if (xattr_obj_unlinked) {
1898                         ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
1899                         mutex_enter(&xzp->z_lock);
1900                         xzp->z_unlinked = B_TRUE;
1901                         clear_nlink(ZTOI(xzp));
1902                         links = 0;
1903                         error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1904                             &links, sizeof (links), tx);
1905                         ASSERT3U(error,  ==,  0);
1906                         mutex_exit(&xzp->z_lock);
1907                         zfs_unlinked_add(xzp, tx);
1908
1909                         if (zp->z_is_sa)
1910                                 error = sa_remove(zp->z_sa_hdl,
1911                                     SA_ZPL_XATTR(zfsvfs), tx);
1912                         else
1913                                 error = sa_update(zp->z_sa_hdl,
1914                                     SA_ZPL_XATTR(zfsvfs), &null_xattr,
1915                                     sizeof (uint64_t), tx);
1916                         ASSERT0(error);
1917                 }
1918                 /*
1919                  * Add to the unlinked set because a new reference could be
1920                  * taken concurrently resulting in a deferred destruction.
1921                  */
1922                 zfs_unlinked_add(zp, tx);
1923                 mutex_exit(&zp->z_lock);
1924         } else if (unlinked) {
1925                 mutex_exit(&zp->z_lock);
1926                 zfs_unlinked_add(zp, tx);
1927         }
1928
1929         txtype = TX_REMOVE;
1930         if (flags & FIGNORECASE)
1931                 txtype |= TX_CI;
1932         zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
1933
1934         dmu_tx_commit(tx);
1935 out:
1936         if (realnmp)
1937                 pn_free(realnmp);
1938
1939         zfs_dirent_unlock(dl);
1940         zfs_inode_update(dzp);
1941         zfs_inode_update(zp);
1942
1943         if (delete_now)
1944                 zrele(zp);
1945         else
1946                 zfs_zrele_async(zp);
1947
1948         if (xzp) {
1949                 zfs_inode_update(xzp);
1950                 zfs_zrele_async(xzp);
1951         }
1952
1953         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1954                 zil_commit(zilog, 0);
1955
1956         ZFS_EXIT(zfsvfs);
1957         return (error);
1958 }
1959
1960 /*
1961  * Create a new directory and insert it into dzp using the name
1962  * provided.  Return a pointer to the inserted directory.
1963  *
1964  *      IN:     dzp     - znode of directory to add subdir to.
1965  *              dirname - name of new directory.
1966  *              vap     - attributes of new directory.
1967  *              cr      - credentials of caller.
1968  *              flags   - case flags.
1969  *              vsecp   - ACL to be set
1970  *
1971  *      OUT:    zpp     - znode of created directory.
1972  *
1973  *      RETURN: 0 if success
1974  *              error code if failure
1975  *
1976  * Timestamps:
1977  *      dzp - ctime|mtime updated
1978  *      zpp - ctime|mtime|atime updated
1979  */
1980 /*ARGSUSED*/
1981 int
1982 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
1983     cred_t *cr, int flags, vsecattr_t *vsecp)
1984 {
1985         znode_t         *zp;
1986         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1987         zilog_t         *zilog;
1988         zfs_dirlock_t   *dl;
1989         uint64_t        txtype;
1990         dmu_tx_t        *tx;
1991         int             error;
1992         int             zf = ZNEW;
1993         uid_t           uid;
1994         gid_t           gid = crgetgid(cr);
1995         zfs_acl_ids_t   acl_ids;
1996         boolean_t       fuid_dirtied;
1997         boolean_t       waited = B_FALSE;
1998
1999         ASSERT(S_ISDIR(vap->va_mode));
2000
2001         /*
2002          * If we have an ephemeral id, ACL, or XVATTR then
2003          * make sure file system is at proper version
2004          */
2005
2006         uid = crgetuid(cr);
2007         if (zfsvfs->z_use_fuids == B_FALSE &&
2008             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2009                 return (SET_ERROR(EINVAL));
2010
2011         if (dirname == NULL)
2012                 return (SET_ERROR(EINVAL));
2013
2014         ZFS_ENTER(zfsvfs);
2015         ZFS_VERIFY_ZP(dzp);
2016         zilog = zfsvfs->z_log;
2017
2018         if (dzp->z_pflags & ZFS_XATTR) {
2019                 ZFS_EXIT(zfsvfs);
2020                 return (SET_ERROR(EINVAL));
2021         }
2022
2023         if (zfsvfs->z_utf8 && u8_validate(dirname,
2024             strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2025                 ZFS_EXIT(zfsvfs);
2026                 return (SET_ERROR(EILSEQ));
2027         }
2028         if (flags & FIGNORECASE)
2029                 zf |= ZCILOOK;
2030
2031         if (vap->va_mask & ATTR_XVATTR) {
2032                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
2033                     crgetuid(cr), cr, vap->va_mode)) != 0) {
2034                         ZFS_EXIT(zfsvfs);
2035                         return (error);
2036                 }
2037         }
2038
2039         if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2040             vsecp, &acl_ids)) != 0) {
2041                 ZFS_EXIT(zfsvfs);
2042                 return (error);
2043         }
2044         /*
2045          * First make sure the new directory doesn't exist.
2046          *
2047          * Existence is checked first to make sure we don't return
2048          * EACCES instead of EEXIST which can cause some applications
2049          * to fail.
2050          */
2051 top:
2052         *zpp = NULL;
2053
2054         if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
2055             NULL, NULL))) {
2056                 zfs_acl_ids_free(&acl_ids);
2057                 ZFS_EXIT(zfsvfs);
2058                 return (error);
2059         }
2060
2061         if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
2062                 zfs_acl_ids_free(&acl_ids);
2063                 zfs_dirent_unlock(dl);
2064                 ZFS_EXIT(zfsvfs);
2065                 return (error);
2066         }
2067
2068         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
2069                 zfs_acl_ids_free(&acl_ids);
2070                 zfs_dirent_unlock(dl);
2071                 ZFS_EXIT(zfsvfs);
2072                 return (SET_ERROR(EDQUOT));
2073         }
2074
2075         /*
2076          * Add a new entry to the directory.
2077          */
2078         tx = dmu_tx_create(zfsvfs->z_os);
2079         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2080         dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2081         fuid_dirtied = zfsvfs->z_fuid_dirty;
2082         if (fuid_dirtied)
2083                 zfs_fuid_txhold(zfsvfs, tx);
2084         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2085                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2086                     acl_ids.z_aclp->z_acl_bytes);
2087         }
2088
2089         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2090             ZFS_SA_BASE_ATTR_SIZE);
2091
2092         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
2093         if (error) {
2094                 zfs_dirent_unlock(dl);
2095                 if (error == ERESTART) {
2096                         waited = B_TRUE;
2097                         dmu_tx_wait(tx);
2098                         dmu_tx_abort(tx);
2099                         goto top;
2100                 }
2101                 zfs_acl_ids_free(&acl_ids);
2102                 dmu_tx_abort(tx);
2103                 ZFS_EXIT(zfsvfs);
2104                 return (error);
2105         }
2106
2107         /*
2108          * Create new node.
2109          */
2110         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2111
2112         /*
2113          * Now put new name in parent dir.
2114          */
2115         error = zfs_link_create(dl, zp, tx, ZNEW);
2116         if (error != 0) {
2117                 zfs_znode_delete(zp, tx);
2118                 remove_inode_hash(ZTOI(zp));
2119                 goto out;
2120         }
2121
2122         if (fuid_dirtied)
2123                 zfs_fuid_sync(zfsvfs, tx);
2124
2125         *zpp = zp;
2126
2127         txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
2128         if (flags & FIGNORECASE)
2129                 txtype |= TX_CI;
2130         zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
2131             acl_ids.z_fuidp, vap);
2132
2133 out:
2134         zfs_acl_ids_free(&acl_ids);
2135
2136         dmu_tx_commit(tx);
2137
2138         zfs_dirent_unlock(dl);
2139
2140         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2141                 zil_commit(zilog, 0);
2142
2143         if (error != 0) {
2144                 zrele(zp);
2145         } else {
2146                 zfs_inode_update(dzp);
2147                 zfs_inode_update(zp);
2148         }
2149         ZFS_EXIT(zfsvfs);
2150         return (error);
2151 }
2152
2153 /*
2154  * Remove a directory subdir entry.  If the current working
2155  * directory is the same as the subdir to be removed, the
2156  * remove will fail.
2157  *
2158  *      IN:     dzp     - znode of directory to remove from.
2159  *              name    - name of directory to be removed.
2160  *              cwd     - inode of current working directory.
2161  *              cr      - credentials of caller.
2162  *              flags   - case flags
2163  *
2164  *      RETURN: 0 on success, error code on failure.
2165  *
2166  * Timestamps:
2167  *      dzp - ctime|mtime updated
2168  */
2169 /*ARGSUSED*/
2170 int
2171 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
2172     int flags)
2173 {
2174         znode_t         *zp;
2175         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
2176         zilog_t         *zilog;
2177         zfs_dirlock_t   *dl;
2178         dmu_tx_t        *tx;
2179         int             error;
2180         int             zflg = ZEXISTS;
2181         boolean_t       waited = B_FALSE;
2182
2183         if (name == NULL)
2184                 return (SET_ERROR(EINVAL));
2185
2186         ZFS_ENTER(zfsvfs);
2187         ZFS_VERIFY_ZP(dzp);
2188         zilog = zfsvfs->z_log;
2189
2190         if (flags & FIGNORECASE)
2191                 zflg |= ZCILOOK;
2192 top:
2193         zp = NULL;
2194
2195         /*
2196          * Attempt to lock directory; fail if entry doesn't exist.
2197          */
2198         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
2199             NULL, NULL))) {
2200                 ZFS_EXIT(zfsvfs);
2201                 return (error);
2202         }
2203
2204         if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
2205                 goto out;
2206         }
2207
2208         if (!S_ISDIR(ZTOI(zp)->i_mode)) {
2209                 error = SET_ERROR(ENOTDIR);
2210                 goto out;
2211         }
2212
2213         if (zp == cwd) {
2214                 error = SET_ERROR(EINVAL);
2215                 goto out;
2216         }
2217
2218         /*
2219          * Grab a lock on the directory to make sure that no one is
2220          * trying to add (or lookup) entries while we are removing it.
2221          */
2222         rw_enter(&zp->z_name_lock, RW_WRITER);
2223
2224         /*
2225          * Grab a lock on the parent pointer to make sure we play well
2226          * with the treewalk and directory rename code.
2227          */
2228         rw_enter(&zp->z_parent_lock, RW_WRITER);
2229
2230         tx = dmu_tx_create(zfsvfs->z_os);
2231         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2232         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2233         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2234         zfs_sa_upgrade_txholds(tx, zp);
2235         zfs_sa_upgrade_txholds(tx, dzp);
2236         dmu_tx_mark_netfree(tx);
2237         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
2238         if (error) {
2239                 rw_exit(&zp->z_parent_lock);
2240                 rw_exit(&zp->z_name_lock);
2241                 zfs_dirent_unlock(dl);
2242                 if (error == ERESTART) {
2243                         waited = B_TRUE;
2244                         dmu_tx_wait(tx);
2245                         dmu_tx_abort(tx);
2246                         zrele(zp);
2247                         goto top;
2248                 }
2249                 dmu_tx_abort(tx);
2250                 zrele(zp);
2251                 ZFS_EXIT(zfsvfs);
2252                 return (error);
2253         }
2254
2255         error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2256
2257         if (error == 0) {
2258                 uint64_t txtype = TX_RMDIR;
2259                 if (flags & FIGNORECASE)
2260                         txtype |= TX_CI;
2261                 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
2262                     B_FALSE);
2263         }
2264
2265         dmu_tx_commit(tx);
2266
2267         rw_exit(&zp->z_parent_lock);
2268         rw_exit(&zp->z_name_lock);
2269 out:
2270         zfs_dirent_unlock(dl);
2271
2272         zfs_inode_update(dzp);
2273         zfs_inode_update(zp);
2274         zrele(zp);
2275
2276         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2277                 zil_commit(zilog, 0);
2278
2279         ZFS_EXIT(zfsvfs);
2280         return (error);
2281 }
2282
2283 /*
2284  * Read directory entries from the given directory cursor position and emit
2285  * name and position for each entry.
2286  *
2287  *      IN:     ip      - inode of directory to read.
2288  *              ctx     - directory entry context.
2289  *              cr      - credentials of caller.
2290  *
2291  *      RETURN: 0 if success
2292  *              error code if failure
2293  *
2294  * Timestamps:
2295  *      ip - atime updated
2296  *
2297  * Note that the low 4 bits of the cookie returned by zap is always zero.
2298  * This allows us to use the low range for "special" directory entries:
2299  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2300  * we use the offset 2 for the '.zfs' directory.
2301  */
2302 /* ARGSUSED */
2303 int
2304 zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
2305 {
2306         znode_t         *zp = ITOZ(ip);
2307         zfsvfs_t        *zfsvfs = ITOZSB(ip);
2308         objset_t        *os;
2309         zap_cursor_t    zc;
2310         zap_attribute_t zap;
2311         int             error;
2312         uint8_t         prefetch;
2313         uint8_t         type;
2314         int             done = 0;
2315         uint64_t        parent;
2316         uint64_t        offset; /* must be unsigned; checks for < 1 */
2317
2318         ZFS_ENTER(zfsvfs);
2319         ZFS_VERIFY_ZP(zp);
2320
2321         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2322             &parent, sizeof (parent))) != 0)
2323                 goto out;
2324
2325         /*
2326          * Quit if directory has been removed (posix)
2327          */
2328         if (zp->z_unlinked)
2329                 goto out;
2330
2331         error = 0;
2332         os = zfsvfs->z_os;
2333         offset = ctx->pos;
2334         prefetch = zp->z_zn_prefetch;
2335
2336         /*
2337          * Initialize the iterator cursor.
2338          */
2339         if (offset <= 3) {
2340                 /*
2341                  * Start iteration from the beginning of the directory.
2342                  */
2343                 zap_cursor_init(&zc, os, zp->z_id);
2344         } else {
2345                 /*
2346                  * The offset is a serialized cursor.
2347                  */
2348                 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2349         }
2350
2351         /*
2352          * Transform to file-system independent format
2353          */
2354         while (!done) {
2355                 uint64_t objnum;
2356                 /*
2357                  * Special case `.', `..', and `.zfs'.
2358                  */
2359                 if (offset == 0) {
2360                         (void) strcpy(zap.za_name, ".");
2361                         zap.za_normalization_conflict = 0;
2362                         objnum = zp->z_id;
2363                         type = DT_DIR;
2364                 } else if (offset == 1) {
2365                         (void) strcpy(zap.za_name, "..");
2366                         zap.za_normalization_conflict = 0;
2367                         objnum = parent;
2368                         type = DT_DIR;
2369                 } else if (offset == 2 && zfs_show_ctldir(zp)) {
2370                         (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2371                         zap.za_normalization_conflict = 0;
2372                         objnum = ZFSCTL_INO_ROOT;
2373                         type = DT_DIR;
2374                 } else {
2375                         /*
2376                          * Grab next entry.
2377                          */
2378                         if ((error = zap_cursor_retrieve(&zc, &zap))) {
2379                                 if (error == ENOENT)
2380                                         break;
2381                                 else
2382                                         goto update;
2383                         }
2384
2385                         /*
2386                          * Allow multiple entries provided the first entry is
2387                          * the object id.  Non-zpl consumers may safely make
2388                          * use of the additional space.
2389                          *
2390                          * XXX: This should be a feature flag for compatibility
2391                          */
2392                         if (zap.za_integer_length != 8 ||
2393                             zap.za_num_integers == 0) {
2394                                 cmn_err(CE_WARN, "zap_readdir: bad directory "
2395                                     "entry, obj = %lld, offset = %lld, "
2396                                     "length = %d, num = %lld\n",
2397                                     (u_longlong_t)zp->z_id,
2398                                     (u_longlong_t)offset,
2399                                     zap.za_integer_length,
2400                                     (u_longlong_t)zap.za_num_integers);
2401                                 error = SET_ERROR(ENXIO);
2402                                 goto update;
2403                         }
2404
2405                         objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2406                         type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2407                 }
2408
2409                 done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
2410                     objnum, type);
2411                 if (done)
2412                         break;
2413
2414                 /* Prefetch znode */
2415                 if (prefetch) {
2416                         dmu_prefetch(os, objnum, 0, 0, 0,
2417                             ZIO_PRIORITY_SYNC_READ);
2418                 }
2419
2420                 /*
2421                  * Move to the next entry, fill in the previous offset.
2422                  */
2423                 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2424                         zap_cursor_advance(&zc);
2425                         offset = zap_cursor_serialize(&zc);
2426                 } else {
2427                         offset += 1;
2428                 }
2429                 ctx->pos = offset;
2430         }
2431         zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2432
2433 update:
2434         zap_cursor_fini(&zc);
2435         if (error == ENOENT)
2436                 error = 0;
2437 out:
2438         ZFS_EXIT(zfsvfs);
2439
2440         return (error);
2441 }
2442
2443 ulong_t zfs_fsync_sync_cnt = 4;
2444
2445 int
2446 zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
2447 {
2448         zfsvfs_t *zfsvfs = ZTOZSB(zp);
2449
2450         (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2451
2452         if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2453                 ZFS_ENTER(zfsvfs);
2454                 ZFS_VERIFY_ZP(zp);
2455                 zil_commit(zfsvfs->z_log, zp->z_id);
2456                 ZFS_EXIT(zfsvfs);
2457         }
2458         tsd_set(zfs_fsyncer_key, NULL);
2459
2460         return (0);
2461 }
2462
2463 /*
2464  * Get the basic file attributes and place them in the provided kstat
2465  * structure.  The inode is assumed to be the authoritative source
2466  * for most of the attributes.  However, the znode currently has the
2467  * authoritative atime, blksize, and block count.
2468  *
2469  *      IN:     ip      - inode of file.
2470  *
2471  *      OUT:    sp      - kstat values.
2472  *
2473  *      RETURN: 0 (always succeeds)
2474  */
2475 /* ARGSUSED */
2476 int
2477 zfs_getattr_fast(struct inode *ip, struct kstat *sp)
2478 {
2479         znode_t *zp = ITOZ(ip);
2480         zfsvfs_t *zfsvfs = ITOZSB(ip);
2481         uint32_t blksize;
2482         u_longlong_t nblocks;
2483
2484         ZFS_ENTER(zfsvfs);
2485         ZFS_VERIFY_ZP(zp);
2486
2487         mutex_enter(&zp->z_lock);
2488
2489         generic_fillattr(ip, sp);
2490         /*
2491          * +1 link count for root inode with visible '.zfs' directory.
2492          */
2493         if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
2494                 if (sp->nlink < ZFS_LINK_MAX)
2495                         sp->nlink++;
2496
2497         sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2498         sp->blksize = blksize;
2499         sp->blocks = nblocks;
2500
2501         if (unlikely(zp->z_blksz == 0)) {
2502                 /*
2503                  * Block size hasn't been set; suggest maximal I/O transfers.
2504                  */
2505                 sp->blksize = zfsvfs->z_max_blksz;
2506         }
2507
2508         mutex_exit(&zp->z_lock);
2509
2510         /*
2511          * Required to prevent NFS client from detecting different inode
2512          * numbers of snapshot root dentry before and after snapshot mount.
2513          */
2514         if (zfsvfs->z_issnap) {
2515                 if (ip->i_sb->s_root->d_inode == ip)
2516                         sp->ino = ZFSCTL_INO_SNAPDIRS -
2517                             dmu_objset_id(zfsvfs->z_os);
2518         }
2519
2520         ZFS_EXIT(zfsvfs);
2521
2522         return (0);
2523 }
2524
2525 /*
2526  * For the operation of changing file's user/group/project, we need to
2527  * handle not only the main object that is assigned to the file directly,
2528  * but also the ones that are used by the file via hidden xattr directory.
2529  *
2530  * Because the xattr directory may contains many EA entries, as to it may
2531  * be impossible to change all of them via the transaction of changing the
2532  * main object's user/group/project attributes. Then we have to change them
2533  * via other multiple independent transactions one by one. It may be not good
2534  * solution, but we have no better idea yet.
2535  */
2536 static int
2537 zfs_setattr_dir(znode_t *dzp)
2538 {
2539         struct inode    *dxip = ZTOI(dzp);
2540         struct inode    *xip = NULL;
2541         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
2542         objset_t        *os = zfsvfs->z_os;
2543         zap_cursor_t    zc;
2544         zap_attribute_t zap;
2545         zfs_dirlock_t   *dl;
2546         znode_t         *zp = NULL;
2547         dmu_tx_t        *tx = NULL;
2548         uint64_t        uid, gid;
2549         sa_bulk_attr_t  bulk[4];
2550         int             count;
2551         int             err;
2552
2553         zap_cursor_init(&zc, os, dzp->z_id);
2554         while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
2555                 count = 0;
2556                 if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
2557                         err = ENXIO;
2558                         break;
2559                 }
2560
2561                 err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
2562                     ZEXISTS, NULL, NULL);
2563                 if (err == ENOENT)
2564                         goto next;
2565                 if (err)
2566                         break;
2567
2568                 xip = ZTOI(zp);
2569                 if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
2570                     KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
2571                     zp->z_projid == dzp->z_projid)
2572                         goto next;
2573
2574                 tx = dmu_tx_create(os);
2575                 if (!(zp->z_pflags & ZFS_PROJID))
2576                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2577                 else
2578                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2579
2580                 err = dmu_tx_assign(tx, TXG_WAIT);
2581                 if (err)
2582                         break;
2583
2584                 mutex_enter(&dzp->z_lock);
2585
2586                 if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
2587                         xip->i_uid = dxip->i_uid;
2588                         uid = zfs_uid_read(dxip);
2589                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2590                             &uid, sizeof (uid));
2591                 }
2592
2593                 if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
2594                         xip->i_gid = dxip->i_gid;
2595                         gid = zfs_gid_read(dxip);
2596                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
2597                             &gid, sizeof (gid));
2598                 }
2599
2600                 if (zp->z_projid != dzp->z_projid) {
2601                         if (!(zp->z_pflags & ZFS_PROJID)) {
2602                                 zp->z_pflags |= ZFS_PROJID;
2603                                 SA_ADD_BULK_ATTR(bulk, count,
2604                                     SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
2605                                     sizeof (zp->z_pflags));
2606                         }
2607
2608                         zp->z_projid = dzp->z_projid;
2609                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
2610                             NULL, &zp->z_projid, sizeof (zp->z_projid));
2611                 }
2612
2613                 mutex_exit(&dzp->z_lock);
2614
2615                 if (likely(count > 0)) {
2616                         err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2617                         dmu_tx_commit(tx);
2618                 } else {
2619                         dmu_tx_abort(tx);
2620                 }
2621                 tx = NULL;
2622                 if (err != 0 && err != ENOENT)
2623                         break;
2624
2625 next:
2626                 if (zp) {
2627                         zrele(zp);
2628                         zp = NULL;
2629                         zfs_dirent_unlock(dl);
2630                 }
2631                 zap_cursor_advance(&zc);
2632         }
2633
2634         if (tx)
2635                 dmu_tx_abort(tx);
2636         if (zp) {
2637                 zrele(zp);
2638                 zfs_dirent_unlock(dl);
2639         }
2640         zap_cursor_fini(&zc);
2641
2642         return (err == ENOENT ? 0 : err);
2643 }
2644
2645 /*
2646  * Set the file attributes to the values contained in the
2647  * vattr structure.
2648  *
2649  *      IN:     zp      - znode of file to be modified.
2650  *              vap     - new attribute values.
2651  *                        If ATTR_XVATTR set, then optional attrs are being set
2652  *              flags   - ATTR_UTIME set if non-default time values provided.
2653  *                      - ATTR_NOACLCHECK (CIFS context only).
2654  *              cr      - credentials of caller.
2655  *
2656  *      RETURN: 0 if success
2657  *              error code if failure
2658  *
2659  * Timestamps:
2660  *      ip - ctime updated, mtime updated if size changed.
2661  */
2662 /* ARGSUSED */
2663 int
2664 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr)
2665 {
2666         struct inode    *ip;
2667         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
2668         objset_t        *os = zfsvfs->z_os;
2669         zilog_t         *zilog;
2670         dmu_tx_t        *tx;
2671         vattr_t         oldva;
2672         xvattr_t        *tmpxvattr;
2673         uint_t          mask = vap->va_mask;
2674         uint_t          saved_mask = 0;
2675         int             trim_mask = 0;
2676         uint64_t        new_mode;
2677         uint64_t        new_kuid = 0, new_kgid = 0, new_uid, new_gid;
2678         uint64_t        xattr_obj;
2679         uint64_t        mtime[2], ctime[2], atime[2];
2680         uint64_t        projid = ZFS_INVALID_PROJID;
2681         znode_t         *attrzp;
2682         int             need_policy = FALSE;
2683         int             err, err2 = 0;
2684         zfs_fuid_info_t *fuidp = NULL;
2685         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
2686         xoptattr_t      *xoap;
2687         zfs_acl_t       *aclp;
2688         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2689         boolean_t       fuid_dirtied = B_FALSE;
2690         boolean_t       handle_eadir = B_FALSE;
2691         sa_bulk_attr_t  *bulk, *xattr_bulk;
2692         int             count = 0, xattr_count = 0, bulks = 8;
2693
2694         if (mask == 0)
2695                 return (0);
2696
2697         ZFS_ENTER(zfsvfs);
2698         ZFS_VERIFY_ZP(zp);
2699         ip = ZTOI(zp);
2700
2701         /*
2702          * If this is a xvattr_t, then get a pointer to the structure of
2703          * optional attributes.  If this is NULL, then we have a vattr_t.
2704          */
2705         xoap = xva_getxoptattr(xvap);
2706         if (xoap != NULL && (mask & ATTR_XVATTR)) {
2707                 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
2708                         if (!dmu_objset_projectquota_enabled(os) ||
2709                             (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
2710                                 ZFS_EXIT(zfsvfs);
2711                                 return (SET_ERROR(ENOTSUP));
2712                         }
2713
2714                         projid = xoap->xoa_projid;
2715                         if (unlikely(projid == ZFS_INVALID_PROJID)) {
2716                                 ZFS_EXIT(zfsvfs);
2717                                 return (SET_ERROR(EINVAL));
2718                         }
2719
2720                         if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
2721                                 projid = ZFS_INVALID_PROJID;
2722                         else
2723                                 need_policy = TRUE;
2724                 }
2725
2726                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
2727                     (xoap->xoa_projinherit !=
2728                     ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
2729                     (!dmu_objset_projectquota_enabled(os) ||
2730                     (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
2731                         ZFS_EXIT(zfsvfs);
2732                         return (SET_ERROR(ENOTSUP));
2733                 }
2734         }
2735
2736         zilog = zfsvfs->z_log;
2737
2738         /*
2739          * Make sure that if we have ephemeral uid/gid or xvattr specified
2740          * that file system is at proper version level
2741          */
2742
2743         if (zfsvfs->z_use_fuids == B_FALSE &&
2744             (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2745             ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2746             (mask & ATTR_XVATTR))) {
2747                 ZFS_EXIT(zfsvfs);
2748                 return (SET_ERROR(EINVAL));
2749         }
2750
2751         if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
2752                 ZFS_EXIT(zfsvfs);
2753                 return (SET_ERROR(EISDIR));
2754         }
2755
2756         if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
2757                 ZFS_EXIT(zfsvfs);
2758                 return (SET_ERROR(EINVAL));
2759         }
2760
2761         tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
2762         xva_init(tmpxvattr);
2763
2764         bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
2765         xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
2766
2767         /*
2768          * Immutable files can only alter immutable bit and atime
2769          */
2770         if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2771             ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
2772             ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2773                 err = SET_ERROR(EPERM);
2774                 goto out3;
2775         }
2776
2777         if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2778                 err = SET_ERROR(EPERM);
2779                 goto out3;
2780         }
2781
2782         /*
2783          * Verify timestamps doesn't overflow 32 bits.
2784          * ZFS can handle large timestamps, but 32bit syscalls can't
2785          * handle times greater than 2039.  This check should be removed
2786          * once large timestamps are fully supported.
2787          */
2788         if (mask & (ATTR_ATIME | ATTR_MTIME)) {
2789                 if (((mask & ATTR_ATIME) &&
2790                     TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2791                     ((mask & ATTR_MTIME) &&
2792                     TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2793                         err = SET_ERROR(EOVERFLOW);
2794                         goto out3;
2795                 }
2796         }
2797
2798 top:
2799         attrzp = NULL;
2800         aclp = NULL;
2801
2802         /* Can this be moved to before the top label? */
2803         if (zfs_is_readonly(zfsvfs)) {
2804                 err = SET_ERROR(EROFS);
2805                 goto out3;
2806         }
2807
2808         /*
2809          * First validate permissions
2810          */
2811
2812         if (mask & ATTR_SIZE) {
2813                 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2814                 if (err)
2815                         goto out3;
2816
2817                 /*
2818                  * XXX - Note, we are not providing any open
2819                  * mode flags here (like FNDELAY), so we may
2820                  * block if there are locks present... this
2821                  * should be addressed in openat().
2822                  */
2823                 /* XXX - would it be OK to generate a log record here? */
2824                 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2825                 if (err)
2826                         goto out3;
2827         }
2828
2829         if (mask & (ATTR_ATIME|ATTR_MTIME) ||
2830             ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2831             XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2832             XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2833             XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2834             XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2835             XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2836             XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2837                 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2838                     skipaclchk, cr);
2839         }
2840
2841         if (mask & (ATTR_UID|ATTR_GID)) {
2842                 int     idmask = (mask & (ATTR_UID|ATTR_GID));
2843                 int     take_owner;
2844                 int     take_group;
2845
2846                 /*
2847                  * NOTE: even if a new mode is being set,
2848                  * we may clear S_ISUID/S_ISGID bits.
2849                  */
2850
2851                 if (!(mask & ATTR_MODE))
2852                         vap->va_mode = zp->z_mode;
2853
2854                 /*
2855                  * Take ownership or chgrp to group we are a member of
2856                  */
2857
2858                 take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr));
2859                 take_group = (mask & ATTR_GID) &&
2860                     zfs_groupmember(zfsvfs, vap->va_gid, cr);
2861
2862                 /*
2863                  * If both ATTR_UID and ATTR_GID are set then take_owner and
2864                  * take_group must both be set in order to allow taking
2865                  * ownership.
2866                  *
2867                  * Otherwise, send the check through secpolicy_vnode_setattr()
2868                  *
2869                  */
2870
2871                 if (((idmask == (ATTR_UID|ATTR_GID)) &&
2872                     take_owner && take_group) ||
2873                     ((idmask == ATTR_UID) && take_owner) ||
2874                     ((idmask == ATTR_GID) && take_group)) {
2875                         if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2876                             skipaclchk, cr) == 0) {
2877                                 /*
2878                                  * Remove setuid/setgid for non-privileged users
2879                                  */
2880                                 (void) secpolicy_setid_clear(vap, cr);
2881                                 trim_mask = (mask & (ATTR_UID|ATTR_GID));
2882                         } else {
2883                                 need_policy =  TRUE;
2884                         }
2885                 } else {
2886                         need_policy =  TRUE;
2887                 }
2888         }
2889
2890         mutex_enter(&zp->z_lock);
2891         oldva.va_mode = zp->z_mode;
2892         zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2893         if (mask & ATTR_XVATTR) {
2894                 /*
2895                  * Update xvattr mask to include only those attributes
2896                  * that are actually changing.
2897                  *
2898                  * the bits will be restored prior to actually setting
2899                  * the attributes so the caller thinks they were set.
2900                  */
2901                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2902                         if (xoap->xoa_appendonly !=
2903                             ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2904                                 need_policy = TRUE;
2905                         } else {
2906                                 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2907                                 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
2908                         }
2909                 }
2910
2911                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2912                         if (xoap->xoa_projinherit !=
2913                             ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
2914                                 need_policy = TRUE;
2915                         } else {
2916                                 XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
2917                                 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
2918                         }
2919                 }
2920
2921                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2922                         if (xoap->xoa_nounlink !=
2923                             ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2924                                 need_policy = TRUE;
2925                         } else {
2926                                 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2927                                 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
2928                         }
2929                 }
2930
2931                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2932                         if (xoap->xoa_immutable !=
2933                             ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2934                                 need_policy = TRUE;
2935                         } else {
2936                                 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2937                                 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
2938                         }
2939                 }
2940
2941                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2942                         if (xoap->xoa_nodump !=
2943                             ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2944                                 need_policy = TRUE;
2945                         } else {
2946                                 XVA_CLR_REQ(xvap, XAT_NODUMP);
2947                                 XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
2948                         }
2949                 }
2950
2951                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2952                         if (xoap->xoa_av_modified !=
2953                             ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2954                                 need_policy = TRUE;
2955                         } else {
2956                                 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2957                                 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
2958                         }
2959                 }
2960
2961                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2962                         if ((!S_ISREG(ip->i_mode) &&
2963                             xoap->xoa_av_quarantined) ||
2964                             xoap->xoa_av_quarantined !=
2965                             ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2966                                 need_policy = TRUE;
2967                         } else {
2968                                 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2969                                 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
2970                         }
2971                 }
2972
2973                 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2974                         mutex_exit(&zp->z_lock);
2975                         err = SET_ERROR(EPERM);
2976                         goto out3;
2977                 }
2978
2979                 if (need_policy == FALSE &&
2980                     (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2981                     XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2982                         need_policy = TRUE;
2983                 }
2984         }
2985
2986         mutex_exit(&zp->z_lock);
2987
2988         if (mask & ATTR_MODE) {
2989                 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2990                         err = secpolicy_setid_setsticky_clear(ip, vap,
2991                             &oldva, cr);
2992                         if (err)
2993                                 goto out3;
2994
2995                         trim_mask |= ATTR_MODE;
2996                 } else {
2997                         need_policy = TRUE;
2998                 }
2999         }
3000
3001         if (need_policy) {
3002                 /*
3003                  * If trim_mask is set then take ownership
3004                  * has been granted or write_acl is present and user
3005                  * has the ability to modify mode.  In that case remove
3006                  * UID|GID and or MODE from mask so that
3007                  * secpolicy_vnode_setattr() doesn't revoke it.
3008                  */
3009
3010                 if (trim_mask) {
3011                         saved_mask = vap->va_mask;
3012                         vap->va_mask &= ~trim_mask;
3013                 }
3014                 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
3015                     (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3016                 if (err)
3017                         goto out3;
3018
3019                 if (trim_mask)
3020                         vap->va_mask |= saved_mask;
3021         }
3022
3023         /*
3024          * secpolicy_vnode_setattr, or take ownership may have
3025          * changed va_mask
3026          */
3027         mask = vap->va_mask;
3028
3029         if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
3030                 handle_eadir = B_TRUE;
3031                 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3032                     &xattr_obj, sizeof (xattr_obj));
3033
3034                 if (err == 0 && xattr_obj) {
3035                         err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
3036                         if (err)
3037                                 goto out2;
3038                 }
3039                 if (mask & ATTR_UID) {
3040                         new_kuid = zfs_fuid_create(zfsvfs,
3041                             (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3042                         if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
3043                             zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
3044                             new_kuid)) {
3045                                 if (attrzp)
3046                                         zrele(attrzp);
3047                                 err = SET_ERROR(EDQUOT);
3048                                 goto out2;
3049                         }
3050                 }
3051
3052                 if (mask & ATTR_GID) {
3053                         new_kgid = zfs_fuid_create(zfsvfs,
3054                             (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
3055                         if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
3056                             zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
3057                             new_kgid)) {
3058                                 if (attrzp)
3059                                         zrele(attrzp);
3060                                 err = SET_ERROR(EDQUOT);
3061                                 goto out2;
3062                         }
3063                 }
3064
3065                 if (projid != ZFS_INVALID_PROJID &&
3066                     zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
3067                         if (attrzp)
3068                                 zrele(attrzp);
3069                         err = EDQUOT;
3070                         goto out2;
3071                 }
3072         }
3073         tx = dmu_tx_create(os);
3074
3075         if (mask & ATTR_MODE) {
3076                 uint64_t pmode = zp->z_mode;
3077                 uint64_t acl_obj;
3078                 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3079
3080                 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED &&
3081                     !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3082                         err = EPERM;
3083                         goto out;
3084                 }
3085
3086                 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
3087                         goto out;
3088
3089                 mutex_enter(&zp->z_lock);
3090                 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3091                         /*
3092                          * Are we upgrading ACL from old V0 format
3093                          * to V1 format?
3094                          */
3095                         if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3096                             zfs_znode_acl_version(zp) ==
3097                             ZFS_ACL_VERSION_INITIAL) {
3098                                 dmu_tx_hold_free(tx, acl_obj, 0,
3099                                     DMU_OBJECT_END);
3100                                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3101                                     0, aclp->z_acl_bytes);
3102                         } else {
3103                                 dmu_tx_hold_write(tx, acl_obj, 0,
3104                                     aclp->z_acl_bytes);
3105                         }
3106                 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3107                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3108                             0, aclp->z_acl_bytes);
3109                 }
3110                 mutex_exit(&zp->z_lock);
3111                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3112         } else {
3113                 if (((mask & ATTR_XVATTR) &&
3114                     XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
3115                     (projid != ZFS_INVALID_PROJID &&
3116                     !(zp->z_pflags & ZFS_PROJID)))
3117                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3118                 else
3119                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3120         }
3121
3122         if (attrzp) {
3123                 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3124         }
3125
3126         fuid_dirtied = zfsvfs->z_fuid_dirty;
3127         if (fuid_dirtied)
3128                 zfs_fuid_txhold(zfsvfs, tx);
3129
3130         zfs_sa_upgrade_txholds(tx, zp);
3131
3132         err = dmu_tx_assign(tx, TXG_WAIT);
3133         if (err)
3134                 goto out;
3135
3136         count = 0;
3137         /*
3138          * Set each attribute requested.
3139          * We group settings according to the locks they need to acquire.
3140          *
3141          * Note: you cannot set ctime directly, although it will be
3142          * updated as a side-effect of calling this function.
3143          */
3144
3145         if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
3146                 /*
3147                  * For the existed object that is upgraded from old system,
3148                  * its on-disk layout has no slot for the project ID attribute.
3149                  * But quota accounting logic needs to access related slots by
3150                  * offset directly. So we need to adjust old objects' layout
3151                  * to make the project ID to some unified and fixed offset.
3152                  */
3153                 if (attrzp)
3154                         err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
3155                 if (err == 0)
3156                         err = sa_add_projid(zp->z_sa_hdl, tx, projid);
3157
3158                 if (unlikely(err == EEXIST))
3159                         err = 0;
3160                 else if (err != 0)
3161                         goto out;
3162                 else
3163                         projid = ZFS_INVALID_PROJID;
3164         }
3165
3166         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
3167                 mutex_enter(&zp->z_acl_lock);
3168         mutex_enter(&zp->z_lock);
3169
3170         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3171             &zp->z_pflags, sizeof (zp->z_pflags));
3172
3173         if (attrzp) {
3174                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
3175                         mutex_enter(&attrzp->z_acl_lock);
3176                 mutex_enter(&attrzp->z_lock);
3177                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3178                     SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3179                     sizeof (attrzp->z_pflags));
3180                 if (projid != ZFS_INVALID_PROJID) {
3181                         attrzp->z_projid = projid;
3182                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3183                             SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
3184                             sizeof (attrzp->z_projid));
3185                 }
3186         }
3187
3188         if (mask & (ATTR_UID|ATTR_GID)) {
3189
3190                 if (mask & ATTR_UID) {
3191                         ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
3192                         new_uid = zfs_uid_read(ZTOI(zp));
3193                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3194                             &new_uid, sizeof (new_uid));
3195                         if (attrzp) {
3196                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3197                                     SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3198                                     sizeof (new_uid));
3199                                 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
3200                         }
3201                 }
3202
3203                 if (mask & ATTR_GID) {
3204                         ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
3205                         new_gid = zfs_gid_read(ZTOI(zp));
3206                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3207                             NULL, &new_gid, sizeof (new_gid));
3208                         if (attrzp) {
3209                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3210                                     SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3211                                     sizeof (new_gid));
3212                                 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
3213                         }
3214                 }
3215                 if (!(mask & ATTR_MODE)) {
3216                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3217                             NULL, &new_mode, sizeof (new_mode));
3218                         new_mode = zp->z_mode;
3219                 }
3220                 err = zfs_acl_chown_setattr(zp);
3221                 ASSERT(err == 0);
3222                 if (attrzp) {
3223                         err = zfs_acl_chown_setattr(attrzp);
3224                         ASSERT(err == 0);
3225                 }
3226         }
3227
3228         if (mask & ATTR_MODE) {
3229                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3230                     &new_mode, sizeof (new_mode));
3231                 zp->z_mode = ZTOI(zp)->i_mode = new_mode;
3232                 ASSERT3P(aclp, !=, NULL);
3233                 err = zfs_aclset_common(zp, aclp, cr, tx);
3234                 ASSERT0(err);
3235                 if (zp->z_acl_cached)
3236                         zfs_acl_free(zp->z_acl_cached);
3237                 zp->z_acl_cached = aclp;
3238                 aclp = NULL;
3239         }
3240
3241         if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
3242                 zp->z_atime_dirty = B_FALSE;
3243                 ZFS_TIME_ENCODE(&ip->i_atime, atime);
3244                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3245                     &atime, sizeof (atime));
3246         }
3247
3248         if (mask & (ATTR_MTIME | ATTR_SIZE)) {
3249                 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3250                 ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate(
3251                     vap->va_mtime, ZTOI(zp));
3252
3253                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3254                     mtime, sizeof (mtime));
3255         }
3256
3257         if (mask & (ATTR_CTIME | ATTR_SIZE)) {
3258                 ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
3259                 ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime,
3260                     ZTOI(zp));
3261                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3262                     ctime, sizeof (ctime));
3263         }
3264
3265         if (projid != ZFS_INVALID_PROJID) {
3266                 zp->z_projid = projid;
3267                 SA_ADD_BULK_ATTR(bulk, count,
3268                     SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
3269                     sizeof (zp->z_projid));
3270         }
3271
3272         if (attrzp && mask) {
3273                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3274                     SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
3275                     sizeof (ctime));
3276         }
3277
3278         /*
3279          * Do this after setting timestamps to prevent timestamp
3280          * update from toggling bit
3281          */
3282
3283         if (xoap && (mask & ATTR_XVATTR)) {
3284
3285                 /*
3286                  * restore trimmed off masks
3287                  * so that return masks can be set for caller.
3288                  */
3289
3290                 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
3291                         XVA_SET_REQ(xvap, XAT_APPENDONLY);
3292                 }
3293                 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
3294                         XVA_SET_REQ(xvap, XAT_NOUNLINK);
3295                 }
3296                 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
3297                         XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3298                 }
3299                 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
3300                         XVA_SET_REQ(xvap, XAT_NODUMP);
3301                 }
3302                 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
3303                         XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3304                 }
3305                 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
3306                         XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3307                 }
3308                 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
3309                         XVA_SET_REQ(xvap, XAT_PROJINHERIT);
3310                 }
3311
3312                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3313                         ASSERT(S_ISREG(ip->i_mode));
3314
3315                 zfs_xvattr_set(zp, xvap, tx);
3316         }
3317
3318         if (fuid_dirtied)
3319                 zfs_fuid_sync(zfsvfs, tx);
3320
3321         if (mask != 0)
3322                 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3323
3324         mutex_exit(&zp->z_lock);
3325         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
3326                 mutex_exit(&zp->z_acl_lock);
3327
3328         if (attrzp) {
3329                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
3330                         mutex_exit(&attrzp->z_acl_lock);
3331                 mutex_exit(&attrzp->z_lock);
3332         }
3333 out:
3334         if (err == 0 && xattr_count > 0) {
3335                 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3336                     xattr_count, tx);
3337                 ASSERT(err2 == 0);
3338         }
3339
3340         if (aclp)
3341                 zfs_acl_free(aclp);
3342
3343         if (fuidp) {
3344                 zfs_fuid_info_free(fuidp);
3345                 fuidp = NULL;
3346         }
3347
3348         if (err) {
3349                 dmu_tx_abort(tx);
3350                 if (attrzp)
3351                         zrele(attrzp);
3352                 if (err == ERESTART)
3353                         goto top;
3354         } else {
3355                 if (count > 0)
3356                         err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3357                 dmu_tx_commit(tx);
3358                 if (attrzp) {
3359                         if (err2 == 0 && handle_eadir)
3360                                 err2 = zfs_setattr_dir(attrzp);
3361                         zrele(attrzp);
3362                 }
3363                 zfs_inode_update(zp);
3364         }
3365
3366 out2:
3367         if (os->os_sync == ZFS_SYNC_ALWAYS)
3368                 zil_commit(zilog, 0);
3369
3370 out3:
3371         kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
3372         kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
3373         kmem_free(tmpxvattr, sizeof (xvattr_t));
3374         ZFS_EXIT(zfsvfs);
3375         return (err);
3376 }
3377
3378 typedef struct zfs_zlock {
3379         krwlock_t       *zl_rwlock;     /* lock we acquired */
3380         znode_t         *zl_znode;      /* znode we held */
3381         struct zfs_zlock *zl_next;      /* next in list */
3382 } zfs_zlock_t;
3383
3384 /*
3385  * Drop locks and release vnodes that were held by zfs_rename_lock().
3386  */
3387 static void
3388 zfs_rename_unlock(zfs_zlock_t **zlpp)
3389 {
3390         zfs_zlock_t *zl;
3391
3392         while ((zl = *zlpp) != NULL) {
3393                 if (zl->zl_znode != NULL)
3394                         zfs_zrele_async(zl->zl_znode);
3395                 rw_exit(zl->zl_rwlock);
3396                 *zlpp = zl->zl_next;
3397                 kmem_free(zl, sizeof (*zl));
3398         }
3399 }
3400
3401 /*
3402  * Search back through the directory tree, using the ".." entries.
3403  * Lock each directory in the chain to prevent concurrent renames.
3404  * Fail any attempt to move a directory into one of its own descendants.
3405  * XXX - z_parent_lock can overlap with map or grow locks
3406  */
3407 static int
3408 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3409 {
3410         zfs_zlock_t     *zl;
3411         znode_t         *zp = tdzp;
3412         uint64_t        rootid = ZTOZSB(zp)->z_root;
3413         uint64_t        oidp = zp->z_id;
3414         krwlock_t       *rwlp = &szp->z_parent_lock;
3415         krw_t           rw = RW_WRITER;
3416
3417         /*
3418          * First pass write-locks szp and compares to zp->z_id.
3419          * Later passes read-lock zp and compare to zp->z_parent.
3420          */
3421         do {
3422                 if (!rw_tryenter(rwlp, rw)) {
3423                         /*
3424                          * Another thread is renaming in this path.
3425                          * Note that if we are a WRITER, we don't have any
3426                          * parent_locks held yet.
3427                          */
3428                         if (rw == RW_READER && zp->z_id > szp->z_id) {
3429                                 /*
3430                                  * Drop our locks and restart
3431                                  */
3432                                 zfs_rename_unlock(&zl);
3433                                 *zlpp = NULL;
3434                                 zp = tdzp;
3435                                 oidp = zp->z_id;
3436                                 rwlp = &szp->z_parent_lock;
3437                                 rw = RW_WRITER;
3438                                 continue;
3439                         } else {
3440                                 /*
3441                                  * Wait for other thread to drop its locks
3442                                  */
3443                                 rw_enter(rwlp, rw);
3444                         }
3445                 }
3446
3447                 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3448                 zl->zl_rwlock = rwlp;
3449                 zl->zl_znode = NULL;
3450                 zl->zl_next = *zlpp;
3451                 *zlpp = zl;
3452
3453                 if (oidp == szp->z_id)          /* We're a descendant of szp */
3454                         return (SET_ERROR(EINVAL));
3455
3456                 if (oidp == rootid)             /* We've hit the top */
3457                         return (0);
3458
3459                 if (rw == RW_READER) {          /* i.e. not the first pass */
3460                         int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
3461                         if (error)
3462                                 return (error);
3463                         zl->zl_znode = zp;
3464                 }
3465                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
3466                     &oidp, sizeof (oidp));
3467                 rwlp = &zp->z_parent_lock;
3468                 rw = RW_READER;
3469
3470         } while (zp->z_id != sdzp->z_id);
3471
3472         return (0);
3473 }
3474
3475 /*
3476  * Move an entry from the provided source directory to the target
3477  * directory.  Change the entry name as indicated.
3478  *
3479  *      IN:     sdzp    - Source directory containing the "old entry".
3480  *              snm     - Old entry name.
3481  *              tdzp    - Target directory to contain the "new entry".
3482  *              tnm     - New entry name.
3483  *              cr      - credentials of caller.
3484  *              flags   - case flags
3485  *
3486  *      RETURN: 0 on success, error code on failure.
3487  *
3488  * Timestamps:
3489  *      sdzp,tdzp - ctime|mtime updated
3490  */
3491 /*ARGSUSED*/
3492 int
3493 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
3494     cred_t *cr, int flags)
3495 {
3496         znode_t         *szp, *tzp;
3497         zfsvfs_t        *zfsvfs = ZTOZSB(sdzp);
3498         zilog_t         *zilog;
3499         zfs_dirlock_t   *sdl, *tdl;
3500         dmu_tx_t        *tx;
3501         zfs_zlock_t     *zl;
3502         int             cmp, serr, terr;
3503         int             error = 0;
3504         int             zflg = 0;
3505         boolean_t       waited = B_FALSE;
3506
3507         if (snm == NULL || tnm == NULL)
3508                 return (SET_ERROR(EINVAL));
3509
3510         ZFS_ENTER(zfsvfs);
3511         ZFS_VERIFY_ZP(sdzp);
3512         zilog = zfsvfs->z_log;
3513
3514         ZFS_VERIFY_ZP(tdzp);
3515
3516         /*
3517          * We check i_sb because snapshots and the ctldir must have different
3518          * super blocks.
3519          */
3520         if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
3521             zfsctl_is_node(ZTOI(tdzp))) {
3522                 ZFS_EXIT(zfsvfs);
3523                 return (SET_ERROR(EXDEV));
3524         }
3525
3526         if (zfsvfs->z_utf8 && u8_validate(tnm,
3527             strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3528                 ZFS_EXIT(zfsvfs);
3529                 return (SET_ERROR(EILSEQ));
3530         }
3531
3532         if (flags & FIGNORECASE)
3533                 zflg |= ZCILOOK;
3534
3535 top:
3536         szp = NULL;
3537         tzp = NULL;
3538         zl = NULL;
3539
3540         /*
3541          * This is to prevent the creation of links into attribute space
3542          * by renaming a linked file into/outof an attribute directory.
3543          * See the comment in zfs_link() for why this is considered bad.
3544          */
3545         if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3546                 ZFS_EXIT(zfsvfs);
3547                 return (SET_ERROR(EINVAL));
3548         }
3549
3550         /*
3551          * Lock source and target directory entries.  To prevent deadlock,
3552          * a lock ordering must be defined.  We lock the directory with
3553          * the smallest object id first, or if it's a tie, the one with
3554          * the lexically first name.
3555          */
3556         if (sdzp->z_id < tdzp->z_id) {
3557                 cmp = -1;
3558         } else if (sdzp->z_id > tdzp->z_id) {
3559                 cmp = 1;
3560         } else {
3561                 /*
3562                  * First compare the two name arguments without
3563                  * considering any case folding.
3564                  */
3565                 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3566
3567                 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3568                 ASSERT(error == 0 || !zfsvfs->z_utf8);
3569                 if (cmp == 0) {
3570                         /*
3571                          * POSIX: "If the old argument and the new argument
3572                          * both refer to links to the same existing file,
3573                          * the rename() function shall return successfully
3574                          * and perform no other action."
3575                          */
3576                         ZFS_EXIT(zfsvfs);
3577                         return (0);
3578                 }
3579                 /*
3580                  * If the file system is case-folding, then we may
3581                  * have some more checking to do.  A case-folding file
3582                  * system is either supporting mixed case sensitivity
3583                  * access or is completely case-insensitive.  Note
3584                  * that the file system is always case preserving.
3585                  *
3586                  * In mixed sensitivity mode case sensitive behavior
3587                  * is the default.  FIGNORECASE must be used to
3588                  * explicitly request case insensitive behavior.
3589                  *
3590                  * If the source and target names provided differ only
3591                  * by case (e.g., a request to rename 'tim' to 'Tim'),
3592                  * we will treat this as a special case in the
3593                  * case-insensitive mode: as long as the source name
3594                  * is an exact match, we will allow this to proceed as
3595                  * a name-change request.
3596                  */
3597                 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3598                     (zfsvfs->z_case == ZFS_CASE_MIXED &&
3599                     flags & FIGNORECASE)) &&
3600                     u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3601                     &error) == 0) {
3602                         /*
3603                          * case preserving rename request, require exact
3604                          * name matches
3605                          */
3606                         zflg |= ZCIEXACT;
3607                         zflg &= ~ZCILOOK;
3608                 }
3609         }
3610
3611         /*
3612          * If the source and destination directories are the same, we should
3613          * grab the z_name_lock of that directory only once.
3614          */
3615         if (sdzp == tdzp) {
3616                 zflg |= ZHAVELOCK;
3617                 rw_enter(&sdzp->z_name_lock, RW_READER);
3618         }
3619
3620         if (cmp < 0) {
3621                 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3622                     ZEXISTS | zflg, NULL, NULL);
3623                 terr = zfs_dirent_lock(&tdl,
3624                     tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3625         } else {
3626                 terr = zfs_dirent_lock(&tdl,
3627                     tdzp, tnm, &tzp, zflg, NULL, NULL);
3628                 serr = zfs_dirent_lock(&sdl,
3629                     sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3630                     NULL, NULL);
3631         }
3632
3633         if (serr) {
3634                 /*
3635                  * Source entry invalid or not there.
3636                  */
3637                 if (!terr) {
3638                         zfs_dirent_unlock(tdl);
3639                         if (tzp)
3640                                 zrele(tzp);
3641                 }
3642
3643                 if (sdzp == tdzp)
3644                         rw_exit(&sdzp->z_name_lock);
3645
3646                 if (strcmp(snm, "..") == 0)
3647                         serr = EINVAL;
3648                 ZFS_EXIT(zfsvfs);
3649                 return (serr);
3650         }
3651         if (terr) {
3652                 zfs_dirent_unlock(sdl);
3653                 zrele(szp);
3654
3655                 if (sdzp == tdzp)
3656                         rw_exit(&sdzp->z_name_lock);
3657
3658                 if (strcmp(tnm, "..") == 0)
3659                         terr = EINVAL;
3660                 ZFS_EXIT(zfsvfs);
3661                 return (terr);
3662         }
3663
3664         /*
3665          * If we are using project inheritance, means if the directory has
3666          * ZFS_PROJINHERIT set, then its descendant directories will inherit
3667          * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
3668          * such case, we only allow renames into our tree when the project
3669          * IDs are the same.
3670          */
3671         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3672             tdzp->z_projid != szp->z_projid) {
3673                 error = SET_ERROR(EXDEV);
3674                 goto out;
3675         }
3676
3677         /*
3678          * Must have write access at the source to remove the old entry
3679          * and write access at the target to create the new entry.
3680          * Note that if target and source are the same, this can be
3681          * done in a single check.
3682          */
3683
3684         if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
3685                 goto out;
3686
3687         if (S_ISDIR(ZTOI(szp)->i_mode)) {
3688                 /*
3689                  * Check to make sure rename is valid.
3690                  * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3691                  */
3692                 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
3693                         goto out;
3694         }
3695
3696         /*
3697          * Does target exist?
3698          */
3699         if (tzp) {
3700                 /*
3701                  * Source and target must be the same type.
3702                  */
3703                 if (S_ISDIR(ZTOI(szp)->i_mode)) {
3704                         if (!S_ISDIR(ZTOI(tzp)->i_mode)) {
3705                                 error = SET_ERROR(ENOTDIR);
3706                                 goto out;
3707                         }
3708                 } else {
3709                         if (S_ISDIR(ZTOI(tzp)->i_mode)) {
3710                                 error = SET_ERROR(EISDIR);
3711                                 goto out;
3712                         }
3713                 }
3714                 /*
3715                  * POSIX dictates that when the source and target
3716                  * entries refer to the same file object, rename
3717                  * must do nothing and exit without error.
3718                  */
3719                 if (szp->z_id == tzp->z_id) {
3720                         error = 0;
3721                         goto out;
3722                 }
3723         }
3724
3725         tx = dmu_tx_create(zfsvfs->z_os);
3726         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3727         dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3728         dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3729         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3730         if (sdzp != tdzp) {
3731                 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3732                 zfs_sa_upgrade_txholds(tx, tdzp);
3733         }
3734         if (tzp) {
3735                 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3736                 zfs_sa_upgrade_txholds(tx, tzp);
3737         }
3738
3739         zfs_sa_upgrade_txholds(tx, szp);
3740         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3741         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3742         if (error) {
3743                 if (zl != NULL)
3744                         zfs_rename_unlock(&zl);
3745                 zfs_dirent_unlock(sdl);
3746                 zfs_dirent_unlock(tdl);
3747
3748                 if (sdzp == tdzp)
3749                         rw_exit(&sdzp->z_name_lock);
3750
3751                 if (error == ERESTART) {
3752                         waited = B_TRUE;
3753                         dmu_tx_wait(tx);
3754                         dmu_tx_abort(tx);
3755                         zrele(szp);
3756                         if (tzp)
3757                                 zrele(tzp);
3758                         goto top;
3759                 }
3760                 dmu_tx_abort(tx);
3761                 zrele(szp);
3762                 if (tzp)
3763                         zrele(tzp);
3764                 ZFS_EXIT(zfsvfs);
3765                 return (error);
3766         }
3767
3768         if (tzp)        /* Attempt to remove the existing target */
3769                 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3770
3771         if (error == 0) {
3772                 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3773                 if (error == 0) {
3774                         szp->z_pflags |= ZFS_AV_MODIFIED;
3775                         if (tdzp->z_pflags & ZFS_PROJINHERIT)
3776                                 szp->z_pflags |= ZFS_PROJINHERIT;
3777
3778                         error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3779                             (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3780                         ASSERT0(error);
3781
3782                         error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3783                         if (error == 0) {
3784                                 zfs_log_rename(zilog, tx, TX_RENAME |
3785                                     (flags & FIGNORECASE ? TX_CI : 0), sdzp,
3786                                     sdl->dl_name, tdzp, tdl->dl_name, szp);
3787                         } else {
3788                                 /*
3789                                  * At this point, we have successfully created
3790                                  * the target name, but have failed to remove
3791                                  * the source name.  Since the create was done
3792                                  * with the ZRENAMING flag, there are
3793                                  * complications; for one, the link count is
3794                                  * wrong.  The easiest way to deal with this
3795                                  * is to remove the newly created target, and
3796                                  * return the original error.  This must
3797                                  * succeed; fortunately, it is very unlikely to
3798                                  * fail, since we just created it.
3799                                  */
3800                                 VERIFY3U(zfs_link_destroy(tdl, szp, tx,
3801                                     ZRENAMING, NULL), ==, 0);
3802                         }
3803                 } else {
3804                         /*
3805                          * If we had removed the existing target, subsequent
3806                          * call to zfs_link_create() to add back the same entry
3807                          * but, the new dnode (szp) should not fail.
3808                          */
3809                         ASSERT(tzp == NULL);
3810                 }
3811         }
3812
3813         dmu_tx_commit(tx);
3814 out:
3815         if (zl != NULL)
3816                 zfs_rename_unlock(&zl);
3817
3818         zfs_dirent_unlock(sdl);
3819         zfs_dirent_unlock(tdl);
3820
3821         zfs_inode_update(sdzp);
3822         if (sdzp == tdzp)
3823                 rw_exit(&sdzp->z_name_lock);
3824
3825         if (sdzp != tdzp)
3826                 zfs_inode_update(tdzp);
3827
3828         zfs_inode_update(szp);
3829         zrele(szp);
3830         if (tzp) {
3831                 zfs_inode_update(tzp);
3832                 zrele(tzp);
3833         }
3834
3835         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3836                 zil_commit(zilog, 0);
3837
3838         ZFS_EXIT(zfsvfs);
3839         return (error);
3840 }
3841
3842 /*
3843  * Insert the indicated symbolic reference entry into the directory.
3844  *
3845  *      IN:     dzp     - Directory to contain new symbolic link.
3846  *              name    - Name of directory entry in dip.
3847  *              vap     - Attributes of new entry.
3848  *              link    - Name for new symlink entry.
3849  *              cr      - credentials of caller.
3850  *              flags   - case flags
3851  *
3852  *      OUT:    zpp     - Znode for new symbolic link.
3853  *
3854  *      RETURN: 0 on success, error code on failure.
3855  *
3856  * Timestamps:
3857  *      dip - ctime|mtime updated
3858  */
3859 /*ARGSUSED*/
3860 int
3861 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
3862     znode_t **zpp, cred_t *cr, int flags)
3863 {
3864         znode_t         *zp;
3865         zfs_dirlock_t   *dl;
3866         dmu_tx_t        *tx;
3867         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
3868         zilog_t         *zilog;
3869         uint64_t        len = strlen(link);
3870         int             error;
3871         int             zflg = ZNEW;
3872         zfs_acl_ids_t   acl_ids;
3873         boolean_t       fuid_dirtied;
3874         uint64_t        txtype = TX_SYMLINK;
3875         boolean_t       waited = B_FALSE;
3876
3877         ASSERT(S_ISLNK(vap->va_mode));
3878
3879         if (name == NULL)
3880                 return (SET_ERROR(EINVAL));
3881
3882         ZFS_ENTER(zfsvfs);
3883         ZFS_VERIFY_ZP(dzp);
3884         zilog = zfsvfs->z_log;
3885
3886         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3887             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3888                 ZFS_EXIT(zfsvfs);
3889                 return (SET_ERROR(EILSEQ));
3890         }
3891         if (flags & FIGNORECASE)
3892                 zflg |= ZCILOOK;
3893
3894         if (len > MAXPATHLEN) {
3895                 ZFS_EXIT(zfsvfs);
3896                 return (SET_ERROR(ENAMETOOLONG));
3897         }
3898
3899         if ((error = zfs_acl_ids_create(dzp, 0,
3900             vap, cr, NULL, &acl_ids)) != 0) {
3901                 ZFS_EXIT(zfsvfs);
3902                 return (error);
3903         }
3904 top:
3905         *zpp = NULL;
3906
3907         /*
3908          * Attempt to lock directory; fail if entry already exists.
3909          */
3910         error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3911         if (error) {
3912                 zfs_acl_ids_free(&acl_ids);
3913                 ZFS_EXIT(zfsvfs);
3914                 return (error);
3915         }
3916
3917         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
3918                 zfs_acl_ids_free(&acl_ids);
3919                 zfs_dirent_unlock(dl);
3920                 ZFS_EXIT(zfsvfs);
3921                 return (error);
3922         }
3923
3924         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
3925                 zfs_acl_ids_free(&acl_ids);
3926                 zfs_dirent_unlock(dl);
3927                 ZFS_EXIT(zfsvfs);
3928                 return (SET_ERROR(EDQUOT));
3929         }
3930         tx = dmu_tx_create(zfsvfs->z_os);
3931         fuid_dirtied = zfsvfs->z_fuid_dirty;
3932         dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3933         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3934         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3935             ZFS_SA_BASE_ATTR_SIZE + len);
3936         dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3937         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3938                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3939                     acl_ids.z_aclp->z_acl_bytes);
3940         }
3941         if (fuid_dirtied)
3942                 zfs_fuid_txhold(zfsvfs, tx);
3943         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3944         if (error) {
3945                 zfs_dirent_unlock(dl);
3946                 if (error == ERESTART) {
3947                         waited = B_TRUE;
3948                         dmu_tx_wait(tx);
3949                         dmu_tx_abort(tx);
3950                         goto top;
3951                 }
3952                 zfs_acl_ids_free(&acl_ids);
3953                 dmu_tx_abort(tx);
3954                 ZFS_EXIT(zfsvfs);
3955                 return (error);
3956         }
3957
3958         /*
3959          * Create a new object for the symlink.
3960          * for version 4 ZPL datsets the symlink will be an SA attribute
3961          */
3962         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3963
3964         if (fuid_dirtied)
3965                 zfs_fuid_sync(zfsvfs, tx);
3966
3967         mutex_enter(&zp->z_lock);
3968         if (zp->z_is_sa)
3969                 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3970                     link, len, tx);
3971         else
3972                 zfs_sa_symlink(zp, link, len, tx);
3973         mutex_exit(&zp->z_lock);
3974
3975         zp->z_size = len;
3976         (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3977             &zp->z_size, sizeof (zp->z_size), tx);
3978         /*
3979          * Insert the new object into the directory.
3980          */
3981         error = zfs_link_create(dl, zp, tx, ZNEW);
3982         if (error != 0) {
3983                 zfs_znode_delete(zp, tx);
3984                 remove_inode_hash(ZTOI(zp));
3985         } else {
3986                 if (flags & FIGNORECASE)
3987                         txtype |= TX_CI;
3988                 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3989
3990                 zfs_inode_update(dzp);
3991                 zfs_inode_update(zp);
3992         }
3993
3994         zfs_acl_ids_free(&acl_ids);
3995
3996         dmu_tx_commit(tx);
3997
3998         zfs_dirent_unlock(dl);
3999
4000         if (error == 0) {
4001                 *zpp = zp;
4002
4003                 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4004                         zil_commit(zilog, 0);
4005         } else {
4006                 zrele(zp);
4007         }
4008
4009         ZFS_EXIT(zfsvfs);
4010         return (error);
4011 }
4012
4013 /*
4014  * Return, in the buffer contained in the provided uio structure,
4015  * the symbolic path referred to by ip.
4016  *
4017  *      IN:     ip      - inode of symbolic link
4018  *              uio     - structure to contain the link path.
4019  *              cr      - credentials of caller.
4020  *
4021  *      RETURN: 0 if success
4022  *              error code if failure
4023  *
4024  * Timestamps:
4025  *      ip - atime updated
4026  */
4027 /* ARGSUSED */
4028 int
4029 zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr)
4030 {
4031         znode_t         *zp = ITOZ(ip);
4032         zfsvfs_t        *zfsvfs = ITOZSB(ip);
4033         int             error;
4034
4035         ZFS_ENTER(zfsvfs);
4036         ZFS_VERIFY_ZP(zp);
4037
4038         mutex_enter(&zp->z_lock);
4039         if (zp->z_is_sa)
4040                 error = sa_lookup_uio(zp->z_sa_hdl,
4041                     SA_ZPL_SYMLINK(zfsvfs), uio);
4042         else
4043                 error = zfs_sa_readlink(zp, uio);
4044         mutex_exit(&zp->z_lock);
4045
4046         ZFS_EXIT(zfsvfs);
4047         return (error);
4048 }
4049
4050 /*
4051  * Insert a new entry into directory tdzp referencing szp.
4052  *
4053  *      IN:     tdzp    - Directory to contain new entry.
4054  *              szp     - znode of new entry.
4055  *              name    - name of new entry.
4056  *              cr      - credentials of caller.
4057  *              flags   - case flags.
4058  *
4059  *      RETURN: 0 if success
4060  *              error code if failure
4061  *
4062  * Timestamps:
4063  *      tdzp - ctime|mtime updated
4064  *       szp - ctime updated
4065  */
4066 /* ARGSUSED */
4067 int
4068 zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
4069     int flags)
4070 {
4071         struct inode *sip = ZTOI(szp);
4072         znode_t         *tzp;
4073         zfsvfs_t        *zfsvfs = ZTOZSB(tdzp);
4074         zilog_t         *zilog;
4075         zfs_dirlock_t   *dl;
4076         dmu_tx_t        *tx;
4077         int             error;
4078         int             zf = ZNEW;
4079         uint64_t        parent;
4080         uid_t           owner;
4081         boolean_t       waited = B_FALSE;
4082         boolean_t       is_tmpfile = 0;
4083         uint64_t        txg;
4084 #ifdef HAVE_TMPFILE
4085         is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
4086 #endif
4087         ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
4088
4089         if (name == NULL)
4090                 return (SET_ERROR(EINVAL));
4091
4092         ZFS_ENTER(zfsvfs);
4093         ZFS_VERIFY_ZP(tdzp);
4094         zilog = zfsvfs->z_log;
4095
4096         /*
4097          * POSIX dictates that we return EPERM here.
4098          * Better choices include ENOTSUP or EISDIR.
4099          */
4100         if (S_ISDIR(sip->i_mode)) {
4101                 ZFS_EXIT(zfsvfs);
4102                 return (SET_ERROR(EPERM));
4103         }
4104
4105         ZFS_VERIFY_ZP(szp);
4106
4107         /*
4108          * If we are using project inheritance, means if the directory has
4109          * ZFS_PROJINHERIT set, then its descendant directories will inherit
4110          * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
4111          * such case, we only allow hard link creation in our tree when the
4112          * project IDs are the same.
4113          */
4114         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
4115             tdzp->z_projid != szp->z_projid) {
4116                 ZFS_EXIT(zfsvfs);
4117                 return (SET_ERROR(EXDEV));
4118         }
4119
4120         /*
4121          * We check i_sb because snapshots and the ctldir must have different
4122          * super blocks.
4123          */
4124         if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
4125                 ZFS_EXIT(zfsvfs);
4126                 return (SET_ERROR(EXDEV));
4127         }
4128
4129         /* Prevent links to .zfs/shares files */
4130
4131         if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4132             &parent, sizeof (uint64_t))) != 0) {
4133                 ZFS_EXIT(zfsvfs);
4134                 return (error);
4135         }
4136         if (parent == zfsvfs->z_shares_dir) {
4137                 ZFS_EXIT(zfsvfs);
4138                 return (SET_ERROR(EPERM));
4139         }
4140
4141         if (zfsvfs->z_utf8 && u8_validate(name,
4142             strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4143                 ZFS_EXIT(zfsvfs);
4144                 return (SET_ERROR(EILSEQ));
4145         }
4146         if (flags & FIGNORECASE)
4147                 zf |= ZCILOOK;
4148
4149         /*
4150          * We do not support links between attributes and non-attributes
4151          * because of the potential security risk of creating links
4152          * into "normal" file space in order to circumvent restrictions
4153          * imposed in attribute space.
4154          */
4155         if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
4156                 ZFS_EXIT(zfsvfs);
4157                 return (SET_ERROR(EINVAL));
4158         }
4159
4160         owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
4161             cr, ZFS_OWNER);
4162         if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
4163                 ZFS_EXIT(zfsvfs);
4164                 return (SET_ERROR(EPERM));
4165         }
4166
4167         if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
4168                 ZFS_EXIT(zfsvfs);
4169                 return (error);
4170         }
4171
4172 top:
4173         /*
4174          * Attempt to lock directory; fail if entry already exists.
4175          */
4176         error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
4177         if (error) {
4178                 ZFS_EXIT(zfsvfs);
4179                 return (error);
4180         }
4181
4182         tx = dmu_tx_create(zfsvfs->z_os);
4183         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4184         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
4185         if (is_tmpfile)
4186                 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
4187
4188         zfs_sa_upgrade_txholds(tx, szp);
4189         zfs_sa_upgrade_txholds(tx, tdzp);
4190         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
4191         if (error) {
4192                 zfs_dirent_unlock(dl);
4193                 if (error == ERESTART) {
4194                         waited = B_TRUE;
4195                         dmu_tx_wait(tx);
4196                         dmu_tx_abort(tx);
4197                         goto top;
4198                 }
4199                 dmu_tx_abort(tx);
4200                 ZFS_EXIT(zfsvfs);
4201                 return (error);
4202         }
4203         /* unmark z_unlinked so zfs_link_create will not reject */
4204         if (is_tmpfile)
4205                 szp->z_unlinked = B_FALSE;
4206         error = zfs_link_create(dl, szp, tx, 0);
4207
4208         if (error == 0) {
4209                 uint64_t txtype = TX_LINK;
4210                 /*
4211                  * tmpfile is created to be in z_unlinkedobj, so remove it.
4212                  * Also, we don't log in ZIL, because all previous file
4213                  * operation on the tmpfile are ignored by ZIL. Instead we
4214                  * always wait for txg to sync to make sure all previous
4215                  * operation are sync safe.
4216                  */
4217                 if (is_tmpfile) {
4218                         VERIFY(zap_remove_int(zfsvfs->z_os,
4219                             zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
4220                 } else {
4221                         if (flags & FIGNORECASE)
4222                                 txtype |= TX_CI;
4223                         zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
4224                 }
4225         } else if (is_tmpfile) {
4226                 /* restore z_unlinked since when linking failed */
4227                 szp->z_unlinked = B_TRUE;
4228         }
4229         txg = dmu_tx_get_txg(tx);
4230         dmu_tx_commit(tx);
4231
4232         zfs_dirent_unlock(dl);
4233
4234         if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4235                 zil_commit(zilog, 0);
4236
4237         if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
4238                 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
4239
4240         zfs_inode_update(tdzp);
4241         zfs_inode_update(szp);
4242         ZFS_EXIT(zfsvfs);
4243         return (error);
4244 }
4245
/*
 * Callback handed to zfs_log_write() by zfs_putpage(): clears the error
 * flag on the page and ends writeback on it.
 *
 *	IN:	arg	- the struct page that was given to zfs_log_write().
 */
static void
zfs_putpage_commit_cb(void *arg)
{
	struct page *page = (struct page *)arg;

	ClearPageError(page);
	end_page_writeback(page);
}
4254
4255 /*
4256  * Push a page out to disk, once the page is on stable storage the
4257  * registered commit callback will be run as notification of completion.
4258  *
4259  *      IN:     ip      - page mapped for inode.
4260  *              pp      - page to push (page is locked)
4261  *              wbc     - writeback control data
4262  *
4263  *      RETURN: 0 if success
4264  *              error code if failure
4265  *
4266  * Timestamps:
4267  *      ip - ctime|mtime updated
4268  */
4269 /* ARGSUSED */
4270 int
4271 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
4272 {
4273         znode_t         *zp = ITOZ(ip);
4274         zfsvfs_t        *zfsvfs = ITOZSB(ip);
4275         loff_t          offset;
4276         loff_t          pgoff;
4277         unsigned int    pglen;
4278         dmu_tx_t        *tx;
4279         caddr_t         va;
4280         int             err = 0;
4281         uint64_t        mtime[2], ctime[2];
4282         sa_bulk_attr_t  bulk[3];
4283         int             cnt = 0;
4284         struct address_space *mapping;
4285
4286         ZFS_ENTER(zfsvfs);
4287         ZFS_VERIFY_ZP(zp);
4288
4289         ASSERT(PageLocked(pp));
4290
4291         pgoff = page_offset(pp);        /* Page byte-offset in file */
4292         offset = i_size_read(ip);       /* File length in bytes */
4293         pglen = MIN(PAGE_SIZE,          /* Page length in bytes */
4294             P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
4295
4296         /* Page is beyond end of file */
4297         if (pgoff >= offset) {
4298                 unlock_page(pp);
4299                 ZFS_EXIT(zfsvfs);
4300                 return (0);
4301         }
4302
4303         /* Truncate page length to end of file */
4304         if (pgoff + pglen > offset)
4305                 pglen = offset - pgoff;
4306
4307 #if 0
4308         /*
4309          * FIXME: Allow mmap writes past its quota.  The correct fix
4310          * is to register a page_mkwrite() handler to count the page
4311          * against its quota when it is about to be dirtied.
4312          */
4313         if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
4314             KUID_TO_SUID(ip->i_uid)) ||
4315             zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
4316             KGID_TO_SGID(ip->i_gid)) ||
4317             (zp->z_projid != ZFS_DEFAULT_PROJID &&
4318             zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
4319             zp->z_projid))) {
4320                 err = EDQUOT;
4321         }
4322 #endif
4323
4324         /*
4325          * The ordering here is critical and must adhere to the following
4326          * rules in order to avoid deadlocking in either zfs_read() or
4327          * zfs_free_range() due to a lock inversion.
4328          *
4329          * 1) The page must be unlocked prior to acquiring the range lock.
4330          *    This is critical because zfs_read() calls find_lock_page()
4331          *    which may block on the page lock while holding the range lock.
4332          *
4333          * 2) Before setting or clearing write back on a page the range lock
4334          *    must be held in order to prevent a lock inversion with the
4335          *    zfs_free_range() function.
4336          *
4337          * This presents a problem because upon entering this function the
4338          * page lock is already held.  To safely acquire the range lock the
4339          * page lock must be dropped.  This creates a window where another
4340          * process could truncate, invalidate, dirty, or write out the page.
4341          *
4342          * Therefore, after successfully reacquiring the range and page locks
4343          * the current page state is checked.  In the common case everything
4344          * will be as is expected and it can be written out.  However, if
4345          * the page state has changed it must be handled accordingly.
4346          */
4347         mapping = pp->mapping;
4348         redirty_page_for_writepage(wbc, pp);
4349         unlock_page(pp);
4350
4351         zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
4352             pgoff, pglen, RL_WRITER);
4353         lock_page(pp);
4354
4355         /* Page mapping changed or it was no longer dirty, we're done */
4356         if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
4357                 unlock_page(pp);
4358                 zfs_rangelock_exit(lr);
4359                 ZFS_EXIT(zfsvfs);
4360                 return (0);
4361         }
4362
4363         /* Another process started write block if required */
4364         if (PageWriteback(pp)) {
4365                 unlock_page(pp);
4366                 zfs_rangelock_exit(lr);
4367
4368                 if (wbc->sync_mode != WB_SYNC_NONE) {
4369                         if (PageWriteback(pp))
4370                                 wait_on_page_bit(pp, PG_writeback);
4371                 }
4372
4373                 ZFS_EXIT(zfsvfs);
4374                 return (0);
4375         }
4376
4377         /* Clear the dirty flag the required locks are held */
4378         if (!clear_page_dirty_for_io(pp)) {
4379                 unlock_page(pp);
4380                 zfs_rangelock_exit(lr);
4381                 ZFS_EXIT(zfsvfs);
4382                 return (0);
4383         }
4384
4385         /*
4386          * Counterpart for redirty_page_for_writepage() above.  This page
4387          * was in fact not skipped and should not be counted as if it were.
4388          */
4389         wbc->pages_skipped--;
4390         set_page_writeback(pp);
4391         unlock_page(pp);
4392
4393         tx = dmu_tx_create(zfsvfs->z_os);
4394         dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
4395         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4396         zfs_sa_upgrade_txholds(tx, zp);
4397
4398         err = dmu_tx_assign(tx, TXG_NOWAIT);
4399         if (err != 0) {
4400                 if (err == ERESTART)
4401                         dmu_tx_wait(tx);
4402
4403                 dmu_tx_abort(tx);
4404                 __set_page_dirty_nobuffers(pp);
4405                 ClearPageError(pp);
4406                 end_page_writeback(pp);
4407                 zfs_rangelock_exit(lr);
4408                 ZFS_EXIT(zfsvfs);
4409                 return (err);
4410         }
4411
4412         va = kmap(pp);
4413         ASSERT3U(pglen, <=, PAGE_SIZE);
4414         dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
4415         kunmap(pp);
4416
4417         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
4418         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
4419         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
4420             &zp->z_pflags, 8);
4421
4422         /* Preserve the mtime and ctime provided by the inode */
4423         ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
4424         ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
4425         zp->z_atime_dirty = B_FALSE;
4426         zp->z_seq++;
4427
4428         err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
4429
4430         zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
4431             zfs_putpage_commit_cb, pp);
4432         dmu_tx_commit(tx);
4433
4434         zfs_rangelock_exit(lr);
4435
4436         if (wbc->sync_mode != WB_SYNC_NONE) {
4437                 /*
4438                  * Note that this is rarely called under writepages(), because
4439                  * writepages() normally handles the entire commit for
4440                  * performance reasons.
4441                  */
4442                 zil_commit(zfsvfs->z_log, zp->z_id);
4443         }
4444
4445         ZFS_EXIT(zfsvfs);
4446         return (err);
4447 }
4448
4449 /*
4450  * Update the system attributes when the inode has been dirtied.  For the
4451  * moment we only update the mode, atime, mtime, and ctime.
4452  */
4453 int
4454 zfs_dirty_inode(struct inode *ip, int flags)
4455 {
4456         znode_t         *zp = ITOZ(ip);
4457         zfsvfs_t        *zfsvfs = ITOZSB(ip);
4458         dmu_tx_t        *tx;
4459         uint64_t        mode, atime[2], mtime[2], ctime[2];
4460         sa_bulk_attr_t  bulk[4];
4461         int             error = 0;
4462         int             cnt = 0;
4463
4464         if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
4465                 return (0);
4466
4467         ZFS_ENTER(zfsvfs);
4468         ZFS_VERIFY_ZP(zp);
4469
4470 #ifdef I_DIRTY_TIME
4471         /*
4472          * This is the lazytime semantic introduced in Linux 4.0
4473          * This flag will only be called from update_time when lazytime is set.
4474          * (Note, I_DIRTY_SYNC will also set if not lazytime)
4475          * Fortunately mtime and ctime are managed within ZFS itself, so we
4476          * only need to dirty atime.
4477          */
4478         if (flags == I_DIRTY_TIME) {
4479                 zp->z_atime_dirty = B_TRUE;
4480                 goto out;
4481         }
4482 #endif
4483
4484         tx = dmu_tx_create(zfsvfs->z_os);
4485
4486         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4487         zfs_sa_upgrade_txholds(tx, zp);
4488
4489         error = dmu_tx_assign(tx, TXG_WAIT);
4490         if (error) {
4491                 dmu_tx_abort(tx);
4492                 goto out;
4493         }
4494
4495         mutex_enter(&zp->z_lock);
4496         zp->z_atime_dirty = B_FALSE;
4497
4498         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
4499         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
4500         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
4501         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
4502
4503         /* Preserve the mode, mtime and ctime provided by the inode */
4504         ZFS_TIME_ENCODE(&ip->i_atime, atime);
4505         ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
4506         ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
4507         mode = ip->i_mode;
4508
4509         zp->z_mode = mode;
4510
4511         error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
4512         mutex_exit(&zp->z_lock);
4513
4514         dmu_tx_commit(tx);
4515 out:
4516         ZFS_EXIT(zfsvfs);
4517         return (error);
4518 }
4519
4520 /*ARGSUSED*/
4521 void
4522 zfs_inactive(struct inode *ip)
4523 {
4524         znode_t *zp = ITOZ(ip);
4525         zfsvfs_t *zfsvfs = ITOZSB(ip);
4526         uint64_t atime[2];
4527         int error;
4528         int need_unlock = 0;
4529
4530         /* Only read lock if we haven't already write locked, e.g. rollback */
4531         if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
4532                 need_unlock = 1;
4533                 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4534         }
4535         if (zp->z_sa_hdl == NULL) {
4536                 if (need_unlock)
4537                         rw_exit(&zfsvfs->z_teardown_inactive_lock);
4538                 return;
4539         }
4540
4541         if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
4542                 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4543
4544                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4545                 zfs_sa_upgrade_txholds(tx, zp);
4546                 error = dmu_tx_assign(tx, TXG_WAIT);
4547                 if (error) {
4548                         dmu_tx_abort(tx);
4549                 } else {
4550                         ZFS_TIME_ENCODE(&ip->i_atime, atime);
4551                         mutex_enter(&zp->z_lock);
4552                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4553                             (void *)&atime, sizeof (atime), tx);
4554                         zp->z_atime_dirty = B_FALSE;
4555                         mutex_exit(&zp->z_lock);
4556                         dmu_tx_commit(tx);
4557                 }
4558         }
4559
4560         zfs_zinactive(zp);
4561         if (need_unlock)
4562                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4563 }
4564
4565 /*
4566  * Fill pages with data from the disk.
4567  */
4568 static int
4569 zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
4570 {
4571         znode_t *zp = ITOZ(ip);
4572         zfsvfs_t *zfsvfs = ITOZSB(ip);
4573         objset_t *os;
4574         struct page *cur_pp;
4575         u_offset_t io_off, total;
4576         size_t io_len;
4577         loff_t i_size;
4578         unsigned page_idx;
4579         int err;
4580
4581         os = zfsvfs->z_os;
4582         io_len = nr_pages << PAGE_SHIFT;
4583         i_size = i_size_read(ip);
4584         io_off = page_offset(pl[0]);
4585
4586         if (io_off + io_len > i_size)
4587                 io_len = i_size - io_off;
4588
4589         /*
4590          * Iterate over list of pages and read each page individually.
4591          */
4592         page_idx = 0;
4593         for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4594                 caddr_t va;
4595
4596                 cur_pp = pl[page_idx++];
4597                 va = kmap(cur_pp);
4598                 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4599                     DMU_READ_PREFETCH);
4600                 kunmap(cur_pp);
4601                 if (err) {
4602                         /* convert checksum errors into IO errors */
4603                         if (err == ECKSUM)
4604                                 err = SET_ERROR(EIO);
4605                         return (err);
4606                 }
4607         }
4608
4609         return (0);
4610 }
4611
4612 /*
4613  * Uses zfs_fillpage to read data from the file and fill the pages.
4614  *
4615  *      IN:     ip       - inode of file to get data from.
4616  *              pl       - list of pages to read
4617  *              nr_pages - number of pages to read
4618  *
4619  *      RETURN: 0 on success, error code on failure.
4620  *
4621  * Timestamps:
4622  *      vp - atime updated
4623  */
4624 /* ARGSUSED */
4625 int
4626 zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
4627 {
4628         znode_t  *zp  = ITOZ(ip);
4629         zfsvfs_t *zfsvfs = ITOZSB(ip);
4630         int      err;
4631
4632         if (pl == NULL)
4633                 return (0);
4634
4635         ZFS_ENTER(zfsvfs);
4636         ZFS_VERIFY_ZP(zp);
4637
4638         err = zfs_fillpage(ip, pl, nr_pages);
4639
4640         ZFS_EXIT(zfsvfs);
4641         return (err);
4642 }
4643
4644 /*
4645  * Check ZFS specific permissions to memory map a section of a file.
4646  *
4647  *      IN:     ip      - inode of the file to mmap
4648  *              off     - file offset
4649  *              addrp   - start address in memory region
4650  *              len     - length of memory region
4651  *              vm_flags- address flags
4652  *
4653  *      RETURN: 0 if success
4654  *              error code if failure
4655  */
4656 /*ARGSUSED*/
4657 int
4658 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
4659     unsigned long vm_flags)
4660 {
4661         znode_t  *zp = ITOZ(ip);
4662         zfsvfs_t *zfsvfs = ITOZSB(ip);
4663
4664         ZFS_ENTER(zfsvfs);
4665         ZFS_VERIFY_ZP(zp);
4666
4667         if ((vm_flags & VM_WRITE) && (zp->z_pflags &
4668             (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4669                 ZFS_EXIT(zfsvfs);
4670                 return (SET_ERROR(EPERM));
4671         }
4672
4673         if ((vm_flags & (VM_READ | VM_EXEC)) &&
4674             (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4675                 ZFS_EXIT(zfsvfs);
4676                 return (SET_ERROR(EACCES));
4677         }
4678
4679         if (off < 0 || len > MAXOFFSET_T - off) {
4680                 ZFS_EXIT(zfsvfs);
4681                 return (SET_ERROR(ENXIO));
4682         }
4683
4684         ZFS_EXIT(zfsvfs);
4685         return (0);
4686 }
4687
4688 /*
4689  * Free or allocate space in a file.  Currently, this function only
4690  * supports the `F_FREESP' command.  However, this command is somewhat
4691  * misnamed, as its functionality includes the ability to allocate as
4692  * well as free space.
4693  *
4694  *      IN:     zp      - znode of file to free data in.
4695  *              cmd     - action to take (only F_FREESP supported).
4696  *              bfp     - section of file to free/alloc.
4697  *              flag    - current file open mode flags.
4698  *              offset  - current file offset.
4699  *              cr      - credentials of caller.
4700  *
4701  *      RETURN: 0 on success, error code on failure.
4702  *
4703  * Timestamps:
4704  *      zp - ctime|mtime updated
4705  */
4706 /* ARGSUSED */
4707 int
4708 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
4709     offset_t offset, cred_t *cr)
4710 {
4711         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
4712         uint64_t        off, len;
4713         int             error;
4714
4715         ZFS_ENTER(zfsvfs);
4716         ZFS_VERIFY_ZP(zp);
4717
4718         if (cmd != F_FREESP) {
4719                 ZFS_EXIT(zfsvfs);
4720                 return (SET_ERROR(EINVAL));
4721         }
4722
4723         /*
4724          * Callers might not be able to detect properly that we are read-only,
4725          * so check it explicitly here.
4726          */
4727         if (zfs_is_readonly(zfsvfs)) {
4728                 ZFS_EXIT(zfsvfs);
4729                 return (SET_ERROR(EROFS));
4730         }
4731
4732         if (bfp->l_len < 0) {
4733                 ZFS_EXIT(zfsvfs);
4734                 return (SET_ERROR(EINVAL));
4735         }
4736
4737         /*
4738          * Permissions aren't checked on Solaris because on this OS
4739          * zfs_space() can only be called with an opened file handle.
4740          * On Linux we can get here through truncate_range() which
4741          * operates directly on inodes, so we need to check access rights.
4742          */
4743         if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
4744                 ZFS_EXIT(zfsvfs);
4745                 return (error);
4746         }
4747
4748         off = bfp->l_start;
4749         len = bfp->l_len; /* 0 means from off to end of file */
4750
4751         error = zfs_freesp(zp, off, len, flag, TRUE);
4752
4753         ZFS_EXIT(zfsvfs);
4754         return (error);
4755 }
4756
4757 /*ARGSUSED*/
4758 int
4759 zfs_fid(struct inode *ip, fid_t *fidp)
4760 {
4761         znode_t         *zp = ITOZ(ip);
4762         zfsvfs_t        *zfsvfs = ITOZSB(ip);
4763         uint32_t        gen;
4764         uint64_t        gen64;
4765         uint64_t        object = zp->z_id;
4766         zfid_short_t    *zfid;
4767         int             size, i, error;
4768
4769         ZFS_ENTER(zfsvfs);
4770         ZFS_VERIFY_ZP(zp);
4771
4772         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4773             &gen64, sizeof (uint64_t))) != 0) {
4774                 ZFS_EXIT(zfsvfs);
4775                 return (error);
4776         }
4777
4778         gen = (uint32_t)gen64;
4779
4780         size = SHORT_FID_LEN;
4781
4782         zfid = (zfid_short_t *)fidp;
4783
4784         zfid->zf_len = size;
4785
4786         for (i = 0; i < sizeof (zfid->zf_object); i++)
4787                 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4788
4789         /* Must have a non-zero generation number to distinguish from .zfs */
4790         if (gen == 0)
4791                 gen = 1;
4792         for (i = 0; i < sizeof (zfid->zf_gen); i++)
4793                 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4794
4795         ZFS_EXIT(zfsvfs);
4796         return (0);
4797 }
4798
4799 /*ARGSUSED*/
4800 int
4801 zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
4802 {
4803         znode_t *zp = ITOZ(ip);
4804         zfsvfs_t *zfsvfs = ITOZSB(ip);
4805         int error;
4806         boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4807
4808         ZFS_ENTER(zfsvfs);
4809         ZFS_VERIFY_ZP(zp);
4810         error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4811         ZFS_EXIT(zfsvfs);
4812
4813         return (error);
4814 }
4815
4816 /*ARGSUSED*/
4817 int
4818 zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
4819 {
4820         zfsvfs_t *zfsvfs = ZTOZSB(zp);
4821         int error;
4822         boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4823         zilog_t *zilog = zfsvfs->z_log;
4824
4825         ZFS_ENTER(zfsvfs);
4826         ZFS_VERIFY_ZP(zp);
4827
4828         error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4829
4830         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4831                 zil_commit(zilog, 0);
4832
4833         ZFS_EXIT(zfsvfs);
4834         return (error);
4835 }
4836
4837 #ifdef HAVE_UIO_ZEROCOPY
/*
 * Smallest read, in bytes, for which loaning out an arcbuf may be
 * considered.  Must be a power of 2.
 */
int zcr_blksz_min = 1024;		/* 1K, i.e. 1 << 10 */

/*
 * When this is set below the file block size, an arcbuf may be loaned
 * out for a partial block read.  Must be a power of 2.
 */
int zcr_blksz_max = 128 * 1024;		/* 128K, i.e. 1 << 17 */
4848
4849 /*ARGSUSED*/
4850 static int
4851 zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr)
4852 {
4853         znode_t *zp = ITOZ(ip);
4854         zfsvfs_t *zfsvfs = ITOZSB(ip);
4855         int max_blksz = zfsvfs->z_max_blksz;
4856         uio_t *uio = &xuio->xu_uio;
4857         ssize_t size = uio->uio_resid;
4858         offset_t offset = uio->uio_loffset;
4859         int blksz;
4860         int fullblk, i;
4861         arc_buf_t *abuf;
4862         ssize_t maxsize;
4863         int preamble, postamble;
4864
4865         if (xuio->xu_type != UIOTYPE_ZEROCOPY)
4866                 return (SET_ERROR(EINVAL));
4867
4868         ZFS_ENTER(zfsvfs);
4869         ZFS_VERIFY_ZP(zp);
4870         switch (ioflag) {
4871         case UIO_WRITE:
4872                 /*
4873                  * Loan out an arc_buf for write if write size is bigger than
4874                  * max_blksz, and the file's block size is also max_blksz.
4875                  */
4876                 blksz = max_blksz;
4877                 if (size < blksz || zp->z_blksz != blksz) {
4878                         ZFS_EXIT(zfsvfs);
4879                         return (SET_ERROR(EINVAL));
4880                 }
4881                 /*
4882                  * Caller requests buffers for write before knowing where the
4883                  * write offset might be (e.g. NFS TCP write).
4884                  */
4885                 if (offset == -1) {
4886                         preamble = 0;
4887                 } else {
4888                         preamble = P2PHASE(offset, blksz);
4889                         if (preamble) {
4890                                 preamble = blksz - preamble;
4891                                 size -= preamble;
4892                         }
4893                 }
4894
4895                 postamble = P2PHASE(size, blksz);
4896                 size -= postamble;
4897
4898                 fullblk = size / blksz;
4899                 (void) dmu_xuio_init(xuio,
4900                     (preamble != 0) + fullblk + (postamble != 0));
4901
4902                 /*
4903                  * Have to fix iov base/len for partial buffers.  They
4904                  * currently represent full arc_buf's.
4905                  */
4906                 if (preamble) {
4907                         /* data begins in the middle of the arc_buf */
4908                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
4909                             blksz);
4910                         ASSERT(abuf);
4911                         (void) dmu_xuio_add(xuio, abuf,
4912                             blksz - preamble, preamble);
4913                 }
4914
4915                 for (i = 0; i < fullblk; i++) {
4916                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
4917                             blksz);
4918                         ASSERT(abuf);
4919                         (void) dmu_xuio_add(xuio, abuf, 0, blksz);
4920                 }
4921
4922                 if (postamble) {
4923                         /* data ends in the middle of the arc_buf */
4924                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
4925                             blksz);
4926                         ASSERT(abuf);
4927                         (void) dmu_xuio_add(xuio, abuf, 0, postamble);
4928                 }
4929                 break;
4930         case UIO_READ:
4931                 /*
4932                  * Loan out an arc_buf for read if the read size is larger than
4933                  * the current file block size.  Block alignment is not
4934                  * considered.  Partial arc_buf will be loaned out for read.
4935                  */
4936                 blksz = zp->z_blksz;
4937                 if (blksz < zcr_blksz_min)
4938                         blksz = zcr_blksz_min;
4939                 if (blksz > zcr_blksz_max)
4940                         blksz = zcr_blksz_max;
4941                 /* avoid potential complexity of dealing with it */
4942                 if (blksz > max_blksz) {
4943                         ZFS_EXIT(zfsvfs);
4944                         return (SET_ERROR(EINVAL));
4945                 }
4946
4947                 maxsize = zp->z_size - uio->uio_loffset;
4948                 if (size > maxsize)
4949                         size = maxsize;
4950
4951                 if (size < blksz) {
4952                         ZFS_EXIT(zfsvfs);
4953                         return (SET_ERROR(EINVAL));
4954                 }
4955                 break;
4956         default:
4957                 ZFS_EXIT(zfsvfs);
4958                 return (SET_ERROR(EINVAL));
4959         }
4960
4961         uio->uio_extflg = UIO_XUIO;
4962         XUIO_XUZC_RW(xuio) = ioflag;
4963         ZFS_EXIT(zfsvfs);
4964         return (0);
4965 }
4966
4967 /*ARGSUSED*/
4968 static int
4969 zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr)
4970 {
4971         int i;
4972         arc_buf_t *abuf;
4973         int ioflag = XUIO_XUZC_RW(xuio);
4974
4975         ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
4976
4977         i = dmu_xuio_cnt(xuio);
4978         while (i-- > 0) {
4979                 abuf = dmu_xuio_arcbuf(xuio, i);
4980                 /*
4981                  * if abuf == NULL, it must be a write buffer
4982                  * that has been returned in zfs_write().
4983                  */
4984                 if (abuf)
4985                         dmu_return_arcbuf(abuf);
4986                 ASSERT(abuf || ioflag == UIO_WRITE);
4987         }
4988
4989         dmu_xuio_fini(xuio);
4990         return (0);
4991 }
4992 #endif /* HAVE_UIO_ZEROCOPY */
4993
#if defined(_KERNEL)
/*
 * Kernel builds only: export the entry points below with the Linux
 * EXPORT_SYMBOL() macro so they can be referenced from other modules.
 */
EXPORT_SYMBOL(zfs_open);
EXPORT_SYMBOL(zfs_close);
EXPORT_SYMBOL(zfs_read);
EXPORT_SYMBOL(zfs_write);
EXPORT_SYMBOL(zfs_access);
EXPORT_SYMBOL(zfs_lookup);
EXPORT_SYMBOL(zfs_create);
EXPORT_SYMBOL(zfs_tmpfile);
EXPORT_SYMBOL(zfs_remove);
EXPORT_SYMBOL(zfs_mkdir);
EXPORT_SYMBOL(zfs_rmdir);
EXPORT_SYMBOL(zfs_readdir);
EXPORT_SYMBOL(zfs_fsync);
EXPORT_SYMBOL(zfs_getattr_fast);
EXPORT_SYMBOL(zfs_setattr);
EXPORT_SYMBOL(zfs_rename);
EXPORT_SYMBOL(zfs_symlink);
EXPORT_SYMBOL(zfs_readlink);
EXPORT_SYMBOL(zfs_link);
EXPORT_SYMBOL(zfs_inactive);
EXPORT_SYMBOL(zfs_space);
EXPORT_SYMBOL(zfs_fid);
EXPORT_SYMBOL(zfs_getsecattr);
EXPORT_SYMBOL(zfs_setsecattr);
EXPORT_SYMBOL(zfs_getpage);
EXPORT_SYMBOL(zfs_putpage);
EXPORT_SYMBOL(zfs_dirty_inode);
EXPORT_SYMBOL(zfs_map);

/*
 * Module parameters.  Both are registered as type ulong with permission
 * bits 0644; the description strings are what MODULE_PARM_DESC attaches
 * to each parameter.
 */
/* BEGIN CSTYLED */
module_param(zfs_delete_blocks, ulong, 0644);
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
module_param(zfs_read_chunk_size, ulong, 0644);
MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk");
/* END CSTYLED */

#endif /* defined(_KERNEL) */