sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 #pragma ident   "%Z%%M% %I%     %E% SMI"
  27
  28 #include <sys/types.h>
  29 #include <sys/param.h>
  30 #include <sys/time.h>
  31 #include <sys/systm.h>
  32 #include <sys/sysmacros.h>
  33 #include <sys/resource.h>
  34 #include <sys/vfs.h>
  35 #include <sys/vnode.h>
  36 #include <sys/file.h>
  37 #include <sys/kmem.h>
  38 #include <sys/uio.h>
  39 #include <sys/cmn_err.h>
  40 #include <sys/errno.h>
  41 #include <sys/stat.h>
  42 #include <sys/unistd.h>
  43 #include <sys/random.h>
  44 #include <sys/policy.h>
  45 #include <sys/kcondvar.h>
  46 #include <sys/callb.h>
  47 #include <sys/smp.h>
  48 #include <sys/zfs_dir.h>
  49 #include <sys/zfs_acl.h>
  50 #include <sys/fs/zfs.h>
  51 #include <sys/zap.h>
  52 #include <sys/dmu.h>
  53 #include <sys/atomic.h>
  54 #include <sys/zfs_ctldir.h>
  55 #include <sys/dnlc.h>
  56
  57 /*
  58  * Lock a directory entry.  A dirlock on <dzp, name> protects that name
  59  * in dzp's directory zap object.  As long as you hold a dirlock, you can
  60  * assume two things: (1) dzp cannot be reaped, and (2) no other thread
  61  * can change the zap entry for (i.e. link or unlink) this name.
  62  *
  63  * Input arguments:
  64  *      dzp     - znode for directory
  65  *      name    - name of entry to lock
  66  *      flag    - ZNEW: if the entry already exists, fail with EEXIST.
  67  *                ZEXISTS: if the entry does not exist, fail with ENOENT.
  68  *                ZSHARED: allow concurrent access with other ZSHARED callers.
  69  *                ZXATTR: we want dzp's xattr directory
  70  *
  71  * Output arguments:
  72  *      zpp     - pointer to the znode for the entry (NULL if there isn't one)
  73  *      dlpp    - pointer to the dirlock for this entry (NULL on error)
  74  *
  75  * Return value: 0 on success or errno on failure.
  76  *
  77  * NOTE: Always checks for, and rejects, '.' and '..'.
  78  */
  79 int
  80 zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
  81         int flag)
  82 {
  83         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
  84         zfs_dirlock_t   *dl;
  85         uint64_t        zoid;
  86         int             error;
  87         vnode_t         *vp;
  88
  89         *zpp = NULL;
  90         *dlpp = NULL;
  91
  92         /*
  93          * Verify that we are not trying to lock '.', '..', or '.zfs'
  94          */
  95         if (name[0] == '.' &&
  96             (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) ||
  97             zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)
  98                 return (EEXIST);
  99
 100         /*
 101          * Wait until there are no locks on this name.
 102          */
 103         rw_enter(&dzp->z_name_lock, RW_READER);
 104         mutex_enter(&dzp->z_lock);
 105         for (;;) {
 106                 if (dzp->z_unlinked) {
 107                         mutex_exit(&dzp->z_lock);
 108                         rw_exit(&dzp->z_name_lock);
 109                         return (ENOENT);
 110                 }
 111                 for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next)
 112                         if (strcmp(name, dl->dl_name) == 0)
 113                                 break;
 114                 if (dl == NULL) {
 115                         /*
 116                          * Allocate a new dirlock and add it to the list.
 117                          */
 118                         dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
 119                         cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
 120                         dl->dl_name = name;
 121                         dl->dl_sharecnt = 0;
 122                         dl->dl_namesize = 0;
 123                         dl->dl_dzp = dzp;
 124                         dl->dl_next = dzp->z_dirlocks;
 125                         dzp->z_dirlocks = dl;
 126                         break;
 127                 }
 128                 if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
 129                         break;
 130                 cv_wait(&dl->dl_cv, &dzp->z_lock);
 131         }
 132
 133         if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
 134                 /*
 135                  * We're the second shared reference to dl.  Make a copy of
 136                  * dl_name in case the first thread goes away before we do.
 137                  * Note that we initialize the new name before storing its
 138                  * pointer into dl_name, because the first thread may load
 139                  * dl->dl_name at any time.  He'll either see the old value,
 140                  * which is his, or the new shared copy; either is OK.
 141                  */
 142                 dl->dl_namesize = strlen(dl->dl_name) + 1;
 143                 name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
 144                 bcopy(dl->dl_name, name, dl->dl_namesize);
 145                 dl->dl_name = name;
 146         }
 147
 148         mutex_exit(&dzp->z_lock);
 149
 150         /*
 151          * We have a dirlock on the name.  (Note that it is the dirlock,
 152          * not the dzp's z_lock, that protects the name in the zap object.)
 153          * See if there's an object by this name; if so, put a hold on it.
 154          */
 155         if (flag & ZXATTR) {
 156                 zoid = dzp->z_phys->zp_xattr;
 157                 error = (zoid == 0 ? ENOENT : 0);
 158         } else {
 159                 vp = dnlc_lookup(ZTOV(dzp), name);
 160                 if (vp == DNLC_NO_VNODE) {
 161                         VN_RELE(vp);
 162                         error = ENOENT;
 163                 } else if (vp) {
 164                         if (flag & ZNEW) {
 165                                 zfs_dirent_unlock(dl);
 166                                 VN_RELE(vp);
 167                                 return (EEXIST);
 168                         }
 169                         *dlpp = dl;
 170                         *zpp = VTOZ(vp);
 171                         return (0);
 172                 } else {
 173                         error = zap_lookup(zfsvfs->z_os, dzp->z_id, name,
 174                             8, 1, &zoid);
 175                         zoid = ZFS_DIRENT_OBJ(zoid);
 176                         if (error == ENOENT)
 177                                 dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE);
 178                 }
 179         }
 180         if (error) {
 181                 if (error != ENOENT || (flag & ZEXISTS)) {
 182                         zfs_dirent_unlock(dl);
 183                         return (error);
 184                 }
 185         } else {
 186                 if (flag & ZNEW) {
 187                         zfs_dirent_unlock(dl);
 188                         return (EEXIST);
 189                 }
 190                 error = zfs_zget(zfsvfs, zoid, zpp);
 191                 if (error) {
 192                         zfs_dirent_unlock(dl);
 193                         return (error);
 194                 }
 195                 if (!(flag & ZXATTR))
 196                         dnlc_update(ZTOV(dzp), name, ZTOV(*zpp));
 197         }
 198
 199         *dlpp = dl;
 200
 201         return (0);
 202 }
 203
 204 /*
 205  * Unlock this directory entry and wake anyone who was waiting for it.
 206  */
 207 void
 208 zfs_dirent_unlock(zfs_dirlock_t *dl)
 209 {
 210         znode_t *dzp = dl->dl_dzp;
 211         zfs_dirlock_t **prev_dl, *cur_dl;
 212
 213         mutex_enter(&dzp->z_lock);
 214         rw_exit(&dzp->z_name_lock);
 215         if (dl->dl_sharecnt > 1) {
 216                 dl->dl_sharecnt--;
 217                 mutex_exit(&dzp->z_lock);
 218                 return;
 219         }
 220         prev_dl = &dzp->z_dirlocks;
 221         while ((cur_dl = *prev_dl) != dl)
 222                 prev_dl = &cur_dl->dl_next;
 223         *prev_dl = dl->dl_next;
 224         cv_broadcast(&dl->dl_cv);
 225         mutex_exit(&dzp->z_lock);
 226
 227         if (dl->dl_namesize != 0)
 228                 kmem_free(dl->dl_name, dl->dl_namesize);
 229         cv_destroy(&dl->dl_cv);
 230         kmem_free(dl, sizeof (*dl));
 231 }
 232
 233 /*
 234  * Look up an entry in a directory.
 235  *
 236  * NOTE: '.' and '..' are handled as special cases because
 237  *      no directory entries are actually stored for them.  If this is
 238  *      the root of a filesystem, then '.zfs' is also treated as a
 239  *      special pseudo-directory.
 240  */
 241 int
 242 zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp)
 243 {
 244         zfs_dirlock_t *dl;
 245         znode_t *zp;
 246         int error = 0;
 247
 248         if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
 249                 *vpp = ZTOV(dzp);
 250                 VN_HOLD(*vpp);
 251         } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
 252                 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
 253                 /*
 254                  * If we are a snapshot mounted under .zfs, return
 255                  * the vp for the snapshot directory.
 256                  */
 257                 if (dzp->z_phys->zp_parent == dzp->z_id &&
 258                     zfsvfs->z_parent != zfsvfs) {
 259                         error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
 260                             "snapshot", vpp, NULL, 0, NULL, kcred);
 261                         return (error);
 262                 }
 263                 rw_enter(&dzp->z_parent_lock, RW_READER);
 264                 error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
 265                 if (error == 0)
 266                         *vpp = ZTOV(zp);
 267                 rw_exit(&dzp->z_parent_lock);
 268         } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
 269                 *vpp = zfsctl_root(dzp);
 270         } else {
 271                 error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED);
 272                 if (error == 0) {
 273                         *vpp = ZTOV(zp);
 274                         zfs_dirent_unlock(dl);
 275                         dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
 276                 }
 277         }
 278
 279         return (error);
 280 }
 281
 282 static char *
 283 zfs_unlinked_hexname(char namebuf[17], uint64_t x)
 284 {
 285         char *name = &namebuf[16];
 286         const char digits[16] = "0123456789abcdef";
 287
 288         *name = '\0';
 289         do {
 290                 *--name = digits[x & 0xf];
 291                 x >>= 4;
 292         } while (x != 0);
 293
 294         return (name);
 295 }
 296
 297 /*
 298  * unlinked Set (formerly known as the "delete queue") Error Handling
 299  *
 300  * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
 301  * don't specify the name of the entry that we will be manipulating.  We
 302  * also fib and say that we won't be adding any new entries to the
 303  * unlinked set, even though we might (this is to lower the minimum file
 304  * size that can be deleted in a full filesystem).  So on the small
 305  * chance that the nlink list is using a fat zap (ie. has more than
 306  * 2000 entries), we *may* not pre-read a block that's needed.
 307  * Therefore it is remotely possible for some of the assertions
 308  * regarding the unlinked set below to fail due to i/o error.  On a
 309  * nondebug system, this will result in the space being leaked.
 310  */
 311 void
 312 zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
 313 {
 314         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 315         char obj_name[17];
 316         int error;
 317
 318         ASSERT(zp->z_unlinked);
 319         ASSERT3U(zp->z_phys->zp_links, ==, 0);
 320
 321         error = zap_add(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
 322             zfs_unlinked_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx);
 323         ASSERT3U(error, ==, 0);
 324 }
 325
 326 /*
 327  * Clean up any znodes that had no links when we either crashed or
 328  * (force) umounted the file system.
 329  */
 330 void
 331 zfs_unlinked_drain(zfsvfs_t *zfsvfs)
 332 {
 333         zap_cursor_t    zc;
 334         zap_attribute_t zap;
 335         dmu_object_info_t doi;
 336         znode_t         *zp;
 337         int             error;
 338
 339         /*
 340          * Interate over the contents of the unlinked set.
 341          */
 342         for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
 343             zap_cursor_retrieve(&zc, &zap) == 0;
 344             zap_cursor_advance(&zc)) {
 345
 346                 /*
 347                  * See what kind of object we have in list
 348                  */
 349
 350                 error = dmu_object_info(zfsvfs->z_os,
 351                     zap.za_first_integer, &doi);
 352                 if (error != 0)
 353                         continue;
 354
 355                 ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
 356                     (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
 357                 /*
 358                  * We need to re-mark these list entries for deletion,
 359                  * so we pull them back into core and set zp->z_unlinked.
 360                  */
 361                 error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
 362
 363                 /*
 364                  * We may pick up znodes that are already marked for deletion.
 365                  * This could happen during the purge of an extended attribute
 366                  * directory.  All we need to do is skip over them, since they
 367                  * are already in the system marked z_unlinked.
 368                  */
 369                 if (error != 0)
 370                         continue;
 371
 372                 zp->z_unlinked = B_TRUE;
 373                 VN_RELE(ZTOV(zp));
 374         }
 375         zap_cursor_fini(&zc);
 376 }
 377
 378 /*
 379  * Delete the entire contents of a directory.  Return a count
 380  * of the number of entries that could not be deleted.
 381  *
 382  * NOTE: this function assumes that the directory is inactive,
 383  *      so there is no need to lock its entries before deletion.
 384  *      Also, it assumes the directory contents is *only* regular
 385  *      files.
 386  */
 387 static int
 388 zfs_purgedir(znode_t *dzp)
 389 {
 390         zap_cursor_t    zc;
 391         zap_attribute_t zap;
 392         znode_t         *xzp;
 393         dmu_tx_t        *tx;
 394         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
 395         zfs_dirlock_t   dl;
 396         int skipped = 0;
 397         int error;
 398
 399         for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
 400             (error = zap_cursor_retrieve(&zc, &zap)) == 0;
 401             zap_cursor_advance(&zc)) {
 402                 error = zfs_zget(zfsvfs,
 403                     ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
 404                 ASSERT3U(error, ==, 0);
 405
 406                 ASSERT((ZTOV(xzp)->v_type == VREG) ||
 407                     (ZTOV(xzp)->v_type == VLNK));
 408
 409                 tx = dmu_tx_create(zfsvfs->z_os);
 410                 dmu_tx_hold_bonus(tx, dzp->z_id);
 411                 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
 412                 dmu_tx_hold_bonus(tx, xzp->z_id);
 413                 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 414                 error = dmu_tx_assign(tx, TXG_WAIT);
 415                 if (error) {
 416                         dmu_tx_abort(tx);
 417                         VN_RELE(ZTOV(xzp));
 418                         skipped += 1;
 419                         continue;
 420                 }
 421                 bzero(&dl, sizeof (dl));
 422                 dl.dl_dzp = dzp;
 423                 dl.dl_name = zap.za_name;
 424
 425                 error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
 426                 ASSERT3U(error, ==, 0);
 427                 dmu_tx_commit(tx);
 428
 429                 VN_RELE(ZTOV(xzp));
 430         }
 431         zap_cursor_fini(&zc);
 432         ASSERT(error == ENOENT);
 433         return (skipped);
 434 }
 435
 436 void
 437 zfs_rmnode(znode_t *zp)
 438 {
 439         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
 440         objset_t        *os = zfsvfs->z_os;
 441         znode_t         *xzp = NULL;
 442         char            obj_name[17];
 443         dmu_tx_t        *tx;
 444         uint64_t        acl_obj;
 445         int             error;
 446         int             vfslocked;
 447
 448         vfslocked = VFS_LOCK_GIANT(zfsvfs->z_vfs);
 449
 450         ASSERT(zp->z_phys->zp_links == 0);
 451
 452         /*
 453          * If this is an attribute directory, purge its contents.
 454          */
 455         if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
 456             (zp->z_phys->zp_flags & ZFS_XATTR)) {
 457                 if (zfs_purgedir(zp) != 0) {
 458                         /*
 459                          * Not enough space to delete some xattrs.
 460                          * Leave it on the unlinked set.
 461                          */
 462                         VFS_UNLOCK_GIANT(vfslocked);
 463                         return;
 464                 }
 465         }
 466
 467         /*
 468          * If the file has extended attributes, we're going to unlink
 469          * the xattr dir.
 470          */
 471         if (zp->z_phys->zp_xattr) {
 472                 error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
 473                 ASSERT(error == 0);
 474         }
 475
 476         acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
 477
 478         /*
 479          * Set up the transaction.
 480          */
 481         tx = dmu_tx_create(os);
 482         dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
 483         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 484         if (xzp) {
 485                 dmu_tx_hold_bonus(tx, xzp->z_id);
 486                 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
 487         }
 488         if (acl_obj)
 489                 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
 490         error = dmu_tx_assign(tx, TXG_WAIT);
 491         if (error) {
 492                 /*
 493                  * Not enough space to delete the file.  Leave it in the
 494                  * unlinked set, leaking it until the fs is remounted (at
 495                  * which point we'll call zfs_unlinked_drain() to process it).
 496                  */
 497                 dmu_tx_abort(tx);
 498                 VFS_UNLOCK_GIANT(vfslocked);
 499                 return;
 500         }
 501
 502         if (xzp) {
 503                 dmu_buf_will_dirty(xzp->z_dbuf, tx);
 504                 mutex_enter(&xzp->z_lock);
 505                 xzp->z_unlinked = B_TRUE;       /* mark xzp for deletion */
 506                 xzp->z_phys->zp_links = 0;      /* no more links to it */
 507                 mutex_exit(&xzp->z_lock);
 508                 zfs_unlinked_add(xzp, tx);
 509         }
 510
 511         /* Remove this znode from the unlinked set */
 512         error = zap_remove(os, zfsvfs->z_unlinkedobj,
 513             zfs_unlinked_hexname(obj_name, zp->z_id), tx);
 514         ASSERT3U(error, ==, 0);
 515
 516         zfs_znode_delete(zp, tx);
 517
 518         dmu_tx_commit(tx);
 519
 520         if (xzp)
 521                 VN_RELE(ZTOV(xzp));
 522         VFS_UNLOCK_GIANT(vfslocked);
 523 }
 524
 525 /*
 526  * Link zp into dl.  Can only fail if zp has been unlinked.
 527  */
 528 int
 529 zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
 530 {
 531         znode_t *dzp = dl->dl_dzp;
 532         vnode_t *vp = ZTOV(zp);
 533         uint64_t value;
 534         int zp_is_dir = (vp->v_type == VDIR);
 535         int error;
 536
 537         dmu_buf_will_dirty(zp->z_dbuf, tx);
 538         mutex_enter(&zp->z_lock);
 539
 540         if (!(flag & ZRENAMING)) {
 541                 if (zp->z_unlinked) {   /* no new links to unlinked zp */
 542                         ASSERT(!(flag & (ZNEW | ZEXISTS)));
 543                         mutex_exit(&zp->z_lock);
 544                         return (ENOENT);
 545                 }
 546                 zp->z_phys->zp_links++;
 547         }
 548         zp->z_phys->zp_parent = dzp->z_id;      /* dzp is now zp's parent */
 549
 550         if (!(flag & ZNEW))
 551                 zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
 552         mutex_exit(&zp->z_lock);
 553
 554         dmu_buf_will_dirty(dzp->z_dbuf, tx);
 555         mutex_enter(&dzp->z_lock);
 556         dzp->z_phys->zp_size++;                 /* one dirent added */
 557         dzp->z_phys->zp_links += zp_is_dir;     /* ".." link from zp */
 558         zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
 559         mutex_exit(&dzp->z_lock);
 560
 561         /*
 562          * MacOS X will fill in the 4-bit object type here.
 563          */
 564         value = ZFS_DIRENT_MAKE(IFTODT(zp->z_phys->zp_mode), zp->z_id);
 565         error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
 566             8, 1, &value, tx);
 567         ASSERT(error == 0);
 568
 569         dnlc_update(ZTOV(dzp), dl->dl_name, vp);
 570
 571         return (0);
 572 }
 573
 574 /*
 575  * Unlink zp from dl, and mark zp for deletion if this was the last link.
 576  * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
 577  * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
 578  * If it's non-NULL, we use it to indicate whether the znode needs deletion,
 579  * and it's the caller's job to do it.
 580  */
 581 int
 582 zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
 583         boolean_t *unlinkedp)
 584 {
 585         znode_t *dzp = dl->dl_dzp;
 586         vnode_t *vp = ZTOV(zp);
 587         int zp_is_dir = (vp->v_type == VDIR);
 588         boolean_t unlinked = B_FALSE;
 589         int error;
 590
 591         dnlc_remove(ZTOV(dzp), dl->dl_name);
 592
 593         if (!(flag & ZRENAMING)) {
 594                 dmu_buf_will_dirty(zp->z_dbuf, tx);
 595
 596                 if (vn_vfswlock(vp))            /* prevent new mounts on zp */
 597                         return (EBUSY);
 598
 599                 if (vn_ismntpt(vp)) {           /* don't remove mount point */
 600                         vn_vfsunlock(vp);
 601                         return (EBUSY);
 602                 }
 603
 604                 mutex_enter(&zp->z_lock);
 605                 if (zp_is_dir && !zfs_dirempty(zp)) {   /* dir not empty */
 606                         mutex_exit(&zp->z_lock);
 607                         vn_vfsunlock(vp);
 608                         return (ENOTEMPTY);
 609                 }
 610                 if (zp->z_phys->zp_links <= zp_is_dir) {
 611                         zfs_panic_recover("zfs: link count on vnode %p is %u, "
 612                             "should be at least %u", zp->z_vnode,
 613                             (int)zp->z_phys->zp_links,
 614                             zp_is_dir + 1);
 615                         zp->z_phys->zp_links = zp_is_dir + 1;
 616                 }
 617                 if (--zp->z_phys->zp_links == zp_is_dir) {
 618                         zp->z_unlinked = B_TRUE;
 619                         zp->z_phys->zp_links = 0;
 620                         unlinked = B_TRUE;
 621                 } else {
 622                         zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
 623                 }
 624                 mutex_exit(&zp->z_lock);
 625                 vn_vfsunlock(vp);
 626         }
 627
 628         dmu_buf_will_dirty(dzp->z_dbuf, tx);
 629         mutex_enter(&dzp->z_lock);
 630         dzp->z_phys->zp_size--;                 /* one dirent removed */
 631         dzp->z_phys->zp_links -= zp_is_dir;     /* ".." link from zp */
 632         zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
 633         mutex_exit(&dzp->z_lock);
 634
 635         error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, tx);
 636         ASSERT(error == 0);
 637
 638         if (unlinkedp != NULL)
 639                 *unlinkedp = unlinked;
 640         else if (unlinked)
 641                 zfs_unlinked_add(zp, tx);
 642
 643         return (0);
 644 }
 645
 646 /*
 647  * Indicate whether the directory is empty.  Works with or without z_lock
 648  * held, but can only be consider a hint in the latter case.  Returns true
 649  * if only "." and ".." remain and there's no work in progress.
 650  */
 651 boolean_t
 652 zfs_dirempty(znode_t *dzp)
 653 {
 654         return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0);
 655 }
 656
 657 int
 658 zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
 659 {
 660         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 661         znode_t *xzp;
 662         dmu_tx_t *tx;
 663         uint64_t xoid;
 664         int error;
 665
 666         *xvpp = NULL;
 667
 668         if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, cr))
 669                 return (error);
 670
 671         tx = dmu_tx_create(zfsvfs->z_os);
 672         dmu_tx_hold_bonus(tx, zp->z_id);
 673         dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 674         error = dmu_tx_assign(tx, zfsvfs->z_assign);
 675         if (error) {
 676                 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
 677                         dmu_tx_wait(tx);
 678                 dmu_tx_abort(tx);
 679                 return (error);
 680         }
 681         zfs_mknode(zp, vap, &xoid, tx, cr, IS_XATTR, &xzp, 0);
 682         ASSERT(xzp->z_id == xoid);
 683         ASSERT(xzp->z_phys->zp_parent == zp->z_id);
 684         dmu_buf_will_dirty(zp->z_dbuf, tx);
 685         zp->z_phys->zp_xattr = xoid;
 686
 687         (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "");
 688         dmu_tx_commit(tx);
 689
 690         *xvpp = ZTOV(xzp);
 691
 692         return (0);
 693 }
 694
 695 /*
 696  * Return a znode for the extended attribute directory for zp.
 697  * ** If the directory does not already exist, it is created **
 698  *
 699  *      IN:     zp      - znode to obtain attribute directory from
 700  *              cr      - credentials of caller
 701  *              flags   - flags from the VOP_LOOKUP call
 702  *
 703  *      OUT:    xzpp    - pointer to extended attribute znode
 704  *
 705  *      RETURN: 0 on success
 706  *              error number on failure
 707  */
 708 int
 709 zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags)
 710 {
 711         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
 712         znode_t         *xzp;
 713         zfs_dirlock_t   *dl;
 714         vattr_t         va;
 715         int             error;
 716 top:
 717         error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR);
 718         if (error)
 719                 return (error);
 720
 721         if (xzp != NULL) {
 722                 *xvpp = ZTOV(xzp);
 723                 zfs_dirent_unlock(dl);
 724                 return (0);
 725         }
 726
 727         ASSERT(zp->z_phys->zp_xattr == 0);
 728
 729 #ifdef TODO
 730         if (!(flags & CREATE_XATTR_DIR)) {
 731                 zfs_dirent_unlock(dl);
 732                 return (ENOENT);
 733         }
 734 #endif
 735
 736         if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
 737                 zfs_dirent_unlock(dl);
 738                 return (EROFS);
 739         }
 740
 741         /*
 742          * The ability to 'create' files in an attribute
 743          * directory comes from the write_xattr permission on the base file.
 744          *
 745          * The ability to 'search' an attribute directory requires
 746          * read_xattr permission on the base file.
 747          *
 748          * Once in a directory the ability to read/write attributes
 749          * is controlled by the permissions on the attribute file.
 750          */
 751         va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID;
 752         va.va_type = VDIR;
 753         va.va_mode = S_IFDIR | S_ISVTX | 0777;
 754         va.va_uid = (uid_t)zp->z_phys->zp_uid;
 755         va.va_gid = (gid_t)zp->z_phys->zp_gid;
 756
 757         error = zfs_make_xattrdir(zp, &va, xvpp, cr);
 758         zfs_dirent_unlock(dl);
 759
 760         if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 761                 /* NB: we already did dmu_tx_wait() if necessary */
 762                 goto top;
 763         }
 764
 765         return (error);
 766 }
 767
 768 /*
 769  * Decide whether it is okay to remove within a sticky directory.
 770  *
 771  * In sticky directories, write access is not sufficient;
 772  * you can remove entries from a directory only if:
 773  *
 774  *      you own the directory,
 775  *      you own the entry,
 776  *      the entry is a plain file and you have write access,
 777  *      or you are privileged (checked in secpolicy...).
 778  *
 779  * The function returns 0 if remove access is granted.
 780  */
 781 int
 782 zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
 783 {
 784         uid_t           uid;
 785
 786         if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL)     /* ZIL replay */
 787                 return (0);
 788
 789         if ((zdp->z_phys->zp_mode & S_ISVTX) == 0 ||
 790             (uid = crgetuid(cr)) == zdp->z_phys->zp_uid ||
 791             uid == zp->z_phys->zp_uid ||
 792             (ZTOV(zp)->v_type == VREG &&
 793             zfs_zaccess(zp, ACE_WRITE_DATA, cr) == 0))
 794                 return (0);
 795         else
 796                 return (secpolicy_vnode_remove(cr));
 797 }