sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011 by Delphix. All rights reserved.
  24  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  25  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
  26  * All rights reserved.
  27  * Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org>
  28  */
  29
  30 #include <sys/dmu_objset.h>
  31 #include <sys/dsl_dataset.h>
  32 #include <sys/dsl_dir.h>
  33 #include <sys/dsl_prop.h>
  34 #include <sys/dsl_synctask.h>
  35 #include <sys/dmu_traverse.h>
  36 #include <sys/dmu_impl.h>
  37 #include <sys/dmu_tx.h>
  38 #include <sys/arc.h>
  39 #include <sys/zio.h>
  40 #include <sys/zap.h>
  41 #include <sys/unique.h>
  42 #include <sys/zfs_context.h>
  43 #include <sys/zfs_ioctl.h>
  44 #include <sys/spa.h>
  45 #include <sys/zfs_znode.h>
  46 #include <sys/zfs_onexit.h>
  47 #include <sys/zvol.h>
  48 #include <sys/dsl_scan.h>
  49 #include <sys/dsl_deadlist.h>
  50
/*
 * Sentinel value for ds_owner: DSL_DATASET_IS_DESTROYED() compares a
 * dataset's owner pointer against this variable to recognise a dataset
 * whose destruction is assured.
 */
   51 static char *dsl_reaper = "the grim reaper";
   52
/*
 * Forward declarations of check/sync callbacks.
 * NOTE(review): the definitions are not visible in this part of the file;
 * presumably they appear further down.
 */
   53 static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
   54 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
   55 static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
  56
  57 #define SWITCH64(x, y) \
  58         { \
  59                 uint64_t __tmp = (x); \
  60                 (x) = (y); \
  61                 (y) = __tmp; \
  62         }
  63
/*
 * NOTE(review): presumably an upper bound on dataset reference counts; no
 * user of this constant is visible in this part of the file.
 */
   64 #define DS_REF_MAX      (1ULL << 62)
   65
/*
 * Alias for SPA_MAXBLOCKSIZE; by its name, the block size used for
 * deadlist objects (TODO confirm against the users of this macro).
 */
   66 #define DSL_DEADLIST_BLOCKSIZE  SPA_MAXBLOCKSIZE
   67
/* True when the dataset's owner is the dsl_reaper sentinel. */
   68 #define DSL_DATASET_IS_DESTROYED(ds)    ((ds)->ds_owner == dsl_reaper)
  69
  70
  71 /*
  72  * Figure out how much of this delta should be propogated to the dsl_dir
  73  * layer.  If there's a refreservation, that space has already been
  74  * partially accounted for in our ancestors.
  75  */
  76 static int64_t
  77 parent_delta(dsl_dataset_t *ds, int64_t delta)
  78 {
  79         uint64_t old_bytes, new_bytes;
  80
  81         if (ds->ds_reserved == 0)
  82                 return (delta);
  83
  84         old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
  85         new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
  86
  87         ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
  88         return (new_bytes - old_bytes);
  89 }
  90
  91 void
  92 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
  93 {
  94         int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
  95         int compressed = BP_GET_PSIZE(bp);
  96         int uncompressed = BP_GET_UCSIZE(bp);
  97         int64_t delta;
  98
  99         dprintf_bp(bp, "ds=%p", ds);
 100
 101         ASSERT(dmu_tx_is_syncing(tx));
 102         /* It could have been compressed away to nothing */
 103         if (BP_IS_HOLE(bp))
 104                 return;
 105         ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
 106         ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
 107         if (ds == NULL) {
 108                 /*
 109                  * Account for the meta-objset space in its placeholder
 110                  * dsl_dir.
 111                  */
 112                 ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
 113                 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
 114                     used, compressed, uncompressed, tx);
 115                 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
 116                 return;
 117         }
 118         dmu_buf_will_dirty(ds->ds_dbuf, tx);
 119
 120         mutex_enter(&ds->ds_dir->dd_lock);
 121         mutex_enter(&ds->ds_lock);
 122         delta = parent_delta(ds, used);
 123         ds->ds_phys->ds_used_bytes += used;
 124         ds->ds_phys->ds_compressed_bytes += compressed;
 125         ds->ds_phys->ds_uncompressed_bytes += uncompressed;
 126         ds->ds_phys->ds_unique_bytes += used;
 127         mutex_exit(&ds->ds_lock);
 128         dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
 129             compressed, uncompressed, tx);
 130         dsl_dir_transfer_space(ds->ds_dir, used - delta,
 131             DD_USED_REFRSRV, DD_USED_HEAD, tx);
 132         mutex_exit(&ds->ds_dir->dd_lock);
 133 }
 134
 135 int
 136 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 137     boolean_t async)
 138 {
 139         if (BP_IS_HOLE(bp))
 140                 return (0);
 141
 142         ASSERT(dmu_tx_is_syncing(tx));
 143         ASSERT(bp->blk_birth <= tx->tx_txg);
 144
 145         int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
 146         int compressed = BP_GET_PSIZE(bp);
 147         int uncompressed = BP_GET_UCSIZE(bp);
 148
 149         ASSERT(used > 0);
 150         if (ds == NULL) {
 151                 /*
 152                  * Account for the meta-objset space in its placeholder
 153                  * dataset.
 154                  */
 155                 dsl_free(tx->tx_pool, tx->tx_txg, bp);
 156
 157                 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
 158                     -used, -compressed, -uncompressed, tx);
 159                 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
 160                 return (used);
 161         }
 162         ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
 163
 164         ASSERT(!dsl_dataset_is_snapshot(ds));
 165         dmu_buf_will_dirty(ds->ds_dbuf, tx);
 166
 167         if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
 168                 int64_t delta;
 169
 170                 dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
 171                 dsl_free(tx->tx_pool, tx->tx_txg, bp);
 172
 173                 mutex_enter(&ds->ds_dir->dd_lock);
 174                 mutex_enter(&ds->ds_lock);
 175                 ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
 176                     !DS_UNIQUE_IS_ACCURATE(ds));
 177                 delta = parent_delta(ds, -used);
 178                 ds->ds_phys->ds_unique_bytes -= used;
 179                 mutex_exit(&ds->ds_lock);
 180                 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
 181                     delta, -compressed, -uncompressed, tx);
 182                 dsl_dir_transfer_space(ds->ds_dir, -used - delta,
 183                     DD_USED_REFRSRV, DD_USED_HEAD, tx);
 184                 mutex_exit(&ds->ds_dir->dd_lock);
 185         } else {
 186                 dprintf_bp(bp, "putting on dead list: %s", "");
 187                 if (async) {
 188                         /*
 189                          * We are here as part of zio's write done callback,
 190                          * which means we're a zio interrupt thread.  We can't
 191                          * call dsl_deadlist_insert() now because it may block
 192                          * waiting for I/O.  Instead, put bp on the deferred
 193                          * queue and let dsl_pool_sync() finish the job.
 194                          */
 195                         bplist_append(&ds->ds_pending_deadlist, bp);
 196                 } else {
 197                         dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
 198                 }
 199                 ASSERT3U(ds->ds_prev->ds_object, ==,
 200                     ds->ds_phys->ds_prev_snap_obj);
 201                 ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
 202                 /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
 203                 if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
 204                     ds->ds_object && bp->blk_birth >
 205                     ds->ds_prev->ds_phys->ds_prev_snap_txg) {
 206                         dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 207                         mutex_enter(&ds->ds_prev->ds_lock);
 208                         ds->ds_prev->ds_phys->ds_unique_bytes += used;
 209                         mutex_exit(&ds->ds_prev->ds_lock);
 210                 }
 211                 if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
 212                         dsl_dir_transfer_space(ds->ds_dir, used,
 213                             DD_USED_HEAD, DD_USED_SNAP, tx);
 214                 }
 215         }
 216         mutex_enter(&ds->ds_lock);
 217         ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
 218         ds->ds_phys->ds_used_bytes -= used;
 219         ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
 220         ds->ds_phys->ds_compressed_bytes -= compressed;
 221         ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
 222         ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
 223         mutex_exit(&ds->ds_lock);
 224
 225         return (used);
 226 }
 227
 228 uint64_t
 229 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
 230 {
 231         uint64_t trysnap = 0;
 232
 233         if (ds == NULL)
 234                 return (0);
 235         /*
 236          * The snapshot creation could fail, but that would cause an
 237          * incorrect FALSE return, which would only result in an
 238          * overestimation of the amount of space that an operation would
 239          * consume, which is OK.
 240          *
 241          * There's also a small window where we could miss a pending
 242          * snapshot, because we could set the sync task in the quiescing
 243          * phase.  So this should only be used as a guess.
 244          */
 245         if (ds->ds_trysnap_txg >
 246             spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
 247                 trysnap = ds->ds_trysnap_txg;
 248         return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
 249 }
 250
 251 boolean_t
 252 dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
 253     uint64_t blk_birth)
 254 {
 255         if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
 256                 return (B_FALSE);
 257
 258         ddt_prefetch(dsl_dataset_get_spa(ds), bp);
 259
 260         return (B_TRUE);
 261 }
 262
 263 /* ARGSUSED */
 264 static void
 265 dsl_dataset_evict(dmu_buf_t *db, void *dsv)
 266 {
 267         dsl_dataset_t *ds = dsv;
 268
 269         ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));
 270
 271         unique_remove(ds->ds_fsid_guid);
 272
 273         if (ds->ds_objset != NULL)
 274                 dmu_objset_evict(ds->ds_objset);
 275
 276         if (ds->ds_prev) {
 277                 dsl_dataset_drop_ref(ds->ds_prev, ds);
 278                 ds->ds_prev = NULL;
 279         }
 280
 281         bplist_destroy(&ds->ds_pending_deadlist);
 282         if (db != NULL) {
 283                 dsl_deadlist_close(&ds->ds_deadlist);
 284         } else {
 285                 ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
 286                 ASSERT(!ds->ds_deadlist.dl_oldfmt);
 287         }
 288         if (ds->ds_dir)
 289                 dsl_dir_close(ds->ds_dir, ds);
 290
 291         ASSERT(!list_link_active(&ds->ds_synced_link));
 292
 293         if (mutex_owned(&ds->ds_lock))
 294                 mutex_exit(&ds->ds_lock);
 295         mutex_destroy(&ds->ds_lock);
 296         mutex_destroy(&ds->ds_recvlock);
 297         if (mutex_owned(&ds->ds_opening_lock))
 298                 mutex_exit(&ds->ds_opening_lock);
 299         mutex_destroy(&ds->ds_opening_lock);
 300         rw_destroy(&ds->ds_rwlock);
 301         cv_destroy(&ds->ds_exclusive_cv);
 302
 303         kmem_free(ds, sizeof (dsl_dataset_t));
 304 }
 305
 306 static int
 307 dsl_dataset_get_snapname(dsl_dataset_t *ds)
 308 {
 309         dsl_dataset_phys_t *headphys;
 310         int err;
 311         dmu_buf_t *headdbuf;
 312         dsl_pool_t *dp = ds->ds_dir->dd_pool;
 313         objset_t *mos = dp->dp_meta_objset;
 314
 315         if (ds->ds_snapname[0])
 316                 return (0);
 317         if (ds->ds_phys->ds_next_snap_obj == 0)
 318                 return (0);
 319
 320         err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
 321             FTAG, &headdbuf);
 322         if (err)
 323                 return (err);
 324         headphys = headdbuf->db_data;
 325         err = zap_value_search(dp->dp_meta_objset,
 326             headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
 327         dmu_buf_rele(headdbuf, FTAG);
 328         return (err);
 329 }
 330
 331 static int
 332 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
 333 {
 334         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 335         uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
 336         matchtype_t mt;
 337         int err;
 338
 339         if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
 340                 mt = MT_FIRST;
 341         else
 342                 mt = MT_EXACT;
 343
 344         err = zap_lookup_norm(mos, snapobj, name, 8, 1,
 345             value, mt, NULL, 0, NULL);
 346         if (err == ENOTSUP && mt == MT_FIRST)
 347                 err = zap_lookup(mos, snapobj, name, 8, 1, value);
 348         return (err);
 349 }
 350
 351 static int
 352 dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
 353 {
 354         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 355         uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
 356         matchtype_t mt;
 357         int err;
 358
 359         dsl_dir_snap_cmtime_update(ds->ds_dir);
 360
 361         if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
 362                 mt = MT_FIRST;
 363         else
 364                 mt = MT_EXACT;
 365
 366         err = zap_remove_norm(mos, snapobj, name, mt, tx);
 367         if (err == ENOTSUP && mt == MT_FIRST)
 368                 err = zap_remove(mos, snapobj, name, tx);
 369         return (err);
 370 }
 371
 372 static int
 373 dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
 374     dsl_dataset_t **dsp)
 375 {
 376         objset_t *mos = dp->dp_meta_objset;
 377         dmu_buf_t *dbuf;
 378         dsl_dataset_t *ds;
 379         int err;
 380         dmu_object_info_t doi;
 381
 382         ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
 383             dsl_pool_sync_context(dp));
 384
 385         err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
 386         if (err)
 387                 return (err);
 388
 389         /* Make sure dsobj has the correct object type. */
 390         dmu_object_info_from_db(dbuf, &doi);
 391         if (doi.doi_type != DMU_OT_DSL_DATASET)
 392                 return (EINVAL);
 393
 394         ds = dmu_buf_get_user(dbuf);
 395         if (ds == NULL) {
 396                 dsl_dataset_t *winner;
 397
 398                 ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
 399                 ds->ds_dbuf = dbuf;
 400                 ds->ds_object = dsobj;
 401                 ds->ds_phys = dbuf->db_data;
 402
 403                 mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
 404                 mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
 405                 mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
 406                 mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
 407
 408                 rw_init(&ds->ds_rwlock, 0, 0, 0);
 409                 cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
 410
 411                 bplist_create(&ds->ds_pending_deadlist);
 412                 dsl_deadlist_open(&ds->ds_deadlist,
 413                     mos, ds->ds_phys->ds_deadlist_obj);
 414
 415                 list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
 416                     offsetof(dmu_sendarg_t, dsa_link));
 417
 418                 if (err == 0) {
 419                         err = dsl_dir_open_obj(dp,
 420                             ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
 421                 }
 422                 if (err) {
 423                         mutex_destroy(&ds->ds_lock);
 424                         mutex_destroy(&ds->ds_recvlock);
 425                         mutex_destroy(&ds->ds_opening_lock);
 426                         rw_destroy(&ds->ds_rwlock);
 427                         cv_destroy(&ds->ds_exclusive_cv);
 428                         bplist_destroy(&ds->ds_pending_deadlist);
 429                         dsl_deadlist_close(&ds->ds_deadlist);
 430                         kmem_free(ds, sizeof (dsl_dataset_t));
 431                         dmu_buf_rele(dbuf, tag);
 432                         return (err);
 433                 }
 434
 435                 if (!dsl_dataset_is_snapshot(ds)) {
 436                         ds->ds_snapname[0] = '\0';
 437                         if (ds->ds_phys->ds_prev_snap_obj) {
 438                                 err = dsl_dataset_get_ref(dp,
 439                                     ds->ds_phys->ds_prev_snap_obj,
 440                                     ds, &ds->ds_prev);
 441                         }
 442                 } else {
 443                         if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
 444                                 err = dsl_dataset_get_snapname(ds);
 445                         if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
 446                                 err = zap_count(
 447                                     ds->ds_dir->dd_pool->dp_meta_objset,
 448                                     ds->ds_phys->ds_userrefs_obj,
 449                                     &ds->ds_userrefs);
 450                         }
 451                 }
 452
 453                 if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
 454                         /*
 455                          * In sync context, we're called with either no lock
 456                          * or with the write lock.  If we're not syncing,
 457                          * we're always called with the read lock held.
 458                          */
 459                         boolean_t need_lock =
 460                             !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
 461                             dsl_pool_sync_context(dp);
 462
 463                         if (need_lock)
 464                                 rw_enter(&dp->dp_config_rwlock, RW_READER);
 465
 466                         err = dsl_prop_get_ds(ds,
 467                             "refreservation", sizeof (uint64_t), 1,
 468                             &ds->ds_reserved, NULL);
 469                         if (err == 0) {
 470                                 err = dsl_prop_get_ds(ds,
 471                                     "refquota", sizeof (uint64_t), 1,
 472                                     &ds->ds_quota, NULL);
 473                         }
 474
 475                         if (need_lock)
 476                                 rw_exit(&dp->dp_config_rwlock);
 477                 } else {
 478                         ds->ds_reserved = ds->ds_quota = 0;
 479                 }
 480
 481                 if (err == 0) {
 482                         winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
 483                             dsl_dataset_evict);
 484                 }
 485                 if (err || winner) {
 486                         bplist_destroy(&ds->ds_pending_deadlist);
 487                         dsl_deadlist_close(&ds->ds_deadlist);
 488                         if (ds->ds_prev)
 489                                 dsl_dataset_drop_ref(ds->ds_prev, ds);
 490                         dsl_dir_close(ds->ds_dir, ds);
 491                         mutex_destroy(&ds->ds_lock);
 492                         mutex_destroy(&ds->ds_recvlock);
 493                         mutex_destroy(&ds->ds_opening_lock);
 494                         rw_destroy(&ds->ds_rwlock);
 495                         cv_destroy(&ds->ds_exclusive_cv);
 496                         kmem_free(ds, sizeof (dsl_dataset_t));
 497                         if (err) {
 498                                 dmu_buf_rele(dbuf, tag);
 499                                 return (err);
 500                         }
 501                         ds = winner;
 502                 } else {
 503                         ds->ds_fsid_guid =
 504                             unique_insert(ds->ds_phys->ds_fsid_guid);
 505                 }
 506         }
 507         ASSERT3P(ds->ds_dbuf, ==, dbuf);
 508         ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
 509         ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
 510             spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
 511             dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
 512         mutex_enter(&ds->ds_lock);
 513         if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
 514                 mutex_exit(&ds->ds_lock);
 515                 dmu_buf_rele(ds->ds_dbuf, tag);
 516                 return (ENOENT);
 517         }
 518         mutex_exit(&ds->ds_lock);
 519         *dsp = ds;
 520         return (0);
 521 }
 522
 523 static int
 524 dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
 525 {
 526         dsl_pool_t *dp = ds->ds_dir->dd_pool;
 527
 528         /*
 529          * In syncing context we don't want the rwlock lock: there
 530          * may be an existing writer waiting for sync phase to
 531          * finish.  We don't need to worry about such writers, since
 532          * sync phase is single-threaded, so the writer can't be
 533          * doing anything while we are active.
 534          */
 535         if (dsl_pool_sync_context(dp)) {
 536                 ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
 537                 return (0);
 538         }
 539
 540         /*
 541          * Normal users will hold the ds_rwlock as a READER until they
 542          * are finished (i.e., call dsl_dataset_rele()).  "Owners" will
 543          * drop their READER lock after they set the ds_owner field.
 544          *
 545          * If the dataset is being destroyed, the destroy thread will
 546          * obtain a WRITER lock for exclusive access after it's done its
 547          * open-context work and then change the ds_owner to
 548          * dsl_reaper once destruction is assured.  So threads
 549          * may block here temporarily, until the "destructability" of
 550          * the dataset is determined.
 551          */
 552         ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
 553         mutex_enter(&ds->ds_lock);
 554         while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
 555                 rw_exit(&dp->dp_config_rwlock);
 556                 cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
 557                 if (DSL_DATASET_IS_DESTROYED(ds)) {
 558                         mutex_exit(&ds->ds_lock);
 559                         dsl_dataset_drop_ref(ds, tag);
 560                         rw_enter(&dp->dp_config_rwlock, RW_READER);
 561                         return (ENOENT);
 562                 }
 563                 /*
 564                  * The dp_config_rwlock lives above the ds_lock. And
 565                  * we need to check DSL_DATASET_IS_DESTROYED() while
 566                  * holding the ds_lock, so we have to drop and reacquire
 567                  * the ds_lock here.
 568                  */
 569                 mutex_exit(&ds->ds_lock);
 570                 rw_enter(&dp->dp_config_rwlock, RW_READER);
 571                 mutex_enter(&ds->ds_lock);
 572         }
 573         mutex_exit(&ds->ds_lock);
 574         return (0);
 575 }
 576
 577 int
 578 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
 579     dsl_dataset_t **dsp)
 580 {
 581         int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);
 582
 583         if (err)
 584                 return (err);
 585         return (dsl_dataset_hold_ref(*dsp, tag));
 586 }
 587
 588 int
 589 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
 590     void *tag, dsl_dataset_t **dsp)
 591 {
 592         int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
 593         if (err)
 594                 return (err);
 595         if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
 596                 dsl_dataset_rele(*dsp, tag);
 597                 *dsp = NULL;
 598                 return (EBUSY);
 599         }
 600         return (0);
 601 }
 602
 603 int
 604 dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
 605 {
 606         dsl_dir_t *dd;
 607         dsl_pool_t *dp;
 608         const char *snapname;
 609         uint64_t obj;
 610         int err = 0;
 611
 612         err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
 613         if (err)
 614                 return (err);
 615
 616         dp = dd->dd_pool;
 617         obj = dd->dd_phys->dd_head_dataset_obj;
 618         rw_enter(&dp->dp_config_rwlock, RW_READER);
 619         if (obj)
 620                 err = dsl_dataset_get_ref(dp, obj, tag, dsp);
 621         else
 622                 err = ENOENT;
 623         if (err)
 624                 goto out;
 625
 626         err = dsl_dataset_hold_ref(*dsp, tag);
 627
 628         /* we may be looking for a snapshot */
 629         if (err == 0 && snapname != NULL) {
 630                 dsl_dataset_t *ds = NULL;
 631
 632                 if (*snapname++ != '@') {
 633                         dsl_dataset_rele(*dsp, tag);
 634                         err = ENOENT;
 635                         goto out;
 636                 }
 637
 638                 dprintf("looking for snapshot '%s'\n", snapname);
 639                 err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
 640                 if (err == 0)
 641                         err = dsl_dataset_get_ref(dp, obj, tag, &ds);
 642                 dsl_dataset_rele(*dsp, tag);
 643
 644                 ASSERT3U((err == 0), ==, (ds != NULL));
 645
 646                 if (ds) {
 647                         mutex_enter(&ds->ds_lock);
 648                         if (ds->ds_snapname[0] == 0)
 649                                 (void) strlcpy(ds->ds_snapname, snapname,
 650                                     sizeof (ds->ds_snapname));
 651                         mutex_exit(&ds->ds_lock);
 652                         err = dsl_dataset_hold_ref(ds, tag);
 653                         *dsp = err ? NULL : ds;
 654                 }
 655         }
 656 out:
 657         rw_exit(&dp->dp_config_rwlock);
 658         dsl_dir_close(dd, FTAG);
 659         return (err);
 660 }
 661
 662 int
 663 dsl_dataset_own(const char *name, boolean_t inconsistentok,
 664     void *tag, dsl_dataset_t **dsp)
 665 {
 666         int err = dsl_dataset_hold(name, tag, dsp);
 667         if (err)
 668                 return (err);
 669         if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
 670                 dsl_dataset_rele(*dsp, tag);
 671                 return (EBUSY);
 672         }
 673         return (0);
 674 }
 675
 676 void
 677 dsl_dataset_name(dsl_dataset_t *ds, char *name)
 678 {
 679         if (ds == NULL) {
 680                 (void) strcpy(name, "mos");
 681         } else {
 682                 dsl_dir_name(ds->ds_dir, name);
 683                 VERIFY(0 == dsl_dataset_get_snapname(ds));
 684                 if (ds->ds_snapname[0]) {
 685                         (void) strcat(name, "@");
 686                         /*
 687                          * We use a "recursive" mutex so that we
 688                          * can call dprintf_ds() with ds_lock held.
 689                          */
 690                         if (!MUTEX_HELD(&ds->ds_lock)) {
 691                                 mutex_enter(&ds->ds_lock);
 692                                 (void) strcat(name, ds->ds_snapname);
 693                                 mutex_exit(&ds->ds_lock);
 694                         } else {
 695                                 (void) strcat(name, ds->ds_snapname);
 696                         }
 697                 }
 698         }
 699 }
 700
 701 static int
 702 dsl_dataset_namelen(dsl_dataset_t *ds)
 703 {
 704         int result;
 705
 706         if (ds == NULL) {
 707                 result = 3;     /* "mos" */
 708         } else {
 709                 result = dsl_dir_namelen(ds->ds_dir);
 710                 VERIFY(0 == dsl_dataset_get_snapname(ds));
 711                 if (ds->ds_snapname[0]) {
 712                         ++result;       /* adding one for the @-sign */
 713                         if (!MUTEX_HELD(&ds->ds_lock)) {
 714                                 mutex_enter(&ds->ds_lock);
 715                                 result += strlen(ds->ds_snapname);
 716                                 mutex_exit(&ds->ds_lock);
 717                         } else {
 718                                 result += strlen(ds->ds_snapname);
 719                         }
 720                 }
 721         }
 722
 723         return (result);
 724 }
 725
 726 void
 727 dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
 728 {
 729         dmu_buf_rele(ds->ds_dbuf, tag);
 730 }
 731
 732 void
 733 dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
 734 {
 735         if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
 736                 rw_exit(&ds->ds_rwlock);
 737         }
 738         dsl_dataset_drop_ref(ds, tag);
 739 }
 740
 741 void
 742 dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
 743 {
 744         ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
 745             (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));
 746
 747         mutex_enter(&ds->ds_lock);
 748         ds->ds_owner = NULL;
 749         if (RW_WRITE_HELD(&ds->ds_rwlock)) {
 750                 rw_exit(&ds->ds_rwlock);
 751                 cv_broadcast(&ds->ds_exclusive_cv);
 752         }
 753         mutex_exit(&ds->ds_lock);
 754         if (ds->ds_dbuf)
 755                 dsl_dataset_drop_ref(ds, tag);
 756         else
 757                 dsl_dataset_evict(NULL, ds);
 758 }
 759
 760 boolean_t
 761 dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
 762 {
 763         boolean_t gotit = FALSE;
 764
 765         mutex_enter(&ds->ds_lock);
 766         if (ds->ds_owner == NULL &&
 767             (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
 768                 ds->ds_owner = tag;
 769                 if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
 770                         rw_exit(&ds->ds_rwlock);
 771                 gotit = TRUE;
 772         }
 773         mutex_exit(&ds->ds_lock);
 774         return (gotit);
 775 }
 776
 777 void
 778 dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
 779 {
 780         ASSERT3P(owner, ==, ds->ds_owner);
 781         if (!RW_WRITE_HELD(&ds->ds_rwlock))
 782                 rw_enter(&ds->ds_rwlock, RW_WRITER);
 783 }
 784
/*
 * Allocate and initialize the head dataset object for dd in the MOS.
 *
 * Must be called in syncing context on a dsl_dir that has no head dataset
 * yet.  When origin is NULL the pool's dp_origin_snap is used instead; if
 * that is NULL as well the dataset gets a freshly allocated deadlist,
 * otherwise it is set up as a clone of origin.
 *
 * Returns the object number of the new dataset, which is also recorded in
 * dd's dd_head_dataset_obj.
 */
uint64_t
dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
    uint64_t flags, dmu_tx_t *tx)
{
        dsl_pool_t *dp = dd->dd_pool;
        dmu_buf_t *dbuf;
        dsl_dataset_phys_t *dsphys;
        uint64_t dsobj;
        objset_t *mos = dp->dp_meta_objset;

        /* Default to the pool-wide origin snapshot (which may be NULL). */
        if (origin == NULL)
                origin = dp->dp_origin_snap;

        ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
        ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
        ASSERT(dmu_tx_is_syncing(tx));
        ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);

        /* Allocate the object and start from an all-zero phys structure. */
        dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
            DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
        VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
        dmu_buf_will_dirty(dbuf, tx);
        dsphys = dbuf->db_data;
        bzero(dsphys, sizeof (dsl_dataset_phys_t));
        dsphys->ds_dir_obj = dd->dd_object;
        dsphys->ds_flags = flags;
        dsphys->ds_fsid_guid = unique_create();
        /* Keep drawing random bytes until the guid is non-zero. */
        do {
                (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
                    sizeof (dsphys->ds_guid));
        } while (dsphys->ds_guid == 0);
        dsphys->ds_snapnames_zapobj =
            zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
            DMU_OT_NONE, 0, tx);
        dsphys->ds_creation_time = gethrestime_sec();
        /* A dataset created in TXG_INITIAL is recorded as created in txg 1. */
        dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;

        if (origin == NULL) {
                dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
        } else {
                dsl_dataset_t *ohds;

                /*
                 * Clone: start from the origin's snapshot linkage, space
                 * accounting, block pointer and flags.
                 */
                dsphys->ds_prev_snap_obj = origin->ds_object;
                dsphys->ds_prev_snap_txg =
                    origin->ds_phys->ds_creation_txg;
                dsphys->ds_used_bytes =
                    origin->ds_phys->ds_used_bytes;
                dsphys->ds_compressed_bytes =
                    origin->ds_phys->ds_compressed_bytes;
                dsphys->ds_uncompressed_bytes =
                    origin->ds_phys->ds_uncompressed_bytes;
                dsphys->ds_bp = origin->ds_phys->ds_bp;
                dsphys->ds_flags |= origin->ds_phys->ds_flags;

                /* The origin gains one more child. */
                dmu_buf_will_dirty(origin->ds_dbuf, tx);
                origin->ds_phys->ds_num_children++;

                /*
                 * The new deadlist is cloned from the deadlist of the head
                 * dataset of the origin's dsl_dir.
                 */
                VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
                    origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
                dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
                    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
                dsl_dataset_rele(ohds, FTAG);

                /*
                 * Record the clone in the origin's next_clones ZAP,
                 * creating that ZAP on first use.
                 */
                if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
                        if (origin->ds_phys->ds_next_clones_obj == 0) {
                                origin->ds_phys->ds_next_clones_obj =
                                    zap_create(mos,
                                    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
                        }
                        VERIFY(0 == zap_add_int(mos,
                            origin->ds_phys->ds_next_clones_obj,
                            dsobj, tx));
                }

                /*
                 * Point the new dsl_dir at its origin and, on new enough
                 * pools, list the clone in the origin dir's dd_clones ZAP.
                 */
                dmu_buf_will_dirty(dd->dd_dbuf, tx);
                dd->dd_phys->dd_origin_obj = origin->ds_object;
                if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
                        if (origin->ds_dir->dd_phys->dd_clones == 0) {
                                dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
                                origin->ds_dir->dd_phys->dd_clones =
                                    zap_create(mos,
                                    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
                        }
                        VERIFY3U(0, ==, zap_add_int(mos,
                            origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
                }
        }

        if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
                dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

        dmu_buf_rele(dbuf, FTAG);

        /* Finally make the new object the head dataset of dd. */
        dmu_buf_will_dirty(dd->dd_dbuf, tx);
        dd->dd_phys->dd_head_dataset_obj = dsobj;

        return (dsobj);
}
 883
 884 uint64_t
 885 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
 886     dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
 887 {
 888         dsl_pool_t *dp = pdd->dd_pool;
 889         uint64_t dsobj, ddobj;
 890         dsl_dir_t *dd;
 891
 892         ASSERT(lastname[0] != '@');
 893
 894         ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
 895         VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
 896
 897         dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);
 898
 899         dsl_deleg_set_create_perms(dd, tx, cr);
 900
 901         dsl_dir_close(dd, FTAG);
 902
 903         /*
 904          * If we are creating a clone, make sure we zero out any stale
 905          * data from the origin snapshots zil header.
 906          */
 907         if (origin != NULL) {
 908                 dsl_dataset_t *ds;
 909                 objset_t *os;
 910
 911                 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 912                 VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
 913                 bzero(&os->os_zil_header, sizeof (os->os_zil_header));
 914                 dsl_dataset_dirty(ds, tx);
 915                 dsl_dataset_rele(ds, FTAG);
 916         }
 917
 918         return (dsobj);
 919 }
 920
 921 #ifdef __FreeBSD__
 922 /* FreeBSD ioctl compat begin */
/* Argument handed to dsl_check_snap_cb() by dmu_get_recursive_snaps_nvl(). */
struct destroyarg {
        nvlist_t *nvl;          /* list that receives the "<fs>@<snap>" names */
        const char *snapname;   /* snapshot name placed after the '@' */
};
 927
 928 static int
 929 dsl_check_snap_cb(const char *name, void *arg)
 930 {
 931         struct destroyarg *da = arg;
 932         dsl_dataset_t *ds;
 933         char *dsname;
 934
 935         dsname = kmem_asprintf("%s@%s", name, da->snapname);
 936         VERIFY(nvlist_add_boolean(da->nvl, dsname) == 0);
 937
 938         return (0);
 939 }
 940
 941 int
 942 dmu_get_recursive_snaps_nvl(const char *fsname, const char *snapname,
 943     nvlist_t *snaps)
 944 {
 945         struct destroyarg *da;
 946         int err;
 947
 948         da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP);
 949         da->nvl = snaps;
 950         da->snapname = snapname;
 951         err = dmu_objset_find(fsname, dsl_check_snap_cb, da,
 952             DS_FIND_CHILDREN);
 953         kmem_free(da, sizeof (struct destroyarg));
 954
 955         return (err);
 956 }
 957 /* FreeBSD ioctl compat end */
 958 #endif /* __FreeBSD__ */
 959
 960 /*
 961  * The snapshots must all be in the same pool.
 962  */
 963 int
 964 dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed)
 965 {
 966         int err;
 967         dsl_sync_task_t *dst;
 968         spa_t *spa;
 969         nvpair_t *pair;
 970         dsl_sync_task_group_t *dstg;
 971
 972         pair = nvlist_next_nvpair(snaps, NULL);
 973         if (pair == NULL)
 974                 return (0);
 975
 976         err = spa_open(nvpair_name(pair), &spa, FTAG);
 977         if (err)
 978                 return (err);
 979         dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
 980
 981         for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 982             pair = nvlist_next_nvpair(snaps, pair)) {
 983                 dsl_dataset_t *ds;
 984                 int err;
 985
 986                 err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
 987                 if (err == 0) {
 988                         struct dsl_ds_destroyarg *dsda;
 989
 990                         dsl_dataset_make_exclusive(ds, dstg);
 991                         dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg),
 992                             KM_SLEEP);
 993                         dsda->ds = ds;
 994                         dsda->defer = defer;
 995                         dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
 996                             dsl_dataset_destroy_sync, dsda, dstg, 0);
 997                 } else if (err == ENOENT) {
 998                         err = 0;
 999                 } else {
1000                         (void) strcpy(failed, nvpair_name(pair));
1001                         break;
1002                 }
1003         }
1004
1005         if (err == 0)
1006                 err = dsl_sync_task_group_wait(dstg);
1007
1008         for (dst = list_head(&dstg->dstg_tasks); dst;
1009             dst = list_next(&dstg->dstg_tasks, dst)) {
1010                 struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
1011                 dsl_dataset_t *ds = dsda->ds;
1012
1013                 /*
1014                  * Return the file system name that triggered the error
1015                  */
1016                 if (dst->dst_err) {
1017                         dsl_dataset_name(ds, failed);
1018                 }
1019                 ASSERT3P(dsda->rm_origin, ==, NULL);
1020                 dsl_dataset_disown(ds, dstg);
1021                 kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
1022         }
1023
1024         dsl_sync_task_group_destroy(dstg);
1025         spa_close(spa, FTAG);
1026         return (err);
1027
1028 }
1029
1030 static boolean_t
1031 dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
1032 {
1033         boolean_t might_destroy = B_FALSE;
1034
1035         mutex_enter(&ds->ds_lock);
1036         if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
1037             DS_IS_DEFER_DESTROY(ds))
1038                 might_destroy = B_TRUE;
1039         mutex_exit(&ds->ds_lock);
1040
1041         return (might_destroy);
1042 }
1043
1044 /*
1045  * If we're removing a clone, and these three conditions are true:
1046  *      1) the clone's origin has no other children
1047  *      2) the clone's origin has no user references
1048  *      3) the clone's origin has been marked for deferred destruction
1049  * Then, prepare to remove the origin as part of this sync task group.
1050  */
1051 static int
1052 dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
1053 {
1054         dsl_dataset_t *ds = dsda->ds;
1055         dsl_dataset_t *origin = ds->ds_prev;
1056
1057         if (dsl_dataset_might_destroy_origin(origin)) {
1058                 char *name;
1059                 int namelen;
1060                 int error;
1061
1062                 namelen = dsl_dataset_namelen(origin) + 1;
1063                 name = kmem_alloc(namelen, KM_SLEEP);
1064                 dsl_dataset_name(origin, name);
1065 #ifdef _KERNEL
1066                 error = zfs_unmount_snap(name, NULL);
1067                 if (error) {
1068                         kmem_free(name, namelen);
1069                         return (error);
1070                 }
1071 #endif
1072                 error = dsl_dataset_own(name, B_TRUE, tag, &origin);
1073                 kmem_free(name, namelen);
1074                 if (error)
1075                         return (error);
1076                 dsda->rm_origin = origin;
1077                 dsl_dataset_make_exclusive(origin, tag);
1078         }
1079
1080         return (0);
1081 }
1082
/*
 * Destroy the dataset ds.
 *
 * ds must be opened as OWNER (with tag).  On return (whether successful or
 * not), ds will be closed and caller can no longer dereference it.
 *
 * Snapshots are destroyed with a single sync task; defer is only honoured
 * for snapshots and yields EINVAL for any other dataset.  For a head
 * dataset the objects are first freed in open context, and then the
 * dataset and its dsl_dir are destroyed together in one sync task group.
 *
 * Returns 0 on success or an errno value.
 */
int
dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
{
        int err;
        dsl_sync_task_group_t *dstg;
        objset_t *os;
        dsl_dir_t *dd;
        uint64_t obj;
        struct dsl_ds_destroyarg dsda = { 0 };
        dsl_dataset_t dummy_ds = { 0 };

        dsda.ds = ds;

        if (dsl_dataset_is_snapshot(ds)) {
                /* Destroying a snapshot is simpler */
                dsl_dataset_make_exclusive(ds, tag);

                dsda.defer = defer;
                err = dsl_sync_task_do(ds->ds_dir->dd_pool,
                    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
                    &dsda, tag, 0);
                ASSERT3P(dsda.rm_origin, ==, NULL);
                goto out;
        } else if (defer) {
                /* Deferred destroy was requested for a non-snapshot. */
                err = EINVAL;
                goto out;
        }

        /*
         * dummy_ds carries only the dsl_dir and object number; it is the
         * argument given to the dsl_dir destroy task further down.
         */
        dd = ds->ds_dir;
        dummy_ds.ds_dir = dd;
        dummy_ds.ds_object = ds->ds_object;

        /*
         * Check for errors and mark this ds as inconsistent, in
         * case we crash while freeing the objects.
         */
        err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
            dsl_dataset_destroy_begin_sync, ds, NULL, 0);
        if (err)
                goto out;

        err = dmu_objset_from_ds(ds, &os);
        if (err)
                goto out;

        /*
         * remove the objects in open context, so that we won't
         * have too much to do in syncing context.
         *
         * The walk ends when dmu_object_next() returns non-zero; ESRCH
         * is the normal end, anything else aborts the destroy.
         */
        for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
            ds->ds_phys->ds_prev_snap_txg)) {
                /*
                 * Ignore errors, if there is not enough disk space
                 * we will deal with it in dsl_dataset_destroy_sync().
                 */
                (void) dmu_free_object(os, obj);
        }
        if (err != ESRCH)
                goto out;

        /*
         * Only the ZIL knows how to free log blocks.
         */
        zil_destroy(dmu_objset_zil(os), B_FALSE);

        /*
         * Sync out all in-flight IO.
         */
        txg_wait_synced(dd->dd_pool, 0);

        /*
         * If we managed to free all the objects in open
         * context, the user space accounting should be zero.
         */
        if (ds->ds_phys->ds_bp.blk_fill == 0 &&
            dmu_objset_userused_enabled(os)) {
                uint64_t count;

                ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||
                    count == 0);
                ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 ||
                    count == 0);
        }

        /*
         * Open the dsl_dir with FTAG; dd is overwritten with the handle
         * returned by dsl_dir_open_obj().
         */
        rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
        err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
        rw_exit(&dd->dd_pool->dp_config_rwlock);

        if (err)
                goto out;

        /*
         * Blow away the dsl_dir + head dataset.
         */
        dsl_dataset_make_exclusive(ds, tag);
        /*
         * If we're removing a clone, we might also need to remove its
         * origin.
         *
         * The check function sets dsda.need_prep when it finds that the
         * origin has to go too but was not prepared; in that case the
         * whole attempt is repeated.
         */
        do {
                dsda.need_prep = B_FALSE;
                if (dsl_dir_is_clone(dd)) {
                        err = dsl_dataset_origin_rm_prep(&dsda, tag);
                        if (err) {
                                dsl_dir_close(dd, FTAG);
                                goto out;
                        }
                }

                dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
                dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
                    dsl_dataset_destroy_sync, &dsda, tag, 0);
                dsl_sync_task_create(dstg, dsl_dir_destroy_check,
                    dsl_dir_destroy_sync, &dummy_ds, FTAG, 0);
                err = dsl_sync_task_group_wait(dstg);
                dsl_sync_task_group_destroy(dstg);

                /*
                 * We could be racing against 'zfs release' or 'zfs destroy -d'
                 * on the origin snap, in which case we can get EBUSY if we
                 * needed to destroy the origin snap but were not ready to
                 * do so.
                 */
                if (dsda.need_prep) {
                        ASSERT(err == EBUSY);
                        ASSERT(dsl_dir_is_clone(dd));
                        ASSERT(dsda.rm_origin == NULL);
                }
        } while (dsda.need_prep);

        /* Give back the origin taken by dsl_dataset_origin_rm_prep(). */
        if (dsda.rm_origin != NULL)
                dsl_dataset_disown(dsda.rm_origin, tag);

        /* if it is successful, dsl_dir_destroy_sync will close the dd */
        if (err)
                dsl_dir_close(dd, FTAG);
out:
        /* Every path, successful or not, gives up ownership of ds. */
        dsl_dataset_disown(ds, tag);
        return (err);
}
1227
1228 blkptr_t *
1229 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
1230 {
1231         return (&ds->ds_phys->ds_bp);
1232 }
1233
1234 void
1235 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
1236 {
1237         ASSERT(dmu_tx_is_syncing(tx));
1238         /* If it's the meta-objset, set dp_meta_rootbp */
1239         if (ds == NULL) {
1240                 tx->tx_pool->dp_meta_rootbp = *bp;
1241         } else {
1242                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1243                 ds->ds_phys->ds_bp = *bp;
1244         }
1245 }
1246
1247 spa_t *
1248 dsl_dataset_get_spa(dsl_dataset_t *ds)
1249 {
1250         return (ds->ds_dir->dd_pool->dp_spa);
1251 }
1252
1253 void
1254 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
1255 {
1256         dsl_pool_t *dp;
1257
1258         if (ds == NULL) /* this is the meta-objset */
1259                 return;
1260
1261         ASSERT(ds->ds_objset != NULL);
1262
1263         if (ds->ds_phys->ds_next_snap_obj != 0)
1264                 panic("dirtying snapshot!");
1265
1266         dp = ds->ds_dir->dd_pool;
1267
1268         if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
1269                 /* up the hold count until we can be written out */
1270                 dmu_buf_add_ref(ds->ds_dbuf, ds);
1271         }
1272 }
1273
1274 /*
1275  * The unique space in the head dataset can be calculated by subtracting
1276  * the space used in the most recent snapshot, that is still being used
1277  * in this file system, from the space currently in use.  To figure out
1278  * the space in the most recent snapshot still in use, we need to take
1279  * the total space used in the snapshot and subtract out the space that
1280  * has been freed up since the snapshot was taken.
1281  */
1282 static void
1283 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
1284 {
1285         uint64_t mrs_used;
1286         uint64_t dlused, dlcomp, dluncomp;
1287
1288         ASSERT(!dsl_dataset_is_snapshot(ds));
1289
1290         if (ds->ds_phys->ds_prev_snap_obj != 0)
1291                 mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
1292         else
1293                 mrs_used = 0;
1294
1295         dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
1296
1297         ASSERT3U(dlused, <=, mrs_used);
1298         ds->ds_phys->ds_unique_bytes =
1299             ds->ds_phys->ds_used_bytes - (mrs_used - dlused);
1300
1301         if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
1302             SPA_VERSION_UNIQUE_ACCURATE)
1303                 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1304 }
1305
/* Context that kill_blkptr() receives through its arg parameter. */
struct killarg {
        dsl_dataset_t *ds;      /* dataset the visited blocks belong to */
        dmu_tx_t *tx;           /* transaction the blocks are freed in */
};
1310
1311 /* ARGSUSED */
1312 static int
1313 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
1314     const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1315 {
1316         struct killarg *ka = arg;
1317         dmu_tx_t *tx = ka->tx;
1318
1319         if (bp == NULL)
1320                 return (0);
1321
1322         if (zb->zb_level == ZB_ZIL_LEVEL) {
1323                 ASSERT(zilog != NULL);
1324                 /*
1325                  * It's a block in the intent log.  It has no
1326                  * accounting, so just free it.
1327                  */
1328                 dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
1329         } else {
1330                 ASSERT(zilog == NULL);
1331                 ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
1332                 (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
1333         }
1334
1335         return (0);
1336 }
1337
1338 /* ARGSUSED */
1339 static int
1340 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
1341 {
1342         dsl_dataset_t *ds = arg1;
1343         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1344         uint64_t count;
1345         int err;
1346
1347         /*
1348          * Can't delete a head dataset if there are snapshots of it.
1349          * (Except if the only snapshots are from the branch we cloned
1350          * from.)
1351          */
1352         if (ds->ds_prev != NULL &&
1353             ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
1354                 return (EBUSY);
1355
1356         /*
1357          * This is really a dsl_dir thing, but check it here so that
1358          * we'll be less likely to leave this dataset inconsistent &
1359          * nearly destroyed.
1360          */
1361         err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
1362         if (err)
1363                 return (err);
1364         if (count != 0)
1365                 return (EEXIST);
1366
1367         return (0);
1368 }
1369
1370 /* ARGSUSED */
1371 static void
1372 dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1373 {
1374         dsl_dataset_t *ds = arg1;
1375         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1376
1377         /* Mark it as inconsistent on-disk, in case we crash */
1378         dmu_buf_will_dirty(ds->ds_dbuf, tx);
1379         ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
1380
1381         spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
1382             "dataset = %llu", ds->ds_object);
1383 }
1384
1385 static int
1386 dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
1387     dmu_tx_t *tx)
1388 {
1389         dsl_dataset_t *ds = dsda->ds;
1390         dsl_dataset_t *ds_prev = ds->ds_prev;
1391
1392         if (dsl_dataset_might_destroy_origin(ds_prev)) {
1393                 struct dsl_ds_destroyarg ndsda = {0};
1394
1395                 /*
1396                  * If we're not prepared to remove the origin, don't remove
1397                  * the clone either.
1398                  */
1399                 if (dsda->rm_origin == NULL) {
1400                         dsda->need_prep = B_TRUE;
1401                         return (EBUSY);
1402                 }
1403
1404                 ndsda.ds = ds_prev;
1405                 ndsda.is_origin_rm = B_TRUE;
1406                 return (dsl_dataset_destroy_check(&ndsda, tag, tx));
1407         }
1408
1409         /*
1410          * If we're not going to remove the origin after all,
1411          * undo the open context setup.
1412          */
1413         if (dsda->rm_origin != NULL) {
1414                 dsl_dataset_disown(dsda->rm_origin, tag);
1415                 dsda->rm_origin = NULL;
1416         }
1417
1418         return (0);
1419 }
1420
/*
 * Sync-task check function for destroying a dataset.  arg1 is a
 * struct dsl_ds_destroyarg; arg2 is the tag that is passed on to
 * dsl_dataset_origin_check().
 *
 * Returns 0 if the destroy may proceed, otherwise:
 *      ENOTSUP  deferred destroy requested on a pool older than
 *               SPA_VERSION_USERREFS
 *      EBUSY    the dataset still has snapshots of its own, or it is a
 *               snapshot with user references that is not being released
 *      EAGAIN   the dataset was modified in this txg
 *      EEXIST   the snapshot is a branch point
 * or whatever dsl_dataset_origin_check() returns for a clone.
 *
 * If you add new checks here, you may need to add
 * additional checks to the "temporary" case in
 * snapshot_check() in dmu_objset.c.
 */
/* ARGSUSED */
int
dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
        struct dsl_ds_destroyarg *dsda = arg1;
        dsl_dataset_t *ds = dsda->ds;

        /* we have an owner hold, so no one else can destroy us */
        ASSERT(!DSL_DATASET_IS_DESTROYED(ds));

        /*
         * Only allow deferred destroy on pools that support it.
         * NOTE: deferred destroy is only supported on snapshots.
         */
        if (dsda->defer) {
                if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
                    SPA_VERSION_USERREFS)
                        return (ENOTSUP);
                ASSERT(dsl_dataset_is_snapshot(ds));
                return (0);
        }

        /*
         * Can't delete a head dataset if there are snapshots of it.
         * (Except if the only snapshots are from the branch we cloned
         * from.)
         */
        if (ds->ds_prev != NULL &&
            ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
                return (EBUSY);

        /*
         * If we made changes this txg, traverse_dsl_dataset won't find
         * them.  Try again.
         */
        if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
                return (EAGAIN);

        if (dsl_dataset_is_snapshot(ds)) {
                /*
                 * If this snapshot has an elevated user reference count,
                 * we can't destroy it yet.
                 */
                if (ds->ds_userrefs > 0 && !dsda->releasing)
                        return (EBUSY);

                mutex_enter(&ds->ds_lock);
                /*
                 * Can't delete a branch point. However, if we're destroying
                 * a clone and removing its origin due to it having a user
                 * hold count of 0 and having been marked for deferred destroy,
                 * it's OK for the origin to have a single clone.
                 */
                if (ds->ds_phys->ds_num_children >
                    (dsda->is_origin_rm ? 2 : 1)) {
                        mutex_exit(&ds->ds_lock);
                        return (EEXIST);
                }
                mutex_exit(&ds->ds_lock);
        } else if (dsl_dir_is_clone(ds->ds_dir)) {
                /* A clone: the origin may have to be checked as well. */
                return (dsl_dataset_origin_check(dsda, arg2, tx));
        }

        /* XXX we should do some i/o error checking... */
        return (0);
}
1492
/*
 * Rendezvous state shared between dsl_dataset_drain_refs() and its
 * callback dsl_dataset_refs_gone().
 */
struct refsarg {
        kmutex_t lock;          /* protects gone */
        boolean_t gone;         /* set to TRUE by dsl_dataset_refs_gone() */
        kcondvar_t cv;          /* signalled when gone is set */
};
1498
1499 /* ARGSUSED */
1500 static void
1501 dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
1502 {
1503         struct refsarg *arg = argv;
1504
1505         mutex_enter(&arg->lock);
1506         arg->gone = TRUE;
1507         cv_signal(&arg->cv);
1508         mutex_exit(&arg->lock);
1509 }
1510
/*
 * Release our hold (tag) on ds's dbuf and block until
 * dsl_dataset_refs_gone() has been called for it, then clear ds_dbuf and
 * ds_phys.
 */
static void
dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
{
        struct refsarg arg;

        bzero(&arg, sizeof(arg));
        mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
        arg.gone = FALSE;
        /*
         * Re-register the dbuf user: arg replaces ds as the user pointer
         * and dsl_dataset_refs_gone() becomes the callback.
         * NOTE(review): presumably the callback fires once the last hold
         * on the dbuf is released -- confirm against dmu_buf_update_user().
         */
        (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
            dsl_dataset_refs_gone);
        dmu_buf_rele(ds->ds_dbuf, tag);
        /* Wait for the callback to flag that the references are gone. */
        mutex_enter(&arg.lock);
        while (!arg.gone)
                cv_wait(&arg.cv, &arg.lock);
        ASSERT(arg.gone);
        mutex_exit(&arg.lock);
        ds->ds_dbuf = NULL;
        ds->ds_phys = NULL;
        mutex_destroy(&arg.lock);
        cv_destroy(&arg.cv);
}
1533
/*
 * Remove the entry for clone object obj from ds's next_clones ZAP.
 *
 * A missing entry (ENOENT) is tolerated, for the reason given below; any
 * other failure of zap_remove_int() trips a VERIFY.
 */
static void
remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
{
        objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
        uint64_t count;
        int err;

        ASSERT(ds->ds_phys->ds_num_children >= 2);
        err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
        /*
         * The err should not be ENOENT, but a bug in a previous version
         * of the code could cause upgrade_clones_cb() to not set
         * ds_next_snap_obj when it should, leading to a missing entry.
         * If we knew that the pool was created after
         * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
         * ENOENT.  However, at least we can check that we don't have
         * too many entries in the next_clones_obj even after failing to
         * remove this one.
         */
        if (err != ENOENT) {
                VERIFY3U(err, ==, 0);
        }
        /* count is filled in by the zap_count() call inside the assertion. */
        ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
            &count));
        ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
}
1560
1561 static void
1562 dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
1563 {
1564         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1565         zap_cursor_t zc;
1566         zap_attribute_t za;
1567
1568         /*
1569          * If it is the old version, dd_clones doesn't exist so we can't
1570          * find the clones, but deadlist_remove_key() is a no-op so it
1571          * doesn't matter.
1572          */
1573         if (ds->ds_dir->dd_phys->dd_clones == 0)
1574                 return;
1575
1576         for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
1577             zap_cursor_retrieve(&zc, &za) == 0;
1578             zap_cursor_advance(&zc)) {
1579                 dsl_dataset_t *clone;
1580
1581                 VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
1582                     za.za_first_integer, FTAG, &clone));
1583                 if (clone->ds_dir->dd_origin_txg > mintxg) {
1584                         dsl_deadlist_remove_key(&clone->ds_deadlist,
1585                             mintxg, tx);
1586                         dsl_dataset_remove_clones_key(clone, mintxg, tx);
1587                 }
1588                 dsl_dataset_rele(clone, FTAG);
1589         }
1590         zap_cursor_fini(&zc);
1591 }
1592
/* State shared between process_old_deadlist() and process_old_cb(). */
struct process_old_arg {
        dsl_dataset_t *ds;              /* dataset whose deadlist is updated */
        dsl_dataset_t *ds_prev;         /* previous snapshot; may be NULL */
        boolean_t after_branch_point;   /* if set, ds_prev's unique is left alone */
        zio_t *pio;                     /* parent zio passed to dsl_free_sync() */
        uint64_t used, comp, uncomp;    /* running totals of the freed blocks */
};
1600
1601 static int
1602 process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1603 {
1604         struct process_old_arg *poa = arg;
1605         dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
1606
1607         if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
1608                 dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
1609                 if (poa->ds_prev && !poa->after_branch_point &&
1610                     bp->blk_birth >
1611                     poa->ds_prev->ds_phys->ds_prev_snap_txg) {
1612                         poa->ds_prev->ds_phys->ds_unique_bytes +=
1613                             bp_get_dsize_sync(dp->dp_spa, bp);
1614                 }
1615         } else {
1616                 poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
1617                 poa->comp += BP_GET_PSIZE(bp);
1618                 poa->uncomp += BP_GET_UCSIZE(bp);
1619                 dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
1620         }
1621         return (0);
1622 }
1623
/*
 * Process ds_next's old-format deadlist on behalf of ds.
 *
 * Every block on ds_next's deadlist is run through process_old_cb(); the
 * space of the blocks freed there is subtracted from the dsl_dir's
 * DD_USED_SNAP accounting.  Finally the deadlist objects of ds and
 * ds_next are swapped.  Both deadlists must be in the old format.
 */
static void
process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
{
        struct process_old_arg poa = { 0 };
        dsl_pool_t *dp = ds->ds_dir->dd_pool;
        objset_t *mos = dp->dp_meta_objset;

        ASSERT(ds->ds_deadlist.dl_oldfmt);
        ASSERT(ds_next->ds_deadlist.dl_oldfmt);

        poa.ds = ds;
        poa.ds_prev = ds_prev;
        poa.after_branch_point = after_branch_point;
        /* The frees issued by the callback hang off this root zio. */
        poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
        VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
            process_old_cb, &poa, tx));
        VERIFY3U(zio_wait(poa.pio), ==, 0);
        /* What was freed must match the unique space recorded for ds. */
        ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);

        /* change snapused */
        dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
            -poa.used, -poa.comp, -poa.uncomp, tx);

        /*
         * swap next's deadlist to our deadlist: close both in-core
         * deadlists, exchange the on-disk object numbers, then reopen.
         */
        dsl_deadlist_close(&ds->ds_deadlist);
        dsl_deadlist_close(&ds_next->ds_deadlist);
        SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
            ds->ds_phys->ds_deadlist_obj);
        dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
        dsl_deadlist_open(&ds_next->ds_deadlist, mos,
            ds_next->ds_phys->ds_deadlist_obj);
}
1657
1658 void
1659 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
1660 {
1661         struct dsl_ds_destroyarg *dsda = arg1;
1662         dsl_dataset_t *ds = dsda->ds;
1663         int err;
1664         int after_branch_point = FALSE;
1665         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1666         objset_t *mos = dp->dp_meta_objset;
1667         dsl_dataset_t *ds_prev = NULL;
1668         boolean_t wont_destroy;
1669         uint64_t obj;
1670
1671         wont_destroy = (dsda->defer &&
1672             (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));
1673
1674         ASSERT(ds->ds_owner || wont_destroy);
1675         ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
1676         ASSERT(ds->ds_prev == NULL ||
1677             ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
1678         ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
1679
1680         if (wont_destroy) {
1681                 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
1682                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1683                 ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
1684                 return;
1685         }
1686
1687         /* signal any waiters that this dataset is going away */
1688         mutex_enter(&ds->ds_lock);
1689         ds->ds_owner = dsl_reaper;
1690         cv_broadcast(&ds->ds_exclusive_cv);
1691         mutex_exit(&ds->ds_lock);
1692
1693         /* Remove our reservation */
1694         if (ds->ds_reserved != 0) {
1695                 dsl_prop_setarg_t psa;
1696                 uint64_t value = 0;
1697
1698                 dsl_prop_setarg_init_uint64(&psa, "refreservation",
1699                     (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
1700                     &value);
1701                 psa.psa_effective_value = 0;    /* predict default value */
1702
1703                 dsl_dataset_set_reservation_sync(ds, &psa, tx);
1704                 ASSERT3U(ds->ds_reserved, ==, 0);
1705         }
1706
1707         ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
1708
1709         dsl_scan_ds_destroyed(ds, tx);
1710
1711         obj = ds->ds_object;
1712
1713         if (ds->ds_phys->ds_prev_snap_obj != 0) {
1714                 if (ds->ds_prev) {
1715                         ds_prev = ds->ds_prev;
1716                 } else {
1717                         VERIFY(0 == dsl_dataset_hold_obj(dp,
1718                             ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
1719                 }
1720                 after_branch_point =
1721                     (ds_prev->ds_phys->ds_next_snap_obj != obj);
1722
1723                 dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
1724                 if (after_branch_point &&
1725                     ds_prev->ds_phys->ds_next_clones_obj != 0) {
1726                         remove_from_next_clones(ds_prev, obj, tx);
1727                         if (ds->ds_phys->ds_next_snap_obj != 0) {
1728                                 VERIFY(0 == zap_add_int(mos,
1729                                     ds_prev->ds_phys->ds_next_clones_obj,
1730                                     ds->ds_phys->ds_next_snap_obj, tx));
1731                         }
1732                 }
1733                 if (after_branch_point &&
1734                     ds->ds_phys->ds_next_snap_obj == 0) {
1735                         /* This clone is toast. */
1736                         ASSERT(ds_prev->ds_phys->ds_num_children > 1);
1737                         ds_prev->ds_phys->ds_num_children--;
1738
1739                         /*
1740                          * If the clone's origin has no other clones, no
1741                          * user holds, and has been marked for deferred
1742                          * deletion, then we should have done the necessary
1743                          * destroy setup for it.
1744                          */
1745                         if (ds_prev->ds_phys->ds_num_children == 1 &&
1746                             ds_prev->ds_userrefs == 0 &&
1747                             DS_IS_DEFER_DESTROY(ds_prev)) {
1748                                 ASSERT3P(dsda->rm_origin, !=, NULL);
1749                         } else {
1750                                 ASSERT3P(dsda->rm_origin, ==, NULL);
1751                         }
1752                 } else if (!after_branch_point) {
1753                         ds_prev->ds_phys->ds_next_snap_obj =
1754                             ds->ds_phys->ds_next_snap_obj;
1755                 }
1756         }
1757
1758         if (dsl_dataset_is_snapshot(ds)) {
1759                 dsl_dataset_t *ds_next;
1760                 uint64_t old_unique;
1761                 uint64_t used = 0, comp = 0, uncomp = 0;
1762
1763                 VERIFY(0 == dsl_dataset_hold_obj(dp,
1764                     ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
1765                 ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
1766
1767                 old_unique = ds_next->ds_phys->ds_unique_bytes;
1768
1769                 dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
1770                 ds_next->ds_phys->ds_prev_snap_obj =
1771                     ds->ds_phys->ds_prev_snap_obj;
1772                 ds_next->ds_phys->ds_prev_snap_txg =
1773                     ds->ds_phys->ds_prev_snap_txg;
1774                 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
1775                     ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
1776
1777
1778                 if (ds_next->ds_deadlist.dl_oldfmt) {
1779                         process_old_deadlist(ds, ds_prev, ds_next,
1780                             after_branch_point, tx);
1781                 } else {
1782                         /* Adjust prev's unique space. */
1783                         if (ds_prev && !after_branch_point) {
1784                                 dsl_deadlist_space_range(&ds_next->ds_deadlist,
1785                                     ds_prev->ds_phys->ds_prev_snap_txg,
1786                                     ds->ds_phys->ds_prev_snap_txg,
1787                                     &used, &comp, &uncomp);
1788                                 ds_prev->ds_phys->ds_unique_bytes += used;
1789                         }
1790
1791                         /* Adjust snapused. */
1792                         dsl_deadlist_space_range(&ds_next->ds_deadlist,
1793                             ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
1794                             &used, &comp, &uncomp);
1795                         dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1796                             -used, -comp, -uncomp, tx);
1797
1798                         /* Move blocks to be freed to pool's free list. */
1799                         dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
1800                             &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
1801                             tx);
1802                         dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
1803                             DD_USED_HEAD, used, comp, uncomp, tx);
1804                         dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx);
1805
1806                         /* Merge our deadlist into next's and free it. */
1807                         dsl_deadlist_merge(&ds_next->ds_deadlist,
1808                             ds->ds_phys->ds_deadlist_obj, tx);
1809                 }
1810                 dsl_deadlist_close(&ds->ds_deadlist);
1811                 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1812
1813                 /* Collapse range in clone heads */
1814                 dsl_dataset_remove_clones_key(ds,
1815                     ds->ds_phys->ds_creation_txg, tx);
1816
1817                 if (dsl_dataset_is_snapshot(ds_next)) {
1818                         dsl_dataset_t *ds_nextnext;
1819
1820                         /*
1821                          * Update next's unique to include blocks which
1822                          * were previously shared by only this snapshot
1823                          * and it.  Those blocks will be born after the
1824                          * prev snap and before this snap, and will have
1825                          * died after the next snap and before the one
1826                          * after that (ie. be on the snap after next's
1827                          * deadlist).
1828                          */
1829                         VERIFY(0 == dsl_dataset_hold_obj(dp,
1830                             ds_next->ds_phys->ds_next_snap_obj,
1831                             FTAG, &ds_nextnext));
1832                         dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
1833                             ds->ds_phys->ds_prev_snap_txg,
1834                             ds->ds_phys->ds_creation_txg,
1835                             &used, &comp, &uncomp);
1836                         ds_next->ds_phys->ds_unique_bytes += used;
1837                         dsl_dataset_rele(ds_nextnext, FTAG);
1838                         ASSERT3P(ds_next->ds_prev, ==, NULL);
1839
1840                         /* Collapse range in this head. */
1841                         dsl_dataset_t *hds;
1842                         VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
1843                             ds->ds_dir->dd_phys->dd_head_dataset_obj,
1844                             FTAG, &hds));
1845                         dsl_deadlist_remove_key(&hds->ds_deadlist,
1846                             ds->ds_phys->ds_creation_txg, tx);
1847                         dsl_dataset_rele(hds, FTAG);
1848
1849                 } else {
1850                         ASSERT3P(ds_next->ds_prev, ==, ds);
1851                         dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
1852                         ds_next->ds_prev = NULL;
1853                         if (ds_prev) {
1854                                 VERIFY(0 == dsl_dataset_get_ref(dp,
1855                                     ds->ds_phys->ds_prev_snap_obj,
1856                                     ds_next, &ds_next->ds_prev));
1857                         }
1858
1859                         dsl_dataset_recalc_head_uniq(ds_next);
1860
1861                         /*
1862                          * Reduce the amount of our unconsmed refreservation
1863                          * being charged to our parent by the amount of
1864                          * new unique data we have gained.
1865                          */
1866                         if (old_unique < ds_next->ds_reserved) {
1867                                 int64_t mrsdelta;
1868                                 uint64_t new_unique =
1869                                     ds_next->ds_phys->ds_unique_bytes;
1870
1871                                 ASSERT(old_unique <= new_unique);
1872                                 mrsdelta = MIN(new_unique - old_unique,
1873                                     ds_next->ds_reserved - old_unique);
1874                                 dsl_dir_diduse_space(ds->ds_dir,
1875                                     DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
1876                         }
1877                 }
1878                 dsl_dataset_rele(ds_next, FTAG);
1879         } else {
1880                 /*
1881                  * There's no next snapshot, so this is a head dataset.
1882                  * Destroy the deadlist.  Unless it's a clone, the
1883                  * deadlist should be empty.  (If it's a clone, it's
1884                  * safe to ignore the deadlist contents.)
1885                  */
1886                 struct killarg ka;
1887
1888                 dsl_deadlist_close(&ds->ds_deadlist);
1889                 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1890                 ds->ds_phys->ds_deadlist_obj = 0;
1891
1892                 /*
1893                  * Free everything that we point to (that's born after
1894                  * the previous snapshot, if we are a clone)
1895                  *
1896                  * NB: this should be very quick, because we already
1897                  * freed all the objects in open context.
1898                  */
1899                 ka.ds = ds;
1900                 ka.tx = tx;
1901                 err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
1902                     TRAVERSE_POST, kill_blkptr, &ka);
1903                 ASSERT3U(err, ==, 0);
1904                 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
1905                     ds->ds_phys->ds_unique_bytes == 0);
1906
1907                 if (ds->ds_prev != NULL) {
1908                         if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
1909                                 VERIFY3U(0, ==, zap_remove_int(mos,
1910                                     ds->ds_prev->ds_dir->dd_phys->dd_clones,
1911                                     ds->ds_object, tx));
1912                         }
1913                         dsl_dataset_rele(ds->ds_prev, ds);
1914                         ds->ds_prev = ds_prev = NULL;
1915                 }
1916         }
1917
1918         /*
1919          * This must be done after the dsl_traverse(), because it will
1920          * re-open the objset.
1921          */
1922         if (ds->ds_objset) {
1923                 dmu_objset_evict(ds->ds_objset);
1924                 ds->ds_objset = NULL;
1925         }
1926
1927         if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
1928                 /* Erase the link in the dir */
1929                 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
1930                 ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
1931                 ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
1932                 err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
1933                 ASSERT(err == 0);
1934         } else {
1935                 /* remove from snapshot namespace */
1936                 dsl_dataset_t *ds_head;
1937                 ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
1938                 VERIFY(0 == dsl_dataset_hold_obj(dp,
1939                     ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
1940                 VERIFY(0 == dsl_dataset_get_snapname(ds));
1941 #ifdef ZFS_DEBUG
1942                 {
1943                         uint64_t val;
1944
1945                         err = dsl_dataset_snap_lookup(ds_head,
1946                             ds->ds_snapname, &val);
1947                         ASSERT3U(err, ==, 0);
1948                         ASSERT3U(val, ==, obj);
1949                 }
1950 #endif
1951                 err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
1952                 ASSERT(err == 0);
1953                 dsl_dataset_rele(ds_head, FTAG);
1954         }
1955
1956         if (ds_prev && ds->ds_prev != ds_prev)
1957                 dsl_dataset_rele(ds_prev, FTAG);
1958
1959         spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
1960         spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx,
1961             "dataset = %llu", ds->ds_object);
1962
1963         if (ds->ds_phys->ds_next_clones_obj != 0) {
1964                 uint64_t count;
1965                 ASSERT(0 == zap_count(mos,
1966                     ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
1967                 VERIFY(0 == dmu_object_free(mos,
1968                     ds->ds_phys->ds_next_clones_obj, tx));
1969         }
1970         if (ds->ds_phys->ds_props_obj != 0)
1971                 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
1972         if (ds->ds_phys->ds_userrefs_obj != 0)
1973                 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
1974         dsl_dir_close(ds->ds_dir, ds);
1975         ds->ds_dir = NULL;
1976         dsl_dataset_drain_refs(ds, tag);
1977         VERIFY(0 == dmu_object_free(mos, obj, tx));
1978
1979         if (dsda->rm_origin) {
1980                 /*
1981                  * Remove the origin of the clone we just destroyed.
1982                  */
1983                 struct dsl_ds_destroyarg ndsda = {0};
1984
1985                 ndsda.ds = dsda->rm_origin;
1986                 dsl_dataset_destroy_sync(&ndsda, tag, tx);
1987         }
1988 }
1989
1990 static int
1991 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
1992 {
1993         uint64_t asize;
1994
1995         if (!dmu_tx_is_syncing(tx))
1996                 return (0);
1997
1998         /*
1999          * If there's an fs-only reservation, any blocks that might become
2000          * owned by the snapshot dataset must be accommodated by space
2001          * outside of the reservation.
2002          */
2003         ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
2004         asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
2005         if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
2006                 return (ENOSPC);
2007
2008         /*
2009          * Propogate any reserved space for this snapshot to other
2010          * snapshot checks in this sync group.
2011          */
2012         if (asize > 0)
2013                 dsl_dir_willuse_space(ds->ds_dir, asize, tx);
2014
2015         return (0);
2016 }
2017
2018 int
2019 dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
2020 {
2021         dsl_dataset_t *ds = arg1;
2022         const char *snapname = arg2;
2023         int err;
2024         uint64_t value;
2025
2026         /*
2027          * We don't allow multiple snapshots of the same txg.  If there
2028          * is already one, try again.
2029          */
2030         if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
2031                 return (EAGAIN);
2032
2033         /*
2034          * Check for conflicting name snapshot name.
2035          */
2036         err = dsl_dataset_snap_lookup(ds, snapname, &value);
2037         if (err == 0)
2038                 return (EEXIST);
2039         if (err != ENOENT)
2040                 return (err);
2041
2042         /*
2043          * Check that the dataset's name is not too long.  Name consists
2044          * of the dataset's length + 1 for the @-sign + snapshot name's length
2045          */
2046         if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
2047                 return (ENAMETOOLONG);
2048
2049         err = dsl_dataset_snapshot_reserve_space(ds, tx);
2050         if (err)
2051                 return (err);
2052
2053         ds->ds_trysnap_txg = tx->tx_txg;
2054         return (0);
2055 }
2056
2057 void
2058 dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
2059 {
2060         dsl_dataset_t *ds = arg1;
2061         const char *snapname = arg2;
2062         dsl_pool_t *dp = ds->ds_dir->dd_pool;
2063         dmu_buf_t *dbuf;
2064         dsl_dataset_phys_t *dsphys;
2065         uint64_t dsobj, crtxg;
2066         objset_t *mos = dp->dp_meta_objset;
2067         int err;
2068
2069         ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
2070
2071         /*
2072          * The origin's ds_creation_txg has to be < TXG_INITIAL
2073          */
2074         if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
2075                 crtxg = 1;
2076         else
2077                 crtxg = tx->tx_txg;
2078
2079         dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
2080             DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
2081         VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
2082         dmu_buf_will_dirty(dbuf, tx);
2083         dsphys = dbuf->db_data;
2084         bzero(dsphys, sizeof (dsl_dataset_phys_t));
2085         dsphys->ds_dir_obj = ds->ds_dir->dd_object;
2086         dsphys->ds_fsid_guid = unique_create();
2087         do {
2088                 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
2089                     sizeof (dsphys->ds_guid));
2090         } while (dsphys->ds_guid == 0);
2091         dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
2092         dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
2093         dsphys->ds_next_snap_obj = ds->ds_object;
2094         dsphys->ds_num_children = 1;
2095         dsphys->ds_creation_time = gethrestime_sec();
2096         dsphys->ds_creation_txg = crtxg;
2097         dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
2098         dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
2099         dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
2100         dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
2101         dsphys->ds_flags = ds->ds_phys->ds_flags;
2102         dsphys->ds_bp = ds->ds_phys->ds_bp;
2103         dmu_buf_rele(dbuf, FTAG);
2104
2105         ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
2106         if (ds->ds_prev) {
2107                 uint64_t next_clones_obj =
2108                     ds->ds_prev->ds_phys->ds_next_clones_obj;
2109                 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
2110                     ds->ds_object ||
2111                     ds->ds_prev->ds_phys->ds_num_children > 1);
2112                 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
2113                         dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
2114                         ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
2115                             ds->ds_prev->ds_phys->ds_creation_txg);
2116                         ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
2117                 } else if (next_clones_obj != 0) {
2118                         remove_from_next_clones(ds->ds_prev,
2119                             dsphys->ds_next_snap_obj, tx);
2120                         VERIFY3U(0, ==, zap_add_int(mos,
2121                             next_clones_obj, dsobj, tx));
2122                 }
2123         }
2124
2125         /*
2126          * If we have a reference-reservation on this dataset, we will
2127          * need to increase the amount of refreservation being charged
2128          * since our unique space is going to zero.
2129          */
2130         if (ds->ds_reserved) {
2131                 int64_t delta;
2132                 ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
2133                 delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
2134                 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
2135                     delta, 0, 0, tx);
2136         }
2137
2138         dmu_buf_will_dirty(ds->ds_dbuf, tx);
2139         zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu",
2140             ds->ds_dir->dd_myname, snapname, dsobj,
2141             ds->ds_phys->ds_prev_snap_txg);
2142         ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
2143             UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
2144         dsl_deadlist_close(&ds->ds_deadlist);
2145         dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
2146         dsl_deadlist_add_key(&ds->ds_deadlist,
2147             ds->ds_phys->ds_prev_snap_txg, tx);
2148
2149         ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
2150         ds->ds_phys->ds_prev_snap_obj = dsobj;
2151         ds->ds_phys->ds_prev_snap_txg = crtxg;
2152         ds->ds_phys->ds_unique_bytes = 0;
2153         if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
2154                 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
2155
2156         err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
2157             snapname, 8, 1, &dsobj, tx);
2158         ASSERT(err == 0);
2159
2160         if (ds->ds_prev)
2161                 dsl_dataset_drop_ref(ds->ds_prev, ds);
2162         VERIFY(0 == dsl_dataset_get_ref(dp,
2163             ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
2164
2165         dsl_scan_ds_snapshotted(ds, tx);
2166
2167         dsl_dir_snap_cmtime_update(ds->ds_dir);
2168
2169         spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx,
2170             "dataset = %llu", dsobj);
2171 }
2172
2173 void
2174 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
2175 {
2176         ASSERT(dmu_tx_is_syncing(tx));
2177         ASSERT(ds->ds_objset != NULL);
2178         ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
2179
2180         /*
2181          * in case we had to change ds_fsid_guid when we opened it,
2182          * sync it out now.
2183          */
2184         dmu_buf_will_dirty(ds->ds_dbuf, tx);
2185         ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
2186
2187         dsl_dir_dirty(ds->ds_dir, tx);
2188         dmu_objset_sync(ds->ds_objset, zio, tx);
2189 }
2190
2191 static void
2192 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
2193 {
2194         uint64_t count = 0;
2195         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
2196         zap_cursor_t zc;
2197         zap_attribute_t za;
2198         nvlist_t *propval;
2199         nvlist_t *val;
2200
2201         rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
2202         VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2203         VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2204
2205         /*
2206          * There may me missing entries in ds_next_clones_obj
2207          * due to a bug in a previous version of the code.
2208          * Only trust it if it has the right number of entries.
2209          */
2210         if (ds->ds_phys->ds_next_clones_obj != 0) {
2211                 ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
2212                     &count));
2213         }
2214         if (count != ds->ds_phys->ds_num_children - 1) {
2215                 goto fail;
2216         }
2217         for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
2218             zap_cursor_retrieve(&zc, &za) == 0;
2219             zap_cursor_advance(&zc)) {
2220                 dsl_dataset_t *clone;
2221                 char buf[ZFS_MAXNAMELEN];
2222                 if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
2223                     za.za_first_integer, FTAG, &clone) != 0) {
2224                         goto fail;
2225                 }
2226                 dsl_dir_name(clone->ds_dir, buf);
2227                 VERIFY(nvlist_add_boolean(val, buf) == 0);
2228                 dsl_dataset_rele(clone, FTAG);
2229         }
2230         zap_cursor_fini(&zc);
2231         VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0);
2232         VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
2233             propval) == 0);
2234 fail:
2235         nvlist_free(val);
2236         nvlist_free(propval);
2237         rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2238 }
2239
2240 void
2241 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
2242 {
2243         uint64_t refd, avail, uobjs, aobjs, ratio;
2244
2245         dsl_dir_stats(ds->ds_dir, nv);
2246
2247         dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
2248         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
2249         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
2250
2251         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
2252             ds->ds_phys->ds_creation_time);
2253         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
2254             ds->ds_phys->ds_creation_txg);
2255         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
2256             ds->ds_quota);
2257         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
2258             ds->ds_reserved);
2259         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
2260             ds->ds_phys->ds_guid);
2261         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
2262             ds->ds_phys->ds_unique_bytes);
2263         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
2264             ds->ds_object);
2265         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
2266             ds->ds_userrefs);
2267         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
2268             DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
2269
2270         if (ds->ds_phys->ds_prev_snap_obj != 0) {
2271                 uint64_t written, comp, uncomp;
2272                 dsl_pool_t *dp = ds->ds_dir->dd_pool;
2273                 dsl_dataset_t *prev;
2274
2275                 rw_enter(&dp->dp_config_rwlock, RW_READER);
2276                 int err = dsl_dataset_hold_obj(dp,
2277                     ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
2278                 rw_exit(&dp->dp_config_rwlock);
2279                 if (err == 0) {
2280                         err = dsl_dataset_space_written(prev, ds, &written,
2281                             &comp, &uncomp);
2282                         dsl_dataset_rele(prev, FTAG);
2283                         if (err == 0) {
2284                                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
2285                                     written);
2286                         }
2287                 }
2288         }
2289
2290         ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
2291             (ds->ds_phys->ds_uncompressed_bytes * 100 /
2292             ds->ds_phys->ds_compressed_bytes);
2293         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
2294
2295         if (ds->ds_phys->ds_next_snap_obj) {
2296                 /*
2297                  * This is a snapshot; override the dd's space used with
2298                  * our unique space and compression ratio.
2299                  */
2300                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
2301                     ds->ds_phys->ds_unique_bytes);
2302                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
2303
2304                 get_clones_stat(ds, nv);
2305         }
2306 }
2307
2308 void
2309 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
2310 {
2311         stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
2312         stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
2313         stat->dds_guid = ds->ds_phys->ds_guid;
2314         if (ds->ds_phys->ds_next_snap_obj) {
2315                 stat->dds_is_snapshot = B_TRUE;
2316                 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
2317         } else {
2318                 stat->dds_is_snapshot = B_FALSE;
2319                 stat->dds_num_clones = 0;
2320         }
2321
2322         /* clone origin is really a dsl_dir thing... */
2323         rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
2324         if (dsl_dir_is_clone(ds->ds_dir)) {
2325                 dsl_dataset_t *ods;
2326
2327                 VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool,
2328                     ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
2329                 dsl_dataset_name(ods, stat->dds_origin);
2330                 dsl_dataset_drop_ref(ods, FTAG);
2331         } else {
2332                 stat->dds_origin[0] = '\0';
2333         }
2334         rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2335 }
2336
2337 uint64_t
2338 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
2339 {
2340         return (ds->ds_fsid_guid);
2341 }
2342
     /*
      * Report space and object usage for a dataset through four out
      * parameters:
      *   refdbytesp  - ds_used_bytes of the dataset
      *   availbytesp - what dsl_dir_space_available() reports for the
      *                 containing dsl_dir, plus any part of ds_reserved that
      *                 exceeds ds_unique_bytes, then limited by ds_quota
      *                 when a quota is set (0 once the quota is reached)
      *   usedobjsp   - ds_bp.blk_fill
      *   availobjsp  - DN_MAX_OBJECT minus *usedobjsp
      */
2343 void
2344 dsl_dataset_space(dsl_dataset_t *ds,
2345     uint64_t *refdbytesp, uint64_t *availbytesp,
2346     uint64_t *usedobjsp, uint64_t *availobjsp)
2347 {
2348         *refdbytesp = ds->ds_phys->ds_used_bytes;
2349         *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
2350         if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
2351                 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
2352         if (ds->ds_quota != 0) {
2353                 /*
2354                  * Adjust available bytes according to refquota
2355                  */
2356                 if (*refdbytesp < ds->ds_quota)
2357                         *availbytesp = MIN(*availbytesp,
2358                             ds->ds_quota - *refdbytesp);
2359                 else
2360                         *availbytesp = 0;
2361         }
2362         *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
2363         *availobjsp = DN_MAX_OBJECT - *usedobjsp;
2364 }
2365
     /*
      * Report whether the dataset has changed since its previous snapshot.
      *
      * Returns B_FALSE when there is no previous snapshot (ds_prev == NULL)
      * or when ds_bp was not born after the previous snapshot's creation
      * txg.  Otherwise the meta dnodes of the two objsets are compared and
      * B_TRUE is returned only if they differ; if either objset cannot be
      * obtained the dataset is reported as modified.
      *
      * The caller must hold dp_config_rwlock or be in syncing context.
      */
2366 boolean_t
2367 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
2368 {
2369         dsl_pool_t *dp = ds->ds_dir->dd_pool;
2370
2371         ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
2372             dsl_pool_sync_context(dp));
2373         if (ds->ds_prev == NULL)
2374                 return (B_FALSE);
2375         if (ds->ds_phys->ds_bp.blk_birth >
2376             ds->ds_prev->ds_phys->ds_creation_txg) {
2377                 objset_t *os, *os_prev;
2378                 /*
2379                  * It may be that only the ZIL differs, because it was
2380                  * reset in the head.  Don't count that as being
2381                  * modified.
2382                  */
2383                 if (dmu_objset_from_ds(ds, &os) != 0)
2384                         return (B_TRUE);
2385                 if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0)
2386                         return (B_TRUE);
2387                 return (bcmp(&os->os_phys->os_meta_dnode,
2388                     &os_prev->os_phys->os_meta_dnode,
2389                     sizeof (os->os_phys->os_meta_dnode)) != 0);
2390         }
2391         return (B_FALSE);
2392 }
2393
     /*
      * Sync-task check function for renaming a snapshot.
      *   arg1 - the snapshot dataset being renamed
      *   arg2 - the new snapshot name (the part after the '@')
      *
      * Returns an error if the head dataset cannot be held, EEXIST if the
      * head dataset already has a snapshot called newsnapname,
      * ENAMETOOLONG if the resulting full name would not fit in
      * MAXNAMELEN, any other lookup error unchanged, and 0 otherwise.
      */
2394 /* ARGSUSED */
2395 static int
2396 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
2397 {
2398         dsl_dataset_t *ds = arg1;
2399         char *newsnapname = arg2;
2400         dsl_dir_t *dd = ds->ds_dir;
2401         dsl_dataset_t *hds;
2402         uint64_t val;
2403         int err;
2404
2405         err = dsl_dataset_hold_obj(dd->dd_pool,
2406             dd->dd_phys->dd_head_dataset_obj, FTAG, &hds);
2407         if (err)
2408                 return (err);
2409
2410         /* new name better not be in use */
2411         err = dsl_dataset_snap_lookup(hds, newsnapname, &val);
2412         dsl_dataset_rele(hds, FTAG);
2413
             /* a successful lookup means the name is taken; ENOENT is success */
2414         if (err == 0)
2415                 err = EEXIST;
2416         else if (err == ENOENT)
2417                 err = 0;
2418
2419         /* dataset name + 1 for the "@" + the new snapshot name must fit */
             /* (this check runs last, so it replaces any error set above) */
2420         if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
2421                 err = ENAMETOOLONG;
2422
2423         return (err);
2424 }
2425
     /*
      * Sync-task sync function for renaming a snapshot.
      *   arg1 - the snapshot dataset being renamed
      *   arg2 - the new snapshot name (the part after the '@')
      *
      * Removes the old name from the head dataset's snapshot names, stores
      * the new name in ds_snapname (under ds_lock), adds an entry for it to
      * the head's ds_snapnames_zapobj and, in the kernel, renames the
      * matching zvol minors.  The rename is then logged with
      * spa_history_log_internal().
      */
2426 static void
2427 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
2428 {
2429         char oldname[MAXPATHLEN], newname[MAXPATHLEN];
2430         dsl_dataset_t *ds = arg1;
2431         const char *newsnapname = arg2;
2432         dsl_dir_t *dd = ds->ds_dir;
2433         objset_t *mos = dd->dd_pool->dp_meta_objset;
2434         dsl_dataset_t *hds;
2435         int err;
2436
             /* only snapshots may be renamed through this path */
2437         ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
2438
2439         VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
2440             dd->dd_phys->dd_head_dataset_obj, FTAG, &hds));
2441
2442         VERIFY(0 == dsl_dataset_get_snapname(ds));
2443         err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx);
2444         ASSERT3U(err, ==, 0);
             /* capture the full old name before ds_snapname is overwritten */
2445         dsl_dataset_name(ds, oldname);
2446         mutex_enter(&ds->ds_lock);
2447         (void) strcpy(ds->ds_snapname, newsnapname);
2448         mutex_exit(&ds->ds_lock);
2449         err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
2450             ds->ds_snapname, 8, 1, &ds->ds_object, tx);
2451         ASSERT3U(err, ==, 0);
2452         dsl_dataset_name(ds, newname);
2453 #ifdef _KERNEL
2454         zvol_rename_minors(oldname, newname);
2455 #endif
2456
2457         spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
2458             "dataset = %llu", ds->ds_object);
2459         dsl_dataset_rele(hds, FTAG);
2460 }
2461
     /*
      * State shared by dsl_recursive_rename() and its per-filesystem
      * callback dsl_snapshot_rename_one().
      */
2462 struct renamesnaparg {
             /* task group the renames are queued on; also used as hold tag */
2463         dsl_sync_task_group_t *dstg;
             /* name of the snapshot last processed, or of one that failed */
2464         char failed[MAXPATHLEN];
             /* current snapshot name (the part after the '@') */
2465         char *oldsnap;
             /* new snapshot name (the part after the '@') */
2466         char *newsnap;
2467 };
2468
     /*
      * dmu_objset_find() callback used by dsl_recursive_rename().
      *   name - name of the filesystem currently being visited
      *   arg  - the struct renamesnaparg for this recursive rename
      *
      * Builds "<name>@<oldsnap>", records it in ra->failed, checks rename
      * permission, unmounts the snapshot (kernel only), holds the snapshot
      * with ra->dstg as the tag and queues a rename sync task for it.  The
      * hold is released later by dsl_recursive_rename().
      *
      * ENOENT from the permission check or from the hold is treated as
      * success, so filesystems that lack the snapshot are skipped.  Any
      * other error is returned.
      */
2469 static int
2470 dsl_snapshot_rename_one(const char *name, void *arg)
2471 {
2472         struct renamesnaparg *ra = arg;
2473         dsl_dataset_t *ds = NULL;
2474         char *snapname;
2475         int err;
2476
2477         snapname = kmem_asprintf("%s@%s", name, ra->oldsnap);
2478         (void) strlcpy(ra->failed, snapname, sizeof (ra->failed));
2479
2480         /*
2481          * For recursive snapshot renames the parent won't be changing
2482          * so we just pass name for both the to/from argument.
2483          */
2484         err = zfs_secpolicy_rename_perms(snapname, snapname, CRED());
2485         if (err != 0) {
2486                 strfree(snapname);
2487                 return (err == ENOENT ? 0 : err);
2488         }
2489
2490 #ifdef _KERNEL
2491         /*
2492          * For all filesystems undergoing rename, we'll need to unmount it.
2493          */
2494         (void) zfs_unmount_snap(snapname, NULL);
2495 #endif
2496         err = dsl_dataset_hold(snapname, ra->dstg, &ds);
2497         strfree(snapname);
2498         if (err != 0)
2499                 return (err == ENOENT ? 0 : err);
2500
2501         dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
2502             dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
2503
2504         return (0);
2505 }
2506
     /*
      * Rename the snapshot named by oldname, and the same-named snapshot of
      * every descendant filesystem (DS_FIND_CHILDREN), to the snapshot name
      * given in newname.
      *
      * One sync task per snapshot found is queued on a task group, which is
      * then waited on.  Afterwards the hold that dsl_snapshot_rename_one()
      * took on each dataset is released.
      *
      * On failure, oldname is overwritten with the contents of ra->failed.
      * Returns 0 or the error from spa_open(), dmu_objset_find() or the
      * task group.
      *
      * Both oldname and newname must contain an '@': the results of
      * strchr() are used without a NULL check.
      */
2507 static int
2508 dsl_recursive_rename(char *oldname, const char *newname)
2509 {
2510         int err;
2511         struct renamesnaparg *ra;
2512         dsl_sync_task_t *dst;
2513         spa_t *spa;
2514         char *cp, *fsname = spa_strdup(oldname);
             /* size later passed to kmem_free() for the fsname copy */
2515         int len = strlen(oldname) + 1;
2516
2517         /* truncate the snapshot name to get the fsname */
2518         cp = strchr(fsname, '@');
2519         *cp = '\0';
2520
2521         err = spa_open(fsname, &spa, FTAG);
2522         if (err) {
2523                 kmem_free(fsname, len);
2524                 return (err);
2525         }
2526         ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
2527         ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
2528
             /* both point into the caller's strings, just past the '@' */
2529         ra->oldsnap = strchr(oldname, '@') + 1;
2530         ra->newsnap = strchr(newname, '@') + 1;
2531         *ra->failed = '\0';
2532
2533         err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
2534             DS_FIND_CHILDREN);
2535         kmem_free(fsname, len);
2536
2537         if (err == 0) {
2538                 err = dsl_sync_task_group_wait(ra->dstg);
2539         }
2540
             /*
              * Walk every queued task: if it failed, rebuild its intended new
              * name in ra->failed (a later failure overwrites an earlier
              * one); in all cases drop the hold taken in
              * dsl_snapshot_rename_one().
              */
2541         for (dst = list_head(&ra->dstg->dstg_tasks); dst;
2542             dst = list_next(&ra->dstg->dstg_tasks, dst)) {
2543                 dsl_dataset_t *ds = dst->dst_arg1;
2544                 if (dst->dst_err) {
2545                         dsl_dir_name(ds->ds_dir, ra->failed);
2546                         (void) strlcat(ra->failed, "@", sizeof (ra->failed));
2547                         (void) strlcat(ra->failed, ra->newsnap,
2548                             sizeof (ra->failed));
2549                 }
2550                 dsl_dataset_rele(ds, ra->dstg);
2551         }
2552
             /*
              * NOTE(review): the copy is bounded by the size of ra->failed
              * (MAXPATHLEN), not by the size of the caller's buffer; this
              * assumes oldname is at least MAXPATHLEN bytes -- confirm
              * against callers.
              */
2553         if (err)
2554                 (void) strlcpy(oldname, ra->failed, sizeof (ra->failed));
2555
2556         dsl_sync_task_group_destroy(ra->dstg);
2557         kmem_free(ra, sizeof (struct renamesnaparg));
2558         spa_close(spa, FTAG);
2559         return (err);
2560 }
2561
     /*
      * dmu_objset_find() callback used by dsl_dataset_rename() to make sure
      * a name still fits after a rename that lengthens it.
      *   oldname - current name of the dataset being visited
      *   arg     - pointer to an int holding the number of characters the
      *             rename adds
      *
      * Returns ENAMETOOLONG if the lengthened name would reach MAXNAMELEN,
      * otherwise 0.
      */
2562 static int
2563 dsl_valid_rename(const char *oldname, void *arg)
2564 {
2565         int delta = *(int *)arg;
2566
2567         if (strlen(oldname) + delta >= MAXNAMELEN)
2568                 return (ENAMETOOLONG);
2569
2570         return (0);
2571 }
2572
     /*
      * Rename a dataset or snapshot (also exported as dmu_objset_rename).
      *
      * When dsl_dir_open() consumes all of oldname (tail == NULL), the rename
      * is handed to dsl_dir_rename(); if the new name is longer, every child
      * and snapshot name is first checked to still fit in MAXNAMELEN.
      *
      * When the remaining tail starts with '@', oldname is a snapshot and
      * newname must be a snapshot of the same filesystem: EINVAL is returned
      * if newname has no '@', EXDEV if the text up to and including the '@'
      * differs from oldname.  With ZFS_RENAME_RECURSIVE in flags the rename
      * goes through dsl_recursive_rename(); otherwise a single sync task
      * renames the one snapshot.
      *
      * Returns ENOENT if oldname ends in a component that does not exist,
      * or whatever error the underlying operation reports.
      */
2573 #pragma weak dmu_objset_rename = dsl_dataset_rename
2574 int
2575 dsl_dataset_rename(char *oldname, const char *newname, int flags)
2576 {
2577         dsl_dir_t *dd;
2578         dsl_dataset_t *ds;
2579         const char *tail;
2580         int err;
2581
2582         err = dsl_dir_open(oldname, FTAG, &dd, &tail);
2583         if (err)
2584                 return (err);
2585
2586         if (tail == NULL) {
2587                 int delta = strlen(newname) - strlen(oldname);
2588
2589                 /* if we're growing, validate child name lengths */
2590                 if (delta > 0)
2591                         err = dmu_objset_find(oldname, dsl_valid_rename,
2592                             &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
2593
2594                 if (err == 0)
2595                         err = dsl_dir_rename(dd, newname, flags);
2596                 dsl_dir_close(dd, FTAG);
2597                 return (err);
2598         }
2599
2600         if (tail[0] != '@') {
2601                 /* the name ended in a nonexistent component */
2602                 dsl_dir_close(dd, FTAG);
2603                 return (ENOENT);
2604         }
2605
2606         dsl_dir_close(dd, FTAG);
2607
2608         /* new name must be snapshot in same filesystem */
2609         tail = strchr(newname, '@');
2610         if (tail == NULL)
2611                 return (EINVAL);
             /* step past the '@' so tail is the bare new snapshot name */
2612         tail++;
2613         if (strncmp(oldname, newname, tail - newname) != 0)
2614                 return (EXDEV);
2615
2616         if (flags & ZFS_RENAME_RECURSIVE) {
2617                 err = dsl_recursive_rename(oldname, newname);
2618         } else {
2619                 err = dsl_dataset_hold(oldname, FTAG, &ds);
2620                 if (err)
2621                         return (err);
2622
2623                 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
2624                     dsl_dataset_snapshot_rename_check,
2625                     dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
2626
2627                 dsl_dataset_rele(ds, FTAG);
2628         }
2629
2630         return (err);
2631 }
2632
     /*
      * One element of a snapshot list built by snaplist_make(): the list
      * linkage plus the dataset that the list holds or owns.
      */
2633 struct promotenode {
2634         list_node_t link;
2635         dsl_dataset_t *ds;
2636 };
2637
     /*
      * Arguments and computed results shared between dsl_dataset_promote()
      * and its check and sync functions.
      */
2638 struct promotearg {
             /*
              * shared_snaps: the clone's origin snapshot and the snapshots
              *     before it; the sync function moves these to the clone.
              * origin_snaps: the origin's head dataset and its snapshots
              *     back to, but not including, the clone's origin.
              * clone_snaps: the clone's head dataset and its snapshots back
              *     to its origin.
              */
2639         list_t shared_snaps, origin_snaps, clone_snaps;
             /* origin of the origin's filesystem, or NULL if it has none */
2640         dsl_dataset_t *origin_origin;
             /* space figures computed by dsl_dataset_promote_check() */
2641         uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
             /* name of the snapshot whose name lookup failed the check */
2642         char *err_ds;
2643 };
2644
     /*
      * Forward declarations of snapshot-list helpers; snaplist_space() is
      * called by dsl_dataset_promote_check() ahead of its definition.
      */
2645 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
2646 static boolean_t snaplist_unstable(list_t *l);
2647
     /*
      * Sync-task check function for promoting a clone.
      *   arg1 - head dataset of the clone being promoted
      *   arg2 - struct promotearg filled in by dsl_dataset_promote()
      *
      * Returns EINVAL if the dataset is not a clone.  Outside of syncing
      * context nothing further is checked.  In syncing context it returns
      * EXDEV if DS_FLAG_NOPROMOTE is set, EEXIST if a snapshot that would be
      * moved has the same name as a snapshot of the clone, and passes on
      * errors from the name lookup, dsl_dir_transfer_possible() and
      * snaplist_space().  When the name lookup is what failed, pa->err_ds is
      * pointed at the offending snapshot's name.
      *
      * As a side effect it computes pa->unique, pa->used, pa->comp,
      * pa->uncomp, pa->cloneusedsnap and pa->originusedsnap, which
      * dsl_dataset_promote_sync() consumes.
      */
2648 static int
2649 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
2650 {
2651         dsl_dataset_t *hds = arg1;
2652         struct promotearg *pa = arg2;
2653         struct promotenode *snap = list_head(&pa->shared_snaps);
2654         dsl_dataset_t *origin_ds = snap->ds;
2655         int err;
2656         uint64_t unused;
2657
2658         /* Check that it is a real clone */
2659         if (!dsl_dir_is_clone(hds->ds_dir))
2660                 return (EINVAL);
2661
2662         /* Since this is so expensive, don't do the preliminary check */
2663         if (!dmu_tx_is_syncing(tx))
2664                 return (0);
2665
2666         if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)
2667                 return (EXDEV);
2668
2669         /* compute origin's new unique space */
2670         snap = list_tail(&pa->clone_snaps);
2671         ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
2672         dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2673             origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
2674             &pa->unique, &unused, &unused);
2675
2676         /*
2677          * Walk the snapshots that we are moving
2678          *
2679          * Compute space to transfer.  Consider the incremental changes
2680          * to used for each snapshot:
2681          * (my used) = (prev's used) + (blocks born) - (blocks killed)
2682          * So each snapshot gave birth to:
2683          * (blocks born) = (my used) - (prev's used) + (blocks killed)
2684          * So a sequence would look like:
2685          * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
2686          * Which simplifies to:
2687          * uN + kN + kN-1 + ... + k1 + k0
2688          * Note however, if we stop before we reach the ORIGIN we get:
2689          * uN + kN + kN-1 + ... + kM - uM-1
2690          */
2691         pa->used = origin_ds->ds_phys->ds_used_bytes;
2692         pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
2693         pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
2694         for (snap = list_head(&pa->shared_snaps); snap;
2695             snap = list_next(&pa->shared_snaps, snap)) {
2696                 uint64_t val, dlused, dlcomp, dluncomp;
2697                 dsl_dataset_t *ds = snap->ds;
2698
2699                 /* Check that the snapshot name does not conflict */
2700                 VERIFY(0 == dsl_dataset_get_snapname(ds));
2701                 err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
2702                 if (err == 0) {
2703                         err = EEXIST;
2704                         goto out;
2705                 }
2706                 if (err != ENOENT)
2707                         goto out;
2708
2709                 /* The very first snapshot does not have a deadlist */
2710                 if (ds->ds_phys->ds_prev_snap_obj == 0)
2711                         continue;
2712
2713                 dsl_deadlist_space(&ds->ds_deadlist,
2714                     &dlused, &dlcomp, &dluncomp);
2715                 pa->used += dlused;
2716                 pa->comp += dlcomp;
2717                 pa->uncomp += dluncomp;
2718         }
2719
2720         /*
2721          * If we are a clone of a clone then we never reached ORIGIN,
2722          * so we need to subtract out the clone origin's used space.
2723          */
2724         if (pa->origin_origin) {
2725                 pa->used -= pa->origin_origin->ds_phys->ds_used_bytes;
2726                 pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
2727                 pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
2728         }
2729
2730         /* Check that there is enough space here */
2731         err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
2732             pa->used);
2733         if (err)
2734                 return (err);
2735
2736         /*
2737          * Compute the amounts of space that will be used by snapshots
2738          * after the promotion (for both origin and clone).  For each,
2739          * it is the amount of space that will be on all of their
2740          * deadlists (that was not born before their new origin).
2741          */
2742         if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
2743                 uint64_t space;
2744
2745                 /*
2746                  * Note, typically this will not be a clone of a clone,
2747                  * so dd_origin_txg will be < TXG_INITIAL, so
2748                  * these snaplist_space() -> dsl_deadlist_space_range()
2749                  * calls will be fast because they do not have to
2750                  * iterate over all bps.
2751                  */
2752                 snap = list_head(&pa->origin_snaps);
2753                 err = snaplist_space(&pa->shared_snaps,
2754                     snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap);
2755                 if (err)
2756                         return (err);
2757
2758                 err = snaplist_space(&pa->clone_snaps,
2759                     snap->ds->ds_dir->dd_origin_txg, &space);
2760                 if (err)
2761                         return (err);
2762                 pa->cloneusedsnap += space;
2763         }
2764         if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
2765                 err = snaplist_space(&pa->origin_snaps,
2766                     origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap);
2767                 if (err)
2768                         return (err);
2769         }
2770
2771         return (0);
2772 out:
             /* only reached from the name-lookup loop; snap is its cursor */
2773         pa->err_ds =  snap->ds->ds_snapname;
2774         return (err);
2775 }
2776
     /*
      * Sync-task sync function for promoting a clone.
      *   arg1 - head dataset of the clone being promoted
      *   arg2 - struct promotearg already filled in by
      *          dsl_dataset_promote_check()
      *
      * In order, it: relinks the origin snapshot's next-snapshot and
      * next-clones entries, swaps the origin relationship between the
      * clone's dsl_dir (dd) and the origin's dsl_dir (odd), updates the
      * dd_clones entries, moves every snapshot on pa->shared_snaps (name
      * entry, containing dsl_dir and clone references) into dd, transfers
      * the space accounting from odd to dd, and logs the promotion.
      */
2777 static void
2778 dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
2779 {
2780         dsl_dataset_t *hds = arg1;
2781         struct promotearg *pa = arg2;
2782         struct promotenode *snap = list_head(&pa->shared_snaps);
2783         dsl_dataset_t *origin_ds = snap->ds;
2784         dsl_dataset_t *origin_head;
2785         dsl_dir_t *dd = hds->ds_dir;
2786         dsl_pool_t *dp = hds->ds_dir->dd_pool;
2787         dsl_dir_t *odd = NULL;
2788         uint64_t oldnext_obj;
2789         int64_t delta;
2790
2791         ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
2792
2793         snap = list_head(&pa->origin_snaps);
2794         origin_head = snap->ds;
2795
2796         /*
2797          * We need to explicitly open odd, since origin_ds's dd will be
2798          * changing.
2799          */
2800         VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
2801             NULL, FTAG, &odd));
2802
2803         /* change origin's next snap */
2804         dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
2805         oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
2806         snap = list_tail(&pa->clone_snaps);
2807         ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
2808         origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;
2809
2810         /* change the origin's next clone */
2811         if (origin_ds->ds_phys->ds_next_clones_obj) {
2812                 remove_from_next_clones(origin_ds, snap->ds->ds_object, tx);
2813                 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
2814                     origin_ds->ds_phys->ds_next_clones_obj,
2815                     oldnext_obj, tx));
2816         }
2817
2818         /* change origin */
2819         dmu_buf_will_dirty(dd->dd_dbuf, tx);
2820         ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
2821         dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
2822         dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
2823         dmu_buf_will_dirty(odd->dd_dbuf, tx);
2824         odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
2825         origin_head->ds_dir->dd_origin_txg =
2826             origin_ds->ds_phys->ds_creation_txg;
2827
2828         /* change dd_clone entries */
             /*
              * NOTE(review): pa->origin_origin is dereferenced here without a
              * NULL check; this presumably relies on every dataset having an
              * origin at pool versions >= SPA_VERSION_DIR_CLONES -- confirm.
              */
2829         if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2830                 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
2831                     odd->dd_phys->dd_clones, hds->ds_object, tx));
2832                 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
2833                     pa->origin_origin->ds_dir->dd_phys->dd_clones,
2834                     hds->ds_object, tx));
2835
2836                 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
2837                     pa->origin_origin->ds_dir->dd_phys->dd_clones,
2838                     origin_head->ds_object, tx));
2839                 if (dd->dd_phys->dd_clones == 0) {
2840                         dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
2841                             DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
2842                 }
2843                 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
2844                     dd->dd_phys->dd_clones, origin_head->ds_object, tx));
2845
2846         }
2847
2848         /* move snapshots to this dir */
2849         for (snap = list_head(&pa->shared_snaps); snap;
2850             snap = list_next(&pa->shared_snaps, snap)) {
2851                 dsl_dataset_t *ds = snap->ds;
2852
2853                 /* unregister props as dsl_dir is changing */
2854                 if (ds->ds_objset) {
2855                         dmu_objset_evict(ds->ds_objset);
2856                         ds->ds_objset = NULL;
2857                 }
2858                 /* move snap name entry */
2859                 VERIFY(0 == dsl_dataset_get_snapname(ds));
2860                 VERIFY(0 == dsl_dataset_snap_remove(origin_head,
2861                     ds->ds_snapname, tx));
2862                 VERIFY(0 == zap_add(dp->dp_meta_objset,
2863                     hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
2864                     8, 1, &ds->ds_object, tx));
2865
2866                 /* change containing dsl_dir */
2867                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2868                 ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
2869                 ds->ds_phys->ds_dir_obj = dd->dd_object;
2870                 ASSERT3P(ds->ds_dir, ==, odd);
                     /* swap the dsl_dir hold from odd to dd; ds is the tag */
2871                 dsl_dir_close(ds->ds_dir, ds);
2872                 VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
2873                     NULL, ds, &ds->ds_dir));
2874
2875                 /* move any clone references */
2876                 if (ds->ds_phys->ds_next_clones_obj &&
2877                     spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2878                         zap_cursor_t zc;
2879                         zap_attribute_t za;
2880
2881                         for (zap_cursor_init(&zc, dp->dp_meta_objset,
2882                             ds->ds_phys->ds_next_clones_obj);
2883                             zap_cursor_retrieve(&zc, &za) == 0;
2884                             zap_cursor_advance(&zc)) {
2885                                 dsl_dataset_t *cnds;
2886                                 uint64_t o;
2887
2888                                 if (za.za_first_integer == oldnext_obj) {
2889                                         /*
2890                                          * We've already moved the
2891                                          * origin's reference.
2892                                          */
2893                                         continue;
2894                                 }
2895
2896                                 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
2897                                     za.za_first_integer, FTAG, &cnds));
2898                                 o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;
2899
2900                                 VERIFY3U(zap_remove_int(dp->dp_meta_objset,
2901                                     odd->dd_phys->dd_clones, o, tx), ==, 0);
2902                                 VERIFY3U(zap_add_int(dp->dp_meta_objset,
2903                                     dd->dd_phys->dd_clones, o, tx), ==, 0);
2904                                 dsl_dataset_rele(cnds, FTAG);
2905                         }
2906                         zap_cursor_fini(&zc);
2907                 }
2908
2909                 ASSERT3U(dsl_prop_numcb(ds), ==, 0);
2910         }
2911
2912         /*
2913          * Change space accounting.
2914          * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
2915          * both be valid, or both be 0 (resulting in delta == 0).  This
2916          * is true for each of {clone,origin} independently.
2917          */
2918
             /* clone side: gains pa->used, split between SNAP and HEAD */
2919         delta = pa->cloneusedsnap -
2920             dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
2921         ASSERT3S(delta, >=, 0);
2922         ASSERT3U(pa->used, >=, delta);
2923         dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
2924         dsl_dir_diduse_space(dd, DD_USED_HEAD,
2925             pa->used - delta, pa->comp, pa->uncomp, tx);
2926
             /* origin side: gives up the same amounts */
2927         delta = pa->originusedsnap -
2928             odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
2929         ASSERT3S(delta, <=, 0);
2930         ASSERT3U(pa->used, >=, -delta);
2931         dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
2932         dsl_dir_diduse_space(odd, DD_USED_HEAD,
2933             -pa->used - delta, -pa->comp, -pa->uncomp, tx);
2934
             /* value computed in dsl_dataset_promote_check() */
2935         origin_ds->ds_phys->ds_unique_bytes = pa->unique;
2936
2937         /* log history record */
2938         spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
2939             "dataset = %llu", hds->ds_object);
2940
2941         dsl_dir_close(odd, FTAG);
2942 }
2943
     /* Tag used for every hold or ownership taken on behalf of a snaplist. */
2944 static char *snaplist_tag = "snaplist";
2945 /*
2946  * Make a list of dsl_dataset_t's for the snapshots between first_obj
2947  * (exclusive) and last_obj (inclusive).  The list will be in reverse
2948  * order (last_obj will be the list_head()).  If first_obj == 0, do all
2949  * snapshots back to this dataset's origin.
      *
      * If own is set, each dataset is owned with dsl_dataset_own_obj() and
      * made exclusive; otherwise it is only held.  A dataset that cannot be
      * found (ENOENT) is skipped by re-reading the previous-snapshot link of
      * the last entry already on the list.
      *
      * Returns 0, or the first error other than ENOENT.  On error the
      * entries added so far are left on the list; the caller releases them
      * with snaplist_destroy().  The caller must hold dp_config_rwlock.
2950  */
2951 static int
2952 snaplist_make(dsl_pool_t *dp, boolean_t own,
2953     uint64_t first_obj, uint64_t last_obj, list_t *l)
2954 {
2955         uint64_t obj = last_obj;
2956
2957         ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));
2958
2959         list_create(l, sizeof (struct promotenode),
2960             offsetof(struct promotenode, link));
2961
2962         while (obj != first_obj) {
2963                 dsl_dataset_t *ds;
2964                 struct promotenode *snap;
2965                 int err;
2966
2967                 if (own) {
2968                         err = dsl_dataset_own_obj(dp, obj,
2969                             0, snaplist_tag, &ds);
2970                         if (err == 0)
2971                                 dsl_dataset_make_exclusive(ds, snaplist_tag);
2972                 } else {
2973                         err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds);
2974                 }
2975                 if (err == ENOENT) {
2976                         /* lost race with snapshot destroy */
                             /*
                              * NOTE(review): list_tail() is NULL if the very
                              * first lookup returns ENOENT; this assumes
                              * last_obj itself can always be found -- confirm.
                              */
2977                         struct promotenode *last = list_tail(l);
2978                         ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj);
2979                         obj = last->ds->ds_phys->ds_prev_snap_obj;
2980                         continue;
2981                 } else if (err) {
2982                         return (err);
2983                 }
2984
                     /* first_obj == 0: stop at the origin of this dsl_dir */
2985                 if (first_obj == 0)
2986                         first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
2987
2988                 snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP);
2989                 snap->ds = ds;
2990                 list_insert_tail(l, snap);
2991                 obj = ds->ds_phys->ds_prev_snap_obj;
2992         }
2993
2994         return (0);
2995 }
2996
     /*
      * Add up, over every dataset on list l, the "used" figure that
      * dsl_deadlist_space_range() reports for that dataset's deadlist in the
      * txg range [mintxg, UINT64_MAX], and store the total in *spacep.  The
      * compressed and uncompressed figures are ignored.
      *
      * Always returns 0.
      */
2997 static int
2998 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
2999 {
3000         struct promotenode *snap;
3001
3002         *spacep = 0;
3003         for (snap = list_head(l); snap; snap = list_next(l, snap)) {
3004                 uint64_t used, comp, uncomp;
3005                 dsl_deadlist_space_range(&snap->ds->ds_deadlist,
3006                     mintxg, UINT64_MAX, &used, &comp, &uncomp);
3007                 *spacep += used;
3008         }
3009         return (0);
3010 }
3011
     /*
      * Tear down a list built by snaplist_make(): every dataset on it is
      * disowned (own set) or released (own clear) using snaplist_tag, each
      * node is freed, and the list itself is destroyed.  own must match the
      * value that was passed to snaplist_make().
      *
      * Does nothing if l is NULL or its list head is not linked, which is
      * the state of a zero-initialized list_t that was never passed to
      * snaplist_make().
      */
3012 static void
3013 snaplist_destroy(list_t *l, boolean_t own)
3014 {
3015         struct promotenode *snap;
3016
3017         if (!l || !list_link_active(&l->list_head))
3018                 return;
3019
3020         while ((snap = list_tail(l)) != NULL) {
3021                 list_remove(l, snap);
3022                 if (own)
3023                         dsl_dataset_disown(snap->ds, snaplist_tag);
3024                 else
3025                         dsl_dataset_rele(snap->ds, snaplist_tag);
3026                 kmem_free(snap, sizeof (struct promotenode));
3027         }
3028         list_destroy(l);
3029 }
3030
3031 /*
3032  * Promote a clone.  Nomenclature note:
3033  * "clone" or "cds": the original clone which is being promoted
3034  * "origin" or "ods": the snapshot which is originally clone's origin
3035  * "origin head" or "ohds": the dataset which is the head
3036  * (filesystem/volume) for the origin
3037  * "origin origin": the origin of the origin's filesystem (typically
3038  * NULL, indicating that the clone is not a clone of a clone).
      *
      * name is the clone to promote.  If the sync task fails after
      * dsl_dataset_promote_check() recorded a snapshot name in pa.err_ds and
      * conflsnap is non-NULL, that name is copied into conflsnap, which must
      * have room for MAXNAMELEN bytes; the copy is always NUL-terminated.
      *
      * Returns 0 on success or an errno value; EINVAL if name is a snapshot
      * or is not a clone.
3039  */
3040 int
3041 dsl_dataset_promote(const char *name, char *conflsnap)
3042 {
3043         dsl_dataset_t *ds;
3044         dsl_dir_t *dd;
3045         dsl_pool_t *dp;
3046         dmu_object_info_t doi;
3047         struct promotearg pa = { 0 };
3048         struct promotenode *snap;
3049         int err;
3050
3051         err = dsl_dataset_hold(name, FTAG, &ds);
3052         if (err)
3053                 return (err);
3054         dd = ds->ds_dir;
3055         dp = dd->dd_pool;
3056
3057         err = dmu_object_info(dp->dp_meta_objset,
3058             ds->ds_phys->ds_snapnames_zapobj, &doi);
3059         if (err) {
3060                 dsl_dataset_rele(ds, FTAG);
3061                 return (err);
3062         }
3063
3064         if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) {
3065                 dsl_dataset_rele(ds, FTAG);
3066                 return (EINVAL);
3067         }
3068
3069         /*
3070          * We are going to inherit all the snapshots taken before our
3071          * origin (i.e., our new origin will be our parent's origin).
3072          * Take ownership of them so that we can rename them into our
3073          * namespace.
3074          */
3075         rw_enter(&dp->dp_config_rwlock, RW_READER);
3076
3077         err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj,
3078             &pa.shared_snaps);
3079         if (err != 0)
3080                 goto out;
3081
3082         err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps);
3083         if (err != 0)
3084                 goto out;
3085
3086         snap = list_head(&pa.shared_snaps);
3087         ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
3088         err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj,
3089             snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps);
3090         if (err != 0)
3091                 goto out;
3092
3093         if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
3094                 err = dsl_dataset_hold_obj(dp,
3095                     snap->ds->ds_dir->dd_phys->dd_origin_obj,
3096                     FTAG, &pa.origin_origin);
3097                 if (err != 0)
3098                         goto out;
3099         }
3100
3101 out:
3102         rw_exit(&dp->dp_config_rwlock);
3103
3104         /*
3105          * Scale the last dsl_sync_task_do() argument by the snapnames
3106          * zapobj size, since we will be moving a bunch of snapnames to
3107          * the promoted ds, and dirtying their bonus buffers.
3108          */
3109         if (err == 0) {
3110                 err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
3111                     dsl_dataset_promote_sync, ds, &pa,
3112                     2 + 2 * doi.doi_physical_blocks_512);
                     /*
                      * pa.err_ds points into a dataset that is still on
                      * pa.shared_snaps, so copy it out before the lists are
                      * torn down.  strlcpy() always terminates the result.
                      */
3113                 if (err && pa.err_ds && conflsnap)
3114                         (void) strlcpy(conflsnap, pa.err_ds, MAXNAMELEN);
3115         }
3116
             /* safe even for lists that snaplist_make() never set up */
3117         snaplist_destroy(&pa.shared_snaps, B_TRUE);
3118         snaplist_destroy(&pa.clone_snaps, B_FALSE);
3119         snaplist_destroy(&pa.origin_snaps, B_FALSE);
3120         if (pa.origin_origin)
3121                 dsl_dataset_rele(pa.origin_origin, FTAG);
3122         dsl_dataset_rele(ds, FTAG);
3123         return (err);
3124 }
3125
     /*
      * Argument block passed as arg1 to dsl_dataset_clone_swap_check() and
      * dsl_dataset_clone_swap_sync().
      */
3126 struct cloneswaparg {
3127         dsl_dataset_t *cds; /* clone dataset */
3128         dsl_dataset_t *ohds; /* origin's head dataset */
3129         boolean_t force;
3130         int64_t unused_refres_delta; /* change in unconsumed refreservation */
3131 };
3132
     /*
      * Check function paired with dsl_dataset_clone_swap_sync().
      *   arg1 - struct cloneswaparg describing the clone and origin head
      *   arg2 - unused
      *
      * Returns EINVAL if either dataset is a snapshot, if the two do not
      * share the same ds_prev, if cds is not the clone side of that branch
      * point, or if the clone's dsl_dir is not a child of the origin head's.
      * Returns ETXTBSY if ohds was modified since its last snapshot and
      * force is not set, ENOSPC if the extra refreservation needed exceeds
      * the space available to ohds's dsl_dir, and EDQUOT if the clone's
      * unique bytes exceed ohds's ds_quota.  Returns 0 otherwise.
      *
      * As a side effect it stores the computed change in csa->unused_refres_delta.
      */
3133 /* ARGSUSED */
3134 static int
3135 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
3136 {
3137         struct cloneswaparg *csa = arg1;
3138
3139         /* they should both be heads */
3140         if (dsl_dataset_is_snapshot(csa->cds) ||
3141             dsl_dataset_is_snapshot(csa->ohds))
3142                 return (EINVAL);
3143
3144         /* the branch point should be just before them */
3145         if (csa->cds->ds_prev != csa->ohds->ds_prev)
3146                 return (EINVAL);
3147
3148         /* cds should be the clone (unless they are unrelated) */
3149         if (csa->cds->ds_prev != NULL &&
3150             csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap &&
3151             csa->ohds->ds_object !=
3152             csa->cds->ds_prev->ds_phys->ds_next_snap_obj)
3153                 return (EINVAL);
3154
3155         /* the clone should be a child of the origin */
3156         if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
3157                 return (EINVAL);
3158
3159         /* ohds shouldn't be modified unless 'force' */
3160         if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
3161                 return (ETXTBSY);
3162
3163         /* adjust amount of any unconsumed refreservation */
3164         csa->unused_refres_delta =
3165             (int64_t)MIN(csa->ohds->ds_reserved,
3166             csa->ohds->ds_phys->ds_unique_bytes) -
3167             (int64_t)MIN(csa->ohds->ds_reserved,
3168             csa->cds->ds_phys->ds_unique_bytes);
3169
3170         if (csa->unused_refres_delta > 0 &&
3171             csa->unused_refres_delta >
3172             dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
3173                 return (ENOSPC);
3174
3175         if (csa->ohds->ds_quota != 0 &&
3176             csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota)
3177                 return (EDQUOT);
3178
3179         return (0);
3180 }
3181
3182 /* ARGSUSED */
3183 static void
3184 dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3185 {
3186         struct cloneswaparg *csa = arg1;
3187         dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
3188
3189         ASSERT(csa->cds->ds_reserved == 0);
3190         ASSERT(csa->ohds->ds_quota == 0 ||
3191             csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota);
3192
3193         dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
3194         dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
3195
3196         if (csa->cds->ds_objset != NULL) {
3197                 dmu_objset_evict(csa->cds->ds_objset);
3198                 csa->cds->ds_objset = NULL;
3199         }
3200
3201         if (csa->ohds->ds_objset != NULL) {
3202                 dmu_objset_evict(csa->ohds->ds_objset);
3203                 csa->ohds->ds_objset = NULL;
3204         }
3205
3206         /*
3207          * Reset origin's unique bytes, if it exists.
3208          */
3209         if (csa->cds->ds_prev) {
3210                 dsl_dataset_t *origin = csa->cds->ds_prev;
3211                 uint64_t comp, uncomp;
3212
3213                 dmu_buf_will_dirty(origin->ds_dbuf, tx);
3214                 dsl_deadlist_space_range(&csa->cds->ds_deadlist,
3215                     origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
3216                     &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
3217         }
3218
3219         /* swap blkptrs */
3220         {
3221                 blkptr_t tmp;
3222                 tmp = csa->ohds->ds_phys->ds_bp;
3223                 csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
3224                 csa->cds->ds_phys->ds_bp = tmp;
3225         }
3226
3227         /* set dd_*_bytes */
3228         {
3229                 int64_t dused, dcomp, duncomp;
3230                 uint64_t cdl_used, cdl_comp, cdl_uncomp;
3231                 uint64_t odl_used, odl_comp, odl_uncomp;
3232
3233                 ASSERT3U(csa->cds->ds_dir->dd_phys->
3234                     dd_used_breakdown[DD_USED_SNAP], ==, 0);
3235
3236                 dsl_deadlist_space(&csa->cds->ds_deadlist,
3237                     &cdl_used, &cdl_comp, &cdl_uncomp);
3238                 dsl_deadlist_space(&csa->ohds->ds_deadlist,
3239                     &odl_used, &odl_comp, &odl_uncomp);
3240
3241                 dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
3242                     (csa->ohds->ds_phys->ds_used_bytes + odl_used);
3243                 dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
3244                     (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
3245                 duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
3246                     cdl_uncomp -
3247                     (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);
3248
3249                 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD,
3250                     dused, dcomp, duncomp, tx);
3251                 dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD,
3252                     -dused, -dcomp, -duncomp, tx);
3253
3254                 /*
3255                  * The difference in the space used by snapshots is the
3256                  * difference in snapshot space due to the head's
3257                  * deadlist (since that's the only thing that's
3258                  * changing that affects the snapused).
3259                  */
3260                 dsl_deadlist_space_range(&csa->cds->ds_deadlist,
3261                     csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
3262                     &cdl_used, &cdl_comp, &cdl_uncomp);
3263                 dsl_deadlist_space_range(&csa->ohds->ds_deadlist,
3264                     csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
3265                     &odl_used, &odl_comp, &odl_uncomp);
3266                 dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
3267                     DD_USED_HEAD, DD_USED_SNAP, tx);
3268         }
3269
3270         /* swap ds_*_bytes */
3271         SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
3272             csa->cds->ds_phys->ds_used_bytes);
3273         SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
3274             csa->cds->ds_phys->ds_compressed_bytes);
3275         SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
3276             csa->cds->ds_phys->ds_uncompressed_bytes);
3277         SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
3278             csa->cds->ds_phys->ds_unique_bytes);
3279
3280         /* apply any parent delta for change in unconsumed refreservation */
3281         dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
3282             csa->unused_refres_delta, 0, 0, tx);
3283
3284         /*
3285          * Swap deadlists.
3286          */
3287         dsl_deadlist_close(&csa->cds->ds_deadlist);
3288         dsl_deadlist_close(&csa->ohds->ds_deadlist);
3289         SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
3290             csa->cds->ds_phys->ds_deadlist_obj);
3291         dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
3292             csa->cds->ds_phys->ds_deadlist_obj);
3293         dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
3294             csa->ohds->ds_phys->ds_deadlist_obj);
3295
3296         dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx);
3297 }
3298
3299 /*
3300  * Swap 'clone' with its origin head datasets.  Used at the end of "zfs
3301  * recv" into an existing fs to swizzle the file system to the new
3302  * version, and by "zfs rollback".  Can also be used to swap two
3303  * independent head datasets if neither has any snapshots.
3304  */
3305 int
3306 dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
3307     boolean_t force)
3308 {
3309         struct cloneswaparg csa;
3310         int error;
3311
3312         ASSERT(clone->ds_owner);
3313         ASSERT(origin_head->ds_owner);
3314 retry:
3315         /*
3316          * Need exclusive access for the swap. If we're swapping these
3317          * datasets back after an error, we already hold the locks.
3318          */
3319         if (!RW_WRITE_HELD(&clone->ds_rwlock))
3320                 rw_enter(&clone->ds_rwlock, RW_WRITER);
3321         if (!RW_WRITE_HELD(&origin_head->ds_rwlock) &&
3322             !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
3323                 rw_exit(&clone->ds_rwlock);
3324                 rw_enter(&origin_head->ds_rwlock, RW_WRITER);
3325                 if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
3326                         rw_exit(&origin_head->ds_rwlock);
3327                         goto retry;
3328                 }
3329         }
3330         csa.cds = clone;
3331         csa.ohds = origin_head;
3332         csa.force = force;
3333         error = dsl_sync_task_do(clone->ds_dir->dd_pool,
3334             dsl_dataset_clone_swap_check,
3335             dsl_dataset_clone_swap_sync, &csa, NULL, 9);
3336         return (error);
3337 }
3338
3339 /*
3340  * Given a pool name and a dataset object number in that pool,
3341  * return the name of that dataset.
3342  */
3343 int
3344 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
3345 {
3346         spa_t *spa;
3347         dsl_pool_t *dp;
3348         dsl_dataset_t *ds;
3349         int error;
3350
3351         if ((error = spa_open(pname, &spa, FTAG)) != 0)
3352                 return (error);
3353         dp = spa_get_dsl(spa);
3354         rw_enter(&dp->dp_config_rwlock, RW_READER);
3355         if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) {
3356                 dsl_dataset_name(ds, buf);
3357                 dsl_dataset_rele(ds, FTAG);
3358         }
3359         rw_exit(&dp->dp_config_rwlock);
3360         spa_close(spa, FTAG);
3361
3362         return (error);
3363 }
3364
3365 int
3366 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
3367     uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
3368 {
3369         int error = 0;
3370
3371         ASSERT3S(asize, >, 0);
3372
3373         /*
3374          * *ref_rsrv is the portion of asize that will come from any
3375          * unconsumed refreservation space.
3376          */
3377         *ref_rsrv = 0;
3378
3379         mutex_enter(&ds->ds_lock);
3380         /*
3381          * Make a space adjustment for reserved bytes.
3382          */
3383         if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
3384                 ASSERT3U(*used, >=,
3385                     ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3386                 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3387                 *ref_rsrv =
3388                     asize - MIN(asize, parent_delta(ds, asize + inflight));
3389         }
3390
3391         if (!check_quota || ds->ds_quota == 0) {
3392                 mutex_exit(&ds->ds_lock);
3393                 return (0);
3394         }
3395         /*
3396          * If they are requesting more space, and our current estimate
3397          * is over quota, they get to try again unless the actual
3398          * on-disk is over quota and there are no pending changes (which
3399          * may free up space for us).
3400          */
3401         if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) {
3402                 if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota)
3403                         error = ERESTART;
3404                 else
3405                         error = EDQUOT;
3406         }
3407         mutex_exit(&ds->ds_lock);
3408
3409         return (error);
3410 }
3411
3412 /* ARGSUSED */
3413 static int
3414 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
3415 {
3416         dsl_dataset_t *ds = arg1;
3417         dsl_prop_setarg_t *psa = arg2;
3418         int err;
3419
3420         if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
3421                 return (ENOTSUP);
3422
3423         if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3424                 return (err);
3425
3426         if (psa->psa_effective_value == 0)
3427                 return (0);
3428
3429         if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes ||
3430             psa->psa_effective_value < ds->ds_reserved)
3431                 return (ENOSPC);
3432
3433         return (0);
3434 }
3435
3436 extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
3437
3438 void
3439 dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3440 {
3441         dsl_dataset_t *ds = arg1;
3442         dsl_prop_setarg_t *psa = arg2;
3443         uint64_t effective_value = psa->psa_effective_value;
3444
3445         dsl_prop_set_sync(ds, psa, tx);
3446         DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3447
3448         if (ds->ds_quota != effective_value) {
3449                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3450                 ds->ds_quota = effective_value;
3451
3452                 spa_history_log_internal(LOG_DS_REFQUOTA,
3453                     ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu ",
3454                     (longlong_t)ds->ds_quota, ds->ds_object);
3455         }
3456 }
3457
3458 int
3459 dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota)
3460 {
3461         dsl_dataset_t *ds;
3462         dsl_prop_setarg_t psa;
3463         int err;
3464
3465         dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota);
3466
3467         err = dsl_dataset_hold(dsname, FTAG, &ds);
3468         if (err)
3469                 return (err);
3470
3471         /*
3472          * If someone removes a file, then tries to set the quota, we
3473          * want to make sure the file freeing takes effect.
3474          */
3475         txg_wait_open(ds->ds_dir->dd_pool, 0);
3476
3477         err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3478             dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
3479             ds, &psa, 0);
3480
3481         dsl_dataset_rele(ds, FTAG);
3482         return (err);
3483 }
3484
3485 static int
3486 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
3487 {
3488         dsl_dataset_t *ds = arg1;
3489         dsl_prop_setarg_t *psa = arg2;
3490         uint64_t effective_value;
3491         uint64_t unique;
3492         int err;
3493
3494         if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
3495             SPA_VERSION_REFRESERVATION)
3496                 return (ENOTSUP);
3497
3498         if (dsl_dataset_is_snapshot(ds))
3499                 return (EINVAL);
3500
3501         if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3502                 return (err);
3503
3504         effective_value = psa->psa_effective_value;
3505
3506         /*
3507          * If we are doing the preliminary check in open context, the
3508          * space estimates may be inaccurate.
3509          */
3510         if (!dmu_tx_is_syncing(tx))
3511                 return (0);
3512
3513         mutex_enter(&ds->ds_lock);
3514         if (!DS_UNIQUE_IS_ACCURATE(ds))
3515                 dsl_dataset_recalc_head_uniq(ds);
3516         unique = ds->ds_phys->ds_unique_bytes;
3517         mutex_exit(&ds->ds_lock);
3518
3519         if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) {
3520                 uint64_t delta = MAX(unique, effective_value) -
3521                     MAX(unique, ds->ds_reserved);
3522
3523                 if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
3524                         return (ENOSPC);
3525                 if (ds->ds_quota > 0 &&
3526                     effective_value > ds->ds_quota)
3527                         return (ENOSPC);
3528         }
3529
3530         return (0);
3531 }
3532
3533 static void
3534 dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3535 {
3536         dsl_dataset_t *ds = arg1;
3537         dsl_prop_setarg_t *psa = arg2;
3538         uint64_t effective_value = psa->psa_effective_value;
3539         uint64_t unique;
3540         int64_t delta;
3541
3542         dsl_prop_set_sync(ds, psa, tx);
3543         DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3544
3545         dmu_buf_will_dirty(ds->ds_dbuf, tx);
3546
3547         mutex_enter(&ds->ds_dir->dd_lock);
3548         mutex_enter(&ds->ds_lock);
3549         ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
3550         unique = ds->ds_phys->ds_unique_bytes;
3551         delta = MAX(0, (int64_t)(effective_value - unique)) -
3552             MAX(0, (int64_t)(ds->ds_reserved - unique));
3553         ds->ds_reserved = effective_value;
3554         mutex_exit(&ds->ds_lock);
3555
3556         dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
3557         mutex_exit(&ds->ds_dir->dd_lock);
3558
3559         spa_history_log_internal(LOG_DS_REFRESERV,
3560             ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu",
3561             (longlong_t)effective_value, ds->ds_object);
3562 }
3563
3564 int
3565 dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
3566     uint64_t reservation)
3567 {
3568         dsl_dataset_t *ds;
3569         dsl_prop_setarg_t psa;
3570         int err;
3571
3572         dsl_prop_setarg_init_uint64(&psa, "refreservation", source,
3573             &reservation);
3574
3575         err = dsl_dataset_hold(dsname, FTAG, &ds);
3576         if (err)
3577                 return (err);
3578
3579         err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3580             dsl_dataset_set_reservation_check,
3581             dsl_dataset_set_reservation_sync, ds, &psa, 0);
3582
3583         dsl_dataset_rele(ds, FTAG);
3584         return (err);
3585 }
3586
3587 typedef struct zfs_hold_cleanup_arg {
3588         dsl_pool_t *dp;
3589         uint64_t dsobj;
3590         char htag[MAXNAMELEN];
3591 } zfs_hold_cleanup_arg_t;
3592
3593 static void
3594 dsl_dataset_user_release_onexit(void *arg)
3595 {
3596         zfs_hold_cleanup_arg_t *ca = arg;
3597
3598         (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag,
3599             B_TRUE);
3600         kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
3601 }
3602
3603 void
3604 dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
3605     minor_t minor)
3606 {
3607         zfs_hold_cleanup_arg_t *ca;
3608
3609         ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP);
3610         ca->dp = ds->ds_dir->dd_pool;
3611         ca->dsobj = ds->ds_object;
3612         (void) strlcpy(ca->htag, htag, sizeof (ca->htag));
3613         VERIFY3U(0, ==, zfs_onexit_add_cb(minor,
3614             dsl_dataset_user_release_onexit, ca, NULL));
3615 }
3616
3617 /*
3618  * If you add new checks here, you may need to add
3619  * additional checks to the "temporary" case in
3620  * snapshot_check() in dmu_objset.c.
3621  */
3622 static int
3623 dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
3624 {
3625         dsl_dataset_t *ds = arg1;
3626         struct dsl_ds_holdarg *ha = arg2;
3627         char *htag = ha->htag;
3628         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3629         int error = 0;
3630
3631         if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
3632                 return (ENOTSUP);
3633
3634         if (!dsl_dataset_is_snapshot(ds))
3635                 return (EINVAL);
3636
3637         /* tags must be unique */
3638         mutex_enter(&ds->ds_lock);
3639         if (ds->ds_phys->ds_userrefs_obj) {
3640                 error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag,
3641                     8, 1, tx);
3642                 if (error == 0)
3643                         error = EEXIST;
3644                 else if (error == ENOENT)
3645                         error = 0;
3646         }
3647         mutex_exit(&ds->ds_lock);
3648
3649         if (error == 0 && ha->temphold &&
3650             strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
3651                 error = E2BIG;
3652
3653         return (error);
3654 }
3655
3656 void
3657 dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3658 {
3659         dsl_dataset_t *ds = arg1;
3660         struct dsl_ds_holdarg *ha = arg2;
3661         char *htag = ha->htag;
3662         dsl_pool_t *dp = ds->ds_dir->dd_pool;
3663         objset_t *mos = dp->dp_meta_objset;
3664         uint64_t now = gethrestime_sec();
3665         uint64_t zapobj;
3666
3667         mutex_enter(&ds->ds_lock);
3668         if (ds->ds_phys->ds_userrefs_obj == 0) {
3669                 /*
3670                  * This is the first user hold for this dataset.  Create
3671                  * the userrefs zap object.
3672                  */
3673                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3674                 zapobj = ds->ds_phys->ds_userrefs_obj =
3675                     zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
3676         } else {
3677                 zapobj = ds->ds_phys->ds_userrefs_obj;
3678         }
3679         ds->ds_userrefs++;
3680         mutex_exit(&ds->ds_lock);
3681
3682         VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx));
3683
3684         if (ha->temphold) {
3685                 VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object,
3686                     htag, &now, tx));
3687         }
3688
3689         spa_history_log_internal(LOG_DS_USER_HOLD,
3690             dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag,
3691             (int)ha->temphold, ds->ds_object);
3692 }
3693
3694 static int
3695 dsl_dataset_user_hold_one(const char *dsname, void *arg)
3696 {
3697         struct dsl_ds_holdarg *ha = arg;
3698         dsl_dataset_t *ds;
3699         int error;
3700         char *name;
3701
3702         /* alloc a buffer to hold dsname@snapname plus terminating NULL */
3703         name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3704         error = dsl_dataset_hold(name, ha->dstg, &ds);
3705         strfree(name);
3706         if (error == 0) {
3707                 ha->gotone = B_TRUE;
3708                 dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
3709                     dsl_dataset_user_hold_sync, ds, ha, 0);
3710         } else if (error == ENOENT && ha->recursive) {
3711                 error = 0;
3712         } else {
3713                 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3714         }
3715         return (error);
3716 }
3717
3718 int
3719 dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
3720     boolean_t temphold)
3721 {
3722         struct dsl_ds_holdarg *ha;
3723         int error;
3724
3725         ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3726         ha->htag = htag;
3727         ha->temphold = temphold;
3728         error = dsl_sync_task_do(ds->ds_dir->dd_pool,
3729             dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync,
3730             ds, ha, 0);
3731         kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3732
3733         return (error);
3734 }
3735
3736 int
3737 dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
3738     boolean_t recursive, boolean_t temphold, int cleanup_fd)
3739 {
3740         struct dsl_ds_holdarg *ha;
3741         dsl_sync_task_t *dst;
3742         spa_t *spa;
3743         int error;
3744         minor_t minor = 0;
3745
3746         if (cleanup_fd != -1) {
3747                 /* Currently we only support cleanup-on-exit of tempholds. */
3748                 if (!temphold)
3749                         return (EINVAL);
3750                 error = zfs_onexit_fd_hold(cleanup_fd, &minor);
3751                 if (error)
3752                         return (error);
3753         }
3754
3755         ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3756
3757         (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3758
3759         error = spa_open(dsname, &spa, FTAG);
3760         if (error) {
3761                 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3762                 if (cleanup_fd != -1)
3763                         zfs_onexit_fd_rele(cleanup_fd);
3764                 return (error);
3765         }
3766
3767         ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
3768         ha->htag = htag;
3769         ha->snapname = snapname;
3770         ha->recursive = recursive;
3771         ha->temphold = temphold;
3772
3773         if (recursive) {
3774                 error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
3775                     ha, DS_FIND_CHILDREN);
3776         } else {
3777                 error = dsl_dataset_user_hold_one(dsname, ha);
3778         }
3779         if (error == 0)
3780                 error = dsl_sync_task_group_wait(ha->dstg);
3781
3782         for (dst = list_head(&ha->dstg->dstg_tasks); dst;
3783             dst = list_next(&ha->dstg->dstg_tasks, dst)) {
3784                 dsl_dataset_t *ds = dst->dst_arg1;
3785
3786                 if (dst->dst_err) {
3787                         dsl_dataset_name(ds, ha->failed);
3788                         *strchr(ha->failed, '@') = '\0';
3789                 } else if (error == 0 && minor != 0 && temphold) {
3790                         /*
3791                          * If this hold is to be released upon process exit,
3792                          * register that action now.
3793                          */
3794                         dsl_register_onexit_hold_cleanup(ds, htag, minor);
3795                 }
3796                 dsl_dataset_rele(ds, ha->dstg);
3797         }
3798
3799         if (error == 0 && recursive && !ha->gotone)
3800                 error = ENOENT;
3801
3802         if (error)
3803                 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
3804
3805         dsl_sync_task_group_destroy(ha->dstg);
3806
3807         kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3808         spa_close(spa, FTAG);
3809         if (cleanup_fd != -1)
3810                 zfs_onexit_fd_rele(cleanup_fd);
3811         return (error);
3812 }
3813
3814 struct dsl_ds_releasearg {
3815         dsl_dataset_t *ds;
3816         const char *htag;
3817         boolean_t own;          /* do we own or just hold ds? */
3818 };
3819
3820 static int
3821 dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
3822     boolean_t *might_destroy)
3823 {
3824         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3825         uint64_t zapobj;
3826         uint64_t tmp;
3827         int error;
3828
3829         *might_destroy = B_FALSE;
3830
3831         mutex_enter(&ds->ds_lock);
3832         zapobj = ds->ds_phys->ds_userrefs_obj;
3833         if (zapobj == 0) {
3834                 /* The tag can't possibly exist */
3835                 mutex_exit(&ds->ds_lock);
3836                 return (ESRCH);
3837         }
3838
3839         /* Make sure the tag exists */
3840         error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp);
3841         if (error) {
3842                 mutex_exit(&ds->ds_lock);
3843                 if (error == ENOENT)
3844                         error = ESRCH;
3845                 return (error);
3846         }
3847
3848         if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 &&
3849             DS_IS_DEFER_DESTROY(ds))
3850                 *might_destroy = B_TRUE;
3851
3852         mutex_exit(&ds->ds_lock);
3853         return (0);
3854 }
3855
3856 static int
3857 dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
3858 {
3859         struct dsl_ds_releasearg *ra = arg1;
3860         dsl_dataset_t *ds = ra->ds;
3861         boolean_t might_destroy;
3862         int error;
3863
3864         if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
3865                 return (ENOTSUP);
3866
3867         error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy);
3868         if (error)
3869                 return (error);
3870
3871         if (might_destroy) {
3872                 struct dsl_ds_destroyarg dsda = {0};
3873
3874                 if (dmu_tx_is_syncing(tx)) {
3875                         /*
3876                          * If we're not prepared to remove the snapshot,
3877                          * we can't allow the release to happen right now.
3878                          */
3879                         if (!ra->own)
3880                                 return (EBUSY);
3881                 }
3882                 dsda.ds = ds;
3883                 dsda.releasing = B_TRUE;
3884                 return (dsl_dataset_destroy_check(&dsda, tag, tx));
3885         }
3886
3887         return (0);
3888 }
3889
3890 static void
3891 dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
3892 {
3893         struct dsl_ds_releasearg *ra = arg1;
3894         dsl_dataset_t *ds = ra->ds;
3895         dsl_pool_t *dp = ds->ds_dir->dd_pool;
3896         objset_t *mos = dp->dp_meta_objset;
3897         uint64_t zapobj;
3898         uint64_t dsobj = ds->ds_object;
3899         uint64_t refs;
3900         int error;
3901
3902         mutex_enter(&ds->ds_lock);
3903         ds->ds_userrefs--;
3904         refs = ds->ds_userrefs;
3905         mutex_exit(&ds->ds_lock);
3906         error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx);
3907         VERIFY(error == 0 || error == ENOENT);
3908         zapobj = ds->ds_phys->ds_userrefs_obj;
3909         VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));
3910         if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
3911             DS_IS_DEFER_DESTROY(ds)) {
3912                 struct dsl_ds_destroyarg dsda = {0};
3913
3914                 ASSERT(ra->own);
3915                 dsda.ds = ds;
3916                 dsda.releasing = B_TRUE;
3917                 /* We already did the destroy_check */
3918                 dsl_dataset_destroy_sync(&dsda, tag, tx);
3919         }
3920
3921         spa_history_log_internal(LOG_DS_USER_RELEASE,
3922             dp->dp_spa, tx, "<%s> %lld dataset = %llu",
3923             ra->htag, (longlong_t)refs, dsobj);
3924 }
3925
3926 static int
3927 dsl_dataset_user_release_one(const char *dsname, void *arg)
3928 {
3929         struct dsl_ds_holdarg *ha = arg;
3930         struct dsl_ds_releasearg *ra;
3931         dsl_dataset_t *ds;
3932         int error;
3933         void *dtag = ha->dstg;
3934         char *name;
3935         boolean_t own = B_FALSE;
3936         boolean_t might_destroy;
3937
3938         /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */
3939         name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3940         error = dsl_dataset_hold(name, dtag, &ds);
3941         strfree(name);
3942         if (error == ENOENT && ha->recursive)
3943                 return (0);
3944         (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3945         if (error)
3946                 return (error);
3947
3948         ha->gotone = B_TRUE;
3949
3950         ASSERT(dsl_dataset_is_snapshot(ds));
3951
3952         error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
3953         if (error) {
3954                 dsl_dataset_rele(ds, dtag);
3955                 return (error);
3956         }
3957
3958         if (might_destroy) {
3959 #ifdef _KERNEL
3960                 name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3961                 error = zfs_unmount_snap(name, NULL);
3962                 strfree(name);
3963                 if (error) {
3964                         dsl_dataset_rele(ds, dtag);
3965                         return (error);
3966                 }
3967 #endif
3968                 if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
3969                         dsl_dataset_rele(ds, dtag);
3970                         return (EBUSY);
3971                 } else {
3972                         own = B_TRUE;
3973                         dsl_dataset_make_exclusive(ds, dtag);
3974                 }
3975         }
3976
3977         ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
3978         ra->ds = ds;
3979         ra->htag = ha->htag;
3980         ra->own = own;
3981         dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
3982             dsl_dataset_user_release_sync, ra, dtag, 0);
3983
3984         return (0);
3985 }
3986
3987 int
3988 dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
3989     boolean_t recursive)
3990 {
3991         struct dsl_ds_holdarg *ha;
3992         dsl_sync_task_t *dst;
3993         spa_t *spa;
3994         int error;
3995
3996 top:
3997         ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3998
3999         (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
4000
4001         error = spa_open(dsname, &spa, FTAG);
4002         if (error) {
4003                 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
4004                 return (error);
4005         }
4006
4007         ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
4008         ha->htag = htag;
4009         ha->snapname = snapname;
4010         ha->recursive = recursive;
4011         if (recursive) {
4012                 error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
4013                     ha, DS_FIND_CHILDREN);
4014         } else {
4015                 error = dsl_dataset_user_release_one(dsname, ha);
4016         }
4017         if (error == 0)
4018                 error = dsl_sync_task_group_wait(ha->dstg);
4019
4020         for (dst = list_head(&ha->dstg->dstg_tasks); dst;
4021             dst = list_next(&ha->dstg->dstg_tasks, dst)) {
4022                 struct dsl_ds_releasearg *ra = dst->dst_arg1;
4023                 dsl_dataset_t *ds = ra->ds;
4024
4025                 if (dst->dst_err)
4026                         dsl_dataset_name(ds, ha->failed);
4027
4028                 if (ra->own)
4029                         dsl_dataset_disown(ds, ha->dstg);
4030                 else
4031                         dsl_dataset_rele(ds, ha->dstg);
4032
4033                 kmem_free(ra, sizeof (struct dsl_ds_releasearg));
4034         }
4035
4036         if (error == 0 && recursive && !ha->gotone)
4037                 error = ENOENT;
4038
4039         if (error && error != EBUSY)
4040                 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
4041
4042         dsl_sync_task_group_destroy(ha->dstg);
4043         kmem_free(ha, sizeof (struct dsl_ds_holdarg));
4044         spa_close(spa, FTAG);
4045
4046         /*
4047          * We can get EBUSY if we were racing with deferred destroy and
4048          * dsl_dataset_user_release_check() hadn't done the necessary
4049          * open context setup.  We can also get EBUSY if we're racing
4050          * with destroy and that thread is the ds_owner.  Either way
4051          * the busy condition should be transient, and we should retry
4052          * the release operation.
4053          */
4054         if (error == EBUSY)
4055                 goto top;
4056
4057         return (error);
4058 }
4059
4060 /*
4061  * Called at spa_load time (with retry == B_FALSE) to release a stale
4062  * temporary user hold. Also called by the onexit code (with retry == B_TRUE).
4063  */
4064 int
4065 dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
4066     boolean_t retry)
4067 {
4068         dsl_dataset_t *ds;
4069         char *snap;
4070         char *name;
4071         int namelen;
4072         int error;
4073
4074         do {
4075                 rw_enter(&dp->dp_config_rwlock, RW_READER);
4076                 error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
4077                 rw_exit(&dp->dp_config_rwlock);
4078                 if (error)
4079                         return (error);
4080                 namelen = dsl_dataset_namelen(ds)+1;
4081                 name = kmem_alloc(namelen, KM_SLEEP);
4082                 dsl_dataset_name(ds, name);
4083                 dsl_dataset_rele(ds, FTAG);
4084
4085                 snap = strchr(name, '@');
4086                 *snap = '\0';
4087                 ++snap;
4088                 error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
4089                 kmem_free(name, namelen);
4090
4091                 /*
4092                  * The object can't have been destroyed because we have a hold,
4093                  * but it might have been renamed, resulting in ENOENT.  Retry
4094                  * if we've been requested to do so.
4095                  *
4096                  * It would be nice if we could use the dsobj all the way
4097                  * through and avoid ENOENT entirely.  But we might need to
4098                  * unmount the snapshot, and there's currently no way to lookup
4099                  * a vfsp using a ZFS object id.
4100                  */
4101         } while ((error == ENOENT) && retry);
4102
4103         return (error);
4104 }
4105
4106 int
4107 dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
4108 {
4109         dsl_dataset_t *ds;
4110         int err;
4111
4112         err = dsl_dataset_hold(dsname, FTAG, &ds);
4113         if (err)
4114                 return (err);
4115
4116         VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
4117         if (ds->ds_phys->ds_userrefs_obj != 0) {
4118                 zap_attribute_t *za;
4119                 zap_cursor_t zc;
4120
4121                 za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
4122                 for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
4123                     ds->ds_phys->ds_userrefs_obj);
4124                     zap_cursor_retrieve(&zc, za) == 0;
4125                     zap_cursor_advance(&zc)) {
4126                         VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
4127                             za->za_first_integer));
4128                 }
4129                 zap_cursor_fini(&zc);
4130                 kmem_free(za, sizeof (zap_attribute_t));
4131         }
4132         dsl_dataset_rele(ds, FTAG);
4133         return (0);
4134 }
4135
4136 /*
4137  * Note, this function is used as the callback for dmu_objset_find().  We
4138  * always return 0 so that we will continue to find and process
4139  * inconsistent datasets, even if we encounter an error trying to
4140  * process one of them.
4141  */
4142 /* ARGSUSED */
4143 int
4144 dsl_destroy_inconsistent(const char *dsname, void *arg)
4145 {
4146         dsl_dataset_t *ds;
4147
4148         if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
4149                 if (DS_IS_INCONSISTENT(ds))
4150                         (void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
4151                 else
4152                         dsl_dataset_disown(ds, FTAG);
4153         }
4154         return (0);
4155 }
4156
4157 /*
4158  * Return (in *usedp) the amount of space written in new that is not
4159  * present in oldsnap.  New may be a snapshot or the head.  Old must be
4160  * a snapshot before new, in new's filesystem (or its origin).  If not then
4161  * fail and return EINVAL.
4162  *
4163  * The written space is calculated by considering two components:  First, we
4164  * ignore any freed space, and calculate the written as new's used space
4165  * minus old's used space.  Next, we add in the amount of space that was freed
4166  * between the two snapshots, thus reducing new's used space relative to old's.
4167  * Specifically, this is the space that was born before old->ds_creation_txg,
4168  * and freed before new (ie. on new's deadlist or a previous deadlist).
4169  *
4170  * space freed                         [---------------------]
4171  * snapshots                       ---O-------O--------O-------O------
4172  *                                         oldsnap            new
4173  */
4174 int
4175 dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
4176     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
4177 {
4178         int err = 0;
4179         uint64_t snapobj;
4180         dsl_pool_t *dp = new->ds_dir->dd_pool;
4181
4182         *usedp = 0;
4183         *usedp += new->ds_phys->ds_used_bytes;
4184         *usedp -= oldsnap->ds_phys->ds_used_bytes;
4185
4186         *compp = 0;
4187         *compp += new->ds_phys->ds_compressed_bytes;
4188         *compp -= oldsnap->ds_phys->ds_compressed_bytes;
4189
4190         *uncompp = 0;
4191         *uncompp += new->ds_phys->ds_uncompressed_bytes;
4192         *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;
4193
4194         rw_enter(&dp->dp_config_rwlock, RW_READER);
4195         snapobj = new->ds_object;
4196         while (snapobj != oldsnap->ds_object) {
4197                 dsl_dataset_t *snap;
4198                 uint64_t used, comp, uncomp;
4199
4200                 err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
4201                 if (err != 0)
4202                         break;
4203
4204                 if (snap->ds_phys->ds_prev_snap_txg ==
4205                     oldsnap->ds_phys->ds_creation_txg) {
4206                         /*
4207                          * The blocks in the deadlist can not be born after
4208                          * ds_prev_snap_txg, so get the whole deadlist space,
4209                          * which is more efficient (especially for old-format
4210                          * deadlists).  Unfortunately the deadlist code
4211                          * doesn't have enough information to make this
4212                          * optimization itself.
4213                          */
4214                         dsl_deadlist_space(&snap->ds_deadlist,
4215                             &used, &comp, &uncomp);
4216                 } else {
4217                         dsl_deadlist_space_range(&snap->ds_deadlist,
4218                             0, oldsnap->ds_phys->ds_creation_txg,
4219                             &used, &comp, &uncomp);
4220                 }
4221                 *usedp += used;
4222                 *compp += comp;
4223                 *uncompp += uncomp;
4224
4225                 /*
4226                  * If we get to the beginning of the chain of snapshots
4227                  * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
4228                  * was not a snapshot of/before new.
4229                  */
4230                 snapobj = snap->ds_phys->ds_prev_snap_obj;
4231                 dsl_dataset_rele(snap, FTAG);
4232                 if (snapobj == 0) {
4233                         err = EINVAL;
4234                         break;
4235                 }
4236
4237         }
4238         rw_exit(&dp->dp_config_rwlock);
4239         return (err);
4240 }
4241
4242 /*
4243  * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
4244  * lastsnap, and all snapshots in between are deleted.
4245  *
4246  * blocks that would be freed            [---------------------------]
4247  * snapshots                       ---O-------O--------O-------O--------O
4248  *                                        firstsnap        lastsnap
4249  *
4250  * This is the set of blocks that were born after the snap before firstsnap,
4251  * (birth > firstsnap->prev_snap_txg) and died before the snap after the
4252  * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
4253  * We calculate this by iterating over the relevant deadlists (from the snap
4254  * after lastsnap, backward to the snap after firstsnap), summing up the
4255  * space on the deadlist that was born after the snap before firstsnap.
4256  */
4257 int
4258 dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
4259     dsl_dataset_t *lastsnap,
4260     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
4261 {
4262         int err = 0;
4263         uint64_t snapobj;
4264         dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
4265
4266         ASSERT(dsl_dataset_is_snapshot(firstsnap));
4267         ASSERT(dsl_dataset_is_snapshot(lastsnap));
4268
4269         /*
4270          * Check that the snapshots are in the same dsl_dir, and firstsnap
4271          * is before lastsnap.
4272          */
4273         if (firstsnap->ds_dir != lastsnap->ds_dir ||
4274             firstsnap->ds_phys->ds_creation_txg >
4275             lastsnap->ds_phys->ds_creation_txg)
4276                 return (EINVAL);
4277
4278         *usedp = *compp = *uncompp = 0;
4279
4280         rw_enter(&dp->dp_config_rwlock, RW_READER);
4281         snapobj = lastsnap->ds_phys->ds_next_snap_obj;
4282         while (snapobj != firstsnap->ds_object) {
4283                 dsl_dataset_t *ds;
4284                 uint64_t used, comp, uncomp;
4285
4286                 err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
4287                 if (err != 0)
4288                         break;
4289
4290                 dsl_deadlist_space_range(&ds->ds_deadlist,
4291                     firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
4292                     &used, &comp, &uncomp);
4293                 *usedp += used;
4294                 *compp += comp;
4295                 *uncompp += uncomp;
4296
4297                 snapobj = ds->ds_phys->ds_prev_snap_obj;
4298                 ASSERT3U(snapobj, !=, 0);
4299                 dsl_dataset_rele(ds, FTAG);
4300         }
4301         rw_exit(&dp->dp_config_rwlock);
4302         return (err);
4303 }