]> CyberLeo.Net >> Repos - FreeBSD/stable/8.git/blob - sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
MFC r209962, r211970-r211972, r212050, r212605, r212611
[FreeBSD/stable/8.git] / sys / cddl / contrib / opensolaris / uts / common / fs / zfs / dsl_dataset.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25
26 #include <sys/dmu_objset.h>
27 #include <sys/dsl_dataset.h>
28 #include <sys/dsl_dir.h>
29 #include <sys/dsl_prop.h>
30 #include <sys/dsl_synctask.h>
31 #include <sys/dmu_traverse.h>
32 #include <sys/dmu_tx.h>
33 #include <sys/arc.h>
34 #include <sys/zio.h>
35 #include <sys/zap.h>
36 #include <sys/unique.h>
37 #include <sys/zfs_context.h>
38 #include <sys/zfs_ioctl.h>
39 #include <sys/spa.h>
40 #include <sys/zfs_znode.h>
41 #include <sys/sunddi.h>
42
43 static char *dsl_reaper = "the grim reaper";
44
45 static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
46 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
47 static dsl_checkfunc_t dsl_dataset_rollback_check;
48 static dsl_syncfunc_t dsl_dataset_rollback_sync;
49 static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
50
51 #define DS_REF_MAX      (1ULL << 62)
52
53 #define DSL_DEADLIST_BLOCKSIZE  SPA_MAXBLOCKSIZE
54
55 #define DSL_DATASET_IS_DESTROYED(ds)    ((ds)->ds_owner == dsl_reaper)
56
57
58 /*
59  * Figure out how much of this delta should be propagated to the dsl_dir
60  * layer.  If there's a refreservation, that space has already been
61  * partially accounted for in our ancestors.
62  */
63 static int64_t
64 parent_delta(dsl_dataset_t *ds, int64_t delta)
65 {
66         uint64_t old_bytes, new_bytes;
67
68         if (ds->ds_reserved == 0)
69                 return (delta);
70
71         old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
72         new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
73
74         ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
75         return (new_bytes - old_bytes);
76 }
77
/*
 * Charge a newly written block to this dataset's space accounting
 * (used/compressed/uncompressed/unique bytes) and propagate the change
 * to the dsl_dir layer.  A NULL ds means the block belongs to the
 * meta-objset.  Syncing context only.
 */
void
dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
{
	int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);
	int64_t delta;

	dprintf_bp(bp, "born, ds=%p\n", ds);

	ASSERT(dmu_tx_is_syncing(tx));
	/* It could have been compressed away to nothing */
	if (BP_IS_HOLE(bp))
		return;
	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
	ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
	if (ds == NULL) {
		/*
		 * Account for the meta-objset space in its placeholder
		 * dsl_dir.
		 */
		ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
		    used, compressed, uncompressed, tx);
		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
		return;
	}
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	/* Lock order: dd_lock before ds_lock. */
	mutex_enter(&ds->ds_dir->dd_lock);
	mutex_enter(&ds->ds_lock);
	/*
	 * With a refreservation, only part of this delta propagates to
	 * the parent dir (see parent_delta()).
	 */
	delta = parent_delta(ds, used);
	ds->ds_phys->ds_used_bytes += used;
	ds->ds_phys->ds_compressed_bytes += compressed;
	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
	ds->ds_phys->ds_unique_bytes += used;
	mutex_exit(&ds->ds_lock);
	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
	    compressed, uncompressed, tx);
	/* The remainder was already covered by the refreservation. */
	dsl_dir_transfer_space(ds->ds_dir, used - delta,
	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
	mutex_exit(&ds->ds_dir->dd_lock);
}
120
/*
 * Remove a block from this dataset: either free it now (if it was born
 * after the most recent snapshot) or move it to our deadlist (if the
 * previous snapshot still references it), then undo its space
 * accounting.  Returns the number of bytes accounted.  A NULL ds means
 * the block belongs to the meta-objset.  Syncing context only.
 */
int
dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
    dmu_tx_t *tx)
{
	int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);

	ASSERT(pio != NULL);
	ASSERT(dmu_tx_is_syncing(tx));
	/* No block pointer => nothing to free */
	if (BP_IS_HOLE(bp))
		return (0);

	ASSERT(used > 0);
	if (ds == NULL) {
		int err;
		/*
		 * Account for the meta-objset space in its placeholder
		 * dataset.
		 */
		err = dsl_free(pio, tx->tx_pool,
		    tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT);
		ASSERT(err == 0);

		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
		    -used, -compressed, -uncompressed, tx);
		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
		return (used);
	}
	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);

	ASSERT(!dsl_dataset_is_snapshot(ds));
	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
		int err;
		int64_t delta;

		/*
		 * Born after the most recent snapshot, so no snapshot
		 * references this block: free it immediately.
		 */
		dprintf_bp(bp, "freeing: %s", "");
		err = dsl_free(pio, tx->tx_pool,
		    tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT);
		ASSERT(err == 0);

		/* Lock order: dd_lock before ds_lock. */
		mutex_enter(&ds->ds_dir->dd_lock);
		mutex_enter(&ds->ds_lock);
		ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
		    !DS_UNIQUE_IS_ACCURATE(ds));
		delta = parent_delta(ds, -used);
		ds->ds_phys->ds_unique_bytes -= used;
		mutex_exit(&ds->ds_lock);
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
		    delta, -compressed, -uncompressed, tx);
		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
		    DD_USED_REFRSRV, DD_USED_HEAD, tx);
		mutex_exit(&ds->ds_dir->dd_lock);
	} else {
		/*
		 * The previous snapshot still references this block, so
		 * it can't be freed yet; enqueue it on our deadlist.
		 */
		dprintf_bp(bp, "putting on dead list: %s", "");
		VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
		ASSERT3U(ds->ds_prev->ds_object, ==,
		    ds->ds_phys->ds_prev_snap_obj);
		ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
		if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
		    ds->ds_object && bp->blk_birth >
		    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
			/*
			 * The block is now referenced only by the
			 * previous snapshot: it becomes unique to it.
			 */
			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
			mutex_enter(&ds->ds_prev->ds_lock);
			ds->ds_prev->ds_phys->ds_unique_bytes += used;
			mutex_exit(&ds->ds_prev->ds_lock);
		}
		if (bp->blk_birth > ds->ds_origin_txg) {
			/* Space moves from head usage to snapshot usage. */
			dsl_dir_transfer_space(ds->ds_dir, used,
			    DD_USED_HEAD, DD_USED_SNAP, tx);
		}
	}
	mutex_enter(&ds->ds_lock);
	ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
	ds->ds_phys->ds_used_bytes -= used;
	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
	ds->ds_phys->ds_compressed_bytes -= compressed;
	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
	mutex_exit(&ds->ds_lock);

	return (used);
}
208
209 uint64_t
210 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
211 {
212         uint64_t trysnap = 0;
213
214         if (ds == NULL)
215                 return (0);
216         /*
217          * The snapshot creation could fail, but that would cause an
218          * incorrect FALSE return, which would only result in an
219          * overestimation of the amount of space that an operation would
220          * consume, which is OK.
221          *
222          * There's also a small window where we could miss a pending
223          * snapshot, because we could set the sync task in the quiescing
224          * phase.  So this should only be used as a guess.
225          */
226         if (ds->ds_trysnap_txg >
227             spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
228                 trysnap = ds->ds_trysnap_txg;
229         return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
230 }
231
232 boolean_t
233 dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
234 {
235         return (blk_birth > dsl_dataset_prev_snap_txg(ds));
236 }
237
/* ARGSUSED */
/*
 * dmu_buf user-eviction callback: tear down the in-core dsl_dataset_t
 * once its bonus buffer is evicted.  Only legal when the dataset is
 * unowned or already marked destroyed.
 */
static void
dsl_dataset_evict(dmu_buf_t *db, void *dsv)
{
	dsl_dataset_t *ds = dsv;

	ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));

	dprintf_ds(ds, "evicting %s\n", "");

	/* Release the fsid slot taken in dsl_dataset_get_ref(). */
	unique_remove(ds->ds_fsid_guid);

	if (ds->ds_user_ptr != NULL)
		ds->ds_user_evict_func(ds, ds->ds_user_ptr);

	/* Drop the reference we keep on our most recent snapshot. */
	if (ds->ds_prev) {
		dsl_dataset_drop_ref(ds->ds_prev, ds);
		ds->ds_prev = NULL;
	}

	bplist_close(&ds->ds_deadlist);
	if (ds->ds_dir)
		dsl_dir_close(ds->ds_dir, ds);

	ASSERT(!list_link_active(&ds->ds_synced_link));

	/*
	 * These locks may still be held here (presumably by the
	 * destroy path); release them before destroying.
	 */
	if (mutex_owned(&ds->ds_lock))
		mutex_exit(&ds->ds_lock);
	mutex_destroy(&ds->ds_lock);
	if (mutex_owned(&ds->ds_opening_lock))
		mutex_exit(&ds->ds_opening_lock);
	mutex_destroy(&ds->ds_opening_lock);
	if (mutex_owned(&ds->ds_deadlist.bpl_lock))
		mutex_exit(&ds->ds_deadlist.bpl_lock);
	mutex_destroy(&ds->ds_deadlist.bpl_lock);
	rw_destroy(&ds->ds_rwlock);
	cv_destroy(&ds->ds_exclusive_cv);

	kmem_free(ds, sizeof (dsl_dataset_t));
}
278
/*
 * Fill in ds->ds_snapname by reverse-searching the head dataset's
 * snapnames zap for our object number.  No-op if the name is already
 * cached or ds_next_snap_obj is 0 (i.e. ds is not a snapshot).
 */
static int
dsl_dataset_get_snapname(dsl_dataset_t *ds)
{
	dsl_dataset_phys_t *headphys;
	int err;
	dmu_buf_t *headdbuf;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;

	if (ds->ds_snapname[0])
		return (0);
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (0);

	/* The snapnames zap lives on our dsl_dir's head dataset. */
	err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
	    FTAG, &headdbuf);
	if (err)
		return (err);
	headphys = headdbuf->db_data;
	err = zap_value_search(dp->dp_meta_objset,
	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
	dmu_buf_rele(headdbuf, FTAG);
	return (err);
}
303
304 static int
305 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
306 {
307         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
308         uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
309         matchtype_t mt;
310         int err;
311
312         if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
313                 mt = MT_FIRST;
314         else
315                 mt = MT_EXACT;
316
317         err = zap_lookup_norm(mos, snapobj, name, 8, 1,
318             value, mt, NULL, 0, NULL);
319         if (err == ENOTSUP && mt == MT_FIRST)
320                 err = zap_lookup(mos, snapobj, name, 8, 1, value);
321         return (err);
322 }
323
324 static int
325 dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
326 {
327         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
328         uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
329         matchtype_t mt;
330         int err;
331
332         if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
333                 mt = MT_FIRST;
334         else
335                 mt = MT_EXACT;
336
337         err = zap_remove_norm(mos, snapobj, name, mt, tx);
338         if (err == ENOTSUP && mt == MT_FIRST)
339                 err = zap_remove(mos, snapobj, name, tx);
340         return (err);
341 }
342
/*
 * Look up (instantiating on first open) the in-core dsl_dataset_t for
 * object "dsobj" and return a raw reference on it in *dsp.  The
 * reference is a hold on the dataset's bonus buffer; callers normally
 * follow up with dsl_dataset_hold_ref().  Caller must hold
 * dp_config_rwlock or be in syncing context.
 */
static int
dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
    dsl_dataset_t **dsp)
{
	objset_t *mos = dp->dp_meta_objset;
	dmu_buf_t *dbuf;
	dsl_dataset_t *ds;
	int err;

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
	    dsl_pool_sync_context(dp));

	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
	if (err)
		return (err);
	ds = dmu_buf_get_user(dbuf);
	if (ds == NULL) {
		/* First opener: build the in-core state. */
		dsl_dataset_t *winner;

		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
		ds->ds_dbuf = dbuf;
		ds->ds_object = dsobj;
		ds->ds_phys = dbuf->db_data;

		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
		    NULL);
		rw_init(&ds->ds_rwlock, 0, 0, 0);
		cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);

		err = bplist_open(&ds->ds_deadlist,
		    mos, ds->ds_phys->ds_deadlist_obj);
		if (err == 0) {
			err = dsl_dir_open_obj(dp,
			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
		}
		if (err) {
			/*
			 * we don't really need to close the blist if we
			 * just opened it.
			 */
			mutex_destroy(&ds->ds_lock);
			mutex_destroy(&ds->ds_opening_lock);
			mutex_destroy(&ds->ds_deadlist.bpl_lock);
			rw_destroy(&ds->ds_rwlock);
			cv_destroy(&ds->ds_exclusive_cv);
			kmem_free(ds, sizeof (dsl_dataset_t));
			dmu_buf_rele(dbuf, tag);
			return (err);
		}

		if (!dsl_dataset_is_snapshot(ds)) {
			ds->ds_snapname[0] = '\0';
			if (ds->ds_phys->ds_prev_snap_obj) {
				/* Keep a ref on our most recent snapshot. */
				err = dsl_dataset_get_ref(dp,
				    ds->ds_phys->ds_prev_snap_obj,
				    ds, &ds->ds_prev);
			}

			if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) {
				dsl_dataset_t *origin;

				/* Cache the origin snapshot's birth txg. */
				err = dsl_dataset_hold_obj(dp,
				    ds->ds_dir->dd_phys->dd_origin_obj,
				    FTAG, &origin);
				if (err == 0) {
					ds->ds_origin_txg =
					    origin->ds_phys->ds_creation_txg;
					dsl_dataset_rele(origin, FTAG);
				}
			}
		} else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
			err = dsl_dataset_get_snapname(ds);
		}

		if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
			/*
			 * In sync context, we're called with either no lock
			 * or with the write lock.  If we're not syncing,
			 * we're always called with the read lock held.
			 */
			boolean_t need_lock =
			    !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
			    dsl_pool_sync_context(dp);

			if (need_lock)
				rw_enter(&dp->dp_config_rwlock, RW_READER);

			err = dsl_prop_get_ds(ds,
			    "refreservation", sizeof (uint64_t), 1,
			    &ds->ds_reserved, NULL);
			if (err == 0) {
				err = dsl_prop_get_ds(ds,
				    "refquota", sizeof (uint64_t), 1,
				    &ds->ds_quota, NULL);
			}

			if (need_lock)
				rw_exit(&dp->dp_config_rwlock);
		} else {
			ds->ds_reserved = ds->ds_quota = 0;
		}

		if (err == 0) {
			/*
			 * Publish our dsl_dataset_t on the dbuf; winner
			 * is non-NULL if another thread beat us to it.
			 */
			winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
			    dsl_dataset_evict);
		}
		if (err || winner) {
			/* Undo our setup; adopt the winner's copy if any. */
			bplist_close(&ds->ds_deadlist);
			if (ds->ds_prev)
				dsl_dataset_drop_ref(ds->ds_prev, ds);
			dsl_dir_close(ds->ds_dir, ds);
			mutex_destroy(&ds->ds_lock);
			mutex_destroy(&ds->ds_opening_lock);
			mutex_destroy(&ds->ds_deadlist.bpl_lock);
			rw_destroy(&ds->ds_rwlock);
			cv_destroy(&ds->ds_exclusive_cv);
			kmem_free(ds, sizeof (dsl_dataset_t));
			if (err) {
				dmu_buf_rele(dbuf, tag);
				return (err);
			}
			ds = winner;
		} else {
			ds->ds_fsid_guid =
			    unique_insert(ds->ds_phys->ds_fsid_guid);
		}
	}
	ASSERT3P(ds->ds_dbuf, ==, dbuf);
	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
	mutex_enter(&ds->ds_lock);
	if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
		/* Dataset is being destroyed; pretend it's gone. */
		mutex_exit(&ds->ds_lock);
		dmu_buf_rele(ds->ds_dbuf, tag);
		return (ENOENT);
	}
	mutex_exit(&ds->ds_lock);
	*dsp = ds;
	return (0);
}
487
/*
 * Upgrade a raw reference from dsl_dataset_get_ref() to a full hold by
 * taking ds_rwlock as READER.  May block while a destroy holds the
 * lock as WRITER; returns ENOENT (dropping the reference) if the
 * dataset is destroyed while we wait.  Outside syncing context the
 * caller holds dp_config_rwlock as READER.
 */
static int
dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/*
	 * In syncing context we don't want the rwlock lock: there
	 * may be an existing writer waiting for sync phase to
	 * finish.  We don't need to worry about such writers, since
	 * sync phase is single-threaded, so the writer can't be
	 * doing anything while we are active.
	 */
	if (dsl_pool_sync_context(dp)) {
		ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
		return (0);
	}

	/*
	 * Normal users will hold the ds_rwlock as a READER until they
	 * are finished (i.e., call dsl_dataset_rele()).  "Owners" will
	 * drop their READER lock after they set the ds_owner field.
	 *
	 * If the dataset is being destroyed, the destroy thread will
	 * obtain a WRITER lock for exclusive access after it's done its
	 * open-context work and then change the ds_owner to
	 * dsl_reaper once destruction is assured.  So threads
	 * may block here temporarily, until the "destructability" of
	 * the dataset is determined.
	 */
	ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
	mutex_enter(&ds->ds_lock);
	while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
		/* Drop dp_config_rwlock so the writer can make progress. */
		rw_exit(&dp->dp_config_rwlock);
		cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
		if (DSL_DATASET_IS_DESTROYED(ds)) {
			mutex_exit(&ds->ds_lock);
			dsl_dataset_drop_ref(ds, tag);
			rw_enter(&dp->dp_config_rwlock, RW_READER);
			return (ENOENT);
		}
		/*
		 * The dp_config_rwlock lives above the ds_lock. And
		 * we need to check DSL_DATASET_IS_DESTROYED() while
		 * holding the ds_lock, so we have to drop and reacquire
		 * the ds_lock here.
		 */
		mutex_exit(&ds->ds_lock);
		rw_enter(&dp->dp_config_rwlock, RW_READER);
		mutex_enter(&ds->ds_lock);
	}
	mutex_exit(&ds->ds_lock);
	return (0);
}
541
542 int
543 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
544     dsl_dataset_t **dsp)
545 {
546         int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);
547
548         if (err)
549                 return (err);
550         return (dsl_dataset_hold_ref(*dsp, tag));
551 }
552
553 int
554 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner,
555     dsl_dataset_t **dsp)
556 {
557         int err = dsl_dataset_hold_obj(dp, dsobj, owner, dsp);
558
559         ASSERT(DS_MODE_TYPE(flags) != DS_MODE_USER);
560
561         if (err)
562                 return (err);
563         if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) {
564                 dsl_dataset_rele(*dsp, owner);
565                 *dsp = NULL;
566                 return (EBUSY);
567         }
568         return (0);
569 }
570
/*
 * Hold the dataset named by "name" — or, for "fs@snap" names, the
 * snapshot — returning it in *dsp.  Release with dsl_dataset_rele().
 */
int
dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
{
	dsl_dir_t *dd;
	dsl_pool_t *dp;
	const char *snapname;
	uint64_t obj;
	int err = 0;

	/* snapname points past the dir portion ("@snap"), or is NULL. */
	err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
	if (err)
		return (err);

	dp = dd->dd_pool;
	obj = dd->dd_phys->dd_head_dataset_obj;
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	if (obj)
		err = dsl_dataset_get_ref(dp, obj, tag, dsp);
	else
		err = ENOENT;
	if (err)
		goto out;

	err = dsl_dataset_hold_ref(*dsp, tag);

	/* we may be looking for a snapshot */
	if (err == 0 && snapname != NULL) {
		dsl_dataset_t *ds = NULL;

		if (*snapname++ != '@') {
			dsl_dataset_rele(*dsp, tag);
			err = ENOENT;
			goto out;
		}

		dprintf("looking for snapshot '%s'\n", snapname);
		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
		if (err == 0)
			err = dsl_dataset_get_ref(dp, obj, tag, &ds);
		/* Trade the head dataset's hold for the snapshot's. */
		dsl_dataset_rele(*dsp, tag);

		ASSERT3U((err == 0), ==, (ds != NULL));

		if (ds) {
			/* Cache the snapshot's name for later use. */
			mutex_enter(&ds->ds_lock);
			if (ds->ds_snapname[0] == 0)
				(void) strlcpy(ds->ds_snapname, snapname,
				    sizeof (ds->ds_snapname));
			mutex_exit(&ds->ds_lock);
			err = dsl_dataset_hold_ref(ds, tag);
			*dsp = err ? NULL : ds;
		}
	}
out:
	rw_exit(&dp->dp_config_rwlock);
	dsl_dir_close(dd, FTAG);
	return (err);
}
629
630 int
631 dsl_dataset_own(const char *name, int flags, void *owner, dsl_dataset_t **dsp)
632 {
633         int err = dsl_dataset_hold(name, owner, dsp);
634         if (err)
635                 return (err);
636         if ((*dsp)->ds_phys->ds_num_children > 0 &&
637             !DS_MODE_IS_READONLY(flags)) {
638                 dsl_dataset_rele(*dsp, owner);
639                 return (EROFS);
640         }
641         if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) {
642                 dsl_dataset_rele(*dsp, owner);
643                 return (EBUSY);
644         }
645         return (0);
646 }
647
648 void
649 dsl_dataset_name(dsl_dataset_t *ds, char *name)
650 {
651         if (ds == NULL) {
652                 (void) strcpy(name, "mos");
653         } else {
654                 dsl_dir_name(ds->ds_dir, name);
655                 VERIFY(0 == dsl_dataset_get_snapname(ds));
656                 if (ds->ds_snapname[0]) {
657                         (void) strcat(name, "@");
658                         /*
659                          * We use a "recursive" mutex so that we
660                          * can call dprintf_ds() with ds_lock held.
661                          */
662                         if (!MUTEX_HELD(&ds->ds_lock)) {
663                                 mutex_enter(&ds->ds_lock);
664                                 (void) strcat(name, ds->ds_snapname);
665                                 mutex_exit(&ds->ds_lock);
666                         } else {
667                                 (void) strcat(name, ds->ds_snapname);
668                         }
669                 }
670         }
671 }
672
673 static int
674 dsl_dataset_namelen(dsl_dataset_t *ds)
675 {
676         int result;
677
678         if (ds == NULL) {
679                 result = 3;     /* "mos" */
680         } else {
681                 result = dsl_dir_namelen(ds->ds_dir);
682                 VERIFY(0 == dsl_dataset_get_snapname(ds));
683                 if (ds->ds_snapname[0]) {
684                         ++result;       /* adding one for the @-sign */
685                         if (!MUTEX_HELD(&ds->ds_lock)) {
686                                 mutex_enter(&ds->ds_lock);
687                                 result += strlen(ds->ds_snapname);
688                                 mutex_exit(&ds->ds_lock);
689                         } else {
690                                 result += strlen(ds->ds_snapname);
691                         }
692                 }
693         }
694
695         return (result);
696 }
697
/*
 * Release a raw reference obtained from dsl_dataset_get_ref(): just
 * drop the hold on the underlying bonus buffer.
 */
void
dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
{
	dmu_buf_rele(ds->ds_dbuf, tag);
}
703
/*
 * Release a hold from dsl_dataset_hold*().  Outside syncing context
 * this also drops the READER lock taken in dsl_dataset_hold_ref().
 */
void
dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
{
	if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
		rw_exit(&ds->ds_rwlock);
	}
	dsl_dataset_drop_ref(ds, tag);
}
712
/*
 * Relinquish ownership taken via dsl_dataset_tryown().  Drops any
 * WRITER lock held from dsl_dataset_make_exclusive() and wakes
 * waiters.  If the dataset has already been destroyed (ds_dbuf is
 * NULL), free the in-core state directly via dsl_dataset_evict().
 */
void
dsl_dataset_disown(dsl_dataset_t *ds, void *owner)
{
	ASSERT((ds->ds_owner == owner && ds->ds_dbuf) ||
	    (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));

	mutex_enter(&ds->ds_lock);
	ds->ds_owner = NULL;
	if (RW_WRITE_HELD(&ds->ds_rwlock)) {
		rw_exit(&ds->ds_rwlock);
		cv_broadcast(&ds->ds_exclusive_cv);
	}
	mutex_exit(&ds->ds_lock);
	if (ds->ds_dbuf)
		dsl_dataset_drop_ref(ds, owner);
	else
		dsl_dataset_evict(ds->ds_dbuf, ds);
}
731
/*
 * Try to mark the dataset as owned by "owner".  Fails (returns FALSE)
 * if it already has an owner, or if it is inconsistent and
 * inconsistentok is not set.  On success, drops the caller's READER
 * hold on ds_rwlock (outside syncing context); the dbuf hold remains.
 */
boolean_t
dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *owner)
{
	boolean_t gotit = FALSE;

	mutex_enter(&ds->ds_lock);
	if (ds->ds_owner == NULL &&
	    (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
		ds->ds_owner = owner;
		if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
			rw_exit(&ds->ds_rwlock);
		gotit = TRUE;
	}
	mutex_exit(&ds->ds_lock);
	return (gotit);
}
748
749 void
750 dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
751 {
752         ASSERT3P(owner, ==, ds->ds_owner);
753         if (!RW_WRITE_HELD(&ds->ds_rwlock))
754                 rw_enter(&ds->ds_rwlock, RW_WRITER);
755 }
756
/*
 * Create a new dataset object in directory "dd", optionally cloning
 * from snapshot "origin" (NULL means use the pool's origin snapshot,
 * if any).  Returns the new dataset's object number.  Syncing context
 * only; dd must not already have a head dataset.
 */
uint64_t
dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
    uint64_t flags, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dd->dd_pool;
	dmu_buf_t *dbuf;
	dsl_dataset_phys_t *dsphys;
	uint64_t dsobj;
	objset_t *mos = dp->dp_meta_objset;

	if (origin == NULL)
		origin = dp->dp_origin_snap;

	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
	ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);

	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	dsphys = dbuf->db_data;
	bzero(dsphys, sizeof (dsl_dataset_phys_t));
	dsphys->ds_dir_obj = dd->dd_object;
	dsphys->ds_flags = flags;
	dsphys->ds_fsid_guid = unique_create();
	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
	    sizeof (dsphys->ds_guid));
	/* Snapnames zap normalizes to upper case for CI lookups. */
	dsphys->ds_snapnames_zapobj =
	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
	    DMU_OT_NONE, 0, tx);
	dsphys->ds_creation_time = gethrestime_sec();
	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
	dsphys->ds_deadlist_obj =
	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);

	if (origin) {
		/* A clone starts out sharing all of its origin's data. */
		dsphys->ds_prev_snap_obj = origin->ds_object;
		dsphys->ds_prev_snap_txg =
		    origin->ds_phys->ds_creation_txg;
		dsphys->ds_used_bytes =
		    origin->ds_phys->ds_used_bytes;
		dsphys->ds_compressed_bytes =
		    origin->ds_phys->ds_compressed_bytes;
		dsphys->ds_uncompressed_bytes =
		    origin->ds_phys->ds_uncompressed_bytes;
		dsphys->ds_bp = origin->ds_phys->ds_bp;
		dsphys->ds_flags |= origin->ds_phys->ds_flags;

		dmu_buf_will_dirty(origin->ds_dbuf, tx);
		origin->ds_phys->ds_num_children++;

		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
			/* Record the new clone in the origin's clone zap. */
			if (origin->ds_phys->ds_next_clones_obj == 0) {
				origin->ds_phys->ds_next_clones_obj =
				    zap_create(mos,
				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
			}
			VERIFY(0 == zap_add_int(mos,
			    origin->ds_phys->ds_next_clones_obj,
			    dsobj, tx));
		}

		dmu_buf_will_dirty(dd->dd_dbuf, tx);
		dd->dd_phys->dd_origin_obj = origin->ds_object;
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

	dmu_buf_rele(dbuf, FTAG);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	dd->dd_phys->dd_head_dataset_obj = dsobj;

	return (dsobj);
}
835
/*
 * Create a new dataset named "lastname" under the parent dir "pdd",
 * optionally cloned from "origin".  Runs in syncing context (the
 * worker, dsl_dataset_create_sync_dd(), asserts dmu_tx_is_syncing()).
 * Returns the object number of the new dataset.
 */
uint64_t
dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
    dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
{
	dsl_pool_t *dp = pdd->dd_pool;
	uint64_t dsobj, ddobj;
	dsl_dir_t *dd;

	/* "lastname" is a filesystem component, never a snapshot name */
	ASSERT(lastname[0] != '@');

	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
	VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));

	dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);

	/* grant the creator the delegated "create" permission set */
	dsl_deleg_set_create_perms(dd, tx, cr);

	dsl_dir_close(dd, FTAG);

	return (dsobj);
}
857
/*
 * Context passed to dsl_snapshot_destroy_one() via dmu_objset_find().
 */
struct destroyarg {
	dsl_sync_task_group_t *dstg;	/* collects one destroy task per snap */
	char *snapname;			/* snapshot name (no fs prefix) */
	char *failed;			/* out: fs name that hit an error */
};
863
/*
 * dmu_objset_find() callback: for filesystem "name", try to own the
 * snapshot "name@da->snapname" and queue a destroy sync task for it.
 * A missing snapshot (ENOENT) is silently skipped; any other error
 * records "name" in da->failed.  "name" is temporarily extended with
 * "@snapname" in place and restored before returning.
 */
static int
dsl_snapshot_destroy_one(char *name, void *arg)
{
	struct destroyarg *da = arg;
	dsl_dataset_t *ds;
	char *cp;
	int err;

	/* build "fsname@snapname" in place — assumes name has room */
	(void) strcat(name, "@");
	(void) strcat(name, da->snapname);
	err = dsl_dataset_own(name, DS_MODE_READONLY | DS_MODE_INCONSISTENT,
	    da->dstg, &ds);
	/* restore the bare filesystem name for the caller */
	cp = strchr(name, '@');
	*cp = '\0';
	if (err == 0) {
		dsl_dataset_make_exclusive(ds, da->dstg);
		/* evict any cached user (objset) state before destroy */
		if (ds->ds_user_ptr) {
			ds->ds_user_evict_func(ds, ds->ds_user_ptr);
			ds->ds_user_ptr = NULL;
		}
		dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
		    dsl_dataset_destroy_sync, ds, da->dstg, 0);
	} else if (err == ENOENT) {
		/* this filesystem has no snapshot by that name; skip it */
		err = 0;
	} else {
		(void) strcpy(da->failed, name);
	}
	return (err);
}
893
/*
 * Destroy 'snapname' in all descendants of 'fsname'.  All the destroys
 * are gathered into a single sync task group so they commit in one txg.
 * On failure, 'fsname' is overwritten with the name of the filesystem
 * that triggered the error.
 */
#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
int
dsl_snapshots_destroy(char *fsname, char *snapname)
{
	int err;
	struct destroyarg da;
	dsl_sync_task_t *dst;
	spa_t *spa;

	err = spa_open(fsname, &spa, FTAG);
	if (err)
		return (err);
	da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
	da.snapname = snapname;
	da.failed = fsname;

	/* queue one destroy task per descendant that has this snapshot */
	err = dmu_objset_find(fsname,
	    dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);

	if (err == 0)
		err = dsl_sync_task_group_wait(da.dstg);

	/* disown every dataset we queued, whether or not its task failed */
	for (dst = list_head(&da.dstg->dstg_tasks); dst;
	    dst = list_next(&da.dstg->dstg_tasks, dst)) {
		dsl_dataset_t *ds = dst->dst_arg1;
		/*
		 * Return the file system name that triggered the error
		 */
		if (dst->dst_err) {
			dsl_dataset_name(ds, fsname);
			*strchr(fsname, '@') = '\0';
		}
		dsl_dataset_disown(ds, da.dstg);
	}

	dsl_sync_task_group_destroy(da.dstg);
	spa_close(spa, FTAG);
	return (err);
}
936
/*
 * ds must be opened as OWNER.  On return (whether successful or not),
 * ds will be closed and caller can no longer dereference it.
 *
 * For a snapshot this is a single sync task.  For a head dataset we
 * first mark it inconsistent, free its objects in open context (to
 * keep syncing context cheap), then destroy the dataset and its
 * dsl_dir together in one sync task group.
 */
int
dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
{
	int err;
	dsl_sync_task_group_t *dstg;
	objset_t *os;
	dsl_dir_t *dd;
	uint64_t obj;

	if (dsl_dataset_is_snapshot(ds)) {
		/* Destroying a snapshot is simpler */
		dsl_dataset_make_exclusive(ds, tag);

		/* evict any cached user (objset) state first */
		if (ds->ds_user_ptr) {
			ds->ds_user_evict_func(ds, ds->ds_user_ptr);
			ds->ds_user_ptr = NULL;
		}
		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
		    ds, tag, 0);
		goto out;
	}

	dd = ds->ds_dir;

	/*
	 * Check for errors and mark this ds as inconsistent, in
	 * case we crash while freeing the objects.
	 */
	err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
	    dsl_dataset_destroy_begin_sync, ds, NULL, 0);
	if (err)
		goto out;

	err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os);
	if (err)
		goto out;

	/*
	 * remove the objects in open context, so that we won't
	 * have too much to do in syncing context.
	 */
	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
	    ds->ds_phys->ds_prev_snap_txg)) {
		/*
		 * Ignore errors, if there is not enough disk space
		 * we will deal with it in dsl_dataset_destroy_sync().
		 */
		(void) dmu_free_object(os, obj);
	}

	/*
	 * We need to sync out all in-flight IO before we try to evict
	 * (the dataset evict func is trying to clear the cached entries
	 * for this dataset in the ARC).
	 */
	txg_wait_synced(dd->dd_pool, 0);

	/*
	 * If we managed to free all the objects in open
	 * context, the user space accounting should be zero.
	 */
	if (ds->ds_phys->ds_bp.blk_fill == 0 &&
	    dmu_objset_userused_enabled(os->os)) {
		uint64_t count;

		ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||
		    count == 0);
		ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 ||
		    count == 0);
	}

	dmu_objset_close(os);
	/* the loop above ends with ESRCH once every object was visited */
	if (err != ESRCH)
		goto out;

	/* take our own hold on the dsl_dir so it outlives the dataset */
	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
	err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
	rw_exit(&dd->dd_pool->dp_config_rwlock);

	if (err)
		goto out;

	if (ds->ds_user_ptr) {
		/*
		 * We need to sync out all in-flight IO before we try
		 * to evict (the dataset evict func is trying to clear
		 * the cached entries for this dataset in the ARC).
		 */
		txg_wait_synced(dd->dd_pool, 0);
	}

	/*
	 * Blow away the dsl_dir + head dataset.
	 */
	dsl_dataset_make_exclusive(ds, tag);
	if (ds->ds_user_ptr) {
		ds->ds_user_evict_func(ds, ds->ds_user_ptr);
		ds->ds_user_ptr = NULL;
	}
	/* destroy dataset and dsl_dir in a single sync task group */
	dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
	dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
	    dsl_dataset_destroy_sync, ds, tag, 0);
	dsl_sync_task_create(dstg, dsl_dir_destroy_check,
	    dsl_dir_destroy_sync, dd, FTAG, 0);
	err = dsl_sync_task_group_wait(dstg);
	dsl_sync_task_group_destroy(dstg);
	/* if it is successful, dsl_dir_destroy_sync will close the dd */
	if (err)
		dsl_dir_close(dd, FTAG);
out:
	dsl_dataset_disown(ds, tag);
	return (err);
}
1055
/*
 * Roll the dataset back to its most recent snapshot (see
 * dsl_dataset_rollback_check()/..._sync() for the rules).  The caller
 * must already own ds; exclusive access taken here is dropped before
 * returning.
 */
int
dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost)
{
	int err;

	ASSERT(ds->ds_owner);

	dsl_dataset_make_exclusive(ds, ds->ds_owner);
	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
	    dsl_dataset_rollback_check, dsl_dataset_rollback_sync,
	    ds, &ost, 0);
	/* drop exclusive access */
	mutex_enter(&ds->ds_lock);
	rw_exit(&ds->ds_rwlock);
	cv_broadcast(&ds->ds_exclusive_cv);
	mutex_exit(&ds->ds_lock);
	return (err);
}
1074
1075 void *
1076 dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
1077     void *p, dsl_dataset_evict_func_t func)
1078 {
1079         void *old;
1080
1081         mutex_enter(&ds->ds_lock);
1082         old = ds->ds_user_ptr;
1083         if (old == NULL) {
1084                 ds->ds_user_ptr = p;
1085                 ds->ds_user_evict_func = func;
1086         }
1087         mutex_exit(&ds->ds_lock);
1088         return (old);
1089 }
1090
1091 void *
1092 dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
1093 {
1094         return (ds->ds_user_ptr);
1095 }
1096
1097 blkptr_t *
1098 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
1099 {
1100         return (&ds->ds_phys->ds_bp);
1101 }
1102
1103 void
1104 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
1105 {
1106         ASSERT(dmu_tx_is_syncing(tx));
1107         /* If it's the meta-objset, set dp_meta_rootbp */
1108         if (ds == NULL) {
1109                 tx->tx_pool->dp_meta_rootbp = *bp;
1110         } else {
1111                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1112                 ds->ds_phys->ds_bp = *bp;
1113         }
1114 }
1115
1116 spa_t *
1117 dsl_dataset_get_spa(dsl_dataset_t *ds)
1118 {
1119         return (ds->ds_dir->dd_pool->dp_spa);
1120 }
1121
/*
 * Note that the dataset was modified in this transaction: add it to the
 * pool's per-txg dirty list so it will be written out.  A NULL ds (the
 * meta-objset) needs no such bookkeeping.
 */
void
dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp;

	if (ds == NULL) /* this is the meta-objset */
		return;

	ASSERT(ds->ds_user_ptr != NULL);

	/* a nonzero ds_next_snap_obj means this is a snapshot — immutable */
	if (ds->ds_phys->ds_next_snap_obj != 0)
		panic("dirtying snapshot!");

	dp = ds->ds_dir->dd_pool;

	/* txg_list_add() returns 0 only on the first add for this txg */
	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
		/* up the hold count until we can be written out */
		dmu_buf_add_ref(ds->ds_dbuf, ds);
	}
}
1142
/*
 * The unique space in the head dataset can be calculated by subtracting
 * the space used in the most recent snapshot, that is still being used
 * in this file system, from the space currently in use.  To figure out
 * the space in the most recent snapshot still in use, we need to take
 * the total space used in the snapshot and subtract out the space that
 * has been freed up since the snapshot was taken.
 */
static void
dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
{
	uint64_t mrs_used;
	uint64_t dlused, dlcomp, dluncomp;

	/* only valid for the head dataset, never for a snapshot */
	ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj);

	/* space used by the most recent snapshot, if there is one */
	if (ds->ds_phys->ds_prev_snap_obj != 0)
		mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
	else
		mrs_used = 0;

	/* dlused = space in our deadlist, i.e. freed since that snapshot */
	VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp,
	    &dluncomp));

	ASSERT3U(dlused, <=, mrs_used);
	ds->ds_phys->ds_unique_bytes =
	    ds->ds_phys->ds_used_bytes - (mrs_used - dlused);

	/* the value is now exact; mark it accurate on new-enough pools */
	if (!DS_UNIQUE_IS_ACCURATE(ds) &&
	    spa_version(ds->ds_dir->dd_pool->dp_spa) >=
	    SPA_VERSION_UNIQUE_ACCURATE)
		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
}
1176
/*
 * Return ds_unique_bytes, first recomputing it for a head dataset
 * whose cached value is not flagged as accurate.
 */
static uint64_t
dsl_dataset_unique(dsl_dataset_t *ds)
{
	if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds))
		dsl_dataset_recalc_head_uniq(ds);

	return (ds->ds_phys->ds_unique_bytes);
}
1185
/*
 * Context for kill_blkptr(), used when freeing every block born after a
 * given snapshot (rollback/destroy paths).
 */
struct killarg {
	dsl_dataset_t *ds;	/* dataset whose blocks are being freed */
	zio_t *zio;		/* parent zio for the async frees */
	dmu_tx_t *tx;		/* transaction the frees are charged to */
};
1191
/*
 * traverse_dataset() callback that frees each visited block pointer.
 * Intent-log blocks (identified by the bookmark/dnode pattern below)
 * carry no space accounting and are freed directly; all other blocks
 * go through dsl_dataset_block_kill() to keep accounting correct.
 * Always returns 0 so the traversal continues.
 */
/* ARGSUSED */
static int
kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
    const dnode_phys_t *dnp, void *arg)
{
	struct killarg *ka = arg;

	if (bp == NULL)
		return (0);

	if ((zb->zb_level == -1ULL && zb->zb_blkid != 0) ||
	    (zb->zb_object != 0 && dnp == NULL)) {
		/*
		 * It's a block in the intent log.  It has no
		 * accounting, so just free it.
		 */
		VERIFY3U(0, ==, dsl_free(ka->zio, ka->tx->tx_pool,
		    ka->tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT));
	} else {
		/* regular block: must have been born after the prev snap */
		ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
		(void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx);
	}

	return (0);
}
1217
/*
 * Sync-task check for rollback: verify the dataset can be rolled back
 * in this txg.  arg2 points to the expected objset type.
 */
/* ARGSUSED */
static int
dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dmu_objset_type_t *ost = arg2;

	/*
	 * We can only roll back to emptyness if it is a ZPL objset.
	 */
	if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0)
		return (EINVAL);

	/*
	 * This must not be a snapshot.
	 */
	if (ds->ds_phys->ds_next_snap_obj != 0)
		return (EINVAL);

	/*
	 * If we made changes this txg, traverse_dataset won't find
	 * them.  Try again.
	 */
	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
		return (EAGAIN);

	return (0);
}
1246
/*
 * Sync half of rollback: discard everything born since the previous
 * snapshot and restore the head dataset to that snapshot's contents
 * (or, with no usable snapshot, to an empty objset of type *ost).
 * Resets the deadlist, frees our newly-born blocks, then copies the
 * snapshot's phys fields back into the head.
 */
/* ARGSUSED */
static void
dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dmu_objset_type_t *ost = arg2;
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;

	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	if (ds->ds_user_ptr != NULL) {
		/*
		 * We need to make sure that the objset_impl_t is reopened after
		 * we do the rollback, otherwise it will have the wrong
		 * objset_phys_t.  Normally this would happen when this
		 * dataset-open is closed, thus causing the
		 * dataset to be immediately evicted.  But when doing "zfs recv
		 * -F", we reopen the objset before that, so that there is no
		 * window where the dataset is closed and inconsistent.
		 */
		ds->ds_user_evict_func(ds, ds->ds_user_ptr);
		ds->ds_user_ptr = NULL;
	}

	/* Transfer space that was freed since last snap back to the head. */
	{
		uint64_t used;

		VERIFY(0 == bplist_space_birthrange(&ds->ds_deadlist,
		    ds->ds_origin_txg, UINT64_MAX, &used));
		dsl_dir_transfer_space(ds->ds_dir, used,
		    DD_USED_SNAP, DD_USED_HEAD, tx);
	}

	/* Zero out the deadlist. */
	bplist_close(&ds->ds_deadlist);
	bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
	ds->ds_phys->ds_deadlist_obj =
	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
	VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
	    ds->ds_phys->ds_deadlist_obj));

	{
		/*
		 * Free blkptrs that we gave birth to - this covers
		 * claimed but not played log blocks too.
		 */
		zio_t *zio;
		struct killarg ka;

		zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
		    ZIO_FLAG_MUSTSUCCEED);
		ka.ds = ds;
		ka.zio = zio;
		ka.tx = tx;
		(void) traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
		    TRAVERSE_POST, kill_blkptr, &ka);
		(void) zio_wait(zio);
	}

	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);

	if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) {
		/* Change our contents to that of the prev snapshot */

		ASSERT3U(ds->ds_prev->ds_object, ==,
		    ds->ds_phys->ds_prev_snap_obj);
		ASSERT3U(ds->ds_phys->ds_used_bytes, <=,
		    ds->ds_prev->ds_phys->ds_used_bytes);

		ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
		ds->ds_phys->ds_used_bytes =
		    ds->ds_prev->ds_phys->ds_used_bytes;
		ds->ds_phys->ds_compressed_bytes =
		    ds->ds_prev->ds_phys->ds_compressed_bytes;
		ds->ds_phys->ds_uncompressed_bytes =
		    ds->ds_prev->ds_phys->ds_uncompressed_bytes;
		ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;

		/* head and snapshot are now identical: nothing is unique */
		if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
			ds->ds_prev->ds_phys->ds_unique_bytes = 0;
		}
	} else {
		objset_impl_t *osi;

		/* no usable prev snapshot: reset to an empty objset */
		ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0);
		ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0);
		ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0);

		bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t));
		ds->ds_phys->ds_flags = 0;
		ds->ds_phys->ds_unique_bytes = 0;
		if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
		    SPA_VERSION_UNIQUE_ACCURATE)
			ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

		osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds,
		    &ds->ds_phys->ds_bp, *ost, tx);
#ifdef _KERNEL
		zfs_create_fs(&osi->os, kcred, NULL, tx);
#endif
	}

	spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa,
	    tx, cr, "dataset = %llu", ds->ds_object);
}
1354
/*
 * Sync-task check run before the open-context object-freeing phase of a
 * head-dataset destroy.  Fails if the dataset has snapshots of its own
 * or its dsl_dir still has child dirs.
 */
/* ARGSUSED */
static int
dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t count;
	int err;

	/*
	 * Can't delete a head dataset if there are snapshots of it.
	 * (Except if the only snapshots are from the branch we cloned
	 * from.)
	 */
	if (ds->ds_prev != NULL &&
	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
		return (EINVAL);

	/*
	 * This is really a dsl_dir thing, but check it here so that
	 * we'll be less likely to leave this dataset inconsistent &
	 * nearly destroyed.
	 */
	err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
	if (err)
		return (err);
	if (count != 0)
		return (EEXIST);

	return (0);
}
1386
/*
 * Sync half of destroy_begin: persistently flag the dataset
 * DS_FLAG_INCONSISTENT so a crash during the open-context free phase
 * leaves it recognizably half-destroyed.
 */
/* ARGSUSED */
static void
dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/* Mark it as inconsistent on-disk, in case we crash */
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;

	spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
	    cr, "dataset = %llu", ds->ds_object);
}
1401
/*
 * Sync-task check for the final destroy of a dataset (head or
 * snapshot).  The caller holds the dataset as owner.
 */
/* ARGSUSED */
int
dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;

	/* we have an owner hold, so noone else can destroy us */
	ASSERT(!DSL_DATASET_IS_DESTROYED(ds));

	/* Can't delete a branch point. */
	if (ds->ds_phys->ds_num_children > 1)
		return (EEXIST);

	/*
	 * Can't delete a head dataset if there are snapshots of it.
	 * (Except if the only snapshots are from the branch we cloned
	 * from.)
	 */
	if (ds->ds_prev != NULL &&
	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
		return (EINVAL);

	/*
	 * If we made changes this txg, traverse_dsl_dataset won't find
	 * them.  Try again.
	 */
	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
		return (EAGAIN);

	/* XXX we should do some i/o error checking... */
	return (0);
}
1434
/*
 * Handshake state for dsl_dataset_drain_refs(): dsl_dataset_refs_gone()
 * sets "gone" and signals "cv" when the dbuf user eviction fires.
 */
struct refsarg {
	kmutex_t lock;		/* protects gone / cv */
	boolean_t gone;		/* TRUE once the eviction callback ran */
	kcondvar_t cv;		/* signalled when gone becomes TRUE */
};
1440
1441 /* ARGSUSED */
1442 static void
1443 dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
1444 {
1445         struct refsarg *arg = argv;
1446
1447         mutex_enter(&arg->lock);
1448         arg->gone = TRUE;
1449         cv_signal(&arg->cv);
1450         mutex_exit(&arg->lock);
1451 }
1452
/*
 * Drop the dbuf hold held via "tag" and block until the dataset's
 * dbuf-user eviction callback (dsl_dataset_refs_gone) has run, i.e.
 * until no references to the phys buffer remain.  On return ds_dbuf
 * and ds_phys are NULL and must not be dereferenced.
 */
static void
dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
{
	struct refsarg arg;

	bzero(&arg, sizeof(arg));
	mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
	arg.gone = FALSE;
	/* redirect the dbuf's user-eviction callback at our refsarg */
	(void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
	    dsl_dataset_refs_gone);
	dmu_buf_rele(ds->ds_dbuf, tag);
	mutex_enter(&arg.lock);
	/* wait for dsl_dataset_refs_gone() to signal us */
	while (!arg.gone)
		cv_wait(&arg.cv, &arg.lock);
	ASSERT(arg.gone);
	mutex_exit(&arg.lock);
	ds->ds_dbuf = NULL;
	ds->ds_phys = NULL;
	mutex_destroy(&arg.lock);
	cv_destroy(&arg.cv);
}
1475
/*
 * Remove "obj" from ds's next_clones ZAP.  ds must be a branch-point
 * snapshot (>= 2 children).  ENOENT is tolerated because of a historical
 * upgrade bug (see comment below).
 */
static void
remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t count;
	int err;

	ASSERT(ds->ds_phys->ds_num_children >= 2);
	err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
	/*
	 * The err should not be ENOENT, but a bug in a previous version
	 * of the code could cause upgrade_clones_cb() to not set
	 * ds_next_snap_obj when it should, leading to a missing entry.
	 * If we knew that the pool was created after
	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
	 * ENOENT.  However, at least we can check that we don't have
	 * too many entries in the next_clones_obj even after failing to
	 * remove this one.
	 */
	if (err != ENOENT) {
		VERIFY3U(err, ==, 0);
	}
	ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
	    &count));
	ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
}
1502
1503 void
1504 dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
1505 {
1506         dsl_dataset_t *ds = arg1;
1507         zio_t *zio;
1508         int err;
1509         int after_branch_point = FALSE;
1510         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1511         objset_t *mos = dp->dp_meta_objset;
1512         dsl_dataset_t *ds_prev = NULL;
1513         uint64_t obj;
1514
1515         ASSERT(ds->ds_owner);
1516         ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
1517         ASSERT(ds->ds_prev == NULL ||
1518             ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
1519         ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
1520
1521         /* signal any waiters that this dataset is going away */
1522         mutex_enter(&ds->ds_lock);
1523         ds->ds_owner = dsl_reaper;
1524         cv_broadcast(&ds->ds_exclusive_cv);
1525         mutex_exit(&ds->ds_lock);
1526
1527         /* Remove our reservation */
1528         if (ds->ds_reserved != 0) {
1529                 uint64_t val = 0;
1530                 dsl_dataset_set_reservation_sync(ds, &val, cr, tx);
1531                 ASSERT3U(ds->ds_reserved, ==, 0);
1532         }
1533
1534         ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
1535
1536         dsl_pool_ds_destroyed(ds, tx);
1537
1538         obj = ds->ds_object;
1539
1540         if (ds->ds_phys->ds_prev_snap_obj != 0) {
1541                 if (ds->ds_prev) {
1542                         ds_prev = ds->ds_prev;
1543                 } else {
1544                         VERIFY(0 == dsl_dataset_hold_obj(dp,
1545                             ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
1546                 }
1547                 after_branch_point =
1548                     (ds_prev->ds_phys->ds_next_snap_obj != obj);
1549
1550                 dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
1551                 if (after_branch_point &&
1552                     ds_prev->ds_phys->ds_next_clones_obj != 0) {
1553                         remove_from_next_clones(ds_prev, obj, tx);
1554                         if (ds->ds_phys->ds_next_snap_obj != 0) {
1555                                 VERIFY(0 == zap_add_int(mos,
1556                                     ds_prev->ds_phys->ds_next_clones_obj,
1557                                     ds->ds_phys->ds_next_snap_obj, tx));
1558                         }
1559                 }
1560                 if (after_branch_point &&
1561                     ds->ds_phys->ds_next_snap_obj == 0) {
1562                         /* This clone is toast. */
1563                         ASSERT(ds_prev->ds_phys->ds_num_children > 1);
1564                         ds_prev->ds_phys->ds_num_children--;
1565                 } else if (!after_branch_point) {
1566                         ds_prev->ds_phys->ds_next_snap_obj =
1567                             ds->ds_phys->ds_next_snap_obj;
1568                 }
1569         }
1570
1571         zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
1572
1573         if (ds->ds_phys->ds_next_snap_obj != 0) {
1574                 blkptr_t bp;
1575                 dsl_dataset_t *ds_next;
1576                 uint64_t itor = 0;
1577                 uint64_t old_unique;
1578                 int64_t used = 0, compressed = 0, uncompressed = 0;
1579
1580                 VERIFY(0 == dsl_dataset_hold_obj(dp,
1581                     ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
1582                 ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
1583
1584                 old_unique = dsl_dataset_unique(ds_next);
1585
1586                 dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
1587                 ds_next->ds_phys->ds_prev_snap_obj =
1588                     ds->ds_phys->ds_prev_snap_obj;
1589                 ds_next->ds_phys->ds_prev_snap_txg =
1590                     ds->ds_phys->ds_prev_snap_txg;
1591                 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
1592                     ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
1593
1594                 /*
1595                  * Transfer to our deadlist (which will become next's
1596                  * new deadlist) any entries from next's current
1597                  * deadlist which were born before prev, and free the
1598                  * other entries.
1599                  *
1600                  * XXX we're doing this long task with the config lock held
1601                  */
1602                 while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) {
1603                         if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
1604                                 VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
1605                                     &bp, tx));
1606                                 if (ds_prev && !after_branch_point &&
1607                                     bp.blk_birth >
1608                                     ds_prev->ds_phys->ds_prev_snap_txg) {
1609                                         ds_prev->ds_phys->ds_unique_bytes +=
1610                                             bp_get_dasize(dp->dp_spa, &bp);
1611                                 }
1612                         } else {
1613                                 used += bp_get_dasize(dp->dp_spa, &bp);
1614                                 compressed += BP_GET_PSIZE(&bp);
1615                                 uncompressed += BP_GET_UCSIZE(&bp);
1616                                 /* XXX check return value? */
1617                                 (void) dsl_free(zio, dp, tx->tx_txg,
1618                                     &bp, NULL, NULL, ARC_NOWAIT);
1619                         }
1620                 }
1621
1622                 ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
1623
1624                 /* change snapused */
1625                 dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1626                     -used, -compressed, -uncompressed, tx);
1627
1628                 /* free next's deadlist */
1629                 bplist_close(&ds_next->ds_deadlist);
1630                 bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
1631
1632                 /* set next's deadlist to our deadlist */
1633                 bplist_close(&ds->ds_deadlist);
1634                 ds_next->ds_phys->ds_deadlist_obj =
1635                     ds->ds_phys->ds_deadlist_obj;
1636                 VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
1637                     ds_next->ds_phys->ds_deadlist_obj));
1638                 ds->ds_phys->ds_deadlist_obj = 0;
1639
1640                 if (ds_next->ds_phys->ds_next_snap_obj != 0) {
1641                         /*
1642                          * Update next's unique to include blocks which
1643                          * were previously shared by only this snapshot
1644                          * and it.  Those blocks will be born after the
1645                          * prev snap and before this snap, and will have
1646                          * died after the next snap and before the one
1647                          * after that (ie. be on the snap after next's
1648                          * deadlist).
1649                          *
1650                          * XXX we're doing this long task with the
1651                          * config lock held
1652                          */
1653                         dsl_dataset_t *ds_after_next;
1654                         uint64_t space;
1655
1656                         VERIFY(0 == dsl_dataset_hold_obj(dp,
1657                             ds_next->ds_phys->ds_next_snap_obj,
1658                             FTAG, &ds_after_next));
1659
1660                         VERIFY(0 ==
1661                             bplist_space_birthrange(&ds_after_next->ds_deadlist,
1662                             ds->ds_phys->ds_prev_snap_txg,
1663                             ds->ds_phys->ds_creation_txg, &space));
1664                         ds_next->ds_phys->ds_unique_bytes += space;
1665
1666                         dsl_dataset_rele(ds_after_next, FTAG);
1667                         ASSERT3P(ds_next->ds_prev, ==, NULL);
1668                 } else {
1669                         ASSERT3P(ds_next->ds_prev, ==, ds);
1670                         dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
1671                         ds_next->ds_prev = NULL;
1672                         if (ds_prev) {
1673                                 VERIFY(0 == dsl_dataset_get_ref(dp,
1674                                     ds->ds_phys->ds_prev_snap_obj,
1675                                     ds_next, &ds_next->ds_prev));
1676                         }
1677
1678                         dsl_dataset_recalc_head_uniq(ds_next);
1679
1680                         /*
1681                          * Reduce the amount of our unconsmed refreservation
1682                          * being charged to our parent by the amount of
1683                          * new unique data we have gained.
1684                          */
1685                         if (old_unique < ds_next->ds_reserved) {
1686                                 int64_t mrsdelta;
1687                                 uint64_t new_unique =
1688                                     ds_next->ds_phys->ds_unique_bytes;
1689
1690                                 ASSERT(old_unique <= new_unique);
1691                                 mrsdelta = MIN(new_unique - old_unique,
1692                                     ds_next->ds_reserved - old_unique);
1693                                 dsl_dir_diduse_space(ds->ds_dir,
1694                                     DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
1695                         }
1696                 }
1697                 dsl_dataset_rele(ds_next, FTAG);
1698         } else {
1699                 /*
1700                  * There's no next snapshot, so this is a head dataset.
1701                  * Destroy the deadlist.  Unless it's a clone, the
1702                  * deadlist should be empty.  (If it's a clone, it's
1703                  * safe to ignore the deadlist contents.)
1704                  */
1705                 struct killarg ka;
1706
1707                 ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
1708                 bplist_close(&ds->ds_deadlist);
1709                 bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
1710                 ds->ds_phys->ds_deadlist_obj = 0;
1711
1712                 /*
1713                  * Free everything that we point to (that's born after
1714                  * the previous snapshot, if we are a clone)
1715                  *
1716                  * NB: this should be very quick, because we already
1717                  * freed all the objects in open context.
1718                  */
1719                 ka.ds = ds;
1720                 ka.zio = zio;
1721                 ka.tx = tx;
1722                 err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
1723                     TRAVERSE_POST, kill_blkptr, &ka);
1724                 ASSERT3U(err, ==, 0);
1725                 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
1726                     ds->ds_phys->ds_unique_bytes == 0);
1727         }
1728
1729         err = zio_wait(zio);
1730         ASSERT3U(err, ==, 0);
1731
1732         if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
1733                 /* Erase the link in the dir */
1734                 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
1735                 ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
1736                 ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
1737                 err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
1738                 ASSERT(err == 0);
1739         } else {
1740                 /* remove from snapshot namespace */
1741                 dsl_dataset_t *ds_head;
1742                 ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
1743                 VERIFY(0 == dsl_dataset_hold_obj(dp,
1744                     ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
1745                 VERIFY(0 == dsl_dataset_get_snapname(ds));
1746 #ifdef ZFS_DEBUG
1747                 {
1748                         uint64_t val;
1749
1750                         err = dsl_dataset_snap_lookup(ds_head,
1751                             ds->ds_snapname, &val);
1752                         ASSERT3U(err, ==, 0);
1753                         ASSERT3U(val, ==, obj);
1754                 }
1755 #endif
1756                 err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
1757                 ASSERT(err == 0);
1758                 dsl_dataset_rele(ds_head, FTAG);
1759         }
1760
1761         if (ds_prev && ds->ds_prev != ds_prev)
1762                 dsl_dataset_rele(ds_prev, FTAG);
1763
1764         spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
1765         spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx,
1766             cr, "dataset = %llu", ds->ds_object);
1767
1768         if (ds->ds_phys->ds_next_clones_obj != 0) {
1769                 uint64_t count;
1770                 ASSERT(0 == zap_count(mos,
1771                     ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
1772                 VERIFY(0 == dmu_object_free(mos,
1773                     ds->ds_phys->ds_next_clones_obj, tx));
1774         }
1775         if (ds->ds_phys->ds_props_obj != 0)
1776                 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
1777         dsl_dir_close(ds->ds_dir, ds);
1778         ds->ds_dir = NULL;
1779         dsl_dataset_drain_refs(ds, tag);
1780         VERIFY(0 == dmu_object_free(mos, obj, tx));
1781 }
1782
1783 static int
1784 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
1785 {
1786         uint64_t asize;
1787
1788         if (!dmu_tx_is_syncing(tx))
1789                 return (0);
1790
1791         /*
1792          * If there's an fs-only reservation, any blocks that might become
1793          * owned by the snapshot dataset must be accommodated by space
1794          * outside of the reservation.
1795          */
1796         asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
1797         if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE))
1798                 return (ENOSPC);
1799
1800         /*
1801          * Propogate any reserved space for this snapshot to other
1802          * snapshot checks in this sync group.
1803          */
1804         if (asize > 0)
1805                 dsl_dir_willuse_space(ds->ds_dir, asize, tx);
1806
1807         return (0);
1808 }
1809
1810 /* ARGSUSED */
1811 int
1812 dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
1813 {
1814         dsl_dataset_t *ds = arg1;
1815         const char *snapname = arg2;
1816         int err;
1817         uint64_t value;
1818
1819         /*
1820          * We don't allow multiple snapshots of the same txg.  If there
1821          * is already one, try again.
1822          */
1823         if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
1824                 return (EAGAIN);
1825
1826         /*
1827          * Check for conflicting name snapshot name.
1828          */
1829         err = dsl_dataset_snap_lookup(ds, snapname, &value);
1830         if (err == 0)
1831                 return (EEXIST);
1832         if (err != ENOENT)
1833                 return (err);
1834
1835         /*
1836          * Check that the dataset's name is not too long.  Name consists
1837          * of the dataset's length + 1 for the @-sign + snapshot name's length
1838          */
1839         if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
1840                 return (ENAMETOOLONG);
1841
1842         err = dsl_dataset_snapshot_reserve_space(ds, tx);
1843         if (err)
1844                 return (err);
1845
1846         ds->ds_trysnap_txg = tx->tx_txg;
1847         return (0);
1848 }
1849
/*
 * Sync task that creates a snapshot of @ds (arg1) named @snapname
 * (arg2): allocates and fills in the snapshot's dsl_dataset_phys_t,
 * splices it in as ds's previous snapshot, hands ds's deadlist over to
 * it, and records the name in ds's snapnames zap.  Runs with the pool
 * config lock held as writer.
 */
void
dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
        dsl_dataset_t *ds = arg1;
        const char *snapname = arg2;
        dsl_pool_t *dp = ds->ds_dir->dd_pool;
        dmu_buf_t *dbuf;
        dsl_dataset_phys_t *dsphys;
        uint64_t dsobj, crtxg;
        objset_t *mos = dp->dp_meta_objset;
        int err;

        ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));

        /*
         * The origin's ds_creation_txg has to be < TXG_INITIAL
         */
        if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
                crtxg = 1;
        else
                crtxg = tx->tx_txg;

        /*
         * Allocate the snapshot's dataset object and initialize its
         * phys as a copy of the dataset's current state; the snapshot
         * takes over the current deadlist and root bp, and points back
         * at ds via ds_next_snap_obj.
         */
        dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
            DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
        VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
        dmu_buf_will_dirty(dbuf, tx);
        dsphys = dbuf->db_data;
        bzero(dsphys, sizeof (dsl_dataset_phys_t));
        dsphys->ds_dir_obj = ds->ds_dir->dd_object;
        dsphys->ds_fsid_guid = unique_create();
        (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
            sizeof (dsphys->ds_guid));
        dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
        dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
        dsphys->ds_next_snap_obj = ds->ds_object;
        dsphys->ds_num_children = 1;
        dsphys->ds_creation_time = gethrestime_sec();
        dsphys->ds_creation_txg = crtxg;
        dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
        dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
        dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
        dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
        dsphys->ds_flags = ds->ds_phys->ds_flags;
        dsphys->ds_bp = ds->ds_phys->ds_bp;
        dmu_buf_rele(dbuf, FTAG);

        /*
         * Splice the new snapshot in between the old previous snapshot
         * and ds.  If ds was not the old prev's next snapshot (i.e. we
         * branched), move ds's entry in the prev's next_clones zap to
         * the new snapshot instead.
         */
        ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
        if (ds->ds_prev) {
                uint64_t next_clones_obj =
                    ds->ds_prev->ds_phys->ds_next_clones_obj;
                ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
                    ds->ds_object ||
                    ds->ds_prev->ds_phys->ds_num_children > 1);
                if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
                        dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
                        ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
                            ds->ds_prev->ds_phys->ds_creation_txg);
                        ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
                } else if (next_clones_obj != 0) {
                        remove_from_next_clones(ds->ds_prev,
                            dsphys->ds_next_snap_obj, tx);
                        VERIFY3U(0, ==, zap_add_int(mos,
                            next_clones_obj, dsobj, tx));
                }
        }

        /*
         * If we have a reference-reservation on this dataset, we will
         * need to increase the amount of refreservation being charged
         * since our unique space is going to zero.
         */
        if (ds->ds_reserved) {
                int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
                dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
                    add, 0, 0, tx);
        }

        /*
         * The snapshot inherited ds's deadlist above; point ds at the
         * new snapshot and give it a fresh, empty deadlist.
         */
        bplist_close(&ds->ds_deadlist);
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
        ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
        ds->ds_phys->ds_prev_snap_obj = dsobj;
        ds->ds_phys->ds_prev_snap_txg = crtxg;
        ds->ds_phys->ds_unique_bytes = 0;
        if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
                ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
        ds->ds_phys->ds_deadlist_obj =
            bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
        VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
            ds->ds_phys->ds_deadlist_obj));

        /* Record the snapshot name in ds's snapnames zap. */
        dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
        err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
            snapname, 8, 1, &dsobj, tx);
        ASSERT(err == 0);

        /* Swap ds's in-core prev-snapshot reference to the new snap. */
        if (ds->ds_prev)
                dsl_dataset_drop_ref(ds->ds_prev, ds);
        VERIFY(0 == dsl_dataset_get_ref(dp,
            ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));

        dsl_pool_ds_snapshotted(ds, tx);

        spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr,
            "dataset = %llu", dsobj);
}
1955
/*
 * Write out this dataset's dirty state for the current txg: push any
 * in-core fsid_guid change to the phys, dirty the containing dsl_dir,
 * and sync the objset itself.  Only a head dataset may be synced
 * (asserted via ds_next_snap_obj == 0).
 */
void
dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
{
        ASSERT(dmu_tx_is_syncing(tx));
        ASSERT(ds->ds_user_ptr != NULL);
        ASSERT(ds->ds_phys->ds_next_snap_obj == 0);

        /*
         * in case we had to change ds_fsid_guid when we opened it,
         * sync it out now.
         */
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
        ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;

        dsl_dir_dirty(ds->ds_dir, tx);
        dmu_objset_sync(ds->ds_user_ptr, zio, tx);
}
1973
1974 void
1975 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
1976 {
1977         uint64_t refd, avail, uobjs, aobjs;
1978
1979         dsl_dir_stats(ds->ds_dir, nv);
1980
1981         dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
1982         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
1983         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
1984
1985         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
1986             ds->ds_phys->ds_creation_time);
1987         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
1988             ds->ds_phys->ds_creation_txg);
1989         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
1990             ds->ds_quota);
1991         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
1992             ds->ds_reserved);
1993         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
1994             ds->ds_phys->ds_guid);
1995
1996         if (ds->ds_phys->ds_next_snap_obj) {
1997                 /*
1998                  * This is a snapshot; override the dd's space used with
1999                  * our unique space and compression ratio.
2000                  */
2001                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
2002                     ds->ds_phys->ds_unique_bytes);
2003                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
2004                     ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
2005                     (ds->ds_phys->ds_uncompressed_bytes * 100 /
2006                     ds->ds_phys->ds_compressed_bytes));
2007         }
2008 }
2009
2010 void
2011 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
2012 {
2013         stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
2014         stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
2015         stat->dds_guid = ds->ds_phys->ds_guid;
2016         if (ds->ds_phys->ds_next_snap_obj) {
2017                 stat->dds_is_snapshot = B_TRUE;
2018                 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
2019         } else {
2020                 stat->dds_is_snapshot = B_FALSE;
2021                 stat->dds_num_clones = 0;
2022         }
2023
2024         /* clone origin is really a dsl_dir thing... */
2025         rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
2026         if (dsl_dir_is_clone(ds->ds_dir)) {
2027                 dsl_dataset_t *ods;
2028
2029                 VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool,
2030                     ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
2031                 dsl_dataset_name(ods, stat->dds_origin);
2032                 dsl_dataset_drop_ref(ods, FTAG);
2033         } else {
2034                 stat->dds_origin[0] = '\0';
2035         }
2036         rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2037 }
2038
/*
 * Return the in-core fsid guid for this dataset.  It may differ from
 * the on-disk ds_phys->ds_fsid_guid if it had to be changed at open
 * time; dsl_dataset_sync() writes the in-core value back out.
 */
uint64_t
dsl_dataset_fsid_guid(dsl_dataset_t *ds)
{
        return (ds->ds_fsid_guid);
}
2044
2045 void
2046 dsl_dataset_space(dsl_dataset_t *ds,
2047     uint64_t *refdbytesp, uint64_t *availbytesp,
2048     uint64_t *usedobjsp, uint64_t *availobjsp)
2049 {
2050         *refdbytesp = ds->ds_phys->ds_used_bytes;
2051         *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
2052         if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
2053                 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
2054         if (ds->ds_quota != 0) {
2055                 /*
2056                  * Adjust available bytes according to refquota
2057                  */
2058                 if (*refdbytesp < ds->ds_quota)
2059                         *availbytesp = MIN(*availbytesp,
2060                             ds->ds_quota - *refdbytesp);
2061                 else
2062                         *availbytesp = 0;
2063         }
2064         *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
2065         *availobjsp = DN_MAX_OBJECT - *usedobjsp;
2066 }
2067
2068 boolean_t
2069 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
2070 {
2071         dsl_pool_t *dp = ds->ds_dir->dd_pool;
2072
2073         ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
2074             dsl_pool_sync_context(dp));
2075         if (ds->ds_prev == NULL)
2076                 return (B_FALSE);
2077         if (ds->ds_phys->ds_bp.blk_birth >
2078             ds->ds_prev->ds_phys->ds_creation_txg)
2079                 return (B_TRUE);
2080         return (B_FALSE);
2081 }
2082
2083 /* ARGSUSED */
2084 static int
2085 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
2086 {
2087         dsl_dataset_t *ds = arg1;
2088         char *newsnapname = arg2;
2089         dsl_dir_t *dd = ds->ds_dir;
2090         dsl_dataset_t *hds;
2091         uint64_t val;
2092         int err;
2093
2094         err = dsl_dataset_hold_obj(dd->dd_pool,
2095             dd->dd_phys->dd_head_dataset_obj, FTAG, &hds);
2096         if (err)
2097                 return (err);
2098
2099         /* new name better not be in use */
2100         err = dsl_dataset_snap_lookup(hds, newsnapname, &val);
2101         dsl_dataset_rele(hds, FTAG);
2102
2103         if (err == 0)
2104                 err = EEXIST;
2105         else if (err == ENOENT)
2106                 err = 0;
2107
2108         /* dataset name + 1 for the "@" + the new snapshot name must fit */
2109         if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
2110                 err = ENAMETOOLONG;
2111
2112         return (err);
2113 }
2114
/*
 * Sync task that renames snapshot @ds (arg1) to @newsnapname (arg2):
 * remove the old name from the head dataset's snapnames zap, update
 * the in-core name under ds_lock, then enter the new name in the zap.
 */
static void
dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2,
    cred_t *cr, dmu_tx_t *tx)
{
        dsl_dataset_t *ds = arg1;
        const char *newsnapname = arg2;
        dsl_dir_t *dd = ds->ds_dir;
        objset_t *mos = dd->dd_pool->dp_meta_objset;
        dsl_dataset_t *hds;
        int err;

        /* must be a snapshot, not a head dataset */
        ASSERT(ds->ds_phys->ds_next_snap_obj != 0);

        VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
            dd->dd_phys->dd_head_dataset_obj, FTAG, &hds));

        /* refresh ds_snapname, then drop the old zap entry */
        VERIFY(0 == dsl_dataset_get_snapname(ds));
        err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx);
        ASSERT3U(err, ==, 0);
        mutex_enter(&ds->ds_lock);
        (void) strcpy(ds->ds_snapname, newsnapname);
        mutex_exit(&ds->ds_lock);
        err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
            ds->ds_snapname, 8, 1, &ds->ds_object, tx);
        ASSERT3U(err, ==, 0);

        spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
            cr, "dataset = %llu", ds->ds_object);
        dsl_dataset_rele(hds, FTAG);
}
2145
/* Shared state for a recursive snapshot rename. */
struct renamesnaparg {
        dsl_sync_task_group_t *dstg;    /* group of per-dataset rename tasks */
        char failed[MAXPATHLEN];        /* name that failed, for the caller */
        char *oldsnap;                  /* snapshot name to rename from */
        char *newsnap;                  /* snapshot name to rename to */
};
2152
/*
 * dmu_objset_find() callback for recursive snapshot rename.  For the
 * filesystem @name, check rename permission on its ra->oldsnap
 * snapshot, unmount the snapshot, and queue a rename sync task on
 * ra->dstg.  A missing snapshot (ENOENT) is skipped; other failures
 * record the offending name in ra->failed.
 *
 * NOTE(review): @name is extended in place with "@<oldsnap>" and only
 * restored on the dsl_dataset_hold() path; the earlier error returns
 * leave it extended — presumably harmless because the caller hands
 * each callback its own buffer, but confirm against dmu_objset_find().
 */
static int
dsl_snapshot_rename_one(char *name, void *arg)
{
        struct renamesnaparg *ra = arg;
        dsl_dataset_t *ds = NULL;
        char *cp;
        int err;

        /* append "@<oldsnap>" to the filesystem name in place */
        cp = name + strlen(name);
        *cp = '@';
        (void) strcpy(cp + 1, ra->oldsnap);

        /*
         * For recursive snapshot renames the parent won't be changing
         * so we just pass name for both the to/from argument.
         */
        err = zfs_secpolicy_rename_perms(name, name, CRED());
        if (err == ENOENT) {
                return (0);
        } else if (err) {
                (void) strcpy(ra->failed, name);
                return (err);
        }

#ifdef _KERNEL
        /*
         * For all filesystems undergoing rename, we'll need to unmount it.
         */
        (void) zfs_unmount_snap(name, NULL);
#endif
        err = dsl_dataset_hold(name, ra->dstg, &ds);
        *cp = '\0';
        if (err == ENOENT) {
                return (0);
        } else if (err) {
                (void) strcpy(ra->failed, name);
                return (err);
        }

        dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
            dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);

        return (0);
}
2197
/*
 * Rename the snapshot @oldname ("fs@snap") and the same-named
 * snapshot of every descendant filesystem to the snapshot component
 * of @newname, as a single sync task group.  On failure, @oldname is
 * overwritten with the name of the dataset that could not be renamed.
 */
static int
dsl_recursive_rename(char *oldname, const char *newname)
{
        int err;
        struct renamesnaparg *ra;
        dsl_sync_task_t *dst;
        spa_t *spa;
        char *cp, *fsname = spa_strdup(oldname);
        int len = strlen(oldname);

        /* truncate the snapshot name to get the fsname */
        cp = strchr(fsname, '@');
        *cp = '\0';

        err = spa_open(fsname, &spa, FTAG);
        if (err) {
                kmem_free(fsname, len + 1);
                return (err);
        }
        ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
        ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));

        ra->oldsnap = strchr(oldname, '@') + 1;
        ra->newsnap = strchr(newname, '@') + 1;
        *ra->failed = '\0';

        /* queue one rename task per descendant filesystem */
        err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
            DS_FIND_CHILDREN);
        kmem_free(fsname, len + 1);

        if (err == 0) {
                err = dsl_sync_task_group_wait(ra->dstg);
        }

        /*
         * Release every queued dataset; remember the full name of any
         * task that failed so it can be reported to the caller.
         */
        for (dst = list_head(&ra->dstg->dstg_tasks); dst;
            dst = list_next(&ra->dstg->dstg_tasks, dst)) {
                dsl_dataset_t *ds = dst->dst_arg1;
                if (dst->dst_err) {
                        dsl_dir_name(ds->ds_dir, ra->failed);
                        (void) strcat(ra->failed, "@");
                        (void) strcat(ra->failed, ra->newsnap);
                }
                dsl_dataset_rele(ds, ra->dstg);
        }

        if (err)
                (void) strcpy(oldname, ra->failed);

        dsl_sync_task_group_destroy(ra->dstg);
        kmem_free(ra, sizeof (struct renamesnaparg));
        spa_close(spa, FTAG);
        return (err);
}
2251
2252 static int
2253 dsl_valid_rename(char *oldname, void *arg)
2254 {
2255         int delta = *(int *)arg;
2256
2257         if (strlen(oldname) + delta >= MAXNAMELEN)
2258                 return (ENAMETOOLONG);
2259
2260         return (0);
2261 }
2262
/*
 * Rename a dataset or snapshot.  A plain filesystem/volume @oldname
 * renames the dsl_dir to @newname; "fs@snap" renames that snapshot,
 * where @newname must be a snapshot of the same filesystem.  With
 * @recursive, the same-named snapshot of all descendants is renamed
 * too, and on failure @oldname is overwritten with the name that
 * could not be renamed.
 */
#pragma weak dmu_objset_rename = dsl_dataset_rename
int
dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
{
        dsl_dir_t *dd;
        dsl_dataset_t *ds;
        const char *tail;
        int err;

        err = dsl_dir_open(oldname, FTAG, &dd, &tail);
        if (err)
                return (err);
        /*
         * If there are more than 2 references there may be holds
         * hanging around that haven't been cleared out yet.
         */
        if (dmu_buf_refcount(dd->dd_dbuf) > 2)
                txg_wait_synced(dd->dd_pool, 0);
        if (tail == NULL) {
                /* whole name resolved: renaming the dsl_dir itself */
                int delta = strlen(newname) - strlen(oldname);

                /* if we're growing, validate child name lengths */
                if (delta > 0)
                        err = dmu_objset_find(oldname, dsl_valid_rename,
                            &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);

                if (!err)
                        err = dsl_dir_rename(dd, newname);
                dsl_dir_close(dd, FTAG);
                return (err);
        }
        if (tail[0] != '@') {
                /* the name ended in a nonexistent component */
                dsl_dir_close(dd, FTAG);
                return (ENOENT);
        }

        dsl_dir_close(dd, FTAG);

        /* new name must be snapshot in same filesystem */
        tail = strchr(newname, '@');
        if (tail == NULL)
                return (EINVAL);
        tail++;
        if (strncmp(oldname, newname, tail - newname) != 0)
                return (EXDEV);

        if (recursive) {
                err = dsl_recursive_rename(oldname, newname);
        } else {
                err = dsl_dataset_hold(oldname, FTAG, &ds);
                if (err)
                        return (err);

                err = dsl_sync_task_do(ds->ds_dir->dd_pool,
                    dsl_dataset_snapshot_rename_check,
                    dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);

                dsl_dataset_rele(ds, FTAG);
        }

        return (err);
}
2326
/* List node wrapping a dsl_dataset_t for the promote snapshot lists. */
struct promotenode {
        list_node_t link;
        dsl_dataset_t *ds;
};
2331
/* State shared between the promote check and sync callbacks. */
struct promotearg {
        /* lists of struct promotenode: snaps shared with the origin,
         * snaps of the origin head, and snaps of the clone */
        list_t shared_snaps, origin_snaps, clone_snaps;
        dsl_dataset_t *origin_origin, *origin_head;
        /* used/comp/uncomp: space to transfer to the clone's dir;
         * unique: origin's new unique space; *usedsnap: post-promote
         * snapshot-used space for clone and origin respectively */
        uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
};
2337
2338 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
2339
/*
 * Check callback for the promote sync task.  Verifies that @hds
 * (arg1) really is a clone eligible for promotion and that its
 * snapshot names don't collide with those being transferred, and
 * computes the space-accounting totals in @pa (arg2) that the sync
 * callback will apply.  The expensive work only runs in syncing
 * context.
 */
/* ARGSUSED */
static int
dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
        dsl_dataset_t *hds = arg1;
        struct promotearg *pa = arg2;
        struct promotenode *snap = list_head(&pa->shared_snaps);
        dsl_dataset_t *origin_ds = snap->ds;
        int err;

        /* Check that it is a real clone */
        if (!dsl_dir_is_clone(hds->ds_dir))
                return (EINVAL);

        /* Since this is so expensive, don't do the preliminary check */
        if (!dmu_tx_is_syncing(tx))
                return (0);

        if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)
                return (EXDEV);

        /* compute origin's new unique space */
        snap = list_tail(&pa->clone_snaps);
        ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
        err = bplist_space_birthrange(&snap->ds->ds_deadlist,
            origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique);
        if (err)
                return (err);

        /*
         * Walk the snapshots that we are moving
         *
         * Compute space to transfer.  Consider the incremental changes
         * to used for each snapshot:
         * (my used) = (prev's used) + (blocks born) - (blocks killed)
         * So each snapshot gave birth to:
         * (blocks born) = (my used) - (prev's used) + (blocks killed)
         * So a sequence would look like:
         * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
         * Which simplifies to:
         * uN + kN + kN-1 + ... + k1 + k0
         * Note however, if we stop before we reach the ORIGIN we get:
         * uN + kN + kN-1 + ... + kM - uM-1
         */
        pa->used = origin_ds->ds_phys->ds_used_bytes;
        pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
        pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
        for (snap = list_head(&pa->shared_snaps); snap;
            snap = list_next(&pa->shared_snaps, snap)) {
                uint64_t val, dlused, dlcomp, dluncomp;
                dsl_dataset_t *ds = snap->ds;

                /* Check that the snapshot name does not conflict */
                VERIFY(0 == dsl_dataset_get_snapname(ds));
                err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
                if (err == 0)
                        return (EEXIST);
                if (err != ENOENT)
                        return (err);

                /* The very first snapshot does not have a deadlist */
                if (ds->ds_phys->ds_prev_snap_obj == 0)
                        continue;

                /* (assignment in the condition is intentional) */
                if (err = bplist_space(&ds->ds_deadlist,
                    &dlused, &dlcomp, &dluncomp))
                        return (err);
                pa->used += dlused;
                pa->comp += dlcomp;
                pa->uncomp += dluncomp;
        }

        /*
         * If we are a clone of a clone then we never reached ORIGIN,
         * so we need to subtract out the clone origin's used space.
         */
        if (pa->origin_origin) {
                pa->used -= pa->origin_origin->ds_phys->ds_used_bytes;
                pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
                pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
        }

        /* Check that there is enough space here */
        err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
            pa->used);
        if (err)
                return (err);

        /*
         * Compute the amounts of space that will be used by snapshots
         * after the promotion (for both origin and clone).  For each,
         * it is the amount of space that will be on all of their
         * deadlists (that was not born before their new origin).
         */
        if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
                uint64_t space;

                /*
                 * Note, typically this will not be a clone of a clone,
                 * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so
                 * these snaplist_space() -> bplist_space_birthrange()
                 * calls will be fast because they do not have to
                 * iterate over all bps.
                 */
                snap = list_head(&pa->origin_snaps);
                err = snaplist_space(&pa->shared_snaps,
                    snap->ds->ds_origin_txg, &pa->cloneusedsnap);
                if (err)
                        return (err);

                err = snaplist_space(&pa->clone_snaps,
                    snap->ds->ds_origin_txg, &space);
                if (err)
                        return (err);
                pa->cloneusedsnap += space;
        }
        if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
                err = snaplist_space(&pa->origin_snaps,
                    origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap);
                if (err)
                        return (err);
        }

        return (0);
}
2465
/*
 * Sync-context half of clone promotion.  arg1 is the clone head (hds)
 * being promoted; arg2 is the promotearg populated and validated by
 * dsl_dataset_promote_check().  Reparents the shared snapshots from the
 * origin's dsl_dir into the clone's, swaps the origin linkage of the
 * two dsl_dirs, and transfers the corresponding space accounting.
 */
static void
dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_dataset_t *hds = arg1;
	struct promotearg *pa = arg2;
	struct promotenode *snap = list_head(&pa->shared_snaps);
	dsl_dataset_t *origin_ds = snap->ds;	/* the clone's current origin */
	dsl_dataset_t *origin_head;
	dsl_dir_t *dd = hds->ds_dir;
	dsl_pool_t *dp = hds->ds_dir->dd_pool;
	dsl_dir_t *odd = NULL;			/* the origin's dsl_dir */
	uint64_t oldnext_obj;
	int64_t delta;

	ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));

	snap = list_head(&pa->origin_snaps);
	origin_head = snap->ds;

	/*
	 * We need to explicitly open odd, since origin_ds's dd will be
	 * changing.
	 */
	VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
	    NULL, FTAG, &odd));

	/* change origin's next snap */
	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
	oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
	/* the clone's oldest snapshot becomes the origin's next snapshot */
	snap = list_tail(&pa->clone_snaps);
	ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
	origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;

	/* change the origin's next clone */
	if (origin_ds->ds_phys->ds_next_clones_obj) {
		remove_from_next_clones(origin_ds, snap->ds->ds_object, tx);
		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
		    origin_ds->ds_phys->ds_next_clones_obj,
		    oldnext_obj, tx));
	}

	/* change origin: dd inherits odd's old origin; odd now hangs off us */
	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
	dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
	hds->ds_origin_txg = origin_head->ds_origin_txg;
	dmu_buf_will_dirty(odd->dd_dbuf, tx);
	odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
	origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg;

	/* move snapshots to this dir */
	for (snap = list_head(&pa->shared_snaps); snap;
	    snap = list_next(&pa->shared_snaps, snap)) {
		dsl_dataset_t *ds = snap->ds;

		/* unregister props as dsl_dir is changing */
		if (ds->ds_user_ptr) {
			ds->ds_user_evict_func(ds, ds->ds_user_ptr);
			ds->ds_user_ptr = NULL;
		}
		/* move snap name entry */
		VERIFY(0 == dsl_dataset_get_snapname(ds));
		VERIFY(0 == dsl_dataset_snap_remove(origin_head,
		    ds->ds_snapname, tx));
		VERIFY(0 == zap_add(dp->dp_meta_objset,
		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
		    8, 1, &ds->ds_object, tx));
		/* change containing dsl_dir */
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
		ds->ds_phys->ds_dir_obj = dd->dd_object;
		ASSERT3P(ds->ds_dir, ==, odd);
		dsl_dir_close(ds->ds_dir, ds);
		VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
		    NULL, ds, &ds->ds_dir));

		ASSERT3U(dsl_prop_numcb(ds), ==, 0);
	}

	/*
	 * Change space accounting.
	 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
	 * both be valid, or both be 0 (resulting in delta == 0).  This
	 * is true for each of {clone,origin} independently.
	 */

	/* the clone's dir gains the moved snapshots' space ... */
	delta = pa->cloneusedsnap -
	    dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
	ASSERT3S(delta, >=, 0);
	ASSERT3U(pa->used, >=, delta);
	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
	dsl_dir_diduse_space(dd, DD_USED_HEAD,
	    pa->used - delta, pa->comp, pa->uncomp, tx);

	/* ... and the origin's dir gives up the same totals */
	delta = pa->originusedsnap -
	    odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
	ASSERT3S(delta, <=, 0);
	ASSERT3U(pa->used, >=, -delta);
	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
	dsl_dir_diduse_space(odd, DD_USED_HEAD,
	    -pa->used - delta, -pa->comp, -pa->uncomp, tx);

	/* pa->unique was computed by the check function */
	origin_ds->ds_phys->ds_unique_bytes = pa->unique;

	/* log history record */
	spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
	    cr, "dataset = %llu", hds->ds_object);

	dsl_dir_close(odd, FTAG);
}
2576
/* hold/own tag used for snapshots placed on promotion snaplists */
static char *snaplist_tag = "snaplist";
/*
 * Make a list of dsl_dataset_t's for the snapshots between first_obj
 * (exclusive) and last_obj (inclusive).  The list will be in reverse
 * order (last_obj will be the list_head()).  If first_obj == 0, do all
 * snapshots back to this dataset's origin.
 */
static int
snaplist_make(dsl_pool_t *dp, boolean_t own,
    uint64_t first_obj, uint64_t last_obj, list_t *l)
{
	uint64_t obj = last_obj;

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));

	list_create(l, sizeof (struct promotenode),
	    offsetof(struct promotenode, link));

	/* walk backwards along the ds_prev_snap_obj chain */
	while (obj != first_obj) {
		dsl_dataset_t *ds;
		struct promotenode *snap;
		int err;

		if (own) {
			/* take ownership so the snapshot can't go away */
			err = dsl_dataset_own_obj(dp, obj,
			    0, snaplist_tag, &ds);
			if (err == 0)
				dsl_dataset_make_exclusive(ds, snaplist_tag);
		} else {
			err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds);
		}
		if (err == ENOENT) {
			/* lost race with snapshot destroy */
			/*
			 * NOTE(review): this assumes a node is already on
			 * the list (i.e. last_obj itself was held); if the
			 * very first hold raced with a destroy, list_tail()
			 * would be NULL -- confirm callers guarantee this.
			 */
			struct promotenode *last = list_tail(l);
			ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj);
			obj = last->ds->ds_phys->ds_prev_snap_obj;
			continue;
		} else if (err) {
			return (err);
		}

		/*
		 * first_obj == 0 means "walk back to the origin"; resolve
		 * it from the first dataset we successfully hold.
		 */
		if (first_obj == 0)
			first_obj = ds->ds_dir->dd_phys->dd_origin_obj;

		snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP);
		snap->ds = ds;
		list_insert_tail(l, snap);
		obj = ds->ds_phys->ds_prev_snap_obj;
	}

	return (0);
}
2629
2630 static int
2631 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
2632 {
2633         struct promotenode *snap;
2634
2635         *spacep = 0;
2636         for (snap = list_head(l); snap; snap = list_next(l, snap)) {
2637                 uint64_t used;
2638                 int err = bplist_space_birthrange(&snap->ds->ds_deadlist,
2639                     mintxg, UINT64_MAX, &used);
2640                 if (err)
2641                         return (err);
2642                 *spacep += used;
2643         }
2644         return (0);
2645 }
2646
2647 static void
2648 snaplist_destroy(list_t *l, boolean_t own)
2649 {
2650         struct promotenode *snap;
2651
2652         if (!l || !list_link_active(&l->list_head))
2653                 return;
2654
2655         while ((snap = list_tail(l)) != NULL) {
2656                 list_remove(l, snap);
2657                 if (own)
2658                         dsl_dataset_disown(snap->ds, snaplist_tag);
2659                 else
2660                         dsl_dataset_rele(snap->ds, snaplist_tag);
2661                 kmem_free(snap, sizeof (struct promotenode));
2662         }
2663         list_destroy(l);
2664 }
2665
/*
 * Promote a clone.  Nomenclature note:
 * "clone" or "cds": the original clone which is being promoted
 * "origin" or "ods": the snapshot which is originally clone's origin
 * "origin head" or "ohds": the dataset which is the head
 * (filesystem/volume) for the origin
 * "origin origin": the origin of the origin's filesystem (typically
 * NULL, indicating that the clone is not a clone of a clone).
 */
int
dsl_dataset_promote(const char *name)
{
	dsl_dataset_t *ds;
	dsl_dir_t *dd;
	dsl_pool_t *dp;
	dmu_object_info_t doi;
	struct promotearg pa = { 0 };
	struct promotenode *snap;
	int err;

	err = dsl_dataset_hold(name, FTAG, &ds);
	if (err)
		return (err);
	dd = ds->ds_dir;
	dp = dd->dd_pool;

	/* snapnames zap size is used below to size the sync-task estimate */
	err = dmu_object_info(dp->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, &doi);
	if (err) {
		dsl_dataset_rele(ds, FTAG);
		return (err);
	}

	/* only a live (non-snapshot) dataset that has an origin can promote */
	if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) {
		dsl_dataset_rele(ds, FTAG);
		return (EINVAL);
	}

	/*
	 * We are going to inherit all the snapshots taken before our
	 * origin (i.e., our new origin will be our parent's origin).
	 * Take ownership of them so that we can rename them into our
	 * namespace.
	 */
	rw_enter(&dp->dp_config_rwlock, RW_READER);

	/* snapshots we will inherit: from our origin back (owned) */
	err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj,
	    &pa.shared_snaps);
	if (err != 0)
		goto out;

	/* our own snapshots (held only; used for space accounting) */
	err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps);
	if (err != 0)
		goto out;

	/* the origin head's snapshots down to (excluding) our origin */
	snap = list_head(&pa.shared_snaps);
	ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
	err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj,
	    snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps);
	if (err != 0)
		goto out;

	/* if the origin is itself a clone, also own the origin's origin */
	if (dsl_dir_is_clone(snap->ds->ds_dir)) {
		err = dsl_dataset_own_obj(dp,
		    snap->ds->ds_dir->dd_phys->dd_origin_obj,
		    0, FTAG, &pa.origin_origin);
		if (err != 0)
			goto out;
	}

out:
	rw_exit(&dp->dp_config_rwlock);

	/*
	 * Add in 128x the snapnames zapobj size, since we will be moving
	 * a bunch of snapnames to the promoted ds, and dirtying their
	 * bonus buffers.
	 */
	if (err == 0) {
		err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
		    dsl_dataset_promote_sync, ds, &pa,
		    2 + 2 * doi.doi_physical_blks);
	}

	/* clean up the snaplists whether or not the sync task ran/succeeded */
	snaplist_destroy(&pa.shared_snaps, B_TRUE);
	snaplist_destroy(&pa.clone_snaps, B_FALSE);
	snaplist_destroy(&pa.origin_snaps, B_FALSE);
	if (pa.origin_origin)
		dsl_dataset_disown(pa.origin_origin, FTAG);
	dsl_dataset_rele(ds, FTAG);
	return (err);
}
2758
/*
 * Argument block for the clone <-> origin-head swap sync task
 * (dsl_dataset_clone_swap_check()/_sync()).
 */
struct cloneswaparg {
	dsl_dataset_t *cds; /* clone dataset */
	dsl_dataset_t *ohds; /* origin's head dataset */
	boolean_t force; /* swap even if ohds modified since last snapshot */
	int64_t unused_refres_delta; /* change in unconsumed refreservation */
};
2765
2766 /* ARGSUSED */
2767 static int
2768 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
2769 {
2770         struct cloneswaparg *csa = arg1;
2771
2772         /* they should both be heads */
2773         if (dsl_dataset_is_snapshot(csa->cds) ||
2774             dsl_dataset_is_snapshot(csa->ohds))
2775                 return (EINVAL);
2776
2777         /* the branch point should be just before them */
2778         if (csa->cds->ds_prev != csa->ohds->ds_prev)
2779                 return (EINVAL);
2780
2781         /* cds should be the clone */
2782         if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj !=
2783             csa->ohds->ds_object)
2784                 return (EINVAL);
2785
2786         /* the clone should be a child of the origin */
2787         if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
2788                 return (EINVAL);
2789
2790         /* ohds shouldn't be modified unless 'force' */
2791         if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
2792                 return (ETXTBSY);
2793
2794         /* adjust amount of any unconsumed refreservation */
2795         csa->unused_refres_delta =
2796             (int64_t)MIN(csa->ohds->ds_reserved,
2797             csa->ohds->ds_phys->ds_unique_bytes) -
2798             (int64_t)MIN(csa->ohds->ds_reserved,
2799             csa->cds->ds_phys->ds_unique_bytes);
2800
2801         if (csa->unused_refres_delta > 0 &&
2802             csa->unused_refres_delta >
2803             dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
2804                 return (ENOSPC);
2805
2806         return (0);
2807 }
2808
/*
 * Sync-context half of dsl_dataset_clone_swap(): exchange the contents
 * of the clone (csa->cds) and the origin head (csa->ohds) -- their root
 * block pointers, ds_*_bytes accounting, and deadlists -- and adjust
 * dsl_dir space accounting to match.
 */
/* ARGSUSED */
static void
dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	struct cloneswaparg *csa = arg1;
	dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;

	ASSERT(csa->cds->ds_reserved == 0);
	ASSERT(csa->cds->ds_quota == csa->ohds->ds_quota);

	dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
	dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
	dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx);

	/* evict any in-core users; the datasets' contents are changing */
	if (csa->cds->ds_user_ptr != NULL) {
		csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr);
		csa->cds->ds_user_ptr = NULL;
	}

	if (csa->ohds->ds_user_ptr != NULL) {
		csa->ohds->ds_user_evict_func(csa->ohds,
		    csa->ohds->ds_user_ptr);
		csa->ohds->ds_user_ptr = NULL;
	}

	/* reset origin's unique bytes */
	VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
	    csa->cds->ds_prev->ds_phys->ds_prev_snap_txg, UINT64_MAX,
	    &csa->cds->ds_prev->ds_phys->ds_unique_bytes));

	/* swap blkptrs */
	{
		blkptr_t tmp;
		tmp = csa->ohds->ds_phys->ds_bp;
		csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
		csa->cds->ds_phys->ds_bp = tmp;
	}

	/* set dd_*_bytes */
	{
		int64_t dused, dcomp, duncomp;
		uint64_t cdl_used, cdl_comp, cdl_uncomp;
		uint64_t odl_used, odl_comp, odl_uncomp;

		ASSERT3U(csa->cds->ds_dir->dd_phys->
		    dd_used_breakdown[DD_USED_SNAP], ==, 0);

		VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used,
		    &cdl_comp, &cdl_uncomp));
		VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used,
		    &odl_comp, &odl_uncomp));

		/* delta = (clone's usage) - (head's usage), incl. deadlists */
		dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
		    (csa->ohds->ds_phys->ds_used_bytes + odl_used);
		dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
		    (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
		duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
		    cdl_uncomp -
		    (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);

		dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD,
		    dused, dcomp, duncomp, tx);
		dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD,
		    -dused, -dcomp, -duncomp, tx);

		/*
		 * The difference in the space used by snapshots is the
		 * difference in snapshot space due to the head's
		 * deadlist (since that's the only thing that's
		 * changing that affects the snapused).
		 */
		VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
		    csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used));
		VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist,
		    csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used));
		dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
		    DD_USED_HEAD, DD_USED_SNAP, tx);
	}

/* exchange two uint64_t lvalues in place */
#define SWITCH64(x, y) \
	{ \
		uint64_t __tmp = (x); \
		(x) = (y); \
		(y) = __tmp; \
	}

	/* swap ds_*_bytes */
	SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
	    csa->cds->ds_phys->ds_used_bytes);
	SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
	    csa->cds->ds_phys->ds_compressed_bytes);
	SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
	    csa->cds->ds_phys->ds_uncompressed_bytes);
	SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
	    csa->cds->ds_phys->ds_unique_bytes);

	/* apply any parent delta for change in unconsumed refreservation */
	dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
	    csa->unused_refres_delta, 0, 0, tx);

	/* swap deadlists: close, swap object numbers, reopen */
	bplist_close(&csa->cds->ds_deadlist);
	bplist_close(&csa->ohds->ds_deadlist);
	SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
	    csa->cds->ds_phys->ds_deadlist_obj);
	VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
	    csa->cds->ds_phys->ds_deadlist_obj));
	VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
	    csa->ohds->ds_phys->ds_deadlist_obj));

	/* inform the DSL pool layer that these two datasets swapped */
	dsl_pool_ds_clone_swapped(csa->ohds, csa->cds, tx);
}
2921
/*
 * Swap 'clone' with its origin head file system.  Used at the end
 * of "online recv" to swizzle the file system to the new version.
 * Both datasets must already be owned by the caller.
 */
int
dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
    boolean_t force)
{
	struct cloneswaparg csa;
	int error;

	ASSERT(clone->ds_owner);
	ASSERT(origin_head->ds_owner);
retry:
	/* Need exclusive access for the swap */
	rw_enter(&clone->ds_rwlock, RW_WRITER);
	if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
		/*
		 * Couldn't take both locks in this order without blocking;
		 * drop them and try the opposite order, restarting from
		 * scratch if that also fails.  This avoids deadlocking
		 * against another thread taking the same pair of locks.
		 */
		rw_exit(&clone->ds_rwlock);
		rw_enter(&origin_head->ds_rwlock, RW_WRITER);
		if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
			rw_exit(&origin_head->ds_rwlock);
			goto retry;
		}
	}
	csa.cds = clone;
	csa.ohds = origin_head;
	csa.force = force;
	error = dsl_sync_task_do(clone->ds_dir->dd_pool,
	    dsl_dataset_clone_swap_check,
	    dsl_dataset_clone_swap_sync, &csa, NULL, 9);
	return (error);
}
2954
2955 /*
2956  * Given a pool name and a dataset object number in that pool,
2957  * return the name of that dataset.
2958  */
2959 int
2960 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
2961 {
2962         spa_t *spa;
2963         dsl_pool_t *dp;
2964         dsl_dataset_t *ds;
2965         int error;
2966
2967         if ((error = spa_open(pname, &spa, FTAG)) != 0)
2968                 return (error);
2969         dp = spa_get_dsl(spa);
2970         rw_enter(&dp->dp_config_rwlock, RW_READER);
2971         if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) {
2972                 dsl_dataset_name(ds, buf);
2973                 dsl_dataset_rele(ds, FTAG);
2974         }
2975         rw_exit(&dp->dp_config_rwlock);
2976         spa_close(spa, FTAG);
2977
2978         return (error);
2979 }
2980
/*
 * Check whether a write of 'asize' bytes (with 'inflight' bytes already
 * pending) fits within this dataset's refquota.  On entry *used is the
 * caller's space estimate; it may be reduced here by the dataset's
 * unconsumed refreservation (ds_reserved minus unique bytes), and
 * *ref_rsrv is set to the portion of asize charged to that reservation.
 * Returns 0 if the write fits, ERESTART if the estimate is over quota
 * but the caller may retry, or EDQUOT if definitely over quota.
 */
int
dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
    uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
{
	int error = 0;

	ASSERT3S(asize, >, 0);

	/*
	 * *ref_rsrv is the portion of asize that will come from any
	 * unconsumed refreservation space.
	 */
	*ref_rsrv = 0;

	mutex_enter(&ds->ds_lock);
	/*
	 * Make a space adjustment for reserved bytes.
	 */
	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
		ASSERT3U(*used, >=,
		    ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
		*used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
		*ref_rsrv =
		    asize - MIN(asize, parent_delta(ds, asize + inflight));
	}

	/* no quota enforcement requested, or no refquota set */
	if (!check_quota || ds->ds_quota == 0) {
		mutex_exit(&ds->ds_lock);
		return (0);
	}
	/*
	 * If they are requesting more space, and our current estimate
	 * is over quota, they get to try again unless the actual
	 * on-disk is over quota and there are no pending changes (which
	 * may free up space for us).
	 */
	if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) {
		if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota)
			error = ERESTART;
		else
			error = EDQUOT;
	}
	mutex_exit(&ds->ds_lock);

	return (error);
}
3027
3028 /* ARGSUSED */
3029 static int
3030 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
3031 {
3032         dsl_dataset_t *ds = arg1;
3033         uint64_t *quotap = arg2;
3034         uint64_t new_quota = *quotap;
3035
3036         if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
3037                 return (ENOTSUP);
3038
3039         if (new_quota == 0)
3040                 return (0);
3041
3042         if (new_quota < ds->ds_phys->ds_used_bytes ||
3043             new_quota < ds->ds_reserved)
3044                 return (ENOSPC);
3045
3046         return (0);
3047 }
3048
3049 /* ARGSUSED */
3050 void
3051 dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
3052 {
3053         dsl_dataset_t *ds = arg1;
3054         uint64_t *quotap = arg2;
3055         uint64_t new_quota = *quotap;
3056
3057         dmu_buf_will_dirty(ds->ds_dbuf, tx);
3058
3059         ds->ds_quota = new_quota;
3060
3061         dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx);
3062
3063         spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa,
3064             tx, cr, "%lld dataset = %llu ",
3065             (longlong_t)new_quota, ds->ds_object);
3066 }
3067
3068 int
3069 dsl_dataset_set_quota(const char *dsname, uint64_t quota)
3070 {
3071         dsl_dataset_t *ds;
3072         int err;
3073
3074         err = dsl_dataset_hold(dsname, FTAG, &ds);
3075         if (err)
3076                 return (err);
3077
3078         if (quota != ds->ds_quota) {
3079                 /*
3080                  * If someone removes a file, then tries to set the quota, we
3081                  * want to make sure the file freeing takes effect.
3082                  */
3083                 txg_wait_open(ds->ds_dir->dd_pool, 0);
3084
3085                 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3086                     dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
3087                     ds, &quota, 0);
3088         }
3089         dsl_dataset_rele(ds, FTAG);
3090         return (err);
3091 }
3092
3093 static int
3094 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
3095 {
3096         dsl_dataset_t *ds = arg1;
3097         uint64_t *reservationp = arg2;
3098         uint64_t new_reservation = *reservationp;
3099         uint64_t unique;
3100
3101         if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
3102             SPA_VERSION_REFRESERVATION)
3103                 return (ENOTSUP);
3104
3105         if (dsl_dataset_is_snapshot(ds))
3106                 return (EINVAL);
3107
3108         /*
3109          * If we are doing the preliminary check in open context, the
3110          * space estimates may be inaccurate.
3111          */
3112         if (!dmu_tx_is_syncing(tx))
3113                 return (0);
3114
3115         mutex_enter(&ds->ds_lock);
3116         unique = dsl_dataset_unique(ds);
3117         mutex_exit(&ds->ds_lock);
3118
3119         if (MAX(unique, new_reservation) > MAX(unique, ds->ds_reserved)) {
3120                 uint64_t delta = MAX(unique, new_reservation) -
3121                     MAX(unique, ds->ds_reserved);
3122
3123                 if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
3124                         return (ENOSPC);
3125                 if (ds->ds_quota > 0 &&
3126                     new_reservation > ds->ds_quota)
3127                         return (ENOSPC);
3128         }
3129
3130         return (0);
3131 }
3132
/*
 * Sync-task apply function for setting refreservation.  Computes the
 * change in "unconsumed" reservation (the part not yet backed by unique
 * bytes) and charges it to the parent dsl_dir's DD_USED_REFRSRV
 * accounting, then pushes the "refreservation" property and logs
 * history.
 */
/* ARGSUSED */
static void
dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr,
    dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	uint64_t *reservationp = arg2;
	uint64_t new_reservation = *reservationp;
	uint64_t unique;
	int64_t delta;

	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	/* dd_lock is held across the dsl_dir accounting update below */
	mutex_enter(&ds->ds_dir->dd_lock);
	mutex_enter(&ds->ds_lock);
	unique = dsl_dataset_unique(ds);
	/* unconsumed reservation delta: max(0, reservation - unique) terms */
	delta = MAX(0, (int64_t)(new_reservation - unique)) -
	    MAX(0, (int64_t)(ds->ds_reserved - unique));
	ds->ds_reserved = new_reservation;
	mutex_exit(&ds->ds_lock);

	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
	mutex_exit(&ds->ds_dir->dd_lock);
	dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation",
	    new_reservation, cr, tx);

	spa_history_internal_log(LOG_DS_REFRESERV,
	    ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu",
	    (longlong_t)new_reservation, ds->ds_object);
}
3163
3164 int
3165 dsl_dataset_set_reservation(const char *dsname, uint64_t reservation)
3166 {
3167         dsl_dataset_t *ds;
3168         int err;
3169
3170         err = dsl_dataset_hold(dsname, FTAG, &ds);
3171         if (err)
3172                 return (err);
3173
3174         err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3175             dsl_dataset_set_reservation_check,
3176             dsl_dataset_set_reservation_sync, ds, &reservation, 0);
3177         dsl_dataset_rele(ds, FTAG);
3178         return (err);
3179 }