]> CyberLeo.Net >> Repos - FreeBSD/stable/10.git/blob - sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
MFC r277419:
[FreeBSD/stable/10.git] / sys / cddl / contrib / opensolaris / uts / common / fs / zfs / dsl_dataset.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org>
24  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
25  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
26  * Copyright (c) 2014 RackTop Systems.
27  */
28
29 #include <sys/dmu_objset.h>
30 #include <sys/dsl_dataset.h>
31 #include <sys/dsl_dir.h>
32 #include <sys/dsl_prop.h>
33 #include <sys/dsl_synctask.h>
34 #include <sys/dmu_traverse.h>
35 #include <sys/dmu_impl.h>
36 #include <sys/dmu_tx.h>
37 #include <sys/arc.h>
38 #include <sys/zio.h>
39 #include <sys/zap.h>
40 #include <sys/zfeature.h>
41 #include <sys/unique.h>
42 #include <sys/zfs_context.h>
43 #include <sys/zfs_ioctl.h>
44 #include <sys/spa.h>
45 #include <sys/zfs_znode.h>
46 #include <sys/zfs_onexit.h>
47 #include <sys/zvol.h>
48 #include <sys/dsl_scan.h>
49 #include <sys/dsl_deadlist.h>
50 #include <sys/dsl_destroy.h>
51 #include <sys/dsl_userhold.h>
52 #include <sys/dsl_bookmark.h>
53
54 SYSCTL_DECL(_vfs_zfs);
55
56 /*
57  * The SPA supports block sizes up to 16MB.  However, very large blocks
58  * can have an impact on i/o latency (e.g. tying up a spinning disk for
59  * ~300ms), and also potentially on the memory allocator.  Therefore,
60  * we do not allow the recordsize to be set larger than zfs_max_recordsize
61  * (default 1MB).  Larger blocks can be created by changing this tunable,
62  * and pools with larger blocks can always be imported and used, regardless
63  * of this setting.
64  */
65 int zfs_max_recordsize = 1 * 1024 * 1024;
66 SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN,
67     &zfs_max_recordsize, 0,
68     "Maximum block size.  Expect dragons when tuning this.");
69
/*
 * Exchange the values of two uint64_t lvalues.
 *
 * Both arguments are evaluated twice, so neither may have side effects.
 * The body is wrapped in do/while (0) so that the expansion followed by the
 * caller's semicolon is exactly one statement; this keeps the macro safe
 * inside an unbraced if/else.  The temporary avoids the reserved "__"
 * identifier prefix.
 */
#define SWITCH64(x, y) \
        do { \
                uint64_t switch64_tmp_ = (x); \
                (x) = (y); \
                (y) = switch64_tmp_; \
        } while (0)
76
/* 2^62 as an unsigned long long.  NOTE(review): no user is visible nearby. */
#define DS_REF_MAX      0x4000000000000000ULL
78
79 extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
80 extern inline boolean_t dsl_dataset_is_snapshot(dsl_dataset_t *ds);
81
82 /*
83  * Figure out how much of this delta should be propogated to the dsl_dir
84  * layer.  If there's a refreservation, that space has already been
85  * partially accounted for in our ancestors.
86  */
87 static int64_t
88 parent_delta(dsl_dataset_t *ds, int64_t delta)
89 {
90         dsl_dataset_phys_t *ds_phys;
91         uint64_t old_bytes, new_bytes;
92
93         if (ds->ds_reserved == 0)
94                 return (delta);
95
96         ds_phys = dsl_dataset_phys(ds);
97         old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
98         new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
99
100         ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
101         return (new_bytes - old_bytes);
102 }
103
104 void
105 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
106 {
107         int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
108         int compressed = BP_GET_PSIZE(bp);
109         int uncompressed = BP_GET_UCSIZE(bp);
110         int64_t delta;
111
112         dprintf_bp(bp, "ds=%p", ds);
113
114         ASSERT(dmu_tx_is_syncing(tx));
115         /* It could have been compressed away to nothing */
116         if (BP_IS_HOLE(bp))
117                 return;
118         ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
119         ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
120         if (ds == NULL) {
121                 dsl_pool_mos_diduse_space(tx->tx_pool,
122                     used, compressed, uncompressed);
123                 return;
124         }
125
126         dmu_buf_will_dirty(ds->ds_dbuf, tx);
127         mutex_enter(&ds->ds_lock);
128         delta = parent_delta(ds, used);
129         dsl_dataset_phys(ds)->ds_referenced_bytes += used;
130         dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
131         dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
132         dsl_dataset_phys(ds)->ds_unique_bytes += used;
133         if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
134                 ds->ds_need_large_blocks = B_TRUE;
135         mutex_exit(&ds->ds_lock);
136         dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
137             compressed, uncompressed, tx);
138         dsl_dir_transfer_space(ds->ds_dir, used - delta,
139             DD_USED_REFRSRV, DD_USED_HEAD, NULL);
140 }
141
142 int
143 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
144     boolean_t async)
145 {
146         int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
147         int compressed = BP_GET_PSIZE(bp);
148         int uncompressed = BP_GET_UCSIZE(bp);
149
150         if (BP_IS_HOLE(bp))
151                 return (0);
152
153         ASSERT(dmu_tx_is_syncing(tx));
154         ASSERT(bp->blk_birth <= tx->tx_txg);
155
156         if (ds == NULL) {
157                 dsl_free(tx->tx_pool, tx->tx_txg, bp);
158                 dsl_pool_mos_diduse_space(tx->tx_pool,
159                     -used, -compressed, -uncompressed);
160                 return (used);
161         }
162         ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
163
164         ASSERT(!dsl_dataset_is_snapshot(ds));
165         dmu_buf_will_dirty(ds->ds_dbuf, tx);
166
167         if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
168                 int64_t delta;
169
170                 dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
171                 dsl_free(tx->tx_pool, tx->tx_txg, bp);
172
173                 mutex_enter(&ds->ds_lock);
174                 ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
175                     !DS_UNIQUE_IS_ACCURATE(ds));
176                 delta = parent_delta(ds, -used);
177                 dsl_dataset_phys(ds)->ds_unique_bytes -= used;
178                 mutex_exit(&ds->ds_lock);
179                 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
180                     delta, -compressed, -uncompressed, tx);
181                 dsl_dir_transfer_space(ds->ds_dir, -used - delta,
182                     DD_USED_REFRSRV, DD_USED_HEAD, NULL);
183         } else {
184                 dprintf_bp(bp, "putting on dead list: %s", "");
185                 if (async) {
186                         /*
187                          * We are here as part of zio's write done callback,
188                          * which means we're a zio interrupt thread.  We can't
189                          * call dsl_deadlist_insert() now because it may block
190                          * waiting for I/O.  Instead, put bp on the deferred
191                          * queue and let dsl_pool_sync() finish the job.
192                          */
193                         bplist_append(&ds->ds_pending_deadlist, bp);
194                 } else {
195                         dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
196                 }
197                 ASSERT3U(ds->ds_prev->ds_object, ==,
198                     dsl_dataset_phys(ds)->ds_prev_snap_obj);
199                 ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
200                 /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
201                 if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
202                     ds->ds_object && bp->blk_birth >
203                     dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
204                         dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
205                         mutex_enter(&ds->ds_prev->ds_lock);
206                         dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
207                         mutex_exit(&ds->ds_prev->ds_lock);
208                 }
209                 if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
210                         dsl_dir_transfer_space(ds->ds_dir, used,
211                             DD_USED_HEAD, DD_USED_SNAP, tx);
212                 }
213         }
214         mutex_enter(&ds->ds_lock);
215         ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
216         dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
217         ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
218         dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
219         ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
220         dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
221         mutex_exit(&ds->ds_lock);
222
223         return (used);
224 }
225
226 uint64_t
227 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
228 {
229         uint64_t trysnap = 0;
230
231         if (ds == NULL)
232                 return (0);
233         /*
234          * The snapshot creation could fail, but that would cause an
235          * incorrect FALSE return, which would only result in an
236          * overestimation of the amount of space that an operation would
237          * consume, which is OK.
238          *
239          * There's also a small window where we could miss a pending
240          * snapshot, because we could set the sync task in the quiescing
241          * phase.  So this should only be used as a guess.
242          */
243         if (ds->ds_trysnap_txg >
244             spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
245                 trysnap = ds->ds_trysnap_txg;
246         return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap));
247 }
248
249 boolean_t
250 dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
251     uint64_t blk_birth)
252 {
253         if (blk_birth <= dsl_dataset_prev_snap_txg(ds) ||
254             (bp != NULL && BP_IS_HOLE(bp)))
255                 return (B_FALSE);
256
257         ddt_prefetch(dsl_dataset_get_spa(ds), bp);
258
259         return (B_TRUE);
260 }
261
262 /* ARGSUSED */
263 static void
264 dsl_dataset_evict(dmu_buf_t *db, void *dsv)
265 {
266         dsl_dataset_t *ds = dsv;
267
268         ASSERT(ds->ds_owner == NULL);
269
270         unique_remove(ds->ds_fsid_guid);
271
272         if (ds->ds_objset != NULL)
273                 dmu_objset_evict(ds->ds_objset);
274
275         if (ds->ds_prev) {
276                 dsl_dataset_rele(ds->ds_prev, ds);
277                 ds->ds_prev = NULL;
278         }
279
280         bplist_destroy(&ds->ds_pending_deadlist);
281         if (dsl_dataset_phys(ds)->ds_deadlist_obj != 0)
282                 dsl_deadlist_close(&ds->ds_deadlist);
283         if (ds->ds_dir)
284                 dsl_dir_rele(ds->ds_dir, ds);
285
286         ASSERT(!list_link_active(&ds->ds_synced_link));
287
288         if (mutex_owned(&ds->ds_lock))
289                 mutex_exit(&ds->ds_lock);
290         mutex_destroy(&ds->ds_lock);
291         if (mutex_owned(&ds->ds_opening_lock))
292                 mutex_exit(&ds->ds_opening_lock);
293         mutex_destroy(&ds->ds_opening_lock);
294         mutex_destroy(&ds->ds_sendstream_lock);
295         refcount_destroy(&ds->ds_longholds);
296
297         kmem_free(ds, sizeof (dsl_dataset_t));
298 }
299
300 int
301 dsl_dataset_get_snapname(dsl_dataset_t *ds)
302 {
303         dsl_dataset_phys_t *headphys;
304         int err;
305         dmu_buf_t *headdbuf;
306         dsl_pool_t *dp = ds->ds_dir->dd_pool;
307         objset_t *mos = dp->dp_meta_objset;
308
309         if (ds->ds_snapname[0])
310                 return (0);
311         if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
312                 return (0);
313
314         err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
315             FTAG, &headdbuf);
316         if (err != 0)
317                 return (err);
318         headphys = headdbuf->db_data;
319         err = zap_value_search(dp->dp_meta_objset,
320             headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
321         dmu_buf_rele(headdbuf, FTAG);
322         return (err);
323 }
324
325 int
326 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
327 {
328         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
329         uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
330         matchtype_t mt;
331         int err;
332
333         if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
334                 mt = MT_FIRST;
335         else
336                 mt = MT_EXACT;
337
338         err = zap_lookup_norm(mos, snapobj, name, 8, 1,
339             value, mt, NULL, 0, NULL);
340         if (err == ENOTSUP && mt == MT_FIRST)
341                 err = zap_lookup(mos, snapobj, name, 8, 1, value);
342         return (err);
343 }
344
345 int
346 dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
347     boolean_t adj_cnt)
348 {
349         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
350         uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
351         matchtype_t mt;
352         int err;
353
354         dsl_dir_snap_cmtime_update(ds->ds_dir);
355
356         if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
357                 mt = MT_FIRST;
358         else
359                 mt = MT_EXACT;
360
361         err = zap_remove_norm(mos, snapobj, name, mt, tx);
362         if (err == ENOTSUP && mt == MT_FIRST)
363                 err = zap_remove(mos, snapobj, name, tx);
364
365         if (err == 0 && adj_cnt)
366                 dsl_fs_ss_count_adjust(ds->ds_dir, -1,
367                     DD_FIELD_SNAPSHOT_COUNT, tx);
368
369         return (err);
370 }
371
372 int
373 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
374     dsl_dataset_t **dsp)
375 {
376         objset_t *mos = dp->dp_meta_objset;
377         dmu_buf_t *dbuf;
378         dsl_dataset_t *ds;
379         int err;
380         dmu_object_info_t doi;
381
382         ASSERT(dsl_pool_config_held(dp));
383
384         err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
385         if (err != 0)
386                 return (err);
387
388         /* Make sure dsobj has the correct object type. */
389         dmu_object_info_from_db(dbuf, &doi);
390         if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
391                 dmu_buf_rele(dbuf, tag);
392                 return (SET_ERROR(EINVAL));
393         }
394
395         ds = dmu_buf_get_user(dbuf);
396         if (ds == NULL) {
397                 dsl_dataset_t *winner = NULL;
398
399                 ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
400                 ds->ds_dbuf = dbuf;
401                 ds->ds_object = dsobj;
402
403                 mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
404                 mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
405                 mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
406                 refcount_create(&ds->ds_longholds);
407
408                 bplist_create(&ds->ds_pending_deadlist);
409                 dsl_deadlist_open(&ds->ds_deadlist,
410                     mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
411
412                 list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
413                     offsetof(dmu_sendarg_t, dsa_link));
414
415                 if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
416                         int zaperr = zap_contains(mos, dsobj, DS_FIELD_LARGE_BLOCKS);
417                         if (zaperr != ENOENT) {
418                                 VERIFY0(zaperr);
419                                 ds->ds_large_blocks = B_TRUE;
420                         }
421                 }
422
423                 if (err == 0) {
424                         err = dsl_dir_hold_obj(dp,
425                             dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds,
426                             &ds->ds_dir);
427                 }
428                 if (err != 0) {
429                         mutex_destroy(&ds->ds_lock);
430                         mutex_destroy(&ds->ds_opening_lock);
431                         mutex_destroy(&ds->ds_sendstream_lock);
432                         refcount_destroy(&ds->ds_longholds);
433                         bplist_destroy(&ds->ds_pending_deadlist);
434                         dsl_deadlist_close(&ds->ds_deadlist);
435                         kmem_free(ds, sizeof (dsl_dataset_t));
436                         dmu_buf_rele(dbuf, tag);
437                         return (err);
438                 }
439
440                 if (!dsl_dataset_is_snapshot(ds)) {
441                         ds->ds_snapname[0] = '\0';
442                         if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
443                                 err = dsl_dataset_hold_obj(dp,
444                                     dsl_dataset_phys(ds)->ds_prev_snap_obj,
445                                     ds, &ds->ds_prev);
446                         }
447                         if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
448                                 int zaperr = zap_lookup(mos, ds->ds_object,
449                                     DS_FIELD_BOOKMARK_NAMES,
450                                     sizeof (ds->ds_bookmarks), 1,
451                                     &ds->ds_bookmarks);
452                                 if (zaperr != ENOENT)
453                                         VERIFY0(zaperr);
454                         }
455                 } else {
456                         if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
457                                 err = dsl_dataset_get_snapname(ds);
458                         if (err == 0 &&
459                             dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
460                                 err = zap_count(
461                                     ds->ds_dir->dd_pool->dp_meta_objset,
462                                     dsl_dataset_phys(ds)->ds_userrefs_obj,
463                                     &ds->ds_userrefs);
464                         }
465                 }
466
467                 if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
468                         err = dsl_prop_get_int_ds(ds,
469                             zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
470                             &ds->ds_reserved);
471                         if (err == 0) {
472                                 err = dsl_prop_get_int_ds(ds,
473                                     zfs_prop_to_name(ZFS_PROP_REFQUOTA),
474                                     &ds->ds_quota);
475                         }
476                 } else {
477                         ds->ds_reserved = ds->ds_quota = 0;
478                 }
479
480                 if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds,
481                     dsl_dataset_evict)) != NULL) {
482                         bplist_destroy(&ds->ds_pending_deadlist);
483                         dsl_deadlist_close(&ds->ds_deadlist);
484                         if (ds->ds_prev)
485                                 dsl_dataset_rele(ds->ds_prev, ds);
486                         dsl_dir_rele(ds->ds_dir, ds);
487                         mutex_destroy(&ds->ds_lock);
488                         mutex_destroy(&ds->ds_opening_lock);
489                         mutex_destroy(&ds->ds_sendstream_lock);
490                         refcount_destroy(&ds->ds_longholds);
491                         kmem_free(ds, sizeof (dsl_dataset_t));
492                         if (err != 0) {
493                                 dmu_buf_rele(dbuf, tag);
494                                 return (err);
495                         }
496                         ds = winner;
497                 } else {
498                         ds->ds_fsid_guid =
499                             unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
500                 }
501         }
502         ASSERT3P(ds->ds_dbuf, ==, dbuf);
503         ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
504         ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
505             spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
506             dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
507         *dsp = ds;
508         return (0);
509 }
510
511 int
512 dsl_dataset_hold(dsl_pool_t *dp, const char *name,
513     void *tag, dsl_dataset_t **dsp)
514 {
515         dsl_dir_t *dd;
516         const char *snapname;
517         uint64_t obj;
518         int err = 0;
519
520         err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
521         if (err != 0)
522                 return (err);
523
524         ASSERT(dsl_pool_config_held(dp));
525         obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
526         if (obj != 0)
527                 err = dsl_dataset_hold_obj(dp, obj, tag, dsp);
528         else
529                 err = SET_ERROR(ENOENT);
530
531         /* we may be looking for a snapshot */
532         if (err == 0 && snapname != NULL) {
533                 dsl_dataset_t *ds;
534
535                 if (*snapname++ != '@') {
536                         dsl_dataset_rele(*dsp, tag);
537                         dsl_dir_rele(dd, FTAG);
538                         return (SET_ERROR(ENOENT));
539                 }
540
541                 dprintf("looking for snapshot '%s'\n", snapname);
542                 err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
543                 if (err == 0)
544                         err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
545                 dsl_dataset_rele(*dsp, tag);
546
547                 if (err == 0) {
548                         mutex_enter(&ds->ds_lock);
549                         if (ds->ds_snapname[0] == 0)
550                                 (void) strlcpy(ds->ds_snapname, snapname,
551                                     sizeof (ds->ds_snapname));
552                         mutex_exit(&ds->ds_lock);
553                         *dsp = ds;
554                 }
555         }
556
557         dsl_dir_rele(dd, FTAG);
558         return (err);
559 }
560
561 int
562 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
563     void *tag, dsl_dataset_t **dsp)
564 {
565         int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
566         if (err != 0)
567                 return (err);
568         if (!dsl_dataset_tryown(*dsp, tag)) {
569                 dsl_dataset_rele(*dsp, tag);
570                 *dsp = NULL;
571                 return (SET_ERROR(EBUSY));
572         }
573         return (0);
574 }
575
576 int
577 dsl_dataset_own(dsl_pool_t *dp, const char *name,
578     void *tag, dsl_dataset_t **dsp)
579 {
580         int err = dsl_dataset_hold(dp, name, tag, dsp);
581         if (err != 0)
582                 return (err);
583         if (!dsl_dataset_tryown(*dsp, tag)) {
584                 dsl_dataset_rele(*dsp, tag);
585                 return (SET_ERROR(EBUSY));
586         }
587         return (0);
588 }
589
590 /*
591  * See the comment above dsl_pool_hold() for details.  In summary, a long
592  * hold is used to prevent destruction of a dataset while the pool hold
593  * is dropped, allowing other concurrent operations (e.g. spa_sync()).
594  *
595  * The dataset and pool must be held when this function is called.  After it
596  * is called, the pool hold may be released while the dataset is still held
597  * and accessed.
598  */
599 void
600 dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
601 {
602         ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
603         (void) refcount_add(&ds->ds_longholds, tag);
604 }
605
606 void
607 dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
608 {
609         (void) refcount_remove(&ds->ds_longholds, tag);
610 }
611
612 /* Return B_TRUE if there are any long holds on this dataset. */
613 boolean_t
614 dsl_dataset_long_held(dsl_dataset_t *ds)
615 {
616         return (!refcount_is_zero(&ds->ds_longholds));
617 }
618
619 void
620 dsl_dataset_name(dsl_dataset_t *ds, char *name)
621 {
622         if (ds == NULL) {
623                 (void) strcpy(name, "mos");
624         } else {
625                 dsl_dir_name(ds->ds_dir, name);
626                 VERIFY0(dsl_dataset_get_snapname(ds));
627                 if (ds->ds_snapname[0]) {
628                         (void) strcat(name, "@");
629                         /*
630                          * We use a "recursive" mutex so that we
631                          * can call dprintf_ds() with ds_lock held.
632                          */
633                         if (!MUTEX_HELD(&ds->ds_lock)) {
634                                 mutex_enter(&ds->ds_lock);
635                                 (void) strcat(name, ds->ds_snapname);
636                                 mutex_exit(&ds->ds_lock);
637                         } else {
638                                 (void) strcat(name, ds->ds_snapname);
639                         }
640                 }
641         }
642 }
643
644 void
645 dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
646 {
647         dmu_buf_rele(ds->ds_dbuf, tag);
648 }
649
650 void
651 dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
652 {
653         ASSERT3P(ds->ds_owner, ==, tag);
654         ASSERT(ds->ds_dbuf != NULL);
655
656         mutex_enter(&ds->ds_lock);
657         ds->ds_owner = NULL;
658         mutex_exit(&ds->ds_lock);
659         dsl_dataset_long_rele(ds, tag);
660         dsl_dataset_rele(ds, tag);
661 }
662
663 boolean_t
664 dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
665 {
666         boolean_t gotit = FALSE;
667
668         mutex_enter(&ds->ds_lock);
669         if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
670                 ds->ds_owner = tag;
671                 dsl_dataset_long_hold(ds, tag);
672                 gotit = TRUE;
673         }
674         mutex_exit(&ds->ds_lock);
675         return (gotit);
676 }
677
678 uint64_t
679 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
680     uint64_t flags, dmu_tx_t *tx)
681 {
682         dsl_pool_t *dp = dd->dd_pool;
683         dmu_buf_t *dbuf;
684         dsl_dataset_phys_t *dsphys;
685         uint64_t dsobj;
686         objset_t *mos = dp->dp_meta_objset;
687
688         if (origin == NULL)
689                 origin = dp->dp_origin_snap;
690
691         ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
692         ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
693         ASSERT(dmu_tx_is_syncing(tx));
694         ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
695
696         dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
697             DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
698         VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
699         dmu_buf_will_dirty(dbuf, tx);
700         dsphys = dbuf->db_data;
701         bzero(dsphys, sizeof (dsl_dataset_phys_t));
702         dsphys->ds_dir_obj = dd->dd_object;
703         dsphys->ds_flags = flags;
704         dsphys->ds_fsid_guid = unique_create();
705         do {
706                 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
707                     sizeof (dsphys->ds_guid));
708         } while (dsphys->ds_guid == 0);
709         dsphys->ds_snapnames_zapobj =
710             zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
711             DMU_OT_NONE, 0, tx);
712         dsphys->ds_creation_time = gethrestime_sec();
713         dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
714
715         if (origin == NULL) {
716                 dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
717         } else {
718                 dsl_dataset_t *ohds; /* head of the origin snapshot */
719
720                 dsphys->ds_prev_snap_obj = origin->ds_object;
721                 dsphys->ds_prev_snap_txg =
722                     dsl_dataset_phys(origin)->ds_creation_txg;
723                 dsphys->ds_referenced_bytes =
724                     dsl_dataset_phys(origin)->ds_referenced_bytes;
725                 dsphys->ds_compressed_bytes =
726                     dsl_dataset_phys(origin)->ds_compressed_bytes;
727                 dsphys->ds_uncompressed_bytes =
728                     dsl_dataset_phys(origin)->ds_uncompressed_bytes;
729                 dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
730
731                 /*
732                  * Inherit flags that describe the dataset's contents
733                  * (INCONSISTENT) or properties (Case Insensitive).
734                  */
735                 dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
736                     (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
737
738                 if (origin->ds_large_blocks)
739                         dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
740
741                 dmu_buf_will_dirty(origin->ds_dbuf, tx);
742                 dsl_dataset_phys(origin)->ds_num_children++;
743
744                 VERIFY0(dsl_dataset_hold_obj(dp,
745                     dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
746                     FTAG, &ohds));
747                 dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
748                     dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
749                 dsl_dataset_rele(ohds, FTAG);
750
751                 if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
752                         if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
753                                 dsl_dataset_phys(origin)->ds_next_clones_obj =
754                                     zap_create(mos,
755                                     DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
756                         }
757                         VERIFY0(zap_add_int(mos,
758                             dsl_dataset_phys(origin)->ds_next_clones_obj,
759                             dsobj, tx));
760                 }
761
762                 dmu_buf_will_dirty(dd->dd_dbuf, tx);
763                 dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
764                 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
765                         if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
766                                 dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
767                                 dsl_dir_phys(origin->ds_dir)->dd_clones =
768                                     zap_create(mos,
769                                     DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
770                         }
771                         VERIFY0(zap_add_int(mos,
772                             dsl_dir_phys(origin->ds_dir)->dd_clones,
773                             dsobj, tx));
774                 }
775         }
776
777         if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
778                 dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
779
780         dmu_buf_rele(dbuf, FTAG);
781
782         dmu_buf_will_dirty(dd->dd_dbuf, tx);
783         dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;
784
785         return (dsobj);
786 }
787
788 static void
789 dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
790 {
791         objset_t *os;
792
793         VERIFY0(dmu_objset_from_ds(ds, &os));
794         bzero(&os->os_zil_header, sizeof (os->os_zil_header));
795         dsl_dataset_dirty(ds, tx);
796 }
797
798 uint64_t
799 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
800     dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
801 {
802         dsl_pool_t *dp = pdd->dd_pool;
803         uint64_t dsobj, ddobj;
804         dsl_dir_t *dd;
805
806         ASSERT(dmu_tx_is_syncing(tx));
807         ASSERT(lastname[0] != '@');
808
809         ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
810         VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
811
812         dsobj = dsl_dataset_create_sync_dd(dd, origin,
813             flags & ~DS_CREATE_FLAG_NODIRTY, tx);
814
815         dsl_deleg_set_create_perms(dd, tx, cr);
816
817         /*
818          * Since we're creating a new node we know it's a leaf, so we can
819          * initialize the counts if the limit feature is active.
820          */
821         if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
822                 uint64_t cnt = 0;
823                 objset_t *os = dd->dd_pool->dp_meta_objset;
824
825                 dsl_dir_zapify(dd, tx);
826                 VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
827                     sizeof (cnt), 1, &cnt, tx));
828                 VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
829                     sizeof (cnt), 1, &cnt, tx));
830         }
831
832         dsl_dir_rele(dd, FTAG);
833
834         /*
835          * If we are creating a clone, make sure we zero out any stale
836          * data from the origin snapshots zil header.
837          */
838         if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
839                 dsl_dataset_t *ds;
840
841                 VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
842                 dsl_dataset_zero_zil(ds, tx);
843                 dsl_dataset_rele(ds, FTAG);
844         }
845
846         return (dsobj);
847 }
848
849 #ifdef __FreeBSD__
850 /* FreeBSD ioctl compat begin */
851 struct destroyarg {	/* state handed to dsl_check_snap_cb() through dmu_objset_find() */
852         nvlist_t *nvl;	/* output list: gets a boolean entry "<name>@<snapname>" per callback */
853         const char *snapname;	/* snapshot component appended after the '@' */
854 };
855
856 static int
857 dsl_check_snap_cb(const char *name, void *arg)
858 {
859         struct destroyarg *da = arg;
860         dsl_dataset_t *ds;
861         char *dsname;
862
863         dsname = kmem_asprintf("%s@%s", name, da->snapname);
864         fnvlist_add_boolean(da->nvl, dsname);
865         kmem_free(dsname, strlen(dsname) + 1);
866
867         return (0);
868 }
869
870 int
871 dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
872     nvlist_t *snaps)
873 {
874         struct destroyarg *da;
875         int err;
876
877         da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP);
878         da->nvl = snaps;
879         da->snapname = snapname;
880         err = dmu_objset_find(fsname, dsl_check_snap_cb, da,
881             DS_FIND_CHILDREN);
882         kmem_free(da, sizeof (struct destroyarg));
883
884         return (err);
885 }
886 /* FreeBSD ioctl compat end */
887 #endif /* __FreeBSD__ */
888
889 /*
890  * The unique space in the head dataset can be calculated by subtracting
891  * the space used in the most recent snapshot, that is still being used
892  * in this file system, from the space currently in use.  To figure out
893  * the space in the most recent snapshot still in use, we need to take
894  * the total space used in the snapshot and subtract out the space that
895  * has been freed up since the snapshot was taken.
896  */
897 void
898 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
899 {
900         uint64_t mrs_used;
901         uint64_t dlused, dlcomp, dluncomp;
902
903         ASSERT(!dsl_dataset_is_snapshot(ds));
904
905         if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
906                 mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
907         else
908                 mrs_used = 0;
909
910         dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
911
912         ASSERT3U(dlused, <=, mrs_used);
913         dsl_dataset_phys(ds)->ds_unique_bytes =
914             dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
915
916         if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
917             SPA_VERSION_UNIQUE_ACCURATE)
918                 dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
919 }
920
921 void
922 dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
923     dmu_tx_t *tx)
924 {
925         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
926         uint64_t count;
927         int err;
928
929         ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
930         err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
931             obj, tx);
932         /*
933          * The err should not be ENOENT, but a bug in a previous version
934          * of the code could cause upgrade_clones_cb() to not set
935          * ds_next_snap_obj when it should, leading to a missing entry.
936          * If we knew that the pool was created after
937          * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
938          * ENOENT.  However, at least we can check that we don't have
939          * too many entries in the next_clones_obj even after failing to
940          * remove this one.
941          */
942         if (err != ENOENT)
943                 VERIFY0(err);
944         ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
945             &count));
946         ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
947 }
948
949
950 blkptr_t *
951 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
952 {
953         return (&dsl_dataset_phys(ds)->ds_bp);
954 }
955
956 void
957 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
958 {
959         ASSERT(dmu_tx_is_syncing(tx));
960         /* If it's the meta-objset, set dp_meta_rootbp */
961         if (ds == NULL) {
962                 tx->tx_pool->dp_meta_rootbp = *bp;
963         } else {
964                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
965                 dsl_dataset_phys(ds)->ds_bp = *bp;
966         }
967 }
968
969 spa_t *
970 dsl_dataset_get_spa(dsl_dataset_t *ds)
971 {
972         return (ds->ds_dir->dd_pool->dp_spa);
973 }
974
975 void
976 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
977 {
978         dsl_pool_t *dp;
979
980         if (ds == NULL) /* this is the meta-objset */
981                 return;
982
983         ASSERT(ds->ds_objset != NULL);
984
985         if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
986                 panic("dirtying snapshot!");
987
988         dp = ds->ds_dir->dd_pool;
989
990         if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
991                 /* up the hold count until we can be written out */
992                 dmu_buf_add_ref(ds->ds_dbuf, ds);
993         }
994 }
995
996 boolean_t
997 dsl_dataset_is_dirty(dsl_dataset_t *ds)
998 {
999         for (int t = 0; t < TXG_SIZE; t++) {
1000                 if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
1001                     ds, t))
1002                         return (B_TRUE);
1003         }
1004         return (B_FALSE);
1005 }
1006
1007 static int
1008 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
1009 {
1010         uint64_t asize;
1011
1012         if (!dmu_tx_is_syncing(tx))
1013                 return (0);
1014
1015         /*
1016          * If there's an fs-only reservation, any blocks that might become
1017          * owned by the snapshot dataset must be accommodated by space
1018          * outside of the reservation.
1019          */
1020         ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
1021         asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
1022         if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
1023                 return (SET_ERROR(ENOSPC));
1024
1025         /*
1026          * Propagate any reserved space for this snapshot to other
1027          * snapshot checks in this sync group.
1028          */
1029         if (asize > 0)
1030                 dsl_dir_willuse_space(ds->ds_dir, asize, tx);
1031
1032         return (0);
1033 }
1034
1035 typedef struct dsl_dataset_snapshot_arg {	/* argument of dsl_dataset_snapshot_check()/_sync() */
1036         nvlist_t *ddsa_snaps;	/* one pair per snapshot to create, named "<dataset>@<snapname>" */
1037         nvlist_t *ddsa_props;	/* if non-NULL, set as ZPROP_SRC_LOCAL properties on each new snapshot */
1038         nvlist_t *ddsa_errors;	/* if non-NULL, receives an int32 error code keyed by the failing name */
1039         cred_t *ddsa_cr;	/* credential handed to dsl_fs_ss_limit_check() */
1040 } dsl_dataset_snapshot_arg_t;
1041
1042 int
1043 dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
1044     dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
1045 {
1046         int error;
1047         uint64_t value;
1048
1049         ds->ds_trysnap_txg = tx->tx_txg;
1050
1051         if (!dmu_tx_is_syncing(tx))
1052                 return (0);
1053
1054         /*
1055          * We don't allow multiple snapshots of the same txg.  If there
1056          * is already one, try again.
1057          */
1058         if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
1059                 return (SET_ERROR(EAGAIN));
1060
1061         /*
1062          * Check for conflicting snapshot name.
1063          */
1064         error = dsl_dataset_snap_lookup(ds, snapname, &value);
1065         if (error == 0)
1066                 return (SET_ERROR(EEXIST));
1067         if (error != ENOENT)
1068                 return (error);
1069
1070         /*
1071          * We don't allow taking snapshots of inconsistent datasets, such as
1072          * those into which we are currently receiving.  However, if we are
1073          * creating this snapshot as part of a receive, this check will be
1074          * executed atomically with respect to the completion of the receive
1075          * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
1076          * case we ignore this, knowing it will be fixed up for us shortly in
1077          * dmu_recv_end_sync().
1078          */
1079         if (!recv && DS_IS_INCONSISTENT(ds))
1080                 return (SET_ERROR(EBUSY));
1081
1082         /*
1083          * Skip the check for temporary snapshots or if we have already checked
1084          * the counts in dsl_dataset_snapshot_check. This means we really only
1085          * check the count here when we're receiving a stream.
1086          */
1087         if (cnt != 0 && cr != NULL) {
1088                 error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
1089                     ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
1090                 if (error != 0)
1091                         return (error);
1092         }
1093
1094         error = dsl_dataset_snapshot_reserve_space(ds, tx);
1095         if (error != 0)
1096                 return (error);
1097
1098         return (0);
1099 }
1100
1101 static int
1102 dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
1103 {
1104         dsl_dataset_snapshot_arg_t *ddsa = arg;
1105         dsl_pool_t *dp = dmu_tx_pool(tx);
1106         nvpair_t *pair;
1107         int rv = 0;
1108
1109         /*
1110          * Pre-compute how many total new snapshots will be created for each
1111          * level in the tree and below. This is needed for validating the
1112          * snapshot limit when either taking a recursive snapshot or when
1113          * taking multiple snapshots.
1114          *
1115          * The problem is that the counts are not actually adjusted when
1116          * we are checking, only when we finally sync. For a single snapshot,
1117          * this is easy, the count will increase by 1 at each node up the tree,
1118          * but its more complicated for the recursive/multiple snapshot case.
1119          *
1120          * The dsl_fs_ss_limit_check function does recursively check the count
1121          * at each level up the tree but since it is validating each snapshot
1122          * independently we need to be sure that we are validating the complete
1123          * count for the entire set of snapshots. We do this by rolling up the
1124          * counts for each component of the name into an nvlist and then
1125          * checking each of those cases with the aggregated count.
1126          *
1127          * This approach properly handles not only the recursive snapshot
1128          * case (where we get all of those on the ddsa_snaps list) but also
1129          * the sibling case (e.g. snapshot a/b and a/c so that we will also
1130          * validate the limit on 'a' using a count of 2).
1131          *
1132          * We validate the snapshot names in the third loop and only report
1133          * name errors once.
1134          */
1135         if (dmu_tx_is_syncing(tx)) {
1136                 nvlist_t *cnt_track = NULL;
1137                 cnt_track = fnvlist_alloc();
1138
1139                 /* Rollup aggregated counts into the cnt_track list */
1140                 for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1141                     pair != NULL;
1142                     pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1143                         char *pdelim;
1144                         uint64_t val;
1145                         char nm[MAXPATHLEN];
1146
1147                         (void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
1148                         pdelim = strchr(nm, '@');
1149                         if (pdelim == NULL)
1150                                 continue;
1151                         *pdelim = '\0';
1152
1153                         do {
1154                                 if (nvlist_lookup_uint64(cnt_track, nm,
1155                                     &val) == 0) {
1156                                         /* update existing entry */
1157                                         fnvlist_add_uint64(cnt_track, nm,
1158                                             val + 1);
1159                                 } else {
1160                                         /* add to list */
1161                                         fnvlist_add_uint64(cnt_track, nm, 1);
1162                                 }
1163
1164                                 pdelim = strrchr(nm, '/');
1165                                 if (pdelim != NULL)
1166                                         *pdelim = '\0';
1167                         } while (pdelim != NULL);
1168                 }
1169
1170                 /* Check aggregated counts at each level */
1171                 for (pair = nvlist_next_nvpair(cnt_track, NULL);
1172                     pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
1173                         int error = 0;
1174                         char *name;
1175                         uint64_t cnt = 0;
1176                         dsl_dataset_t *ds;
1177
1178                         name = nvpair_name(pair);
1179                         cnt = fnvpair_value_uint64(pair);
1180                         ASSERT(cnt > 0);
1181
1182                         error = dsl_dataset_hold(dp, name, FTAG, &ds);
1183                         if (error == 0) {
1184                                 error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
1185                                     ZFS_PROP_SNAPSHOT_LIMIT, NULL,
1186                                     ddsa->ddsa_cr);
1187                                 dsl_dataset_rele(ds, FTAG);
1188                         }
1189
1190                         if (error != 0) {
1191                                 if (ddsa->ddsa_errors != NULL)
1192                                         fnvlist_add_int32(ddsa->ddsa_errors,
1193                                             name, error);
1194                                 rv = error;
1195                                 /* only report one error for this check */
1196                                 break;
1197                         }
1198                 }
1199                 nvlist_free(cnt_track);
1200         }
1201
1202         for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1203             pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1204                 int error = 0;
1205                 dsl_dataset_t *ds;
1206                 char *name, *atp;
1207                 char dsname[MAXNAMELEN];
1208
1209                 name = nvpair_name(pair);
1210                 if (strlen(name) >= MAXNAMELEN)
1211                         error = SET_ERROR(ENAMETOOLONG);
1212                 if (error == 0) {
1213                         atp = strchr(name, '@');
1214                         if (atp == NULL)
1215                                 error = SET_ERROR(EINVAL);
1216                         if (error == 0)
1217                                 (void) strlcpy(dsname, name, atp - name + 1);
1218                 }
1219                 if (error == 0)
1220                         error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
1221                 if (error == 0) {
1222                         /* passing 0/NULL skips dsl_fs_ss_limit_check */
1223                         error = dsl_dataset_snapshot_check_impl(ds,
1224                             atp + 1, tx, B_FALSE, 0, NULL);
1225                         dsl_dataset_rele(ds, FTAG);
1226                 }
1227
1228                 if (error != 0) {
1229                         if (ddsa->ddsa_errors != NULL) {
1230                                 fnvlist_add_int32(ddsa->ddsa_errors,
1231                                     name, error);
1232                         }
1233                         rv = error;
1234                 }
1235         }
1236
1237         return (rv);
1238 }
1239
1240 void
1241 dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
1242     dmu_tx_t *tx)
1243 {
1244         static zil_header_t zero_zil;
1245
1246         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1247         dmu_buf_t *dbuf;
1248         dsl_dataset_phys_t *dsphys;
1249         uint64_t dsobj, crtxg;
1250         objset_t *mos = dp->dp_meta_objset;
1251         objset_t *os;
1252
1253         ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
1254
1255         /*
1256          * If we are on an old pool, the zil must not be active, in which
1257          * case it will be zeroed.  Usually zil_suspend() accomplishes this.
1258          */
1259         ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
1260             dmu_objset_from_ds(ds, &os) != 0 ||
1261             bcmp(&os->os_phys->os_zil_header, &zero_zil,
1262             sizeof (zero_zil)) == 0);
1263
1264         dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
1265
1266         /*
1267          * The origin's ds_creation_txg has to be < TXG_INITIAL
1268          */
1269         if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
1270                 crtxg = 1;
1271         else
1272                 crtxg = tx->tx_txg;
1273
1274         dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
1275             DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
1276         VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
1277         dmu_buf_will_dirty(dbuf, tx);
1278         dsphys = dbuf->db_data;
1279         bzero(dsphys, sizeof (dsl_dataset_phys_t));
1280         dsphys->ds_dir_obj = ds->ds_dir->dd_object;
1281         dsphys->ds_fsid_guid = unique_create();
1282         do {
1283                 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
1284                     sizeof (dsphys->ds_guid));
1285         } while (dsphys->ds_guid == 0);
1286         dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
1287         dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
1288         dsphys->ds_next_snap_obj = ds->ds_object;
1289         dsphys->ds_num_children = 1;
1290         dsphys->ds_creation_time = gethrestime_sec();
1291         dsphys->ds_creation_txg = crtxg;
1292         dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
1293         dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
1294         dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
1295         dsphys->ds_uncompressed_bytes =
1296             dsl_dataset_phys(ds)->ds_uncompressed_bytes;
1297         dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
1298         dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
1299         dmu_buf_rele(dbuf, FTAG);
1300
1301         if (ds->ds_large_blocks)
1302                 dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
1303
1304         ASSERT3U(ds->ds_prev != 0, ==,
1305             dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
1306         if (ds->ds_prev) {
1307                 uint64_t next_clones_obj =
1308                     dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
1309                 ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
1310                     ds->ds_object ||
1311                     dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
1312                 if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
1313                     ds->ds_object) {
1314                         dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
1315                         ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
1316                             dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
1317                         dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
1318                 } else if (next_clones_obj != 0) {
1319                         dsl_dataset_remove_from_next_clones(ds->ds_prev,
1320                             dsphys->ds_next_snap_obj, tx);
1321                         VERIFY0(zap_add_int(mos,
1322                             next_clones_obj, dsobj, tx));
1323                 }
1324         }
1325
1326         /*
1327          * If we have a reference-reservation on this dataset, we will
1328          * need to increase the amount of refreservation being charged
1329          * since our unique space is going to zero.
1330          */
1331         if (ds->ds_reserved) {
1332                 int64_t delta;
1333                 ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
1334                 delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
1335                     ds->ds_reserved);
1336                 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
1337                     delta, 0, 0, tx);
1338         }
1339
1340         dmu_buf_will_dirty(ds->ds_dbuf, tx);
1341         dsl_dataset_phys(ds)->ds_deadlist_obj =
1342             dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
1343             dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
1344         dsl_deadlist_close(&ds->ds_deadlist);
1345         dsl_deadlist_open(&ds->ds_deadlist, mos,
1346             dsl_dataset_phys(ds)->ds_deadlist_obj);
1347         dsl_deadlist_add_key(&ds->ds_deadlist,
1348             dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
1349
1350         ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
1351         dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
1352         dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
1353         dsl_dataset_phys(ds)->ds_unique_bytes = 0;
1354         if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
1355                 dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1356
1357         VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
1358             snapname, 8, 1, &dsobj, tx));
1359
1360         if (ds->ds_prev)
1361                 dsl_dataset_rele(ds->ds_prev, ds);
1362         VERIFY0(dsl_dataset_hold_obj(dp,
1363             dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));
1364
1365         dsl_scan_ds_snapshotted(ds, tx);
1366
1367         dsl_dir_snap_cmtime_update(ds->ds_dir);
1368
1369         spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
1370 }
1371
1372 static void
1373 dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
1374 {
1375         dsl_dataset_snapshot_arg_t *ddsa = arg;
1376         dsl_pool_t *dp = dmu_tx_pool(tx);
1377         nvpair_t *pair;
1378
1379         for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1380             pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1381                 dsl_dataset_t *ds;
1382                 char *name, *atp;
1383                 char dsname[MAXNAMELEN];
1384
1385                 name = nvpair_name(pair);
1386                 atp = strchr(name, '@');
1387                 (void) strlcpy(dsname, name, atp - name + 1);
1388                 VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
1389
1390                 dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
1391                 if (ddsa->ddsa_props != NULL) {
1392                         dsl_props_set_sync_impl(ds->ds_prev,
1393                             ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
1394                 }
1395                 dsl_dataset_rele(ds, FTAG);
1396         }
1397 }
1398
1399 /*
1400  * The snapshots must all be in the same pool.
1401  * All-or-nothing: if there are any failures, nothing will be modified.
1402  */
1403 int
1404 dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
1405 {
1406         dsl_dataset_snapshot_arg_t ddsa;
1407         nvpair_t *pair;
1408         boolean_t needsuspend;
1409         int error;
1410         spa_t *spa;
1411         char *firstname;
1412         nvlist_t *suspended = NULL;
1413
1414         pair = nvlist_next_nvpair(snaps, NULL);
1415         if (pair == NULL)
1416                 return (0);
1417         firstname = nvpair_name(pair);
1418
1419         error = spa_open(firstname, &spa, FTAG);
1420         if (error != 0)
1421                 return (error);
1422         needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
1423         spa_close(spa, FTAG);
1424
1425         if (needsuspend) {
1426                 suspended = fnvlist_alloc();
1427                 for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
1428                     pair = nvlist_next_nvpair(snaps, pair)) {
1429                         char fsname[MAXNAMELEN];
1430                         char *snapname = nvpair_name(pair);
1431                         char *atp;
1432                         void *cookie;
1433
1434                         atp = strchr(snapname, '@');
1435                         if (atp == NULL) {
1436                                 error = SET_ERROR(EINVAL);
1437                                 break;
1438                         }
1439                         (void) strlcpy(fsname, snapname, atp - snapname + 1);
1440
1441                         error = zil_suspend(fsname, &cookie);
1442                         if (error != 0)
1443                                 break;
1444                         fnvlist_add_uint64(suspended, fsname,
1445                             (uintptr_t)cookie);
1446                 }
1447         }
1448
1449         ddsa.ddsa_snaps = snaps;
1450         ddsa.ddsa_props = props;
1451         ddsa.ddsa_errors = errors;
1452         ddsa.ddsa_cr = CRED();
1453
1454         if (error == 0) {
1455                 error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
1456                     dsl_dataset_snapshot_sync, &ddsa,
1457                     fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
1458         }
1459
1460         if (suspended != NULL) {
1461                 for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
1462                     pair = nvlist_next_nvpair(suspended, pair)) {
1463                         zil_resume((void *)(uintptr_t)
1464                             fnvpair_value_uint64(pair));
1465                 }
1466                 fnvlist_free(suspended);
1467         }
1468
1469 #ifdef __FreeBSD__
1470 #ifdef _KERNEL
1471         if (error == 0) {
1472                 for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
1473                     pair = nvlist_next_nvpair(snaps, pair)) {
1474                         char *snapname = nvpair_name(pair);
1475                         zvol_create_minors(snapname);
1476                 }
1477         }
1478 #endif
1479 #endif
1480         return (error);
1481 }
1482
1483 typedef struct dsl_dataset_snapshot_tmp_arg {	/* argument of dsl_dataset_snapshot_tmp_check()/_sync() */
1484         const char *ddsta_fsname;	/* name of the dataset to snapshot */
1485         const char *ddsta_snapname;	/* snapshot name, passed as 'snapname' to the check/sync impl */
1486         minor_t ddsta_cleanup_minor;	/* handed to dsl_dataset_user_hold_sync_one() */
1487         const char *ddsta_htag;	/* tag of the user hold placed on the new snapshot */
1488 } dsl_dataset_snapshot_tmp_arg_t;
1489
1490 static int
1491 dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
1492 {
1493         dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
1494         dsl_pool_t *dp = dmu_tx_pool(tx);
1495         dsl_dataset_t *ds;
1496         int error;
1497
1498         error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
1499         if (error != 0)
1500                 return (error);
1501
1502         /* NULL cred means no limit check for tmp snapshot */
1503         error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
1504             tx, B_FALSE, 0, NULL);
1505         if (error != 0) {
1506                 dsl_dataset_rele(ds, FTAG);
1507                 return (error);
1508         }
1509
1510         if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
1511                 dsl_dataset_rele(ds, FTAG);
1512                 return (SET_ERROR(ENOTSUP));
1513         }
1514         error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
1515             B_TRUE, tx);
1516         if (error != 0) {
1517                 dsl_dataset_rele(ds, FTAG);
1518                 return (error);
1519         }
1520
1521         dsl_dataset_rele(ds, FTAG);
1522         return (0);
1523 }
1524
1525 static void
1526 dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
1527 {
1528         dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
1529         dsl_pool_t *dp = dmu_tx_pool(tx);
1530         dsl_dataset_t *ds;
1531
1532         VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
1533
1534         dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
1535         dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
1536             ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
1537         dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
1538
1539         dsl_dataset_rele(ds, FTAG);
1540 }
1541
1542 int
1543 dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
1544     minor_t cleanup_minor, const char *htag)
1545 {
1546         dsl_dataset_snapshot_tmp_arg_t ddsta;
1547         int error;
1548         spa_t *spa;
1549         boolean_t needsuspend;
1550         void *cookie;
1551
1552         ddsta.ddsta_fsname = fsname;
1553         ddsta.ddsta_snapname = snapname;
1554         ddsta.ddsta_cleanup_minor = cleanup_minor;
1555         ddsta.ddsta_htag = htag;
1556
1557         error = spa_open(fsname, &spa, FTAG);
1558         if (error != 0)
1559                 return (error);
1560         needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
1561         spa_close(spa, FTAG);
1562
1563         if (needsuspend) {
1564                 error = zil_suspend(fsname, &cookie);
1565                 if (error != 0)
1566                         return (error);
1567         }
1568
1569         error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
1570             dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
1571
1572         if (needsuspend)
1573                 zil_resume(cookie);
1574         return (error);
1575 }
1576
1577
1578 void
1579 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
1580 {
1581         ASSERT(dmu_tx_is_syncing(tx));
1582         ASSERT(ds->ds_objset != NULL);
1583         ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
1584
1585         /*
1586          * in case we had to change ds_fsid_guid when we opened it,
1587          * sync it out now.
1588          */
1589         dmu_buf_will_dirty(ds->ds_dbuf, tx);
1590         dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
1591
1592         dmu_objset_sync(ds->ds_objset, zio, tx);
1593
1594         if (ds->ds_need_large_blocks && !ds->ds_large_blocks) {
1595                 dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
1596                 ds->ds_large_blocks = B_TRUE;
1597         }
1598 }
1599
1600 static void
1601 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
1602 {
1603         uint64_t count = 0;
1604         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1605         zap_cursor_t zc;
1606         zap_attribute_t za;
1607         nvlist_t *propval = fnvlist_alloc();
1608         nvlist_t *val = fnvlist_alloc();
1609
1610         ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
1611
1612         /*
1613          * There may be missing entries in ds_next_clones_obj
1614          * due to a bug in a previous version of the code.
1615          * Only trust it if it has the right number of entries.
1616          */
1617         if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
1618                 VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
1619                     &count));
1620         }
1621         if (count != dsl_dataset_phys(ds)->ds_num_children - 1)
1622                 goto fail;
1623         for (zap_cursor_init(&zc, mos,
1624             dsl_dataset_phys(ds)->ds_next_clones_obj);
1625             zap_cursor_retrieve(&zc, &za) == 0;
1626             zap_cursor_advance(&zc)) {
1627                 dsl_dataset_t *clone;
1628                 char buf[ZFS_MAXNAMELEN];
1629                 VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
1630                     za.za_first_integer, FTAG, &clone));
1631                 dsl_dir_name(clone->ds_dir, buf);
1632                 fnvlist_add_boolean(val, buf);
1633                 dsl_dataset_rele(clone, FTAG);
1634         }
1635         zap_cursor_fini(&zc);
1636         fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
1637         fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval);
1638 fail:
1639         nvlist_free(val);
1640         nvlist_free(propval);
1641 }
1642
1643 void
1644 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
1645 {
1646         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1647         uint64_t refd, avail, uobjs, aobjs, ratio;
1648
1649         ASSERT(dsl_pool_config_held(dp));
1650
1651         ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
1652             (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
1653             dsl_dataset_phys(ds)->ds_compressed_bytes);
1654
1655         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
1656         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
1657             dsl_dataset_phys(ds)->ds_uncompressed_bytes);
1658
1659         if (dsl_dataset_is_snapshot(ds)) {
1660                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
1661                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
1662                     dsl_dataset_phys(ds)->ds_unique_bytes);
1663                 get_clones_stat(ds, nv);
1664         } else {
1665                 if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
1666                         char buf[MAXNAMELEN];
1667                         dsl_dataset_name(ds->ds_prev, buf);
1668                         dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, buf);
1669                 }
1670
1671                 dsl_dir_stats(ds->ds_dir, nv);
1672         }
1673
1674         dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
1675         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
1676         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
1677
1678         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
1679             dsl_dataset_phys(ds)->ds_creation_time);
1680         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
1681             dsl_dataset_phys(ds)->ds_creation_txg);
1682         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
1683             ds->ds_quota);
1684         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
1685             ds->ds_reserved);
1686         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
1687             dsl_dataset_phys(ds)->ds_guid);
1688         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
1689             dsl_dataset_phys(ds)->ds_unique_bytes);
1690         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
1691             ds->ds_object);
1692         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
1693             ds->ds_userrefs);
1694         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
1695             DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
1696
1697         if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
1698                 uint64_t written, comp, uncomp;
1699                 dsl_pool_t *dp = ds->ds_dir->dd_pool;
1700                 dsl_dataset_t *prev;
1701
1702                 int err = dsl_dataset_hold_obj(dp,
1703                     dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
1704                 if (err == 0) {
1705                         err = dsl_dataset_space_written(prev, ds, &written,
1706                             &comp, &uncomp);
1707                         dsl_dataset_rele(prev, FTAG);
1708                         if (err == 0) {
1709                                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
1710                                     written);
1711                         }
1712                 }
1713         }
1714 }
1715
1716 void
1717 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
1718 {
1719         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1720         ASSERT(dsl_pool_config_held(dp));
1721
1722         stat->dds_creation_txg = dsl_dataset_phys(ds)->ds_creation_txg;
1723         stat->dds_inconsistent =
1724             dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT;
1725         stat->dds_guid = dsl_dataset_phys(ds)->ds_guid;
1726         stat->dds_origin[0] = '\0';
1727         if (dsl_dataset_is_snapshot(ds)) {
1728                 stat->dds_is_snapshot = B_TRUE;
1729                 stat->dds_num_clones =
1730                     dsl_dataset_phys(ds)->ds_num_children - 1;
1731         } else {
1732                 stat->dds_is_snapshot = B_FALSE;
1733                 stat->dds_num_clones = 0;
1734
1735                 if (dsl_dir_is_clone(ds->ds_dir)) {
1736                         dsl_dataset_t *ods;
1737
1738                         VERIFY0(dsl_dataset_hold_obj(dp,
1739                             dsl_dir_phys(ds->ds_dir)->dd_origin_obj,
1740                             FTAG, &ods));
1741                         dsl_dataset_name(ods, stat->dds_origin);
1742                         dsl_dataset_rele(ods, FTAG);
1743                 }
1744         }
1745 }
1746
1747 uint64_t
1748 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
1749 {
1750         return (ds->ds_fsid_guid);
1751 }
1752
1753 void
1754 dsl_dataset_space(dsl_dataset_t *ds,
1755     uint64_t *refdbytesp, uint64_t *availbytesp,
1756     uint64_t *usedobjsp, uint64_t *availobjsp)
1757 {
1758         *refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
1759         *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
1760         if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
1761                 *availbytesp +=
1762                     ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
1763         if (ds->ds_quota != 0) {
1764                 /*
1765                  * Adjust available bytes according to refquota
1766                  */
1767                 if (*refdbytesp < ds->ds_quota)
1768                         *availbytesp = MIN(*availbytesp,
1769                             ds->ds_quota - *refdbytesp);
1770                 else
1771                         *availbytesp = 0;
1772         }
1773         *usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
1774         *availobjsp = DN_MAX_OBJECT - *usedobjsp;
1775 }
1776
1777 boolean_t
1778 dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
1779 {
1780         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1781
1782         ASSERT(dsl_pool_config_held(dp));
1783         if (snap == NULL)
1784                 return (B_FALSE);
1785         if (dsl_dataset_phys(ds)->ds_bp.blk_birth >
1786             dsl_dataset_phys(snap)->ds_creation_txg) {
1787                 objset_t *os, *os_snap;
1788                 /*
1789                  * It may be that only the ZIL differs, because it was
1790                  * reset in the head.  Don't count that as being
1791                  * modified.
1792                  */
1793                 if (dmu_objset_from_ds(ds, &os) != 0)
1794                         return (B_TRUE);
1795                 if (dmu_objset_from_ds(snap, &os_snap) != 0)
1796                         return (B_TRUE);
1797                 return (bcmp(&os->os_phys->os_meta_dnode,
1798                     &os_snap->os_phys->os_meta_dnode,
1799                     sizeof (os->os_phys->os_meta_dnode)) != 0);
1800         }
1801         return (B_FALSE);
1802 }
1803
/*
 * Arguments shared by the check and sync halves of the snapshot-rename
 * sync task.
 */
typedef struct dsl_dataset_rename_snapshot_arg {
	/* name of the dataset that is held to locate the snapshot(s) */
	const char *ddrsa_fsname;
	/* current snapshot name (the part after the "@") */
	const char *ddrsa_oldsnapname;
	/* requested snapshot name (the part after the "@") */
	const char *ddrsa_newsnapname;
	/* when set, descendants are visited too (DS_FIND_CHILDREN) */
	boolean_t ddrsa_recursive;
	/* filled in by the sync function before the per-dataset callback */
	dmu_tx_t *ddrsa_tx;
} dsl_dataset_rename_snapshot_arg_t;
1811
1812 /* ARGSUSED */
1813 static int
1814 dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
1815     dsl_dataset_t *hds, void *arg)
1816 {
1817         dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
1818         int error;
1819         uint64_t val;
1820
1821         error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
1822         if (error != 0) {
1823                 /* ignore nonexistent snapshots */
1824                 return (error == ENOENT ? 0 : error);
1825         }
1826
1827         /* new name should not exist */
1828         error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
1829         if (error == 0)
1830                 error = SET_ERROR(EEXIST);
1831         else if (error == ENOENT)
1832                 error = 0;
1833
1834         /* dataset name + 1 for the "@" + the new snapshot name must fit */
1835         if (dsl_dir_namelen(hds->ds_dir) + 1 +
1836             strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN)
1837                 error = SET_ERROR(ENAMETOOLONG);
1838
1839         return (error);
1840 }
1841
1842 static int
1843 dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
1844 {
1845         dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
1846         dsl_pool_t *dp = dmu_tx_pool(tx);
1847         dsl_dataset_t *hds;
1848         int error;
1849
1850         error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
1851         if (error != 0)
1852                 return (error);
1853
1854         if (ddrsa->ddrsa_recursive) {
1855                 error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
1856                     dsl_dataset_rename_snapshot_check_impl, ddrsa,
1857                     DS_FIND_CHILDREN);
1858         } else {
1859                 error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
1860         }
1861         dsl_dataset_rele(hds, FTAG);
1862         return (error);
1863 }
1864
1865 static int
1866 dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
1867     dsl_dataset_t *hds, void *arg)
1868 {
1869 #ifdef __FreeBSD__
1870 #ifdef _KERNEL
1871         char *oldname, *newname;
1872 #endif
1873 #endif
1874         dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
1875         dsl_dataset_t *ds;
1876         uint64_t val;
1877         dmu_tx_t *tx = ddrsa->ddrsa_tx;
1878         int error;
1879
1880         error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
1881         ASSERT(error == 0 || error == ENOENT);
1882         if (error == ENOENT) {
1883                 /* ignore nonexistent snapshots */
1884                 return (0);
1885         }
1886
1887         VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
1888
1889         /* log before we change the name */
1890         spa_history_log_internal_ds(ds, "rename", tx,
1891             "-> @%s", ddrsa->ddrsa_newsnapname);
1892
1893         VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
1894             B_FALSE));
1895         mutex_enter(&ds->ds_lock);
1896         (void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
1897         mutex_exit(&ds->ds_lock);
1898         VERIFY0(zap_add(dp->dp_meta_objset,
1899             dsl_dataset_phys(hds)->ds_snapnames_zapobj,
1900             ds->ds_snapname, 8, 1, &ds->ds_object, tx));
1901
1902 #ifdef __FreeBSD__
1903 #ifdef _KERNEL
1904         oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1905         newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1906         snprintf(oldname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
1907             ddrsa->ddrsa_oldsnapname);
1908         snprintf(newname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
1909             ddrsa->ddrsa_newsnapname);
1910         zfsvfs_update_fromname(oldname, newname);
1911         zvol_rename_minors(oldname, newname);
1912         kmem_free(newname, MAXPATHLEN);
1913         kmem_free(oldname, MAXPATHLEN);
1914 #endif
1915 #endif
1916         dsl_dataset_rele(ds, FTAG);
1917
1918         return (0);
1919 }
1920
1921 static void
1922 dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
1923 {
1924         dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
1925         dsl_pool_t *dp = dmu_tx_pool(tx);
1926         dsl_dataset_t *hds;
1927
1928         VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
1929         ddrsa->ddrsa_tx = tx;
1930         if (ddrsa->ddrsa_recursive) {
1931                 VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
1932                     dsl_dataset_rename_snapshot_sync_impl, ddrsa,
1933                     DS_FIND_CHILDREN));
1934         } else {
1935                 VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
1936         }
1937         dsl_dataset_rele(hds, FTAG);
1938 }
1939
1940 int
1941 dsl_dataset_rename_snapshot(const char *fsname,
1942     const char *oldsnapname, const char *newsnapname, boolean_t recursive)
1943 {
1944         dsl_dataset_rename_snapshot_arg_t ddrsa;
1945
1946         ddrsa.ddrsa_fsname = fsname;
1947         ddrsa.ddrsa_oldsnapname = oldsnapname;
1948         ddrsa.ddrsa_newsnapname = newsnapname;
1949         ddrsa.ddrsa_recursive = recursive;
1950
1951         return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
1952             dsl_dataset_rename_snapshot_sync, &ddrsa,
1953             1, ZFS_SPACE_CHECK_RESERVED));
1954 }
1955
1956 /*
1957  * If we're doing an ownership handoff, we need to make sure that there is
1958  * only one long hold on the dataset.  We're not allowed to change anything here
1959  * so we don't permanently release the long hold or regular hold here.  We want
1960  * to do this only when syncing to avoid the dataset unexpectedly going away
1961  * when we release the long hold.
1962  */
1963 static int
1964 dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
1965 {
1966         boolean_t held;
1967
1968         if (!dmu_tx_is_syncing(tx))
1969                 return (0);
1970
1971         if (owner != NULL) {
1972                 VERIFY3P(ds->ds_owner, ==, owner);
1973                 dsl_dataset_long_rele(ds, owner);
1974         }
1975
1976         held = dsl_dataset_long_held(ds);
1977
1978         if (owner != NULL)
1979                 dsl_dataset_long_hold(ds, owner);
1980
1981         if (held)
1982                 return (SET_ERROR(EBUSY));
1983
1984         return (0);
1985 }
1986
/*
 * Arguments shared by the check and sync halves of the rollback sync
 * task.
 */
typedef struct dsl_dataset_rollback_arg {
	/* name of the dataset to roll back */
	const char *ddra_fsname;
	/* owner passed to the handoff check; may be NULL */
	void *ddra_owner;
	/* receives the rolled-back-to snapshot name under key "target" */
	nvlist_t *ddra_result;
} dsl_dataset_rollback_arg_t;
1992
1993 static int
1994 dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
1995 {
1996         dsl_dataset_rollback_arg_t *ddra = arg;
1997         dsl_pool_t *dp = dmu_tx_pool(tx);
1998         dsl_dataset_t *ds;
1999         int64_t unused_refres_delta;
2000         int error;
2001
2002         error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
2003         if (error != 0)
2004                 return (error);
2005
2006         /* must not be a snapshot */
2007         if (dsl_dataset_is_snapshot(ds)) {
2008                 dsl_dataset_rele(ds, FTAG);
2009                 return (SET_ERROR(EINVAL));
2010         }
2011
2012         /* must have a most recent snapshot */
2013         if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
2014                 dsl_dataset_rele(ds, FTAG);
2015                 return (SET_ERROR(EINVAL));
2016         }
2017
2018         /* must not have any bookmarks after the most recent snapshot */
2019         nvlist_t *proprequest = fnvlist_alloc();
2020         fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
2021         nvlist_t *bookmarks = fnvlist_alloc();
2022         error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
2023         fnvlist_free(proprequest);
2024         if (error != 0)
2025                 return (error);
2026         for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL);
2027             pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
2028                 nvlist_t *valuenv =
2029                     fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
2030                     zfs_prop_to_name(ZFS_PROP_CREATETXG));
2031                 uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
2032                 if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
2033                         fnvlist_free(bookmarks);
2034                         dsl_dataset_rele(ds, FTAG);
2035                         return (SET_ERROR(EEXIST));
2036                 }
2037         }
2038         fnvlist_free(bookmarks);
2039
2040         error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
2041         if (error != 0) {
2042                 dsl_dataset_rele(ds, FTAG);
2043                 return (error);
2044         }
2045
2046         /*
2047          * Check if the snap we are rolling back to uses more than
2048          * the refquota.
2049          */
2050         if (ds->ds_quota != 0 &&
2051             dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
2052                 dsl_dataset_rele(ds, FTAG);
2053                 return (SET_ERROR(EDQUOT));
2054         }
2055
2056         /*
2057          * When we do the clone swap, we will temporarily use more space
2058          * due to the refreservation (the head will no longer have any
2059          * unique space, so the entire amount of the refreservation will need
2060          * to be free).  We will immediately destroy the clone, freeing
2061          * this space, but the freeing happens over many txg's.
2062          */
2063         unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
2064             dsl_dataset_phys(ds)->ds_unique_bytes);
2065
2066         if (unused_refres_delta > 0 &&
2067             unused_refres_delta >
2068             dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
2069                 dsl_dataset_rele(ds, FTAG);
2070                 return (SET_ERROR(ENOSPC));
2071         }
2072
2073         dsl_dataset_rele(ds, FTAG);
2074         return (0);
2075 }
2076
2077 static void
2078 dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
2079 {
2080         dsl_dataset_rollback_arg_t *ddra = arg;
2081         dsl_pool_t *dp = dmu_tx_pool(tx);
2082         dsl_dataset_t *ds, *clone;
2083         uint64_t cloneobj;
2084         char namebuf[ZFS_MAXNAMELEN];
2085
2086         VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
2087
2088         dsl_dataset_name(ds->ds_prev, namebuf);
2089         fnvlist_add_string(ddra->ddra_result, "target", namebuf);
2090
2091         cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
2092             ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
2093
2094         VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
2095
2096         dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
2097         dsl_dataset_zero_zil(ds, tx);
2098
2099         dsl_destroy_head_sync_impl(clone, tx);
2100
2101         dsl_dataset_rele(clone, FTAG);
2102         dsl_dataset_rele(ds, FTAG);
2103 }
2104
2105 /*
2106  * Rolls back the given filesystem or volume to the most recent snapshot.
2107  * The name of the most recent snapshot will be returned under key "target"
2108  * in the result nvlist.
2109  *
2110  * If owner != NULL:
2111  * - The existing dataset MUST be owned by the specified owner at entry
2112  * - Upon return, dataset will still be held by the same owner, whether we
2113  *   succeed or not.
2114  *
2115  * This mode is required any time the existing filesystem is mounted.  See
2116  * notes above zfs_suspend_fs() for further details.
2117  */
2118 int
2119 dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result)
2120 {
2121         dsl_dataset_rollback_arg_t ddra;
2122
2123         ddra.ddra_fsname = fsname;
2124         ddra.ddra_owner = owner;
2125         ddra.ddra_result = result;
2126
2127         return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
2128             dsl_dataset_rollback_sync, &ddra,
2129             1, ZFS_SPACE_CHECK_RESERVED));
2130 }
2131
/*
 * Element of the snapshot lists kept in dsl_dataset_promote_arg_t
 * (shared_snaps, origin_snaps, clone_snaps).
 */
struct promotenode {
	list_node_t link;	/* linkage within the owning list_t */
	dsl_dataset_t *ds;	/* dataset this element refers to */
};
2136
2137 typedef struct dsl_dataset_promote_arg {
2138         const char *ddpa_clonename;
2139         dsl_dataset_t *ddpa_clone;
2140         list_t shared_snaps, origin_snaps, clone_snaps;
2141         dsl_dataset_t *origin_origin; /* origin of the origin */
2142         uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
2143         char *err_ds;
2144         cred_t *cr;
2145 } dsl_dataset_promote_arg_t;
2146
/*
 * Forward declarations of file-local helpers used by the promote
 * check/sync functions below.
 */
static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
    void *tag);
static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
2151
/*
 * Check half of the promote sync task.
 *
 * Acquires the promote state with promote_hold() and releases it with
 * promote_rele() on every path.  In open context only the NOPROMOTE
 * flag is checked; the space computations are done in syncing context
 * only, where the results are stored in ddpa (used, comp, uncomp,
 * unique, cloneusedsnap, originusedsnap).
 *
 * Returns:
 *   EXDEV   the clone has DS_FLAG_NOPROMOTE set
 *   EBUSY   one of the shared snapshots has a long hold
 *   EEXIST  a shared snapshot's name already exists in the clone; the
 *           name is copied to ddpa->err_ds
 *   other   errors from promote_hold(), dsl_dataset_snap_lookup(),
 *           dsl_dir_transfer_possible() or snaplist_space()
 *   0       the promotion may proceed
 */
static int
dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_promote_arg_t *ddpa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *hds;
	struct promotenode *snap;
	dsl_dataset_t *origin_ds;
	int err;
	uint64_t unused;
	uint64_t ss_mv_cnt;

	err = promote_hold(ddpa, dp, FTAG);
	if (err != 0)
		return (err);

	hds = ddpa->ddpa_clone;

	if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
		promote_rele(ddpa, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * Compute and check the amount of space to transfer.  Since this is
	 * so expensive, don't do the preliminary check.
	 */
	if (!dmu_tx_is_syncing(tx)) {
		promote_rele(ddpa, FTAG);
		return (0);
	}

	/* the head of shared_snaps is the clone's origin snapshot */
	snap = list_head(&ddpa->shared_snaps);
	origin_ds = snap->ds;

	/* compute origin's new unique space */
	snap = list_tail(&ddpa->clone_snaps);
	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
	    origin_ds->ds_object);
	dsl_deadlist_space_range(&snap->ds->ds_deadlist,
	    dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
	    &ddpa->unique, &unused, &unused);

	/*
	 * Walk the snapshots that we are moving
	 *
	 * Compute space to transfer.  Consider the incremental changes
	 * to used by each snapshot:
	 * (my used) = (prev's used) + (blocks born) - (blocks killed)
	 * So each snapshot gave birth to:
	 * (blocks born) = (my used) - (prev's used) + (blocks killed)
	 * So a sequence would look like:
	 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
	 * Which simplifies to:
	 * uN + kN + kN-1 + ... + k1 + k0
	 * Note however, if we stop before we reach the ORIGIN we get:
	 * uN + kN + kN-1 + ... + kM - uM-1
	 */
	ss_mv_cnt = 0;
	ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
	ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
	ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
	for (snap = list_head(&ddpa->shared_snaps); snap;
	    snap = list_next(&ddpa->shared_snaps, snap)) {
		uint64_t val, dlused, dlcomp, dluncomp;
		dsl_dataset_t *ds = snap->ds;

		/* count of snapshots moved, for the limit check below */
		ss_mv_cnt++;

		/*
		 * If there are long holds, we won't be able to evict
		 * the objset.
		 */
		if (dsl_dataset_long_held(ds)) {
			err = SET_ERROR(EBUSY);
			goto out;
		}

		/* Check that the snapshot name does not conflict */
		VERIFY0(dsl_dataset_get_snapname(ds));
		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
		if (err == 0) {
			(void) strcpy(ddpa->err_ds, snap->ds->ds_snapname);
			err = SET_ERROR(EEXIST);
			goto out;
		}
		/* ENOENT is the expected result: the name is free */
		if (err != ENOENT)
			goto out;

		/* The very first snapshot does not have a deadlist */
		if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
			continue;

		dsl_deadlist_space(&ds->ds_deadlist,
		    &dlused, &dlcomp, &dluncomp);
		ddpa->used += dlused;
		ddpa->comp += dlcomp;
		ddpa->uncomp += dluncomp;
	}

	/*
	 * If we are a clone of a clone then we never reached ORIGIN,
	 * so we need to subtract out the clone origin's used space.
	 */
	if (ddpa->origin_origin) {
		ddpa->used -=
		    dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
		ddpa->comp -=
		    dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
		ddpa->uncomp -=
		    dsl_dataset_phys(ddpa->origin_origin)->
		    ds_uncompressed_bytes;
	}

	/*
	 * Check that there is enough space and limit headroom here.
	 * This also overwrites the ENOENT left in err by the loop above.
	 */
	err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
	    0, ss_mv_cnt, ddpa->used, ddpa->cr);
	if (err != 0)
		goto out;

	/*
	 * Compute the amounts of space that will be used by snapshots
	 * after the promotion (for both origin and clone).  For each,
	 * it is the amount of space that will be on all of their
	 * deadlists (that was not born before their new origin).
	 */
	if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		uint64_t space;

		/*
		 * Note, typically this will not be a clone of a clone,
		 * so dd_origin_txg will be < TXG_INITIAL, so
		 * these snaplist_space() -> dsl_deadlist_space_range()
		 * calls will be fast because they do not have to
		 * iterate over all bps.
		 */
		snap = list_head(&ddpa->origin_snaps);
		err = snaplist_space(&ddpa->shared_snaps,
		    snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
		if (err != 0)
			goto out;

		err = snaplist_space(&ddpa->clone_snaps,
		    snap->ds->ds_dir->dd_origin_txg, &space);
		if (err != 0)
			goto out;
		ddpa->cloneusedsnap += space;
	}
	if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
	    DD_FLAG_USED_BREAKDOWN) {
		err = snaplist_space(&ddpa->origin_snaps,
		    dsl_dataset_phys(origin_ds)->ds_creation_txg,
		    &ddpa->originusedsnap);
		if (err != 0)
			goto out;
	}

out:
	promote_rele(ddpa, FTAG);
	return (err);
}
2313
2314 static void
2315 dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
2316 {
2317         dsl_dataset_promote_arg_t *ddpa = arg;
2318         dsl_pool_t *dp = dmu_tx_pool(tx);
2319         dsl_dataset_t *hds;
2320         struct promotenode *snap;
2321         dsl_dataset_t *origin_ds;
2322         dsl_dataset_t *origin_head;
2323         dsl_dir_t *dd;
2324         dsl_dir_t *odd = NULL;
2325         uint64_t oldnext_obj;
2326         int64_t delta;
2327 #if defined(__FreeBSD__) && defined(_KERNEL)
2328         char *oldname, *newname;
2329 #endif
2330
2331         VERIFY0(promote_hold(ddpa, dp, FTAG));
2332         hds = ddpa->ddpa_clone;
2333
2334         ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
2335
2336         snap = list_head(&ddpa->shared_snaps);
2337         origin_ds = snap->ds;
2338         dd = hds->ds_dir;
2339
2340         snap = list_head(&ddpa->origin_snaps);
2341         origin_head = snap->ds;
2342
2343         /*
2344          * We need to explicitly open odd, since origin_ds's dd will be
2345          * changing.
2346          */
2347         VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
2348             NULL, FTAG, &odd));
2349
2350         /* change origin's next snap */
2351         dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
2352         oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
2353         snap = list_tail(&ddpa->clone_snaps);
2354         ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
2355             origin_ds->ds_object);
2356         dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
2357
2358         /* change the origin's next clone */
2359         if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
2360                 dsl_dataset_remove_from_next_clones(origin_ds,
2361                     snap->ds->ds_object, tx);
2362                 VERIFY0(zap_add_int(dp->dp_meta_objset,
2363                     dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
2364                     oldnext_obj, tx));
2365         }
2366
2367         /* change origin */
2368         dmu_buf_will_dirty(dd->dd_dbuf, tx);
2369         ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
2370         dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
2371         dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
2372         dmu_buf_will_dirty(odd->dd_dbuf, tx);
2373         dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
2374         origin_head->ds_dir->dd_origin_txg =
2375             dsl_dataset_phys(origin_ds)->ds_creation_txg;
2376
2377         /* change dd_clone entries */
2378         if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2379                 VERIFY0(zap_remove_int(dp->dp_meta_objset,
2380                     dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
2381                 VERIFY0(zap_add_int(dp->dp_meta_objset,
2382                     dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
2383                     hds->ds_object, tx));
2384
2385                 VERIFY0(zap_remove_int(dp->dp_meta_objset,
2386                     dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
2387                     origin_head->ds_object, tx));
2388                 if (dsl_dir_phys(dd)->dd_clones == 0) {
2389                         dsl_dir_phys(dd)->dd_clones =
2390                             zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
2391                             DMU_OT_NONE, 0, tx);
2392                 }
2393                 VERIFY0(zap_add_int(dp->dp_meta_objset,
2394                     dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
2395         }
2396
2397 #if defined(__FreeBSD__) && defined(_KERNEL)
2398         /* Take the spa_namespace_lock early so zvol renames don't deadlock. */
2399         mutex_enter(&spa_namespace_lock);
2400
2401         oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2402         newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2403 #endif
2404
2405         /* move snapshots to this dir */
2406         for (snap = list_head(&ddpa->shared_snaps); snap;
2407             snap = list_next(&ddpa->shared_snaps, snap)) {
2408                 dsl_dataset_t *ds = snap->ds;
2409
2410                 /*
2411                  * Property callbacks are registered to a particular
2412                  * dsl_dir.  Since ours is changing, evict the objset
2413                  * so that they will be unregistered from the old dsl_dir.
2414                  */
2415                 if (ds->ds_objset) {
2416                         dmu_objset_evict(ds->ds_objset);
2417                         ds->ds_objset = NULL;
2418                 }
2419
2420                 /* move snap name entry */
2421                 VERIFY0(dsl_dataset_get_snapname(ds));
2422                 VERIFY0(dsl_dataset_snap_remove(origin_head,
2423                     ds->ds_snapname, tx, B_TRUE));
2424                 VERIFY0(zap_add(dp->dp_meta_objset,
2425                     dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
2426                     8, 1, &ds->ds_object, tx));
2427                 dsl_fs_ss_count_adjust(hds->ds_dir, 1,
2428                     DD_FIELD_SNAPSHOT_COUNT, tx);
2429
2430                 /* change containing dsl_dir */
2431                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2432                 ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
2433                 dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
2434                 ASSERT3P(ds->ds_dir, ==, odd);
2435                 dsl_dir_rele(ds->ds_dir, ds);
2436                 VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
2437                     NULL, ds, &ds->ds_dir));
2438
2439 #if defined(__FreeBSD__) && defined(_KERNEL)
2440                 dsl_dataset_name(ds, newname);
2441                 zfsvfs_update_fromname(oldname, newname);
2442                 zvol_rename_minors(oldname, newname);
2443 #endif
2444
2445                 /* move any clone references */
2446                 if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
2447                     spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2448                         zap_cursor_t zc;
2449                         zap_attribute_t za;
2450
2451                         for (zap_cursor_init(&zc, dp->dp_meta_objset,
2452                             dsl_dataset_phys(ds)->ds_next_clones_obj);
2453                             zap_cursor_retrieve(&zc, &za) == 0;
2454                             zap_cursor_advance(&zc)) {
2455                                 dsl_dataset_t *cnds;
2456                                 uint64_t o;
2457
2458                                 if (za.za_first_integer == oldnext_obj) {
2459                                         /*
2460                                          * We've already moved the
2461                                          * origin's reference.
2462                                          */
2463                                         continue;
2464                                 }
2465
2466                                 VERIFY0(dsl_dataset_hold_obj(dp,
2467                                     za.za_first_integer, FTAG, &cnds));
2468                                 o = dsl_dir_phys(cnds->ds_dir)->
2469                                     dd_head_dataset_obj;
2470
2471                                 VERIFY0(zap_remove_int(dp->dp_meta_objset,
2472                                     dsl_dir_phys(odd)->dd_clones, o, tx));
2473                                 VERIFY0(zap_add_int(dp->dp_meta_objset,
2474                                     dsl_dir_phys(dd)->dd_clones, o, tx));
2475                                 dsl_dataset_rele(cnds, FTAG);
2476                         }
2477                         zap_cursor_fini(&zc);
2478                 }
2479
2480                 ASSERT(!dsl_prop_hascb(ds));
2481         }
2482
2483 #if defined(__FreeBSD__) && defined(_KERNEL)
2484         mutex_exit(&spa_namespace_lock);
2485
2486         kmem_free(newname, MAXPATHLEN);
2487         kmem_free(oldname, MAXPATHLEN);
2488 #endif
2489         /*
2490          * Change space accounting.
2491          * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
2492          * both be valid, or both be 0 (resulting in delta == 0).  This
2493          * is true for each of {clone,origin} independently.
2494          */
2495
2496         delta = ddpa->cloneusedsnap -
2497             dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
2498         ASSERT3S(delta, >=, 0);
2499         ASSERT3U(ddpa->used, >=, delta);
2500         dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
2501         dsl_dir_diduse_space(dd, DD_USED_HEAD,
2502             ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
2503
2504         delta = ddpa->originusedsnap -
2505             dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
2506         ASSERT3S(delta, <=, 0);
2507         ASSERT3U(ddpa->used, >=, -delta);
2508         dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
2509         dsl_dir_diduse_space(odd, DD_USED_HEAD,
2510             -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
2511
2512         dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
2513
2514         /* log history record */
2515         spa_history_log_internal_ds(hds, "promote", tx, "");
2516
2517         dsl_dir_rele(odd, FTAG);
2518         promote_rele(ddpa, FTAG);
2519 }
2520
2521 /*
2522  * Make a list of dsl_dataset_t's for the snapshots between first_obj
2523  * (exclusive) and last_obj (inclusive).  The list will be in reverse
2524  * order (last_obj will be the list_head()).  If first_obj == 0, do all
2525  * snapshots back to this dataset's origin.
2526  */
2527 static int
2528 snaplist_make(dsl_pool_t *dp,
2529     uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
2530 {
2531         uint64_t obj = last_obj;
2532
2533         list_create(l, sizeof (struct promotenode),
2534             offsetof(struct promotenode, link));
2535
2536         while (obj != first_obj) {
2537                 dsl_dataset_t *ds;
2538                 struct promotenode *snap;
2539                 int err;
2540
2541                 err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
2542                 ASSERT(err != ENOENT);
2543                 if (err != 0)
2544                         return (err);
2545
2546                 if (first_obj == 0)
2547                         first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;
2548
2549                 snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
2550                 snap->ds = ds;
2551                 list_insert_tail(l, snap);
2552                 obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
2553         }
2554
2555         return (0);
2556 }
2557
2558 static int
2559 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
2560 {
2561         struct promotenode *snap;
2562
2563         *spacep = 0;
2564         for (snap = list_head(l); snap; snap = list_next(l, snap)) {
2565                 uint64_t used, comp, uncomp;
2566                 dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2567                     mintxg, UINT64_MAX, &used, &comp, &uncomp);
2568                 *spacep += used;
2569         }
2570         return (0);
2571 }
2572
2573 static void
2574 snaplist_destroy(list_t *l, void *tag)
2575 {
2576         struct promotenode *snap;
2577
2578         if (l == NULL || !list_link_active(&l->list_head))
2579                 return;
2580
2581         while ((snap = list_tail(l)) != NULL) {
2582                 list_remove(l, snap);
2583                 dsl_dataset_rele(snap->ds, tag);
2584                 kmem_free(snap, sizeof (*snap));
2585         }
2586         list_destroy(l);
2587 }
2588
2589 static int
2590 promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
2591 {
2592         int error;
2593         dsl_dir_t *dd;
2594         struct promotenode *snap;
2595
2596         error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
2597             &ddpa->ddpa_clone);
2598         if (error != 0)
2599                 return (error);
2600         dd = ddpa->ddpa_clone->ds_dir;
2601
2602         if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) ||
2603             !dsl_dir_is_clone(dd)) {
2604                 dsl_dataset_rele(ddpa->ddpa_clone, tag);
2605                 return (SET_ERROR(EINVAL));
2606         }
2607
2608         error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
2609             &ddpa->shared_snaps, tag);
2610         if (error != 0)
2611                 goto out;
2612
2613         error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
2614             &ddpa->clone_snaps, tag);
2615         if (error != 0)
2616                 goto out;
2617
2618         snap = list_head(&ddpa->shared_snaps);
2619         ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
2620         error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
2621             dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
2622             &ddpa->origin_snaps, tag);
2623         if (error != 0)
2624                 goto out;
2625
2626         if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
2627                 error = dsl_dataset_hold_obj(dp,
2628                     dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
2629                     tag, &ddpa->origin_origin);
2630                 if (error != 0)
2631                         goto out;
2632         }
2633 out:
2634         if (error != 0)
2635                 promote_rele(ddpa, tag);
2636         return (error);
2637 }
2638
2639 static void
2640 promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
2641 {
2642         snaplist_destroy(&ddpa->shared_snaps, tag);
2643         snaplist_destroy(&ddpa->clone_snaps, tag);
2644         snaplist_destroy(&ddpa->origin_snaps, tag);
2645         if (ddpa->origin_origin != NULL)
2646                 dsl_dataset_rele(ddpa->origin_origin, tag);
2647         dsl_dataset_rele(ddpa->ddpa_clone, tag);
2648 }
2649
2650 /*
2651  * Promote a clone.
2652  *
2653  * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
2654  * in with the name.  (It must be at least MAXNAMELEN bytes long.)
2655  */
2656 int
2657 dsl_dataset_promote(const char *name, char *conflsnap)
2658 {
2659         dsl_dataset_promote_arg_t ddpa = { 0 };
2660         uint64_t numsnaps;
2661         int error;
2662         objset_t *os;
2663
2664         /*
2665          * We will modify space proportional to the number of
2666          * snapshots.  Compute numsnaps.
2667          */
2668         error = dmu_objset_hold(name, FTAG, &os);
2669         if (error != 0)
2670                 return (error);
2671         error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
2672             dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
2673             &numsnaps);
2674         dmu_objset_rele(os, FTAG);
2675         if (error != 0)
2676                 return (error);
2677
2678         ddpa.ddpa_clonename = name;
2679         ddpa.err_ds = conflsnap;
2680         ddpa.cr = CRED();
2681
2682         return (dsl_sync_task(name, dsl_dataset_promote_check,
2683             dsl_dataset_promote_sync, &ddpa,
2684             2 + numsnaps, ZFS_SPACE_CHECK_RESERVED));
2685 }
2686
2687 int
2688 dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
2689     dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
2690 {
2691         int64_t unused_refres_delta;
2692
2693         /* they should both be heads */
2694         if (dsl_dataset_is_snapshot(clone) ||
2695             dsl_dataset_is_snapshot(origin_head))
2696                 return (SET_ERROR(EINVAL));
2697
2698         /* if we are not forcing, the branch point should be just before them */
2699         if (!force && clone->ds_prev != origin_head->ds_prev)
2700                 return (SET_ERROR(EINVAL));
2701
2702         /* clone should be the clone (unless they are unrelated) */
2703         if (clone->ds_prev != NULL &&
2704             clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
2705             origin_head->ds_dir != clone->ds_prev->ds_dir)
2706                 return (SET_ERROR(EINVAL));
2707
2708         /* the clone should be a child of the origin */
2709         if (clone->ds_dir->dd_parent != origin_head->ds_dir)
2710                 return (SET_ERROR(EINVAL));
2711
2712         /* origin_head shouldn't be modified unless 'force' */
2713         if (!force &&
2714             dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
2715                 return (SET_ERROR(ETXTBSY));
2716
2717         /* origin_head should have no long holds (e.g. is not mounted) */
2718         if (dsl_dataset_handoff_check(origin_head, owner, tx))
2719                 return (SET_ERROR(EBUSY));
2720
2721         /* check amount of any unconsumed refreservation */
2722         unused_refres_delta =
2723             (int64_t)MIN(origin_head->ds_reserved,
2724             dsl_dataset_phys(origin_head)->ds_unique_bytes) -
2725             (int64_t)MIN(origin_head->ds_reserved,
2726             dsl_dataset_phys(clone)->ds_unique_bytes);
2727
2728         if (unused_refres_delta > 0 &&
2729             unused_refres_delta >
2730             dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
2731                 return (SET_ERROR(ENOSPC));
2732
2733         /* clone can't be over the head's refquota */
2734         if (origin_head->ds_quota != 0 &&
2735             dsl_dataset_phys(clone)->ds_referenced_bytes >
2736             origin_head->ds_quota)
2737                 return (SET_ERROR(EDQUOT));
2738
2739         return (0);
2740 }
2741
/*
 * Exchange the contents of 'clone' and 'origin_head' in syncing context.
 *
 * The two datasets trade block pointers, ds_*_bytes counters and
 * deadlists.  Before the swap, the difference in space is computed and
 * charged to / credited from the two dsl_dirs, and the unique bytes of
 * the shared previous snapshot (if any) are recomputed from the clone's
 * deadlist.  Any cached objsets are evicted first, and a "clone swap"
 * history record is logged at the end.
 *
 * Statement order matters here: every delta is calculated from the
 * pre-swap values, so the SWITCH64() calls and the deadlist exchange
 * must stay after the accounting.
 */
2742 void
2743 dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
2744     dsl_dataset_t *origin_head, dmu_tx_t *tx)
2745 {
2746         dsl_pool_t *dp = dmu_tx_pool(tx);
2747         int64_t unused_refres_delta;
2748
2749         ASSERT(clone->ds_reserved == 0);
             /*
              * NOTE(review): dsl_dataset_clone_swap_check_impl() compares
              * ds_referenced_bytes against ds_quota, whereas this assertion
              * uses ds_unique_bytes -- confirm which one is intended.
              */
2750         ASSERT(origin_head->ds_quota == 0 ||
2751             dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota);
2752         ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
2753
2754         dmu_buf_will_dirty(clone->ds_dbuf, tx);
2755         dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
2756
             /* Drop any cached objsets before the datasets change contents. */
2757         if (clone->ds_objset != NULL) {
2758                 dmu_objset_evict(clone->ds_objset);
2759                 clone->ds_objset = NULL;
2760         }
2761
2762         if (origin_head->ds_objset != NULL) {
2763                 dmu_objset_evict(origin_head->ds_objset);
2764                 origin_head->ds_objset = NULL;
2765         }
2766
             /*
              * Same formula as in dsl_dataset_clone_swap_check_impl().  It is
              * evaluated here, before the ds_unique_bytes values are exchanged
              * further down, and applied to DD_USED_REFRSRV after the swap.
              */
2767         unused_refres_delta =
2768             (int64_t)MIN(origin_head->ds_reserved,
2769             dsl_dataset_phys(origin_head)->ds_unique_bytes) -
2770             (int64_t)MIN(origin_head->ds_reserved,
2771             dsl_dataset_phys(clone)->ds_unique_bytes);
2772
2773         /*
2774          * Reset origin's unique bytes, if it exists.
2775          */
2776         if (clone->ds_prev) {
2777                 dsl_dataset_t *origin = clone->ds_prev;
2778                 uint64_t comp, uncomp;
2779
2780                 dmu_buf_will_dirty(origin->ds_dbuf, tx);
2781                 dsl_deadlist_space_range(&clone->ds_deadlist,
2782                     dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
2783                     &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
2784         }
2785
2786         /* swap blkptrs */
2787         {
2788                 blkptr_t tmp;
2789                 tmp = dsl_dataset_phys(origin_head)->ds_bp;
2790                 dsl_dataset_phys(origin_head)->ds_bp =
2791                     dsl_dataset_phys(clone)->ds_bp;
2792                 dsl_dataset_phys(clone)->ds_bp = tmp;
2793         }
2794
2795         /* set dd_*_bytes */
2796         {
2797                 int64_t dused, dcomp, duncomp;
2798                 uint64_t cdl_used, cdl_comp, cdl_uncomp;
2799                 uint64_t odl_used, odl_comp, odl_uncomp;
2800
2801                 ASSERT3U(dsl_dir_phys(clone->ds_dir)->
2802                     dd_used_breakdown[DD_USED_SNAP], ==, 0);
2803
2804                 dsl_deadlist_space(&clone->ds_deadlist,
2805                     &cdl_used, &cdl_comp, &cdl_uncomp);
2806                 dsl_deadlist_space(&origin_head->ds_deadlist,
2807                     &odl_used, &odl_comp, &odl_uncomp);
2808
                     /*
                      * Each delta is (clone's referenced + deadlist space)
                      * minus (origin_head's referenced + deadlist space).
                      */
2809                 dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
2810                     cdl_used -
2811                     (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
2812                     odl_used);
2813                 dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
2814                     cdl_comp -
2815                     (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
2816                     odl_comp);
2817                 duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
2818                     cdl_uncomp -
2819                     (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
2820                     odl_uncomp);
2821
                     /* Charge the origin's dir and credit the clone's dir. */
2822                 dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
2823                     dused, dcomp, duncomp, tx);
2824                 dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
2825                     -dused, -dcomp, -duncomp, tx);
2826
2827                 /*
2828                  * The difference in the space used by snapshots is the
2829                  * difference in snapshot space due to the head's
2830                  * deadlist (since that's the only thing that's
2831                  * changing that affects the snapused).
2832                  */
2833                 dsl_deadlist_space_range(&clone->ds_deadlist,
2834                     origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
2835                     &cdl_used, &cdl_comp, &cdl_uncomp);
2836                 dsl_deadlist_space_range(&origin_head->ds_deadlist,
2837                     origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
2838                     &odl_used, &odl_comp, &odl_uncomp);
                     /*
                      * NOTE(review): tx is passed as NULL here, while the
                      * dsl_dir_diduse_space() calls above pass tx -- confirm
                      * that dsl_dir_transfer_space() accepts a NULL tx.
                      */
2839                 dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
2840                     DD_USED_HEAD, DD_USED_SNAP, NULL);
2841         }
2842
2843         /* swap ds_*_bytes */
2844         SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
2845             dsl_dataset_phys(clone)->ds_referenced_bytes);
2846         SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
2847             dsl_dataset_phys(clone)->ds_compressed_bytes);
2848         SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
2849             dsl_dataset_phys(clone)->ds_uncompressed_bytes);
2850         SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
2851             dsl_dataset_phys(clone)->ds_unique_bytes);
2852
2853         /* apply any parent delta for change in unconsumed refreservation */
2854         dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
2855             unused_refres_delta, 0, 0, tx);
2856
2857         /*
2858          * Swap deadlists.
2859          */
             /*
              * Both in-core deadlists are closed, the on-disk object numbers
              * are exchanged, and each deadlist is reopened on its new object.
              */
2860         dsl_deadlist_close(&clone->ds_deadlist);
2861         dsl_deadlist_close(&origin_head->ds_deadlist);
2862         SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
2863             dsl_dataset_phys(clone)->ds_deadlist_obj);
2864         dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
2865             dsl_dataset_phys(clone)->ds_deadlist_obj);
2866         dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
2867             dsl_dataset_phys(origin_head)->ds_deadlist_obj);
2868
2869         dsl_scan_ds_clone_swapped(origin_head, clone, tx);
2870
2871         spa_history_log_internal_ds(clone, "clone swap", tx,
2872             "parent=%s", origin_head->ds_dir->dd_myname);
2873 }
2874
2875 /*
2876  * Given a pool name and a dataset object number in that pool,
2877  * return the name of that dataset.
2878  */
2879 int
2880 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
2881 {
2882         dsl_pool_t *dp;
2883         dsl_dataset_t *ds;
2884         int error;
2885
2886         error = dsl_pool_hold(pname, FTAG, &dp);
2887         if (error != 0)
2888                 return (error);
2889
2890         error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
2891         if (error == 0) {
2892                 dsl_dataset_name(ds, buf);
2893                 dsl_dataset_rele(ds, FTAG);
2894         }
2895         dsl_pool_rele(dp, FTAG);
2896
2897         return (error);
2898 }
2899
2900 int
2901 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
2902     uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
2903 {
2904         int error = 0;
2905
2906         ASSERT3S(asize, >, 0);
2907
2908         /*
2909          * *ref_rsrv is the portion of asize that will come from any
2910          * unconsumed refreservation space.
2911          */
2912         *ref_rsrv = 0;
2913
2914         mutex_enter(&ds->ds_lock);
2915         /*
2916          * Make a space adjustment for reserved bytes.
2917          */
2918         if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
2919                 ASSERT3U(*used, >=,
2920                     ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
2921                 *used -=
2922                     (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
2923                 *ref_rsrv =
2924                     asize - MIN(asize, parent_delta(ds, asize + inflight));
2925         }
2926
2927         if (!check_quota || ds->ds_quota == 0) {
2928                 mutex_exit(&ds->ds_lock);
2929                 return (0);
2930         }
2931         /*
2932          * If they are requesting more space, and our current estimate
2933          * is over quota, they get to try again unless the actual
2934          * on-disk is over quota and there are no pending changes (which
2935          * may free up space for us).
2936          */
2937         if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
2938             ds->ds_quota) {
2939                 if (inflight > 0 ||
2940                     dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
2941                         error = SET_ERROR(ERESTART);
2942                 else
2943                         error = SET_ERROR(EDQUOT);
2944         }
2945         mutex_exit(&ds->ds_lock);
2946
2947         return (error);
2948 }
2949
/*
 * Argument bundle passed through dsl_sync_task() to the refquota and
 * refreservation check/sync callbacks.
 */
2950 typedef struct dsl_dataset_set_qr_arg {
2951         const char *ddsqra_name;        /* name of the dataset to hold */
2952         zprop_source_t ddsqra_source;   /* source given to the dsl_prop calls */
2953         uint64_t ddsqra_value;          /* requested property value */
2954 } dsl_dataset_set_qr_arg_t;
2955
2956
2957 /* ARGSUSED */
2958 static int
2959 dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
2960 {
2961         dsl_dataset_set_qr_arg_t *ddsqra = arg;
2962         dsl_pool_t *dp = dmu_tx_pool(tx);
2963         dsl_dataset_t *ds;
2964         int error;
2965         uint64_t newval;
2966
2967         if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
2968                 return (SET_ERROR(ENOTSUP));
2969
2970         error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
2971         if (error != 0)
2972                 return (error);
2973
2974         if (dsl_dataset_is_snapshot(ds)) {
2975                 dsl_dataset_rele(ds, FTAG);
2976                 return (SET_ERROR(EINVAL));
2977         }
2978
2979         error = dsl_prop_predict(ds->ds_dir,
2980             zfs_prop_to_name(ZFS_PROP_REFQUOTA),
2981             ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
2982         if (error != 0) {
2983                 dsl_dataset_rele(ds, FTAG);
2984                 return (error);
2985         }
2986
2987         if (newval == 0) {
2988                 dsl_dataset_rele(ds, FTAG);
2989                 return (0);
2990         }
2991
2992         if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
2993             newval < ds->ds_reserved) {
2994                 dsl_dataset_rele(ds, FTAG);
2995                 return (SET_ERROR(ENOSPC));
2996         }
2997
2998         dsl_dataset_rele(ds, FTAG);
2999         return (0);
3000 }
3001
3002 static void
3003 dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
3004 {
3005         dsl_dataset_set_qr_arg_t *ddsqra = arg;
3006         dsl_pool_t *dp = dmu_tx_pool(tx);
3007         dsl_dataset_t *ds;
3008         uint64_t newval;
3009
3010         VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
3011
3012         dsl_prop_set_sync_impl(ds,
3013             zfs_prop_to_name(ZFS_PROP_REFQUOTA),
3014             ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
3015             &ddsqra->ddsqra_value, tx);
3016
3017         VERIFY0(dsl_prop_get_int_ds(ds,
3018             zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
3019
3020         if (ds->ds_quota != newval) {
3021                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3022                 ds->ds_quota = newval;
3023         }
3024         dsl_dataset_rele(ds, FTAG);
3025 }
3026
3027 int
3028 dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
3029     uint64_t refquota)
3030 {
3031         dsl_dataset_set_qr_arg_t ddsqra;
3032
3033         ddsqra.ddsqra_name = dsname;
3034         ddsqra.ddsqra_source = source;
3035         ddsqra.ddsqra_value = refquota;
3036
3037         return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
3038             dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
3039 }
3040
3041 static int
3042 dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
3043 {
3044         dsl_dataset_set_qr_arg_t *ddsqra = arg;
3045         dsl_pool_t *dp = dmu_tx_pool(tx);
3046         dsl_dataset_t *ds;
3047         int error;
3048         uint64_t newval, unique;
3049
3050         if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
3051                 return (SET_ERROR(ENOTSUP));
3052
3053         error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
3054         if (error != 0)
3055                 return (error);
3056
3057         if (dsl_dataset_is_snapshot(ds)) {
3058                 dsl_dataset_rele(ds, FTAG);
3059                 return (SET_ERROR(EINVAL));
3060         }
3061
3062         error = dsl_prop_predict(ds->ds_dir,
3063             zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
3064             ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
3065         if (error != 0) {
3066                 dsl_dataset_rele(ds, FTAG);
3067                 return (error);
3068         }
3069
3070         /*
3071          * If we are doing the preliminary check in open context, the
3072          * space estimates may be inaccurate.
3073          */
3074         if (!dmu_tx_is_syncing(tx)) {
3075                 dsl_dataset_rele(ds, FTAG);
3076                 return (0);
3077         }
3078
3079         mutex_enter(&ds->ds_lock);
3080         if (!DS_UNIQUE_IS_ACCURATE(ds))
3081                 dsl_dataset_recalc_head_uniq(ds);
3082         unique = dsl_dataset_phys(ds)->ds_unique_bytes;
3083         mutex_exit(&ds->ds_lock);
3084
3085         if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
3086                 uint64_t delta = MAX(unique, newval) -
3087                     MAX(unique, ds->ds_reserved);
3088
3089                 if (delta >
3090                     dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
3091                     (ds->ds_quota > 0 && newval > ds->ds_quota)) {
3092                         dsl_dataset_rele(ds, FTAG);
3093                         return (SET_ERROR(ENOSPC));
3094                 }
3095         }
3096
3097         dsl_dataset_rele(ds, FTAG);
3098         return (0);
3099 }
3100
3101 void
3102 dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
3103     zprop_source_t source, uint64_t value, dmu_tx_t *tx)
3104 {
3105         uint64_t newval;
3106         uint64_t unique;
3107         int64_t delta;
3108
3109         dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
3110             source, sizeof (value), 1, &value, tx);
3111
3112         VERIFY0(dsl_prop_get_int_ds(ds,
3113             zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
3114
3115         dmu_buf_will_dirty(ds->ds_dbuf, tx);
3116         mutex_enter(&ds->ds_dir->dd_lock);
3117         mutex_enter(&ds->ds_lock);
3118         ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
3119         unique = dsl_dataset_phys(ds)->ds_unique_bytes;
3120         delta = MAX(0, (int64_t)(newval - unique)) -
3121             MAX(0, (int64_t)(ds->ds_reserved - unique));
3122         ds->ds_reserved = newval;
3123         mutex_exit(&ds->ds_lock);
3124
3125         dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
3126         mutex_exit(&ds->ds_dir->dd_lock);
3127 }
3128
3129 static void
3130 dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
3131 {
3132         dsl_dataset_set_qr_arg_t *ddsqra = arg;
3133         dsl_pool_t *dp = dmu_tx_pool(tx);
3134         dsl_dataset_t *ds;
3135
3136         VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
3137         dsl_dataset_set_refreservation_sync_impl(ds,
3138             ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
3139         dsl_dataset_rele(ds, FTAG);
3140 }
3141
3142 int
3143 dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
3144     uint64_t refreservation)
3145 {
3146         dsl_dataset_set_qr_arg_t ddsqra;
3147
3148         ddsqra.ddsqra_name = dsname;
3149         ddsqra.ddsqra_source = source;
3150         ddsqra.ddsqra_value = refreservation;
3151
3152         return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
3153             dsl_dataset_set_refreservation_sync, &ddsqra,
3154             0, ZFS_SPACE_CHECK_NONE));
3155 }
3156
3157 /*
3158  * Return (in *usedp) the amount of space written in new that is not
3159  * present in oldsnap.  New may be a snapshot or the head.  Old must be
3160  * a snapshot before new, in new's filesystem (or its origin).  If not then
3161  * fail and return EINVAL.
3162  *
3163  * The written space is calculated by considering two components:  First, we
3164  * ignore any freed space, and calculate the written as new's used space
3165  * minus old's used space.  Next, we add in the amount of space that was freed
3166  * between the two snapshots, thus reducing new's used space relative to old's.
3167  * Specifically, this is the space that was born before old->ds_creation_txg,
3168  * and freed before new (ie. on new's deadlist or a previous deadlist).
3169  *
3170  * space freed                         [---------------------]
3171  * snapshots                       ---O-------O--------O-------O------
3172  *                                         oldsnap            new
3173  */
3174 int
3175 dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
3176     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
3177 {
3178         int err = 0;
3179         uint64_t snapobj;
3180         dsl_pool_t *dp = new->ds_dir->dd_pool;
3181
3182         ASSERT(dsl_pool_config_held(dp));
3183
3184         *usedp = 0;
3185         *usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
3186         *usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;
3187
3188         *compp = 0;
3189         *compp += dsl_dataset_phys(new)->ds_compressed_bytes;
3190         *compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;
3191
3192         *uncompp = 0;
3193         *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
3194         *uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;
3195
3196         snapobj = new->ds_object;
3197         while (snapobj != oldsnap->ds_object) {
3198                 dsl_dataset_t *snap;
3199                 uint64_t used, comp, uncomp;
3200
3201                 if (snapobj == new->ds_object) {
3202                         snap = new;
3203                 } else {
3204                         err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
3205                         if (err != 0)
3206                                 break;
3207                 }
3208
3209                 if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
3210                     dsl_dataset_phys(oldsnap)->ds_creation_txg) {
3211                         /*
3212                          * The blocks in the deadlist can not be born after
3213                          * ds_prev_snap_txg, so get the whole deadlist space,
3214                          * which is more efficient (especially for old-format
3215                          * deadlists).  Unfortunately the deadlist code
3216                          * doesn't have enough information to make this
3217                          * optimization itself.
3218                          */
3219                         dsl_deadlist_space(&snap->ds_deadlist,
3220                             &used, &comp, &uncomp);
3221                 } else {
3222                         dsl_deadlist_space_range(&snap->ds_deadlist,
3223                             0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
3224                             &used, &comp, &uncomp);
3225                 }
3226                 *usedp += used;
3227                 *compp += comp;
3228                 *uncompp += uncomp;
3229
3230                 /*
3231                  * If we get to the beginning of the chain of snapshots
3232                  * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
3233                  * was not a snapshot of/before new.
3234                  */
3235                 snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
3236                 if (snap != new)
3237                         dsl_dataset_rele(snap, FTAG);
3238                 if (snapobj == 0) {
3239                         err = SET_ERROR(EINVAL);
3240                         break;
3241                 }
3242
3243         }
3244         return (err);
3245 }
3246
3247 /*
3248  * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
3249  * lastsnap, and all snapshots in between are deleted.
3250  *
3251  * blocks that would be freed            [---------------------------]
3252  * snapshots                       ---O-------O--------O-------O--------O
3253  *                                        firstsnap        lastsnap
3254  *
3255  * This is the set of blocks that were born after the snap before firstsnap,
3256  * (birth > firstsnap->prev_snap_txg) and died before the snap after the
3257  * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
3258  * We calculate this by iterating over the relevant deadlists (from the snap
3259  * after lastsnap, backward to the snap after firstsnap), summing up the
3260  * space on the deadlist that was born after the snap before firstsnap.
3261  */
3262 int
3263 dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
3264     dsl_dataset_t *lastsnap,
3265     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
3266 {
3267         int err = 0;
3268         uint64_t snapobj;
3269         dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
3270
3271         ASSERT(dsl_dataset_is_snapshot(firstsnap));
3272         ASSERT(dsl_dataset_is_snapshot(lastsnap));
3273
3274         /*
3275          * Check that the snapshots are in the same dsl_dir, and firstsnap
3276          * is before lastsnap.
3277          */
3278         if (firstsnap->ds_dir != lastsnap->ds_dir ||
3279             dsl_dataset_phys(firstsnap)->ds_creation_txg >
3280             dsl_dataset_phys(lastsnap)->ds_creation_txg)
3281                 return (SET_ERROR(EINVAL));
3282
3283         *usedp = *compp = *uncompp = 0;
3284
3285         snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
3286         while (snapobj != firstsnap->ds_object) {
3287                 dsl_dataset_t *ds;
3288                 uint64_t used, comp, uncomp;
3289
3290                 err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
3291                 if (err != 0)
3292                         break;
3293
3294                 dsl_deadlist_space_range(&ds->ds_deadlist,
3295                     dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
3296                     &used, &comp, &uncomp);
3297                 *usedp += used;
3298                 *compp += comp;
3299                 *uncompp += uncomp;
3300
3301                 snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
3302                 ASSERT3U(snapobj, !=, 0);
3303                 dsl_dataset_rele(ds, FTAG);
3304         }
3305         return (err);
3306 }
3307
3308 static int
3309 dsl_dataset_activate_large_blocks_check(void *arg, dmu_tx_t *tx)
3310 {
3311         const char *dsname = arg;
3312         dsl_dataset_t *ds;
3313         dsl_pool_t *dp = dmu_tx_pool(tx);
3314         int error = 0;
3315
3316         if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
3317                 return (SET_ERROR(ENOTSUP));
3318
3319         ASSERT(spa_feature_is_enabled(dp->dp_spa,
3320             SPA_FEATURE_EXTENSIBLE_DATASET));
3321
3322         error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
3323         if (error != 0)
3324                 return (error);
3325
3326         if (ds->ds_large_blocks)
3327                 error = EALREADY;
3328         dsl_dataset_rele(ds, FTAG);
3329
3330         return (error);
3331 }
3332
3333 void
3334 dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx)
3335 {
3336         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
3337         objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
3338         uint64_t zero = 0;
3339
3340         spa_feature_incr(spa, SPA_FEATURE_LARGE_BLOCKS, tx);
3341         dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
3342
3343         VERIFY0(zap_add(mos, dsobj, DS_FIELD_LARGE_BLOCKS,
3344             sizeof (zero), 1, &zero, tx));
3345 }
3346
3347 static void
3348 dsl_dataset_activate_large_blocks_sync(void *arg, dmu_tx_t *tx)
3349 {
3350         const char *dsname = arg;
3351         dsl_dataset_t *ds;
3352
3353         VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), dsname, FTAG, &ds));
3354
3355         dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
3356         ASSERT(!ds->ds_large_blocks);
3357         ds->ds_large_blocks = B_TRUE;
3358         dsl_dataset_rele(ds, FTAG);
3359 }
3360
3361 int
3362 dsl_dataset_activate_large_blocks(const char *dsname)
3363 {
3364         int error;
3365
3366         error = dsl_sync_task(dsname,
3367             dsl_dataset_activate_large_blocks_check,
3368             dsl_dataset_activate_large_blocks_sync, (void *)dsname,
3369             1, ZFS_SPACE_CHECK_RESERVED);
3370
3371         /*
3372          * EALREADY indicates that this dataset already supports large blocks.
3373          */
3374         if (error == EALREADY)
3375                 error = 0;
3376         return (error);
3377 }
3378
3379 /*
3380  * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
3381  * For example, they could both be snapshots of the same filesystem, and
3382  * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
3383  * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
3384  * filesystem.  Or 'earlier' could be the origin's origin.
3385  *
3386  * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
3387  */
3388 boolean_t
3389 dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
3390         uint64_t earlier_txg)
3391 {
3392         dsl_pool_t *dp = later->ds_dir->dd_pool;
3393         int error;
3394         boolean_t ret;
3395
3396         ASSERT(dsl_pool_config_held(dp));
3397         ASSERT(dsl_dataset_is_snapshot(earlier) || earlier_txg != 0);
3398
3399         if (earlier_txg == 0)
3400                 earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
3401
3402         if (dsl_dataset_is_snapshot(later) &&
3403             earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
3404                 return (B_FALSE);
3405
3406         if (later->ds_dir == earlier->ds_dir)
3407                 return (B_TRUE);
3408         if (!dsl_dir_is_clone(later->ds_dir))
3409                 return (B_FALSE);
3410
3411         if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object)
3412                 return (B_TRUE);
3413         dsl_dataset_t *origin;
3414         error = dsl_dataset_hold_obj(dp,
3415             dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
3416         if (error != 0)
3417                 return (B_FALSE);
3418         ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
3419         dsl_dataset_rele(origin, FTAG);
3420         return (ret);
3421 }
3422
3423
3424 void
3425 dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
3426 {
3427         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3428         dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
3429 }