]> CyberLeo.Net >> Repos - FreeBSD/releng/10.2.git/blob - sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
- Copy stable/10@285827 to releng/10.2 in preparation for 10.2-RC1
[FreeBSD/releng/10.2.git] / sys / cddl / contrib / opensolaris / uts / common / fs / zfs / dsl_dataset.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org>
24  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
25  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
26  * Copyright (c) 2014 RackTop Systems.
27  */
28
29 #include <sys/dmu_objset.h>
30 #include <sys/dsl_dataset.h>
31 #include <sys/dsl_dir.h>
32 #include <sys/dsl_prop.h>
33 #include <sys/dsl_synctask.h>
34 #include <sys/dmu_traverse.h>
35 #include <sys/dmu_impl.h>
36 #include <sys/dmu_tx.h>
37 #include <sys/arc.h>
38 #include <sys/zio.h>
39 #include <sys/zap.h>
40 #include <sys/zfeature.h>
41 #include <sys/unique.h>
42 #include <sys/zfs_context.h>
43 #include <sys/zfs_ioctl.h>
44 #include <sys/spa.h>
45 #include <sys/zfs_znode.h>
46 #include <sys/zfs_onexit.h>
47 #include <sys/zvol.h>
48 #include <sys/dsl_scan.h>
49 #include <sys/dsl_deadlist.h>
50 #include <sys/dsl_destroy.h>
51 #include <sys/dsl_userhold.h>
52 #include <sys/dsl_bookmark.h>
53
54 SYSCTL_DECL(_vfs_zfs);
55
/*
 * The SPA supports block sizes up to 16MB.  However, very large blocks
 * can have an impact on i/o latency (e.g. tying up a spinning disk for
 * ~300ms), and also potentially on the memory allocator.  Therefore,
 * we do not allow the recordsize to be set larger than zfs_max_recordsize
 * (default 1MB).  Larger blocks can be created by changing this tunable,
 * and pools with larger blocks can always be imported and used, regardless
 * of this setting.
 *
 * The value is expressed in bytes and is exported below as the
 * "max_recordsize" node of the _vfs_zfs sysctl tree.
 */
int zfs_max_recordsize = 1 * 1024 * 1024;
SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN,
    &zfs_max_recordsize, 0,
    "Maximum block size.  Expect dragons when tuning this.");
69
/*
 * Exchange the values of two uint64_t lvalues.
 *
 * Each argument is expanded twice, so neither may have side effects.
 * The body is wrapped in do { } while (0) so that "SWITCH64(a, b);" forms
 * exactly one statement and can be used as the body of an unbraced
 * if/else.  The temporary avoids the reserved "__" identifier prefix.
 */
#define SWITCH64(x, y) \
        do { \
                uint64_t switch64_tmp_ = (x); \
                (x) = (y); \
                (y) = switch64_tmp_; \
        } while (0)

/* NOTE(review): no user of DS_REF_MAX is visible in this part of the file. */
#define DS_REF_MAX      (1ULL << 62)
78
/*
 * extern declarations of two inline accessors used throughout this file.
 * NOTE(review): presumably the inline definitions live in
 * <sys/dsl_dataset.h> and these declarations make this translation unit
 * provide the external definitions -- confirm against the header.
 */
extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
extern inline boolean_t dsl_dataset_is_snapshot(dsl_dataset_t *ds);
81
82 /*
83  * Figure out how much of this delta should be propogated to the dsl_dir
84  * layer.  If there's a refreservation, that space has already been
85  * partially accounted for in our ancestors.
86  */
87 static int64_t
88 parent_delta(dsl_dataset_t *ds, int64_t delta)
89 {
90         dsl_dataset_phys_t *ds_phys;
91         uint64_t old_bytes, new_bytes;
92
93         if (ds->ds_reserved == 0)
94                 return (delta);
95
96         ds_phys = dsl_dataset_phys(ds);
97         old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
98         new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
99
100         ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
101         return (new_bytes - old_bytes);
102 }
103
104 void
105 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
106 {
107         int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
108         int compressed = BP_GET_PSIZE(bp);
109         int uncompressed = BP_GET_UCSIZE(bp);
110         int64_t delta;
111
112         dprintf_bp(bp, "ds=%p", ds);
113
114         ASSERT(dmu_tx_is_syncing(tx));
115         /* It could have been compressed away to nothing */
116         if (BP_IS_HOLE(bp))
117                 return;
118         ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
119         ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
120         if (ds == NULL) {
121                 dsl_pool_mos_diduse_space(tx->tx_pool,
122                     used, compressed, uncompressed);
123                 return;
124         }
125
126         dmu_buf_will_dirty(ds->ds_dbuf, tx);
127         mutex_enter(&ds->ds_lock);
128         delta = parent_delta(ds, used);
129         dsl_dataset_phys(ds)->ds_referenced_bytes += used;
130         dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
131         dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
132         dsl_dataset_phys(ds)->ds_unique_bytes += used;
133         if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
134                 ds->ds_need_large_blocks = B_TRUE;
135         mutex_exit(&ds->ds_lock);
136         dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
137             compressed, uncompressed, tx);
138         dsl_dir_transfer_space(ds->ds_dir, used - delta,
139             DD_USED_REFRSRV, DD_USED_HEAD, NULL);
140 }
141
/*
 * Account for dataset ds dropping its reference to block bp in the syncing
 * transaction tx, and either free the block or place it on a deadlist.
 *
 * Holes are ignored (returns 0).  When ds is NULL the block is freed and
 * the space is subtracted through dsl_pool_mos_diduse_space().  Otherwise:
 *  - if bp was born after ds's ds_prev_snap_txg it is freed immediately
 *    and ds's unique bytes and the dsl_dir usage are reduced;
 *  - else bp goes onto ds's deadlist (via ds_pending_deadlist when async
 *    is set), the previous snapshot's unique bytes may grow, and the space
 *    may be moved from DD_USED_HEAD to DD_USED_SNAP.
 * In both dataset cases the referenced, compressed and uncompressed
 * counters of ds are decreased under ds_lock.
 *
 * Returns the number of bytes ("used") that were accounted for.
 */
int
dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
    boolean_t async)
{
        int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
        int compressed = BP_GET_PSIZE(bp);
        int uncompressed = BP_GET_UCSIZE(bp);

        if (BP_IS_HOLE(bp))
                return (0);

        ASSERT(dmu_tx_is_syncing(tx));
        ASSERT(bp->blk_birth <= tx->tx_txg);

        if (ds == NULL) {
                /* No dataset: free the block and uncharge the MOS. */
                dsl_free(tx->tx_pool, tx->tx_txg, bp);
                dsl_pool_mos_diduse_space(tx->tx_pool,
                    -used, -compressed, -uncompressed);
                return (used);
        }
        ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);

        ASSERT(!dsl_dataset_is_snapshot(ds));
        dmu_buf_will_dirty(ds->ds_dbuf, tx);

        if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
                /* Born after ds_prev_snap_txg: free the block right away. */
                int64_t delta;

                dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
                dsl_free(tx->tx_pool, tx->tx_txg, bp);

                mutex_enter(&ds->ds_lock);
                ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
                    !DS_UNIQUE_IS_ACCURATE(ds));
                /* parent_delta() must see the old ds_unique_bytes. */
                delta = parent_delta(ds, -used);
                dsl_dataset_phys(ds)->ds_unique_bytes -= used;
                mutex_exit(&ds->ds_lock);
                dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
                    delta, -compressed, -uncompressed, tx);
                dsl_dir_transfer_space(ds->ds_dir, -used - delta,
                    DD_USED_REFRSRV, DD_USED_HEAD, NULL);
        } else {
                dprintf_bp(bp, "putting on dead list: %s", "");
                if (async) {
                        /*
                         * We are here as part of zio's write done callback,
                         * which means we're a zio interrupt thread.  We can't
                         * call dsl_deadlist_insert() now because it may block
                         * waiting for I/O.  Instead, put bp on the deferred
                         * queue and let dsl_pool_sync() finish the job.
                         */
                        bplist_append(&ds->ds_pending_deadlist, bp);
                } else {
                        dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
                }
                ASSERT3U(ds->ds_prev->ds_object, ==,
                    dsl_dataset_phys(ds)->ds_prev_snap_obj);
                ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
                /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
                if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
                    ds->ds_object && bp->blk_birth >
                    dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
                        dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
                        mutex_enter(&ds->ds_prev->ds_lock);
                        dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
                        mutex_exit(&ds->ds_prev->ds_lock);
                }
                /* Only blocks born after dd_origin_txg move HEAD -> SNAP. */
                if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
                        dsl_dir_transfer_space(ds->ds_dir, used,
                            DD_USED_HEAD, DD_USED_SNAP, tx);
                }
        }
        /* Common to both branches: ds no longer references the block. */
        mutex_enter(&ds->ds_lock);
        ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
        dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
        ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
        dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
        ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
        dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
        mutex_exit(&ds->ds_lock);

        return (used);
}
225
226 uint64_t
227 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
228 {
229         uint64_t trysnap = 0;
230
231         if (ds == NULL)
232                 return (0);
233         /*
234          * The snapshot creation could fail, but that would cause an
235          * incorrect FALSE return, which would only result in an
236          * overestimation of the amount of space that an operation would
237          * consume, which is OK.
238          *
239          * There's also a small window where we could miss a pending
240          * snapshot, because we could set the sync task in the quiescing
241          * phase.  So this should only be used as a guess.
242          */
243         if (ds->ds_trysnap_txg >
244             spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
245                 trysnap = ds->ds_trysnap_txg;
246         return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap));
247 }
248
249 boolean_t
250 dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
251     uint64_t blk_birth)
252 {
253         if (blk_birth <= dsl_dataset_prev_snap_txg(ds) ||
254             (bp != NULL && BP_IS_HOLE(bp)))
255                 return (B_FALSE);
256
257         ddt_prefetch(dsl_dataset_get_spa(ds), bp);
258
259         return (B_TRUE);
260 }
261
/*
 * Eviction callback registered with dmu_buf_set_user_ie() in
 * dsl_dataset_hold_obj().  Tears down the in-core dsl_dataset_t passed as
 * dsv: drops its fsid guid, objset, previous-snapshot hold, deadlists and
 * dsl_dir hold, destroys its locks and refcount, and frees the structure.
 * The db argument is unused.
 */
/* ARGSUSED */
static void
dsl_dataset_evict(dmu_buf_t *db, void *dsv)
{
        dsl_dataset_t *ds = dsv;

        ASSERT(ds->ds_owner == NULL);

        unique_remove(ds->ds_fsid_guid);

        if (ds->ds_objset != NULL)
                dmu_objset_evict(ds->ds_objset);

        /* The hold on ds_prev was taken with ds itself as the tag. */
        if (ds->ds_prev) {
                dsl_dataset_rele(ds->ds_prev, ds);
                ds->ds_prev = NULL;
        }

        bplist_destroy(&ds->ds_pending_deadlist);
        if (dsl_dataset_phys(ds)->ds_deadlist_obj != 0)
                dsl_deadlist_close(&ds->ds_deadlist);
        if (ds->ds_dir)
                dsl_dir_rele(ds->ds_dir, ds);

        ASSERT(!list_link_active(&ds->ds_synced_link));

        /* Release any lock still owned by this thread before destroying it. */
        if (mutex_owned(&ds->ds_lock))
                mutex_exit(&ds->ds_lock);
        mutex_destroy(&ds->ds_lock);
        if (mutex_owned(&ds->ds_opening_lock))
                mutex_exit(&ds->ds_opening_lock);
        mutex_destroy(&ds->ds_opening_lock);
        mutex_destroy(&ds->ds_sendstream_lock);
        refcount_destroy(&ds->ds_longholds);

        kmem_free(ds, sizeof (dsl_dataset_t));
}
299
300 int
301 dsl_dataset_get_snapname(dsl_dataset_t *ds)
302 {
303         dsl_dataset_phys_t *headphys;
304         int err;
305         dmu_buf_t *headdbuf;
306         dsl_pool_t *dp = ds->ds_dir->dd_pool;
307         objset_t *mos = dp->dp_meta_objset;
308
309         if (ds->ds_snapname[0])
310                 return (0);
311         if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
312                 return (0);
313
314         err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
315             FTAG, &headdbuf);
316         if (err != 0)
317                 return (err);
318         headphys = headdbuf->db_data;
319         err = zap_value_search(dp->dp_meta_objset,
320             headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
321         dmu_buf_rele(headdbuf, FTAG);
322         return (err);
323 }
324
325 int
326 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
327 {
328         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
329         uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
330         matchtype_t mt;
331         int err;
332
333         if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
334                 mt = MT_FIRST;
335         else
336                 mt = MT_EXACT;
337
338         err = zap_lookup_norm(mos, snapobj, name, 8, 1,
339             value, mt, NULL, 0, NULL);
340         if (err == ENOTSUP && mt == MT_FIRST)
341                 err = zap_lookup(mos, snapobj, name, 8, 1, value);
342         return (err);
343 }
344
345 int
346 dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
347     boolean_t adj_cnt)
348 {
349         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
350         uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
351         matchtype_t mt;
352         int err;
353
354         dsl_dir_snap_cmtime_update(ds->ds_dir);
355
356         if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
357                 mt = MT_FIRST;
358         else
359                 mt = MT_EXACT;
360
361         err = zap_remove_norm(mos, snapobj, name, mt, tx);
362         if (err == ENOTSUP && mt == MT_FIRST)
363                 err = zap_remove(mos, snapobj, name, tx);
364
365         if (err == 0 && adj_cnt)
366                 dsl_fs_ss_count_adjust(ds->ds_dir, -1,
367                     DD_FIELD_SNAPSHOT_COUNT, tx);
368
369         return (err);
370 }
371
/*
 * Hold the dataset whose object number in the MOS is dsobj and return its
 * in-core dsl_dataset_t in *dsp.  The pool configuration must be held.
 *
 * The hold is a hold (with "tag") on the object's bonus buffer.  If no
 * dsl_dataset_t is attached to that buffer yet, one is constructed here
 * and attached with dmu_buf_set_user_ie(); if a different one turns out to
 * be attached already, the freshly built one is discarded and the attached
 * one is returned instead.
 *
 * Returns 0 on success.  On failure the bonus hold is dropped and an error
 * is returned: EINVAL if the object's bonus type is not
 * DMU_OT_DSL_DATASET, otherwise the error from the failing step.
 */
int
dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
    dsl_dataset_t **dsp)
{
        objset_t *mos = dp->dp_meta_objset;
        dmu_buf_t *dbuf;
        dsl_dataset_t *ds;
        int err;
        dmu_object_info_t doi;

        ASSERT(dsl_pool_config_held(dp));

        err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
        if (err != 0)
                return (err);

        /* Make sure dsobj has the correct object type. */
        dmu_object_info_from_db(dbuf, &doi);
        if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
                dmu_buf_rele(dbuf, tag);
                return (SET_ERROR(EINVAL));
        }

        /* Reuse the in-core dataset if one is already attached to dbuf. */
        ds = dmu_buf_get_user(dbuf);
        if (ds == NULL) {
                dsl_dataset_t *winner = NULL;

                ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
                ds->ds_dbuf = dbuf;
                ds->ds_object = dsobj;

                mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
                mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
                mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
                refcount_create(&ds->ds_longholds);

                bplist_create(&ds->ds_pending_deadlist);
                dsl_deadlist_open(&ds->ds_deadlist,
                    mos, dsl_dataset_phys(ds)->ds_deadlist_obj);

                list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
                    offsetof(dmu_sendarg_t, dsa_link));

                /* A zapified dataset may carry the large-blocks marker. */
                if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
                        int zaperr = zap_contains(mos, dsobj,
                            DS_FIELD_LARGE_BLOCKS);
                        if (zaperr != ENOENT) {
                                VERIFY0(zaperr);
                                ds->ds_large_blocks = B_TRUE;
                        }
                }

                /* err is still 0 here; it was checked after the bonus hold. */
                if (err == 0) {
                        err = dsl_dir_hold_obj(dp,
                            dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds,
                            &ds->ds_dir);
                }
                if (err != 0) {
                        /* No dir hold was obtained: undo only local setup. */
                        mutex_destroy(&ds->ds_lock);
                        mutex_destroy(&ds->ds_opening_lock);
                        mutex_destroy(&ds->ds_sendstream_lock);
                        refcount_destroy(&ds->ds_longholds);
                        bplist_destroy(&ds->ds_pending_deadlist);
                        dsl_deadlist_close(&ds->ds_deadlist);
                        kmem_free(ds, sizeof (dsl_dataset_t));
                        dmu_buf_rele(dbuf, tag);
                        return (err);
                }

                if (!dsl_dataset_is_snapshot(ds)) {
                        ds->ds_snapname[0] = '\0';
                        /* Hold the previous snapshot, tagged with ds itself. */
                        if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
                                err = dsl_dataset_hold_obj(dp,
                                    dsl_dataset_phys(ds)->ds_prev_snap_obj,
                                    ds, &ds->ds_prev);
                        }
                        if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
                                int zaperr = zap_lookup(mos, ds->ds_object,
                                    DS_FIELD_BOOKMARK_NAMES,
                                    sizeof (ds->ds_bookmarks), 1,
                                    &ds->ds_bookmarks);
                                if (zaperr != ENOENT)
                                        VERIFY0(zaperr);
                        }
                } else {
                        /* Snapshot: optionally cache its name and userrefs. */
                        if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
                                err = dsl_dataset_get_snapname(ds);
                        if (err == 0 &&
                            dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
                                err = zap_count(
                                    ds->ds_dir->dd_pool->dp_meta_objset,
                                    dsl_dataset_phys(ds)->ds_userrefs_obj,
                                    &ds->ds_userrefs);
                        }
                }

                /* Only non-snapshots cache refreservation and refquota. */
                if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
                        err = dsl_prop_get_int_ds(ds,
                            zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
                            &ds->ds_reserved);
                        if (err == 0) {
                                err = dsl_prop_get_int_ds(ds,
                                    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
                                    &ds->ds_quota);
                        }
                } else {
                        ds->ds_reserved = ds->ds_quota = 0;
                }

                /*
                 * Attach ds to dbuf unless an earlier step failed.  If an
                 * error occurred, or a different dataset ("winner") is
                 * already attached, tear down the one built above.
                 */
                if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds,
                    dsl_dataset_evict)) != NULL) {
                        bplist_destroy(&ds->ds_pending_deadlist);
                        dsl_deadlist_close(&ds->ds_deadlist);
                        if (ds->ds_prev)
                                dsl_dataset_rele(ds->ds_prev, ds);
                        dsl_dir_rele(ds->ds_dir, ds);
                        mutex_destroy(&ds->ds_lock);
                        mutex_destroy(&ds->ds_opening_lock);
                        mutex_destroy(&ds->ds_sendstream_lock);
                        refcount_destroy(&ds->ds_longholds);
                        kmem_free(ds, sizeof (dsl_dataset_t));
                        if (err != 0) {
                                dmu_buf_rele(dbuf, tag);
                                return (err);
                        }
                        ds = winner;
                } else {
                        ds->ds_fsid_guid =
                            unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
                }
        }
        ASSERT3P(ds->ds_dbuf, ==, dbuf);
        ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
        ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
            spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
            dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
        *dsp = ds;
        return (0);
}
511
512 int
513 dsl_dataset_hold(dsl_pool_t *dp, const char *name,
514     void *tag, dsl_dataset_t **dsp)
515 {
516         dsl_dir_t *dd;
517         const char *snapname;
518         uint64_t obj;
519         int err = 0;
520
521         err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
522         if (err != 0)
523                 return (err);
524
525         ASSERT(dsl_pool_config_held(dp));
526         obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
527         if (obj != 0)
528                 err = dsl_dataset_hold_obj(dp, obj, tag, dsp);
529         else
530                 err = SET_ERROR(ENOENT);
531
532         /* we may be looking for a snapshot */
533         if (err == 0 && snapname != NULL) {
534                 dsl_dataset_t *ds;
535
536                 if (*snapname++ != '@') {
537                         dsl_dataset_rele(*dsp, tag);
538                         dsl_dir_rele(dd, FTAG);
539                         return (SET_ERROR(ENOENT));
540                 }
541
542                 dprintf("looking for snapshot '%s'\n", snapname);
543                 err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
544                 if (err == 0)
545                         err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
546                 dsl_dataset_rele(*dsp, tag);
547
548                 if (err == 0) {
549                         mutex_enter(&ds->ds_lock);
550                         if (ds->ds_snapname[0] == 0)
551                                 (void) strlcpy(ds->ds_snapname, snapname,
552                                     sizeof (ds->ds_snapname));
553                         mutex_exit(&ds->ds_lock);
554                         *dsp = ds;
555                 }
556         }
557
558         dsl_dir_rele(dd, FTAG);
559         return (err);
560 }
561
562 int
563 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
564     void *tag, dsl_dataset_t **dsp)
565 {
566         int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
567         if (err != 0)
568                 return (err);
569         if (!dsl_dataset_tryown(*dsp, tag)) {
570                 dsl_dataset_rele(*dsp, tag);
571                 *dsp = NULL;
572                 return (SET_ERROR(EBUSY));
573         }
574         return (0);
575 }
576
577 int
578 dsl_dataset_own(dsl_pool_t *dp, const char *name,
579     void *tag, dsl_dataset_t **dsp)
580 {
581         int err = dsl_dataset_hold(dp, name, tag, dsp);
582         if (err != 0)
583                 return (err);
584         if (!dsl_dataset_tryown(*dsp, tag)) {
585                 dsl_dataset_rele(*dsp, tag);
586                 return (SET_ERROR(EBUSY));
587         }
588         return (0);
589 }
590
/*
 * See the comment above dsl_pool_hold() for details.  In summary, a long
 * hold is used to prevent destruction of a dataset while the pool hold
 * is dropped, allowing other concurrent operations (e.g. spa_sync()).
 *
 * The dataset and pool must be held when this function is called.  After it
 * is called, the pool hold may be released while the dataset is still held
 * and accessed.
 *
 * The long hold is recorded as a reference on ds_longholds under "tag" and
 * is dropped with dsl_dataset_long_rele() using the same tag.
 */
void
dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
{
        ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
        (void) refcount_add(&ds->ds_longholds, tag);
}
606
/*
 * Drop a long hold on ds by removing the reference held under "tag" from
 * ds_longholds.
 */
void
dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
{
        (void) refcount_remove(&ds->ds_longholds, tag);
}
612
/*
 * Return B_TRUE if there are any long holds on this dataset, i.e. if the
 * ds_longholds reference count is non-zero.
 */
boolean_t
dsl_dataset_long_held(dsl_dataset_t *ds)
{
        return (!refcount_is_zero(&ds->ds_longholds));
}
619
620 void
621 dsl_dataset_name(dsl_dataset_t *ds, char *name)
622 {
623         if (ds == NULL) {
624                 (void) strcpy(name, "mos");
625         } else {
626                 dsl_dir_name(ds->ds_dir, name);
627                 VERIFY0(dsl_dataset_get_snapname(ds));
628                 if (ds->ds_snapname[0]) {
629                         (void) strcat(name, "@");
630                         /*
631                          * We use a "recursive" mutex so that we
632                          * can call dprintf_ds() with ds_lock held.
633                          */
634                         if (!MUTEX_HELD(&ds->ds_lock)) {
635                                 mutex_enter(&ds->ds_lock);
636                                 (void) strcat(name, ds->ds_snapname);
637                                 mutex_exit(&ds->ds_lock);
638                         } else {
639                                 (void) strcat(name, ds->ds_snapname);
640                         }
641                 }
642         }
643 }
644
/*
 * Release a hold on ds taken with "tag".  The hold is implemented as a
 * hold on the dataset's bonus buffer (ds_dbuf), which is released here.
 */
void
dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
{
        dmu_buf_rele(ds->ds_dbuf, tag);
}
650
/*
 * Give up ownership of ds.  "tag" must be the current owner.  Clears
 * ds_owner under ds_lock, then drops the long hold and the regular hold
 * that were taken with the same tag.
 */
void
dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
{
        ASSERT3P(ds->ds_owner, ==, tag);
        ASSERT(ds->ds_dbuf != NULL);

        mutex_enter(&ds->ds_lock);
        ds->ds_owner = NULL;
        mutex_exit(&ds->ds_lock);
        dsl_dataset_long_rele(ds, tag);
        dsl_dataset_rele(ds, tag);
}
663
664 boolean_t
665 dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
666 {
667         boolean_t gotit = FALSE;
668
669         mutex_enter(&ds->ds_lock);
670         if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
671                 ds->ds_owner = tag;
672                 dsl_dataset_long_hold(ds, tag);
673                 gotit = TRUE;
674         }
675         mutex_exit(&ds->ds_lock);
676         return (gotit);
677 }
678
/*
 * Allocate and initialize the on-disk head dataset for dsl_dir dd in the
 * syncing transaction tx, and record it in dd's dd_head_dataset_obj.
 *
 * When origin is NULL the pool's dp_origin_snap is used as the origin (it
 * may itself be NULL).  With an origin the new dataset starts out with the
 * origin's block pointer and space counters, and is linked into the
 * origin's clone bookkeeping; without one it gets a fresh deadlist.
 *
 * dd must not already have a head dataset.  Returns the object number of
 * the new dataset.
 */
uint64_t
dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
    uint64_t flags, dmu_tx_t *tx)
{
        dsl_pool_t *dp = dd->dd_pool;
        dmu_buf_t *dbuf;
        dsl_dataset_phys_t *dsphys;
        uint64_t dsobj;
        objset_t *mos = dp->dp_meta_objset;

        if (origin == NULL)
                origin = dp->dp_origin_snap;

        ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
        ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
        ASSERT(dmu_tx_is_syncing(tx));
        ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);

        /* Allocate the object and start from an all-zero phys structure. */
        dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
            DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
        VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
        dmu_buf_will_dirty(dbuf, tx);
        dsphys = dbuf->db_data;
        bzero(dsphys, sizeof (dsl_dataset_phys_t));
        dsphys->ds_dir_obj = dd->dd_object;
        dsphys->ds_flags = flags;
        dsphys->ds_fsid_guid = unique_create();
        /* Draw random guids until a non-zero one is obtained. */
        do {
                (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
                    sizeof (dsphys->ds_guid));
        } while (dsphys->ds_guid == 0);
        dsphys->ds_snapnames_zapobj =
            zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
            DMU_OT_NONE, 0, tx);
        dsphys->ds_creation_time = gethrestime_sec();
        /* A dataset created in TXG_INITIAL is recorded as created in txg 1. */
        dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;

        if (origin == NULL) {
                dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
        } else {
                dsl_dataset_t *ohds; /* head of the origin snapshot */

                dsphys->ds_prev_snap_obj = origin->ds_object;
                dsphys->ds_prev_snap_txg =
                    dsl_dataset_phys(origin)->ds_creation_txg;
                dsphys->ds_referenced_bytes =
                    dsl_dataset_phys(origin)->ds_referenced_bytes;
                dsphys->ds_compressed_bytes =
                    dsl_dataset_phys(origin)->ds_compressed_bytes;
                dsphys->ds_uncompressed_bytes =
                    dsl_dataset_phys(origin)->ds_uncompressed_bytes;
                dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;

                /*
                 * Inherit flags that describe the dataset's contents
                 * (INCONSISTENT) or properties (Case Insensitive).
                 */
                dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
                    (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);

                if (origin->ds_large_blocks)
                        dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);

                dmu_buf_will_dirty(origin->ds_dbuf, tx);
                dsl_dataset_phys(origin)->ds_num_children++;

                /* Clone the deadlist of the origin's head dataset. */
                VERIFY0(dsl_dataset_hold_obj(dp,
                    dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
                    FTAG, &ohds));
                dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
                    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
                dsl_dataset_rele(ohds, FTAG);

                /* Register in the origin's next-clones ZAP, creating it. */
                if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
                        if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
                                dsl_dataset_phys(origin)->ds_next_clones_obj =
                                    zap_create(mos,
                                    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
                        }
                        VERIFY0(zap_add_int(mos,
                            dsl_dataset_phys(origin)->ds_next_clones_obj,
                            dsobj, tx));
                }

                dmu_buf_will_dirty(dd->dd_dbuf, tx);
                dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
                /* Register in the origin dir's dd_clones ZAP, creating it. */
                if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
                        if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
                                dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
                                dsl_dir_phys(origin->ds_dir)->dd_clones =
                                    zap_create(mos,
                                    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
                        }
                        VERIFY0(zap_add_int(mos,
                            dsl_dir_phys(origin->ds_dir)->dd_clones,
                            dsobj, tx));
                }
        }

        if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
                dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

        dmu_buf_rele(dbuf, FTAG);

        /* Publish the new dataset as dd's head. */
        dmu_buf_will_dirty(dd->dd_dbuf, tx);
        dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;

        return (dsobj);
}
788
789 static void
790 dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
791 {
792         objset_t *os;
793
794         VERIFY0(dmu_objset_from_ds(ds, &os));
795         bzero(&os->os_zil_header, sizeof (os->os_zil_header));
796         dsl_dataset_dirty(ds, tx);
797 }
798
799 uint64_t
800 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
801     dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
802 {
803         dsl_pool_t *dp = pdd->dd_pool;
804         uint64_t dsobj, ddobj;
805         dsl_dir_t *dd;
806
807         ASSERT(dmu_tx_is_syncing(tx));
808         ASSERT(lastname[0] != '@');
809
810         ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
811         VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
812
813         dsobj = dsl_dataset_create_sync_dd(dd, origin,
814             flags & ~DS_CREATE_FLAG_NODIRTY, tx);
815
816         dsl_deleg_set_create_perms(dd, tx, cr);
817
818         /*
819          * Since we're creating a new node we know it's a leaf, so we can
820          * initialize the counts if the limit feature is active.
821          */
822         if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
823                 uint64_t cnt = 0;
824                 objset_t *os = dd->dd_pool->dp_meta_objset;
825
826                 dsl_dir_zapify(dd, tx);
827                 VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
828                     sizeof (cnt), 1, &cnt, tx));
829                 VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
830                     sizeof (cnt), 1, &cnt, tx));
831         }
832
833         dsl_dir_rele(dd, FTAG);
834
835         /*
836          * If we are creating a clone, make sure we zero out any stale
837          * data from the origin snapshots zil header.
838          */
839         if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
840                 dsl_dataset_t *ds;
841
842                 VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
843                 dsl_dataset_zero_zil(ds, tx);
844                 dsl_dataset_rele(ds, FTAG);
845         }
846
847         return (dsobj);
848 }
849
850 #ifdef __FreeBSD__
851 /* FreeBSD ioctl compat begin */
/*
 * Context handed to dsl_check_snap_cb() through its void *arg parameter.
 */
852 struct destroyarg {
853         nvlist_t *nvl;          /* list that receives "<name>@<snapname>" */
854         const char *snapname;   /* snapshot component appended after '@' */
855 };
856
857 static int
858 dsl_check_snap_cb(const char *name, void *arg)
859 {
860         struct destroyarg *da = arg;
861         dsl_dataset_t *ds;
862         char *dsname;
863
864         dsname = kmem_asprintf("%s@%s", name, da->snapname);
865         fnvlist_add_boolean(da->nvl, dsname);
866         kmem_free(dsname, strlen(dsname) + 1);
867
868         return (0);
869 }
870
871 int
872 dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
873     nvlist_t *snaps)
874 {
875         struct destroyarg *da;
876         int err;
877
878         da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP);
879         da->nvl = snaps;
880         da->snapname = snapname;
881         err = dmu_objset_find(fsname, dsl_check_snap_cb, da,
882             DS_FIND_CHILDREN);
883         kmem_free(da, sizeof (struct destroyarg));
884
885         return (err);
886 }
887 /* FreeBSD ioctl compat end */
888 #endif /* __FreeBSD__ */
889
890 /*
891  * The unique space in the head dataset can be calculated by subtracting
892  * the space used in the most recent snapshot, that is still being used
893  * in this file system, from the space currently in use.  To figure out
894  * the space in the most recent snapshot still in use, we need to take
895  * the total space used in the snapshot and subtract out the space that
896  * has been freed up since the snapshot was taken.
897  */
898 void
899 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
900 {
901         uint64_t mrs_used;
902         uint64_t dlused, dlcomp, dluncomp;
903
904         ASSERT(!dsl_dataset_is_snapshot(ds));
905
906         if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
907                 mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
908         else
909                 mrs_used = 0;
910
911         dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
912
913         ASSERT3U(dlused, <=, mrs_used);
914         dsl_dataset_phys(ds)->ds_unique_bytes =
915             dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
916
917         if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
918             SPA_VERSION_UNIQUE_ACCURATE)
919                 dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
920 }
921
922 void
923 dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
924     dmu_tx_t *tx)
925 {
926         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
927         uint64_t count;
928         int err;
929
930         ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
931         err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
932             obj, tx);
933         /*
934          * The err should not be ENOENT, but a bug in a previous version
935          * of the code could cause upgrade_clones_cb() to not set
936          * ds_next_snap_obj when it should, leading to a missing entry.
937          * If we knew that the pool was created after
938          * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
939          * ENOENT.  However, at least we can check that we don't have
940          * too many entries in the next_clones_obj even after failing to
941          * remove this one.
942          */
943         if (err != ENOENT)
944                 VERIFY0(err);
945         ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
946             &count));
947         ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
948 }
949
950
951 blkptr_t *
952 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
953 {
954         return (&dsl_dataset_phys(ds)->ds_bp);
955 }
956
957 void
958 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
959 {
960         ASSERT(dmu_tx_is_syncing(tx));
961         /* If it's the meta-objset, set dp_meta_rootbp */
962         if (ds == NULL) {
963                 tx->tx_pool->dp_meta_rootbp = *bp;
964         } else {
965                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
966                 dsl_dataset_phys(ds)->ds_bp = *bp;
967         }
968 }
969
970 spa_t *
971 dsl_dataset_get_spa(dsl_dataset_t *ds)
972 {
973         return (ds->ds_dir->dd_pool->dp_spa);
974 }
975
976 void
977 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
978 {
979         dsl_pool_t *dp;
980
981         if (ds == NULL) /* this is the meta-objset */
982                 return;
983
984         ASSERT(ds->ds_objset != NULL);
985
986         if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
987                 panic("dirtying snapshot!");
988
989         dp = ds->ds_dir->dd_pool;
990
991         if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
992                 /* up the hold count until we can be written out */
993                 dmu_buf_add_ref(ds->ds_dbuf, ds);
994         }
995 }
996
997 boolean_t
998 dsl_dataset_is_dirty(dsl_dataset_t *ds)
999 {
1000         for (int t = 0; t < TXG_SIZE; t++) {
1001                 if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
1002                     ds, t))
1003                         return (B_TRUE);
1004         }
1005         return (B_FALSE);
1006 }
1007
1008 static int
1009 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
1010 {
1011         uint64_t asize;
1012
1013         if (!dmu_tx_is_syncing(tx))
1014                 return (0);
1015
1016         /*
1017          * If there's an fs-only reservation, any blocks that might become
1018          * owned by the snapshot dataset must be accommodated by space
1019          * outside of the reservation.
1020          */
1021         ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
1022         asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
1023         if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
1024                 return (SET_ERROR(ENOSPC));
1025
1026         /*
1027          * Propagate any reserved space for this snapshot to other
1028          * snapshot checks in this sync group.
1029          */
1030         if (asize > 0)
1031                 dsl_dir_willuse_space(ds->ds_dir, asize, tx);
1032
1033         return (0);
1034 }
1035
/*
 * Argument bundle shared by dsl_dataset_snapshot_check() and
 * dsl_dataset_snapshot_sync(); filled in by dsl_dataset_snapshot().
 */
1036 typedef struct dsl_dataset_snapshot_arg {
1037         nvlist_t *ddsa_snaps;   /* pairs named "<dataset>@<snapshot>" */
1038         nvlist_t *ddsa_props;   /* if non-NULL, set on each new snapshot */
1039         nvlist_t *ddsa_errors;  /* if non-NULL, gets int32 error per name */
1040         cred_t *ddsa_cr;        /* passed to dsl_fs_ss_limit_check() */
1041 } dsl_dataset_snapshot_arg_t;
1042
1043 int
1044 dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
1045     dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
1046 {
1047         int error;
1048         uint64_t value;
1049
1050         ds->ds_trysnap_txg = tx->tx_txg;
1051
1052         if (!dmu_tx_is_syncing(tx))
1053                 return (0);
1054
1055         /*
1056          * We don't allow multiple snapshots of the same txg.  If there
1057          * is already one, try again.
1058          */
1059         if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
1060                 return (SET_ERROR(EAGAIN));
1061
1062         /*
1063          * Check for conflicting snapshot name.
1064          */
1065         error = dsl_dataset_snap_lookup(ds, snapname, &value);
1066         if (error == 0)
1067                 return (SET_ERROR(EEXIST));
1068         if (error != ENOENT)
1069                 return (error);
1070
1071         /*
1072          * We don't allow taking snapshots of inconsistent datasets, such as
1073          * those into which we are currently receiving.  However, if we are
1074          * creating this snapshot as part of a receive, this check will be
1075          * executed atomically with respect to the completion of the receive
1076          * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
1077          * case we ignore this, knowing it will be fixed up for us shortly in
1078          * dmu_recv_end_sync().
1079          */
1080         if (!recv && DS_IS_INCONSISTENT(ds))
1081                 return (SET_ERROR(EBUSY));
1082
1083         /*
1084          * Skip the check for temporary snapshots or if we have already checked
1085          * the counts in dsl_dataset_snapshot_check. This means we really only
1086          * check the count here when we're receiving a stream.
1087          */
1088         if (cnt != 0 && cr != NULL) {
1089                 error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
1090                     ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
1091                 if (error != 0)
1092                         return (error);
1093         }
1094
1095         error = dsl_dataset_snapshot_reserve_space(ds, tx);
1096         if (error != 0)
1097                 return (error);
1098
1099         return (0);
1100 }
1101
1102 static int
1103 dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
1104 {
1105         dsl_dataset_snapshot_arg_t *ddsa = arg;
1106         dsl_pool_t *dp = dmu_tx_pool(tx);
1107         nvpair_t *pair;
1108         int rv = 0;
1109
1110         /*
1111          * Pre-compute how many total new snapshots will be created for each
1112          * level in the tree and below. This is needed for validating the
1113          * snapshot limit when either taking a recursive snapshot or when
1114          * taking multiple snapshots.
1115          *
1116          * The problem is that the counts are not actually adjusted when
1117          * we are checking, only when we finally sync. For a single snapshot,
1118          * this is easy, the count will increase by 1 at each node up the tree,
1119          * but its more complicated for the recursive/multiple snapshot case.
1120          *
1121          * The dsl_fs_ss_limit_check function does recursively check the count
1122          * at each level up the tree but since it is validating each snapshot
1123          * independently we need to be sure that we are validating the complete
1124          * count for the entire set of snapshots. We do this by rolling up the
1125          * counts for each component of the name into an nvlist and then
1126          * checking each of those cases with the aggregated count.
1127          *
1128          * This approach properly handles not only the recursive snapshot
1129          * case (where we get all of those on the ddsa_snaps list) but also
1130          * the sibling case (e.g. snapshot a/b and a/c so that we will also
1131          * validate the limit on 'a' using a count of 2).
1132          *
1133          * We validate the snapshot names in the third loop and only report
1134          * name errors once.
1135          */
1136         if (dmu_tx_is_syncing(tx)) {
1137                 nvlist_t *cnt_track = NULL;
1138                 cnt_track = fnvlist_alloc();
1139
1140                 /* Rollup aggregated counts into the cnt_track list */
1141                 for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1142                     pair != NULL;
1143                     pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1144                         char *pdelim;
1145                         uint64_t val;
1146                         char nm[MAXPATHLEN];
1147
1148                         (void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
1149                         pdelim = strchr(nm, '@');
1150                         if (pdelim == NULL)
1151                                 continue;
1152                         *pdelim = '\0';
1153
1154                         do {
1155                                 if (nvlist_lookup_uint64(cnt_track, nm,
1156                                     &val) == 0) {
1157                                         /* update existing entry */
1158                                         fnvlist_add_uint64(cnt_track, nm,
1159                                             val + 1);
1160                                 } else {
1161                                         /* add to list */
1162                                         fnvlist_add_uint64(cnt_track, nm, 1);
1163                                 }
1164
1165                                 pdelim = strrchr(nm, '/');
1166                                 if (pdelim != NULL)
1167                                         *pdelim = '\0';
1168                         } while (pdelim != NULL);
1169                 }
1170
1171                 /* Check aggregated counts at each level */
1172                 for (pair = nvlist_next_nvpair(cnt_track, NULL);
1173                     pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
1174                         int error = 0;
1175                         char *name;
1176                         uint64_t cnt = 0;
1177                         dsl_dataset_t *ds;
1178
1179                         name = nvpair_name(pair);
1180                         cnt = fnvpair_value_uint64(pair);
1181                         ASSERT(cnt > 0);
1182
1183                         error = dsl_dataset_hold(dp, name, FTAG, &ds);
1184                         if (error == 0) {
1185                                 error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
1186                                     ZFS_PROP_SNAPSHOT_LIMIT, NULL,
1187                                     ddsa->ddsa_cr);
1188                                 dsl_dataset_rele(ds, FTAG);
1189                         }
1190
1191                         if (error != 0) {
1192                                 if (ddsa->ddsa_errors != NULL)
1193                                         fnvlist_add_int32(ddsa->ddsa_errors,
1194                                             name, error);
1195                                 rv = error;
1196                                 /* only report one error for this check */
1197                                 break;
1198                         }
1199                 }
1200                 nvlist_free(cnt_track);
1201         }
1202
1203         for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1204             pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1205                 int error = 0;
1206                 dsl_dataset_t *ds;
1207                 char *name, *atp;
1208                 char dsname[MAXNAMELEN];
1209
1210                 name = nvpair_name(pair);
1211                 if (strlen(name) >= MAXNAMELEN)
1212                         error = SET_ERROR(ENAMETOOLONG);
1213                 if (error == 0) {
1214                         atp = strchr(name, '@');
1215                         if (atp == NULL)
1216                                 error = SET_ERROR(EINVAL);
1217                         if (error == 0)
1218                                 (void) strlcpy(dsname, name, atp - name + 1);
1219                 }
1220                 if (error == 0)
1221                         error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
1222                 if (error == 0) {
1223                         /* passing 0/NULL skips dsl_fs_ss_limit_check */
1224                         error = dsl_dataset_snapshot_check_impl(ds,
1225                             atp + 1, tx, B_FALSE, 0, NULL);
1226                         dsl_dataset_rele(ds, FTAG);
1227                 }
1228
1229                 if (error != 0) {
1230                         if (ddsa->ddsa_errors != NULL) {
1231                                 fnvlist_add_int32(ddsa->ddsa_errors,
1232                                     name, error);
1233                         }
1234                         rv = error;
1235                 }
1236         }
1237
1238         return (rv);
1239 }
1240
1241 void
1242 dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
1243     dmu_tx_t *tx)
1244 {
1245         static zil_header_t zero_zil;
1246
1247         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1248         dmu_buf_t *dbuf;
1249         dsl_dataset_phys_t *dsphys;
1250         uint64_t dsobj, crtxg;
1251         objset_t *mos = dp->dp_meta_objset;
1252         objset_t *os;
1253
1254         ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
1255
1256         /*
1257          * If we are on an old pool, the zil must not be active, in which
1258          * case it will be zeroed.  Usually zil_suspend() accomplishes this.
1259          */
1260         ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
1261             dmu_objset_from_ds(ds, &os) != 0 ||
1262             bcmp(&os->os_phys->os_zil_header, &zero_zil,
1263             sizeof (zero_zil)) == 0);
1264
1265         dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
1266
1267         /*
1268          * The origin's ds_creation_txg has to be < TXG_INITIAL
1269          */
1270         if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
1271                 crtxg = 1;
1272         else
1273                 crtxg = tx->tx_txg;
1274
1275         dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
1276             DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
1277         VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
1278         dmu_buf_will_dirty(dbuf, tx);
1279         dsphys = dbuf->db_data;
1280         bzero(dsphys, sizeof (dsl_dataset_phys_t));
1281         dsphys->ds_dir_obj = ds->ds_dir->dd_object;
1282         dsphys->ds_fsid_guid = unique_create();
1283         do {
1284                 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
1285                     sizeof (dsphys->ds_guid));
1286         } while (dsphys->ds_guid == 0);
1287         dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
1288         dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
1289         dsphys->ds_next_snap_obj = ds->ds_object;
1290         dsphys->ds_num_children = 1;
1291         dsphys->ds_creation_time = gethrestime_sec();
1292         dsphys->ds_creation_txg = crtxg;
1293         dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
1294         dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
1295         dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
1296         dsphys->ds_uncompressed_bytes =
1297             dsl_dataset_phys(ds)->ds_uncompressed_bytes;
1298         dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
1299         dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
1300         dmu_buf_rele(dbuf, FTAG);
1301
1302         if (ds->ds_large_blocks)
1303                 dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
1304
1305         ASSERT3U(ds->ds_prev != 0, ==,
1306             dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
1307         if (ds->ds_prev) {
1308                 uint64_t next_clones_obj =
1309                     dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
1310                 ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
1311                     ds->ds_object ||
1312                     dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
1313                 if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
1314                     ds->ds_object) {
1315                         dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
1316                         ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
1317                             dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
1318                         dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
1319                 } else if (next_clones_obj != 0) {
1320                         dsl_dataset_remove_from_next_clones(ds->ds_prev,
1321                             dsphys->ds_next_snap_obj, tx);
1322                         VERIFY0(zap_add_int(mos,
1323                             next_clones_obj, dsobj, tx));
1324                 }
1325         }
1326
1327         /*
1328          * If we have a reference-reservation on this dataset, we will
1329          * need to increase the amount of refreservation being charged
1330          * since our unique space is going to zero.
1331          */
1332         if (ds->ds_reserved) {
1333                 int64_t delta;
1334                 ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
1335                 delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
1336                     ds->ds_reserved);
1337                 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
1338                     delta, 0, 0, tx);
1339         }
1340
1341         dmu_buf_will_dirty(ds->ds_dbuf, tx);
1342         dsl_dataset_phys(ds)->ds_deadlist_obj =
1343             dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
1344             dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
1345         dsl_deadlist_close(&ds->ds_deadlist);
1346         dsl_deadlist_open(&ds->ds_deadlist, mos,
1347             dsl_dataset_phys(ds)->ds_deadlist_obj);
1348         dsl_deadlist_add_key(&ds->ds_deadlist,
1349             dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
1350
1351         ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
1352         dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
1353         dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
1354         dsl_dataset_phys(ds)->ds_unique_bytes = 0;
1355         if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
1356                 dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1357
1358         VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
1359             snapname, 8, 1, &dsobj, tx));
1360
1361         if (ds->ds_prev)
1362                 dsl_dataset_rele(ds->ds_prev, ds);
1363         VERIFY0(dsl_dataset_hold_obj(dp,
1364             dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));
1365
1366         dsl_scan_ds_snapshotted(ds, tx);
1367
1368         dsl_dir_snap_cmtime_update(ds->ds_dir);
1369
1370         spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
1371 }
1372
1373 static void
1374 dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
1375 {
1376         dsl_dataset_snapshot_arg_t *ddsa = arg;
1377         dsl_pool_t *dp = dmu_tx_pool(tx);
1378         nvpair_t *pair;
1379
1380         for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1381             pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1382                 dsl_dataset_t *ds;
1383                 char *name, *atp;
1384                 char dsname[MAXNAMELEN];
1385
1386                 name = nvpair_name(pair);
1387                 atp = strchr(name, '@');
1388                 (void) strlcpy(dsname, name, atp - name + 1);
1389                 VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
1390
1391                 dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
1392                 if (ddsa->ddsa_props != NULL) {
1393                         dsl_props_set_sync_impl(ds->ds_prev,
1394                             ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
1395                 }
1396                 dsl_dataset_rele(ds, FTAG);
1397         }
1398 }
1399
1400 /*
1401  * The snapshots must all be in the same pool.
1402  * All-or-nothing: if there are any failures, nothing will be modified.
1403  */
1404 int
1405 dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
1406 {
1407         dsl_dataset_snapshot_arg_t ddsa;
1408         nvpair_t *pair;
1409         boolean_t needsuspend;
1410         int error;
1411         spa_t *spa;
1412         char *firstname;
1413         nvlist_t *suspended = NULL;
1414
1415         pair = nvlist_next_nvpair(snaps, NULL);
1416         if (pair == NULL)
1417                 return (0);
1418         firstname = nvpair_name(pair);
1419
1420         error = spa_open(firstname, &spa, FTAG);
1421         if (error != 0)
1422                 return (error);
1423         needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
1424         spa_close(spa, FTAG);
1425
1426         if (needsuspend) {
1427                 suspended = fnvlist_alloc();
1428                 for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
1429                     pair = nvlist_next_nvpair(snaps, pair)) {
1430                         char fsname[MAXNAMELEN];
1431                         char *snapname = nvpair_name(pair);
1432                         char *atp;
1433                         void *cookie;
1434
1435                         atp = strchr(snapname, '@');
1436                         if (atp == NULL) {
1437                                 error = SET_ERROR(EINVAL);
1438                                 break;
1439                         }
1440                         (void) strlcpy(fsname, snapname, atp - snapname + 1);
1441
1442                         error = zil_suspend(fsname, &cookie);
1443                         if (error != 0)
1444                                 break;
1445                         fnvlist_add_uint64(suspended, fsname,
1446                             (uintptr_t)cookie);
1447                 }
1448         }
1449
1450         ddsa.ddsa_snaps = snaps;
1451         ddsa.ddsa_props = props;
1452         ddsa.ddsa_errors = errors;
1453         ddsa.ddsa_cr = CRED();
1454
1455         if (error == 0) {
1456                 error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
1457                     dsl_dataset_snapshot_sync, &ddsa,
1458                     fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
1459         }
1460
1461         if (suspended != NULL) {
1462                 for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
1463                     pair = nvlist_next_nvpair(suspended, pair)) {
1464                         zil_resume((void *)(uintptr_t)
1465                             fnvpair_value_uint64(pair));
1466                 }
1467                 fnvlist_free(suspended);
1468         }
1469
1470 #ifdef __FreeBSD__
1471 #ifdef _KERNEL
1472         if (error == 0) {
1473                 for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
1474                     pair = nvlist_next_nvpair(snaps, pair)) {
1475                         char *snapname = nvpair_name(pair);
1476                         zvol_create_minors(snapname);
1477                 }
1478         }
1479 #endif
1480 #endif
1481         return (error);
1482 }
1483
/*
 * Argument bundle shared by dsl_dataset_snapshot_tmp_check() and
 * dsl_dataset_snapshot_tmp_sync(); filled in by dsl_dataset_snapshot_tmp().
 */
1484 typedef struct dsl_dataset_snapshot_tmp_arg {
1485         const char *ddsta_fsname;       /* dataset to hold and snapshot */
1486         const char *ddsta_snapname;     /* name of the snapshot to create */
1487         minor_t ddsta_cleanup_minor;    /* passed to the user-hold sync */
1488         const char *ddsta_htag;         /* user-hold tag for the snapshot */
1489 } dsl_dataset_snapshot_tmp_arg_t;
1490
1491 static int
1492 dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
1493 {
1494         dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
1495         dsl_pool_t *dp = dmu_tx_pool(tx);
1496         dsl_dataset_t *ds;
1497         int error;
1498
1499         error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
1500         if (error != 0)
1501                 return (error);
1502
1503         /* NULL cred means no limit check for tmp snapshot */
1504         error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
1505             tx, B_FALSE, 0, NULL);
1506         if (error != 0) {
1507                 dsl_dataset_rele(ds, FTAG);
1508                 return (error);
1509         }
1510
1511         if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
1512                 dsl_dataset_rele(ds, FTAG);
1513                 return (SET_ERROR(ENOTSUP));
1514         }
1515         error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
1516             B_TRUE, tx);
1517         if (error != 0) {
1518                 dsl_dataset_rele(ds, FTAG);
1519                 return (error);
1520         }
1521
1522         dsl_dataset_rele(ds, FTAG);
1523         return (0);
1524 }
1525
1526 static void
1527 dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
1528 {
1529         dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
1530         dsl_pool_t *dp = dmu_tx_pool(tx);
1531         dsl_dataset_t *ds;
1532
1533         VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
1534
1535         dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
1536         dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
1537             ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
1538         dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
1539
1540         dsl_dataset_rele(ds, FTAG);
1541 }
1542
1543 int
1544 dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
1545     minor_t cleanup_minor, const char *htag)
1546 {
1547         dsl_dataset_snapshot_tmp_arg_t ddsta;
1548         int error;
1549         spa_t *spa;
1550         boolean_t needsuspend;
1551         void *cookie;
1552
1553         ddsta.ddsta_fsname = fsname;
1554         ddsta.ddsta_snapname = snapname;
1555         ddsta.ddsta_cleanup_minor = cleanup_minor;
1556         ddsta.ddsta_htag = htag;
1557
1558         error = spa_open(fsname, &spa, FTAG);
1559         if (error != 0)
1560                 return (error);
1561         needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
1562         spa_close(spa, FTAG);
1563
1564         if (needsuspend) {
1565                 error = zil_suspend(fsname, &cookie);
1566                 if (error != 0)
1567                         return (error);
1568         }
1569
1570         error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
1571             dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
1572
1573         if (needsuspend)
1574                 zil_resume(cookie);
1575         return (error);
1576 }
1577
1578
1579 void
1580 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
1581 {
1582         ASSERT(dmu_tx_is_syncing(tx));
1583         ASSERT(ds->ds_objset != NULL);
1584         ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
1585
1586         /*
1587          * in case we had to change ds_fsid_guid when we opened it,
1588          * sync it out now.
1589          */
1590         dmu_buf_will_dirty(ds->ds_dbuf, tx);
1591         dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
1592
1593         dmu_objset_sync(ds->ds_objset, zio, tx);
1594
1595         if (ds->ds_need_large_blocks && !ds->ds_large_blocks) {
1596                 dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
1597                 ds->ds_large_blocks = B_TRUE;
1598         }
1599 }
1600
1601 static void
1602 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
1603 {
1604         uint64_t count = 0;
1605         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1606         zap_cursor_t zc;
1607         zap_attribute_t za;
1608         nvlist_t *propval = fnvlist_alloc();
1609         nvlist_t *val = fnvlist_alloc();
1610
1611         ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
1612
1613         /*
1614          * There may be missing entries in ds_next_clones_obj
1615          * due to a bug in a previous version of the code.
1616          * Only trust it if it has the right number of entries.
1617          */
1618         if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
1619                 VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
1620                     &count));
1621         }
1622         if (count != dsl_dataset_phys(ds)->ds_num_children - 1)
1623                 goto fail;
1624         for (zap_cursor_init(&zc, mos,
1625             dsl_dataset_phys(ds)->ds_next_clones_obj);
1626             zap_cursor_retrieve(&zc, &za) == 0;
1627             zap_cursor_advance(&zc)) {
1628                 dsl_dataset_t *clone;
1629                 char buf[ZFS_MAXNAMELEN];
1630                 VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
1631                     za.za_first_integer, FTAG, &clone));
1632                 dsl_dir_name(clone->ds_dir, buf);
1633                 fnvlist_add_boolean(val, buf);
1634                 dsl_dataset_rele(clone, FTAG);
1635         }
1636         zap_cursor_fini(&zc);
1637         fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
1638         fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval);
1639 fail:
1640         nvlist_free(val);
1641         nvlist_free(propval);
1642 }
1643
1644 void
1645 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
1646 {
1647         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1648         uint64_t refd, avail, uobjs, aobjs, ratio;
1649
1650         ASSERT(dsl_pool_config_held(dp));
1651
1652         ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
1653             (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
1654             dsl_dataset_phys(ds)->ds_compressed_bytes);
1655
1656         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
1657         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
1658             dsl_dataset_phys(ds)->ds_uncompressed_bytes);
1659
1660         if (dsl_dataset_is_snapshot(ds)) {
1661                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
1662                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
1663                     dsl_dataset_phys(ds)->ds_unique_bytes);
1664                 get_clones_stat(ds, nv);
1665         } else {
1666                 if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
1667                         char buf[MAXNAMELEN];
1668                         dsl_dataset_name(ds->ds_prev, buf);
1669                         dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, buf);
1670                 }
1671
1672                 dsl_dir_stats(ds->ds_dir, nv);
1673         }
1674
1675         dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
1676         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
1677         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
1678
1679         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
1680             dsl_dataset_phys(ds)->ds_creation_time);
1681         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
1682             dsl_dataset_phys(ds)->ds_creation_txg);
1683         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
1684             ds->ds_quota);
1685         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
1686             ds->ds_reserved);
1687         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
1688             dsl_dataset_phys(ds)->ds_guid);
1689         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
1690             dsl_dataset_phys(ds)->ds_unique_bytes);
1691         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
1692             ds->ds_object);
1693         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
1694             ds->ds_userrefs);
1695         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
1696             DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
1697
1698         if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
1699                 uint64_t written, comp, uncomp;
1700                 dsl_pool_t *dp = ds->ds_dir->dd_pool;
1701                 dsl_dataset_t *prev;
1702
1703                 int err = dsl_dataset_hold_obj(dp,
1704                     dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
1705                 if (err == 0) {
1706                         err = dsl_dataset_space_written(prev, ds, &written,
1707                             &comp, &uncomp);
1708                         dsl_dataset_rele(prev, FTAG);
1709                         if (err == 0) {
1710                                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
1711                                     written);
1712                         }
1713                 }
1714         }
1715 }
1716
1717 void
1718 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
1719 {
1720         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1721         ASSERT(dsl_pool_config_held(dp));
1722
1723         stat->dds_creation_txg = dsl_dataset_phys(ds)->ds_creation_txg;
1724         stat->dds_inconsistent =
1725             dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT;
1726         stat->dds_guid = dsl_dataset_phys(ds)->ds_guid;
1727         stat->dds_origin[0] = '\0';
1728         if (dsl_dataset_is_snapshot(ds)) {
1729                 stat->dds_is_snapshot = B_TRUE;
1730                 stat->dds_num_clones =
1731                     dsl_dataset_phys(ds)->ds_num_children - 1;
1732         } else {
1733                 stat->dds_is_snapshot = B_FALSE;
1734                 stat->dds_num_clones = 0;
1735
1736                 if (dsl_dir_is_clone(ds->ds_dir)) {
1737                         dsl_dataset_t *ods;
1738
1739                         VERIFY0(dsl_dataset_hold_obj(dp,
1740                             dsl_dir_phys(ds->ds_dir)->dd_origin_obj,
1741                             FTAG, &ods));
1742                         dsl_dataset_name(ods, stat->dds_origin);
1743                         dsl_dataset_rele(ods, FTAG);
1744                 }
1745         }
1746 }
1747
/*
 * Return the in-core fsid guid (ds_fsid_guid) of the dataset.
 */
uint64_t
dsl_dataset_fsid_guid(dsl_dataset_t *ds)
{
	return (ds->ds_fsid_guid);
}
1753
1754 void
1755 dsl_dataset_space(dsl_dataset_t *ds,
1756     uint64_t *refdbytesp, uint64_t *availbytesp,
1757     uint64_t *usedobjsp, uint64_t *availobjsp)
1758 {
1759         *refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
1760         *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
1761         if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
1762                 *availbytesp +=
1763                     ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
1764         if (ds->ds_quota != 0) {
1765                 /*
1766                  * Adjust available bytes according to refquota
1767                  */
1768                 if (*refdbytesp < ds->ds_quota)
1769                         *availbytesp = MIN(*availbytesp,
1770                             ds->ds_quota - *refdbytesp);
1771                 else
1772                         *availbytesp = 0;
1773         }
1774         *usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
1775         *availobjsp = DN_MAX_OBJECT - *usedobjsp;
1776 }
1777
1778 boolean_t
1779 dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
1780 {
1781         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1782
1783         ASSERT(dsl_pool_config_held(dp));
1784         if (snap == NULL)
1785                 return (B_FALSE);
1786         if (dsl_dataset_phys(ds)->ds_bp.blk_birth >
1787             dsl_dataset_phys(snap)->ds_creation_txg) {
1788                 objset_t *os, *os_snap;
1789                 /*
1790                  * It may be that only the ZIL differs, because it was
1791                  * reset in the head.  Don't count that as being
1792                  * modified.
1793                  */
1794                 if (dmu_objset_from_ds(ds, &os) != 0)
1795                         return (B_TRUE);
1796                 if (dmu_objset_from_ds(snap, &os_snap) != 0)
1797                         return (B_TRUE);
1798                 return (bcmp(&os->os_phys->os_meta_dnode,
1799                     &os_snap->os_phys->os_meta_dnode,
1800                     sizeof (os->os_phys->os_meta_dnode)) != 0);
1801         }
1802         return (B_FALSE);
1803 }
1804
/*
 * Argument block shared by the check and sync callbacks of the
 * snapshot-rename sync task.
 */
typedef struct dsl_dataset_rename_snapshot_arg {
	/* name of the dataset held by the check/sync callbacks */
	const char *ddrsa_fsname;
	/* snapshot name looked up and removed */
	const char *ddrsa_oldsnapname;
	/* snapshot name that must not exist yet and is added */
	const char *ddrsa_newsnapname;
	/* when set, the callbacks also visit children (DS_FIND_CHILDREN) */
	boolean_t ddrsa_recursive;
	/* filled in by the sync callback for use by the per-dataset impl */
	dmu_tx_t *ddrsa_tx;
} dsl_dataset_rename_snapshot_arg_t;
1812
1813 /* ARGSUSED */
1814 static int
1815 dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
1816     dsl_dataset_t *hds, void *arg)
1817 {
1818         dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
1819         int error;
1820         uint64_t val;
1821
1822         error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
1823         if (error != 0) {
1824                 /* ignore nonexistent snapshots */
1825                 return (error == ENOENT ? 0 : error);
1826         }
1827
1828         /* new name should not exist */
1829         error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
1830         if (error == 0)
1831                 error = SET_ERROR(EEXIST);
1832         else if (error == ENOENT)
1833                 error = 0;
1834
1835         /* dataset name + 1 for the "@" + the new snapshot name must fit */
1836         if (dsl_dir_namelen(hds->ds_dir) + 1 +
1837             strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN)
1838                 error = SET_ERROR(ENAMETOOLONG);
1839
1840         return (error);
1841 }
1842
1843 static int
1844 dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
1845 {
1846         dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
1847         dsl_pool_t *dp = dmu_tx_pool(tx);
1848         dsl_dataset_t *hds;
1849         int error;
1850
1851         error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
1852         if (error != 0)
1853                 return (error);
1854
1855         if (ddrsa->ddrsa_recursive) {
1856                 error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
1857                     dsl_dataset_rename_snapshot_check_impl, ddrsa,
1858                     DS_FIND_CHILDREN);
1859         } else {
1860                 error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
1861         }
1862         dsl_dataset_rele(hds, FTAG);
1863         return (error);
1864 }
1865
1866 static int
1867 dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
1868     dsl_dataset_t *hds, void *arg)
1869 {
1870 #ifdef __FreeBSD__
1871 #ifdef _KERNEL
1872         char *oldname, *newname;
1873 #endif
1874 #endif
1875         dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
1876         dsl_dataset_t *ds;
1877         uint64_t val;
1878         dmu_tx_t *tx = ddrsa->ddrsa_tx;
1879         int error;
1880
1881         error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
1882         ASSERT(error == 0 || error == ENOENT);
1883         if (error == ENOENT) {
1884                 /* ignore nonexistent snapshots */
1885                 return (0);
1886         }
1887
1888         VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
1889
1890         /* log before we change the name */
1891         spa_history_log_internal_ds(ds, "rename", tx,
1892             "-> @%s", ddrsa->ddrsa_newsnapname);
1893
1894         VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
1895             B_FALSE));
1896         mutex_enter(&ds->ds_lock);
1897         (void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
1898         mutex_exit(&ds->ds_lock);
1899         VERIFY0(zap_add(dp->dp_meta_objset,
1900             dsl_dataset_phys(hds)->ds_snapnames_zapobj,
1901             ds->ds_snapname, 8, 1, &ds->ds_object, tx));
1902
1903 #ifdef __FreeBSD__
1904 #ifdef _KERNEL
1905         oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1906         newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1907         snprintf(oldname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
1908             ddrsa->ddrsa_oldsnapname);
1909         snprintf(newname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
1910             ddrsa->ddrsa_newsnapname);
1911         zfsvfs_update_fromname(oldname, newname);
1912         zvol_rename_minors(oldname, newname);
1913         kmem_free(newname, MAXPATHLEN);
1914         kmem_free(oldname, MAXPATHLEN);
1915 #endif
1916 #endif
1917         dsl_dataset_rele(ds, FTAG);
1918
1919         return (0);
1920 }
1921
1922 static void
1923 dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
1924 {
1925         dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
1926         dsl_pool_t *dp = dmu_tx_pool(tx);
1927         dsl_dataset_t *hds;
1928
1929         VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
1930         ddrsa->ddrsa_tx = tx;
1931         if (ddrsa->ddrsa_recursive) {
1932                 VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
1933                     dsl_dataset_rename_snapshot_sync_impl, ddrsa,
1934                     DS_FIND_CHILDREN));
1935         } else {
1936                 VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
1937         }
1938         dsl_dataset_rele(hds, FTAG);
1939 }
1940
1941 int
1942 dsl_dataset_rename_snapshot(const char *fsname,
1943     const char *oldsnapname, const char *newsnapname, boolean_t recursive)
1944 {
1945         dsl_dataset_rename_snapshot_arg_t ddrsa;
1946
1947         ddrsa.ddrsa_fsname = fsname;
1948         ddrsa.ddrsa_oldsnapname = oldsnapname;
1949         ddrsa.ddrsa_newsnapname = newsnapname;
1950         ddrsa.ddrsa_recursive = recursive;
1951
1952         return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
1953             dsl_dataset_rename_snapshot_sync, &ddrsa,
1954             1, ZFS_SPACE_CHECK_RESERVED));
1955 }
1956
1957 /*
1958  * If we're doing an ownership handoff, we need to make sure that there is
1959  * only one long hold on the dataset.  We're not allowed to change anything here
1960  * so we don't permanently release the long hold or regular hold here.  We want
1961  * to do this only when syncing to avoid the dataset unexpectedly going away
1962  * when we release the long hold.
1963  */
1964 static int
1965 dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
1966 {
1967         boolean_t held;
1968
1969         if (!dmu_tx_is_syncing(tx))
1970                 return (0);
1971
1972         if (owner != NULL) {
1973                 VERIFY3P(ds->ds_owner, ==, owner);
1974                 dsl_dataset_long_rele(ds, owner);
1975         }
1976
1977         held = dsl_dataset_long_held(ds);
1978
1979         if (owner != NULL)
1980                 dsl_dataset_long_hold(ds, owner);
1981
1982         if (held)
1983                 return (SET_ERROR(EBUSY));
1984
1985         return (0);
1986 }
1987
/*
 * Argument block shared by the check and sync callbacks of the rollback
 * sync task.
 */
typedef struct dsl_dataset_rollback_arg {
	/* name of the dataset being rolled back */
	const char *ddra_fsname;
	/* owner handed to dsl_dataset_handoff_check() by the check */
	void *ddra_owner;
	/* nvlist that receives the "target" snapshot name in the sync */
	nvlist_t *ddra_result;
} dsl_dataset_rollback_arg_t;
1993
1994 static int
1995 dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
1996 {
1997         dsl_dataset_rollback_arg_t *ddra = arg;
1998         dsl_pool_t *dp = dmu_tx_pool(tx);
1999         dsl_dataset_t *ds;
2000         int64_t unused_refres_delta;
2001         int error;
2002
2003         error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
2004         if (error != 0)
2005                 return (error);
2006
2007         /* must not be a snapshot */
2008         if (dsl_dataset_is_snapshot(ds)) {
2009                 dsl_dataset_rele(ds, FTAG);
2010                 return (SET_ERROR(EINVAL));
2011         }
2012
2013         /* must have a most recent snapshot */
2014         if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
2015                 dsl_dataset_rele(ds, FTAG);
2016                 return (SET_ERROR(EINVAL));
2017         }
2018
2019         /* must not have any bookmarks after the most recent snapshot */
2020         nvlist_t *proprequest = fnvlist_alloc();
2021         fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
2022         nvlist_t *bookmarks = fnvlist_alloc();
2023         error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
2024         fnvlist_free(proprequest);
2025         if (error != 0)
2026                 return (error);
2027         for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL);
2028             pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
2029                 nvlist_t *valuenv =
2030                     fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
2031                     zfs_prop_to_name(ZFS_PROP_CREATETXG));
2032                 uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
2033                 if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
2034                         fnvlist_free(bookmarks);
2035                         dsl_dataset_rele(ds, FTAG);
2036                         return (SET_ERROR(EEXIST));
2037                 }
2038         }
2039         fnvlist_free(bookmarks);
2040
2041         error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
2042         if (error != 0) {
2043                 dsl_dataset_rele(ds, FTAG);
2044                 return (error);
2045         }
2046
2047         /*
2048          * Check if the snap we are rolling back to uses more than
2049          * the refquota.
2050          */
2051         if (ds->ds_quota != 0 &&
2052             dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
2053                 dsl_dataset_rele(ds, FTAG);
2054                 return (SET_ERROR(EDQUOT));
2055         }
2056
2057         /*
2058          * When we do the clone swap, we will temporarily use more space
2059          * due to the refreservation (the head will no longer have any
2060          * unique space, so the entire amount of the refreservation will need
2061          * to be free).  We will immediately destroy the clone, freeing
2062          * this space, but the freeing happens over many txg's.
2063          */
2064         unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
2065             dsl_dataset_phys(ds)->ds_unique_bytes);
2066
2067         if (unused_refres_delta > 0 &&
2068             unused_refres_delta >
2069             dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
2070                 dsl_dataset_rele(ds, FTAG);
2071                 return (SET_ERROR(ENOSPC));
2072         }
2073
2074         dsl_dataset_rele(ds, FTAG);
2075         return (0);
2076 }
2077
/*
 * Sync callback of the rollback sync task.
 *
 * Records the name of ds->ds_prev under "target" in the result nvlist,
 * creates a dataset named "%rollback" from ds->ds_prev, swaps it with
 * the head via dsl_dataset_clone_swap_sync_impl(), calls
 * dsl_dataset_zero_zil() on the head, and finally destroys the
 * "%rollback" dataset again.
 */
static void
dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_rollback_arg_t *ddra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds, *clone;
	uint64_t cloneobj;
	char namebuf[ZFS_MAXNAMELEN];

	VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));

	/* report which snapshot we are rolling back to */
	dsl_dataset_name(ds->ds_prev, namebuf);
	fnvlist_add_string(ddra->ddra_result, "target", namebuf);

	/* temporary dataset created from the most recent snapshot */
	cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
	    ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);

	VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));

	/* swap the temporary dataset with the head, then reset the ZIL */
	dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
	dsl_dataset_zero_zil(ds, tx);

	/* the temporary dataset is no longer needed */
	dsl_destroy_head_sync_impl(clone, tx);

	dsl_dataset_rele(clone, FTAG);
	dsl_dataset_rele(ds, FTAG);
}
2105
2106 /*
2107  * Rolls back the given filesystem or volume to the most recent snapshot.
2108  * The name of the most recent snapshot will be returned under key "target"
2109  * in the result nvlist.
2110  *
2111  * If owner != NULL:
2112  * - The existing dataset MUST be owned by the specified owner at entry
2113  * - Upon return, dataset will still be held by the same owner, whether we
2114  *   succeed or not.
2115  *
2116  * This mode is required any time the existing filesystem is mounted.  See
2117  * notes above zfs_suspend_fs() for further details.
2118  */
2119 int
2120 dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result)
2121 {
2122         dsl_dataset_rollback_arg_t ddra;
2123
2124         ddra.ddra_fsname = fsname;
2125         ddra.ddra_owner = owner;
2126         ddra.ddra_result = result;
2127
2128         return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
2129             dsl_dataset_rollback_sync, &ddra,
2130             1, ZFS_SPACE_CHECK_RESERVED));
2131 }
2132
/*
 * Element of the snapshot lists kept in dsl_dataset_promote_arg_t; each
 * node refers to one dataset.
 */
struct promotenode {
	list_node_t link;	/* presumably the list_t linkage */
	dsl_dataset_t *ds;	/* dataset this node stands for */
};
2137
2138 typedef struct dsl_dataset_promote_arg {
2139         const char *ddpa_clonename;
2140         dsl_dataset_t *ddpa_clone;
2141         list_t shared_snaps, origin_snaps, clone_snaps;
2142         dsl_dataset_t *origin_origin; /* origin of the origin */
2143         uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
2144         char *err_ds;
2145         cred_t *cr;
2146 } dsl_dataset_promote_arg_t;
2147
/*
 * Forward declarations of the static helpers used by the promote check
 * and sync callbacks below.
 */
static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
    void *tag);
static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
2152
2153 static int
2154 dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
2155 {
2156         dsl_dataset_promote_arg_t *ddpa = arg;
2157         dsl_pool_t *dp = dmu_tx_pool(tx);
2158         dsl_dataset_t *hds;
2159         struct promotenode *snap;
2160         dsl_dataset_t *origin_ds;
2161         int err;
2162         uint64_t unused;
2163         uint64_t ss_mv_cnt;
2164         size_t max_snap_len;
2165
2166         err = promote_hold(ddpa, dp, FTAG);
2167         if (err != 0)
2168                 return (err);
2169
2170         hds = ddpa->ddpa_clone;
2171         max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;
2172
2173         if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
2174                 promote_rele(ddpa, FTAG);
2175                 return (SET_ERROR(EXDEV));
2176         }
2177
2178         /*
2179          * Compute and check the amount of space to transfer.  Since this is
2180          * so expensive, don't do the preliminary check.
2181          */
2182         if (!dmu_tx_is_syncing(tx)) {
2183                 promote_rele(ddpa, FTAG);
2184                 return (0);
2185         }
2186
2187         snap = list_head(&ddpa->shared_snaps);
2188         origin_ds = snap->ds;
2189
2190         /* compute origin's new unique space */
2191         snap = list_tail(&ddpa->clone_snaps);
2192         ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
2193             origin_ds->ds_object);
2194         dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2195             dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
2196             &ddpa->unique, &unused, &unused);
2197
2198         /*
2199          * Walk the snapshots that we are moving
2200          *
2201          * Compute space to transfer.  Consider the incremental changes
2202          * to used by each snapshot:
2203          * (my used) = (prev's used) + (blocks born) - (blocks killed)
2204          * So each snapshot gave birth to:
2205          * (blocks born) = (my used) - (prev's used) + (blocks killed)
2206          * So a sequence would look like:
2207          * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
2208          * Which simplifies to:
2209          * uN + kN + kN-1 + ... + k1 + k0
2210          * Note however, if we stop before we reach the ORIGIN we get:
2211          * uN + kN + kN-1 + ... + kM - uM-1
2212          */
2213         ss_mv_cnt = 0;
2214         ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
2215         ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
2216         ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
2217         for (snap = list_head(&ddpa->shared_snaps); snap;
2218             snap = list_next(&ddpa->shared_snaps, snap)) {
2219                 uint64_t val, dlused, dlcomp, dluncomp;
2220                 dsl_dataset_t *ds = snap->ds;
2221
2222                 ss_mv_cnt++;
2223
2224                 /*
2225                  * If there are long holds, we won't be able to evict
2226                  * the objset.
2227                  */
2228                 if (dsl_dataset_long_held(ds)) {
2229                         err = SET_ERROR(EBUSY);
2230                         goto out;
2231                 }
2232
2233                 /* Check that the snapshot name does not conflict */
2234                 VERIFY0(dsl_dataset_get_snapname(ds));
2235                 if (strlen(ds->ds_snapname) >= max_snap_len) {
2236                         err = SET_ERROR(ENAMETOOLONG);
2237                         goto out;
2238                 }
2239                 err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
2240                 if (err == 0) {
2241                         (void) strcpy(ddpa->err_ds, snap->ds->ds_snapname);
2242                         err = SET_ERROR(EEXIST);
2243                         goto out;
2244                 }
2245                 if (err != ENOENT)
2246                         goto out;
2247
2248                 /* The very first snapshot does not have a deadlist */
2249                 if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
2250                         continue;
2251
2252                 dsl_deadlist_space(&ds->ds_deadlist,
2253                     &dlused, &dlcomp, &dluncomp);
2254                 ddpa->used += dlused;
2255                 ddpa->comp += dlcomp;
2256                 ddpa->uncomp += dluncomp;
2257         }
2258
2259         /*
2260          * If we are a clone of a clone then we never reached ORIGIN,
2261          * so we need to subtract out the clone origin's used space.
2262          */
2263         if (ddpa->origin_origin) {
2264                 ddpa->used -=
2265                     dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
2266                 ddpa->comp -=
2267                     dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
2268                 ddpa->uncomp -=
2269                     dsl_dataset_phys(ddpa->origin_origin)->
2270                     ds_uncompressed_bytes;
2271         }
2272
2273         /* Check that there is enough space and limit headroom here */
2274         err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
2275             0, ss_mv_cnt, ddpa->used, ddpa->cr);
2276         if (err != 0)
2277                 goto out;
2278
2279         /*
2280          * Compute the amounts of space that will be used by snapshots
2281          * after the promotion (for both origin and clone).  For each,
2282          * it is the amount of space that will be on all of their
2283          * deadlists (that was not born before their new origin).
2284          */
2285         if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
2286                 uint64_t space;
2287
2288                 /*
2289                  * Note, typically this will not be a clone of a clone,
2290                  * so dd_origin_txg will be < TXG_INITIAL, so
2291                  * these snaplist_space() -> dsl_deadlist_space_range()
2292                  * calls will be fast because they do not have to
2293                  * iterate over all bps.
2294                  */
2295                 snap = list_head(&ddpa->origin_snaps);
2296                 err = snaplist_space(&ddpa->shared_snaps,
2297                     snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
2298                 if (err != 0)
2299                         goto out;
2300
2301                 err = snaplist_space(&ddpa->clone_snaps,
2302                     snap->ds->ds_dir->dd_origin_txg, &space);
2303                 if (err != 0)
2304                         goto out;
2305                 ddpa->cloneusedsnap += space;
2306         }
2307         if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
2308             DD_FLAG_USED_BREAKDOWN) {
2309                 err = snaplist_space(&ddpa->origin_snaps,
2310                     dsl_dataset_phys(origin_ds)->ds_creation_txg,
2311                     &ddpa->originusedsnap);
2312                 if (err != 0)
2313                         goto out;
2314         }
2315
2316 out:
2317         promote_rele(ddpa, FTAG);
2318         return (err);
2319 }
2320
2321 static void
2322 dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
2323 {
2324         dsl_dataset_promote_arg_t *ddpa = arg;
2325         dsl_pool_t *dp = dmu_tx_pool(tx);
2326         dsl_dataset_t *hds;
2327         struct promotenode *snap;
2328         dsl_dataset_t *origin_ds;
2329         dsl_dataset_t *origin_head;
2330         dsl_dir_t *dd;
2331         dsl_dir_t *odd = NULL;
2332         uint64_t oldnext_obj;
2333         int64_t delta;
2334 #if defined(__FreeBSD__) && defined(_KERNEL)
2335         char *oldname, *newname;
2336 #endif
2337
2338         VERIFY0(promote_hold(ddpa, dp, FTAG));
2339         hds = ddpa->ddpa_clone;
2340
2341         ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
2342
2343         snap = list_head(&ddpa->shared_snaps);
2344         origin_ds = snap->ds;
2345         dd = hds->ds_dir;
2346
2347         snap = list_head(&ddpa->origin_snaps);
2348         origin_head = snap->ds;
2349
2350         /*
2351          * We need to explicitly open odd, since origin_ds's dd will be
2352          * changing.
2353          */
2354         VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
2355             NULL, FTAG, &odd));
2356
2357         /* change origin's next snap */
2358         dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
2359         oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
2360         snap = list_tail(&ddpa->clone_snaps);
2361         ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
2362             origin_ds->ds_object);
2363         dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
2364
2365         /* change the origin's next clone */
2366         if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
2367                 dsl_dataset_remove_from_next_clones(origin_ds,
2368                     snap->ds->ds_object, tx);
2369                 VERIFY0(zap_add_int(dp->dp_meta_objset,
2370                     dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
2371                     oldnext_obj, tx));
2372         }
2373
2374         /* change origin */
2375         dmu_buf_will_dirty(dd->dd_dbuf, tx);
2376         ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
2377         dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
2378         dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
2379         dmu_buf_will_dirty(odd->dd_dbuf, tx);
2380         dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
2381         origin_head->ds_dir->dd_origin_txg =
2382             dsl_dataset_phys(origin_ds)->ds_creation_txg;
2383
2384         /* change dd_clone entries */
2385         if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2386                 VERIFY0(zap_remove_int(dp->dp_meta_objset,
2387                     dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
2388                 VERIFY0(zap_add_int(dp->dp_meta_objset,
2389                     dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
2390                     hds->ds_object, tx));
2391
2392                 VERIFY0(zap_remove_int(dp->dp_meta_objset,
2393                     dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
2394                     origin_head->ds_object, tx));
2395                 if (dsl_dir_phys(dd)->dd_clones == 0) {
2396                         dsl_dir_phys(dd)->dd_clones =
2397                             zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
2398                             DMU_OT_NONE, 0, tx);
2399                 }
2400                 VERIFY0(zap_add_int(dp->dp_meta_objset,
2401                     dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
2402         }
2403
2404 #if defined(__FreeBSD__) && defined(_KERNEL)
2405         /* Take the spa_namespace_lock early so zvol renames don't deadlock. */
2406         mutex_enter(&spa_namespace_lock);
2407
2408         oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2409         newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2410 #endif
2411
2412         /* move snapshots to this dir */
2413         for (snap = list_head(&ddpa->shared_snaps); snap;
2414             snap = list_next(&ddpa->shared_snaps, snap)) {
2415                 dsl_dataset_t *ds = snap->ds;
2416
2417                 /*
2418                  * Property callbacks are registered to a particular
2419                  * dsl_dir.  Since ours is changing, evict the objset
2420                  * so that they will be unregistered from the old dsl_dir.
2421                  */
2422                 if (ds->ds_objset) {
2423                         dmu_objset_evict(ds->ds_objset);
2424                         ds->ds_objset = NULL;
2425                 }
2426
2427                 /* move snap name entry */
2428                 VERIFY0(dsl_dataset_get_snapname(ds));
2429                 VERIFY0(dsl_dataset_snap_remove(origin_head,
2430                     ds->ds_snapname, tx, B_TRUE));
2431                 VERIFY0(zap_add(dp->dp_meta_objset,
2432                     dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
2433                     8, 1, &ds->ds_object, tx));
2434                 dsl_fs_ss_count_adjust(hds->ds_dir, 1,
2435                     DD_FIELD_SNAPSHOT_COUNT, tx);
2436
2437                 /* change containing dsl_dir */
2438                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2439                 ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
2440                 dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
2441                 ASSERT3P(ds->ds_dir, ==, odd);
2442                 dsl_dir_rele(ds->ds_dir, ds);
2443                 VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
2444                     NULL, ds, &ds->ds_dir));
2445
2446 #if defined(__FreeBSD__) && defined(_KERNEL)
2447                 dsl_dataset_name(ds, newname);
2448                 zfsvfs_update_fromname(oldname, newname);
2449                 zvol_rename_minors(oldname, newname);
2450 #endif
2451
2452                 /* move any clone references */
2453                 if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
2454                     spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2455                         zap_cursor_t zc;
2456                         zap_attribute_t za;
2457
2458                         for (zap_cursor_init(&zc, dp->dp_meta_objset,
2459                             dsl_dataset_phys(ds)->ds_next_clones_obj);
2460                             zap_cursor_retrieve(&zc, &za) == 0;
2461                             zap_cursor_advance(&zc)) {
2462                                 dsl_dataset_t *cnds;
2463                                 uint64_t o;
2464
2465                                 if (za.za_first_integer == oldnext_obj) {
2466                                         /*
2467                                          * We've already moved the
2468                                          * origin's reference.
2469                                          */
2470                                         continue;
2471                                 }
2472
2473                                 VERIFY0(dsl_dataset_hold_obj(dp,
2474                                     za.za_first_integer, FTAG, &cnds));
2475                                 o = dsl_dir_phys(cnds->ds_dir)->
2476                                     dd_head_dataset_obj;
2477
2478                                 VERIFY0(zap_remove_int(dp->dp_meta_objset,
2479                                     dsl_dir_phys(odd)->dd_clones, o, tx));
2480                                 VERIFY0(zap_add_int(dp->dp_meta_objset,
2481                                     dsl_dir_phys(dd)->dd_clones, o, tx));
2482                                 dsl_dataset_rele(cnds, FTAG);
2483                         }
2484                         zap_cursor_fini(&zc);
2485                 }
2486
2487                 ASSERT(!dsl_prop_hascb(ds));
2488         }
2489
2490 #if defined(__FreeBSD__) && defined(_KERNEL)
2491         mutex_exit(&spa_namespace_lock);
2492
2493         kmem_free(newname, MAXPATHLEN);
2494         kmem_free(oldname, MAXPATHLEN);
2495 #endif
2496         /*
2497          * Change space accounting.
2498          * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
2499          * both be valid, or both be 0 (resulting in delta == 0).  This
2500          * is true for each of {clone,origin} independently.
2501          */
2502
2503         delta = ddpa->cloneusedsnap -
2504             dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
2505         ASSERT3S(delta, >=, 0);
2506         ASSERT3U(ddpa->used, >=, delta);
2507         dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
2508         dsl_dir_diduse_space(dd, DD_USED_HEAD,
2509             ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
2510
2511         delta = ddpa->originusedsnap -
2512             dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
2513         ASSERT3S(delta, <=, 0);
2514         ASSERT3U(ddpa->used, >=, -delta);
2515         dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
2516         dsl_dir_diduse_space(odd, DD_USED_HEAD,
2517             -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
2518
2519         dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
2520
2521         /* log history record */
2522         spa_history_log_internal_ds(hds, "promote", tx, "");
2523
2524         dsl_dir_rele(odd, FTAG);
2525         promote_rele(ddpa, FTAG);
2526 }
2527
2528 /*
2529  * Make a list of dsl_dataset_t's for the snapshots between first_obj
2530  * (exclusive) and last_obj (inclusive).  The list will be in reverse
2531  * order (last_obj will be the list_head()).  If first_obj == 0, do all
2532  * snapshots back to this dataset's origin.
2533  */
2534 static int
2535 snaplist_make(dsl_pool_t *dp,
2536     uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
2537 {
2538         uint64_t obj = last_obj;
2539
2540         list_create(l, sizeof (struct promotenode),
2541             offsetof(struct promotenode, link));
2542
2543         while (obj != first_obj) {
2544                 dsl_dataset_t *ds;
2545                 struct promotenode *snap;
2546                 int err;
2547
2548                 err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
2549                 ASSERT(err != ENOENT);
2550                 if (err != 0)
2551                         return (err);
2552
2553                 if (first_obj == 0)
2554                         first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;
2555
2556                 snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
2557                 snap->ds = ds;
2558                 list_insert_tail(l, snap);
2559                 obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
2560         }
2561
2562         return (0);
2563 }
2564
2565 static int
2566 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
2567 {
2568         struct promotenode *snap;
2569
2570         *spacep = 0;
2571         for (snap = list_head(l); snap; snap = list_next(l, snap)) {
2572                 uint64_t used, comp, uncomp;
2573                 dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2574                     mintxg, UINT64_MAX, &used, &comp, &uncomp);
2575                 *spacep += used;
2576         }
2577         return (0);
2578 }
2579
2580 static void
2581 snaplist_destroy(list_t *l, void *tag)
2582 {
2583         struct promotenode *snap;
2584
2585         if (l == NULL || !list_link_active(&l->list_head))
2586                 return;
2587
2588         while ((snap = list_tail(l)) != NULL) {
2589                 list_remove(l, snap);
2590                 dsl_dataset_rele(snap->ds, tag);
2591                 kmem_free(snap, sizeof (*snap));
2592         }
2593         list_destroy(l);
2594 }
2595
2596 static int
2597 promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
2598 {
2599         int error;
2600         dsl_dir_t *dd;
2601         struct promotenode *snap;
2602
2603         error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
2604             &ddpa->ddpa_clone);
2605         if (error != 0)
2606                 return (error);
2607         dd = ddpa->ddpa_clone->ds_dir;
2608
2609         if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) ||
2610             !dsl_dir_is_clone(dd)) {
2611                 dsl_dataset_rele(ddpa->ddpa_clone, tag);
2612                 return (SET_ERROR(EINVAL));
2613         }
2614
2615         error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
2616             &ddpa->shared_snaps, tag);
2617         if (error != 0)
2618                 goto out;
2619
2620         error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
2621             &ddpa->clone_snaps, tag);
2622         if (error != 0)
2623                 goto out;
2624
2625         snap = list_head(&ddpa->shared_snaps);
2626         ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
2627         error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
2628             dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
2629             &ddpa->origin_snaps, tag);
2630         if (error != 0)
2631                 goto out;
2632
2633         if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
2634                 error = dsl_dataset_hold_obj(dp,
2635                     dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
2636                     tag, &ddpa->origin_origin);
2637                 if (error != 0)
2638                         goto out;
2639         }
2640 out:
2641         if (error != 0)
2642                 promote_rele(ddpa, tag);
2643         return (error);
2644 }
2645
2646 static void
2647 promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
2648 {
2649         snaplist_destroy(&ddpa->shared_snaps, tag);
2650         snaplist_destroy(&ddpa->clone_snaps, tag);
2651         snaplist_destroy(&ddpa->origin_snaps, tag);
2652         if (ddpa->origin_origin != NULL)
2653                 dsl_dataset_rele(ddpa->origin_origin, tag);
2654         dsl_dataset_rele(ddpa->ddpa_clone, tag);
2655 }
2656
2657 /*
2658  * Promote a clone.
2659  *
2660  * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
2661  * in with the name.  (It must be at least MAXNAMELEN bytes long.)
2662  */
2663 int
2664 dsl_dataset_promote(const char *name, char *conflsnap)
2665 {
2666         dsl_dataset_promote_arg_t ddpa = { 0 };
2667         uint64_t numsnaps;
2668         int error;
2669         objset_t *os;
2670
2671         /*
2672          * We will modify space proportional to the number of
2673          * snapshots.  Compute numsnaps.
2674          */
2675         error = dmu_objset_hold(name, FTAG, &os);
2676         if (error != 0)
2677                 return (error);
2678         error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
2679             dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
2680             &numsnaps);
2681         dmu_objset_rele(os, FTAG);
2682         if (error != 0)
2683                 return (error);
2684
2685         ddpa.ddpa_clonename = name;
2686         ddpa.err_ds = conflsnap;
2687         ddpa.cr = CRED();
2688
2689         return (dsl_sync_task(name, dsl_dataset_promote_check,
2690             dsl_dataset_promote_sync, &ddpa,
2691             2 + numsnaps, ZFS_SPACE_CHECK_RESERVED));
2692 }
2693
2694 int
2695 dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
2696     dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
2697 {
2698         int64_t unused_refres_delta;
2699
2700         /* they should both be heads */
2701         if (dsl_dataset_is_snapshot(clone) ||
2702             dsl_dataset_is_snapshot(origin_head))
2703                 return (SET_ERROR(EINVAL));
2704
2705         /* if we are not forcing, the branch point should be just before them */
2706         if (!force && clone->ds_prev != origin_head->ds_prev)
2707                 return (SET_ERROR(EINVAL));
2708
2709         /* clone should be the clone (unless they are unrelated) */
2710         if (clone->ds_prev != NULL &&
2711             clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
2712             origin_head->ds_dir != clone->ds_prev->ds_dir)
2713                 return (SET_ERROR(EINVAL));
2714
2715         /* the clone should be a child of the origin */
2716         if (clone->ds_dir->dd_parent != origin_head->ds_dir)
2717                 return (SET_ERROR(EINVAL));
2718
2719         /* origin_head shouldn't be modified unless 'force' */
2720         if (!force &&
2721             dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
2722                 return (SET_ERROR(ETXTBSY));
2723
2724         /* origin_head should have no long holds (e.g. is not mounted) */
2725         if (dsl_dataset_handoff_check(origin_head, owner, tx))
2726                 return (SET_ERROR(EBUSY));
2727
2728         /* check amount of any unconsumed refreservation */
2729         unused_refres_delta =
2730             (int64_t)MIN(origin_head->ds_reserved,
2731             dsl_dataset_phys(origin_head)->ds_unique_bytes) -
2732             (int64_t)MIN(origin_head->ds_reserved,
2733             dsl_dataset_phys(clone)->ds_unique_bytes);
2734
2735         if (unused_refres_delta > 0 &&
2736             unused_refres_delta >
2737             dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
2738                 return (SET_ERROR(ENOSPC));
2739
2740         /* clone can't be over the head's refquota */
2741         if (origin_head->ds_quota != 0 &&
2742             dsl_dataset_phys(clone)->ds_referenced_bytes >
2743             origin_head->ds_quota)
2744                 return (SET_ERROR(EDQUOT));
2745
2746         return (0);
2747 }
2748
2749 void
2750 dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
2751     dsl_dataset_t *origin_head, dmu_tx_t *tx)
2752 {
2753         dsl_pool_t *dp = dmu_tx_pool(tx);
2754         int64_t unused_refres_delta;
2755
2756         ASSERT(clone->ds_reserved == 0);
2757         ASSERT(origin_head->ds_quota == 0 ||
2758             dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota);
2759         ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
2760
2761         dmu_buf_will_dirty(clone->ds_dbuf, tx);
2762         dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
2763
2764         if (clone->ds_objset != NULL) {
2765                 dmu_objset_evict(clone->ds_objset);
2766                 clone->ds_objset = NULL;
2767         }
2768
2769         if (origin_head->ds_objset != NULL) {
2770                 dmu_objset_evict(origin_head->ds_objset);
2771                 origin_head->ds_objset = NULL;
2772         }
2773
2774         unused_refres_delta =
2775             (int64_t)MIN(origin_head->ds_reserved,
2776             dsl_dataset_phys(origin_head)->ds_unique_bytes) -
2777             (int64_t)MIN(origin_head->ds_reserved,
2778             dsl_dataset_phys(clone)->ds_unique_bytes);
2779
2780         /*
2781          * Reset origin's unique bytes, if it exists.
2782          */
2783         if (clone->ds_prev) {
2784                 dsl_dataset_t *origin = clone->ds_prev;
2785                 uint64_t comp, uncomp;
2786
2787                 dmu_buf_will_dirty(origin->ds_dbuf, tx);
2788                 dsl_deadlist_space_range(&clone->ds_deadlist,
2789                     dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
2790                     &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
2791         }
2792
2793         /* swap blkptrs */
2794         {
2795                 blkptr_t tmp;
2796                 tmp = dsl_dataset_phys(origin_head)->ds_bp;
2797                 dsl_dataset_phys(origin_head)->ds_bp =
2798                     dsl_dataset_phys(clone)->ds_bp;
2799                 dsl_dataset_phys(clone)->ds_bp = tmp;
2800         }
2801
2802         /* set dd_*_bytes */
2803         {
2804                 int64_t dused, dcomp, duncomp;
2805                 uint64_t cdl_used, cdl_comp, cdl_uncomp;
2806                 uint64_t odl_used, odl_comp, odl_uncomp;
2807
2808                 ASSERT3U(dsl_dir_phys(clone->ds_dir)->
2809                     dd_used_breakdown[DD_USED_SNAP], ==, 0);
2810
2811                 dsl_deadlist_space(&clone->ds_deadlist,
2812                     &cdl_used, &cdl_comp, &cdl_uncomp);
2813                 dsl_deadlist_space(&origin_head->ds_deadlist,
2814                     &odl_used, &odl_comp, &odl_uncomp);
2815
2816                 dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
2817                     cdl_used -
2818                     (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
2819                     odl_used);
2820                 dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
2821                     cdl_comp -
2822                     (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
2823                     odl_comp);
2824                 duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
2825                     cdl_uncomp -
2826                     (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
2827                     odl_uncomp);
2828
2829                 dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
2830                     dused, dcomp, duncomp, tx);
2831                 dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
2832                     -dused, -dcomp, -duncomp, tx);
2833
2834                 /*
2835                  * The difference in the space used by snapshots is the
2836                  * difference in snapshot space due to the head's
2837                  * deadlist (since that's the only thing that's
2838                  * changing that affects the snapused).
2839                  */
2840                 dsl_deadlist_space_range(&clone->ds_deadlist,
2841                     origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
2842                     &cdl_used, &cdl_comp, &cdl_uncomp);
2843                 dsl_deadlist_space_range(&origin_head->ds_deadlist,
2844                     origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
2845                     &odl_used, &odl_comp, &odl_uncomp);
2846                 dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
2847                     DD_USED_HEAD, DD_USED_SNAP, NULL);
2848         }
2849
2850         /* swap ds_*_bytes */
2851         SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
2852             dsl_dataset_phys(clone)->ds_referenced_bytes);
2853         SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
2854             dsl_dataset_phys(clone)->ds_compressed_bytes);
2855         SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
2856             dsl_dataset_phys(clone)->ds_uncompressed_bytes);
2857         SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
2858             dsl_dataset_phys(clone)->ds_unique_bytes);
2859
2860         /* apply any parent delta for change in unconsumed refreservation */
2861         dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
2862             unused_refres_delta, 0, 0, tx);
2863
2864         /*
2865          * Swap deadlists.
2866          */
2867         dsl_deadlist_close(&clone->ds_deadlist);
2868         dsl_deadlist_close(&origin_head->ds_deadlist);
2869         SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
2870             dsl_dataset_phys(clone)->ds_deadlist_obj);
2871         dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
2872             dsl_dataset_phys(clone)->ds_deadlist_obj);
2873         dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
2874             dsl_dataset_phys(origin_head)->ds_deadlist_obj);
2875
2876         dsl_scan_ds_clone_swapped(origin_head, clone, tx);
2877
2878         spa_history_log_internal_ds(clone, "clone swap", tx,
2879             "parent=%s", origin_head->ds_dir->dd_myname);
2880 }
2881
2882 /*
2883  * Given a pool name and a dataset object number in that pool,
2884  * return the name of that dataset.
2885  */
2886 int
2887 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
2888 {
2889         dsl_pool_t *dp;
2890         dsl_dataset_t *ds;
2891         int error;
2892
2893         error = dsl_pool_hold(pname, FTAG, &dp);
2894         if (error != 0)
2895                 return (error);
2896
2897         error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
2898         if (error == 0) {
2899                 dsl_dataset_name(ds, buf);
2900                 dsl_dataset_rele(ds, FTAG);
2901         }
2902         dsl_pool_rele(dp, FTAG);
2903
2904         return (error);
2905 }
2906
2907 int
2908 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
2909     uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
2910 {
2911         int error = 0;
2912
2913         ASSERT3S(asize, >, 0);
2914
2915         /*
2916          * *ref_rsrv is the portion of asize that will come from any
2917          * unconsumed refreservation space.
2918          */
2919         *ref_rsrv = 0;
2920
2921         mutex_enter(&ds->ds_lock);
2922         /*
2923          * Make a space adjustment for reserved bytes.
2924          */
2925         if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
2926                 ASSERT3U(*used, >=,
2927                     ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
2928                 *used -=
2929                     (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
2930                 *ref_rsrv =
2931                     asize - MIN(asize, parent_delta(ds, asize + inflight));
2932         }
2933
2934         if (!check_quota || ds->ds_quota == 0) {
2935                 mutex_exit(&ds->ds_lock);
2936                 return (0);
2937         }
2938         /*
2939          * If they are requesting more space, and our current estimate
2940          * is over quota, they get to try again unless the actual
2941          * on-disk is over quota and there are no pending changes (which
2942          * may free up space for us).
2943          */
2944         if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
2945             ds->ds_quota) {
2946                 if (inflight > 0 ||
2947                     dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
2948                         error = SET_ERROR(ERESTART);
2949                 else
2950                         error = SET_ERROR(EDQUOT);
2951         }
2952         mutex_exit(&ds->ds_lock);
2953
2954         return (error);
2955 }
2956
2957 typedef struct dsl_dataset_set_qr_arg {
2958         const char *ddsqra_name;
2959         zprop_source_t ddsqra_source;
2960         uint64_t ddsqra_value;
2961 } dsl_dataset_set_qr_arg_t;
2962
2963
2964 /* ARGSUSED */
2965 static int
2966 dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
2967 {
2968         dsl_dataset_set_qr_arg_t *ddsqra = arg;
2969         dsl_pool_t *dp = dmu_tx_pool(tx);
2970         dsl_dataset_t *ds;
2971         int error;
2972         uint64_t newval;
2973
2974         if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
2975                 return (SET_ERROR(ENOTSUP));
2976
2977         error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
2978         if (error != 0)
2979                 return (error);
2980
2981         if (dsl_dataset_is_snapshot(ds)) {
2982                 dsl_dataset_rele(ds, FTAG);
2983                 return (SET_ERROR(EINVAL));
2984         }
2985
2986         error = dsl_prop_predict(ds->ds_dir,
2987             zfs_prop_to_name(ZFS_PROP_REFQUOTA),
2988             ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
2989         if (error != 0) {
2990                 dsl_dataset_rele(ds, FTAG);
2991                 return (error);
2992         }
2993
2994         if (newval == 0) {
2995                 dsl_dataset_rele(ds, FTAG);
2996                 return (0);
2997         }
2998
2999         if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
3000             newval < ds->ds_reserved) {
3001                 dsl_dataset_rele(ds, FTAG);
3002                 return (SET_ERROR(ENOSPC));
3003         }
3004
3005         dsl_dataset_rele(ds, FTAG);
3006         return (0);
3007 }
3008
3009 static void
3010 dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
3011 {
3012         dsl_dataset_set_qr_arg_t *ddsqra = arg;
3013         dsl_pool_t *dp = dmu_tx_pool(tx);
3014         dsl_dataset_t *ds;
3015         uint64_t newval;
3016
3017         VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
3018
3019         dsl_prop_set_sync_impl(ds,
3020             zfs_prop_to_name(ZFS_PROP_REFQUOTA),
3021             ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
3022             &ddsqra->ddsqra_value, tx);
3023
3024         VERIFY0(dsl_prop_get_int_ds(ds,
3025             zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
3026
3027         if (ds->ds_quota != newval) {
3028                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3029                 ds->ds_quota = newval;
3030         }
3031         dsl_dataset_rele(ds, FTAG);
3032 }
3033
3034 int
3035 dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
3036     uint64_t refquota)
3037 {
3038         dsl_dataset_set_qr_arg_t ddsqra;
3039
3040         ddsqra.ddsqra_name = dsname;
3041         ddsqra.ddsqra_source = source;
3042         ddsqra.ddsqra_value = refquota;
3043
3044         return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
3045             dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
3046 }
3047
3048 static int
3049 dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
3050 {
3051         dsl_dataset_set_qr_arg_t *ddsqra = arg;
3052         dsl_pool_t *dp = dmu_tx_pool(tx);
3053         dsl_dataset_t *ds;
3054         int error;
3055         uint64_t newval, unique;
3056
3057         if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
3058                 return (SET_ERROR(ENOTSUP));
3059
3060         error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
3061         if (error != 0)
3062                 return (error);
3063
3064         if (dsl_dataset_is_snapshot(ds)) {
3065                 dsl_dataset_rele(ds, FTAG);
3066                 return (SET_ERROR(EINVAL));
3067         }
3068
3069         error = dsl_prop_predict(ds->ds_dir,
3070             zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
3071             ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
3072         if (error != 0) {
3073                 dsl_dataset_rele(ds, FTAG);
3074                 return (error);
3075         }
3076
3077         /*
3078          * If we are doing the preliminary check in open context, the
3079          * space estimates may be inaccurate.
3080          */
3081         if (!dmu_tx_is_syncing(tx)) {
3082                 dsl_dataset_rele(ds, FTAG);
3083                 return (0);
3084         }
3085
3086         mutex_enter(&ds->ds_lock);
3087         if (!DS_UNIQUE_IS_ACCURATE(ds))
3088                 dsl_dataset_recalc_head_uniq(ds);
3089         unique = dsl_dataset_phys(ds)->ds_unique_bytes;
3090         mutex_exit(&ds->ds_lock);
3091
3092         if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
3093                 uint64_t delta = MAX(unique, newval) -
3094                     MAX(unique, ds->ds_reserved);
3095
3096                 if (delta >
3097                     dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
3098                     (ds->ds_quota > 0 && newval > ds->ds_quota)) {
3099                         dsl_dataset_rele(ds, FTAG);
3100                         return (SET_ERROR(ENOSPC));
3101                 }
3102         }
3103
3104         dsl_dataset_rele(ds, FTAG);
3105         return (0);
3106 }
3107
3108 void
3109 dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
3110     zprop_source_t source, uint64_t value, dmu_tx_t *tx)
3111 {
3112         uint64_t newval;
3113         uint64_t unique;
3114         int64_t delta;
3115
3116         dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
3117             source, sizeof (value), 1, &value, tx);
3118
3119         VERIFY0(dsl_prop_get_int_ds(ds,
3120             zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
3121
3122         dmu_buf_will_dirty(ds->ds_dbuf, tx);
3123         mutex_enter(&ds->ds_dir->dd_lock);
3124         mutex_enter(&ds->ds_lock);
3125         ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
3126         unique = dsl_dataset_phys(ds)->ds_unique_bytes;
3127         delta = MAX(0, (int64_t)(newval - unique)) -
3128             MAX(0, (int64_t)(ds->ds_reserved - unique));
3129         ds->ds_reserved = newval;
3130         mutex_exit(&ds->ds_lock);
3131
3132         dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
3133         mutex_exit(&ds->ds_dir->dd_lock);
3134 }
3135
3136 static void
3137 dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
3138 {
3139         dsl_dataset_set_qr_arg_t *ddsqra = arg;
3140         dsl_pool_t *dp = dmu_tx_pool(tx);
3141         dsl_dataset_t *ds;
3142
3143         VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
3144         dsl_dataset_set_refreservation_sync_impl(ds,
3145             ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
3146         dsl_dataset_rele(ds, FTAG);
3147 }
3148
3149 int
3150 dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
3151     uint64_t refreservation)
3152 {
3153         dsl_dataset_set_qr_arg_t ddsqra;
3154
3155         ddsqra.ddsqra_name = dsname;
3156         ddsqra.ddsqra_source = source;
3157         ddsqra.ddsqra_value = refreservation;
3158
3159         return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
3160             dsl_dataset_set_refreservation_sync, &ddsqra,
3161             0, ZFS_SPACE_CHECK_NONE));
3162 }
3163
3164 /*
3165  * Return (in *usedp) the amount of space written in new that is not
3166  * present in oldsnap.  New may be a snapshot or the head.  Old must be
3167  * a snapshot before new, in new's filesystem (or its origin).  If not then
3168  * fail and return EINVAL.
3169  *
3170  * The written space is calculated by considering two components:  First, we
3171  * ignore any freed space, and calculate the written as new's used space
3172  * minus old's used space.  Next, we add in the amount of space that was freed
3173  * between the two snapshots, thus reducing new's used space relative to old's.
3174  * Specifically, this is the space that was born before old->ds_creation_txg,
3175  * and freed before new (ie. on new's deadlist or a previous deadlist).
3176  *
3177  * space freed                         [---------------------]
3178  * snapshots                       ---O-------O--------O-------O------
3179  *                                         oldsnap            new
3180  */
3181 int
3182 dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
3183     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
3184 {
3185         int err = 0;
3186         uint64_t snapobj;
3187         dsl_pool_t *dp = new->ds_dir->dd_pool;
3188
3189         ASSERT(dsl_pool_config_held(dp));
3190
3191         *usedp = 0;
3192         *usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
3193         *usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;
3194
3195         *compp = 0;
3196         *compp += dsl_dataset_phys(new)->ds_compressed_bytes;
3197         *compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;
3198
3199         *uncompp = 0;
3200         *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
3201         *uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;
3202
3203         snapobj = new->ds_object;
3204         while (snapobj != oldsnap->ds_object) {
3205                 dsl_dataset_t *snap;
3206                 uint64_t used, comp, uncomp;
3207
3208                 if (snapobj == new->ds_object) {
3209                         snap = new;
3210                 } else {
3211                         err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
3212                         if (err != 0)
3213                                 break;
3214                 }
3215
3216                 if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
3217                     dsl_dataset_phys(oldsnap)->ds_creation_txg) {
3218                         /*
3219                          * The blocks in the deadlist can not be born after
3220                          * ds_prev_snap_txg, so get the whole deadlist space,
3221                          * which is more efficient (especially for old-format
3222                          * deadlists).  Unfortunately the deadlist code
3223                          * doesn't have enough information to make this
3224                          * optimization itself.
3225                          */
3226                         dsl_deadlist_space(&snap->ds_deadlist,
3227                             &used, &comp, &uncomp);
3228                 } else {
3229                         dsl_deadlist_space_range(&snap->ds_deadlist,
3230                             0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
3231                             &used, &comp, &uncomp);
3232                 }
3233                 *usedp += used;
3234                 *compp += comp;
3235                 *uncompp += uncomp;
3236
3237                 /*
3238                  * If we get to the beginning of the chain of snapshots
3239                  * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
3240                  * was not a snapshot of/before new.
3241                  */
3242                 snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
3243                 if (snap != new)
3244                         dsl_dataset_rele(snap, FTAG);
3245                 if (snapobj == 0) {
3246                         err = SET_ERROR(EINVAL);
3247                         break;
3248                 }
3249
3250         }
3251         return (err);
3252 }
3253
3254 /*
3255  * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
3256  * lastsnap, and all snapshots in between are deleted.
3257  *
3258  * blocks that would be freed            [---------------------------]
3259  * snapshots                       ---O-------O--------O-------O--------O
3260  *                                        firstsnap        lastsnap
3261  *
3262  * This is the set of blocks that were born after the snap before firstsnap,
3263  * (birth > firstsnap->prev_snap_txg) and died before the snap after the
3264  * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
3265  * We calculate this by iterating over the relevant deadlists (from the snap
3266  * after lastsnap, backward to the snap after firstsnap), summing up the
3267  * space on the deadlist that was born after the snap before firstsnap.
3268  */
3269 int
3270 dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
3271     dsl_dataset_t *lastsnap,
3272     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
3273 {
3274         int err = 0;
3275         uint64_t snapobj;
3276         dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
3277
3278         ASSERT(dsl_dataset_is_snapshot(firstsnap));
3279         ASSERT(dsl_dataset_is_snapshot(lastsnap));
3280
3281         /*
3282          * Check that the snapshots are in the same dsl_dir, and firstsnap
3283          * is before lastsnap.
3284          */
3285         if (firstsnap->ds_dir != lastsnap->ds_dir ||
3286             dsl_dataset_phys(firstsnap)->ds_creation_txg >
3287             dsl_dataset_phys(lastsnap)->ds_creation_txg)
3288                 return (SET_ERROR(EINVAL));
3289
3290         *usedp = *compp = *uncompp = 0;
3291
3292         snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
3293         while (snapobj != firstsnap->ds_object) {
3294                 dsl_dataset_t *ds;
3295                 uint64_t used, comp, uncomp;
3296
3297                 err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
3298                 if (err != 0)
3299                         break;
3300
3301                 dsl_deadlist_space_range(&ds->ds_deadlist,
3302                     dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
3303                     &used, &comp, &uncomp);
3304                 *usedp += used;
3305                 *compp += comp;
3306                 *uncompp += uncomp;
3307
3308                 snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
3309                 ASSERT3U(snapobj, !=, 0);
3310                 dsl_dataset_rele(ds, FTAG);
3311         }
3312         return (err);
3313 }
3314
3315 static int
3316 dsl_dataset_activate_large_blocks_check(void *arg, dmu_tx_t *tx)
3317 {
3318         const char *dsname = arg;
3319         dsl_dataset_t *ds;
3320         dsl_pool_t *dp = dmu_tx_pool(tx);
3321         int error = 0;
3322
3323         if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
3324                 return (SET_ERROR(ENOTSUP));
3325
3326         ASSERT(spa_feature_is_enabled(dp->dp_spa,
3327             SPA_FEATURE_EXTENSIBLE_DATASET));
3328
3329         error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
3330         if (error != 0)
3331                 return (error);
3332
3333         if (ds->ds_large_blocks)
3334                 error = EALREADY;
3335         dsl_dataset_rele(ds, FTAG);
3336
3337         return (error);
3338 }
3339
3340 void
3341 dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx)
3342 {
3343         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
3344         objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
3345         uint64_t zero = 0;
3346
3347         spa_feature_incr(spa, SPA_FEATURE_LARGE_BLOCKS, tx);
3348         dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
3349
3350         VERIFY0(zap_add(mos, dsobj, DS_FIELD_LARGE_BLOCKS,
3351             sizeof (zero), 1, &zero, tx));
3352 }
3353
3354 static void
3355 dsl_dataset_activate_large_blocks_sync(void *arg, dmu_tx_t *tx)
3356 {
3357         const char *dsname = arg;
3358         dsl_dataset_t *ds;
3359
3360         VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), dsname, FTAG, &ds));
3361
3362         dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
3363         ASSERT(!ds->ds_large_blocks);
3364         ds->ds_large_blocks = B_TRUE;
3365         dsl_dataset_rele(ds, FTAG);
3366 }
3367
3368 int
3369 dsl_dataset_activate_large_blocks(const char *dsname)
3370 {
3371         int error;
3372
3373         error = dsl_sync_task(dsname,
3374             dsl_dataset_activate_large_blocks_check,
3375             dsl_dataset_activate_large_blocks_sync, (void *)dsname,
3376             1, ZFS_SPACE_CHECK_RESERVED);
3377
3378         /*
3379          * EALREADY indicates that this dataset already supports large blocks.
3380          */
3381         if (error == EALREADY)
3382                 error = 0;
3383         return (error);
3384 }
3385
3386 /*
3387  * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
3388  * For example, they could both be snapshots of the same filesystem, and
3389  * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
3390  * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
3391  * filesystem.  Or 'earlier' could be the origin's origin.
3392  *
3393  * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
3394  */
3395 boolean_t
3396 dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
3397         uint64_t earlier_txg)
3398 {
3399         dsl_pool_t *dp = later->ds_dir->dd_pool;
3400         int error;
3401         boolean_t ret;
3402
3403         ASSERT(dsl_pool_config_held(dp));
3404         ASSERT(dsl_dataset_is_snapshot(earlier) || earlier_txg != 0);
3405
3406         if (earlier_txg == 0)
3407                 earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
3408
3409         if (dsl_dataset_is_snapshot(later) &&
3410             earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
3411                 return (B_FALSE);
3412
3413         if (later->ds_dir == earlier->ds_dir)
3414                 return (B_TRUE);
3415         if (!dsl_dir_is_clone(later->ds_dir))
3416                 return (B_FALSE);
3417
3418         if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object)
3419                 return (B_TRUE);
3420         dsl_dataset_t *origin;
3421         error = dsl_dataset_hold_obj(dp,
3422             dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
3423         if (error != 0)
3424                 return (B_FALSE);
3425         ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
3426         dsl_dataset_rele(origin, FTAG);
3427         return (ret);
3428 }
3429
3430
3431 void
3432 dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
3433 {
3434         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3435         dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
3436 }