]> CyberLeo.Net >> Repos - FreeBSD/releng/10.3.git/blob - sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
- Copy stable/10@296371 to releng/10.3 in preparation for 10.3-RC1
[FreeBSD/releng/10.3.git] / sys / cddl / contrib / opensolaris / uts / common / fs / zfs / dsl_dataset.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org>
24  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
25  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
26  * Copyright (c) 2014 RackTop Systems.
27  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
28  */
29
30 #include <sys/dmu_objset.h>
31 #include <sys/dsl_dataset.h>
32 #include <sys/dsl_dir.h>
33 #include <sys/dsl_prop.h>
34 #include <sys/dsl_synctask.h>
35 #include <sys/dmu_traverse.h>
36 #include <sys/dmu_impl.h>
37 #include <sys/dmu_send.h>
38 #include <sys/dmu_tx.h>
39 #include <sys/arc.h>
40 #include <sys/zio.h>
41 #include <sys/zap.h>
42 #include <sys/zfeature.h>
43 #include <sys/unique.h>
44 #include <sys/zfs_context.h>
45 #include <sys/zfs_ioctl.h>
46 #include <sys/spa.h>
47 #include <sys/zfs_znode.h>
48 #include <sys/zfs_onexit.h>
49 #include <sys/zvol.h>
50 #include <sys/dsl_scan.h>
51 #include <sys/dsl_deadlist.h>
52 #include <sys/dsl_destroy.h>
53 #include <sys/dsl_userhold.h>
54 #include <sys/dsl_bookmark.h>
55 #include <sys/dmu_send.h>
56 #include <sys/zio_checksum.h>
57 #include <sys/zio_compress.h>
58 #include <zfs_fletcher.h>
59
60 SYSCTL_DECL(_vfs_zfs);
61
62 /*
63  * The SPA supports block sizes up to 16MB.  However, very large blocks
64  * can have an impact on i/o latency (e.g. tying up a spinning disk for
65  * ~300ms), and also potentially on the memory allocator.  Therefore,
66  * we do not allow the recordsize to be set larger than zfs_max_recordsize
67  * (default 1MB).  Larger blocks can be created by changing this tunable,
68  * and pools with larger blocks can always be imported and used, regardless
69  * of this setting.
70  */
71 int zfs_max_recordsize = 1 * 1024 * 1024;
72 SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN,
73     &zfs_max_recordsize, 0,
74     "Maximum block size.  Expect dragons when tuning this.");
75
/*
 * Exchange the values of two lvalues through a uint64_t temporary.  Each
 * argument is expanded twice, so neither may have side effects.
 *
 * The expansion is wrapped in do { } while (0) so that "SWITCH64(a, b);"
 * forms exactly one statement and can be used as the body of an unbraced
 * if/else.  Invocations must be terminated with a semicolon.
 */
#define SWITCH64(x, y) \
        do { \
                uint64_t switch64_tmp = (x); \
                (x) = (y); \
                (y) = switch64_tmp; \
        } while (0)
82
83 #define DS_REF_MAX      (1ULL << 62)
84
85 extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
86
87 /*
88  * Figure out how much of this delta should be propogated to the dsl_dir
89  * layer.  If there's a refreservation, that space has already been
90  * partially accounted for in our ancestors.
91  */
92 static int64_t
93 parent_delta(dsl_dataset_t *ds, int64_t delta)
94 {
95         dsl_dataset_phys_t *ds_phys;
96         uint64_t old_bytes, new_bytes;
97
98         if (ds->ds_reserved == 0)
99                 return (delta);
100
101         ds_phys = dsl_dataset_phys(ds);
102         old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
103         new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
104
105         ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
106         return (new_bytes - old_bytes);
107 }
108
109 void
110 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
111 {
112         int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
113         int compressed = BP_GET_PSIZE(bp);
114         int uncompressed = BP_GET_UCSIZE(bp);
115         int64_t delta;
116
117         dprintf_bp(bp, "ds=%p", ds);
118
119         ASSERT(dmu_tx_is_syncing(tx));
120         /* It could have been compressed away to nothing */
121         if (BP_IS_HOLE(bp))
122                 return;
123         ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
124         ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
125         if (ds == NULL) {
126                 dsl_pool_mos_diduse_space(tx->tx_pool,
127                     used, compressed, uncompressed);
128                 return;
129         }
130
131         dmu_buf_will_dirty(ds->ds_dbuf, tx);
132         mutex_enter(&ds->ds_lock);
133         delta = parent_delta(ds, used);
134         dsl_dataset_phys(ds)->ds_referenced_bytes += used;
135         dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
136         dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
137         dsl_dataset_phys(ds)->ds_unique_bytes += used;
138
139         if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
140                 ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] =
141                     B_TRUE;
142         }
143
144         spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
145         if (f != SPA_FEATURE_NONE)
146                 ds->ds_feature_activation_needed[f] = B_TRUE;
147
148         mutex_exit(&ds->ds_lock);
149         dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
150             compressed, uncompressed, tx);
151         dsl_dir_transfer_space(ds->ds_dir, used - delta,
152             DD_USED_REFRSRV, DD_USED_HEAD, NULL);
153 }
154
155 int
156 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
157     boolean_t async)
158 {
159         int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
160         int compressed = BP_GET_PSIZE(bp);
161         int uncompressed = BP_GET_UCSIZE(bp);
162
163         if (BP_IS_HOLE(bp))
164                 return (0);
165
166         ASSERT(dmu_tx_is_syncing(tx));
167         ASSERT(bp->blk_birth <= tx->tx_txg);
168
169         if (ds == NULL) {
170                 dsl_free(tx->tx_pool, tx->tx_txg, bp);
171                 dsl_pool_mos_diduse_space(tx->tx_pool,
172                     -used, -compressed, -uncompressed);
173                 return (used);
174         }
175         ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
176
177         ASSERT(!ds->ds_is_snapshot);
178         dmu_buf_will_dirty(ds->ds_dbuf, tx);
179
180         if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
181                 int64_t delta;
182
183                 dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
184                 dsl_free(tx->tx_pool, tx->tx_txg, bp);
185
186                 mutex_enter(&ds->ds_lock);
187                 ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
188                     !DS_UNIQUE_IS_ACCURATE(ds));
189                 delta = parent_delta(ds, -used);
190                 dsl_dataset_phys(ds)->ds_unique_bytes -= used;
191                 mutex_exit(&ds->ds_lock);
192                 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
193                     delta, -compressed, -uncompressed, tx);
194                 dsl_dir_transfer_space(ds->ds_dir, -used - delta,
195                     DD_USED_REFRSRV, DD_USED_HEAD, NULL);
196         } else {
197                 dprintf_bp(bp, "putting on dead list: %s", "");
198                 if (async) {
199                         /*
200                          * We are here as part of zio's write done callback,
201                          * which means we're a zio interrupt thread.  We can't
202                          * call dsl_deadlist_insert() now because it may block
203                          * waiting for I/O.  Instead, put bp on the deferred
204                          * queue and let dsl_pool_sync() finish the job.
205                          */
206                         bplist_append(&ds->ds_pending_deadlist, bp);
207                 } else {
208                         dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
209                 }
210                 ASSERT3U(ds->ds_prev->ds_object, ==,
211                     dsl_dataset_phys(ds)->ds_prev_snap_obj);
212                 ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
213                 /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
214                 if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
215                     ds->ds_object && bp->blk_birth >
216                     dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
217                         dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
218                         mutex_enter(&ds->ds_prev->ds_lock);
219                         dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
220                         mutex_exit(&ds->ds_prev->ds_lock);
221                 }
222                 if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
223                         dsl_dir_transfer_space(ds->ds_dir, used,
224                             DD_USED_HEAD, DD_USED_SNAP, tx);
225                 }
226         }
227         mutex_enter(&ds->ds_lock);
228         ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
229         dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
230         ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
231         dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
232         ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
233         dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
234         mutex_exit(&ds->ds_lock);
235
236         return (used);
237 }
238
239 uint64_t
240 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
241 {
242         uint64_t trysnap = 0;
243
244         if (ds == NULL)
245                 return (0);
246         /*
247          * The snapshot creation could fail, but that would cause an
248          * incorrect FALSE return, which would only result in an
249          * overestimation of the amount of space that an operation would
250          * consume, which is OK.
251          *
252          * There's also a small window where we could miss a pending
253          * snapshot, because we could set the sync task in the quiescing
254          * phase.  So this should only be used as a guess.
255          */
256         if (ds->ds_trysnap_txg >
257             spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
258                 trysnap = ds->ds_trysnap_txg;
259         return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap));
260 }
261
262 boolean_t
263 dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
264     uint64_t blk_birth)
265 {
266         if (blk_birth <= dsl_dataset_prev_snap_txg(ds) ||
267             (bp != NULL && BP_IS_HOLE(bp)))
268                 return (B_FALSE);
269
270         ddt_prefetch(dsl_dataset_get_spa(ds), bp);
271
272         return (B_TRUE);
273 }
274
275 static void
276 dsl_dataset_evict(void *dbu)
277 {
278         dsl_dataset_t *ds = dbu;
279
280         ASSERT(ds->ds_owner == NULL);
281
282         ds->ds_dbuf = NULL;
283
284         unique_remove(ds->ds_fsid_guid);
285
286         if (ds->ds_objset != NULL)
287                 dmu_objset_evict(ds->ds_objset);
288
289         if (ds->ds_prev) {
290                 dsl_dataset_rele(ds->ds_prev, ds);
291                 ds->ds_prev = NULL;
292         }
293
294         bplist_destroy(&ds->ds_pending_deadlist);
295         if (ds->ds_deadlist.dl_os != NULL)
296                 dsl_deadlist_close(&ds->ds_deadlist);
297         if (ds->ds_dir)
298                 dsl_dir_async_rele(ds->ds_dir, ds);
299
300         ASSERT(!list_link_active(&ds->ds_synced_link));
301
302         list_destroy(&ds->ds_prop_cbs);
303         if (mutex_owned(&ds->ds_lock))
304                 mutex_exit(&ds->ds_lock);
305         mutex_destroy(&ds->ds_lock);
306         if (mutex_owned(&ds->ds_opening_lock))
307                 mutex_exit(&ds->ds_opening_lock);
308         mutex_destroy(&ds->ds_opening_lock);
309         mutex_destroy(&ds->ds_sendstream_lock);
310         refcount_destroy(&ds->ds_longholds);
311
312         kmem_free(ds, sizeof (dsl_dataset_t));
313 }
314
315 int
316 dsl_dataset_get_snapname(dsl_dataset_t *ds)
317 {
318         dsl_dataset_phys_t *headphys;
319         int err;
320         dmu_buf_t *headdbuf;
321         dsl_pool_t *dp = ds->ds_dir->dd_pool;
322         objset_t *mos = dp->dp_meta_objset;
323
324         if (ds->ds_snapname[0])
325                 return (0);
326         if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
327                 return (0);
328
329         err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
330             FTAG, &headdbuf);
331         if (err != 0)
332                 return (err);
333         headphys = headdbuf->db_data;
334         err = zap_value_search(dp->dp_meta_objset,
335             headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
336         dmu_buf_rele(headdbuf, FTAG);
337         return (err);
338 }
339
340 int
341 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
342 {
343         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
344         uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
345         matchtype_t mt;
346         int err;
347
348         if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
349                 mt = MT_FIRST;
350         else
351                 mt = MT_EXACT;
352
353         err = zap_lookup_norm(mos, snapobj, name, 8, 1,
354             value, mt, NULL, 0, NULL);
355         if (err == ENOTSUP && mt == MT_FIRST)
356                 err = zap_lookup(mos, snapobj, name, 8, 1, value);
357         return (err);
358 }
359
360 int
361 dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
362     boolean_t adj_cnt)
363 {
364         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
365         uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
366         matchtype_t mt;
367         int err;
368
369         dsl_dir_snap_cmtime_update(ds->ds_dir);
370
371         if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
372                 mt = MT_FIRST;
373         else
374                 mt = MT_EXACT;
375
376         err = zap_remove_norm(mos, snapobj, name, mt, tx);
377         if (err == ENOTSUP && mt == MT_FIRST)
378                 err = zap_remove(mos, snapobj, name, tx);
379
380         if (err == 0 && adj_cnt)
381                 dsl_fs_ss_count_adjust(ds->ds_dir, -1,
382                     DD_FIELD_SNAPSHOT_COUNT, tx);
383
384         return (err);
385 }
386
387 boolean_t
388 dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
389 {
390         dmu_buf_t *dbuf = ds->ds_dbuf;
391         boolean_t result = B_FALSE;
392
393         if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset,
394             ds->ds_object, DMU_BONUS_BLKID, tag)) {
395
396                 if (ds == dmu_buf_get_user(dbuf))
397                         result = B_TRUE;
398                 else
399                         dmu_buf_rele(dbuf, tag);
400         }
401
402         return (result);
403 }
404
405 int
406 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
407     dsl_dataset_t **dsp)
408 {
409         objset_t *mos = dp->dp_meta_objset;
410         dmu_buf_t *dbuf;
411         dsl_dataset_t *ds;
412         int err;
413         dmu_object_info_t doi;
414
415         ASSERT(dsl_pool_config_held(dp));
416
417         err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
418         if (err != 0)
419                 return (err);
420
421         /* Make sure dsobj has the correct object type. */
422         dmu_object_info_from_db(dbuf, &doi);
423         if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
424                 dmu_buf_rele(dbuf, tag);
425                 return (SET_ERROR(EINVAL));
426         }
427
428         ds = dmu_buf_get_user(dbuf);
429         if (ds == NULL) {
430                 dsl_dataset_t *winner = NULL;
431
432                 ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
433                 ds->ds_dbuf = dbuf;
434                 ds->ds_object = dsobj;
435                 ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;
436
437                 mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
438                 mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
439                 mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
440                 refcount_create(&ds->ds_longholds);
441
442                 bplist_create(&ds->ds_pending_deadlist);
443                 dsl_deadlist_open(&ds->ds_deadlist,
444                     mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
445
446                 list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
447                     offsetof(dmu_sendarg_t, dsa_link));
448
449                 list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),
450                     offsetof(dsl_prop_cb_record_t, cbr_ds_node));
451
452                 if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
453                         for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
454                                 if (!(spa_feature_table[f].fi_flags &
455                                     ZFEATURE_FLAG_PER_DATASET))
456                                         continue;
457                                 err = zap_contains(mos, dsobj,
458                                     spa_feature_table[f].fi_guid);
459                                 if (err == 0) {
460                                         ds->ds_feature_inuse[f] = B_TRUE;
461                                 } else {
462                                         ASSERT3U(err, ==, ENOENT);
463                                         err = 0;
464                                 }
465                         }
466                 }
467
468                 err = dsl_dir_hold_obj(dp,
469                     dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir);
470                 if (err != 0) {
471                         mutex_destroy(&ds->ds_lock);
472                         mutex_destroy(&ds->ds_opening_lock);
473                         mutex_destroy(&ds->ds_sendstream_lock);
474                         refcount_destroy(&ds->ds_longholds);
475                         bplist_destroy(&ds->ds_pending_deadlist);
476                         dsl_deadlist_close(&ds->ds_deadlist);
477                         kmem_free(ds, sizeof (dsl_dataset_t));
478                         dmu_buf_rele(dbuf, tag);
479                         return (err);
480                 }
481
482                 if (!ds->ds_is_snapshot) {
483                         ds->ds_snapname[0] = '\0';
484                         if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
485                                 err = dsl_dataset_hold_obj(dp,
486                                     dsl_dataset_phys(ds)->ds_prev_snap_obj,
487                                     ds, &ds->ds_prev);
488                         }
489                         if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
490                                 int zaperr = zap_lookup(mos, ds->ds_object,
491                                     DS_FIELD_BOOKMARK_NAMES,
492                                     sizeof (ds->ds_bookmarks), 1,
493                                     &ds->ds_bookmarks);
494                                 if (zaperr != ENOENT)
495                                         VERIFY0(zaperr);
496                         }
497                 } else {
498                         if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
499                                 err = dsl_dataset_get_snapname(ds);
500                         if (err == 0 &&
501                             dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
502                                 err = zap_count(
503                                     ds->ds_dir->dd_pool->dp_meta_objset,
504                                     dsl_dataset_phys(ds)->ds_userrefs_obj,
505                                     &ds->ds_userrefs);
506                         }
507                 }
508
509                 if (err == 0 && !ds->ds_is_snapshot) {
510                         err = dsl_prop_get_int_ds(ds,
511                             zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
512                             &ds->ds_reserved);
513                         if (err == 0) {
514                                 err = dsl_prop_get_int_ds(ds,
515                                     zfs_prop_to_name(ZFS_PROP_REFQUOTA),
516                                     &ds->ds_quota);
517                         }
518                 } else {
519                         ds->ds_reserved = ds->ds_quota = 0;
520                 }
521
522                 dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict, &ds->ds_dbuf);
523                 if (err == 0)
524                         winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);
525
526                 if (err != 0 || winner != NULL) {
527                         bplist_destroy(&ds->ds_pending_deadlist);
528                         dsl_deadlist_close(&ds->ds_deadlist);
529                         if (ds->ds_prev)
530                                 dsl_dataset_rele(ds->ds_prev, ds);
531                         dsl_dir_rele(ds->ds_dir, ds);
532                         mutex_destroy(&ds->ds_lock);
533                         mutex_destroy(&ds->ds_opening_lock);
534                         mutex_destroy(&ds->ds_sendstream_lock);
535                         refcount_destroy(&ds->ds_longholds);
536                         kmem_free(ds, sizeof (dsl_dataset_t));
537                         if (err != 0) {
538                                 dmu_buf_rele(dbuf, tag);
539                                 return (err);
540                         }
541                         ds = winner;
542                 } else {
543                         ds->ds_fsid_guid =
544                             unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
545                 }
546         }
547         ASSERT3P(ds->ds_dbuf, ==, dbuf);
548         ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
549         ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
550             spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
551             dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
552         *dsp = ds;
553         return (0);
554 }
555
556 int
557 dsl_dataset_hold(dsl_pool_t *dp, const char *name,
558     void *tag, dsl_dataset_t **dsp)
559 {
560         dsl_dir_t *dd;
561         const char *snapname;
562         uint64_t obj;
563         int err = 0;
564         dsl_dataset_t *ds;
565
566         err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
567         if (err != 0)
568                 return (err);
569
570         ASSERT(dsl_pool_config_held(dp));
571         obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
572         if (obj != 0)
573                 err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
574         else
575                 err = SET_ERROR(ENOENT);
576
577         /* we may be looking for a snapshot */
578         if (err == 0 && snapname != NULL) {
579                 dsl_dataset_t *snap_ds;
580
581                 if (*snapname++ != '@') {
582                         dsl_dataset_rele(ds, tag);
583                         dsl_dir_rele(dd, FTAG);
584                         return (SET_ERROR(ENOENT));
585                 }
586
587                 dprintf("looking for snapshot '%s'\n", snapname);
588                 err = dsl_dataset_snap_lookup(ds, snapname, &obj);
589                 if (err == 0)
590                         err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
591                 dsl_dataset_rele(ds, tag);
592
593                 if (err == 0) {
594                         mutex_enter(&snap_ds->ds_lock);
595                         if (snap_ds->ds_snapname[0] == 0)
596                                 (void) strlcpy(snap_ds->ds_snapname, snapname,
597                                     sizeof (snap_ds->ds_snapname));
598                         mutex_exit(&snap_ds->ds_lock);
599                         ds = snap_ds;
600                 }
601         }
602         if (err == 0)
603                 *dsp = ds;
604         dsl_dir_rele(dd, FTAG);
605         return (err);
606 }
607
608 int
609 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
610     void *tag, dsl_dataset_t **dsp)
611 {
612         int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
613         if (err != 0)
614                 return (err);
615         if (!dsl_dataset_tryown(*dsp, tag)) {
616                 dsl_dataset_rele(*dsp, tag);
617                 *dsp = NULL;
618                 return (SET_ERROR(EBUSY));
619         }
620         return (0);
621 }
622
623 int
624 dsl_dataset_own(dsl_pool_t *dp, const char *name,
625     void *tag, dsl_dataset_t **dsp)
626 {
627         int err = dsl_dataset_hold(dp, name, tag, dsp);
628         if (err != 0)
629                 return (err);
630         if (!dsl_dataset_tryown(*dsp, tag)) {
631                 dsl_dataset_rele(*dsp, tag);
632                 return (SET_ERROR(EBUSY));
633         }
634         return (0);
635 }
636
637 /*
638  * See the comment above dsl_pool_hold() for details.  In summary, a long
639  * hold is used to prevent destruction of a dataset while the pool hold
640  * is dropped, allowing other concurrent operations (e.g. spa_sync()).
641  *
642  * The dataset and pool must be held when this function is called.  After it
643  * is called, the pool hold may be released while the dataset is still held
644  * and accessed.
645  */
646 void
647 dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
648 {
649         ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
650         (void) refcount_add(&ds->ds_longholds, tag);
651 }
652
653 void
654 dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
655 {
656         (void) refcount_remove(&ds->ds_longholds, tag);
657 }
658
659 /* Return B_TRUE if there are any long holds on this dataset. */
660 boolean_t
661 dsl_dataset_long_held(dsl_dataset_t *ds)
662 {
663         return (!refcount_is_zero(&ds->ds_longholds));
664 }
665
666 void
667 dsl_dataset_name(dsl_dataset_t *ds, char *name)
668 {
669         if (ds == NULL) {
670                 (void) strcpy(name, "mos");
671         } else {
672                 dsl_dir_name(ds->ds_dir, name);
673                 VERIFY0(dsl_dataset_get_snapname(ds));
674                 if (ds->ds_snapname[0]) {
675                         (void) strcat(name, "@");
676                         /*
677                          * We use a "recursive" mutex so that we
678                          * can call dprintf_ds() with ds_lock held.
679                          */
680                         if (!MUTEX_HELD(&ds->ds_lock)) {
681                                 mutex_enter(&ds->ds_lock);
682                                 (void) strcat(name, ds->ds_snapname);
683                                 mutex_exit(&ds->ds_lock);
684                         } else {
685                                 (void) strcat(name, ds->ds_snapname);
686                         }
687                 }
688         }
689 }
690
691 void
692 dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
693 {
694         dmu_buf_rele(ds->ds_dbuf, tag);
695 }
696
697 void
698 dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
699 {
700         ASSERT3P(ds->ds_owner, ==, tag);
701         ASSERT(ds->ds_dbuf != NULL);
702
703         mutex_enter(&ds->ds_lock);
704         ds->ds_owner = NULL;
705         mutex_exit(&ds->ds_lock);
706         dsl_dataset_long_rele(ds, tag);
707         dsl_dataset_rele(ds, tag);
708 }
709
710 boolean_t
711 dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
712 {
713         boolean_t gotit = FALSE;
714
715         ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
716         mutex_enter(&ds->ds_lock);
717         if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
718                 ds->ds_owner = tag;
719                 dsl_dataset_long_hold(ds, tag);
720                 gotit = TRUE;
721         }
722         mutex_exit(&ds->ds_lock);
723         return (gotit);
724 }
725
726 boolean_t
727 dsl_dataset_has_owner(dsl_dataset_t *ds)
728 {
729         boolean_t rv;
730         mutex_enter(&ds->ds_lock);
731         rv = (ds->ds_owner != NULL);
732         mutex_exit(&ds->ds_lock);
733         return (rv);
734 }
735
736 static void
737 dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
738 {
739         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
740         objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
741         uint64_t zero = 0;
742
743         VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
744
745         spa_feature_incr(spa, f, tx);
746         dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
747
748         VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
749             sizeof (zero), 1, &zero, tx));
750 }
751
752 void
753 dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
754 {
755         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
756         objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
757
758         VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
759
760         VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx));
761         spa_feature_decr(spa, f, tx);
762 }
763
764 uint64_t
765 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
766     uint64_t flags, dmu_tx_t *tx)
767 {
768         dsl_pool_t *dp = dd->dd_pool;
769         dmu_buf_t *dbuf;
770         dsl_dataset_phys_t *dsphys;
771         uint64_t dsobj;
772         objset_t *mos = dp->dp_meta_objset;
773
774         if (origin == NULL)
775                 origin = dp->dp_origin_snap;
776
777         ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
778         ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
779         ASSERT(dmu_tx_is_syncing(tx));
780         ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
781
782         dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
783             DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
784         VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
785         dmu_buf_will_dirty(dbuf, tx);
786         dsphys = dbuf->db_data;
787         bzero(dsphys, sizeof (dsl_dataset_phys_t));
788         dsphys->ds_dir_obj = dd->dd_object;
789         dsphys->ds_flags = flags;
790         dsphys->ds_fsid_guid = unique_create();
791         do {
792                 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
793                     sizeof (dsphys->ds_guid));
794         } while (dsphys->ds_guid == 0);
795         dsphys->ds_snapnames_zapobj =
796             zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
797             DMU_OT_NONE, 0, tx);
798         dsphys->ds_creation_time = gethrestime_sec();
799         dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
800
801         if (origin == NULL) {
802                 dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
803         } else {
804                 dsl_dataset_t *ohds; /* head of the origin snapshot */
805
806                 dsphys->ds_prev_snap_obj = origin->ds_object;
807                 dsphys->ds_prev_snap_txg =
808                     dsl_dataset_phys(origin)->ds_creation_txg;
809                 dsphys->ds_referenced_bytes =
810                     dsl_dataset_phys(origin)->ds_referenced_bytes;
811                 dsphys->ds_compressed_bytes =
812                     dsl_dataset_phys(origin)->ds_compressed_bytes;
813                 dsphys->ds_uncompressed_bytes =
814                     dsl_dataset_phys(origin)->ds_uncompressed_bytes;
815                 dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
816
817                 /*
818                  * Inherit flags that describe the dataset's contents
819                  * (INCONSISTENT) or properties (Case Insensitive).
820                  */
821                 dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
822                     (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
823
824                 for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
825                         if (origin->ds_feature_inuse[f])
826                                 dsl_dataset_activate_feature(dsobj, f, tx);
827                 }
828
829                 dmu_buf_will_dirty(origin->ds_dbuf, tx);
830                 dsl_dataset_phys(origin)->ds_num_children++;
831
832                 VERIFY0(dsl_dataset_hold_obj(dp,
833                     dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
834                     FTAG, &ohds));
835                 dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
836                     dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
837                 dsl_dataset_rele(ohds, FTAG);
838
839                 if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
840                         if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
841                                 dsl_dataset_phys(origin)->ds_next_clones_obj =
842                                     zap_create(mos,
843                                     DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
844                         }
845                         VERIFY0(zap_add_int(mos,
846                             dsl_dataset_phys(origin)->ds_next_clones_obj,
847                             dsobj, tx));
848                 }
849
850                 dmu_buf_will_dirty(dd->dd_dbuf, tx);
851                 dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
852                 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
853                         if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
854                                 dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
855                                 dsl_dir_phys(origin->ds_dir)->dd_clones =
856                                     zap_create(mos,
857                                     DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
858                         }
859                         VERIFY0(zap_add_int(mos,
860                             dsl_dir_phys(origin->ds_dir)->dd_clones,
861                             dsobj, tx));
862                 }
863         }
864
865         if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
866                 dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
867
868         dmu_buf_rele(dbuf, FTAG);
869
870         dmu_buf_will_dirty(dd->dd_dbuf, tx);
871         dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;
872
873         return (dsobj);
874 }
875
876 static void
877 dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
878 {
879         objset_t *os;
880
881         VERIFY0(dmu_objset_from_ds(ds, &os));
882         bzero(&os->os_zil_header, sizeof (os->os_zil_header));
883         dsl_dataset_dirty(ds, tx);
884 }
885
886 uint64_t
887 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
888     dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
889 {
890         dsl_pool_t *dp = pdd->dd_pool;
891         uint64_t dsobj, ddobj;
892         dsl_dir_t *dd;
893
894         ASSERT(dmu_tx_is_syncing(tx));
895         ASSERT(lastname[0] != '@');
896
897         ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
898         VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
899
900         dsobj = dsl_dataset_create_sync_dd(dd, origin,
901             flags & ~DS_CREATE_FLAG_NODIRTY, tx);
902
903         dsl_deleg_set_create_perms(dd, tx, cr);
904
905         /*
906          * Since we're creating a new node we know it's a leaf, so we can
907          * initialize the counts if the limit feature is active.
908          */
909         if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
910                 uint64_t cnt = 0;
911                 objset_t *os = dd->dd_pool->dp_meta_objset;
912
913                 dsl_dir_zapify(dd, tx);
914                 VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
915                     sizeof (cnt), 1, &cnt, tx));
916                 VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
917                     sizeof (cnt), 1, &cnt, tx));
918         }
919
920         dsl_dir_rele(dd, FTAG);
921
922         /*
923          * If we are creating a clone, make sure we zero out any stale
924          * data from the origin snapshots zil header.
925          */
926         if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
927                 dsl_dataset_t *ds;
928
929                 VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
930                 dsl_dataset_zero_zil(ds, tx);
931                 dsl_dataset_rele(ds, FTAG);
932         }
933
934         return (dsobj);
935 }
936
937 #ifdef __FreeBSD__
938 /* FreeBSD ioctl compat begin */
939 struct destroyarg {
940         nvlist_t *nvl;
941         const char *snapname;
942 };
943
944 static int
945 dsl_check_snap_cb(const char *name, void *arg)
946 {
947         struct destroyarg *da = arg;
948         dsl_dataset_t *ds;
949         char *dsname;
950
951         dsname = kmem_asprintf("%s@%s", name, da->snapname);
952         fnvlist_add_boolean(da->nvl, dsname);
953         kmem_free(dsname, strlen(dsname) + 1);
954
955         return (0);
956 }
957
958 int
959 dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
960     nvlist_t *snaps)
961 {
962         struct destroyarg *da;
963         int err;
964
965         da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP);
966         da->nvl = snaps;
967         da->snapname = snapname;
968         err = dmu_objset_find(fsname, dsl_check_snap_cb, da,
969             DS_FIND_CHILDREN);
970         kmem_free(da, sizeof (struct destroyarg));
971
972         return (err);
973 }
974 /* FreeBSD ioctl compat end */
975 #endif /* __FreeBSD__ */
976
977 /*
978  * The unique space in the head dataset can be calculated by subtracting
979  * the space used in the most recent snapshot, that is still being used
980  * in this file system, from the space currently in use.  To figure out
981  * the space in the most recent snapshot still in use, we need to take
982  * the total space used in the snapshot and subtract out the space that
983  * has been freed up since the snapshot was taken.
984  */
985 void
986 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
987 {
988         uint64_t mrs_used;
989         uint64_t dlused, dlcomp, dluncomp;
990
991         ASSERT(!ds->ds_is_snapshot);
992
993         if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
994                 mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
995         else
996                 mrs_used = 0;
997
998         dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
999
1000         ASSERT3U(dlused, <=, mrs_used);
1001         dsl_dataset_phys(ds)->ds_unique_bytes =
1002             dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
1003
1004         if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
1005             SPA_VERSION_UNIQUE_ACCURATE)
1006                 dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1007 }
1008
1009 void
1010 dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
1011     dmu_tx_t *tx)
1012 {
1013         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1014         uint64_t count;
1015         int err;
1016
1017         ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
1018         err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
1019             obj, tx);
1020         /*
1021          * The err should not be ENOENT, but a bug in a previous version
1022          * of the code could cause upgrade_clones_cb() to not set
1023          * ds_next_snap_obj when it should, leading to a missing entry.
1024          * If we knew that the pool was created after
1025          * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
1026          * ENOENT.  However, at least we can check that we don't have
1027          * too many entries in the next_clones_obj even after failing to
1028          * remove this one.
1029          */
1030         if (err != ENOENT)
1031                 VERIFY0(err);
1032         ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
1033             &count));
1034         ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
1035 }
1036
1037
1038 blkptr_t *
1039 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
1040 {
1041         return (&dsl_dataset_phys(ds)->ds_bp);
1042 }
1043
1044 void
1045 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
1046 {
1047         ASSERT(dmu_tx_is_syncing(tx));
1048         /* If it's the meta-objset, set dp_meta_rootbp */
1049         if (ds == NULL) {
1050                 tx->tx_pool->dp_meta_rootbp = *bp;
1051         } else {
1052                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1053                 dsl_dataset_phys(ds)->ds_bp = *bp;
1054         }
1055 }
1056
1057 spa_t *
1058 dsl_dataset_get_spa(dsl_dataset_t *ds)
1059 {
1060         return (ds->ds_dir->dd_pool->dp_spa);
1061 }
1062
1063 void
1064 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
1065 {
1066         dsl_pool_t *dp;
1067
1068         if (ds == NULL) /* this is the meta-objset */
1069                 return;
1070
1071         ASSERT(ds->ds_objset != NULL);
1072
1073         if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
1074                 panic("dirtying snapshot!");
1075
1076         dp = ds->ds_dir->dd_pool;
1077
1078         if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
1079                 /* up the hold count until we can be written out */
1080                 dmu_buf_add_ref(ds->ds_dbuf, ds);
1081         }
1082 }
1083
1084 boolean_t
1085 dsl_dataset_is_dirty(dsl_dataset_t *ds)
1086 {
1087         for (int t = 0; t < TXG_SIZE; t++) {
1088                 if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
1089                     ds, t))
1090                         return (B_TRUE);
1091         }
1092         return (B_FALSE);
1093 }
1094
1095 static int
1096 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
1097 {
1098         uint64_t asize;
1099
1100         if (!dmu_tx_is_syncing(tx))
1101                 return (0);
1102
1103         /*
1104          * If there's an fs-only reservation, any blocks that might become
1105          * owned by the snapshot dataset must be accommodated by space
1106          * outside of the reservation.
1107          */
1108         ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
1109         asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
1110         if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
1111                 return (SET_ERROR(ENOSPC));
1112
1113         /*
1114          * Propagate any reserved space for this snapshot to other
1115          * snapshot checks in this sync group.
1116          */
1117         if (asize > 0)
1118                 dsl_dir_willuse_space(ds->ds_dir, asize, tx);
1119
1120         return (0);
1121 }
1122
1123 typedef struct dsl_dataset_snapshot_arg {
1124         nvlist_t *ddsa_snaps;
1125         nvlist_t *ddsa_props;
1126         nvlist_t *ddsa_errors;
1127         cred_t *ddsa_cr;
1128 } dsl_dataset_snapshot_arg_t;
1129
1130 int
1131 dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
1132     dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
1133 {
1134         int error;
1135         uint64_t value;
1136
1137         ds->ds_trysnap_txg = tx->tx_txg;
1138
1139         if (!dmu_tx_is_syncing(tx))
1140                 return (0);
1141
1142         /*
1143          * We don't allow multiple snapshots of the same txg.  If there
1144          * is already one, try again.
1145          */
1146         if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
1147                 return (SET_ERROR(EAGAIN));
1148
1149         /*
1150          * Check for conflicting snapshot name.
1151          */
1152         error = dsl_dataset_snap_lookup(ds, snapname, &value);
1153         if (error == 0)
1154                 return (SET_ERROR(EEXIST));
1155         if (error != ENOENT)
1156                 return (error);
1157
1158         /*
1159          * We don't allow taking snapshots of inconsistent datasets, such as
1160          * those into which we are currently receiving.  However, if we are
1161          * creating this snapshot as part of a receive, this check will be
1162          * executed atomically with respect to the completion of the receive
1163          * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
1164          * case we ignore this, knowing it will be fixed up for us shortly in
1165          * dmu_recv_end_sync().
1166          */
1167         if (!recv && DS_IS_INCONSISTENT(ds))
1168                 return (SET_ERROR(EBUSY));
1169
1170         /*
1171          * Skip the check for temporary snapshots or if we have already checked
1172          * the counts in dsl_dataset_snapshot_check. This means we really only
1173          * check the count here when we're receiving a stream.
1174          */
1175         if (cnt != 0 && cr != NULL) {
1176                 error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
1177                     ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
1178                 if (error != 0)
1179                         return (error);
1180         }
1181
1182         error = dsl_dataset_snapshot_reserve_space(ds, tx);
1183         if (error != 0)
1184                 return (error);
1185
1186         return (0);
1187 }
1188
1189 static int
1190 dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
1191 {
1192         dsl_dataset_snapshot_arg_t *ddsa = arg;
1193         dsl_pool_t *dp = dmu_tx_pool(tx);
1194         nvpair_t *pair;
1195         int rv = 0;
1196
1197         /*
1198          * Pre-compute how many total new snapshots will be created for each
1199          * level in the tree and below. This is needed for validating the
1200          * snapshot limit when either taking a recursive snapshot or when
1201          * taking multiple snapshots.
1202          *
1203          * The problem is that the counts are not actually adjusted when
1204          * we are checking, only when we finally sync. For a single snapshot,
1205          * this is easy, the count will increase by 1 at each node up the tree,
1206          * but its more complicated for the recursive/multiple snapshot case.
1207          *
1208          * The dsl_fs_ss_limit_check function does recursively check the count
1209          * at each level up the tree but since it is validating each snapshot
1210          * independently we need to be sure that we are validating the complete
1211          * count for the entire set of snapshots. We do this by rolling up the
1212          * counts for each component of the name into an nvlist and then
1213          * checking each of those cases with the aggregated count.
1214          *
1215          * This approach properly handles not only the recursive snapshot
1216          * case (where we get all of those on the ddsa_snaps list) but also
1217          * the sibling case (e.g. snapshot a/b and a/c so that we will also
1218          * validate the limit on 'a' using a count of 2).
1219          *
1220          * We validate the snapshot names in the third loop and only report
1221          * name errors once.
1222          */
1223         if (dmu_tx_is_syncing(tx)) {
1224                 nvlist_t *cnt_track = NULL;
1225                 cnt_track = fnvlist_alloc();
1226
1227                 /* Rollup aggregated counts into the cnt_track list */
1228                 for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1229                     pair != NULL;
1230                     pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1231                         char *pdelim;
1232                         uint64_t val;
1233                         char nm[MAXPATHLEN];
1234
1235                         (void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
1236                         pdelim = strchr(nm, '@');
1237                         if (pdelim == NULL)
1238                                 continue;
1239                         *pdelim = '\0';
1240
1241                         do {
1242                                 if (nvlist_lookup_uint64(cnt_track, nm,
1243                                     &val) == 0) {
1244                                         /* update existing entry */
1245                                         fnvlist_add_uint64(cnt_track, nm,
1246                                             val + 1);
1247                                 } else {
1248                                         /* add to list */
1249                                         fnvlist_add_uint64(cnt_track, nm, 1);
1250                                 }
1251
1252                                 pdelim = strrchr(nm, '/');
1253                                 if (pdelim != NULL)
1254                                         *pdelim = '\0';
1255                         } while (pdelim != NULL);
1256                 }
1257
1258                 /* Check aggregated counts at each level */
1259                 for (pair = nvlist_next_nvpair(cnt_track, NULL);
1260                     pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
1261                         int error = 0;
1262                         char *name;
1263                         uint64_t cnt = 0;
1264                         dsl_dataset_t *ds;
1265
1266                         name = nvpair_name(pair);
1267                         cnt = fnvpair_value_uint64(pair);
1268                         ASSERT(cnt > 0);
1269
1270                         error = dsl_dataset_hold(dp, name, FTAG, &ds);
1271                         if (error == 0) {
1272                                 error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
1273                                     ZFS_PROP_SNAPSHOT_LIMIT, NULL,
1274                                     ddsa->ddsa_cr);
1275                                 dsl_dataset_rele(ds, FTAG);
1276                         }
1277
1278                         if (error != 0) {
1279                                 if (ddsa->ddsa_errors != NULL)
1280                                         fnvlist_add_int32(ddsa->ddsa_errors,
1281                                             name, error);
1282                                 rv = error;
1283                                 /* only report one error for this check */
1284                                 break;
1285                         }
1286                 }
1287                 nvlist_free(cnt_track);
1288         }
1289
1290         for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1291             pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1292                 int error = 0;
1293                 dsl_dataset_t *ds;
1294                 char *name, *atp;
1295                 char dsname[MAXNAMELEN];
1296
1297                 name = nvpair_name(pair);
1298                 if (strlen(name) >= MAXNAMELEN)
1299                         error = SET_ERROR(ENAMETOOLONG);
1300                 if (error == 0) {
1301                         atp = strchr(name, '@');
1302                         if (atp == NULL)
1303                                 error = SET_ERROR(EINVAL);
1304                         if (error == 0)
1305                                 (void) strlcpy(dsname, name, atp - name + 1);
1306                 }
1307                 if (error == 0)
1308                         error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
1309                 if (error == 0) {
1310                         /* passing 0/NULL skips dsl_fs_ss_limit_check */
1311                         error = dsl_dataset_snapshot_check_impl(ds,
1312                             atp + 1, tx, B_FALSE, 0, NULL);
1313                         dsl_dataset_rele(ds, FTAG);
1314                 }
1315
1316                 if (error != 0) {
1317                         if (ddsa->ddsa_errors != NULL) {
1318                                 fnvlist_add_int32(ddsa->ddsa_errors,
1319                                     name, error);
1320                         }
1321                         rv = error;
1322                 }
1323         }
1324
1325         return (rv);
1326 }
1327
1328 void
1329 dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
1330     dmu_tx_t *tx)
1331 {
1332         static zil_header_t zero_zil;
1333
1334         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1335         dmu_buf_t *dbuf;
1336         dsl_dataset_phys_t *dsphys;
1337         uint64_t dsobj, crtxg;
1338         objset_t *mos = dp->dp_meta_objset;
1339         objset_t *os;
1340
1341         ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
1342
1343         /*
1344          * If we are on an old pool, the zil must not be active, in which
1345          * case it will be zeroed.  Usually zil_suspend() accomplishes this.
1346          */
1347         ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
1348             dmu_objset_from_ds(ds, &os) != 0 ||
1349             bcmp(&os->os_phys->os_zil_header, &zero_zil,
1350             sizeof (zero_zil)) == 0);
1351
1352         dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
1353
1354         /*
1355          * The origin's ds_creation_txg has to be < TXG_INITIAL
1356          */
1357         if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
1358                 crtxg = 1;
1359         else
1360                 crtxg = tx->tx_txg;
1361
1362         dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
1363             DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
1364         VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
1365         dmu_buf_will_dirty(dbuf, tx);
1366         dsphys = dbuf->db_data;
1367         bzero(dsphys, sizeof (dsl_dataset_phys_t));
1368         dsphys->ds_dir_obj = ds->ds_dir->dd_object;
1369         dsphys->ds_fsid_guid = unique_create();
1370         do {
1371                 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
1372                     sizeof (dsphys->ds_guid));
1373         } while (dsphys->ds_guid == 0);
1374         dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
1375         dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
1376         dsphys->ds_next_snap_obj = ds->ds_object;
1377         dsphys->ds_num_children = 1;
1378         dsphys->ds_creation_time = gethrestime_sec();
1379         dsphys->ds_creation_txg = crtxg;
1380         dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
1381         dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
1382         dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
1383         dsphys->ds_uncompressed_bytes =
1384             dsl_dataset_phys(ds)->ds_uncompressed_bytes;
1385         dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
1386         dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
1387         dmu_buf_rele(dbuf, FTAG);
1388
1389         for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
1390                 if (ds->ds_feature_inuse[f])
1391                         dsl_dataset_activate_feature(dsobj, f, tx);
1392         }
1393
1394         ASSERT3U(ds->ds_prev != 0, ==,
1395             dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
1396         if (ds->ds_prev) {
1397                 uint64_t next_clones_obj =
1398                     dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
1399                 ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
1400                     ds->ds_object ||
1401                     dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
1402                 if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
1403                     ds->ds_object) {
1404                         dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
1405                         ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
1406                             dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
1407                         dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
1408                 } else if (next_clones_obj != 0) {
1409                         dsl_dataset_remove_from_next_clones(ds->ds_prev,
1410                             dsphys->ds_next_snap_obj, tx);
1411                         VERIFY0(zap_add_int(mos,
1412                             next_clones_obj, dsobj, tx));
1413                 }
1414         }
1415
1416         /*
1417          * If we have a reference-reservation on this dataset, we will
1418          * need to increase the amount of refreservation being charged
1419          * since our unique space is going to zero.
1420          */
1421         if (ds->ds_reserved) {
1422                 int64_t delta;
1423                 ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
1424                 delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
1425                     ds->ds_reserved);
1426                 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
1427                     delta, 0, 0, tx);
1428         }
1429
1430         dmu_buf_will_dirty(ds->ds_dbuf, tx);
1431         dsl_dataset_phys(ds)->ds_deadlist_obj =
1432             dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
1433             dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
1434         dsl_deadlist_close(&ds->ds_deadlist);
1435         dsl_deadlist_open(&ds->ds_deadlist, mos,
1436             dsl_dataset_phys(ds)->ds_deadlist_obj);
1437         dsl_deadlist_add_key(&ds->ds_deadlist,
1438             dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
1439
1440         ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
1441         dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
1442         dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
1443         dsl_dataset_phys(ds)->ds_unique_bytes = 0;
1444         if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
1445                 dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1446
1447         VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
1448             snapname, 8, 1, &dsobj, tx));
1449
1450         if (ds->ds_prev)
1451                 dsl_dataset_rele(ds->ds_prev, ds);
1452         VERIFY0(dsl_dataset_hold_obj(dp,
1453             dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));
1454
1455         dsl_scan_ds_snapshotted(ds, tx);
1456
1457         dsl_dir_snap_cmtime_update(ds->ds_dir);
1458
1459         spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
1460 }
1461
1462 static void
1463 dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
1464 {
1465         dsl_dataset_snapshot_arg_t *ddsa = arg;
1466         dsl_pool_t *dp = dmu_tx_pool(tx);
1467         nvpair_t *pair;
1468
1469         for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1470             pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1471                 dsl_dataset_t *ds;
1472                 char *name, *atp;
1473                 char dsname[MAXNAMELEN];
1474
1475                 name = nvpair_name(pair);
1476                 atp = strchr(name, '@');
1477                 (void) strlcpy(dsname, name, atp - name + 1);
1478                 VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
1479
1480                 dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
1481                 if (ddsa->ddsa_props != NULL) {
1482                         dsl_props_set_sync_impl(ds->ds_prev,
1483                             ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
1484                 }
1485                 dsl_dataset_rele(ds, FTAG);
1486         }
1487 }
1488
1489 /*
1490  * The snapshots must all be in the same pool.
1491  * All-or-nothing: if there are any failures, nothing will be modified.
1492  */
1493 int
1494 dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
1495 {
1496         dsl_dataset_snapshot_arg_t ddsa;
1497         nvpair_t *pair;
1498         boolean_t needsuspend;
1499         int error;
1500         spa_t *spa;
1501         char *firstname;
1502         nvlist_t *suspended = NULL;
1503
1504         pair = nvlist_next_nvpair(snaps, NULL);
1505         if (pair == NULL)
1506                 return (0);
1507         firstname = nvpair_name(pair);
1508
1509         error = spa_open(firstname, &spa, FTAG);
1510         if (error != 0)
1511                 return (error);
1512         needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
1513         spa_close(spa, FTAG);
1514
1515         if (needsuspend) {
1516                 suspended = fnvlist_alloc();
1517                 for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
1518                     pair = nvlist_next_nvpair(snaps, pair)) {
1519                         char fsname[MAXNAMELEN];
1520                         char *snapname = nvpair_name(pair);
1521                         char *atp;
1522                         void *cookie;
1523
1524                         atp = strchr(snapname, '@');
1525                         if (atp == NULL) {
1526                                 error = SET_ERROR(EINVAL);
1527                                 break;
1528                         }
1529                         (void) strlcpy(fsname, snapname, atp - snapname + 1);
1530
1531                         error = zil_suspend(fsname, &cookie);
1532                         if (error != 0)
1533                                 break;
1534                         fnvlist_add_uint64(suspended, fsname,
1535                             (uintptr_t)cookie);
1536                 }
1537         }
1538
1539         ddsa.ddsa_snaps = snaps;
1540         ddsa.ddsa_props = props;
1541         ddsa.ddsa_errors = errors;
1542         ddsa.ddsa_cr = CRED();
1543
1544         if (error == 0) {
1545                 error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
1546                     dsl_dataset_snapshot_sync, &ddsa,
1547                     fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
1548         }
1549
1550         if (suspended != NULL) {
1551                 for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
1552                     pair = nvlist_next_nvpair(suspended, pair)) {
1553                         zil_resume((void *)(uintptr_t)
1554                             fnvpair_value_uint64(pair));
1555                 }
1556                 fnvlist_free(suspended);
1557         }
1558
1559 #ifdef __FreeBSD__
1560 #ifdef _KERNEL
1561         if (error == 0) {
1562                 for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
1563                     pair = nvlist_next_nvpair(snaps, pair)) {
1564                         char *snapname = nvpair_name(pair);
1565                         zvol_create_minors(snapname);
1566                 }
1567         }
1568 #endif
1569 #endif
1570         return (error);
1571 }
1572
/*
 * Argument bundle shared by dsl_dataset_snapshot_tmp_check() and
 * dsl_dataset_snapshot_tmp_sync().
 */
typedef struct dsl_dataset_snapshot_tmp_arg {
        const char *ddsta_fsname;       /* dataset that gets snapshotted */
        const char *ddsta_snapname;     /* name given to the new snapshot */
        minor_t ddsta_cleanup_minor;    /* handed to the user-hold sync */
        const char *ddsta_htag;         /* tag of the user hold to place */
} dsl_dataset_snapshot_tmp_arg_t;
1579
1580 static int
1581 dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
1582 {
1583         dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
1584         dsl_pool_t *dp = dmu_tx_pool(tx);
1585         dsl_dataset_t *ds;
1586         int error;
1587
1588         error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
1589         if (error != 0)
1590                 return (error);
1591
1592         /* NULL cred means no limit check for tmp snapshot */
1593         error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
1594             tx, B_FALSE, 0, NULL);
1595         if (error != 0) {
1596                 dsl_dataset_rele(ds, FTAG);
1597                 return (error);
1598         }
1599
1600         if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
1601                 dsl_dataset_rele(ds, FTAG);
1602                 return (SET_ERROR(ENOTSUP));
1603         }
1604         error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
1605             B_TRUE, tx);
1606         if (error != 0) {
1607                 dsl_dataset_rele(ds, FTAG);
1608                 return (error);
1609         }
1610
1611         dsl_dataset_rele(ds, FTAG);
1612         return (0);
1613 }
1614
1615 static void
1616 dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
1617 {
1618         dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
1619         dsl_pool_t *dp = dmu_tx_pool(tx);
1620         dsl_dataset_t *ds;
1621
1622         VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
1623
1624         dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
1625         dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
1626             ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
1627         dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
1628
1629         dsl_dataset_rele(ds, FTAG);
1630 }
1631
1632 int
1633 dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
1634     minor_t cleanup_minor, const char *htag)
1635 {
1636         dsl_dataset_snapshot_tmp_arg_t ddsta;
1637         int error;
1638         spa_t *spa;
1639         boolean_t needsuspend;
1640         void *cookie;
1641
1642         ddsta.ddsta_fsname = fsname;
1643         ddsta.ddsta_snapname = snapname;
1644         ddsta.ddsta_cleanup_minor = cleanup_minor;
1645         ddsta.ddsta_htag = htag;
1646
1647         error = spa_open(fsname, &spa, FTAG);
1648         if (error != 0)
1649                 return (error);
1650         needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
1651         spa_close(spa, FTAG);
1652
1653         if (needsuspend) {
1654                 error = zil_suspend(fsname, &cookie);
1655                 if (error != 0)
1656                         return (error);
1657         }
1658
1659         error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
1660             dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
1661
1662         if (needsuspend)
1663                 zil_resume(cookie);
1664         return (error);
1665 }
1666
1667
1668 void
1669 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
1670 {
1671         ASSERT(dmu_tx_is_syncing(tx));
1672         ASSERT(ds->ds_objset != NULL);
1673         ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
1674
1675         /*
1676          * in case we had to change ds_fsid_guid when we opened it,
1677          * sync it out now.
1678          */
1679         dmu_buf_will_dirty(ds->ds_dbuf, tx);
1680         dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
1681
1682         if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) {
1683                 VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
1684                     ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1,
1685                     &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx));
1686                 VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
1687                     ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1,
1688                     &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx));
1689                 VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
1690                     ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1,
1691                     &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx));
1692                 ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0;
1693                 ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0;
1694                 ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
1695         }
1696
1697         dmu_objset_sync(ds->ds_objset, zio, tx);
1698
1699         for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
1700                 if (ds->ds_feature_activation_needed[f]) {
1701                         if (ds->ds_feature_inuse[f])
1702                                 continue;
1703                         dsl_dataset_activate_feature(ds->ds_object, f, tx);
1704                         ds->ds_feature_inuse[f] = B_TRUE;
1705                 }
1706         }
1707 }
1708
1709 static void
1710 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
1711 {
1712         uint64_t count = 0;
1713         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1714         zap_cursor_t zc;
1715         zap_attribute_t za;
1716         nvlist_t *propval = fnvlist_alloc();
1717         nvlist_t *val = fnvlist_alloc();
1718
1719         ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
1720
1721         /*
1722          * There may be missing entries in ds_next_clones_obj
1723          * due to a bug in a previous version of the code.
1724          * Only trust it if it has the right number of entries.
1725          */
1726         if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
1727                 VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
1728                     &count));
1729         }
1730         if (count != dsl_dataset_phys(ds)->ds_num_children - 1)
1731                 goto fail;
1732         for (zap_cursor_init(&zc, mos,
1733             dsl_dataset_phys(ds)->ds_next_clones_obj);
1734             zap_cursor_retrieve(&zc, &za) == 0;
1735             zap_cursor_advance(&zc)) {
1736                 dsl_dataset_t *clone;
1737                 char buf[ZFS_MAXNAMELEN];
1738                 VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
1739                     za.za_first_integer, FTAG, &clone));
1740                 dsl_dir_name(clone->ds_dir, buf);
1741                 fnvlist_add_boolean(val, buf);
1742                 dsl_dataset_rele(clone, FTAG);
1743         }
1744         zap_cursor_fini(&zc);
1745         fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
1746         fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval);
1747 fail:
1748         nvlist_free(val);
1749         nvlist_free(propval);
1750 }
1751
1752 static void
1753 get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
1754 {
1755         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1756
1757         if (dsl_dataset_has_resume_receive_state(ds)) {
1758                 char *str;
1759                 void *packed;
1760                 uint8_t *compressed;
1761                 uint64_t val;
1762                 nvlist_t *token_nv = fnvlist_alloc();
1763                 size_t packed_size, compressed_size;
1764
1765                 if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1766                     DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
1767                         fnvlist_add_uint64(token_nv, "fromguid", val);
1768                 }
1769                 if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1770                     DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
1771                         fnvlist_add_uint64(token_nv, "object", val);
1772                 }
1773                 if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1774                     DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
1775                         fnvlist_add_uint64(token_nv, "offset", val);
1776                 }
1777                 if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1778                     DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
1779                         fnvlist_add_uint64(token_nv, "bytes", val);
1780                 }
1781                 if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1782                     DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
1783                         fnvlist_add_uint64(token_nv, "toguid", val);
1784                 }
1785                 char buf[256];
1786                 if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1787                     DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
1788                         fnvlist_add_string(token_nv, "toname", buf);
1789                 }
1790                 if (zap_contains(dp->dp_meta_objset, ds->ds_object,
1791                     DS_FIELD_RESUME_EMBEDOK) == 0) {
1792                         fnvlist_add_boolean(token_nv, "embedok");
1793                 }
1794                 packed = fnvlist_pack(token_nv, &packed_size);
1795                 fnvlist_free(token_nv);
1796                 compressed = kmem_alloc(packed_size, KM_SLEEP);
1797
1798                 compressed_size = gzip_compress(packed, compressed,
1799                     packed_size, packed_size, 6);
1800
1801                 zio_cksum_t cksum;
1802                 fletcher_4_native(compressed, compressed_size, NULL, &cksum);
1803
1804                 str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP);
1805                 for (int i = 0; i < compressed_size; i++) {
1806                         (void) sprintf(str + i * 2, "%02x", compressed[i]);
1807                 }
1808                 str[compressed_size * 2] = '\0';
1809                 char *propval = kmem_asprintf("%u-%llx-%llx-%s",
1810                     ZFS_SEND_RESUME_TOKEN_VERSION,
1811                     (longlong_t)cksum.zc_word[0],
1812                     (longlong_t)packed_size, str);
1813                 dsl_prop_nvlist_add_string(nv,
1814                     ZFS_PROP_RECEIVE_RESUME_TOKEN, propval);
1815                 kmem_free(packed, packed_size);
1816                 kmem_free(str, compressed_size * 2 + 1);
1817                 kmem_free(compressed, packed_size);
1818                 strfree(propval);
1819         }
1820 }
1821
1822 void
1823 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
1824 {
1825         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1826         uint64_t refd, avail, uobjs, aobjs, ratio;
1827
1828         ASSERT(dsl_pool_config_held(dp));
1829
1830         ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
1831             (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
1832             dsl_dataset_phys(ds)->ds_compressed_bytes);
1833
1834         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
1835         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
1836             dsl_dataset_phys(ds)->ds_uncompressed_bytes);
1837
1838         if (ds->ds_is_snapshot) {
1839                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
1840                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
1841                     dsl_dataset_phys(ds)->ds_unique_bytes);
1842                 get_clones_stat(ds, nv);
1843         } else {
1844                 if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
1845                         char buf[MAXNAMELEN];
1846                         dsl_dataset_name(ds->ds_prev, buf);
1847                         dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, buf);
1848                 }
1849
1850                 dsl_dir_stats(ds->ds_dir, nv);
1851         }
1852
1853         dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
1854         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
1855         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
1856
1857         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
1858             dsl_dataset_phys(ds)->ds_creation_time);
1859         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
1860             dsl_dataset_phys(ds)->ds_creation_txg);
1861         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
1862             ds->ds_quota);
1863         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
1864             ds->ds_reserved);
1865         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
1866             dsl_dataset_phys(ds)->ds_guid);
1867         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
1868             dsl_dataset_phys(ds)->ds_unique_bytes);
1869         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
1870             ds->ds_object);
1871         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
1872             ds->ds_userrefs);
1873         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
1874             DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
1875
1876         if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
1877                 uint64_t written, comp, uncomp;
1878                 dsl_pool_t *dp = ds->ds_dir->dd_pool;
1879                 dsl_dataset_t *prev;
1880
1881                 int err = dsl_dataset_hold_obj(dp,
1882                     dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
1883                 if (err == 0) {
1884                         err = dsl_dataset_space_written(prev, ds, &written,
1885                             &comp, &uncomp);
1886                         dsl_dataset_rele(prev, FTAG);
1887                         if (err == 0) {
1888                                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
1889                                     written);
1890                         }
1891                 }
1892         }
1893
1894         if (!dsl_dataset_is_snapshot(ds)) {
1895                 /*
1896                  * A failed "newfs" (e.g. full) resumable receive leaves
1897                  * the stats set on this dataset.  Check here for the prop.
1898                  */
1899                 get_receive_resume_stats(ds, nv);
1900
1901                 /*
1902                  * A failed incremental resumable receive leaves the
1903                  * stats set on our child named "%recv".  Check the child
1904                  * for the prop.
1905                  */
1906                 char recvname[ZFS_MAXNAMELEN];
1907                 dsl_dataset_t *recv_ds;
1908                 dsl_dataset_name(ds, recvname);
1909                 (void) strcat(recvname, "/");
1910                 (void) strcat(recvname, recv_clone_name);
1911                 if (dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) {
1912                         get_receive_resume_stats(recv_ds, nv);
1913                         dsl_dataset_rele(recv_ds, FTAG);
1914                 }
1915         }
1916 }
1917
1918 void
1919 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
1920 {
1921         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1922         ASSERT(dsl_pool_config_held(dp));
1923
1924         stat->dds_creation_txg = dsl_dataset_phys(ds)->ds_creation_txg;
1925         stat->dds_inconsistent =
1926             dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT;
1927         stat->dds_guid = dsl_dataset_phys(ds)->ds_guid;
1928         stat->dds_origin[0] = '\0';
1929         if (ds->ds_is_snapshot) {
1930                 stat->dds_is_snapshot = B_TRUE;
1931                 stat->dds_num_clones =
1932                     dsl_dataset_phys(ds)->ds_num_children - 1;
1933         } else {
1934                 stat->dds_is_snapshot = B_FALSE;
1935                 stat->dds_num_clones = 0;
1936
1937                 if (dsl_dir_is_clone(ds->ds_dir)) {
1938                         dsl_dataset_t *ods;
1939
1940                         VERIFY0(dsl_dataset_hold_obj(dp,
1941                             dsl_dir_phys(ds->ds_dir)->dd_origin_obj,
1942                             FTAG, &ods));
1943                         dsl_dataset_name(ods, stat->dds_origin);
1944                         dsl_dataset_rele(ods, FTAG);
1945                 }
1946         }
1947 }
1948
1949 uint64_t
1950 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
1951 {
1952         return (ds->ds_fsid_guid);
1953 }
1954
1955 void
1956 dsl_dataset_space(dsl_dataset_t *ds,
1957     uint64_t *refdbytesp, uint64_t *availbytesp,
1958     uint64_t *usedobjsp, uint64_t *availobjsp)
1959 {
1960         *refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
1961         *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
1962         if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
1963                 *availbytesp +=
1964                     ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
1965         if (ds->ds_quota != 0) {
1966                 /*
1967                  * Adjust available bytes according to refquota
1968                  */
1969                 if (*refdbytesp < ds->ds_quota)
1970                         *availbytesp = MIN(*availbytesp,
1971                             ds->ds_quota - *refdbytesp);
1972                 else
1973                         *availbytesp = 0;
1974         }
1975         *usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
1976         *availobjsp = DN_MAX_OBJECT - *usedobjsp;
1977 }
1978
1979 boolean_t
1980 dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
1981 {
1982         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1983
1984         ASSERT(dsl_pool_config_held(dp));
1985         if (snap == NULL)
1986                 return (B_FALSE);
1987         if (dsl_dataset_phys(ds)->ds_bp.blk_birth >
1988             dsl_dataset_phys(snap)->ds_creation_txg) {
1989                 objset_t *os, *os_snap;
1990                 /*
1991                  * It may be that only the ZIL differs, because it was
1992                  * reset in the head.  Don't count that as being
1993                  * modified.
1994                  */
1995                 if (dmu_objset_from_ds(ds, &os) != 0)
1996                         return (B_TRUE);
1997                 if (dmu_objset_from_ds(snap, &os_snap) != 0)
1998                         return (B_TRUE);
1999                 return (bcmp(&os->os_phys->os_meta_dnode,
2000                     &os_snap->os_phys->os_meta_dnode,
2001                     sizeof (os->os_phys->os_meta_dnode)) != 0);
2002         }
2003         return (B_FALSE);
2004 }
2005
/*
 * Argument bundle for the snapshot-rename sync task and its per-dataset
 * check/sync callbacks.
 */
typedef struct dsl_dataset_rename_snapshot_arg {
        const char *ddrsa_fsname;       /* dataset whose snapshot is renamed */
        const char *ddrsa_oldsnapname;  /* current snapshot name */
        const char *ddrsa_newsnapname;  /* snapshot name to rename to */
        boolean_t ddrsa_recursive;      /* also visit DS_FIND_CHILDREN */
        dmu_tx_t *ddrsa_tx;             /* set by the sync func for its impl */
} dsl_dataset_rename_snapshot_arg_t;
2013
2014 /* ARGSUSED */
2015 static int
2016 dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
2017     dsl_dataset_t *hds, void *arg)
2018 {
2019         dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
2020         int error;
2021         uint64_t val;
2022
2023         error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
2024         if (error != 0) {
2025                 /* ignore nonexistent snapshots */
2026                 return (error == ENOENT ? 0 : error);
2027         }
2028
2029         /* new name should not exist */
2030         error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
2031         if (error == 0)
2032                 error = SET_ERROR(EEXIST);
2033         else if (error == ENOENT)
2034                 error = 0;
2035
2036         /* dataset name + 1 for the "@" + the new snapshot name must fit */
2037         if (dsl_dir_namelen(hds->ds_dir) + 1 +
2038             strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN)
2039                 error = SET_ERROR(ENAMETOOLONG);
2040
2041         return (error);
2042 }
2043
2044 static int
2045 dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
2046 {
2047         dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
2048         dsl_pool_t *dp = dmu_tx_pool(tx);
2049         dsl_dataset_t *hds;
2050         int error;
2051
2052         error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
2053         if (error != 0)
2054                 return (error);
2055
2056         if (ddrsa->ddrsa_recursive) {
2057                 error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
2058                     dsl_dataset_rename_snapshot_check_impl, ddrsa,
2059                     DS_FIND_CHILDREN);
2060         } else {
2061                 error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
2062         }
2063         dsl_dataset_rele(hds, FTAG);
2064         return (error);
2065 }
2066
2067 static int
2068 dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
2069     dsl_dataset_t *hds, void *arg)
2070 {
2071 #ifdef __FreeBSD__
2072 #ifdef _KERNEL
2073         char *oldname, *newname;
2074 #endif
2075 #endif
2076         dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
2077         dsl_dataset_t *ds;
2078         uint64_t val;
2079         dmu_tx_t *tx = ddrsa->ddrsa_tx;
2080         int error;
2081
2082         error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
2083         ASSERT(error == 0 || error == ENOENT);
2084         if (error == ENOENT) {
2085                 /* ignore nonexistent snapshots */
2086                 return (0);
2087         }
2088
2089         VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
2090
2091         /* log before we change the name */
2092         spa_history_log_internal_ds(ds, "rename", tx,
2093             "-> @%s", ddrsa->ddrsa_newsnapname);
2094
2095         VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
2096             B_FALSE));
2097         mutex_enter(&ds->ds_lock);
2098         (void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
2099         mutex_exit(&ds->ds_lock);
2100         VERIFY0(zap_add(dp->dp_meta_objset,
2101             dsl_dataset_phys(hds)->ds_snapnames_zapobj,
2102             ds->ds_snapname, 8, 1, &ds->ds_object, tx));
2103
2104 #ifdef __FreeBSD__
2105 #ifdef _KERNEL
2106         oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2107         newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2108         snprintf(oldname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
2109             ddrsa->ddrsa_oldsnapname);
2110         snprintf(newname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
2111             ddrsa->ddrsa_newsnapname);
2112         zfsvfs_update_fromname(oldname, newname);
2113         zvol_rename_minors(oldname, newname);
2114         kmem_free(newname, MAXPATHLEN);
2115         kmem_free(oldname, MAXPATHLEN);
2116 #endif
2117 #endif
2118         dsl_dataset_rele(ds, FTAG);
2119
2120         return (0);
2121 }
2122
2123 static void
2124 dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
2125 {
2126         dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
2127         dsl_pool_t *dp = dmu_tx_pool(tx);
2128         dsl_dataset_t *hds;
2129
2130         VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
2131         ddrsa->ddrsa_tx = tx;
2132         if (ddrsa->ddrsa_recursive) {
2133                 VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
2134                     dsl_dataset_rename_snapshot_sync_impl, ddrsa,
2135                     DS_FIND_CHILDREN));
2136         } else {
2137                 VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
2138         }
2139         dsl_dataset_rele(hds, FTAG);
2140 }
2141
2142 int
2143 dsl_dataset_rename_snapshot(const char *fsname,
2144     const char *oldsnapname, const char *newsnapname, boolean_t recursive)
2145 {
2146         dsl_dataset_rename_snapshot_arg_t ddrsa;
2147
2148         ddrsa.ddrsa_fsname = fsname;
2149         ddrsa.ddrsa_oldsnapname = oldsnapname;
2150         ddrsa.ddrsa_newsnapname = newsnapname;
2151         ddrsa.ddrsa_recursive = recursive;
2152
2153         return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
2154             dsl_dataset_rename_snapshot_sync, &ddrsa,
2155             1, ZFS_SPACE_CHECK_RESERVED));
2156 }
2157
2158 /*
2159  * If we're doing an ownership handoff, we need to make sure that there is
2160  * only one long hold on the dataset.  We're not allowed to change anything here
2161  * so we don't permanently release the long hold or regular hold here.  We want
2162  * to do this only when syncing to avoid the dataset unexpectedly going away
2163  * when we release the long hold.
2164  */
2165 static int
2166 dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
2167 {
2168         boolean_t held;
2169
2170         if (!dmu_tx_is_syncing(tx))
2171                 return (0);
2172
2173         if (owner != NULL) {
2174                 VERIFY3P(ds->ds_owner, ==, owner);
2175                 dsl_dataset_long_rele(ds, owner);
2176         }
2177
2178         held = dsl_dataset_long_held(ds);
2179
2180         if (owner != NULL)
2181                 dsl_dataset_long_hold(ds, owner);
2182
2183         if (held)
2184                 return (SET_ERROR(EBUSY));
2185
2186         return (0);
2187 }
2188
/*
 * Argument bundle for the rollback sync task.
 */
typedef struct dsl_dataset_rollback_arg {
        const char *ddra_fsname;        /* dataset to roll back */
        void *ddra_owner;               /* owner for the handoff check */
        nvlist_t *ddra_result;          /* receives the "target" snap name */
} dsl_dataset_rollback_arg_t;
2194
2195 static int
2196 dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
2197 {
2198         dsl_dataset_rollback_arg_t *ddra = arg;
2199         dsl_pool_t *dp = dmu_tx_pool(tx);
2200         dsl_dataset_t *ds;
2201         int64_t unused_refres_delta;
2202         int error;
2203
2204         error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
2205         if (error != 0)
2206                 return (error);
2207
2208         /* must not be a snapshot */
2209         if (ds->ds_is_snapshot) {
2210                 dsl_dataset_rele(ds, FTAG);
2211                 return (SET_ERROR(EINVAL));
2212         }
2213
2214         /* must have a most recent snapshot */
2215         if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
2216                 dsl_dataset_rele(ds, FTAG);
2217                 return (SET_ERROR(EINVAL));
2218         }
2219
2220         /* must not have any bookmarks after the most recent snapshot */
2221         nvlist_t *proprequest = fnvlist_alloc();
2222         fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
2223         nvlist_t *bookmarks = fnvlist_alloc();
2224         error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
2225         fnvlist_free(proprequest);
2226         if (error != 0)
2227                 return (error);
2228         for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL);
2229             pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
2230                 nvlist_t *valuenv =
2231                     fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
2232                     zfs_prop_to_name(ZFS_PROP_CREATETXG));
2233                 uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
2234                 if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
2235                         fnvlist_free(bookmarks);
2236                         dsl_dataset_rele(ds, FTAG);
2237                         return (SET_ERROR(EEXIST));
2238                 }
2239         }
2240         fnvlist_free(bookmarks);
2241
2242         error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
2243         if (error != 0) {
2244                 dsl_dataset_rele(ds, FTAG);
2245                 return (error);
2246         }
2247
2248         /*
2249          * Check if the snap we are rolling back to uses more than
2250          * the refquota.
2251          */
2252         if (ds->ds_quota != 0 &&
2253             dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
2254                 dsl_dataset_rele(ds, FTAG);
2255                 return (SET_ERROR(EDQUOT));
2256         }
2257
2258         /*
2259          * When we do the clone swap, we will temporarily use more space
2260          * due to the refreservation (the head will no longer have any
2261          * unique space, so the entire amount of the refreservation will need
2262          * to be free).  We will immediately destroy the clone, freeing
2263          * this space, but the freeing happens over many txg's.
2264          */
2265         unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
2266             dsl_dataset_phys(ds)->ds_unique_bytes);
2267
2268         if (unused_refres_delta > 0 &&
2269             unused_refres_delta >
2270             dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
2271                 dsl_dataset_rele(ds, FTAG);
2272                 return (SET_ERROR(ENOSPC));
2273         }
2274
2275         dsl_dataset_rele(ds, FTAG);
2276         return (0);
2277 }
2278
2279 static void
2280 dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
2281 {
2282         dsl_dataset_rollback_arg_t *ddra = arg;
2283         dsl_pool_t *dp = dmu_tx_pool(tx);
2284         dsl_dataset_t *ds, *clone;
2285         uint64_t cloneobj;
2286         char namebuf[ZFS_MAXNAMELEN];
2287
2288         VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
2289
2290         dsl_dataset_name(ds->ds_prev, namebuf);
2291         fnvlist_add_string(ddra->ddra_result, "target", namebuf);
2292
2293         cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
2294             ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
2295
2296         VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
2297
2298         dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
2299         dsl_dataset_zero_zil(ds, tx);
2300
2301         dsl_destroy_head_sync_impl(clone, tx);
2302
2303         dsl_dataset_rele(clone, FTAG);
2304         dsl_dataset_rele(ds, FTAG);
2305 }
2306
2307 /*
2308  * Rolls back the given filesystem or volume to the most recent snapshot.
2309  * The name of the most recent snapshot will be returned under key "target"
2310  * in the result nvlist.
2311  *
2312  * If owner != NULL:
2313  * - The existing dataset MUST be owned by the specified owner at entry
2314  * - Upon return, dataset will still be held by the same owner, whether we
2315  *   succeed or not.
2316  *
2317  * This mode is required any time the existing filesystem is mounted.  See
2318  * notes above zfs_suspend_fs() for further details.
2319  */
2320 int
2321 dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result)
2322 {
2323         dsl_dataset_rollback_arg_t ddra;
2324
2325         ddra.ddra_fsname = fsname;
2326         ddra.ddra_owner = owner;
2327         ddra.ddra_result = result;
2328
2329         return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
2330             dsl_dataset_rollback_sync, &ddra,
2331             1, ZFS_SPACE_CHECK_RESERVED));
2332 }
2333
/*
 * Element of the snapshot lists kept in dsl_dataset_promote_arg_t.
 */
struct promotenode {
        list_node_t link;       /* list linkage */
        dsl_dataset_t *ds;      /* dataset this element refers to */
};
2338
/*
 * Argument bundle for the promote sync task.
 */
typedef struct dsl_dataset_promote_arg {
        const char *ddpa_clonename;     /* name of the clone being promoted */
        dsl_dataset_t *ddpa_clone;      /* the clone, once held */
        /* lists of struct promotenode */
        list_t shared_snaps, origin_snaps, clone_snaps;
        dsl_dataset_t *origin_origin; /* origin of the origin */
        /* space accounting; "unique" is the origin's new unique space */
        uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
        char *err_ds;   /* NOTE(review): presumably error output - confirm */
        cred_t *cr;     /* NOTE(review): presumably caller's cred - confirm */
} dsl_dataset_promote_arg_t;
2348
2349 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
2350 static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
2351     void *tag);
2352 static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
2353
2354 static int
2355 dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
2356 {
2357         dsl_dataset_promote_arg_t *ddpa = arg;
2358         dsl_pool_t *dp = dmu_tx_pool(tx);
2359         dsl_dataset_t *hds;
2360         struct promotenode *snap;
2361         dsl_dataset_t *origin_ds;
2362         int err;
2363         uint64_t unused;
2364         uint64_t ss_mv_cnt;
2365         size_t max_snap_len;
2366
2367         err = promote_hold(ddpa, dp, FTAG);
2368         if (err != 0)
2369                 return (err);
2370
2371         hds = ddpa->ddpa_clone;
2372         max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;
2373
2374         if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
2375                 promote_rele(ddpa, FTAG);
2376                 return (SET_ERROR(EXDEV));
2377         }
2378
2379         /*
2380          * Compute and check the amount of space to transfer.  Since this is
2381          * so expensive, don't do the preliminary check.
2382          */
2383         if (!dmu_tx_is_syncing(tx)) {
2384                 promote_rele(ddpa, FTAG);
2385                 return (0);
2386         }
2387
2388         snap = list_head(&ddpa->shared_snaps);
2389         origin_ds = snap->ds;
2390
2391         /* compute origin's new unique space */
2392         snap = list_tail(&ddpa->clone_snaps);
2393         ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
2394             origin_ds->ds_object);
2395         dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2396             dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
2397             &ddpa->unique, &unused, &unused);
2398
2399         /*
2400          * Walk the snapshots that we are moving
2401          *
2402          * Compute space to transfer.  Consider the incremental changes
2403          * to used by each snapshot:
2404          * (my used) = (prev's used) + (blocks born) - (blocks killed)
2405          * So each snapshot gave birth to:
2406          * (blocks born) = (my used) - (prev's used) + (blocks killed)
2407          * So a sequence would look like:
2408          * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
2409          * Which simplifies to:
2410          * uN + kN + kN-1 + ... + k1 + k0
2411          * Note however, if we stop before we reach the ORIGIN we get:
2412          * uN + kN + kN-1 + ... + kM - uM-1
2413          */
2414         ss_mv_cnt = 0;
2415         ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
2416         ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
2417         ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
2418         for (snap = list_head(&ddpa->shared_snaps); snap;
2419             snap = list_next(&ddpa->shared_snaps, snap)) {
2420                 uint64_t val, dlused, dlcomp, dluncomp;
2421                 dsl_dataset_t *ds = snap->ds;
2422
2423                 ss_mv_cnt++;
2424
2425                 /*
2426                  * If there are long holds, we won't be able to evict
2427                  * the objset.
2428                  */
2429                 if (dsl_dataset_long_held(ds)) {
2430                         err = SET_ERROR(EBUSY);
2431                         goto out;
2432                 }
2433
2434                 /* Check that the snapshot name does not conflict */
2435                 VERIFY0(dsl_dataset_get_snapname(ds));
2436                 if (strlen(ds->ds_snapname) >= max_snap_len) {
2437                         err = SET_ERROR(ENAMETOOLONG);
2438                         goto out;
2439                 }
2440                 err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
2441                 if (err == 0) {
2442                         (void) strcpy(ddpa->err_ds, snap->ds->ds_snapname);
2443                         err = SET_ERROR(EEXIST);
2444                         goto out;
2445                 }
2446                 if (err != ENOENT)
2447                         goto out;
2448
2449                 /* The very first snapshot does not have a deadlist */
2450                 if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
2451                         continue;
2452
2453                 dsl_deadlist_space(&ds->ds_deadlist,
2454                     &dlused, &dlcomp, &dluncomp);
2455                 ddpa->used += dlused;
2456                 ddpa->comp += dlcomp;
2457                 ddpa->uncomp += dluncomp;
2458         }
2459
2460         /*
2461          * If we are a clone of a clone then we never reached ORIGIN,
2462          * so we need to subtract out the clone origin's used space.
2463          */
2464         if (ddpa->origin_origin) {
2465                 ddpa->used -=
2466                     dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
2467                 ddpa->comp -=
2468                     dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
2469                 ddpa->uncomp -=
2470                     dsl_dataset_phys(ddpa->origin_origin)->
2471                     ds_uncompressed_bytes;
2472         }
2473
2474         /* Check that there is enough space and limit headroom here */
2475         err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
2476             0, ss_mv_cnt, ddpa->used, ddpa->cr);
2477         if (err != 0)
2478                 goto out;
2479
2480         /*
2481          * Compute the amounts of space that will be used by snapshots
2482          * after the promotion (for both origin and clone).  For each,
2483          * it is the amount of space that will be on all of their
2484          * deadlists (that was not born before their new origin).
2485          */
2486         if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
2487                 uint64_t space;
2488
2489                 /*
2490                  * Note, typically this will not be a clone of a clone,
2491                  * so dd_origin_txg will be < TXG_INITIAL, so
2492                  * these snaplist_space() -> dsl_deadlist_space_range()
2493                  * calls will be fast because they do not have to
2494                  * iterate over all bps.
2495                  */
2496                 snap = list_head(&ddpa->origin_snaps);
2497                 err = snaplist_space(&ddpa->shared_snaps,
2498                     snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
2499                 if (err != 0)
2500                         goto out;
2501
2502                 err = snaplist_space(&ddpa->clone_snaps,
2503                     snap->ds->ds_dir->dd_origin_txg, &space);
2504                 if (err != 0)
2505                         goto out;
2506                 ddpa->cloneusedsnap += space;
2507         }
2508         if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
2509             DD_FLAG_USED_BREAKDOWN) {
2510                 err = snaplist_space(&ddpa->origin_snaps,
2511                     dsl_dataset_phys(origin_ds)->ds_creation_txg,
2512                     &ddpa->originusedsnap);
2513                 if (err != 0)
2514                         goto out;
2515         }
2516
2517 out:
2518         promote_rele(ddpa, FTAG);
2519         return (err);
2520 }
2521
2522 static void
2523 dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
2524 {
2525         dsl_dataset_promote_arg_t *ddpa = arg;
2526         dsl_pool_t *dp = dmu_tx_pool(tx);
2527         dsl_dataset_t *hds;
2528         struct promotenode *snap;
2529         dsl_dataset_t *origin_ds;
2530         dsl_dataset_t *origin_head;
2531         dsl_dir_t *dd;
2532         dsl_dir_t *odd = NULL;
2533         uint64_t oldnext_obj;
2534         int64_t delta;
2535 #if defined(__FreeBSD__) && defined(_KERNEL)
2536         char *oldname, *newname;
2537 #endif
2538
2539         VERIFY0(promote_hold(ddpa, dp, FTAG));
2540         hds = ddpa->ddpa_clone;
2541
2542         ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
2543
2544         snap = list_head(&ddpa->shared_snaps);
2545         origin_ds = snap->ds;
2546         dd = hds->ds_dir;
2547
2548         snap = list_head(&ddpa->origin_snaps);
2549         origin_head = snap->ds;
2550
2551         /*
2552          * We need to explicitly open odd, since origin_ds's dd will be
2553          * changing.
2554          */
2555         VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
2556             NULL, FTAG, &odd));
2557
2558         /* change origin's next snap */
2559         dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
2560         oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
2561         snap = list_tail(&ddpa->clone_snaps);
2562         ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
2563             origin_ds->ds_object);
2564         dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
2565
2566         /* change the origin's next clone */
2567         if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
2568                 dsl_dataset_remove_from_next_clones(origin_ds,
2569                     snap->ds->ds_object, tx);
2570                 VERIFY0(zap_add_int(dp->dp_meta_objset,
2571                     dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
2572                     oldnext_obj, tx));
2573         }
2574
2575         /* change origin */
2576         dmu_buf_will_dirty(dd->dd_dbuf, tx);
2577         ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
2578         dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
2579         dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
2580         dmu_buf_will_dirty(odd->dd_dbuf, tx);
2581         dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
2582         origin_head->ds_dir->dd_origin_txg =
2583             dsl_dataset_phys(origin_ds)->ds_creation_txg;
2584
2585         /* change dd_clone entries */
2586         if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2587                 VERIFY0(zap_remove_int(dp->dp_meta_objset,
2588                     dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
2589                 VERIFY0(zap_add_int(dp->dp_meta_objset,
2590                     dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
2591                     hds->ds_object, tx));
2592
2593                 VERIFY0(zap_remove_int(dp->dp_meta_objset,
2594                     dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
2595                     origin_head->ds_object, tx));
2596                 if (dsl_dir_phys(dd)->dd_clones == 0) {
2597                         dsl_dir_phys(dd)->dd_clones =
2598                             zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
2599                             DMU_OT_NONE, 0, tx);
2600                 }
2601                 VERIFY0(zap_add_int(dp->dp_meta_objset,
2602                     dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
2603         }
2604
2605 #if defined(__FreeBSD__) && defined(_KERNEL)
2606         /* Take the spa_namespace_lock early so zvol renames don't deadlock. */
2607         mutex_enter(&spa_namespace_lock);
2608
2609         oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2610         newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2611 #endif
2612
2613         /* move snapshots to this dir */
2614         for (snap = list_head(&ddpa->shared_snaps); snap;
2615             snap = list_next(&ddpa->shared_snaps, snap)) {
2616                 dsl_dataset_t *ds = snap->ds;
2617
2618                 /*
2619                  * Property callbacks are registered to a particular
2620                  * dsl_dir.  Since ours is changing, evict the objset
2621                  * so that they will be unregistered from the old dsl_dir.
2622                  */
2623                 if (ds->ds_objset) {
2624                         dmu_objset_evict(ds->ds_objset);
2625                         ds->ds_objset = NULL;
2626                 }
2627
2628                 /* move snap name entry */
2629                 VERIFY0(dsl_dataset_get_snapname(ds));
2630                 VERIFY0(dsl_dataset_snap_remove(origin_head,
2631                     ds->ds_snapname, tx, B_TRUE));
2632                 VERIFY0(zap_add(dp->dp_meta_objset,
2633                     dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
2634                     8, 1, &ds->ds_object, tx));
2635                 dsl_fs_ss_count_adjust(hds->ds_dir, 1,
2636                     DD_FIELD_SNAPSHOT_COUNT, tx);
2637
2638                 /* change containing dsl_dir */
2639                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2640                 ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
2641                 dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
2642                 ASSERT3P(ds->ds_dir, ==, odd);
2643                 dsl_dir_rele(ds->ds_dir, ds);
2644                 VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
2645                     NULL, ds, &ds->ds_dir));
2646
2647 #if defined(__FreeBSD__) && defined(_KERNEL)
2648                 dsl_dataset_name(ds, newname);
2649                 zfsvfs_update_fromname(oldname, newname);
2650                 zvol_rename_minors(oldname, newname);
2651 #endif
2652
2653                 /* move any clone references */
2654                 if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
2655                     spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2656                         zap_cursor_t zc;
2657                         zap_attribute_t za;
2658
2659                         for (zap_cursor_init(&zc, dp->dp_meta_objset,
2660                             dsl_dataset_phys(ds)->ds_next_clones_obj);
2661                             zap_cursor_retrieve(&zc, &za) == 0;
2662                             zap_cursor_advance(&zc)) {
2663                                 dsl_dataset_t *cnds;
2664                                 uint64_t o;
2665
2666                                 if (za.za_first_integer == oldnext_obj) {
2667                                         /*
2668                                          * We've already moved the
2669                                          * origin's reference.
2670                                          */
2671                                         continue;
2672                                 }
2673
2674                                 VERIFY0(dsl_dataset_hold_obj(dp,
2675                                     za.za_first_integer, FTAG, &cnds));
2676                                 o = dsl_dir_phys(cnds->ds_dir)->
2677                                     dd_head_dataset_obj;
2678
2679                                 VERIFY0(zap_remove_int(dp->dp_meta_objset,
2680                                     dsl_dir_phys(odd)->dd_clones, o, tx));
2681                                 VERIFY0(zap_add_int(dp->dp_meta_objset,
2682                                     dsl_dir_phys(dd)->dd_clones, o, tx));
2683                                 dsl_dataset_rele(cnds, FTAG);
2684                         }
2685                         zap_cursor_fini(&zc);
2686                 }
2687
2688                 ASSERT(!dsl_prop_hascb(ds));
2689         }
2690
2691 #if defined(__FreeBSD__) && defined(_KERNEL)
2692         mutex_exit(&spa_namespace_lock);
2693
2694         kmem_free(newname, MAXPATHLEN);
2695         kmem_free(oldname, MAXPATHLEN);
2696 #endif
2697         /*
2698          * Change space accounting.
2699          * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
2700          * both be valid, or both be 0 (resulting in delta == 0).  This
2701          * is true for each of {clone,origin} independently.
2702          */
2703
2704         delta = ddpa->cloneusedsnap -
2705             dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
2706         ASSERT3S(delta, >=, 0);
2707         ASSERT3U(ddpa->used, >=, delta);
2708         dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
2709         dsl_dir_diduse_space(dd, DD_USED_HEAD,
2710             ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
2711
2712         delta = ddpa->originusedsnap -
2713             dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
2714         ASSERT3S(delta, <=, 0);
2715         ASSERT3U(ddpa->used, >=, -delta);
2716         dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
2717         dsl_dir_diduse_space(odd, DD_USED_HEAD,
2718             -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
2719
2720         dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
2721
2722         /* log history record */
2723         spa_history_log_internal_ds(hds, "promote", tx, "");
2724
2725         dsl_dir_rele(odd, FTAG);
2726         promote_rele(ddpa, FTAG);
2727 }
2728
2729 /*
2730  * Make a list of dsl_dataset_t's for the snapshots between first_obj
2731  * (exclusive) and last_obj (inclusive).  The list will be in reverse
2732  * order (last_obj will be the list_head()).  If first_obj == 0, do all
2733  * snapshots back to this dataset's origin.
2734  */
2735 static int
2736 snaplist_make(dsl_pool_t *dp,
2737     uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
2738 {
2739         uint64_t obj = last_obj;
2740
2741         list_create(l, sizeof (struct promotenode),
2742             offsetof(struct promotenode, link));
2743
2744         while (obj != first_obj) {
2745                 dsl_dataset_t *ds;
2746                 struct promotenode *snap;
2747                 int err;
2748
2749                 err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
2750                 ASSERT(err != ENOENT);
2751                 if (err != 0)
2752                         return (err);
2753
2754                 if (first_obj == 0)
2755                         first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;
2756
2757                 snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
2758                 snap->ds = ds;
2759                 list_insert_tail(l, snap);
2760                 obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
2761         }
2762
2763         return (0);
2764 }
2765
2766 static int
2767 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
2768 {
2769         struct promotenode *snap;
2770
2771         *spacep = 0;
2772         for (snap = list_head(l); snap; snap = list_next(l, snap)) {
2773                 uint64_t used, comp, uncomp;
2774                 dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2775                     mintxg, UINT64_MAX, &used, &comp, &uncomp);
2776                 *spacep += used;
2777         }
2778         return (0);
2779 }
2780
2781 static void
2782 snaplist_destroy(list_t *l, void *tag)
2783 {
2784         struct promotenode *snap;
2785
2786         if (l == NULL || !list_link_active(&l->list_head))
2787                 return;
2788
2789         while ((snap = list_tail(l)) != NULL) {
2790                 list_remove(l, snap);
2791                 dsl_dataset_rele(snap->ds, tag);
2792                 kmem_free(snap, sizeof (*snap));
2793         }
2794         list_destroy(l);
2795 }
2796
2797 static int
2798 promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
2799 {
2800         int error;
2801         dsl_dir_t *dd;
2802         struct promotenode *snap;
2803
2804         error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
2805             &ddpa->ddpa_clone);
2806         if (error != 0)
2807                 return (error);
2808         dd = ddpa->ddpa_clone->ds_dir;
2809
2810         if (ddpa->ddpa_clone->ds_is_snapshot ||
2811             !dsl_dir_is_clone(dd)) {
2812                 dsl_dataset_rele(ddpa->ddpa_clone, tag);
2813                 return (SET_ERROR(EINVAL));
2814         }
2815
2816         error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
2817             &ddpa->shared_snaps, tag);
2818         if (error != 0)
2819                 goto out;
2820
2821         error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
2822             &ddpa->clone_snaps, tag);
2823         if (error != 0)
2824                 goto out;
2825
2826         snap = list_head(&ddpa->shared_snaps);
2827         ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
2828         error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
2829             dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
2830             &ddpa->origin_snaps, tag);
2831         if (error != 0)
2832                 goto out;
2833
2834         if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
2835                 error = dsl_dataset_hold_obj(dp,
2836                     dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
2837                     tag, &ddpa->origin_origin);
2838                 if (error != 0)
2839                         goto out;
2840         }
2841 out:
2842         if (error != 0)
2843                 promote_rele(ddpa, tag);
2844         return (error);
2845 }
2846
2847 static void
2848 promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
2849 {
2850         snaplist_destroy(&ddpa->shared_snaps, tag);
2851         snaplist_destroy(&ddpa->clone_snaps, tag);
2852         snaplist_destroy(&ddpa->origin_snaps, tag);
2853         if (ddpa->origin_origin != NULL)
2854                 dsl_dataset_rele(ddpa->origin_origin, tag);
2855         dsl_dataset_rele(ddpa->ddpa_clone, tag);
2856 }
2857
2858 /*
2859  * Promote a clone.
2860  *
2861  * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
2862  * in with the name.  (It must be at least MAXNAMELEN bytes long.)
2863  */
2864 int
2865 dsl_dataset_promote(const char *name, char *conflsnap)
2866 {
2867         dsl_dataset_promote_arg_t ddpa = { 0 };
2868         uint64_t numsnaps;
2869         int error;
2870         objset_t *os;
2871
2872         /*
2873          * We will modify space proportional to the number of
2874          * snapshots.  Compute numsnaps.
2875          */
2876         error = dmu_objset_hold(name, FTAG, &os);
2877         if (error != 0)
2878                 return (error);
2879         error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
2880             dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
2881             &numsnaps);
2882         dmu_objset_rele(os, FTAG);
2883         if (error != 0)
2884                 return (error);
2885
2886         ddpa.ddpa_clonename = name;
2887         ddpa.err_ds = conflsnap;
2888         ddpa.cr = CRED();
2889
2890         return (dsl_sync_task(name, dsl_dataset_promote_check,
2891             dsl_dataset_promote_sync, &ddpa,
2892             2 + numsnaps, ZFS_SPACE_CHECK_RESERVED));
2893 }
2894
2895 int
2896 dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
2897     dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
2898 {
2899         int64_t unused_refres_delta;
2900
2901         /* they should both be heads */
2902         if (clone->ds_is_snapshot ||
2903             origin_head->ds_is_snapshot)
2904                 return (SET_ERROR(EINVAL));
2905
2906         /* if we are not forcing, the branch point should be just before them */
2907         if (!force && clone->ds_prev != origin_head->ds_prev)
2908                 return (SET_ERROR(EINVAL));
2909
2910         /* clone should be the clone (unless they are unrelated) */
2911         if (clone->ds_prev != NULL &&
2912             clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
2913             origin_head->ds_dir != clone->ds_prev->ds_dir)
2914                 return (SET_ERROR(EINVAL));
2915
2916         /* the clone should be a child of the origin */
2917         if (clone->ds_dir->dd_parent != origin_head->ds_dir)
2918                 return (SET_ERROR(EINVAL));
2919
2920         /* origin_head shouldn't be modified unless 'force' */
2921         if (!force &&
2922             dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
2923                 return (SET_ERROR(ETXTBSY));
2924
2925         /* origin_head should have no long holds (e.g. is not mounted) */
2926         if (dsl_dataset_handoff_check(origin_head, owner, tx))
2927                 return (SET_ERROR(EBUSY));
2928
2929         /* check amount of any unconsumed refreservation */
2930         unused_refres_delta =
2931             (int64_t)MIN(origin_head->ds_reserved,
2932             dsl_dataset_phys(origin_head)->ds_unique_bytes) -
2933             (int64_t)MIN(origin_head->ds_reserved,
2934             dsl_dataset_phys(clone)->ds_unique_bytes);
2935
2936         if (unused_refres_delta > 0 &&
2937             unused_refres_delta >
2938             dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
2939                 return (SET_ERROR(ENOSPC));
2940
2941         /* clone can't be over the head's refquota */
2942         if (origin_head->ds_quota != 0 &&
2943             dsl_dataset_phys(clone)->ds_referenced_bytes >
2944             origin_head->ds_quota)
2945                 return (SET_ERROR(EDQUOT));
2946
2947         return (0);
2948 }
2949
2950 void
2951 dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
2952     dsl_dataset_t *origin_head, dmu_tx_t *tx)
2953 {
2954         dsl_pool_t *dp = dmu_tx_pool(tx);
2955         int64_t unused_refres_delta;
2956
2957         ASSERT(clone->ds_reserved == 0);
2958         ASSERT(origin_head->ds_quota == 0 ||
2959             dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota);
2960         ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
2961
2962         /*
2963          * Swap per-dataset feature flags.
2964          */
2965         for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
2966                 if (!(spa_feature_table[f].fi_flags &
2967                     ZFEATURE_FLAG_PER_DATASET)) {
2968                         ASSERT(!clone->ds_feature_inuse[f]);
2969                         ASSERT(!origin_head->ds_feature_inuse[f]);
2970                         continue;
2971                 }
2972
2973                 boolean_t clone_inuse = clone->ds_feature_inuse[f];
2974                 boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f];
2975
2976                 if (clone_inuse) {
2977                         dsl_dataset_deactivate_feature(clone->ds_object, f, tx);
2978                         clone->ds_feature_inuse[f] = B_FALSE;
2979                 }
2980                 if (origin_head_inuse) {
2981                         dsl_dataset_deactivate_feature(origin_head->ds_object,
2982                             f, tx);
2983                         origin_head->ds_feature_inuse[f] = B_FALSE;
2984                 }
2985                 if (clone_inuse) {
2986                         dsl_dataset_activate_feature(origin_head->ds_object,
2987                             f, tx);
2988                         origin_head->ds_feature_inuse[f] = B_TRUE;
2989                 }
2990                 if (origin_head_inuse) {
2991                         dsl_dataset_activate_feature(clone->ds_object, f, tx);
2992                         clone->ds_feature_inuse[f] = B_TRUE;
2993                 }
2994         }
2995
2996         dmu_buf_will_dirty(clone->ds_dbuf, tx);
2997         dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
2998
2999         if (clone->ds_objset != NULL) {
3000                 dmu_objset_evict(clone->ds_objset);
3001                 clone->ds_objset = NULL;
3002         }
3003
3004         if (origin_head->ds_objset != NULL) {
3005                 dmu_objset_evict(origin_head->ds_objset);
3006                 origin_head->ds_objset = NULL;
3007         }
3008
3009         unused_refres_delta =
3010             (int64_t)MIN(origin_head->ds_reserved,
3011             dsl_dataset_phys(origin_head)->ds_unique_bytes) -
3012             (int64_t)MIN(origin_head->ds_reserved,
3013             dsl_dataset_phys(clone)->ds_unique_bytes);
3014
3015         /*
3016          * Reset origin's unique bytes, if it exists.
3017          */
3018         if (clone->ds_prev) {
3019                 dsl_dataset_t *origin = clone->ds_prev;
3020                 uint64_t comp, uncomp;
3021
3022                 dmu_buf_will_dirty(origin->ds_dbuf, tx);
3023                 dsl_deadlist_space_range(&clone->ds_deadlist,
3024                     dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
3025                     &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
3026         }
3027
3028         /* swap blkptrs */
3029         {
3030                 blkptr_t tmp;
3031                 tmp = dsl_dataset_phys(origin_head)->ds_bp;
3032                 dsl_dataset_phys(origin_head)->ds_bp =
3033                     dsl_dataset_phys(clone)->ds_bp;
3034                 dsl_dataset_phys(clone)->ds_bp = tmp;
3035         }
3036
3037         /* set dd_*_bytes */
3038         {
3039                 int64_t dused, dcomp, duncomp;
3040                 uint64_t cdl_used, cdl_comp, cdl_uncomp;
3041                 uint64_t odl_used, odl_comp, odl_uncomp;
3042
3043                 ASSERT3U(dsl_dir_phys(clone->ds_dir)->
3044                     dd_used_breakdown[DD_USED_SNAP], ==, 0);
3045
3046                 dsl_deadlist_space(&clone->ds_deadlist,
3047                     &cdl_used, &cdl_comp, &cdl_uncomp);
3048                 dsl_deadlist_space(&origin_head->ds_deadlist,
3049                     &odl_used, &odl_comp, &odl_uncomp);
3050
3051                 dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
3052                     cdl_used -
3053                     (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
3054                     odl_used);
3055                 dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
3056                     cdl_comp -
3057                     (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
3058                     odl_comp);
3059                 duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
3060                     cdl_uncomp -
3061                     (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
3062                     odl_uncomp);
3063
3064                 dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
3065                     dused, dcomp, duncomp, tx);
3066                 dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
3067                     -dused, -dcomp, -duncomp, tx);
3068
3069                 /*
3070                  * The difference in the space used by snapshots is the
3071                  * difference in snapshot space due to the head's
3072                  * deadlist (since that's the only thing that's
3073                  * changing that affects the snapused).
3074                  */
3075                 dsl_deadlist_space_range(&clone->ds_deadlist,
3076                     origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
3077                     &cdl_used, &cdl_comp, &cdl_uncomp);
3078                 dsl_deadlist_space_range(&origin_head->ds_deadlist,
3079                     origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
3080                     &odl_used, &odl_comp, &odl_uncomp);
3081                 dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
3082                     DD_USED_HEAD, DD_USED_SNAP, NULL);
3083         }
3084
3085         /* swap ds_*_bytes */
3086         SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
3087             dsl_dataset_phys(clone)->ds_referenced_bytes);
3088         SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
3089             dsl_dataset_phys(clone)->ds_compressed_bytes);
3090         SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
3091             dsl_dataset_phys(clone)->ds_uncompressed_bytes);
3092         SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
3093             dsl_dataset_phys(clone)->ds_unique_bytes);
3094
3095         /* apply any parent delta for change in unconsumed refreservation */
3096         dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
3097             unused_refres_delta, 0, 0, tx);
3098
3099         /*
3100          * Swap deadlists.
3101          */
3102         dsl_deadlist_close(&clone->ds_deadlist);
3103         dsl_deadlist_close(&origin_head->ds_deadlist);
3104         SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
3105             dsl_dataset_phys(clone)->ds_deadlist_obj);
3106         dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
3107             dsl_dataset_phys(clone)->ds_deadlist_obj);
3108         dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
3109             dsl_dataset_phys(origin_head)->ds_deadlist_obj);
3110
3111         dsl_scan_ds_clone_swapped(origin_head, clone, tx);
3112
3113         spa_history_log_internal_ds(clone, "clone swap", tx,
3114             "parent=%s", origin_head->ds_dir->dd_myname);
3115 }
3116
3117 /*
3118  * Given a pool name and a dataset object number in that pool,
3119  * return the name of that dataset.
3120  */
3121 int
3122 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
3123 {
3124         dsl_pool_t *dp;
3125         dsl_dataset_t *ds;
3126         int error;
3127
3128         error = dsl_pool_hold(pname, FTAG, &dp);
3129         if (error != 0)
3130                 return (error);
3131
3132         error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
3133         if (error == 0) {
3134                 dsl_dataset_name(ds, buf);
3135                 dsl_dataset_rele(ds, FTAG);
3136         }
3137         dsl_pool_rele(dp, FTAG);
3138
3139         return (error);
3140 }
3141
3142 int
3143 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
3144     uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
3145 {
3146         int error = 0;
3147
3148         ASSERT3S(asize, >, 0);
3149
3150         /*
3151          * *ref_rsrv is the portion of asize that will come from any
3152          * unconsumed refreservation space.
3153          */
3154         *ref_rsrv = 0;
3155
3156         mutex_enter(&ds->ds_lock);
3157         /*
3158          * Make a space adjustment for reserved bytes.
3159          */
3160         if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
3161                 ASSERT3U(*used, >=,
3162                     ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
3163                 *used -=
3164                     (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
3165                 *ref_rsrv =
3166                     asize - MIN(asize, parent_delta(ds, asize + inflight));
3167         }
3168
3169         if (!check_quota || ds->ds_quota == 0) {
3170                 mutex_exit(&ds->ds_lock);
3171                 return (0);
3172         }
3173         /*
3174          * If they are requesting more space, and our current estimate
3175          * is over quota, they get to try again unless the actual
3176          * on-disk is over quota and there are no pending changes (which
3177          * may free up space for us).
3178          */
3179         if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
3180             ds->ds_quota) {
3181                 if (inflight > 0 ||
3182                     dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
3183                         error = SET_ERROR(ERESTART);
3184                 else
3185                         error = SET_ERROR(EDQUOT);
3186         }
3187         mutex_exit(&ds->ds_lock);
3188
3189         return (error);
3190 }
3191
/*
 * Argument bundle shared by the refquota and refreservation sync tasks;
 * it is filled in by dsl_dataset_set_refquota() and
 * dsl_dataset_set_refreservation() and read by their check/sync callbacks.
 */
typedef struct dsl_dataset_set_qr_arg {
        /* Name of the dataset; the callbacks pass it to dsl_dataset_hold(). */
        const char *ddsqra_name;
        /* Property source forwarded to dsl_prop_predict()/set_sync_impl(). */
        zprop_source_t ddsqra_source;
        /* Requested property value. */
        uint64_t ddsqra_value;
} dsl_dataset_set_qr_arg_t;
3197
3198
3199 /* ARGSUSED */
3200 static int
3201 dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
3202 {
3203         dsl_dataset_set_qr_arg_t *ddsqra = arg;
3204         dsl_pool_t *dp = dmu_tx_pool(tx);
3205         dsl_dataset_t *ds;
3206         int error;
3207         uint64_t newval;
3208
3209         if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
3210                 return (SET_ERROR(ENOTSUP));
3211
3212         error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
3213         if (error != 0)
3214                 return (error);
3215
3216         if (ds->ds_is_snapshot) {
3217                 dsl_dataset_rele(ds, FTAG);
3218                 return (SET_ERROR(EINVAL));
3219         }
3220
3221         error = dsl_prop_predict(ds->ds_dir,
3222             zfs_prop_to_name(ZFS_PROP_REFQUOTA),
3223             ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
3224         if (error != 0) {
3225                 dsl_dataset_rele(ds, FTAG);
3226                 return (error);
3227         }
3228
3229         if (newval == 0) {
3230                 dsl_dataset_rele(ds, FTAG);
3231                 return (0);
3232         }
3233
3234         if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
3235             newval < ds->ds_reserved) {
3236                 dsl_dataset_rele(ds, FTAG);
3237                 return (SET_ERROR(ENOSPC));
3238         }
3239
3240         dsl_dataset_rele(ds, FTAG);
3241         return (0);
3242 }
3243
3244 static void
3245 dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
3246 {
3247         dsl_dataset_set_qr_arg_t *ddsqra = arg;
3248         dsl_pool_t *dp = dmu_tx_pool(tx);
3249         dsl_dataset_t *ds;
3250         uint64_t newval;
3251
3252         VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
3253
3254         dsl_prop_set_sync_impl(ds,
3255             zfs_prop_to_name(ZFS_PROP_REFQUOTA),
3256             ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
3257             &ddsqra->ddsqra_value, tx);
3258
3259         VERIFY0(dsl_prop_get_int_ds(ds,
3260             zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
3261
3262         if (ds->ds_quota != newval) {
3263                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3264                 ds->ds_quota = newval;
3265         }
3266         dsl_dataset_rele(ds, FTAG);
3267 }
3268
3269 int
3270 dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
3271     uint64_t refquota)
3272 {
3273         dsl_dataset_set_qr_arg_t ddsqra;
3274
3275         ddsqra.ddsqra_name = dsname;
3276         ddsqra.ddsqra_source = source;
3277         ddsqra.ddsqra_value = refquota;
3278
3279         return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
3280             dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
3281 }
3282
3283 static int
3284 dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
3285 {
3286         dsl_dataset_set_qr_arg_t *ddsqra = arg;
3287         dsl_pool_t *dp = dmu_tx_pool(tx);
3288         dsl_dataset_t *ds;
3289         int error;
3290         uint64_t newval, unique;
3291
3292         if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
3293                 return (SET_ERROR(ENOTSUP));
3294
3295         error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
3296         if (error != 0)
3297                 return (error);
3298
3299         if (ds->ds_is_snapshot) {
3300                 dsl_dataset_rele(ds, FTAG);
3301                 return (SET_ERROR(EINVAL));
3302         }
3303
3304         error = dsl_prop_predict(ds->ds_dir,
3305             zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
3306             ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
3307         if (error != 0) {
3308                 dsl_dataset_rele(ds, FTAG);
3309                 return (error);
3310         }
3311
3312         /*
3313          * If we are doing the preliminary check in open context, the
3314          * space estimates may be inaccurate.
3315          */
3316         if (!dmu_tx_is_syncing(tx)) {
3317                 dsl_dataset_rele(ds, FTAG);
3318                 return (0);
3319         }
3320
3321         mutex_enter(&ds->ds_lock);
3322         if (!DS_UNIQUE_IS_ACCURATE(ds))
3323                 dsl_dataset_recalc_head_uniq(ds);
3324         unique = dsl_dataset_phys(ds)->ds_unique_bytes;
3325         mutex_exit(&ds->ds_lock);
3326
3327         if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
3328                 uint64_t delta = MAX(unique, newval) -
3329                     MAX(unique, ds->ds_reserved);
3330
3331                 if (delta >
3332                     dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
3333                     (ds->ds_quota > 0 && newval > ds->ds_quota)) {
3334                         dsl_dataset_rele(ds, FTAG);
3335                         return (SET_ERROR(ENOSPC));
3336                 }
3337         }
3338
3339         dsl_dataset_rele(ds, FTAG);
3340         return (0);
3341 }
3342
3343 void
3344 dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
3345     zprop_source_t source, uint64_t value, dmu_tx_t *tx)
3346 {
3347         uint64_t newval;
3348         uint64_t unique;
3349         int64_t delta;
3350
3351         dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
3352             source, sizeof (value), 1, &value, tx);
3353
3354         VERIFY0(dsl_prop_get_int_ds(ds,
3355             zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
3356
3357         dmu_buf_will_dirty(ds->ds_dbuf, tx);
3358         mutex_enter(&ds->ds_dir->dd_lock);
3359         mutex_enter(&ds->ds_lock);
3360         ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
3361         unique = dsl_dataset_phys(ds)->ds_unique_bytes;
3362         delta = MAX(0, (int64_t)(newval - unique)) -
3363             MAX(0, (int64_t)(ds->ds_reserved - unique));
3364         ds->ds_reserved = newval;
3365         mutex_exit(&ds->ds_lock);
3366
3367         dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
3368         mutex_exit(&ds->ds_dir->dd_lock);
3369 }
3370
3371 static void
3372 dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
3373 {
3374         dsl_dataset_set_qr_arg_t *ddsqra = arg;
3375         dsl_pool_t *dp = dmu_tx_pool(tx);
3376         dsl_dataset_t *ds;
3377
3378         VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
3379         dsl_dataset_set_refreservation_sync_impl(ds,
3380             ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
3381         dsl_dataset_rele(ds, FTAG);
3382 }
3383
3384 int
3385 dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
3386     uint64_t refreservation)
3387 {
3388         dsl_dataset_set_qr_arg_t ddsqra;
3389
3390         ddsqra.ddsqra_name = dsname;
3391         ddsqra.ddsqra_source = source;
3392         ddsqra.ddsqra_value = refreservation;
3393
3394         return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
3395             dsl_dataset_set_refreservation_sync, &ddsqra,
3396             0, ZFS_SPACE_CHECK_NONE));
3397 }
3398
3399 /*
3400  * Return (in *usedp) the amount of space written in new that is not
3401  * present in oldsnap.  New may be a snapshot or the head.  Old must be
3402  * a snapshot before new, in new's filesystem (or its origin).  If not then
3403  * fail and return EINVAL.
3404  *
3405  * The written space is calculated by considering two components:  First, we
3406  * ignore any freed space, and calculate the written as new's used space
3407  * minus old's used space.  Next, we add in the amount of space that was freed
3408  * between the two snapshots, thus reducing new's used space relative to old's.
3409  * Specifically, this is the space that was born before old->ds_creation_txg,
3410  * and freed before new (ie. on new's deadlist or a previous deadlist).
3411  *
3412  * space freed                         [---------------------]
3413  * snapshots                       ---O-------O--------O-------O------
3414  *                                         oldsnap            new
3415  */
3416 int
3417 dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
3418     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
3419 {
3420         int err = 0;
3421         uint64_t snapobj;
3422         dsl_pool_t *dp = new->ds_dir->dd_pool;
3423
3424         ASSERT(dsl_pool_config_held(dp));
3425
3426         *usedp = 0;
3427         *usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
3428         *usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;
3429
3430         *compp = 0;
3431         *compp += dsl_dataset_phys(new)->ds_compressed_bytes;
3432         *compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;
3433
3434         *uncompp = 0;
3435         *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
3436         *uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;
3437
3438         snapobj = new->ds_object;
3439         while (snapobj != oldsnap->ds_object) {
3440                 dsl_dataset_t *snap;
3441                 uint64_t used, comp, uncomp;
3442
3443                 if (snapobj == new->ds_object) {
3444                         snap = new;
3445                 } else {
3446                         err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
3447                         if (err != 0)
3448                                 break;
3449                 }
3450
3451                 if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
3452                     dsl_dataset_phys(oldsnap)->ds_creation_txg) {
3453                         /*
3454                          * The blocks in the deadlist can not be born after
3455                          * ds_prev_snap_txg, so get the whole deadlist space,
3456                          * which is more efficient (especially for old-format
3457                          * deadlists).  Unfortunately the deadlist code
3458                          * doesn't have enough information to make this
3459                          * optimization itself.
3460                          */
3461                         dsl_deadlist_space(&snap->ds_deadlist,
3462                             &used, &comp, &uncomp);
3463                 } else {
3464                         dsl_deadlist_space_range(&snap->ds_deadlist,
3465                             0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
3466                             &used, &comp, &uncomp);
3467                 }
3468                 *usedp += used;
3469                 *compp += comp;
3470                 *uncompp += uncomp;
3471
3472                 /*
3473                  * If we get to the beginning of the chain of snapshots
3474                  * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
3475                  * was not a snapshot of/before new.
3476                  */
3477                 snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
3478                 if (snap != new)
3479                         dsl_dataset_rele(snap, FTAG);
3480                 if (snapobj == 0) {
3481                         err = SET_ERROR(EINVAL);
3482                         break;
3483                 }
3484
3485         }
3486         return (err);
3487 }
3488
3489 /*
3490  * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
3491  * lastsnap, and all snapshots in between are deleted.
3492  *
3493  * blocks that would be freed            [---------------------------]
3494  * snapshots                       ---O-------O--------O-------O--------O
3495  *                                        firstsnap        lastsnap
3496  *
3497  * This is the set of blocks that were born after the snap before firstsnap,
3498  * (birth > firstsnap->prev_snap_txg) and died before the snap after the
3499  * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
3500  * We calculate this by iterating over the relevant deadlists (from the snap
3501  * after lastsnap, backward to the snap after firstsnap), summing up the
3502  * space on the deadlist that was born after the snap before firstsnap.
3503  */
3504 int
3505 dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
3506     dsl_dataset_t *lastsnap,
3507     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
3508 {
3509         int err = 0;
3510         uint64_t snapobj;
3511         dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
3512
3513         ASSERT(firstsnap->ds_is_snapshot);
3514         ASSERT(lastsnap->ds_is_snapshot);
3515
3516         /*
3517          * Check that the snapshots are in the same dsl_dir, and firstsnap
3518          * is before lastsnap.
3519          */
3520         if (firstsnap->ds_dir != lastsnap->ds_dir ||
3521             dsl_dataset_phys(firstsnap)->ds_creation_txg >
3522             dsl_dataset_phys(lastsnap)->ds_creation_txg)
3523                 return (SET_ERROR(EINVAL));
3524
3525         *usedp = *compp = *uncompp = 0;
3526
3527         snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
3528         while (snapobj != firstsnap->ds_object) {
3529                 dsl_dataset_t *ds;
3530                 uint64_t used, comp, uncomp;
3531
3532                 err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
3533                 if (err != 0)
3534                         break;
3535
3536                 dsl_deadlist_space_range(&ds->ds_deadlist,
3537                     dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
3538                     &used, &comp, &uncomp);
3539                 *usedp += used;
3540                 *compp += comp;
3541                 *uncompp += uncomp;
3542
3543                 snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
3544                 ASSERT3U(snapobj, !=, 0);
3545                 dsl_dataset_rele(ds, FTAG);
3546         }
3547         return (err);
3548 }
3549
3550 /*
3551  * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
3552  * For example, they could both be snapshots of the same filesystem, and
3553  * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
3554  * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
3555  * filesystem.  Or 'earlier' could be the origin's origin.
3556  *
3557  * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
3558  */
3559 boolean_t
3560 dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
3561     uint64_t earlier_txg)
3562 {
3563         dsl_pool_t *dp = later->ds_dir->dd_pool;
3564         int error;
3565         boolean_t ret;
3566
3567         ASSERT(dsl_pool_config_held(dp));
3568         ASSERT(earlier->ds_is_snapshot || earlier_txg != 0);
3569
3570         if (earlier_txg == 0)
3571                 earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
3572
3573         if (later->ds_is_snapshot &&
3574             earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
3575                 return (B_FALSE);
3576
3577         if (later->ds_dir == earlier->ds_dir)
3578                 return (B_TRUE);
3579         if (!dsl_dir_is_clone(later->ds_dir))
3580                 return (B_FALSE);
3581
3582         if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object)
3583                 return (B_TRUE);
3584         dsl_dataset_t *origin;
3585         error = dsl_dataset_hold_obj(dp,
3586             dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
3587         if (error != 0)
3588                 return (B_FALSE);
3589         ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
3590         dsl_dataset_rele(origin, FTAG);
3591         return (ret);
3592 }
3593
3594 void
3595 dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
3596 {
3597         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3598         dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
3599 }
3600
3601 boolean_t
3602 dsl_dataset_is_zapified(dsl_dataset_t *ds)
3603 {
3604         dmu_object_info_t doi;
3605
3606         dmu_object_info_from_db(ds->ds_dbuf, &doi);
3607         return (doi.doi_type == DMU_OTN_ZAP_METADATA);
3608 }
3609
3610 boolean_t
3611 dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)
3612 {
3613         return (dsl_dataset_is_zapified(ds) &&
3614             zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,
3615             ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);
3616 }