/* sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c (FreeBSD releng/8.0) */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
25
26 #include <sys/dmu_objset.h>
27 #include <sys/dsl_dataset.h>
28 #include <sys/dsl_dir.h>
29 #include <sys/dsl_prop.h>
30 #include <sys/dsl_synctask.h>
31 #include <sys/dmu_traverse.h>
32 #include <sys/dmu_tx.h>
33 #include <sys/arc.h>
34 #include <sys/zio.h>
35 #include <sys/zap.h>
36 #include <sys/unique.h>
37 #include <sys/zfs_context.h>
38 #include <sys/zfs_ioctl.h>
39 #include <sys/spa.h>
40 #include <sys/zfs_znode.h>
41 #include <sys/sunddi.h>
42
43 static char *dsl_reaper = "the grim reaper";
44
45 static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
46 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
47 static dsl_checkfunc_t dsl_dataset_rollback_check;
48 static dsl_syncfunc_t dsl_dataset_rollback_sync;
49 static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
50
51 #define DS_REF_MAX      (1ULL << 62)
52
53 #define DSL_DEADLIST_BLOCKSIZE  SPA_MAXBLOCKSIZE
54
55 #define DSL_DATASET_IS_DESTROYED(ds)    ((ds)->ds_owner == dsl_reaper)
56
57
58 /*
59  * Figure out how much of this delta should be propogated to the dsl_dir
60  * layer.  If there's a refreservation, that space has already been
61  * partially accounted for in our ancestors.
62  */
63 static int64_t
64 parent_delta(dsl_dataset_t *ds, int64_t delta)
65 {
66         uint64_t old_bytes, new_bytes;
67
68         if (ds->ds_reserved == 0)
69                 return (delta);
70
71         old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
72         new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
73
74         ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
75         return (new_bytes - old_bytes);
76 }
77
78 void
79 dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
80 {
81         int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
82         int compressed = BP_GET_PSIZE(bp);
83         int uncompressed = BP_GET_UCSIZE(bp);
84         int64_t delta;
85
86         dprintf_bp(bp, "born, ds=%p\n", ds);
87
88         ASSERT(dmu_tx_is_syncing(tx));
89         /* It could have been compressed away to nothing */
90         if (BP_IS_HOLE(bp))
91                 return;
92         ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
93         ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
94         if (ds == NULL) {
95                 /*
96                  * Account for the meta-objset space in its placeholder
97                  * dsl_dir.
98                  */
99                 ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
100                 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
101                     used, compressed, uncompressed, tx);
102                 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
103                 return;
104         }
105         dmu_buf_will_dirty(ds->ds_dbuf, tx);
106         mutex_enter(&ds->ds_dir->dd_lock);
107         mutex_enter(&ds->ds_lock);
108         delta = parent_delta(ds, used);
109         ds->ds_phys->ds_used_bytes += used;
110         ds->ds_phys->ds_compressed_bytes += compressed;
111         ds->ds_phys->ds_uncompressed_bytes += uncompressed;
112         ds->ds_phys->ds_unique_bytes += used;
113         mutex_exit(&ds->ds_lock);
114         dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
115             compressed, uncompressed, tx);
116         dsl_dir_transfer_space(ds->ds_dir, used - delta,
117             DD_USED_REFRSRV, DD_USED_HEAD, tx);
118         mutex_exit(&ds->ds_dir->dd_lock);
119 }
120
121 int
122 dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
123     dmu_tx_t *tx)
124 {
125         int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
126         int compressed = BP_GET_PSIZE(bp);
127         int uncompressed = BP_GET_UCSIZE(bp);
128
129         ASSERT(pio != NULL);
130         ASSERT(dmu_tx_is_syncing(tx));
131         /* No block pointer => nothing to free */
132         if (BP_IS_HOLE(bp))
133                 return (0);
134
135         ASSERT(used > 0);
136         if (ds == NULL) {
137                 int err;
138                 /*
139                  * Account for the meta-objset space in its placeholder
140                  * dataset.
141                  */
142                 err = dsl_free(pio, tx->tx_pool,
143                     tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT);
144                 ASSERT(err == 0);
145
146                 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
147                     -used, -compressed, -uncompressed, tx);
148                 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
149                 return (used);
150         }
151         ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
152
153         ASSERT(!dsl_dataset_is_snapshot(ds));
154         dmu_buf_will_dirty(ds->ds_dbuf, tx);
155
156         if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
157                 int err;
158                 int64_t delta;
159
160                 dprintf_bp(bp, "freeing: %s", "");
161                 err = dsl_free(pio, tx->tx_pool,
162                     tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT);
163                 ASSERT(err == 0);
164
165                 mutex_enter(&ds->ds_dir->dd_lock);
166                 mutex_enter(&ds->ds_lock);
167                 ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
168                     !DS_UNIQUE_IS_ACCURATE(ds));
169                 delta = parent_delta(ds, -used);
170                 ds->ds_phys->ds_unique_bytes -= used;
171                 mutex_exit(&ds->ds_lock);
172                 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
173                     delta, -compressed, -uncompressed, tx);
174                 dsl_dir_transfer_space(ds->ds_dir, -used - delta,
175                     DD_USED_REFRSRV, DD_USED_HEAD, tx);
176                 mutex_exit(&ds->ds_dir->dd_lock);
177         } else {
178                 dprintf_bp(bp, "putting on dead list: %s", "");
179                 VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
180                 ASSERT3U(ds->ds_prev->ds_object, ==,
181                     ds->ds_phys->ds_prev_snap_obj);
182                 ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
183                 /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
184                 if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
185                     ds->ds_object && bp->blk_birth >
186                     ds->ds_prev->ds_phys->ds_prev_snap_txg) {
187                         dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
188                         mutex_enter(&ds->ds_prev->ds_lock);
189                         ds->ds_prev->ds_phys->ds_unique_bytes += used;
190                         mutex_exit(&ds->ds_prev->ds_lock);
191                 }
192                 if (bp->blk_birth > ds->ds_origin_txg) {
193                         dsl_dir_transfer_space(ds->ds_dir, used,
194                             DD_USED_HEAD, DD_USED_SNAP, tx);
195                 }
196         }
197         mutex_enter(&ds->ds_lock);
198         ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
199         ds->ds_phys->ds_used_bytes -= used;
200         ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
201         ds->ds_phys->ds_compressed_bytes -= compressed;
202         ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
203         ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
204         mutex_exit(&ds->ds_lock);
205
206         return (used);
207 }
208
209 uint64_t
210 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
211 {
212         uint64_t trysnap = 0;
213
214         if (ds == NULL)
215                 return (0);
216         /*
217          * The snapshot creation could fail, but that would cause an
218          * incorrect FALSE return, which would only result in an
219          * overestimation of the amount of space that an operation would
220          * consume, which is OK.
221          *
222          * There's also a small window where we could miss a pending
223          * snapshot, because we could set the sync task in the quiescing
224          * phase.  So this should only be used as a guess.
225          */
226         if (ds->ds_trysnap_txg >
227             spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
228                 trysnap = ds->ds_trysnap_txg;
229         return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
230 }
231
232 int
233 dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
234 {
235         return (blk_birth > dsl_dataset_prev_snap_txg(ds));
236 }
237
238 /* ARGSUSED */
239 static void
240 dsl_dataset_evict(dmu_buf_t *db, void *dsv)
241 {
242         dsl_dataset_t *ds = dsv;
243
244         ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));
245
246         dprintf_ds(ds, "evicting %s\n", "");
247
248         unique_remove(ds->ds_fsid_guid);
249
250         if (ds->ds_user_ptr != NULL)
251                 ds->ds_user_evict_func(ds, ds->ds_user_ptr);
252
253         if (ds->ds_prev) {
254                 dsl_dataset_drop_ref(ds->ds_prev, ds);
255                 ds->ds_prev = NULL;
256         }
257
258         bplist_close(&ds->ds_deadlist);
259         if (ds->ds_dir)
260                 dsl_dir_close(ds->ds_dir, ds);
261
262         ASSERT(!list_link_active(&ds->ds_synced_link));
263
264         if (mutex_owned(&ds->ds_lock))
265                 mutex_exit(&ds->ds_lock);
266         mutex_destroy(&ds->ds_lock);
267         if (mutex_owned(&ds->ds_opening_lock))
268                 mutex_exit(&ds->ds_opening_lock);
269         mutex_destroy(&ds->ds_opening_lock);
270         if (mutex_owned(&ds->ds_deadlist.bpl_lock))
271                 mutex_exit(&ds->ds_deadlist.bpl_lock);
272         mutex_destroy(&ds->ds_deadlist.bpl_lock);
273         rw_destroy(&ds->ds_rwlock);
274         cv_destroy(&ds->ds_exclusive_cv);
275
276         kmem_free(ds, sizeof (dsl_dataset_t));
277 }
278
279 static int
280 dsl_dataset_get_snapname(dsl_dataset_t *ds)
281 {
282         dsl_dataset_phys_t *headphys;
283         int err;
284         dmu_buf_t *headdbuf;
285         dsl_pool_t *dp = ds->ds_dir->dd_pool;
286         objset_t *mos = dp->dp_meta_objset;
287
288         if (ds->ds_snapname[0])
289                 return (0);
290         if (ds->ds_phys->ds_next_snap_obj == 0)
291                 return (0);
292
293         err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
294             FTAG, &headdbuf);
295         if (err)
296                 return (err);
297         headphys = headdbuf->db_data;
298         err = zap_value_search(dp->dp_meta_objset,
299             headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
300         dmu_buf_rele(headdbuf, FTAG);
301         return (err);
302 }
303
304 static int
305 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
306 {
307         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
308         uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
309         matchtype_t mt;
310         int err;
311
312         if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
313                 mt = MT_FIRST;
314         else
315                 mt = MT_EXACT;
316
317         err = zap_lookup_norm(mos, snapobj, name, 8, 1,
318             value, mt, NULL, 0, NULL);
319         if (err == ENOTSUP && mt == MT_FIRST)
320                 err = zap_lookup(mos, snapobj, name, 8, 1, value);
321         return (err);
322 }
323
324 static int
325 dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
326 {
327         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
328         uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
329         matchtype_t mt;
330         int err;
331
332         if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
333                 mt = MT_FIRST;
334         else
335                 mt = MT_EXACT;
336
337         err = zap_remove_norm(mos, snapobj, name, mt, tx);
338         if (err == ENOTSUP && mt == MT_FIRST)
339                 err = zap_remove(mos, snapobj, name, tx);
340         return (err);
341 }
342
343 static int
344 dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
345     dsl_dataset_t **dsp)
346 {
347         objset_t *mos = dp->dp_meta_objset;
348         dmu_buf_t *dbuf;
349         dsl_dataset_t *ds;
350         int err;
351
352         ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
353             dsl_pool_sync_context(dp));
354
355         err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
356         if (err)
357                 return (err);
358         ds = dmu_buf_get_user(dbuf);
359         if (ds == NULL) {
360                 dsl_dataset_t *winner;
361
362                 ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
363                 ds->ds_dbuf = dbuf;
364                 ds->ds_object = dsobj;
365                 ds->ds_phys = dbuf->db_data;
366
367                 mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
368                 mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
369                 mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
370                     NULL);
371                 rw_init(&ds->ds_rwlock, 0, 0, 0);
372                 cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
373
374                 err = bplist_open(&ds->ds_deadlist,
375                     mos, ds->ds_phys->ds_deadlist_obj);
376                 if (err == 0) {
377                         err = dsl_dir_open_obj(dp,
378                             ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
379                 }
380                 if (err) {
381                         /*
382                          * we don't really need to close the blist if we
383                          * just opened it.
384                          */
385                         mutex_destroy(&ds->ds_lock);
386                         mutex_destroy(&ds->ds_opening_lock);
387                         mutex_destroy(&ds->ds_deadlist.bpl_lock);
388                         rw_destroy(&ds->ds_rwlock);
389                         cv_destroy(&ds->ds_exclusive_cv);
390                         kmem_free(ds, sizeof (dsl_dataset_t));
391                         dmu_buf_rele(dbuf, tag);
392                         return (err);
393                 }
394
395                 if (!dsl_dataset_is_snapshot(ds)) {
396                         ds->ds_snapname[0] = '\0';
397                         if (ds->ds_phys->ds_prev_snap_obj) {
398                                 err = dsl_dataset_get_ref(dp,
399                                     ds->ds_phys->ds_prev_snap_obj,
400                                     ds, &ds->ds_prev);
401                         }
402
403                         if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) {
404                                 dsl_dataset_t *origin;
405
406                                 err = dsl_dataset_hold_obj(dp,
407                                     ds->ds_dir->dd_phys->dd_origin_obj,
408                                     FTAG, &origin);
409                                 if (err == 0) {
410                                         ds->ds_origin_txg =
411                                             origin->ds_phys->ds_creation_txg;
412                                         dsl_dataset_rele(origin, FTAG);
413                                 }
414                         }
415                 } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
416                         err = dsl_dataset_get_snapname(ds);
417                 }
418
419                 if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
420                         /*
421                          * In sync context, we're called with either no lock
422                          * or with the write lock.  If we're not syncing,
423                          * we're always called with the read lock held.
424                          */
425                         boolean_t need_lock =
426                             !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
427                             dsl_pool_sync_context(dp);
428
429                         if (need_lock)
430                                 rw_enter(&dp->dp_config_rwlock, RW_READER);
431
432                         err = dsl_prop_get_ds(ds,
433                             "refreservation", sizeof (uint64_t), 1,
434                             &ds->ds_reserved, NULL);
435                         if (err == 0) {
436                                 err = dsl_prop_get_ds(ds,
437                                     "refquota", sizeof (uint64_t), 1,
438                                     &ds->ds_quota, NULL);
439                         }
440
441                         if (need_lock)
442                                 rw_exit(&dp->dp_config_rwlock);
443                 } else {
444                         ds->ds_reserved = ds->ds_quota = 0;
445                 }
446
447                 if (err == 0) {
448                         winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
449                             dsl_dataset_evict);
450                 }
451                 if (err || winner) {
452                         bplist_close(&ds->ds_deadlist);
453                         if (ds->ds_prev)
454                                 dsl_dataset_drop_ref(ds->ds_prev, ds);
455                         dsl_dir_close(ds->ds_dir, ds);
456                         mutex_destroy(&ds->ds_lock);
457                         mutex_destroy(&ds->ds_opening_lock);
458                         mutex_destroy(&ds->ds_deadlist.bpl_lock);
459                         rw_destroy(&ds->ds_rwlock);
460                         cv_destroy(&ds->ds_exclusive_cv);
461                         kmem_free(ds, sizeof (dsl_dataset_t));
462                         if (err) {
463                                 dmu_buf_rele(dbuf, tag);
464                                 return (err);
465                         }
466                         ds = winner;
467                 } else {
468                         ds->ds_fsid_guid =
469                             unique_insert(ds->ds_phys->ds_fsid_guid);
470                 }
471         }
472         ASSERT3P(ds->ds_dbuf, ==, dbuf);
473         ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
474         ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
475             spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
476             dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
477         mutex_enter(&ds->ds_lock);
478         if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
479                 mutex_exit(&ds->ds_lock);
480                 dmu_buf_rele(ds->ds_dbuf, tag);
481                 return (ENOENT);
482         }
483         mutex_exit(&ds->ds_lock);
484         *dsp = ds;
485         return (0);
486 }
487
488 static int
489 dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
490 {
491         dsl_pool_t *dp = ds->ds_dir->dd_pool;
492
493         /*
494          * In syncing context we don't want the rwlock lock: there
495          * may be an existing writer waiting for sync phase to
496          * finish.  We don't need to worry about such writers, since
497          * sync phase is single-threaded, so the writer can't be
498          * doing anything while we are active.
499          */
500         if (dsl_pool_sync_context(dp)) {
501                 ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
502                 return (0);
503         }
504
505         /*
506          * Normal users will hold the ds_rwlock as a READER until they
507          * are finished (i.e., call dsl_dataset_rele()).  "Owners" will
508          * drop their READER lock after they set the ds_owner field.
509          *
510          * If the dataset is being destroyed, the destroy thread will
511          * obtain a WRITER lock for exclusive access after it's done its
512          * open-context work and then change the ds_owner to
513          * dsl_reaper once destruction is assured.  So threads
514          * may block here temporarily, until the "destructability" of
515          * the dataset is determined.
516          */
517         ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
518         mutex_enter(&ds->ds_lock);
519         while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
520                 rw_exit(&dp->dp_config_rwlock);
521                 cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
522                 if (DSL_DATASET_IS_DESTROYED(ds)) {
523                         mutex_exit(&ds->ds_lock);
524                         dsl_dataset_drop_ref(ds, tag);
525                         rw_enter(&dp->dp_config_rwlock, RW_READER);
526                         return (ENOENT);
527                 }
528                 rw_enter(&dp->dp_config_rwlock, RW_READER);
529         }
530         mutex_exit(&ds->ds_lock);
531         return (0);
532 }
533
534 int
535 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
536     dsl_dataset_t **dsp)
537 {
538         int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);
539
540         if (err)
541                 return (err);
542         return (dsl_dataset_hold_ref(*dsp, tag));
543 }
544
545 int
546 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner,
547     dsl_dataset_t **dsp)
548 {
549         int err = dsl_dataset_hold_obj(dp, dsobj, owner, dsp);
550
551         ASSERT(DS_MODE_TYPE(flags) != DS_MODE_USER);
552
553         if (err)
554                 return (err);
555         if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) {
556                 dsl_dataset_rele(*dsp, owner);
557                 return (EBUSY);
558         }
559         return (0);
560 }
561
562 int
563 dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
564 {
565         dsl_dir_t *dd;
566         dsl_pool_t *dp;
567         const char *snapname;
568         uint64_t obj;
569         int err = 0;
570
571         err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
572         if (err)
573                 return (err);
574
575         dp = dd->dd_pool;
576         obj = dd->dd_phys->dd_head_dataset_obj;
577         rw_enter(&dp->dp_config_rwlock, RW_READER);
578         if (obj)
579                 err = dsl_dataset_get_ref(dp, obj, tag, dsp);
580         else
581                 err = ENOENT;
582         if (err)
583                 goto out;
584
585         err = dsl_dataset_hold_ref(*dsp, tag);
586
587         /* we may be looking for a snapshot */
588         if (err == 0 && snapname != NULL) {
589                 dsl_dataset_t *ds = NULL;
590
591                 if (*snapname++ != '@') {
592                         dsl_dataset_rele(*dsp, tag);
593                         err = ENOENT;
594                         goto out;
595                 }
596
597                 dprintf("looking for snapshot '%s'\n", snapname);
598                 err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
599                 if (err == 0)
600                         err = dsl_dataset_get_ref(dp, obj, tag, &ds);
601                 dsl_dataset_rele(*dsp, tag);
602
603                 ASSERT3U((err == 0), ==, (ds != NULL));
604
605                 if (ds) {
606                         mutex_enter(&ds->ds_lock);
607                         if (ds->ds_snapname[0] == 0)
608                                 (void) strlcpy(ds->ds_snapname, snapname,
609                                     sizeof (ds->ds_snapname));
610                         mutex_exit(&ds->ds_lock);
611                         err = dsl_dataset_hold_ref(ds, tag);
612                         *dsp = err ? NULL : ds;
613                 }
614         }
615 out:
616         rw_exit(&dp->dp_config_rwlock);
617         dsl_dir_close(dd, FTAG);
618         return (err);
619 }
620
621 int
622 dsl_dataset_own(const char *name, int flags, void *owner, dsl_dataset_t **dsp)
623 {
624         int err = dsl_dataset_hold(name, owner, dsp);
625         if (err)
626                 return (err);
627         if ((*dsp)->ds_phys->ds_num_children > 0 &&
628             !DS_MODE_IS_READONLY(flags)) {
629                 dsl_dataset_rele(*dsp, owner);
630                 return (EROFS);
631         }
632         if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) {
633                 dsl_dataset_rele(*dsp, owner);
634                 return (EBUSY);
635         }
636         return (0);
637 }
638
639 void
640 dsl_dataset_name(dsl_dataset_t *ds, char *name)
641 {
642         if (ds == NULL) {
643                 (void) strcpy(name, "mos");
644         } else {
645                 dsl_dir_name(ds->ds_dir, name);
646                 VERIFY(0 == dsl_dataset_get_snapname(ds));
647                 if (ds->ds_snapname[0]) {
648                         (void) strcat(name, "@");
649                         /*
650                          * We use a "recursive" mutex so that we
651                          * can call dprintf_ds() with ds_lock held.
652                          */
653                         if (!MUTEX_HELD(&ds->ds_lock)) {
654                                 mutex_enter(&ds->ds_lock);
655                                 (void) strcat(name, ds->ds_snapname);
656                                 mutex_exit(&ds->ds_lock);
657                         } else {
658                                 (void) strcat(name, ds->ds_snapname);
659                         }
660                 }
661         }
662 }
663
664 static int
665 dsl_dataset_namelen(dsl_dataset_t *ds)
666 {
667         int result;
668
669         if (ds == NULL) {
670                 result = 3;     /* "mos" */
671         } else {
672                 result = dsl_dir_namelen(ds->ds_dir);
673                 VERIFY(0 == dsl_dataset_get_snapname(ds));
674                 if (ds->ds_snapname[0]) {
675                         ++result;       /* adding one for the @-sign */
676                         if (!MUTEX_HELD(&ds->ds_lock)) {
677                                 mutex_enter(&ds->ds_lock);
678                                 result += strlen(ds->ds_snapname);
679                                 mutex_exit(&ds->ds_lock);
680                         } else {
681                                 result += strlen(ds->ds_snapname);
682                         }
683                 }
684         }
685
686         return (result);
687 }
688
689 void
690 dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
691 {
692         dmu_buf_rele(ds->ds_dbuf, tag);
693 }
694
695 void
696 dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
697 {
698         if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
699                 rw_exit(&ds->ds_rwlock);
700         }
701         dsl_dataset_drop_ref(ds, tag);
702 }
703
704 void
705 dsl_dataset_disown(dsl_dataset_t *ds, void *owner)
706 {
707         ASSERT((ds->ds_owner == owner && ds->ds_dbuf) ||
708             (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));
709
710         mutex_enter(&ds->ds_lock);
711         ds->ds_owner = NULL;
712         if (RW_WRITE_HELD(&ds->ds_rwlock)) {
713                 rw_exit(&ds->ds_rwlock);
714                 cv_broadcast(&ds->ds_exclusive_cv);
715         }
716         mutex_exit(&ds->ds_lock);
717         if (ds->ds_dbuf)
718                 dsl_dataset_drop_ref(ds, owner);
719         else
720                 dsl_dataset_evict(ds->ds_dbuf, ds);
721 }
722
723 boolean_t
724 dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *owner)
725 {
726         boolean_t gotit = FALSE;
727
728         mutex_enter(&ds->ds_lock);
729         if (ds->ds_owner == NULL &&
730             (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
731                 ds->ds_owner = owner;
732                 if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
733                         rw_exit(&ds->ds_rwlock);
734                 gotit = TRUE;
735         }
736         mutex_exit(&ds->ds_lock);
737         return (gotit);
738 }
739
740 void
741 dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
742 {
743         ASSERT3P(owner, ==, ds->ds_owner);
744         if (!RW_WRITE_HELD(&ds->ds_rwlock))
745                 rw_enter(&ds->ds_rwlock, RW_WRITER);
746 }
747
748 uint64_t
749 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
750     uint64_t flags, dmu_tx_t *tx)
751 {
752         dsl_pool_t *dp = dd->dd_pool;
753         dmu_buf_t *dbuf;
754         dsl_dataset_phys_t *dsphys;
755         uint64_t dsobj;
756         objset_t *mos = dp->dp_meta_objset;
757
758         if (origin == NULL)
759                 origin = dp->dp_origin_snap;
760
761         ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
762         ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
763         ASSERT(dmu_tx_is_syncing(tx));
764         ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
765
766         dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
767             DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
768         VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
769         dmu_buf_will_dirty(dbuf, tx);
770         dsphys = dbuf->db_data;
771         bzero(dsphys, sizeof (dsl_dataset_phys_t));
772         dsphys->ds_dir_obj = dd->dd_object;
773         dsphys->ds_flags = flags;
774         dsphys->ds_fsid_guid = unique_create();
775         (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
776             sizeof (dsphys->ds_guid));
777         dsphys->ds_snapnames_zapobj =
778             zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
779             DMU_OT_NONE, 0, tx);
780         dsphys->ds_creation_time = gethrestime_sec();
781         dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
782         dsphys->ds_deadlist_obj =
783             bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
784
785         if (origin) {
786                 dsphys->ds_prev_snap_obj = origin->ds_object;
787                 dsphys->ds_prev_snap_txg =
788                     origin->ds_phys->ds_creation_txg;
789                 dsphys->ds_used_bytes =
790                     origin->ds_phys->ds_used_bytes;
791                 dsphys->ds_compressed_bytes =
792                     origin->ds_phys->ds_compressed_bytes;
793                 dsphys->ds_uncompressed_bytes =
794                     origin->ds_phys->ds_uncompressed_bytes;
795                 dsphys->ds_bp = origin->ds_phys->ds_bp;
796                 dsphys->ds_flags |= origin->ds_phys->ds_flags;
797
798                 dmu_buf_will_dirty(origin->ds_dbuf, tx);
799                 origin->ds_phys->ds_num_children++;
800
801                 if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
802                         if (origin->ds_phys->ds_next_clones_obj == 0) {
803                                 origin->ds_phys->ds_next_clones_obj =
804                                     zap_create(mos,
805                                     DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
806                         }
807                         VERIFY(0 == zap_add_int(mos,
808                             origin->ds_phys->ds_next_clones_obj,
809                             dsobj, tx));
810                 }
811
812                 dmu_buf_will_dirty(dd->dd_dbuf, tx);
813                 dd->dd_phys->dd_origin_obj = origin->ds_object;
814         }
815
816         if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
817                 dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
818
819         dmu_buf_rele(dbuf, FTAG);
820
821         dmu_buf_will_dirty(dd->dd_dbuf, tx);
822         dd->dd_phys->dd_head_dataset_obj = dsobj;
823
824         return (dsobj);
825 }
826
/*
 * Create a new dataset named 'lastname' under parent dir 'pdd',
 * optionally cloned from snapshot 'origin' (NULL for a plain create).
 * 'flags' is passed through to the new dataset's ds_flags; create-time
 * delegated permissions are set for credential 'cr'.  Must be called
 * in syncing context.  Returns the object number of the new dataset.
 */
uint64_t
dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
    dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
{
        dsl_pool_t *dp = pdd->dd_pool;
        uint64_t dsobj, ddobj;
        dsl_dir_t *dd;

        /* 'lastname' is a single component, never a snapshot name. */
        ASSERT(lastname[0] != '@');

        /* Create the containing dsl_dir first, then its head dataset. */
        ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
        VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));

        dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);

        dsl_deleg_set_create_perms(dd, tx, cr);

        dsl_dir_close(dd, FTAG);

        return (dsobj);
}
848
/* Argument bundle threaded through dsl_snapshot_destroy_one(). */
struct destroyarg {
        dsl_sync_task_group_t *dstg;    /* group collecting destroy tasks */
        char *snapname;                 /* snapshot name to destroy */
        char *failed;                   /* out: name of fs that failed */
};
854
/*
 * dmu_objset_find() callback: queue destruction of snapshot
 * "<name>@<da->snapname>" onto da->dstg.
 *
 * "@<snapname>" is appended to 'name' in place (assumes the caller's
 * buffer has room for the full snapshot name — TODO confirm against
 * dmu_objset_find()'s buffer sizing) and trimmed back off before
 * returning.  A filesystem without the named snapshot (ENOENT) is
 * silently skipped; any other failure copies the offending name into
 * da->failed for the caller to report.
 */
static int
dsl_snapshot_destroy_one(char *name, void *arg)
{
        struct destroyarg *da = arg;
        dsl_dataset_t *ds;
        char *cp;
        int err;

        (void) strcat(name, "@");
        (void) strcat(name, da->snapname);
        err = dsl_dataset_own(name, DS_MODE_READONLY | DS_MODE_INCONSISTENT,
            da->dstg, &ds);
        /* Restore 'name' to the plain filesystem name before returning. */
        cp = strchr(name, '@');
        *cp = '\0';
        if (err == 0) {
                dsl_dataset_make_exclusive(ds, da->dstg);
                /* Evict any cached user (objset) state before destroying. */
                if (ds->ds_user_ptr) {
                        ds->ds_user_evict_func(ds, ds->ds_user_ptr);
                        ds->ds_user_ptr = NULL;
                }
                dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
                    dsl_dataset_destroy_sync, ds, da->dstg, 0);
        } else if (err == ENOENT) {
                /* This filesystem has no such snapshot; not an error. */
                err = 0;
        } else {
                (void) strcpy(da->failed, name);
        }
        return (err);
}
884
/*
 * Destroy 'snapname' in all descendants of 'fsname'.
 *
 * All the per-filesystem destroys are collected into one sync task
 * group so they commit together.  On failure, 'fsname' is overwritten
 * with the name of the filesystem whose task returned the error.
 */
#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
int
dsl_snapshots_destroy(char *fsname, char *snapname)
{
        int err;
        struct destroyarg da;
        dsl_sync_task_t *dst;
        spa_t *spa;

        err = spa_open(fsname, &spa, FTAG);
        if (err)
                return (err);
        da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
        da.snapname = snapname;
        da.failed = fsname;

        /* Queue one destroy task per descendant that has the snapshot. */
        err = dmu_objset_find(fsname,
            dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);

        if (err == 0)
                err = dsl_sync_task_group_wait(da.dstg);

        /* Release ownership of every dataset we queued, pass or fail. */
        for (dst = list_head(&da.dstg->dstg_tasks); dst;
            dst = list_next(&da.dstg->dstg_tasks, dst)) {
                dsl_dataset_t *ds = dst->dst_arg1;
                /*
                 * Return the file system name that triggered the error
                 */
                if (dst->dst_err) {
                        dsl_dataset_name(ds, fsname);
                        *strchr(fsname, '@') = '\0';
                }
                dsl_dataset_disown(ds, da.dstg);
        }

        dsl_sync_task_group_destroy(da.dstg);
        spa_close(spa, FTAG);
        return (err);
}
927
/*
 * ds must be opened as OWNER.  On return (whether successful or not),
 * ds will be closed and caller can no longer dereference it.
 *
 * For a snapshot this is a single sync task.  For a head dataset the
 * destroy proceeds in phases: (1) a sync task validates the destroy
 * and marks the dataset DS_FLAG_INCONSISTENT on disk, (2) the
 * dataset's objects are freed in open context to keep syncing-context
 * work small, (3) a final sync task group destroys the dataset and
 * its dsl_dir together.
 */
int
dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
{
        int err;
        dsl_sync_task_group_t *dstg;
        objset_t *os;
        dsl_dir_t *dd;
        uint64_t obj;

        if (dsl_dataset_is_snapshot(ds)) {
                /* Destroying a snapshot is simpler */
                dsl_dataset_make_exclusive(ds, tag);

                /* Evict any cached objset state before the sync task. */
                if (ds->ds_user_ptr) {
                        ds->ds_user_evict_func(ds, ds->ds_user_ptr);
                        ds->ds_user_ptr = NULL;
                }
                err = dsl_sync_task_do(ds->ds_dir->dd_pool,
                    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
                    ds, tag, 0);
                goto out;
        }

        dd = ds->ds_dir;

        /*
         * Check for errors and mark this ds as inconsistent, in
         * case we crash while freeing the objects.
         */
        err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
            dsl_dataset_destroy_begin_sync, ds, NULL, 0);
        if (err)
                goto out;

        err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os);
        if (err)
                goto out;

        /*
         * remove the objects in open context, so that we won't
         * have too much to do in syncing context.
         */
        for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
            ds->ds_phys->ds_prev_snap_txg)) {
                /*
                 * Ignore errors, if there is not enough disk space
                 * we will deal with it in dsl_dataset_destroy_sync().
                 */
                (void) dmu_free_object(os, obj);
        }

        dmu_objset_close(os);
        /* dmu_object_next() signals a completed walk with ESRCH. */
        if (err != ESRCH)
                goto out;

        /* Re-open the dsl_dir under our own tag for the sync tasks below. */
        rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
        err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
        rw_exit(&dd->dd_pool->dp_config_rwlock);

        if (err)
                goto out;

        if (ds->ds_user_ptr) {
                /*
                 * We need to sync out all in-flight IO before we try
                 * to evict (the dataset evict func is trying to clear
                 * the cached entries for this dataset in the ARC).
                 */
                txg_wait_synced(dd->dd_pool, 0);
        }

        /*
         * Blow away the dsl_dir + head dataset.
         */
        dsl_dataset_make_exclusive(ds, tag);
        if (ds->ds_user_ptr) {
                ds->ds_user_evict_func(ds, ds->ds_user_ptr);
                ds->ds_user_ptr = NULL;
        }
        dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
        dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
            dsl_dataset_destroy_sync, ds, tag, 0);
        dsl_sync_task_create(dstg, dsl_dir_destroy_check,
            dsl_dir_destroy_sync, dd, FTAG, 0);
        err = dsl_sync_task_group_wait(dstg);
        dsl_sync_task_group_destroy(dstg);
        /* if it is successful, dsl_dir_destroy_sync will close the dd */
        if (err)
                dsl_dir_close(dd, FTAG);
out:
        dsl_dataset_disown(ds, tag);
        return (err);
}
1025
/*
 * Roll 'ds' back to its most recent snapshot, or to an empty objset of
 * type 'ost' when there is no previous snapshot (only legal for ZPL
 * objsets; see dsl_dataset_rollback_check()).  The caller must own
 * the dataset.  Exclusive access is taken for the duration of the
 * sync task and released before returning.
 */
int
dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost)
{
        int err;

        ASSERT(ds->ds_owner);

        dsl_dataset_make_exclusive(ds, ds->ds_owner);
        err = dsl_sync_task_do(ds->ds_dir->dd_pool,
            dsl_dataset_rollback_check, dsl_dataset_rollback_sync,
            ds, &ost, 0);
        /* drop exclusive access */
        mutex_enter(&ds->ds_lock);
        rw_exit(&ds->ds_rwlock);
        cv_broadcast(&ds->ds_exclusive_cv);
        mutex_exit(&ds->ds_lock);
        return (err);
}
1044
1045 void *
1046 dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
1047     void *p, dsl_dataset_evict_func_t func)
1048 {
1049         void *old;
1050
1051         mutex_enter(&ds->ds_lock);
1052         old = ds->ds_user_ptr;
1053         if (old == NULL) {
1054                 ds->ds_user_ptr = p;
1055                 ds->ds_user_evict_func = func;
1056         }
1057         mutex_exit(&ds->ds_lock);
1058         return (old);
1059 }
1060
1061 void *
1062 dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
1063 {
1064         return (ds->ds_user_ptr);
1065 }
1066
1067
1068 blkptr_t *
1069 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
1070 {
1071         return (&ds->ds_phys->ds_bp);
1072 }
1073
1074 void
1075 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
1076 {
1077         ASSERT(dmu_tx_is_syncing(tx));
1078         /* If it's the meta-objset, set dp_meta_rootbp */
1079         if (ds == NULL) {
1080                 tx->tx_pool->dp_meta_rootbp = *bp;
1081         } else {
1082                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1083                 ds->ds_phys->ds_bp = *bp;
1084         }
1085 }
1086
1087 spa_t *
1088 dsl_dataset_get_spa(dsl_dataset_t *ds)
1089 {
1090         return (ds->ds_dir->dd_pool->dp_spa);
1091 }
1092
1093 void
1094 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
1095 {
1096         dsl_pool_t *dp;
1097
1098         if (ds == NULL) /* this is the meta-objset */
1099                 return;
1100
1101         ASSERT(ds->ds_user_ptr != NULL);
1102
1103         if (ds->ds_phys->ds_next_snap_obj != 0)
1104                 panic("dirtying snapshot!");
1105
1106         dp = ds->ds_dir->dd_pool;
1107
1108         if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
1109                 /* up the hold count until we can be written out */
1110                 dmu_buf_add_ref(ds->ds_dbuf, ds);
1111         }
1112 }
1113
1114 /*
1115  * The unique space in the head dataset can be calculated by subtracting
1116  * the space used in the most recent snapshot, that is still being used
1117  * in this file system, from the space currently in use.  To figure out
1118  * the space in the most recent snapshot still in use, we need to take
1119  * the total space used in the snapshot and subtract out the space that
1120  * has been freed up since the snapshot was taken.
1121  */
1122 static void
1123 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
1124 {
1125         uint64_t mrs_used;
1126         uint64_t dlused, dlcomp, dluncomp;
1127
1128         ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj);
1129
1130         if (ds->ds_phys->ds_prev_snap_obj != 0)
1131                 mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
1132         else
1133                 mrs_used = 0;
1134
1135         VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp,
1136             &dluncomp));
1137
1138         ASSERT3U(dlused, <=, mrs_used);
1139         ds->ds_phys->ds_unique_bytes =
1140             ds->ds_phys->ds_used_bytes - (mrs_used - dlused);
1141
1142         if (!DS_UNIQUE_IS_ACCURATE(ds) &&
1143             spa_version(ds->ds_dir->dd_pool->dp_spa) >=
1144             SPA_VERSION_UNIQUE_ACCURATE)
1145                 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1146 }
1147
1148 static uint64_t
1149 dsl_dataset_unique(dsl_dataset_t *ds)
1150 {
1151         if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds))
1152                 dsl_dataset_recalc_head_uniq(ds);
1153
1154         return (ds->ds_phys->ds_unique_bytes);
1155 }
1156
/* Context passed to kill_blkptr() while freeing a dataset's blocks. */
struct killarg {
        dsl_dataset_t *ds;      /* dataset whose blocks are being freed */
        zio_t *zio;             /* parent zio for the free operations */
        dmu_tx_t *tx;           /* transaction the frees are charged to */
};
1162
1163 /* ARGSUSED */
1164 static int
1165 kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
1166 {
1167         struct killarg *ka = arg;
1168         blkptr_t *bp = &bc->bc_blkptr;
1169
1170         ASSERT3U(bc->bc_errno, ==, 0);
1171
1172         ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
1173         (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx);
1174
1175         return (0);
1176 }
1177
1178 /* ARGSUSED */
1179 static int
1180 dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
1181 {
1182         dsl_dataset_t *ds = arg1;
1183         dmu_objset_type_t *ost = arg2;
1184
1185         /*
1186          * We can only roll back to emptyness if it is a ZPL objset.
1187          */
1188         if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0)
1189                 return (EINVAL);
1190
1191         /*
1192          * This must not be a snapshot.
1193          */
1194         if (ds->ds_phys->ds_next_snap_obj != 0)
1195                 return (EINVAL);
1196
1197         /*
1198          * If we made changes this txg, traverse_dsl_dataset won't find
1199          * them.  Try again.
1200          */
1201         if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
1202                 return (EAGAIN);
1203
1204         return (0);
1205 }
1206
/*
 * Sync task performing the rollback validated by
 * dsl_dataset_rollback_check(): discard everything created since the
 * most recent snapshot and reset the dataset's contents from it, or
 * recreate an empty objset of type *arg2 when there is no usable
 * previous snapshot.
 */
/* ARGSUSED */
static void
dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
        dsl_dataset_t *ds = arg1;
        dmu_objset_type_t *ost = arg2;
        objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;

        dmu_buf_will_dirty(ds->ds_dbuf, tx);

        /*
         * Before the roll back destroy the zil.
         */
        if (ds->ds_user_ptr != NULL) {
                zil_rollback_destroy(
                    ((objset_impl_t *)ds->ds_user_ptr)->os_zil, tx);

                /*
                 * We need to make sure that the objset_impl_t is reopened after
                 * we do the rollback, otherwise it will have the wrong
                 * objset_phys_t.  Normally this would happen when this
                 * dataset-open is closed, thus causing the
                 * dataset to be immediately evicted.  But when doing "zfs recv
                 * -F", we reopen the objset before that, so that there is no
                 * window where the dataset is closed and inconsistent.
                 */
                ds->ds_user_evict_func(ds, ds->ds_user_ptr);
                ds->ds_user_ptr = NULL;
        }

        /* Transfer space that was freed since last snap back to the head. */
        {
                uint64_t used;

                VERIFY(0 == bplist_space_birthrange(&ds->ds_deadlist,
                    ds->ds_origin_txg, UINT64_MAX, &used));
                dsl_dir_transfer_space(ds->ds_dir, used,
                    DD_USED_SNAP, DD_USED_HEAD, tx);
        }

        /* Zero out the deadlist. */
        bplist_close(&ds->ds_deadlist);
        bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
        ds->ds_phys->ds_deadlist_obj =
            bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
        VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
            ds->ds_phys->ds_deadlist_obj));

        {
                /* Free blkptrs that we gave birth to */
                zio_t *zio;
                struct killarg ka;

                zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
                    ZIO_FLAG_MUSTSUCCEED);
                ka.ds = ds;
                ka.zio = zio;
                ka.tx = tx;
                (void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
                    ADVANCE_POST, kill_blkptr, &ka);
                (void) zio_wait(zio);
        }

        ASSERT(!(ds->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) ||
            ds->ds_phys->ds_unique_bytes == 0);

        if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) {
                /* Change our contents to that of the prev snapshot */

                ASSERT3U(ds->ds_prev->ds_object, ==,
                    ds->ds_phys->ds_prev_snap_obj);
                ASSERT3U(ds->ds_phys->ds_used_bytes, <=,
                    ds->ds_prev->ds_phys->ds_used_bytes);

                ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
                ds->ds_phys->ds_used_bytes =
                    ds->ds_prev->ds_phys->ds_used_bytes;
                ds->ds_phys->ds_compressed_bytes =
                    ds->ds_prev->ds_phys->ds_compressed_bytes;
                ds->ds_phys->ds_uncompressed_bytes =
                    ds->ds_prev->ds_phys->ds_uncompressed_bytes;
                ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;

                /*
                 * The head now matches prev exactly (all fields copied
                 * above), so nothing remains unique to prev.
                 */
                if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
                        dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
                        ds->ds_prev->ds_phys->ds_unique_bytes = 0;
                }
        } else {
                objset_impl_t *osi;

                /*
                 * No usable previous snapshot (none, or only the pool's
                 * origin snapshot): recreate an empty objset of type *ost.
                 */
                ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0);
                ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0);
                ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0);

                bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t));
                ds->ds_phys->ds_flags = 0;
                ds->ds_phys->ds_unique_bytes = 0;
                if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
                    SPA_VERSION_UNIQUE_ACCURATE)
                        ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

                osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds,
                    &ds->ds_phys->ds_bp, *ost, tx);
#ifdef _KERNEL
                zfs_create_fs(&osi->os, kcred, NULL, tx);
#endif
        }

        spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa,
            tx, cr, "dataset = %llu", ds->ds_object);
}
1318
1319 /* ARGSUSED */
1320 static int
1321 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
1322 {
1323         dsl_dataset_t *ds = arg1;
1324         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1325         uint64_t count;
1326         int err;
1327
1328         /*
1329          * Can't delete a head dataset if there are snapshots of it.
1330          * (Except if the only snapshots are from the branch we cloned
1331          * from.)
1332          */
1333         if (ds->ds_prev != NULL &&
1334             ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
1335                 return (EINVAL);
1336
1337         /*
1338          * This is really a dsl_dir thing, but check it here so that
1339          * we'll be less likely to leave this dataset inconsistent &
1340          * nearly destroyed.
1341          */
1342         err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
1343         if (err)
1344                 return (err);
1345         if (count != 0)
1346                 return (EEXIST);
1347
1348         return (0);
1349 }
1350
1351 /* ARGSUSED */
1352 static void
1353 dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
1354 {
1355         dsl_dataset_t *ds = arg1;
1356         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1357
1358         /* Mark it as inconsistent on-disk, in case we crash */
1359         dmu_buf_will_dirty(ds->ds_dbuf, tx);
1360         ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
1361
1362         spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
1363             cr, "dataset = %llu", ds->ds_object);
1364 }
1365
1366 /* ARGSUSED */
1367 int
1368 dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
1369 {
1370         dsl_dataset_t *ds = arg1;
1371
1372         /* we have an owner hold, so noone else can destroy us */
1373         ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
1374
1375         /* Can't delete a branch point. */
1376         if (ds->ds_phys->ds_num_children > 1)
1377                 return (EEXIST);
1378
1379         /*
1380          * Can't delete a head dataset if there are snapshots of it.
1381          * (Except if the only snapshots are from the branch we cloned
1382          * from.)
1383          */
1384         if (ds->ds_prev != NULL &&
1385             ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
1386                 return (EINVAL);
1387
1388         /*
1389          * If we made changes this txg, traverse_dsl_dataset won't find
1390          * them.  Try again.
1391          */
1392         if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
1393                 return (EAGAIN);
1394
1395         /* XXX we should do some i/o error checking... */
1396         return (0);
1397 }
1398
/* Synchronization state for dsl_dataset_drain_refs(). */
struct refsarg {
        kmutex_t lock;          /* protects 'gone' */
        boolean_t gone;         /* set once the last dbuf ref is dropped */
        kcondvar_t cv;          /* signalled when 'gone' becomes TRUE */
};
1404
1405 /* ARGSUSED */
1406 static void
1407 dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
1408 {
1409         struct refsarg *arg = argv;
1410
1411         mutex_enter(&arg->lock);
1412         arg->gone = TRUE;
1413         cv_signal(&arg->cv);
1414         mutex_exit(&arg->lock);
1415 }
1416
/*
 * Drop the dataset's hold on its phys dbuf and block until every other
 * reference to the dbuf is gone.  The eviction callback registered
 * here (dsl_dataset_refs_gone) fires when the last reference is
 * dropped and wakes us.  On return ds->ds_dbuf and ds->ds_phys are
 * NULL and must not be dereferenced.
 */
static void
dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
{
        struct refsarg arg;

        bzero(&arg, sizeof(arg));
        mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
        arg.gone = FALSE;
        /* Swap our user data for 'arg' so the callback signals us. */
        (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
            dsl_dataset_refs_gone);
        dmu_buf_rele(ds->ds_dbuf, tag);
        mutex_enter(&arg.lock);
        while (!arg.gone)
                cv_wait(&arg.cv, &arg.lock);
        ASSERT(arg.gone);
        mutex_exit(&arg.lock);
        ds->ds_dbuf = NULL;
        ds->ds_phys = NULL;
        mutex_destroy(&arg.lock);
        cv_destroy(&arg.cv);
}
1439
1440 void
1441 dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
1442 {
1443         dsl_dataset_t *ds = arg1;
1444         zio_t *zio;
1445         int err;
1446         int after_branch_point = FALSE;
1447         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1448         objset_t *mos = dp->dp_meta_objset;
1449         dsl_dataset_t *ds_prev = NULL;
1450         uint64_t obj;
1451
1452         ASSERT(ds->ds_owner);
1453         ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
1454         ASSERT(ds->ds_prev == NULL ||
1455             ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
1456         ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
1457
1458         /* signal any waiters that this dataset is going away */
1459         mutex_enter(&ds->ds_lock);
1460         ds->ds_owner = dsl_reaper;
1461         cv_broadcast(&ds->ds_exclusive_cv);
1462         mutex_exit(&ds->ds_lock);
1463
1464         /* Remove our reservation */
1465         if (ds->ds_reserved != 0) {
1466                 uint64_t val = 0;
1467                 dsl_dataset_set_reservation_sync(ds, &val, cr, tx);
1468                 ASSERT3U(ds->ds_reserved, ==, 0);
1469         }
1470
1471         ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
1472
1473         dsl_pool_ds_destroyed(ds, tx);
1474
1475         obj = ds->ds_object;
1476
1477         if (ds->ds_phys->ds_prev_snap_obj != 0) {
1478                 if (ds->ds_prev) {
1479                         ds_prev = ds->ds_prev;
1480                 } else {
1481                         VERIFY(0 == dsl_dataset_hold_obj(dp,
1482                             ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
1483                 }
1484                 after_branch_point =
1485                     (ds_prev->ds_phys->ds_next_snap_obj != obj);
1486
1487                 dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
1488                 if (after_branch_point &&
1489                     ds_prev->ds_phys->ds_next_clones_obj != 0) {
1490                         VERIFY(0 == zap_remove_int(mos,
1491                             ds_prev->ds_phys->ds_next_clones_obj, obj, tx));
1492                         if (ds->ds_phys->ds_next_snap_obj != 0) {
1493                                 VERIFY(0 == zap_add_int(mos,
1494                                     ds_prev->ds_phys->ds_next_clones_obj,
1495                                     ds->ds_phys->ds_next_snap_obj, tx));
1496                         }
1497                 }
1498                 if (after_branch_point &&
1499                     ds->ds_phys->ds_next_snap_obj == 0) {
1500                         /* This clone is toast. */
1501                         ASSERT(ds_prev->ds_phys->ds_num_children > 1);
1502                         ds_prev->ds_phys->ds_num_children--;
1503                 } else if (!after_branch_point) {
1504                         ds_prev->ds_phys->ds_next_snap_obj =
1505                             ds->ds_phys->ds_next_snap_obj;
1506                 }
1507         }
1508
1509         zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
1510
1511         if (ds->ds_phys->ds_next_snap_obj != 0) {
1512                 blkptr_t bp;
1513                 dsl_dataset_t *ds_next;
1514                 uint64_t itor = 0;
1515                 uint64_t old_unique;
1516                 int64_t used = 0, compressed = 0, uncompressed = 0;
1517
1518                 VERIFY(0 == dsl_dataset_hold_obj(dp,
1519                     ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
1520                 ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
1521
1522                 old_unique = dsl_dataset_unique(ds_next);
1523
1524                 dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
1525                 ds_next->ds_phys->ds_prev_snap_obj =
1526                     ds->ds_phys->ds_prev_snap_obj;
1527                 ds_next->ds_phys->ds_prev_snap_txg =
1528                     ds->ds_phys->ds_prev_snap_txg;
1529                 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
1530                     ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
1531
1532                 /*
1533                  * Transfer to our deadlist (which will become next's
1534                  * new deadlist) any entries from next's current
1535                  * deadlist which were born before prev, and free the
1536                  * other entries.
1537                  *
1538                  * XXX we're doing this long task with the config lock held
1539                  */
1540                 while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) {
1541                         if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
1542                                 VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
1543                                     &bp, tx));
1544                                 if (ds_prev && !after_branch_point &&
1545                                     bp.blk_birth >
1546                                     ds_prev->ds_phys->ds_prev_snap_txg) {
1547                                         ds_prev->ds_phys->ds_unique_bytes +=
1548                                             bp_get_dasize(dp->dp_spa, &bp);
1549                                 }
1550                         } else {
1551                                 used += bp_get_dasize(dp->dp_spa, &bp);
1552                                 compressed += BP_GET_PSIZE(&bp);
1553                                 uncompressed += BP_GET_UCSIZE(&bp);
1554                                 /* XXX check return value? */
1555                                 (void) dsl_free(zio, dp, tx->tx_txg,
1556                                     &bp, NULL, NULL, ARC_NOWAIT);
1557                         }
1558                 }
1559
1560                 ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
1561
1562                 /* change snapused */
1563                 dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1564                     -used, -compressed, -uncompressed, tx);
1565
1566                 /* free next's deadlist */
1567                 bplist_close(&ds_next->ds_deadlist);
1568                 bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
1569
1570                 /* set next's deadlist to our deadlist */
1571                 bplist_close(&ds->ds_deadlist);
1572                 ds_next->ds_phys->ds_deadlist_obj =
1573                     ds->ds_phys->ds_deadlist_obj;
1574                 VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
1575                     ds_next->ds_phys->ds_deadlist_obj));
1576                 ds->ds_phys->ds_deadlist_obj = 0;
1577
1578                 if (ds_next->ds_phys->ds_next_snap_obj != 0) {
1579                         /*
1580                          * Update next's unique to include blocks which
1581                          * were previously shared by only this snapshot
1582                          * and it.  Those blocks will be born after the
1583                          * prev snap and before this snap, and will have
1584                          * died after the next snap and before the one
1585                          * after that (ie. be on the snap after next's
1586                          * deadlist).
1587                          *
1588                          * XXX we're doing this long task with the
1589                          * config lock held
1590                          */
1591                         dsl_dataset_t *ds_after_next;
1592                         uint64_t space;
1593
1594                         VERIFY(0 == dsl_dataset_hold_obj(dp,
1595                             ds_next->ds_phys->ds_next_snap_obj,
1596                             FTAG, &ds_after_next));
1597
1598                         VERIFY(0 ==
1599                             bplist_space_birthrange(&ds_after_next->ds_deadlist,
1600                             ds->ds_phys->ds_prev_snap_txg,
1601                             ds->ds_phys->ds_creation_txg, &space));
1602                         ds_next->ds_phys->ds_unique_bytes += space;
1603
1604                         dsl_dataset_rele(ds_after_next, FTAG);
1605                         ASSERT3P(ds_next->ds_prev, ==, NULL);
1606                 } else {
1607                         ASSERT3P(ds_next->ds_prev, ==, ds);
1608                         dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
1609                         ds_next->ds_prev = NULL;
1610                         if (ds_prev) {
1611                                 VERIFY(0 == dsl_dataset_get_ref(dp,
1612                                     ds->ds_phys->ds_prev_snap_obj,
1613                                     ds_next, &ds_next->ds_prev));
1614                         }
1615
1616                         dsl_dataset_recalc_head_uniq(ds_next);
1617
1618                         /*
1619                          * Reduce the amount of our unconsmed refreservation
1620                          * being charged to our parent by the amount of
1621                          * new unique data we have gained.
1622                          */
1623                         if (old_unique < ds_next->ds_reserved) {
1624                                 int64_t mrsdelta;
1625                                 uint64_t new_unique =
1626                                     ds_next->ds_phys->ds_unique_bytes;
1627
1628                                 ASSERT(old_unique <= new_unique);
1629                                 mrsdelta = MIN(new_unique - old_unique,
1630                                     ds_next->ds_reserved - old_unique);
1631                                 dsl_dir_diduse_space(ds->ds_dir,
1632                                     DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
1633                         }
1634                 }
1635                 dsl_dataset_rele(ds_next, FTAG);
1636         } else {
1637                 /*
1638                  * There's no next snapshot, so this is a head dataset.
1639                  * Destroy the deadlist.  Unless it's a clone, the
1640                  * deadlist should be empty.  (If it's a clone, it's
1641                  * safe to ignore the deadlist contents.)
1642                  */
1643                 struct killarg ka;
1644
1645                 ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
1646                 bplist_close(&ds->ds_deadlist);
1647                 bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
1648                 ds->ds_phys->ds_deadlist_obj = 0;
1649
1650                 /*
1651                  * Free everything that we point to (that's born after
1652                  * the previous snapshot, if we are a clone)
1653                  *
1654                  * NB: this should be very quick, because we already
1655                  * freed all the objects in open context.
1656                  */
1657                 ka.ds = ds;
1658                 ka.zio = zio;
1659                 ka.tx = tx;
1660                 err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
1661                     ADVANCE_POST, kill_blkptr, &ka);
1662                 ASSERT3U(err, ==, 0);
1663                 ASSERT(spa_version(dp->dp_spa) < SPA_VERSION_UNIQUE_ACCURATE ||
1664                     ds->ds_phys->ds_unique_bytes == 0);
1665         }
1666
1667         err = zio_wait(zio);
1668         ASSERT3U(err, ==, 0);
1669
1670         if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
1671                 /* Erase the link in the dir */
1672                 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
1673                 ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
1674                 ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
1675                 err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
1676                 ASSERT(err == 0);
1677         } else {
1678                 /* remove from snapshot namespace */
1679                 dsl_dataset_t *ds_head;
1680                 ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
1681                 VERIFY(0 == dsl_dataset_hold_obj(dp,
1682                     ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
1683                 VERIFY(0 == dsl_dataset_get_snapname(ds));
1684 #ifdef ZFS_DEBUG
1685                 {
1686                         uint64_t val;
1687
1688                         err = dsl_dataset_snap_lookup(ds_head,
1689                             ds->ds_snapname, &val);
1690                         ASSERT3U(err, ==, 0);
1691                         ASSERT3U(val, ==, obj);
1692                 }
1693 #endif
1694                 err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
1695                 ASSERT(err == 0);
1696                 dsl_dataset_rele(ds_head, FTAG);
1697         }
1698
1699         if (ds_prev && ds->ds_prev != ds_prev)
1700                 dsl_dataset_rele(ds_prev, FTAG);
1701
1702         spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
1703         spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx,
1704             cr, "dataset = %llu", ds->ds_object);
1705
1706         if (ds->ds_phys->ds_next_clones_obj != 0) {
1707                 uint64_t count;
1708                 ASSERT(0 == zap_count(mos,
1709                     ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
1710                 VERIFY(0 == dmu_object_free(mos,
1711                     ds->ds_phys->ds_next_clones_obj, tx));
1712         }
1713         if (ds->ds_phys->ds_props_obj != 0)
1714                 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
1715         dsl_dir_close(ds->ds_dir, ds);
1716         ds->ds_dir = NULL;
1717         dsl_dataset_drain_refs(ds, tag);
1718         VERIFY(0 == dmu_object_free(mos, obj, tx));
1719 }
1720
1721 static int
1722 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
1723 {
1724         uint64_t asize;
1725
1726         if (!dmu_tx_is_syncing(tx))
1727                 return (0);
1728
1729         /*
1730          * If there's an fs-only reservation, any blocks that might become
1731          * owned by the snapshot dataset must be accommodated by space
1732          * outside of the reservation.
1733          */
1734         asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
1735         if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE))
1736                 return (ENOSPC);
1737
1738         /*
1739          * Propogate any reserved space for this snapshot to other
1740          * snapshot checks in this sync group.
1741          */
1742         if (asize > 0)
1743                 dsl_dir_willuse_space(ds->ds_dir, asize, tx);
1744
1745         return (0);
1746 }
1747
1748 /* ARGSUSED */
1749 int
1750 dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
1751 {
1752         dsl_dataset_t *ds = arg1;
1753         const char *snapname = arg2;
1754         int err;
1755         uint64_t value;
1756
1757         /*
1758          * We don't allow multiple snapshots of the same txg.  If there
1759          * is already one, try again.
1760          */
1761         if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
1762                 return (EAGAIN);
1763
1764         /*
1765          * Check for conflicting name snapshot name.
1766          */
1767         err = dsl_dataset_snap_lookup(ds, snapname, &value);
1768         if (err == 0)
1769                 return (EEXIST);
1770         if (err != ENOENT)
1771                 return (err);
1772
1773         /*
1774          * Check that the dataset's name is not too long.  Name consists
1775          * of the dataset's length + 1 for the @-sign + snapshot name's length
1776          */
1777         if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
1778                 return (ENAMETOOLONG);
1779
1780         err = dsl_dataset_snapshot_reserve_space(ds, tx);
1781         if (err)
1782                 return (err);
1783
1784         ds->ds_trysnap_txg = tx->tx_txg;
1785         return (0);
1786 }
1787
/*
 * Sync-task function that creates a snapshot of 'ds' named 'snapname'.
 * Allocates a new DSL dataset object and populates it from the head's
 * current state, then rewires the head (and its previous snapshot, if
 * any) so the new object becomes the most recent snapshot.  Runs in
 * syncing context with the pool config lock held for writing.
 */
void
dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
        dsl_dataset_t *ds = arg1;
        const char *snapname = arg2;
        dsl_pool_t *dp = ds->ds_dir->dd_pool;
        dmu_buf_t *dbuf;
        dsl_dataset_phys_t *dsphys;
        uint64_t dsobj, crtxg;
        objset_t *mos = dp->dp_meta_objset;
        int err;

        ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));

        /*
         * The origin's ds_creation_txg has to be < TXG_INITIAL
         */
        if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
                crtxg = 1;
        else
                crtxg = tx->tx_txg;

        /*
         * Allocate the snapshot's dataset object and initialize its
         * phys block from the head's current phys: the snapshot shares
         * the head's deadlist, space accounting, and root bp.
         */
        dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
            DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
        VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
        dmu_buf_will_dirty(dbuf, tx);
        dsphys = dbuf->db_data;
        bzero(dsphys, sizeof (dsl_dataset_phys_t));
        dsphys->ds_dir_obj = ds->ds_dir->dd_object;
        dsphys->ds_fsid_guid = unique_create();
        (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
            sizeof (dsphys->ds_guid));
        dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
        dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
        /* The snapshot's "next" is the head it was taken from. */
        dsphys->ds_next_snap_obj = ds->ds_object;
        dsphys->ds_num_children = 1;
        dsphys->ds_creation_time = gethrestime_sec();
        dsphys->ds_creation_txg = crtxg;
        dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
        dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
        dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
        dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
        dsphys->ds_flags = ds->ds_phys->ds_flags;
        dsphys->ds_bp = ds->ds_phys->ds_bp;
        dmu_buf_rele(dbuf, FTAG);

        /*
         * Splice the new snapshot in after the head's previous
         * snapshot: either update prev's next pointer (normal case),
         * or, for a clone, move the next-clones ZAP entry from the
         * head to the new snapshot.
         */
        ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
        if (ds->ds_prev) {
                uint64_t next_clones_obj =
                    ds->ds_prev->ds_phys->ds_next_clones_obj;
                ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
                    ds->ds_object ||
                    ds->ds_prev->ds_phys->ds_num_children > 1);
                if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
                        dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
                        ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
                            ds->ds_prev->ds_phys->ds_creation_txg);
                        ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
                } else if (next_clones_obj != 0) {
                        VERIFY3U(0, ==, zap_remove_int(mos,
                            next_clones_obj, dsphys->ds_next_snap_obj, tx));
                        VERIFY3U(0, ==, zap_add_int(mos,
                            next_clones_obj, dsobj, tx));
                }
        }

        /*
         * If we have a reference-reservation on this dataset, we will
         * need to increase the amount of refreservation being charged
         * since our unique space is going to zero.
         */
        if (ds->ds_reserved) {
                int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
                dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
                    add, 0, 0, tx);
        }

        /*
         * The head hands its deadlist to the snapshot and starts a
         * fresh, empty one; its unique space drops to zero since all
         * of its blocks are now shared with the snapshot.
         */
        bplist_close(&ds->ds_deadlist);
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
        ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
        ds->ds_phys->ds_prev_snap_obj = dsobj;
        ds->ds_phys->ds_prev_snap_txg = crtxg;
        ds->ds_phys->ds_unique_bytes = 0;
        if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
                ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
        ds->ds_phys->ds_deadlist_obj =
            bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
        VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
            ds->ds_phys->ds_deadlist_obj));

        /* Enter the snapshot into the head's snapshot-name ZAP. */
        dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
        err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
            snapname, 8, 1, &dsobj, tx);
        ASSERT(err == 0);

        /* Re-point the head's in-core ds_prev at the new snapshot. */
        if (ds->ds_prev)
                dsl_dataset_drop_ref(ds->ds_prev, ds);
        VERIFY(0 == dsl_dataset_get_ref(dp,
            ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));

        dsl_pool_ds_snapshotted(ds, tx);

        spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr,
            "dataset = %llu", dsobj);
}
1893
1894 void
1895 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
1896 {
1897         ASSERT(dmu_tx_is_syncing(tx));
1898         ASSERT(ds->ds_user_ptr != NULL);
1899         ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
1900
1901         /*
1902          * in case we had to change ds_fsid_guid when we opened it,
1903          * sync it out now.
1904          */
1905         dmu_buf_will_dirty(ds->ds_dbuf, tx);
1906         ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
1907
1908         dsl_dir_dirty(ds->ds_dir, tx);
1909         dmu_objset_sync(ds->ds_user_ptr, zio, tx);
1910 }
1911
1912 void
1913 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
1914 {
1915         uint64_t refd, avail, uobjs, aobjs;
1916
1917         dsl_dir_stats(ds->ds_dir, nv);
1918
1919         dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
1920         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
1921         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
1922
1923         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
1924             ds->ds_phys->ds_creation_time);
1925         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
1926             ds->ds_phys->ds_creation_txg);
1927         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
1928             ds->ds_quota);
1929         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
1930             ds->ds_reserved);
1931         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
1932             ds->ds_phys->ds_guid);
1933
1934         if (ds->ds_phys->ds_next_snap_obj) {
1935                 /*
1936                  * This is a snapshot; override the dd's space used with
1937                  * our unique space and compression ratio.
1938                  */
1939                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
1940                     ds->ds_phys->ds_unique_bytes);
1941                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
1942                     ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
1943                     (ds->ds_phys->ds_uncompressed_bytes * 100 /
1944                     ds->ds_phys->ds_compressed_bytes));
1945         }
1946 }
1947
1948 void
1949 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
1950 {
1951         stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
1952         stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
1953         stat->dds_guid = ds->ds_phys->ds_guid;
1954         if (ds->ds_phys->ds_next_snap_obj) {
1955                 stat->dds_is_snapshot = B_TRUE;
1956                 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
1957         }
1958
1959         /* clone origin is really a dsl_dir thing... */
1960         rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
1961         if (dsl_dir_is_clone(ds->ds_dir)) {
1962                 dsl_dataset_t *ods;
1963
1964                 VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool,
1965                     ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
1966                 dsl_dataset_name(ods, stat->dds_origin);
1967                 dsl_dataset_drop_ref(ods, FTAG);
1968         }
1969         rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
1970 }
1971
1972 uint64_t
1973 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
1974 {
1975         return (ds->ds_fsid_guid);
1976 }
1977
1978 void
1979 dsl_dataset_space(dsl_dataset_t *ds,
1980     uint64_t *refdbytesp, uint64_t *availbytesp,
1981     uint64_t *usedobjsp, uint64_t *availobjsp)
1982 {
1983         *refdbytesp = ds->ds_phys->ds_used_bytes;
1984         *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
1985         if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
1986                 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
1987         if (ds->ds_quota != 0) {
1988                 /*
1989                  * Adjust available bytes according to refquota
1990                  */
1991                 if (*refdbytesp < ds->ds_quota)
1992                         *availbytesp = MIN(*availbytesp,
1993                             ds->ds_quota - *refdbytesp);
1994                 else
1995                         *availbytesp = 0;
1996         }
1997         *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
1998         *availobjsp = DN_MAX_OBJECT - *usedobjsp;
1999 }
2000
2001 boolean_t
2002 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
2003 {
2004         dsl_pool_t *dp = ds->ds_dir->dd_pool;
2005
2006         ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
2007             dsl_pool_sync_context(dp));
2008         if (ds->ds_prev == NULL)
2009                 return (B_FALSE);
2010         if (ds->ds_phys->ds_bp.blk_birth >
2011             ds->ds_prev->ds_phys->ds_creation_txg)
2012                 return (B_TRUE);
2013         return (B_FALSE);
2014 }
2015
2016 /* ARGSUSED */
2017 static int
2018 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
2019 {
2020         dsl_dataset_t *ds = arg1;
2021         char *newsnapname = arg2;
2022         dsl_dir_t *dd = ds->ds_dir;
2023         dsl_dataset_t *hds;
2024         uint64_t val;
2025         int err;
2026
2027         err = dsl_dataset_hold_obj(dd->dd_pool,
2028             dd->dd_phys->dd_head_dataset_obj, FTAG, &hds);
2029         if (err)
2030                 return (err);
2031
2032         /* new name better not be in use */
2033         err = dsl_dataset_snap_lookup(hds, newsnapname, &val);
2034         dsl_dataset_rele(hds, FTAG);
2035
2036         if (err == 0)
2037                 err = EEXIST;
2038         else if (err == ENOENT)
2039                 err = 0;
2040
2041         /* dataset name + 1 for the "@" + the new snapshot name must fit */
2042         if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
2043                 err = ENAMETOOLONG;
2044
2045         return (err);
2046 }
2047
/*
 * Sync-task function for a snapshot rename: remove the old entry from
 * the head's snapshot-name ZAP, update the in-core name, and add the
 * entry back under the new name.  The check function has already
 * verified the new name is unused and fits.
 */
static void
dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2,
    cred_t *cr, dmu_tx_t *tx)
{
        dsl_dataset_t *ds = arg1;
        const char *newsnapname = arg2;
        dsl_dir_t *dd = ds->ds_dir;
        objset_t *mos = dd->dd_pool->dp_meta_objset;
        dsl_dataset_t *hds;
        int err;

        /* Only snapshots (datasets with a next snapshot) are renamed here. */
        ASSERT(ds->ds_phys->ds_next_snap_obj != 0);

        /* The head dataset owns the snapshot namespace. */
        VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
            dd->dd_phys->dd_head_dataset_obj, FTAG, &hds));

        /* Refresh ds_snapname, then drop the old ZAP entry. */
        VERIFY(0 == dsl_dataset_get_snapname(ds));
        err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx);
        ASSERT3U(err, ==, 0);
        /* Swap in the new in-core name under ds_lock. */
        mutex_enter(&ds->ds_lock);
        (void) strcpy(ds->ds_snapname, newsnapname);
        mutex_exit(&ds->ds_lock);
        /* Re-add the entry under the new name, pointing at this object. */
        err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
            ds->ds_snapname, 8, 1, &ds->ds_object, tx);
        ASSERT3U(err, ==, 0);

        spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
            cr, "dataset = %llu", ds->ds_object);
        dsl_dataset_rele(hds, FTAG);
}
2078
/*
 * State shared between dsl_recursive_rename() and its per-filesystem
 * callback dsl_snapshot_rename_one().
 */
struct renamesnaparg {
        dsl_sync_task_group_t *dstg;    /* one rename sync task per fs */
        char failed[MAXPATHLEN];        /* name of the dataset that failed */
        char *oldsnap;                  /* old snapshot name (after '@') */
        char *newsnap;                  /* new snapshot name (after '@') */
};
2085
/*
 * dmu_objset_find() callback for a recursive snapshot rename.  For the
 * filesystem 'name', check rename permission on name@oldsnap, unmount
 * the snapshot if mounted, hold it, and queue a rename sync task on
 * ra->dstg.  A missing snapshot (ENOENT) is silently skipped; any other
 * error records the failing name in ra->failed and aborts the walk.
 */
static int
dsl_snapshot_rename_one(char *name, void *arg)
{
        struct renamesnaparg *ra = arg;
        dsl_dataset_t *ds = NULL;
        char *cp;
        int err;

        /*
         * Extend 'name' in place to "name@oldsnap".  NOTE(review):
         * this writes past strlen(name); assumes the caller's buffer
         * is large enough for the full snapshot name -- confirm.
         */
        cp = name + strlen(name);
        *cp = '@';
        (void) strcpy(cp + 1, ra->oldsnap);

        /*
         * For recursive snapshot renames the parent won't be changing
         * so we just pass name for both the to/from argument.
         */
        err = zfs_secpolicy_rename_perms(name, name, CRED());
        if (err == ENOENT) {
                /* NOTE(review): 'name' is not restored here -- verify
                 * the caller does not reuse the buffer afterwards. */
                return (0);
        } else if (err) {
                (void) strcpy(ra->failed, name);
                return (err);
        }

#ifdef _KERNEL
        /*
         * For all filesystems undergoing rename, we'll need to unmount it.
         */
        (void) zfs_unmount_snap(name, NULL);
#endif
        /* Hold the snapshot, tagged with the task group. */
        err = dsl_dataset_hold(name, ra->dstg, &ds);
        *cp = '\0';     /* restore the filesystem-only name */
        if (err == ENOENT) {
                return (0);
        } else if (err) {
                (void) strcpy(ra->failed, name);
                return (err);
        }

        dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
            dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);

        return (0);
}
2130
/*
 * Rename the snapshot 'oldname' (a full <fs>@<snap> name) to the new
 * snapshot name across the filesystem and all of its descendants, as
 * a single sync task group so the renames commit together.  On error,
 * 'oldname' is overwritten with the name of the dataset that failed.
 */
static int
dsl_recursive_rename(char *oldname, const char *newname)
{
        int err;
        struct renamesnaparg *ra;
        dsl_sync_task_t *dst;
        spa_t *spa;
        char *cp, *fsname = spa_strdup(oldname);
        int len = strlen(oldname);

        /* truncate the snapshot name to get the fsname */
        cp = strchr(fsname, '@');
        *cp = '\0';

        err = spa_open(fsname, &spa, FTAG);
        if (err) {
                kmem_free(fsname, len + 1);
                return (err);
        }
        ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
        ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));

        /* Snapshot portions follow the '@' in each full name. */
        ra->oldsnap = strchr(oldname, '@') + 1;
        ra->newsnap = strchr(newname, '@') + 1;
        *ra->failed = '\0';

        /* Queue one rename task per filesystem in the subtree. */
        err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
            DS_FIND_CHILDREN);
        kmem_free(fsname, len + 1);

        if (err == 0) {
                err = dsl_sync_task_group_wait(ra->dstg);
        }

        /*
         * Release every held dataset, recording the full snapshot name
         * of any task that failed.
         */
        for (dst = list_head(&ra->dstg->dstg_tasks); dst;
            dst = list_next(&ra->dstg->dstg_tasks, dst)) {
                dsl_dataset_t *ds = dst->dst_arg1;
                if (dst->dst_err) {
                        dsl_dir_name(ds->ds_dir, ra->failed);
                        (void) strcat(ra->failed, "@");
                        (void) strcat(ra->failed, ra->newsnap);
                }
                dsl_dataset_rele(ds, ra->dstg);
        }

        /* Report the failing name back to the caller via 'oldname'. */
        if (err)
                (void) strcpy(oldname, ra->failed);

        dsl_sync_task_group_destroy(ra->dstg);
        kmem_free(ra, sizeof (struct renamesnaparg));
        spa_close(spa, FTAG);
        return (err);
}
2184
2185 static int
2186 dsl_valid_rename(char *oldname, void *arg)
2187 {
2188         int delta = *(int *)arg;
2189
2190         if (strlen(oldname) + delta >= MAXNAMELEN)
2191                 return (ENAMETOOLONG);
2192
2193         return (0);
2194 }
2195
2196 #pragma weak dmu_objset_rename = dsl_dataset_rename
2197 int
2198 dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
2199 {
2200         dsl_dir_t *dd;
2201         dsl_dataset_t *ds;
2202         const char *tail;
2203         int err;
2204
2205         err = dsl_dir_open(oldname, FTAG, &dd, &tail);
2206         if (err)
2207                 return (err);
2208         if (tail == NULL) {
2209                 int delta = strlen(newname) - strlen(oldname);
2210
2211                 /* if we're growing, validate child name lengths */
2212                 if (delta > 0)
2213                         err = dmu_objset_find(oldname, dsl_valid_rename,
2214                             &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
2215
2216                 if (!err)
2217                         err = dsl_dir_rename(dd, newname);
2218                 dsl_dir_close(dd, FTAG);
2219                 return (err);
2220         }
2221         if (tail[0] != '@') {
2222                 /* the name ended in a nonexistant component */
2223                 dsl_dir_close(dd, FTAG);
2224                 return (ENOENT);
2225         }
2226
2227         dsl_dir_close(dd, FTAG);
2228
2229         /* new name must be snapshot in same filesystem */
2230         tail = strchr(newname, '@');
2231         if (tail == NULL)
2232                 return (EINVAL);
2233         tail++;
2234         if (strncmp(oldname, newname, tail - newname) != 0)
2235                 return (EXDEV);
2236
2237         if (recursive) {
2238                 err = dsl_recursive_rename(oldname, newname);
2239         } else {
2240                 err = dsl_dataset_hold(oldname, FTAG, &ds);
2241                 if (err)
2242                         return (err);
2243
2244                 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
2245                     dsl_dataset_snapshot_rename_check,
2246                     dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
2247
2248                 dsl_dataset_rele(ds, FTAG);
2249         }
2250
2251         return (err);
2252 }
2253
/*
 * One held snapshot on a promotion list (see struct promotearg).
 */
struct promotenode {
        list_node_t link;       /* linkage within the containing list_t */
        dsl_dataset_t *ds;      /* the held snapshot dataset */
};
2258
/*
 * Arguments shared by the promote check/sync functions: the snapshot
 * lists involved in the promotion and the space totals computed by
 * dsl_dataset_promote_check().
 */
struct promotearg {
        /* lists of struct promotenode (snapshots to examine/move) */
        list_t shared_snaps, origin_snaps, clone_snaps;
        dsl_dataset_t *origin_origin, *origin_head;
        /* space accounting computed during the check phase */
        uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
};
2264
2265 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
2266
/* ARGSUSED */
/*
 * Sync-task check function for "zfs promote": verify that the clone
 * head 'hds' may take over its origin's snapshots, and precompute the
 * space that will be transferred (pa->used/comp/uncomp), the origin's
 * new unique space (pa->unique), and the usedsnap totals for both
 * sides (pa->cloneusedsnap, pa->originusedsnap).
 */
static int
dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
        dsl_dataset_t *hds = arg1;
        struct promotearg *pa = arg2;
        struct promotenode *snap = list_head(&pa->shared_snaps);
        dsl_dataset_t *origin_ds = snap->ds;
        int err;

        /* Check that it is a real clone */
        if (!dsl_dir_is_clone(hds->ds_dir))
                return (EINVAL);

        /* Since this is so expensive, don't do the preliminary check */
        if (!dmu_tx_is_syncing(tx))
                return (0);

        if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)
                return (EXDEV);

        /* compute origin's new unique space */
        snap = list_tail(&pa->clone_snaps);
        ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
        err = bplist_space_birthrange(&snap->ds->ds_deadlist,
            origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique);
        if (err)
                return (err);

        /*
         * Walk the snapshots that we are moving
         *
         * Compute space to transfer.  Consider the incremental changes
         * to used for each snapshot:
         * (my used) = (prev's used) + (blocks born) - (blocks killed)
         * So each snapshot gave birth to:
         * (blocks born) = (my used) - (prev's used) + (blocks killed)
         * So a sequence would look like:
         * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
         * Which simplifies to:
         * uN + kN + kN-1 + ... + k1 + k0
         * Note however, if we stop before we reach the ORIGIN we get:
         * uN + kN + kN-1 + ... + kM - uM-1
         */
        pa->used = origin_ds->ds_phys->ds_used_bytes;
        pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
        pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
        for (snap = list_head(&pa->shared_snaps); snap;
            snap = list_next(&pa->shared_snaps, snap)) {
                uint64_t val, dlused, dlcomp, dluncomp;
                dsl_dataset_t *ds = snap->ds;

                /* Check that the snapshot name does not conflict */
                VERIFY(0 == dsl_dataset_get_snapname(ds));
                err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
                if (err == 0)
                        return (EEXIST);
                if (err != ENOENT)
                        return (err);

                /* The very first snapshot does not have a deadlist */
                if (ds->ds_phys->ds_prev_snap_obj == 0)
                        continue;

                /* (assignment within the condition is intentional) */
                if (err = bplist_space(&ds->ds_deadlist,
                    &dlused, &dlcomp, &dluncomp))
                        return (err);
                pa->used += dlused;
                pa->comp += dlcomp;
                pa->uncomp += dluncomp;
        }

        /*
         * If we are a clone of a clone then we never reached ORIGIN,
         * so we need to subtract out the clone origin's used space.
         */
        if (pa->origin_origin) {
                pa->used -= pa->origin_origin->ds_phys->ds_used_bytes;
                pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
                pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
        }

        /* Check that there is enough space here */
        err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
            pa->used);
        if (err)
                return (err);

        /*
         * Compute the amounts of space that will be used by snapshots
         * after the promotion (for both origin and clone).  For each,
         * it is the amount of space that will be on all of their
         * deadlists (that was not born before their new origin).
         */
        if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
                uint64_t space;

                /*
                 * Note, typically this will not be a clone of a clone,
                 * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so
                 * these snaplist_space() -> bplist_space_birthrange()
                 * calls will be fast because they do not have to
                 * iterate over all bps.
                 */
                snap = list_head(&pa->origin_snaps);
                err = snaplist_space(&pa->shared_snaps,
                    snap->ds->ds_origin_txg, &pa->cloneusedsnap);
                if (err)
                        return (err);

                err = snaplist_space(&pa->clone_snaps,
                    snap->ds->ds_origin_txg, &space);
                if (err)
                        return (err);
                pa->cloneusedsnap += space;
        }
        if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
                err = snaplist_space(&pa->origin_snaps,
                    origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap);
                if (err)
                        return (err);
        }

        return (0);
}
2392
/*
 * Sync task that performs a clone promotion (see dsl_dataset_promote()).
 * hds (arg1) is the clone being promoted; pa (arg2) holds the snapshot
 * lists and space totals assembled by dsl_dataset_promote() and the
 * promote check function.  All validation already happened in the check
 * function, so any failure here is fatal (hence the VERIFYs).
 */
static void
dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
        dsl_dataset_t *hds = arg1;
        struct promotearg *pa = arg2;
        struct promotenode *snap = list_head(&pa->shared_snaps);
        dsl_dataset_t *origin_ds = snap->ds;    /* oldest shared snapshot */
        dsl_dataset_t *origin_head;
        dsl_dir_t *dd = hds->ds_dir;
        dsl_pool_t *dp = hds->ds_dir->dd_pool;
        dsl_dir_t *odd = NULL;
        uint64_t oldnext_obj;
        int64_t delta;

        ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));

        snap = list_head(&pa->origin_snaps);
        origin_head = snap->ds;

        /*
         * We need to explicitly open odd, since origin_ds's dd will be
         * changing.
         */
        VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
            NULL, FTAG, &odd));

        /* change origin's next snap to the newest snapshot the clone shares */
        dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
        oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
        snap = list_tail(&pa->clone_snaps);
        ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
        origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;

        /*
         * change the origin's next clone: remove the new next snapshot
         * (set just above) from the next_clones zap and add the old one,
         * which is now a clone branch point.
         */
        if (origin_ds->ds_phys->ds_next_clones_obj) {
                VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
                    origin_ds->ds_phys->ds_next_clones_obj,
                    origin_ds->ds_phys->ds_next_snap_obj, tx));
                VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
                    origin_ds->ds_phys->ds_next_clones_obj,
                    oldnext_obj, tx));
        }

        /*
         * change origin: the promoted dir inherits the old origin dir's
         * origin, and the old origin head becomes a clone of origin_ds.
         */
        dmu_buf_will_dirty(dd->dd_dbuf, tx);
        ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
        dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
        hds->ds_origin_txg = origin_head->ds_origin_txg;
        dmu_buf_will_dirty(odd->dd_dbuf, tx);
        odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
        origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg;

        /* move snapshots to this dir */
        for (snap = list_head(&pa->shared_snaps); snap;
            snap = list_next(&pa->shared_snaps, snap)) {
                dsl_dataset_t *ds = snap->ds;

                /* unregister props as dsl_dir is changing */
                if (ds->ds_user_ptr) {
                        ds->ds_user_evict_func(ds, ds->ds_user_ptr);
                        ds->ds_user_ptr = NULL;
                }
                /* move snap name entry from the old head's zap to ours */
                VERIFY(0 == dsl_dataset_get_snapname(ds));
                VERIFY(0 == dsl_dataset_snap_remove(origin_head,
                    ds->ds_snapname, tx));
                VERIFY(0 == zap_add(dp->dp_meta_objset,
                    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
                    8, 1, &ds->ds_object, tx));
                /* change containing dsl_dir */
                dmu_buf_will_dirty(ds->ds_dbuf, tx);
                ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
                ds->ds_phys->ds_dir_obj = dd->dd_object;
                ASSERT3P(ds->ds_dir, ==, odd);
                dsl_dir_close(ds->ds_dir, ds);
                VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
                    NULL, ds, &ds->ds_dir));

                ASSERT3U(dsl_prop_numcb(ds), ==, 0);
        }

        /*
         * Change space accounting.
         * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
         * both be valid, or both be 0 (resulting in delta == 0).  This
         * is true for each of {clone,origin} independently.
         */

        /* the promoted dir gains the snapshot space it inherits ... */
        delta = pa->cloneusedsnap -
            dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
        ASSERT3S(delta, >=, 0);
        ASSERT3U(pa->used, >=, delta);
        dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
        dsl_dir_diduse_space(dd, DD_USED_HEAD,
            pa->used - delta, pa->comp, pa->uncomp, tx);

        /* ... and the old origin dir loses it */
        delta = pa->originusedsnap -
            odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
        ASSERT3S(delta, <=, 0);
        ASSERT3U(pa->used, >=, -delta);
        dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
        dsl_dir_diduse_space(odd, DD_USED_HEAD,
            -pa->used - delta, -pa->comp, -pa->uncomp, tx);

        /* reset origin's unique bytes to the precomputed value */
        origin_ds->ds_phys->ds_unique_bytes = pa->unique;

        /* log history record */
        spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
            cr, "dataset = %llu", hds->ds_object);

        dsl_dir_close(odd, FTAG);
}
2505
static char *snaplist_tag = "snaplist";
/*
 * Make a list of dsl_dataset_t's for the snapshots between first_obj
 * (exclusive) and last_obj (inclusive).  The list will be in reverse
 * order (last_obj will be the list_head()).  If first_obj == 0, do all
 * snapshots back to this dataset's origin.
 *
 * If 'own' is set, each snapshot is owned (and made exclusive) rather
 * than merely held; snaplist_destroy() must be called with the same
 * 'own' flag.  On error the partially-built list is left for the
 * caller to clean up with snaplist_destroy().
 */
static int
snaplist_make(dsl_pool_t *dp, boolean_t own,
    uint64_t first_obj, uint64_t last_obj, list_t *l)
{
        uint64_t obj = last_obj;

        ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));

        list_create(l, sizeof (struct promotenode),
            offsetof(struct promotenode, link));

        /* walk the snapshot chain backwards, starting at last_obj */
        while (obj != first_obj) {
                dsl_dataset_t *ds;
                struct promotenode *snap;
                int err;

                if (own) {
                        err = dsl_dataset_own_obj(dp, obj,
                            0, snaplist_tag, &ds);
                        if (err == 0)
                                dsl_dataset_make_exclusive(ds, snaplist_tag);
                } else {
                        err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds);
                }
                if (err == ENOENT) {
                        /* lost race with snapshot destroy */
                        struct promotenode *last = list_tail(l);
                        /*
                         * NOTE(review): assumes the list is non-empty here
                         * (last != NULL), i.e. last_obj itself cannot
                         * vanish -- confirm callers guarantee this.  The
                         * destroy must have rewritten last's prev_snap_obj
                         * to skip the destroyed snapshot, so follow it.
                         */
                        ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj);
                        obj = last->ds->ds_phys->ds_prev_snap_obj;
                        continue;
                } else if (err) {
                        return (err);
                }

                /* resolve "back to the origin" once we can see the dir */
                if (first_obj == 0)
                        first_obj = ds->ds_dir->dd_phys->dd_origin_obj;

                snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP);
                snap->ds = ds;
                list_insert_tail(l, snap);
                obj = ds->ds_phys->ds_prev_snap_obj;
        }

        return (0);
}
2558
2559 static int
2560 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
2561 {
2562         struct promotenode *snap;
2563
2564         *spacep = 0;
2565         for (snap = list_head(l); snap; snap = list_next(l, snap)) {
2566                 uint64_t used;
2567                 int err = bplist_space_birthrange(&snap->ds->ds_deadlist,
2568                     mintxg, UINT64_MAX, &used);
2569                 if (err)
2570                         return (err);
2571                 *spacep += used;
2572         }
2573         return (0);
2574 }
2575
2576 static void
2577 snaplist_destroy(list_t *l, boolean_t own)
2578 {
2579         struct promotenode *snap;
2580
2581         if (!list_link_active(&l->list_head))
2582                 return;
2583
2584         while ((snap = list_tail(l)) != NULL) {
2585                 list_remove(l, snap);
2586                 if (own)
2587                         dsl_dataset_disown(snap->ds, snaplist_tag);
2588                 else
2589                         dsl_dataset_rele(snap->ds, snaplist_tag);
2590                 kmem_free(snap, sizeof (struct promotenode));
2591         }
2592         list_destroy(l);
2593 }
2594
/*
 * Promote a clone.  Nomenclature note:
 * "clone" or "cds": the original clone which is being promoted
 * "origin" or "ods": the snapshot which is originally clone's origin
 * "origin head" or "ohds": the dataset which is the head
 * (filesystem/volume) for the origin
 * "origin origin": the origin of the origin's filesystem (typically
 * NULL, indicating that the clone is not a clone of a clone).
 *
 * Builds the snapshot lists consumed by the promote check/sync pair
 * (shared_snaps, clone_snaps, origin_snaps) and runs them as a single
 * sync task.  Returns 0 or an errno.
 */
int
dsl_dataset_promote(const char *name)
{
        dsl_dataset_t *ds;
        dsl_dir_t *dd;
        dsl_pool_t *dp;
        dmu_object_info_t doi;
        struct promotearg pa = { 0 };
        struct promotenode *snap;
        int err;

        err = dsl_dataset_hold(name, FTAG, &ds);
        if (err)
                return (err);
        dd = ds->ds_dir;
        dp = dd->dd_pool;

        /* snapnames zap size, used to size the sync task below */
        err = dmu_object_info(dp->dp_meta_objset,
            ds->ds_phys->ds_snapnames_zapobj, &doi);
        if (err) {
                dsl_dataset_rele(ds, FTAG);
                return (err);
        }

        /* only a head dataset that is actually a clone can be promoted */
        if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) {
                dsl_dataset_rele(ds, FTAG);
                return (EINVAL);
        }

        /*
         * We are going to inherit all the snapshots taken before our
         * origin (i.e., our new origin will be our parent's origin).
         * Take ownership of them so that we can rename them into our
         * namespace.
         */
        rw_enter(&dp->dp_config_rwlock, RW_READER);

        /* snapshots at or before our origin: owned, they will move to us */
        err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj,
            &pa.shared_snaps);
        if (err != 0)
                goto out;

        /* the clone head back to its origin (exclusive): held only */
        err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps);
        if (err != 0)
                goto out;

        /* the origin head's chain back to the shared origin: held only */
        snap = list_head(&pa.shared_snaps);
        ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
        err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj,
            snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps);
        if (err != 0)
                goto out;

        /* if the origin is itself a clone, own the origin's origin too */
        if (dsl_dir_is_clone(snap->ds->ds_dir)) {
                err = dsl_dataset_own_obj(dp,
                    snap->ds->ds_dir->dd_phys->dd_origin_obj,
                    0, FTAG, &pa.origin_origin);
                if (err != 0)
                        goto out;
        }

out:
        rw_exit(&dp->dp_config_rwlock);

        /*
         * Add in 128x the snapnames zapobj size, since we will be moving
         * a bunch of snapnames to the promoted ds, and dirtying their
         * bonus buffers.
         */
        if (err == 0) {
                err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
                    dsl_dataset_promote_sync, ds, &pa,
                    2 + 2 * doi.doi_physical_blks);
        }

        /* release everything whether or not the sync task ran or succeeded */
        snaplist_destroy(&pa.shared_snaps, B_TRUE);
        snaplist_destroy(&pa.clone_snaps, B_FALSE);
        snaplist_destroy(&pa.origin_snaps, B_FALSE);
        if (pa.origin_origin)
                dsl_dataset_disown(pa.origin_origin, FTAG);
        dsl_dataset_rele(ds, FTAG);
        return (err);
}
2687
/*
 * Argument block for the clone-swap check/sync functions; see
 * dsl_dataset_clone_swap().
 */
struct cloneswaparg {
        dsl_dataset_t *cds; /* clone dataset */
        dsl_dataset_t *ohds; /* origin's head dataset */
        boolean_t force; /* swap even if ohds changed since last snapshot */
        int64_t unused_refres_delta; /* change in unconsumed refreservation */
};
2694
/*
 * Sync-task check for dsl_dataset_clone_swap(): verify that cds really
 * is a clone hanging directly off ohds's most recent snapshot, and
 * compute the refreservation adjustment the swap will cause.
 */
/* ARGSUSED */
static int
dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
        struct cloneswaparg *csa = arg1;

        /* they should both be heads */
        if (dsl_dataset_is_snapshot(csa->cds) ||
            dsl_dataset_is_snapshot(csa->ohds))
                return (EINVAL);

        /* the branch point should be just before them */
        if (csa->cds->ds_prev != csa->ohds->ds_prev)
                return (EINVAL);

        /*
         * cds should be the clone.
         * NOTE(review): this dereferences ds_prev, so it assumes the
         * previous check only passes with ds_prev != NULL for real
         * clone/head pairs -- confirm against the online-recv caller.
         */
        if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj !=
            csa->ohds->ds_object)
                return (EINVAL);

        /* the clone should be a child of the origin */
        if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
                return (EINVAL);

        /* ohds shouldn't be modified unless 'force' */
        if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
                return (ETXTBSY);

        /*
         * adjust amount of any unconsumed refreservation: compare how
         * much of ohds's reservation is unconsumed before vs. after the
         * swap (after the swap, ohds will have cds's unique bytes).
         */
        csa->unused_refres_delta =
            (int64_t)MIN(csa->ohds->ds_reserved,
            csa->ohds->ds_phys->ds_unique_bytes) -
            (int64_t)MIN(csa->ohds->ds_reserved,
            csa->cds->ds_phys->ds_unique_bytes);

        /* an increase in unconsumed reservation must fit in the dir */
        if (csa->unused_refres_delta > 0 &&
            csa->unused_refres_delta >
            dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
                return (ENOSPC);

        return (0);
}
2737
/*
 * Sync task for dsl_dataset_clone_swap(): exchange the contents of the
 * clone (csa->cds) and the origin head (csa->ohds) by swapping their
 * block pointers, space totals, and deadlists, and by transferring the
 * matching space accounting between their dsl_dirs.  All preconditions
 * were validated by dsl_dataset_clone_swap_check().
 */
/* ARGSUSED */
static void
dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
        struct cloneswaparg *csa = arg1;
        dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;

        ASSERT(csa->cds->ds_reserved == 0);
        ASSERT(csa->cds->ds_quota == csa->ohds->ds_quota);

        dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
        dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
        dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx);

        /* evict cached user state on both datasets; contents are changing */
        if (csa->cds->ds_user_ptr != NULL) {
                csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr);
                csa->cds->ds_user_ptr = NULL;
        }

        if (csa->ohds->ds_user_ptr != NULL) {
                csa->ohds->ds_user_evict_func(csa->ohds,
                    csa->ohds->ds_user_ptr);
                csa->ohds->ds_user_ptr = NULL;
        }

        /* reset origin's unique bytes from the clone's deadlist */
        VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
            csa->cds->ds_prev->ds_phys->ds_prev_snap_txg, UINT64_MAX,
            &csa->cds->ds_prev->ds_phys->ds_unique_bytes));

        /* swap blkptrs */
        {
                blkptr_t tmp;
                tmp = csa->ohds->ds_phys->ds_bp;
                csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
                csa->cds->ds_phys->ds_bp = tmp;
        }

        /* set dd_*_bytes */
        {
                int64_t dused, dcomp, duncomp;
                uint64_t cdl_used, cdl_comp, cdl_uncomp;
                uint64_t odl_used, odl_comp, odl_uncomp;

                ASSERT3U(csa->cds->ds_dir->dd_phys->
                    dd_used_breakdown[DD_USED_SNAP], ==, 0);

                /* total space in each dataset's deadlist */
                VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used,
                    &cdl_comp, &cdl_uncomp));
                VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used,
                    &odl_comp, &odl_uncomp));

                /* net difference (clone minus origin head), deadlists included */
                dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
                    (csa->ohds->ds_phys->ds_used_bytes + odl_used);
                dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
                    (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
                duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
                    cdl_uncomp -
                    (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);

                dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD,
                    dused, dcomp, duncomp, tx);
                dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD,
                    -dused, -dcomp, -duncomp, tx);

                /*
                 * The difference in the space used by snapshots is the
                 * difference in snapshot space due to the head's
                 * deadlist (since that's the only thing that's
                 * changing that affects the snapused).
                 */
                VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
                    csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used));
                VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist,
                    csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used));
                dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
                    DD_USED_HEAD, DD_USED_SNAP, tx);
        }

#define SWITCH64(x, y) \
        { \
                uint64_t __tmp = (x); \
                (x) = (y); \
                (y) = __tmp; \
        }

        /* swap ds_*_bytes */
        SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
            csa->cds->ds_phys->ds_used_bytes);
        SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
            csa->cds->ds_phys->ds_compressed_bytes);
        SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
            csa->cds->ds_phys->ds_uncompressed_bytes);
        SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
            csa->cds->ds_phys->ds_unique_bytes);

        /* apply any parent delta for change in unconsumed refreservation */
        dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
            csa->unused_refres_delta, 0, 0, tx);

        /* swap deadlists: close both, swap the objects, reopen */
        bplist_close(&csa->cds->ds_deadlist);
        bplist_close(&csa->ohds->ds_deadlist);
        SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
            csa->cds->ds_phys->ds_deadlist_obj);
        VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
            csa->cds->ds_phys->ds_deadlist_obj));
        VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
            csa->ohds->ds_phys->ds_deadlist_obj));
}
2848
/*
 * Swap 'clone' with its origin head file system.  Used at the end
 * of "online recv" to swizzle the file system to the new version.
 * Both datasets must already be owned by the caller (asserted below).
 * Returns 0 or an errno from the clone-swap sync task.
 */
int
dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
    boolean_t force)
{
        struct cloneswaparg csa;
        int error;

        ASSERT(clone->ds_owner);
        ASSERT(origin_head->ds_owner);
retry:
        /*
         * Need exclusive access for the swap: take both ds_rwlocks as
         * writer.  If the second lock can't be taken immediately, back
         * off completely and retry in the other order, to avoid
         * deadlocking against anyone locking the pair the other way.
         */
        rw_enter(&clone->ds_rwlock, RW_WRITER);
        if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
                rw_exit(&clone->ds_rwlock);
                rw_enter(&origin_head->ds_rwlock, RW_WRITER);
                if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
                        rw_exit(&origin_head->ds_rwlock);
                        goto retry;
                }
        }
        csa.cds = clone;
        csa.ohds = origin_head;
        csa.force = force;
        error = dsl_sync_task_do(clone->ds_dir->dd_pool,
            dsl_dataset_clone_swap_check,
            dsl_dataset_clone_swap_sync, &csa, NULL, 9);
        return (error);
}
2881
2882 /*
2883  * Given a pool name and a dataset object number in that pool,
2884  * return the name of that dataset.
2885  */
2886 int
2887 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
2888 {
2889         spa_t *spa;
2890         dsl_pool_t *dp;
2891         dsl_dataset_t *ds;
2892         int error;
2893
2894         if ((error = spa_open(pname, &spa, FTAG)) != 0)
2895                 return (error);
2896         dp = spa_get_dsl(spa);
2897         rw_enter(&dp->dp_config_rwlock, RW_READER);
2898         if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) {
2899                 dsl_dataset_name(ds, buf);
2900                 dsl_dataset_rele(ds, FTAG);
2901         }
2902         rw_exit(&dp->dp_config_rwlock);
2903         spa_close(spa, FTAG);
2904
2905         return (error);
2906 }
2907
/*
 * Check whether a write of 'asize' bytes fits under this dataset's
 * refquota, given 'inflight' bytes already in flight.  On entry *used
 * is the caller's space estimate; it is reduced by the unconsumed
 * refreservation (ds_reserved beyond unique bytes).  *ref_rsrv returns
 * the portion of asize that will be satisfied from unconsumed
 * refreservation space.  Returns 0 if the write fits, ERESTART if the
 * caller should retry (pending changes may free space), or EDQUOT if
 * the quota is definitively exceeded.
 */
int
dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
    uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
{
        int error = 0;

        ASSERT3S(asize, >, 0);

        /*
         * *ref_rsrv is the portion of asize that will come from any
         * unconsumed refreservation space.
         */
        *ref_rsrv = 0;

        mutex_enter(&ds->ds_lock);
        /*
         * Make a space adjustment for reserved bytes.
         */
        if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
                ASSERT3U(*used, >=,
                    ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
                *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
                *ref_rsrv =
                    asize - MIN(asize, parent_delta(ds, asize + inflight));
        }

        if (!check_quota || ds->ds_quota == 0) {
                /* no quota enforcement requested, or none configured */
                mutex_exit(&ds->ds_lock);
                return (0);
        }
        /*
         * If they are requesting more space, and our current estimate
         * is over quota, they get to try again unless the actual
         * on-disk is over quota and there are no pending changes (which
         * may free up space for us).
         */
        if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) {
                if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota)
                        error = ERESTART;
                else
                        error = EDQUOT;
        }
        mutex_exit(&ds->ds_lock);

        return (error);
}
2954
2955 /* ARGSUSED */
2956 static int
2957 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
2958 {
2959         dsl_dataset_t *ds = arg1;
2960         uint64_t *quotap = arg2;
2961         uint64_t new_quota = *quotap;
2962
2963         if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
2964                 return (ENOTSUP);
2965
2966         if (new_quota == 0)
2967                 return (0);
2968
2969         if (new_quota < ds->ds_phys->ds_used_bytes ||
2970             new_quota < ds->ds_reserved)
2971                 return (ENOSPC);
2972
2973         return (0);
2974 }
2975
2976 /* ARGSUSED */
2977 void
2978 dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
2979 {
2980         dsl_dataset_t *ds = arg1;
2981         uint64_t *quotap = arg2;
2982         uint64_t new_quota = *quotap;
2983
2984         dmu_buf_will_dirty(ds->ds_dbuf, tx);
2985
2986         ds->ds_quota = new_quota;
2987
2988         dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx);
2989
2990         spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa,
2991             tx, cr, "%lld dataset = %llu ",
2992             (longlong_t)new_quota, ds->ds_object);
2993 }
2994
2995 int
2996 dsl_dataset_set_quota(const char *dsname, uint64_t quota)
2997 {
2998         dsl_dataset_t *ds;
2999         int err;
3000
3001         err = dsl_dataset_hold(dsname, FTAG, &ds);
3002         if (err)
3003                 return (err);
3004
3005         if (quota != ds->ds_quota) {
3006                 /*
3007                  * If someone removes a file, then tries to set the quota, we
3008                  * want to make sure the file freeing takes effect.
3009                  */
3010                 txg_wait_open(ds->ds_dir->dd_pool, 0);
3011
3012                 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3013                     dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
3014                     ds, &quota, 0);
3015         }
3016         dsl_dataset_rele(ds, FTAG);
3017         return (err);
3018 }
3019
/*
 * Sync-task check for setting "refreservation".  Fails with EOVERFLOW
 * if the value doesn't fit in an int64, ENOTSUP if the pool version
 * predates refreservation support, EINVAL on a snapshot, and ENOSPC if
 * the increase in reserved space doesn't fit in the parent dir or the
 * new reservation exceeds the refquota.
 */
static int
dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
        dsl_dataset_t *ds = arg1;
        uint64_t *reservationp = arg2;
        uint64_t new_reservation = *reservationp;
        int64_t delta;
        uint64_t unique;

        if (new_reservation > INT64_MAX)
                return (EOVERFLOW);

        if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
            SPA_VERSION_REFRESERVATION)
                return (ENOTSUP);

        /* refreservation only applies to a head dataset */
        if (dsl_dataset_is_snapshot(ds))
                return (EINVAL);

        /*
         * If we are doing the preliminary check in open context, the
         * space estimates may be inaccurate.
         */
        if (!dmu_tx_is_syncing(tx))
                return (0);

        mutex_enter(&ds->ds_lock);
        unique = dsl_dataset_unique(ds);
        /* extra space the new reservation would hold beyond the old one */
        delta = MAX(unique, new_reservation) - MAX(unique, ds->ds_reserved);
        mutex_exit(&ds->ds_lock);

        if (delta > 0 &&
            delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
                return (ENOSPC);
        if (delta > 0 && ds->ds_quota > 0 &&
            new_reservation > ds->ds_quota)
                return (ENOSPC);

        return (0);
}
3060
/*
 * Sync-task apply for setting "refreservation": update the in-core
 * reservation, charge the change in unconsumed reservation to the
 * parent dir's DD_USED_REFRSRV accounting, persist the property, and
 * log the change to the pool history.
 */
/* ARGSUSED */
static void
dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr,
    dmu_tx_t *tx)
{
        dsl_dataset_t *ds = arg1;
        uint64_t *reservationp = arg2;
        uint64_t new_reservation = *reservationp;
        uint64_t unique;
        int64_t delta;

        dmu_buf_will_dirty(ds->ds_dbuf, tx);

        /* compute the delta and apply the REFRSRV change under dd_lock */
        mutex_enter(&ds->ds_dir->dd_lock);
        mutex_enter(&ds->ds_lock);
        unique = dsl_dataset_unique(ds);
        /* change in the unconsumed (beyond-unique) part of the reservation */
        delta = MAX(0, (int64_t)(new_reservation - unique)) -
            MAX(0, (int64_t)(ds->ds_reserved - unique));
        ds->ds_reserved = new_reservation;
        mutex_exit(&ds->ds_lock);

        dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
        mutex_exit(&ds->ds_dir->dd_lock);
        dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation",
            new_reservation, cr, tx);

        spa_history_internal_log(LOG_DS_REFRESERV,
            ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu",
            (longlong_t)new_reservation, ds->ds_object);
}
3091
3092 int
3093 dsl_dataset_set_reservation(const char *dsname, uint64_t reservation)
3094 {
3095         dsl_dataset_t *ds;
3096         int err;
3097
3098         err = dsl_dataset_hold(dsname, FTAG, &ds);
3099         if (err)
3100                 return (err);
3101
3102         err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3103             dsl_dataset_set_reservation_check,
3104             dsl_dataset_set_reservation_sync, ds, &reservation, 0);
3105         dsl_dataset_rele(ds, FTAG);
3106         return (err);
3107 }