CyberLeo.Net >> Repos - FreeBSD/releng/7.2.git/blob - sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
Create releng/7.2 from stable/7 in preparation for 7.2-RELEASE.
[FreeBSD/releng/7.2.git] / sys / cddl / contrib / opensolaris / uts / common / fs / zfs / dsl_dataset.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25
26 #pragma ident   "%Z%%M% %I%     %E% SMI"
27
28 #include <sys/dmu_objset.h>
29 #include <sys/dsl_dataset.h>
30 #include <sys/dsl_dir.h>
31 #include <sys/dsl_prop.h>
32 #include <sys/dsl_synctask.h>
33 #include <sys/dmu_traverse.h>
34 #include <sys/dmu_tx.h>
35 #include <sys/arc.h>
36 #include <sys/zio.h>
37 #include <sys/zap.h>
38 #include <sys/unique.h>
39 #include <sys/zfs_context.h>
40 #include <sys/zfs_ioctl.h>
41
/*
 * Forward declarations of the check/sync callback pairs that this file
 * hands to dsl_sync_task_do() and dsl_sync_task_create().
 */
42 static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
43 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
44 static dsl_checkfunc_t dsl_dataset_rollback_check;
45 static dsl_syncfunc_t dsl_dataset_rollback_sync;
46 static dsl_checkfunc_t dsl_dataset_destroy_check;
47 static dsl_syncfunc_t dsl_dataset_destroy_sync;
48
/*
 * Weight of an EXCLUSIVE open, and the ceiling that the sum of all open
 * weights on one dataset (ds_open_refcount) may not exceed.
 */
49 #define DS_REF_MAX      (1ULL << 62)
50
/* Block size passed to bplist_create() when a dataset's deadlist is made. */
51 #define DSL_DEADLIST_BLOCKSIZE  SPA_MAXBLOCKSIZE
52
53 /*
54  * We use weighted reference counts to express the various forms of exclusion
55  * between different open modes.  A STANDARD open is 1 point, an EXCLUSIVE open
56  * is DS_REF_MAX, and a PRIMARY open is little more than half of an EXCLUSIVE.
57  * This makes the exclusion logic simple: the total refcnt for all opens cannot
58  * exceed DS_REF_MAX.  For example, EXCLUSIVE opens are exclusive because their
59  * weight (DS_REF_MAX) consumes the entire refcnt space.  PRIMARY opens consume
60  * just over half of the refcnt space, so there can't be more than one, but it
61  * can peacefully coexist with any number of STANDARD opens.
62  */
63 static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = {
64         0,                      /* DS_MODE_NONE - invalid               */
65         1,                      /* DS_MODE_STANDARD - unlimited number  */
66         (DS_REF_MAX >> 1) + 1,  /* DS_MODE_PRIMARY - only one of these  */
67         DS_REF_MAX              /* DS_MODE_EXCLUSIVE - no other opens   */
68 };
69
70
71 void
72 dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
73 {
74         int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
75         int compressed = BP_GET_PSIZE(bp);
76         int uncompressed = BP_GET_UCSIZE(bp);
77
78         dprintf_bp(bp, "born, ds=%p\n", ds);
79
80         ASSERT(dmu_tx_is_syncing(tx));
81         /* It could have been compressed away to nothing */
82         if (BP_IS_HOLE(bp))
83                 return;
84         ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
85         ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
86         if (ds == NULL) {
87                 /*
88                  * Account for the meta-objset space in its placeholder
89                  * dsl_dir.
90                  */
91                 ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
92                 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
93                     used, compressed, uncompressed, tx);
94                 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
95                 return;
96         }
97         dmu_buf_will_dirty(ds->ds_dbuf, tx);
98         mutex_enter(&ds->ds_lock);
99         ds->ds_phys->ds_used_bytes += used;
100         ds->ds_phys->ds_compressed_bytes += compressed;
101         ds->ds_phys->ds_uncompressed_bytes += uncompressed;
102         ds->ds_phys->ds_unique_bytes += used;
103         mutex_exit(&ds->ds_lock);
104         dsl_dir_diduse_space(ds->ds_dir,
105             used, compressed, uncompressed, tx);
106 }
107
108 void
109 dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
110     dmu_tx_t *tx)
111 {
112         int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
113         int compressed = BP_GET_PSIZE(bp);
114         int uncompressed = BP_GET_UCSIZE(bp);
115
116         ASSERT(dmu_tx_is_syncing(tx));
117         /* No block pointer => nothing to free */
118         if (BP_IS_HOLE(bp))
119                 return;
120
121         ASSERT(used > 0);
122         if (ds == NULL) {
123                 int err;
124                 /*
125                  * Account for the meta-objset space in its placeholder
126                  * dataset.
127                  */
128                 err = arc_free(pio, tx->tx_pool->dp_spa,
129                     tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
130                 ASSERT(err == 0);
131
132                 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
133                     -used, -compressed, -uncompressed, tx);
134                 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
135                 return;
136         }
137         ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
138
139         dmu_buf_will_dirty(ds->ds_dbuf, tx);
140
141         if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
142                 int err;
143
144                 dprintf_bp(bp, "freeing: %s", "");
145                 err = arc_free(pio, tx->tx_pool->dp_spa,
146                     tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
147                 ASSERT(err == 0);
148
149                 mutex_enter(&ds->ds_lock);
150                 /* XXX unique_bytes is not accurate for head datasets */
151                 /* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */
152                 ds->ds_phys->ds_unique_bytes -= used;
153                 mutex_exit(&ds->ds_lock);
154                 dsl_dir_diduse_space(ds->ds_dir,
155                     -used, -compressed, -uncompressed, tx);
156         } else {
157                 dprintf_bp(bp, "putting on dead list: %s", "");
158                 VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
159                 /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
160                 if (ds->ds_phys->ds_prev_snap_obj != 0) {
161                         ASSERT3U(ds->ds_prev->ds_object, ==,
162                             ds->ds_phys->ds_prev_snap_obj);
163                         ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
164                         if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
165                             ds->ds_object && bp->blk_birth >
166                             ds->ds_prev->ds_phys->ds_prev_snap_txg) {
167                                 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
168                                 mutex_enter(&ds->ds_prev->ds_lock);
169                                 ds->ds_prev->ds_phys->ds_unique_bytes +=
170                                     used;
171                                 mutex_exit(&ds->ds_prev->ds_lock);
172                         }
173                 }
174         }
175         mutex_enter(&ds->ds_lock);
176         ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
177         ds->ds_phys->ds_used_bytes -= used;
178         ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
179         ds->ds_phys->ds_compressed_bytes -= compressed;
180         ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
181         ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
182         mutex_exit(&ds->ds_lock);
183 }
184
185 uint64_t
186 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
187 {
188         uint64_t trysnap = 0;
189
190         if (ds == NULL)
191                 return (0);
192         /*
193          * The snapshot creation could fail, but that would cause an
194          * incorrect FALSE return, which would only result in an
195          * overestimation of the amount of space that an operation would
196          * consume, which is OK.
197          *
198          * There's also a small window where we could miss a pending
199          * snapshot, because we could set the sync task in the quiescing
200          * phase.  So this should only be used as a guess.
201          */
202         if (ds->ds_trysnap_txg >
203             spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
204                 trysnap = ds->ds_trysnap_txg;
205         return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
206 }
207
208 int
209 dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
210 {
211         return (blk_birth > dsl_dataset_prev_snap_txg(ds));
212 }
213
214 /* ARGSUSED */
215 static void
216 dsl_dataset_evict(dmu_buf_t *db, void *dsv)
217 {
218         dsl_dataset_t *ds = dsv;
219         dsl_pool_t *dp = ds->ds_dir->dd_pool;
220
221         /* open_refcount == DS_REF_MAX when deleting */
222         ASSERT(ds->ds_open_refcount == 0 ||
223             ds->ds_open_refcount == DS_REF_MAX);
224
225         dprintf_ds(ds, "evicting %s\n", "");
226
227         unique_remove(ds->ds_phys->ds_fsid_guid);
228
229         if (ds->ds_user_ptr != NULL)
230                 ds->ds_user_evict_func(ds, ds->ds_user_ptr);
231
232         if (ds->ds_prev) {
233                 dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
234                 ds->ds_prev = NULL;
235         }
236
237         bplist_close(&ds->ds_deadlist);
238         dsl_dir_close(ds->ds_dir, ds);
239
240         if (list_link_active(&ds->ds_synced_link))
241                 list_remove(&dp->dp_synced_objsets, ds);
242
243         mutex_destroy(&ds->ds_lock);
244         mutex_destroy(&ds->ds_deadlist.bpl_lock);
245
246         kmem_free(ds, sizeof (dsl_dataset_t));
247 }
248
249 static int
250 dsl_dataset_get_snapname(dsl_dataset_t *ds)
251 {
252         dsl_dataset_phys_t *headphys;
253         int err;
254         dmu_buf_t *headdbuf;
255         dsl_pool_t *dp = ds->ds_dir->dd_pool;
256         objset_t *mos = dp->dp_meta_objset;
257
258         if (ds->ds_snapname[0])
259                 return (0);
260         if (ds->ds_phys->ds_next_snap_obj == 0)
261                 return (0);
262
263         err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
264             FTAG, &headdbuf);
265         if (err)
266                 return (err);
267         headphys = headdbuf->db_data;
268         err = zap_value_search(dp->dp_meta_objset,
269             headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname);
270         dmu_buf_rele(headdbuf, FTAG);
271         return (err);
272 }
273
/*
 * Open the dataset whose on-disk state is the bonus buffer of MOS object
 * 'dsobj'.
 *
 * A hold on the bonus buffer is taken with 'tag'.  If no in-core
 * dsl_dataset_t is attached to the buffer yet, one is built: the deadlist
 * and the owning dsl_dir are opened, a head dataset also opens its
 * previous snapshot (if it has one), and a non-head dataset records its
 * snapshot name (from 'snapname', or by lookup when ZFS_DEBUG_SNAPNAMES is
 * set).  Finally the weight of 'mode' is added to ds_open_refcount.
 *
 * The caller must hold dp_config_rwlock or be in pool sync context.
 *
 * Returns 0 and stores the dataset in *dsp on success.  On failure returns
 * the error of the failing call, or EBUSY when the open would push the
 * refcount above DS_REF_MAX or when a PRIMARY open of a dataset flagged
 * DS_FLAG_INCONSISTENT does not pass the inconsistent mode flag; *dsp is
 * not written in that case.
 */
274 int
275 dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
276     int mode, void *tag, dsl_dataset_t **dsp)
277 {
278         uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
279         objset_t *mos = dp->dp_meta_objset;
280         dmu_buf_t *dbuf;
281         dsl_dataset_t *ds;
282         int err;
283
284         ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
285             dsl_pool_sync_context(dp));
286
287         err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
288         if (err)
289                 return (err);
            /* Reuse the in-core dataset already attached to the buffer. */
290         ds = dmu_buf_get_user(dbuf);
291         if (ds == NULL) {
                    /*
                     * Only assigned when err == 0 below; the "err || winner"
                     * test relies on short-circuit evaluation.
                     */
292                 dsl_dataset_t *winner;
293
294                 ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
295                 ds->ds_dbuf = dbuf;
296                 ds->ds_object = dsobj;
297                 ds->ds_phys = dbuf->db_data;
298
299                 mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
300                 mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
301                     NULL);
302
303                 err = bplist_open(&ds->ds_deadlist,
304                     mos, ds->ds_phys->ds_deadlist_obj);
305                 if (err == 0) {
306                         err = dsl_dir_open_obj(dp,
307                             ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
308                 }
309                 if (err) {
310                         /*
311                          * we don't really need to close the bplist if we
312                          * just opened it.
313                          */
314                         mutex_destroy(&ds->ds_lock);
315                         mutex_destroy(&ds->ds_deadlist.bpl_lock);
316                         kmem_free(ds, sizeof (dsl_dataset_t));
317                         dmu_buf_rele(dbuf, tag);
318                         return (err);
319                 }
320
                    /* Head dataset of its dsl_dir: no snapshot name. */
321                 if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
322                         ds->ds_snapname[0] = '\0';
323                         if (ds->ds_phys->ds_prev_snap_obj) {
324                                 err = dsl_dataset_open_obj(dp,
325                                     ds->ds_phys->ds_prev_snap_obj, NULL,
326                                     DS_MODE_NONE, ds, &ds->ds_prev);
327                         }
328                 } else {
329                         if (snapname) {
330 #ifdef ZFS_DEBUG
                                    /*
                                     * Debug-only cross-check that 'snapname'
                                     * maps to 'dsobj' in the head dataset's
                                     * snapshot-names ZAP.
                                     * NOTE(review): foundobj is asserted
                                     * even when zap_lookup() failed, in
                                     * which case it looks uninitialized.
                                     */
331                                 dsl_dataset_phys_t *headphys;
332                                 dmu_buf_t *headdbuf;
333                                 err = dmu_bonus_hold(mos,
334                                     ds->ds_dir->dd_phys->dd_head_dataset_obj,
335                                     FTAG, &headdbuf);
336                                 if (err == 0) {
337                                         headphys = headdbuf->db_data;
338                                         uint64_t foundobj;
339                                         err = zap_lookup(dp->dp_meta_objset,
340                                             headphys->ds_snapnames_zapobj,
341                                             snapname, sizeof (foundobj), 1,
342                                             &foundobj);
343                                         ASSERT3U(foundobj, ==, dsobj);
344                                         dmu_buf_rele(headdbuf, FTAG);
345                                 }
346 #endif
                                    /*
                                     * ds was zero-allocated, so ds_snapname
                                     * is empty and this acts as a copy.
                                     * NOTE(review): no length bound is
                                     * applied here.
                                     */
347                                 (void) strcat(ds->ds_snapname, snapname);
348                         } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
349                                 err = dsl_dataset_get_snapname(ds);
350                         }
351                 }
352
                    /*
                     * Attach our dataset to the buffer.  Presumably a
                     * non-NULL return is a dataset that another thread
                     * attached first (hence "winner") -- TODO confirm
                     * against dmu_buf_set_user_ie().
                     */
353                 if (err == 0) {
354                         winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
355                             dsl_dataset_evict);
356                 }
                    /* On error or a lost race, tear down what we built. */
357                 if (err || winner) {
358                         bplist_close(&ds->ds_deadlist);
359                         if (ds->ds_prev) {
360                                 dsl_dataset_close(ds->ds_prev,
361                                     DS_MODE_NONE, ds);
362                         }
363                         dsl_dir_close(ds->ds_dir, ds);
364                         mutex_destroy(&ds->ds_lock);
365                         mutex_destroy(&ds->ds_deadlist.bpl_lock);
366                         kmem_free(ds, sizeof (dsl_dataset_t));
367                         if (err) {
368                                 dmu_buf_rele(dbuf, tag);
369                                 return (err);
370                         }
371                         ds = winner;
372                 } else {
                            /*
                             * Our dataset is now the attached one; register
                             * its fsid guid, adopting the value handed back
                             * by unique_insert() if it differs.
                             */
373                         uint64_t new =
374                             unique_insert(ds->ds_phys->ds_fsid_guid);
375                         if (new != ds->ds_phys->ds_fsid_guid) {
376                                 /* XXX it won't necessarily be synced... */
377                                 ds->ds_phys->ds_fsid_guid = new;
378                         }
379                 }
380         }
381         ASSERT3P(ds->ds_dbuf, ==, dbuf);
382         ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
383
            /*
             * Refuse the open if it would exceed the refcount space, or if
             * it is a PRIMARY open of an inconsistent dataset without the
             * inconsistent mode flag.
             */
384         mutex_enter(&ds->ds_lock);
385         if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY &&
386             (ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) &&
387             !DS_MODE_IS_INCONSISTENT(mode)) ||
388             (ds->ds_open_refcount + weight > DS_REF_MAX)) {
389                 mutex_exit(&ds->ds_lock);
                    /*
                     * No weight was added yet, so close with DS_MODE_NONE
                     * (weight 0); this only drops the buffer hold.
                     */
390                 dsl_dataset_close(ds, DS_MODE_NONE, tag);
391                 return (EBUSY);
392         }
393         ds->ds_open_refcount += weight;
394         mutex_exit(&ds->ds_lock);
395
396         *dsp = ds;
397         return (0);
398 }
399
400 int
401 dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
402     void *tag, dsl_dataset_t **dsp)
403 {
404         dsl_dir_t *dd;
405         dsl_pool_t *dp;
406         const char *tail;
407         uint64_t obj;
408         dsl_dataset_t *ds = NULL;
409         int err = 0;
410
411         err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail);
412         if (err)
413                 return (err);
414
415         dp = dd->dd_pool;
416         obj = dd->dd_phys->dd_head_dataset_obj;
417         rw_enter(&dp->dp_config_rwlock, RW_READER);
418         if (obj == 0) {
419                 /* A dataset with no associated objset */
420                 err = ENOENT;
421                 goto out;
422         }
423
424         if (tail != NULL) {
425                 objset_t *mos = dp->dp_meta_objset;
426
427                 err = dsl_dataset_open_obj(dp, obj, NULL,
428                     DS_MODE_NONE, tag, &ds);
429                 if (err)
430                         goto out;
431                 obj = ds->ds_phys->ds_snapnames_zapobj;
432                 dsl_dataset_close(ds, DS_MODE_NONE, tag);
433                 ds = NULL;
434
435                 if (tail[0] != '@') {
436                         err = ENOENT;
437                         goto out;
438                 }
439                 tail++;
440
441                 /* Look for a snapshot */
442                 if (!DS_MODE_IS_READONLY(mode)) {
443                         err = EROFS;
444                         goto out;
445                 }
446                 dprintf("looking for snapshot '%s'\n", tail);
447                 err = zap_lookup(mos, obj, tail, 8, 1, &obj);
448                 if (err)
449                         goto out;
450         }
451         err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds);
452
453 out:
454         rw_exit(&dp->dp_config_rwlock);
455         dsl_dir_close(dd, FTAG);
456
457         ASSERT3U((err == 0), ==, (ds != NULL));
458         /* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */
459
460         *dsp = ds;
461         return (err);
462 }
463
464 int
465 dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp)
466 {
467         return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp));
468 }
469
470 void
471 dsl_dataset_name(dsl_dataset_t *ds, char *name)
472 {
473         if (ds == NULL) {
474                 (void) strcpy(name, "mos");
475         } else {
476                 dsl_dir_name(ds->ds_dir, name);
477                 VERIFY(0 == dsl_dataset_get_snapname(ds));
478                 if (ds->ds_snapname[0]) {
479                         (void) strcat(name, "@");
480                         if (!MUTEX_HELD(&ds->ds_lock)) {
481                                 /*
482                                  * We use a "recursive" mutex so that we
483                                  * can call dprintf_ds() with ds_lock held.
484                                  */
485                                 mutex_enter(&ds->ds_lock);
486                                 (void) strcat(name, ds->ds_snapname);
487                                 mutex_exit(&ds->ds_lock);
488                         } else {
489                                 (void) strcat(name, ds->ds_snapname);
490                         }
491                 }
492         }
493 }
494
495 static int
496 dsl_dataset_namelen(dsl_dataset_t *ds)
497 {
498         int result;
499
500         if (ds == NULL) {
501                 result = 3;     /* "mos" */
502         } else {
503                 result = dsl_dir_namelen(ds->ds_dir);
504                 VERIFY(0 == dsl_dataset_get_snapname(ds));
505                 if (ds->ds_snapname[0]) {
506                         ++result;       /* adding one for the @-sign */
507                         if (!MUTEX_HELD(&ds->ds_lock)) {
508                                 /* see dsl_datset_name */
509                                 mutex_enter(&ds->ds_lock);
510                                 result += strlen(ds->ds_snapname);
511                                 mutex_exit(&ds->ds_lock);
512                         } else {
513                                 result += strlen(ds->ds_snapname);
514                         }
515                 }
516         }
517
518         return (result);
519 }
520
521 void
522 dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag)
523 {
524         uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
525         mutex_enter(&ds->ds_lock);
526         ASSERT3U(ds->ds_open_refcount, >=, weight);
527         ds->ds_open_refcount -= weight;
528         dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n",
529             mode, ds->ds_open_refcount);
530         mutex_exit(&ds->ds_lock);
531
532         dmu_buf_rele(ds->ds_dbuf, tag);
533 }
534
535 void
536 dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
537 {
538         objset_t *mos = dp->dp_meta_objset;
539         dmu_buf_t *dbuf;
540         dsl_dataset_phys_t *dsphys;
541         dsl_dataset_t *ds;
542         uint64_t dsobj;
543         dsl_dir_t *dd;
544
545         dsl_dir_create_root(mos, ddobjp, tx);
546         VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd));
547
548         dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
549             DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
550         VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
551         dmu_buf_will_dirty(dbuf, tx);
552         dsphys = dbuf->db_data;
553         dsphys->ds_dir_obj = dd->dd_object;
554         dsphys->ds_fsid_guid = unique_create();
555         unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
556         (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
557             sizeof (dsphys->ds_guid));
558         dsphys->ds_snapnames_zapobj =
559             zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
560         dsphys->ds_creation_time = gethrestime_sec();
561         dsphys->ds_creation_txg = tx->tx_txg;
562         dsphys->ds_deadlist_obj =
563             bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
564         dmu_buf_rele(dbuf, FTAG);
565
566         dmu_buf_will_dirty(dd->dd_dbuf, tx);
567         dd->dd_phys->dd_head_dataset_obj = dsobj;
568         dsl_dir_close(dd, FTAG);
569
570         VERIFY(0 ==
571             dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds));
572         (void) dmu_objset_create_impl(dp->dp_spa, ds,
573             &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx);
574         dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
575 }
576
577 uint64_t
578 dsl_dataset_create_sync(dsl_dir_t *pdd,
579     const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx)
580 {
581         dsl_pool_t *dp = pdd->dd_pool;
582         dmu_buf_t *dbuf;
583         dsl_dataset_phys_t *dsphys;
584         uint64_t dsobj, ddobj;
585         objset_t *mos = dp->dp_meta_objset;
586         dsl_dir_t *dd;
587
588         ASSERT(clone_parent == NULL || clone_parent->ds_dir->dd_pool == dp);
589         ASSERT(clone_parent == NULL ||
590             clone_parent->ds_phys->ds_num_children > 0);
591         ASSERT(lastname[0] != '@');
592         ASSERT(dmu_tx_is_syncing(tx));
593
594         ddobj = dsl_dir_create_sync(pdd, lastname, tx);
595         VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
596
597         dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
598             DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
599         VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
600         dmu_buf_will_dirty(dbuf, tx);
601         dsphys = dbuf->db_data;
602         dsphys->ds_dir_obj = dd->dd_object;
603         dsphys->ds_fsid_guid = unique_create();
604         unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
605         (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
606             sizeof (dsphys->ds_guid));
607         dsphys->ds_snapnames_zapobj =
608             zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
609         dsphys->ds_creation_time = gethrestime_sec();
610         dsphys->ds_creation_txg = tx->tx_txg;
611         dsphys->ds_deadlist_obj =
612             bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
613         if (clone_parent) {
614                 dsphys->ds_prev_snap_obj = clone_parent->ds_object;
615                 dsphys->ds_prev_snap_txg =
616                     clone_parent->ds_phys->ds_creation_txg;
617                 dsphys->ds_used_bytes =
618                     clone_parent->ds_phys->ds_used_bytes;
619                 dsphys->ds_compressed_bytes =
620                     clone_parent->ds_phys->ds_compressed_bytes;
621                 dsphys->ds_uncompressed_bytes =
622                     clone_parent->ds_phys->ds_uncompressed_bytes;
623                 dsphys->ds_bp = clone_parent->ds_phys->ds_bp;
624
625                 dmu_buf_will_dirty(clone_parent->ds_dbuf, tx);
626                 clone_parent->ds_phys->ds_num_children++;
627
628                 dmu_buf_will_dirty(dd->dd_dbuf, tx);
629                 dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object;
630         }
631         dmu_buf_rele(dbuf, FTAG);
632
633         dmu_buf_will_dirty(dd->dd_dbuf, tx);
634         dd->dd_phys->dd_head_dataset_obj = dsobj;
635         dsl_dir_close(dd, FTAG);
636
637         return (dsobj);
638 }
639
/*
 * Argument bundle passed through dmu_objset_find() to
 * dsl_snapshot_destroy_one() by dsl_snapshots_destroy().
 */
640 struct destroyarg {
            /* task group collecting one destroy task per snapshot; also */
            /* used as the tag when the snapshots are opened */
641         dsl_sync_task_group_t *dstg;
            /* snapshot name (without '@') to destroy in each filesystem */
642         char *snapname;
            /* buffer that receives the name of a filesystem whose */
            /* snapshot could not be opened */
643         char *failed;
644 };
645
646 static int
647 dsl_snapshot_destroy_one(char *name, void *arg)
648 {
649         struct destroyarg *da = arg;
650         dsl_dataset_t *ds;
651         char *cp;
652         int err;
653
654         (void) strcat(name, "@");
655         (void) strcat(name, da->snapname);
656         err = dsl_dataset_open(name,
657             DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
658             da->dstg, &ds);
659         cp = strchr(name, '@');
660         *cp = '\0';
661         if (err == ENOENT)
662                 return (0);
663         if (err) {
664                 (void) strcpy(da->failed, name);
665                 return (err);
666         }
667
668         dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
669             dsl_dataset_destroy_sync, ds, da->dstg, 0);
670         return (0);
671 }
672
673 /*
674  * Destroy 'snapname' in all descendants of 'fsname'.
675  */
676 #pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
677 int
678 dsl_snapshots_destroy(char *fsname, char *snapname)
679 {
680         int err;
681         struct destroyarg da;
682         dsl_sync_task_t *dst;
683         spa_t *spa;
684         char *cp;
685
686         cp = strchr(fsname, '/');
687         if (cp) {
688                 *cp = '\0';
689                 err = spa_open(fsname, &spa, FTAG);
690                 *cp = '/';
691         } else {
692                 err = spa_open(fsname, &spa, FTAG);
693         }
694         if (err)
695                 return (err);
696         da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
697         da.snapname = snapname;
698         da.failed = fsname;
699
700         err = dmu_objset_find(fsname,
701             dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);
702
703         if (err == 0)
704                 err = dsl_sync_task_group_wait(da.dstg);
705
706         for (dst = list_head(&da.dstg->dstg_tasks); dst;
707             dst = list_next(&da.dstg->dstg_tasks, dst)) {
708                 dsl_dataset_t *ds = dst->dst_arg1;
709                 if (dst->dst_err) {
710                         dsl_dataset_name(ds, fsname);
711                         cp = strchr(fsname, '@');
712                         *cp = '\0';
713                 }
714                 /*
715                  * If it was successful, destroy_sync would have
716                  * closed the ds
717                  */
718                 if (err)
719                         dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, da.dstg);
720         }
721
722         dsl_sync_task_group_destroy(da.dstg);
723         spa_close(spa, FTAG);
724         return (err);
725 }
726
727 int
728 dsl_dataset_destroy(const char *name)
729 {
730         int err;
731         dsl_sync_task_group_t *dstg;
732         objset_t *os;
733         dsl_dataset_t *ds;
734         dsl_dir_t *dd;
735         uint64_t obj;
736
737         if (strchr(name, '@')) {
738                 /* Destroying a snapshot is simpler */
739                 err = dsl_dataset_open(name,
740                     DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
741                     FTAG, &ds);
742                 if (err)
743                         return (err);
744                 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
745                     dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
746                     ds, FTAG, 0);
747                 if (err)
748                         dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
749                 return (err);
750         }
751
752         err = dmu_objset_open(name, DMU_OST_ANY,
753             DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os);
754         if (err)
755                 return (err);
756         ds = os->os->os_dsl_dataset;
757         dd = ds->ds_dir;
758
759         /*
760          * Check for errors and mark this ds as inconsistent, in
761          * case we crash while freeing the objects.
762          */
763         err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
764             dsl_dataset_destroy_begin_sync, ds, NULL, 0);
765         if (err) {
766                 dmu_objset_close(os);
767                 return (err);
768         }
769
770         /*
771          * remove the objects in open context, so that we won't
772          * have too much to do in syncing context.
773          */
774         for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
775             ds->ds_phys->ds_prev_snap_txg)) {
776                 dmu_tx_t *tx = dmu_tx_create(os);
777                 dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END);
778                 dmu_tx_hold_bonus(tx, obj);
779                 err = dmu_tx_assign(tx, TXG_WAIT);
780                 if (err) {
781                         /*
782                          * Perhaps there is not enough disk
783                          * space.  Just deal with it from
784                          * dsl_dataset_destroy_sync().
785                          */
786                         dmu_tx_abort(tx);
787                         continue;
788                 }
789                 VERIFY(0 == dmu_object_free(os, obj, tx));
790                 dmu_tx_commit(tx);
791         }
792         /* Make sure it's not dirty before we finish destroying it. */
793         txg_wait_synced(dd->dd_pool, 0);
794
795         dmu_objset_close(os);
796         if (err != ESRCH)
797                 return (err);
798
799         err = dsl_dataset_open(name,
800             DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
801             FTAG, &ds);
802         if (err)
803                 return (err);
804
805         err = dsl_dir_open(name, FTAG, &dd, NULL);
806         if (err) {
807                 dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
808                 return (err);
809         }
810
811         /*
812          * Blow away the dsl_dir + head dataset.
813          */
814         dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
815         dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
816             dsl_dataset_destroy_sync, ds, FTAG, 0);
817         dsl_sync_task_create(dstg, dsl_dir_destroy_check,
818             dsl_dir_destroy_sync, dd, FTAG, 0);
819         err = dsl_sync_task_group_wait(dstg);
820         dsl_sync_task_group_destroy(dstg);
821         /* if it is successful, *destroy_sync will close the ds+dd */
822         if (err) {
823                 dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
824                 dsl_dir_close(dd, FTAG);
825         }
826         return (err);
827 }
828
829 int
830 dsl_dataset_rollback(dsl_dataset_t *ds)
831 {
832         ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
833         return (dsl_sync_task_do(ds->ds_dir->dd_pool,
834             dsl_dataset_rollback_check, dsl_dataset_rollback_sync,
835             ds, NULL, 0));
836 }
837
838 void *
839 dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
840     void *p, dsl_dataset_evict_func_t func)
841 {
842         void *old;
843
844         mutex_enter(&ds->ds_lock);
845         old = ds->ds_user_ptr;
846         if (old == NULL) {
847                 ds->ds_user_ptr = p;
848                 ds->ds_user_evict_func = func;
849         }
850         mutex_exit(&ds->ds_lock);
851         return (old);
852 }
853
854 void *
855 dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
856 {
857         return (ds->ds_user_ptr);
858 }
859
860
861 blkptr_t *
862 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
863 {
864         return (&ds->ds_phys->ds_bp);
865 }
866
867 void
868 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
869 {
870         ASSERT(dmu_tx_is_syncing(tx));
871         /* If it's the meta-objset, set dp_meta_rootbp */
872         if (ds == NULL) {
873                 tx->tx_pool->dp_meta_rootbp = *bp;
874         } else {
875                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
876                 ds->ds_phys->ds_bp = *bp;
877         }
878 }
879
880 spa_t *
881 dsl_dataset_get_spa(dsl_dataset_t *ds)
882 {
883         return (ds->ds_dir->dd_pool->dp_spa);
884 }
885
886 void
887 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
888 {
889         dsl_pool_t *dp;
890
891         if (ds == NULL) /* this is the meta-objset */
892                 return;
893
894         ASSERT(ds->ds_user_ptr != NULL);
895
896         if (ds->ds_phys->ds_next_snap_obj != 0)
897                 panic("dirtying snapshot!");
898
899         dp = ds->ds_dir->dd_pool;
900
901         if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
902                 /* up the hold count until we can be written out */
903                 dmu_buf_add_ref(ds->ds_dbuf, ds);
904         }
905 }
906
/*
 * Argument block handed to kill_blkptr() through its opaque `arg'
 * parameter: pointers to the caller's running space totals, plus the
 * zio and transaction used when each block is freed.
 */
907 struct killarg {
908         uint64_t *usedp;                /* += bp_get_dasize() per block */
909         uint64_t *compressedp;          /* += BP_GET_PSIZE() per block */
910         uint64_t *uncompressedp;        /* += BP_GET_UCSIZE() per block */
911         zio_t *zio;                     /* zio passed to arc_free() */
912         dmu_tx_t *tx;                   /* supplies tx_txg for arc_free() */
913 };
914
915 static int
916 kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
917 {
918         struct killarg *ka = arg;
919         blkptr_t *bp = &bc->bc_blkptr;
920
921         ASSERT3U(bc->bc_errno, ==, 0);
922
923         /*
924          * Since this callback is not called concurrently, no lock is
925          * needed on the accounting values.
926          */
927         *ka->usedp += bp_get_dasize(spa, bp);
928         *ka->compressedp += BP_GET_PSIZE(bp);
929         *ka->uncompressedp += BP_GET_UCSIZE(bp);
930         /* XXX check for EIO? */
931         (void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL,
932             ARC_NOWAIT);
933         return (0);
934 }
935
936 /* ARGSUSED */
937 static int
938 dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
939 {
940         dsl_dataset_t *ds = arg1;
941
942         /*
943          * There must be a previous snapshot.  I suppose we could roll
944          * it back to being empty (and re-initialize the upper (ZPL)
945          * layer).  But for now there's no way to do this via the user
946          * interface.
947          */
948         if (ds->ds_phys->ds_prev_snap_txg == 0)
949                 return (EINVAL);
950
951         /*
952          * This must not be a snapshot.
953          */
954         if (ds->ds_phys->ds_next_snap_obj != 0)
955                 return (EINVAL);
956
957         /*
958          * If we made changes this txg, traverse_dsl_dataset won't find
959          * them.  Try again.
960          */
961         if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
962                 return (EAGAIN);
963
964         return (0);
965 }
966
967 /* ARGSUSED */
968 static void
969 dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx)
970 {
971         dsl_dataset_t *ds = arg1;
972         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
973
974         dmu_buf_will_dirty(ds->ds_dbuf, tx);
975
976         /* Zero out the deadlist. */
977         bplist_close(&ds->ds_deadlist);
978         bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
979         ds->ds_phys->ds_deadlist_obj =
980             bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
981         VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
982             ds->ds_phys->ds_deadlist_obj));
983
984         {
985                 /* Free blkptrs that we gave birth to */
986                 zio_t *zio;
987                 uint64_t used = 0, compressed = 0, uncompressed = 0;
988                 struct killarg ka;
989
990                 zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
991                     ZIO_FLAG_MUSTSUCCEED);
992                 ka.usedp = &used;
993                 ka.compressedp = &compressed;
994                 ka.uncompressedp = &uncompressed;
995                 ka.zio = zio;
996                 ka.tx = tx;
997                 (void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
998                     ADVANCE_POST, kill_blkptr, &ka);
999                 (void) zio_wait(zio);
1000
1001                 dsl_dir_diduse_space(ds->ds_dir,
1002                     -used, -compressed, -uncompressed, tx);
1003         }
1004
1005         /* Change our contents to that of the prev snapshot */
1006         ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj);
1007         ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
1008         ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes;
1009         ds->ds_phys->ds_compressed_bytes =
1010             ds->ds_prev->ds_phys->ds_compressed_bytes;
1011         ds->ds_phys->ds_uncompressed_bytes =
1012             ds->ds_prev->ds_phys->ds_uncompressed_bytes;
1013         ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
1014         ds->ds_phys->ds_unique_bytes = 0;
1015
1016         if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
1017                 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
1018                 ds->ds_prev->ds_phys->ds_unique_bytes = 0;
1019         }
1020 }
1021
1022 /* ARGSUSED */
1023 static int
1024 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
1025 {
1026         dsl_dataset_t *ds = arg1;
1027
1028         /*
1029          * Can't delete a head dataset if there are snapshots of it.
1030          * (Except if the only snapshots are from the branch we cloned
1031          * from.)
1032          */
1033         if (ds->ds_prev != NULL &&
1034             ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
1035                 return (EINVAL);
1036
1037         return (0);
1038 }
1039
1040 /* ARGSUSED */
1041 static void
1042 dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1043 {
1044         dsl_dataset_t *ds = arg1;
1045
1046         /* Mark it as inconsistent on-disk, in case we crash */
1047         dmu_buf_will_dirty(ds->ds_dbuf, tx);
1048         ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
1049 }
1050
1051 /* ARGSUSED */
1052 static int
1053 dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
1054 {
1055         dsl_dataset_t *ds = arg1;
1056
1057         /* Can't delete a branch point. */
1058         if (ds->ds_phys->ds_num_children > 1)
1059                 return (EEXIST);
1060
1061         /*
1062          * Can't delete a head dataset if there are snapshots of it.
1063          * (Except if the only snapshots are from the branch we cloned
1064          * from.)
1065          */
1066         if (ds->ds_prev != NULL &&
1067             ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
1068                 return (EINVAL);
1069
1070         /*
1071          * If we made changes this txg, traverse_dsl_dataset won't find
1072          * them.  Try again.
1073          */
1074         if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
1075                 return (EAGAIN);
1076
1077         /* XXX we should do some i/o error checking... */
1078         return (0);
1079 }
1080
/*
 * Sync-task function that destroys a dataset (head or snapshot).
 *
 *   arg1 - the dsl_dataset_t to destroy; its open refcount is asserted
 *          to be DS_REF_MAX.
 *   tag  - the tag passed to dsl_dataset_close() when this function
 *          closes `ds' at the end.
 *   tx   - the syncing transaction.
 *
 * The pool's config rwlock is asserted to be write-held.  In outline:
 *   - detach from the previous snapshot (drop its child count when this
 *     is a clone past the branch point, otherwise point its "next" at
 *     our next snapshot);
 *   - for a snapshot: re-link the next snapshot to our previous one,
 *     merge the deadlists, free the blocks only we referenced, and
 *     update unique-byte counts;
 *   - for a head dataset: destroy the deadlist and traverse the dataset
 *     freeing blocks through kill_blkptr();
 *   - wait for the frees, subtract the freed totals from the dsl_dir,
 *     destroy the snapshot-name zap, remove the dataset from the dir
 *     (head) or from the head's snapshot namespace (snapshot), clear
 *     any bootfs reference, close the dataset and free its object.
 */
1081 static void
1082 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
1083 {
1084         dsl_dataset_t *ds = arg1;
1085         uint64_t used = 0, compressed = 0, uncompressed = 0;
1086         zio_t *zio;
1087         int err;
1088         int after_branch_point = FALSE;
1089         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1090         objset_t *mos = dp->dp_meta_objset;
1091         dsl_dataset_t *ds_prev = NULL;
1092         uint64_t obj;
1093
1094         ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
1095         ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
1096         ASSERT(ds->ds_prev == NULL ||
1097             ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
1098         ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
1099
1100         ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
1101
1102         obj = ds->ds_object;
1103
             /*
              * Detach from the previous snapshot, if any.  Use the already
              * open ds->ds_prev when there is one; otherwise open it here
              * (and close it again near the end of this function).
              */
1104         if (ds->ds_phys->ds_prev_snap_obj != 0) {
1105                 if (ds->ds_prev) {
1106                         ds_prev = ds->ds_prev;
1107                 } else {
1108                         VERIFY(0 == dsl_dataset_open_obj(dp,
1109                             ds->ds_phys->ds_prev_snap_obj, NULL,
1110                             DS_MODE_NONE, FTAG, &ds_prev));
1111                 }
1112                 after_branch_point =
1113                     (ds_prev->ds_phys->ds_next_snap_obj != obj);
1114
1115                 dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
1116                 if (after_branch_point &&
1117                     ds->ds_phys->ds_next_snap_obj == 0) {
1118                         /* This clone is toast. */
1119                         ASSERT(ds_prev->ds_phys->ds_num_children > 1);
1120                         ds_prev->ds_phys->ds_num_children--;
1121                 } else if (!after_branch_point) {
1122                         ds_prev->ds_phys->ds_next_snap_obj =
1123                             ds->ds_phys->ds_next_snap_obj;
1124                 }
1125         }
1126
             /* Root zio under which every arc_free() below is issued. */
1127         zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
1128
             /* A non-zero next snapshot means we are destroying a snapshot. */
1129         if (ds->ds_phys->ds_next_snap_obj != 0) {
1130                 blkptr_t bp;
1131                 dsl_dataset_t *ds_next;
1132                 uint64_t itor = 0;
1133
1134                 spa_scrub_restart(dp->dp_spa, tx->tx_txg);
1135
1136                 VERIFY(0 == dsl_dataset_open_obj(dp,
1137                     ds->ds_phys->ds_next_snap_obj, NULL,
1138                     DS_MODE_NONE, FTAG, &ds_next));
1139                 ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
1140
1141                 dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
1142                 ds_next->ds_phys->ds_prev_snap_obj =
1143                     ds->ds_phys->ds_prev_snap_obj;
1144                 ds_next->ds_phys->ds_prev_snap_txg =
1145                     ds->ds_phys->ds_prev_snap_txg;
1146                 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
1147                     ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
1148
1149                 /*
1150                  * Transfer to our deadlist (which will become next's
1151                  * new deadlist) any entries from next's current
1152                  * deadlist which were born before prev, and free the
1153                  * other entries.
1154                  *
1155                  * XXX we're doing this long task with the config lock held
1156                  */
1157                 while (bplist_iterate(&ds_next->ds_deadlist, &itor,
1158                     &bp) == 0) {
1159                         if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
1160                                 VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
1161                                     &bp, tx));
1162                                 if (ds_prev && !after_branch_point &&
1163                                     bp.blk_birth >
1164                                     ds_prev->ds_phys->ds_prev_snap_txg) {
1165                                         ds_prev->ds_phys->ds_unique_bytes +=
1166                                             bp_get_dasize(dp->dp_spa, &bp);
1167                                 }
1168                         } else {
1169                                 used += bp_get_dasize(dp->dp_spa, &bp);
1170                                 compressed += BP_GET_PSIZE(&bp);
1171                                 uncompressed += BP_GET_UCSIZE(&bp);
1172                                 /* XXX check return value? */
1173                                 (void) arc_free(zio, dp->dp_spa, tx->tx_txg,
1174                                     &bp, NULL, NULL, ARC_NOWAIT);
1175                         }
1176                 }
1177
1178                 /* free next's deadlist */
1179                 bplist_close(&ds_next->ds_deadlist);
1180                 bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
1181
1182                 /* set next's deadlist to our deadlist */
1183                 ds_next->ds_phys->ds_deadlist_obj =
1184                     ds->ds_phys->ds_deadlist_obj;
1185                 VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
1186                     ds_next->ds_phys->ds_deadlist_obj));
1187                 ds->ds_phys->ds_deadlist_obj = 0;
1188
1189                 if (ds_next->ds_phys->ds_next_snap_obj != 0) {
1190                         /*
1191                          * Update next's unique to include blocks which
1192                          * were previously shared by only this snapshot
1193                          * and it.  Those blocks will be born after the
1194                          * prev snap and before this snap, and will have
1195                          * died after the next snap and before the one
1196                          * after that (ie. be on the snap after next's
1197                          * deadlist).
1198                          *
1199                          * XXX we're doing this long task with the
1200                          * config lock held
1201                          */
1202                         dsl_dataset_t *ds_after_next;
1203
1204                         VERIFY(0 == dsl_dataset_open_obj(dp,
1205                             ds_next->ds_phys->ds_next_snap_obj, NULL,
1206                             DS_MODE_NONE, FTAG, &ds_after_next));
1207                         itor = 0;
1208                         while (bplist_iterate(&ds_after_next->ds_deadlist,
1209                             &itor, &bp) == 0) {
1210                                 if (bp.blk_birth >
1211                                     ds->ds_phys->ds_prev_snap_txg &&
1212                                     bp.blk_birth <=
1213                                     ds->ds_phys->ds_creation_txg) {
1214                                         ds_next->ds_phys->ds_unique_bytes +=
1215                                             bp_get_dasize(dp->dp_spa, &bp);
1216                                 }
1217                         }
1218
1219                         dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG);
1220                         ASSERT3P(ds_next->ds_prev, ==, NULL);
1221                 } else {
1222                         /*
1223                          * It would be nice to update the head dataset's
1224                          * unique.  To do so we would have to traverse
1225                          * it for blocks born after ds_prev, which is
1226                          * pretty expensive just to maintain something
1227                          * for debugging purposes.
1228                          */
                             /*
                              * ds_next is the head and holds us as its
                              * ds_prev (tagged with ds_next); swap that
                              * hold over to our own previous snapshot,
                              * or to NULL when we have none.
                              */
1229                         ASSERT3P(ds_next->ds_prev, ==, ds);
1230                         dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE,
1231                             ds_next);
1232                         if (ds_prev) {
1233                                 VERIFY(0 == dsl_dataset_open_obj(dp,
1234                                     ds->ds_phys->ds_prev_snap_obj, NULL,
1235                                     DS_MODE_NONE, ds_next, &ds_next->ds_prev));
1236                         } else {
1237                                 ds_next->ds_prev = NULL;
1238                         }
1239                 }
1240                 dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG);
1241
1242                 /*
1243                  * NB: unique_bytes is not accurate for head objsets
1244                  * because we don't update it when we delete the most
1245                  * recent snapshot -- see above comment.
1246                  */
1247                 ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
1248         } else {
1249                 /*
1250                  * There's no next snapshot, so this is a head dataset.
1251                  * Destroy the deadlist.  Unless it's a clone, the
1252                  * deadlist should be empty.  (If it's a clone, it's
1253                  * safe to ignore the deadlist contents.)
1254                  */
1255                 struct killarg ka;
1256
1257                 ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
1258                 bplist_close(&ds->ds_deadlist);
1259                 bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
1260                 ds->ds_phys->ds_deadlist_obj = 0;
1261
1262                 /*
1263                  * Free everything that we point to (that's born after
1264                  * the previous snapshot, if we are a clone)
1265                  *
1266                  * XXX we're doing this long task with the config lock held
1267                  */
1268                 ka.usedp = &used;
1269                 ka.compressedp = &compressed;
1270                 ka.uncompressedp = &uncompressed;
1271                 ka.zio = zio;
1272                 ka.tx = tx;
1273                 err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
1274                     ADVANCE_POST, kill_blkptr, &ka);
1275                 ASSERT3U(err, ==, 0);
1276         }
1277
             /* Wait for the frees issued under the root zio. */
1278         err = zio_wait(zio);
1279         ASSERT3U(err, ==, 0);
1280
             /* Subtract the freed totals from the dsl_dir's accounting. */
1281         dsl_dir_diduse_space(ds->ds_dir, -used, -compressed, -uncompressed, tx);
1282
1283         if (ds->ds_phys->ds_snapnames_zapobj) {
1284                 err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
1285                 ASSERT(err == 0);
1286         }
1287
1288         if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
1289                 /* Erase the link in the dataset */
1290                 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
1291                 ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
1292                 /*
1293                  * dsl_dir_sync_destroy() called us, they'll destroy
1294                  * the dataset.
1295                  */
1296         } else {
1297                 /* remove from snapshot namespace */
1298                 dsl_dataset_t *ds_head;
1299                 VERIFY(0 == dsl_dataset_open_obj(dp,
1300                     ds->ds_dir->dd_phys->dd_head_dataset_obj, NULL,
1301                     DS_MODE_NONE, FTAG, &ds_head));
1302                 VERIFY(0 == dsl_dataset_get_snapname(ds));
1303 #ifdef ZFS_DEBUG
1304                 {
1305                         uint64_t val;
1306                         err = zap_lookup(mos,
1307                             ds_head->ds_phys->ds_snapnames_zapobj,
1308                             ds->ds_snapname, 8, 1, &val);
1309                         ASSERT3U(err, ==, 0);
1310                         ASSERT3U(val, ==, obj);
1311                 }
1312 #endif
1313                 err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj,
1314                     ds->ds_snapname, tx);
1315                 ASSERT(err == 0);
1316                 dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG);
1317         }
1318
             /* Close ds_prev only if it was opened locally (with FTAG) above. */
1319         if (ds_prev && ds->ds_prev != ds_prev)
1320                 dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
1321
1322         spa_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
1323         dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, tag);
             /* `obj' was saved on entry; `ds' has just been closed. */
1324         VERIFY(0 == dmu_object_free(mos, obj, tx));
1325
1326 }
1327
1328 /* ARGSUSED */
1329 int
1330 dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
1331 {
1332         objset_t *os = arg1;
1333         dsl_dataset_t *ds = os->os->os_dsl_dataset;
1334         const char *snapname = arg2;
1335         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1336         int err;
1337         uint64_t value;
1338
1339         /*
1340          * We don't allow multiple snapshots of the same txg.  If there
1341          * is already one, try again.
1342          */
1343         if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
1344                 return (EAGAIN);
1345
1346         /*
1347          * Check for conflicting name snapshot name.
1348          */
1349         err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
1350             snapname, 8, 1, &value);
1351         if (err == 0)
1352                 return (EEXIST);
1353         if (err != ENOENT)
1354                 return (err);
1355
1356         /*
1357          * Check that the dataset's name is not too long.  Name consists
1358          * of the dataset's length + 1 for the @-sign + snapshot name's length
1359          */
1360         if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
1361                 return (ENAMETOOLONG);
1362
1363         ds->ds_trysnap_txg = tx->tx_txg;
1364         return (0);
1365 }
1366
1367 void
1368 dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1369 {
1370         objset_t *os = arg1;
1371         dsl_dataset_t *ds = os->os->os_dsl_dataset;
1372         const char *snapname = arg2;
1373         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1374         dmu_buf_t *dbuf;
1375         dsl_dataset_phys_t *dsphys;
1376         uint64_t dsobj;
1377         objset_t *mos = dp->dp_meta_objset;
1378         int err;
1379
1380         spa_scrub_restart(dp->dp_spa, tx->tx_txg);
1381         ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
1382
1383         dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
1384             DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
1385         VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
1386         dmu_buf_will_dirty(dbuf, tx);
1387         dsphys = dbuf->db_data;
1388         dsphys->ds_dir_obj = ds->ds_dir->dd_object;
1389         dsphys->ds_fsid_guid = unique_create();
1390         unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
1391         (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
1392             sizeof (dsphys->ds_guid));
1393         dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
1394         dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
1395         dsphys->ds_next_snap_obj = ds->ds_object;
1396         dsphys->ds_num_children = 1;
1397         dsphys->ds_creation_time = gethrestime_sec();
1398         dsphys->ds_creation_txg = tx->tx_txg;
1399         dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
1400         dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
1401         dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
1402         dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
1403         dsphys->ds_flags = ds->ds_phys->ds_flags;
1404         dsphys->ds_bp = ds->ds_phys->ds_bp;
1405         dmu_buf_rele(dbuf, FTAG);
1406
1407         ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
1408         if (ds->ds_prev) {
1409                 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
1410                     ds->ds_object ||
1411                     ds->ds_prev->ds_phys->ds_num_children > 1);
1412                 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
1413                         dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
1414                         ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
1415                             ds->ds_prev->ds_phys->ds_creation_txg);
1416                         ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
1417                 }
1418         }
1419
1420         bplist_close(&ds->ds_deadlist);
1421         dmu_buf_will_dirty(ds->ds_dbuf, tx);
1422         ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg);
1423         ds->ds_phys->ds_prev_snap_obj = dsobj;
1424         ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg;
1425         ds->ds_phys->ds_unique_bytes = 0;
1426         ds->ds_phys->ds_deadlist_obj =
1427             bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
1428         VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
1429             ds->ds_phys->ds_deadlist_obj));
1430
1431         dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
1432         err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
1433             snapname, 8, 1, &dsobj, tx);
1434         ASSERT(err == 0);
1435
1436         if (ds->ds_prev)
1437                 dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
1438         VERIFY(0 == dsl_dataset_open_obj(dp,
1439             ds->ds_phys->ds_prev_snap_obj, snapname,
1440             DS_MODE_NONE, ds, &ds->ds_prev));
1441 }
1442
1443 void
1444 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
1445 {
1446         ASSERT(dmu_tx_is_syncing(tx));
1447         ASSERT(ds->ds_user_ptr != NULL);
1448         ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
1449
1450         dsl_dir_dirty(ds->ds_dir, tx);
1451         dmu_objset_sync(ds->ds_user_ptr, zio, tx);
1452         /* Unneeded? bplist_close(&ds->ds_deadlist); */
1453 }
1454
1455 void
1456 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
1457 {
1458         dsl_dir_stats(ds->ds_dir, nv);
1459
1460         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
1461             ds->ds_phys->ds_creation_time);
1462         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
1463             ds->ds_phys->ds_creation_txg);
1464         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
1465             ds->ds_phys->ds_used_bytes);
1466
1467         if (ds->ds_phys->ds_next_snap_obj) {
1468                 /*
1469                  * This is a snapshot; override the dd's space used with
1470                  * our unique space and compression ratio.
1471                  */
1472                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
1473                     ds->ds_phys->ds_unique_bytes);
1474                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
1475                     ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
1476                     (ds->ds_phys->ds_uncompressed_bytes * 100 /
1477                     ds->ds_phys->ds_compressed_bytes));
1478         }
1479 }
1480
1481 void
1482 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
1483 {
1484         stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
1485         stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
1486         if (ds->ds_phys->ds_next_snap_obj) {
1487                 stat->dds_is_snapshot = B_TRUE;
1488                 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
1489         }
1490
1491         /* clone origin is really a dsl_dir thing... */
1492         if (ds->ds_dir->dd_phys->dd_clone_parent_obj) {
1493                 dsl_dataset_t *ods;
1494
1495                 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
1496                 VERIFY(0 == dsl_dataset_open_obj(ds->ds_dir->dd_pool,
1497                     ds->ds_dir->dd_phys->dd_clone_parent_obj,
1498                     NULL, DS_MODE_NONE, FTAG, &ods));
1499                 dsl_dataset_name(ods, stat->dds_clone_of);
1500                 dsl_dataset_close(ods, DS_MODE_NONE, FTAG);
1501                 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
1502         }
1503 }
1504
1505 uint64_t
1506 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
1507 {
1508         return (ds->ds_phys->ds_fsid_guid);
1509 }
1510
1511 void
1512 dsl_dataset_space(dsl_dataset_t *ds,
1513     uint64_t *refdbytesp, uint64_t *availbytesp,
1514     uint64_t *usedobjsp, uint64_t *availobjsp)
1515 {
1516         *refdbytesp = ds->ds_phys->ds_used_bytes;
1517         *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
1518         *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
1519         *availobjsp = DN_MAX_OBJECT - *usedobjsp;
1520 }
1521
1522 /* ARGSUSED */
1523 static int
1524 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
1525 {
1526         dsl_dataset_t *ds = arg1;
1527         char *newsnapname = arg2;
1528         dsl_dir_t *dd = ds->ds_dir;
1529         objset_t *mos = dd->dd_pool->dp_meta_objset;
1530         dsl_dataset_t *hds;
1531         uint64_t val;
1532         int err;
1533
1534         err = dsl_dataset_open_obj(dd->dd_pool,
1535             dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds);
1536         if (err)
1537                 return (err);
1538
1539         /* new name better not be in use */
1540         err = zap_lookup(mos, hds->ds_phys->ds_snapnames_zapobj,
1541             newsnapname, 8, 1, &val);
1542         dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
1543
1544         if (err == 0)
1545                 err = EEXIST;
1546         else if (err == ENOENT)
1547                 err = 0;
1548
1549         /* dataset name + 1 for the "@" + the new snapshot name must fit */
1550         if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
1551                 err = ENAMETOOLONG;
1552
1553         return (err);
1554 }
1555
1556 static void
1557 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1558 {
1559         dsl_dataset_t *ds = arg1;
1560         char *newsnapname = arg2;
1561         dsl_dir_t *dd = ds->ds_dir;
1562         objset_t *mos = dd->dd_pool->dp_meta_objset;
1563         dsl_dataset_t *hds;
1564         int err;
1565
1566         ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
1567
1568         VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
1569             dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds));
1570
1571         VERIFY(0 == dsl_dataset_get_snapname(ds));
1572         err = zap_remove(mos, hds->ds_phys->ds_snapnames_zapobj,
1573             ds->ds_snapname, tx);
1574         ASSERT3U(err, ==, 0);
1575         mutex_enter(&ds->ds_lock);
1576         (void) strcpy(ds->ds_snapname, newsnapname);
1577         mutex_exit(&ds->ds_lock);
1578         err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
1579             ds->ds_snapname, 8, 1, &ds->ds_object, tx);
1580         ASSERT3U(err, ==, 0);
1581
1582         dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
1583 }
1584
1585 struct renamearg {
1586         dsl_sync_task_group_t *dstg;
1587         char failed[MAXPATHLEN];
1588         char *oldsnap;
1589         char *newsnap;
1590 };
1591
1592 static int
1593 dsl_snapshot_rename_one(char *name, void *arg)
1594 {
1595         struct renamearg *ra = arg;
1596         dsl_dataset_t *ds = NULL;
1597         char *cp;
1598         int err;
1599
1600         cp = name + strlen(name);
1601         *cp = '@';
1602         (void) strcpy(cp + 1, ra->oldsnap);
1603         err = dsl_dataset_open(name, DS_MODE_READONLY | DS_MODE_STANDARD,
1604             ra->dstg, &ds);
1605         if (err == ENOENT) {
1606                 *cp = '\0';
1607                 return (0);
1608         }
1609         if (err) {
1610                 (void) strcpy(ra->failed, name);
1611                 *cp = '\0';
1612                 dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg);
1613                 return (err);
1614         }
1615
1616 #ifdef _KERNEL
1617         /* for all filesystems undergoing rename, we'll need to unmount it */
1618         (void) zfs_unmount_snap(name, NULL);
1619 #endif
1620
1621         *cp = '\0';
1622
1623         dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
1624             dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
1625
1626         return (0);
1627 }
1628
1629 static int
1630 dsl_recursive_rename(char *oldname, const char *newname)
1631 {
1632         int err;
1633         struct renamearg *ra;
1634         dsl_sync_task_t *dst;
1635         spa_t *spa;
1636         char *cp, *fsname = spa_strdup(oldname);
1637         int len = strlen(oldname);
1638
1639         /* truncate the snapshot name to get the fsname */
1640         cp = strchr(fsname, '@');
1641         *cp = '\0';
1642
1643         cp = strchr(fsname, '/');
1644         if (cp) {
1645                 *cp = '\0';
1646                 err = spa_open(fsname, &spa, FTAG);
1647                 *cp = '/';
1648         } else {
1649                 err = spa_open(fsname, &spa, FTAG);
1650         }
1651         if (err) {
1652                 kmem_free(fsname, len + 1);
1653                 return (err);
1654         }
1655         ra = kmem_alloc(sizeof (struct renamearg), KM_SLEEP);
1656         ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
1657
1658         ra->oldsnap = strchr(oldname, '@') + 1;
1659         ra->newsnap = strchr(newname, '@') + 1;
1660         *ra->failed = '\0';
1661
1662         err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
1663             DS_FIND_CHILDREN);
1664         kmem_free(fsname, len + 1);
1665
1666         if (err == 0) {
1667                 err = dsl_sync_task_group_wait(ra->dstg);
1668         }
1669
1670         for (dst = list_head(&ra->dstg->dstg_tasks); dst;
1671             dst = list_next(&ra->dstg->dstg_tasks, dst)) {
1672                 dsl_dataset_t *ds = dst->dst_arg1;
1673                 if (dst->dst_err) {
1674                         dsl_dir_name(ds->ds_dir, ra->failed);
1675                         (void) strcat(ra->failed, "@");
1676                         (void) strcat(ra->failed, ra->newsnap);
1677                 }
1678                 dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg);
1679         }
1680
1681         (void) strcpy(oldname, ra->failed);
1682
1683         dsl_sync_task_group_destroy(ra->dstg);
1684         kmem_free(ra, sizeof (struct renamearg));
1685         spa_close(spa, FTAG);
1686         return (err);
1687 }
1688
1689 #pragma weak dmu_objset_rename = dsl_dataset_rename
1690 int
1691 dsl_dataset_rename(char *oldname, const char *newname,
1692     boolean_t recursive)
1693 {
1694         dsl_dir_t *dd;
1695         dsl_dataset_t *ds;
1696         const char *tail;
1697         int err;
1698
1699         err = dsl_dir_open(oldname, FTAG, &dd, &tail);
1700         if (err)
1701                 return (err);
1702         if (tail == NULL) {
1703                 err = dsl_dir_rename(dd, newname);
1704                 dsl_dir_close(dd, FTAG);
1705                 return (err);
1706         }
1707         if (tail[0] != '@') {
1708                 /* the name ended in a nonexistant component */
1709                 dsl_dir_close(dd, FTAG);
1710                 return (ENOENT);
1711         }
1712
1713         dsl_dir_close(dd, FTAG);
1714
1715         /* new name must be snapshot in same filesystem */
1716         tail = strchr(newname, '@');
1717         if (tail == NULL)
1718                 return (EINVAL);
1719         tail++;
1720         if (strncmp(oldname, newname, tail - newname) != 0)
1721                 return (EXDEV);
1722
1723         if (recursive) {
1724                 err = dsl_recursive_rename(oldname, newname);
1725         } else {
1726                 err = dsl_dataset_open(oldname,
1727                     DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &ds);
1728                 if (err)
1729                         return (err);
1730
1731                 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
1732                     dsl_dataset_snapshot_rename_check,
1733                     dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
1734
1735                 dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
1736         }
1737
1738         return (err);
1739 }
1740
/*
 * Values computed by dsl_dataset_promote_check() and consumed by
 * dsl_dataset_promote_sync().
 */
struct promotearg {
	/* space (used/compressed/uncompressed) moved to the promoted dir */
	uint64_t used;
	uint64_t comp;
	uint64_t uncomp;
	/* new ds_unique_bytes for the pivot (origin) snapshot */
	uint64_t unique;
	/* object number of the pivot snapshot's new next snapshot */
	uint64_t newnext_obj;
	/* snapnames ZAP object of the origin dir's head dataset */
	uint64_t snapnames_obj;
};
1745
1746 static int
1747 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
1748 {
1749         dsl_dataset_t *hds = arg1;
1750         struct promotearg *pa = arg2;
1751         dsl_dir_t *dd = hds->ds_dir;
1752         dsl_pool_t *dp = hds->ds_dir->dd_pool;
1753         dsl_dir_t *pdd = NULL;
1754         dsl_dataset_t *ds = NULL;
1755         dsl_dataset_t *pivot_ds = NULL;
1756         dsl_dataset_t *newnext_ds = NULL;
1757         int err;
1758         char *name = NULL;
1759         uint64_t itor = 0;
1760         blkptr_t bp;
1761
1762         bzero(pa, sizeof (*pa));
1763
1764         /* Check that it is a clone */
1765         if (dd->dd_phys->dd_clone_parent_obj == 0)
1766                 return (EINVAL);
1767
1768         /* Since this is so expensive, don't do the preliminary check */
1769         if (!dmu_tx_is_syncing(tx))
1770                 return (0);
1771
1772         if (err = dsl_dataset_open_obj(dp,
1773             dd->dd_phys->dd_clone_parent_obj,
1774             NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds))
1775                 goto out;
1776         pdd = pivot_ds->ds_dir;
1777
1778         {
1779                 dsl_dataset_t *phds;
1780                 if (err = dsl_dataset_open_obj(dd->dd_pool,
1781                     pdd->dd_phys->dd_head_dataset_obj,
1782                     NULL, DS_MODE_NONE, FTAG, &phds))
1783                         goto out;
1784                 pa->snapnames_obj = phds->ds_phys->ds_snapnames_zapobj;
1785                 dsl_dataset_close(phds, DS_MODE_NONE, FTAG);
1786         }
1787
1788         if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) {
1789                 err = EXDEV;
1790                 goto out;
1791         }
1792
1793         /* find pivot point's new next ds */
1794         VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, hds->ds_object,
1795             NULL, DS_MODE_NONE, FTAG, &newnext_ds));
1796         while (newnext_ds->ds_phys->ds_prev_snap_obj != pivot_ds->ds_object) {
1797                 dsl_dataset_t *prev;
1798
1799                 if (err = dsl_dataset_open_obj(dd->dd_pool,
1800                     newnext_ds->ds_phys->ds_prev_snap_obj,
1801                     NULL, DS_MODE_NONE, FTAG, &prev))
1802                         goto out;
1803                 dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
1804                 newnext_ds = prev;
1805         }
1806         pa->newnext_obj = newnext_ds->ds_object;
1807
1808         /* compute pivot point's new unique space */
1809         while ((err = bplist_iterate(&newnext_ds->ds_deadlist,
1810             &itor, &bp)) == 0) {
1811                 if (bp.blk_birth > pivot_ds->ds_phys->ds_prev_snap_txg)
1812                         pa->unique += bp_get_dasize(dd->dd_pool->dp_spa, &bp);
1813         }
1814         if (err != ENOENT)
1815                 goto out;
1816
1817         /* Walk the snapshots that we are moving */
1818         name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1819         ds = pivot_ds;
1820         /* CONSTCOND */
1821         while (TRUE) {
1822                 uint64_t val, dlused, dlcomp, dluncomp;
1823                 dsl_dataset_t *prev;
1824
1825                 /* Check that the snapshot name does not conflict */
1826                 dsl_dataset_name(ds, name);
1827                 err = zap_lookup(dd->dd_pool->dp_meta_objset,
1828                     hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
1829                     8, 1, &val);
1830                 if (err != ENOENT) {
1831                         if (err == 0)
1832                                 err = EEXIST;
1833                         goto out;
1834                 }
1835
1836                 /*
1837                  * compute space to transfer.  Each snapshot gave birth to:
1838                  * (my used) - (prev's used) + (deadlist's used)
1839                  */
1840                 pa->used += ds->ds_phys->ds_used_bytes;
1841                 pa->comp += ds->ds_phys->ds_compressed_bytes;
1842                 pa->uncomp += ds->ds_phys->ds_uncompressed_bytes;
1843
1844                 /* If we reach the first snapshot, we're done. */
1845                 if (ds->ds_phys->ds_prev_snap_obj == 0)
1846                         break;
1847
1848                 if (err = bplist_space(&ds->ds_deadlist,
1849                     &dlused, &dlcomp, &dluncomp))
1850                         goto out;
1851                 if (err = dsl_dataset_open_obj(dd->dd_pool,
1852                     ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
1853                     FTAG, &prev))
1854                         goto out;
1855                 pa->used += dlused - prev->ds_phys->ds_used_bytes;
1856                 pa->comp += dlcomp - prev->ds_phys->ds_compressed_bytes;
1857                 pa->uncomp += dluncomp - prev->ds_phys->ds_uncompressed_bytes;
1858
1859                 /*
1860                  * We could be a clone of a clone.  If we reach our
1861                  * parent's branch point, we're done.
1862                  */
1863                 if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
1864                         dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
1865                         break;
1866                 }
1867                 if (ds != pivot_ds)
1868                         dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
1869                 ds = prev;
1870         }
1871
1872         /* Check that there is enough space here */
1873         err = dsl_dir_transfer_possible(pdd, dd, pa->used);
1874
1875 out:
1876         if (ds && ds != pivot_ds)
1877                 dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
1878         if (pivot_ds)
1879                 dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG);
1880         if (newnext_ds)
1881                 dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
1882         if (name)
1883                 kmem_free(name, MAXPATHLEN);
1884         return (err);
1885 }
1886
1887 static void
1888 dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1889 {
1890         dsl_dataset_t *hds = arg1;
1891         struct promotearg *pa = arg2;
1892         dsl_dir_t *dd = hds->ds_dir;
1893         dsl_pool_t *dp = hds->ds_dir->dd_pool;
1894         dsl_dir_t *pdd = NULL;
1895         dsl_dataset_t *ds, *pivot_ds;
1896         char *name;
1897
1898         ASSERT(dd->dd_phys->dd_clone_parent_obj != 0);
1899         ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
1900
1901         VERIFY(0 == dsl_dataset_open_obj(dp,
1902             dd->dd_phys->dd_clone_parent_obj,
1903             NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds));
1904         /*
1905          * We need to explicitly open pdd, since pivot_ds's pdd will be
1906          * changing.
1907          */
1908         VERIFY(0 == dsl_dir_open_obj(dp, pivot_ds->ds_dir->dd_object,
1909             NULL, FTAG, &pdd));
1910
1911         /* move snapshots to this dir */
1912         name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1913         ds = pivot_ds;
1914         /* CONSTCOND */
1915         while (TRUE) {
1916                 dsl_dataset_t *prev;
1917
1918                 /* move snap name entry */
1919                 dsl_dataset_name(ds, name);
1920                 VERIFY(0 == zap_remove(dp->dp_meta_objset,
1921                     pa->snapnames_obj, ds->ds_snapname, tx));
1922                 VERIFY(0 == zap_add(dp->dp_meta_objset,
1923                     hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
1924                     8, 1, &ds->ds_object, tx));
1925
1926                 /* change containing dsl_dir */
1927                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1928                 ASSERT3U(ds->ds_phys->ds_dir_obj, ==, pdd->dd_object);
1929                 ds->ds_phys->ds_dir_obj = dd->dd_object;
1930                 ASSERT3P(ds->ds_dir, ==, pdd);
1931                 dsl_dir_close(ds->ds_dir, ds);
1932                 VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
1933                     NULL, ds, &ds->ds_dir));
1934
1935                 ASSERT3U(dsl_prop_numcb(ds), ==, 0);
1936
1937                 if (ds->ds_phys->ds_prev_snap_obj == 0)
1938                         break;
1939
1940                 VERIFY(0 == dsl_dataset_open_obj(dp,
1941                     ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
1942                     FTAG, &prev));
1943
1944                 if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
1945                         dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
1946                         break;
1947                 }
1948                 if (ds != pivot_ds)
1949                         dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
1950                 ds = prev;
1951         }
1952         if (ds != pivot_ds)
1953                 dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
1954
1955         /* change pivot point's next snap */
1956         dmu_buf_will_dirty(pivot_ds->ds_dbuf, tx);
1957         pivot_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj;
1958
1959         /* change clone_parent-age */
1960         dmu_buf_will_dirty(dd->dd_dbuf, tx);
1961         ASSERT3U(dd->dd_phys->dd_clone_parent_obj, ==, pivot_ds->ds_object);
1962         dd->dd_phys->dd_clone_parent_obj = pdd->dd_phys->dd_clone_parent_obj;
1963         dmu_buf_will_dirty(pdd->dd_dbuf, tx);
1964         pdd->dd_phys->dd_clone_parent_obj = pivot_ds->ds_object;
1965
1966         /* change space accounting */
1967         dsl_dir_diduse_space(pdd, -pa->used, -pa->comp, -pa->uncomp, tx);
1968         dsl_dir_diduse_space(dd, pa->used, pa->comp, pa->uncomp, tx);
1969         pivot_ds->ds_phys->ds_unique_bytes = pa->unique;
1970
1971         dsl_dir_close(pdd, FTAG);
1972         dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG);
1973         kmem_free(name, MAXPATHLEN);
1974 }
1975
1976 int
1977 dsl_dataset_promote(const char *name)
1978 {
1979         dsl_dataset_t *ds;
1980         int err;
1981         dmu_object_info_t doi;
1982         struct promotearg pa;
1983
1984         err = dsl_dataset_open(name, DS_MODE_NONE, FTAG, &ds);
1985         if (err)
1986                 return (err);
1987
1988         err = dmu_object_info(ds->ds_dir->dd_pool->dp_meta_objset,
1989             ds->ds_phys->ds_snapnames_zapobj, &doi);
1990         if (err) {
1991                 dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
1992                 return (err);
1993         }
1994
1995         /*
1996          * Add in 128x the snapnames zapobj size, since we will be moving
1997          * a bunch of snapnames to the promoted ds, and dirtying their
1998          * bonus buffers.
1999          */
2000         err = dsl_sync_task_do(ds->ds_dir->dd_pool,
2001             dsl_dataset_promote_check,
2002             dsl_dataset_promote_sync, ds, &pa, 2 + 2 * doi.doi_physical_blks);
2003         dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
2004         return (err);
2005 }
2006
2007 /*
2008  * Given a pool name and a dataset object number in that pool,
2009  * return the name of that dataset.
2010  */
2011 int
2012 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
2013 {
2014         spa_t *spa;
2015         dsl_pool_t *dp;
2016         dsl_dataset_t *ds = NULL;
2017         int error;
2018
2019         if ((error = spa_open(pname, &spa, FTAG)) != 0)
2020                 return (error);
2021         dp = spa_get_dsl(spa);
2022         rw_enter(&dp->dp_config_rwlock, RW_READER);
2023         if ((error = dsl_dataset_open_obj(dp, obj,
2024             NULL, DS_MODE_NONE, FTAG, &ds)) != 0) {
2025                 rw_exit(&dp->dp_config_rwlock);
2026                 spa_close(spa, FTAG);
2027                 return (error);
2028         }
2029         dsl_dataset_name(ds, buf);
2030         dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
2031         rw_exit(&dp->dp_config_rwlock);
2032         spa_close(spa, FTAG);
2033
2034         return (0);
2035 }