]> CyberLeo.Net >> Repos - FreeBSD/releng/9.0.git/blob - sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
Copy stable/9 to releng/9.0 as part of the FreeBSD 9.0-RELEASE release
[FreeBSD/releng/9.0.git] / sys / cddl / contrib / opensolaris / uts / common / fs / zfs / dsl_dir.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24
25 #include <sys/dmu.h>
26 #include <sys/dmu_objset.h>
27 #include <sys/dmu_tx.h>
28 #include <sys/dsl_dataset.h>
29 #include <sys/dsl_dir.h>
30 #include <sys/dsl_prop.h>
31 #include <sys/dsl_synctask.h>
32 #include <sys/dsl_deleg.h>
33 #include <sys/spa.h>
34 #include <sys/metaslab.h>
35 #include <sys/zap.h>
36 #include <sys/zio.h>
37 #include <sys/arc.h>
38 #include <sys/sunddi.h>
39 #include <sys/zvol.h>
40 #include "zfs_namecheck.h"
41
42 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
43 static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx);
44
45
/* ARGSUSED */
static void
dsl_dir_evict(dmu_buf_t *db, void *arg)
{
	dsl_dir_t *dd = arg;
	dsl_pool_t *dp = dd->dd_pool;
	int t;

	/*
	 * Eviction callback registered on the dir's bonus buffer by
	 * dsl_dir_open_obj(); tears down the in-core dsl_dir_t.  The
	 * dir must be quiescent: not on the dirty list and with no
	 * outstanding reservations or pending writes in any txg.
	 */
	for (t = 0; t < TXG_SIZE; t++) {
		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
		ASSERT(dd->dd_tempreserved[t] == 0);
		ASSERT(dd->dd_space_towrite[t] == 0);
	}

	/* Drop the hold on our parent dir taken at instantiation time. */
	if (dd->dd_parent)
		dsl_dir_close(dd->dd_parent, dd);

	/* Release the instantiate-to-evict spa hold (see dsl_dir_open_obj). */
	spa_close(dd->dd_pool->dp_spa, dd);

	/*
	 * The props callback list should have been cleaned up by
	 * objset_evict().
	 */
	list_destroy(&dd->dd_prop_cbs);
	mutex_destroy(&dd->dd_lock);
	kmem_free(dd, sizeof (dsl_dir_t));
}
73
/*
 * Hold the dsl_dir with object number "ddobj", returning it in *ddp.
 * "tail", if non-NULL, is the dir's name in its parent's child-dir zap
 * (avoids a zap_value_search()); when NULL the name is recovered from
 * the parent's zap, or taken from the spa name for the root dir.
 * Caller must hold dp_config_rwlock or be in syncing context.  On
 * success the caller owns both a bonus-buffer hold and an open-to-close
 * spa reference under "tag"; release with dsl_dir_close().
 */
int
dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
    const char *tail, void *tag, dsl_dir_t **ddp)
{
	dmu_buf_t *dbuf;
	dsl_dir_t *dd;
	int err;

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
	    dsl_pool_sync_context(dp));

	err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
	if (err)
		return (err);
	dd = dmu_buf_get_user(dbuf);
#ifdef ZFS_DEBUG
	{
		dmu_object_info_t doi;
		dmu_object_info_from_db(dbuf, &doi);
		ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR);
		ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
	}
#endif
	if (dd == NULL) {
		/* No cached dsl_dir_t attached yet; instantiate one. */
		dsl_dir_t *winner;

		dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
		dd->dd_object = ddobj;
		dd->dd_dbuf = dbuf;
		dd->dd_pool = dp;
		dd->dd_phys = dbuf->db_data;
		mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);

		list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
		    offsetof(dsl_prop_cb_record_t, cbr_node));

		/* Seed the cached snapshot-dir change time. */
		dsl_dir_snap_cmtime_update(dd);

		if (dd->dd_phys->dd_parent_obj) {
			/* Hold our parent for the life of this dsl_dir_t. */
			err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
			    NULL, dd, &dd->dd_parent);
			if (err)
				goto errout;
			if (tail) {
#ifdef ZFS_DEBUG
				uint64_t foundobj;

				err = zap_lookup(dp->dp_meta_objset,
				    dd->dd_parent->dd_phys->dd_child_dir_zapobj,
				    tail, sizeof (foundobj), 1, &foundobj);
				ASSERT(err || foundobj == ddobj);
#endif
				(void) strcpy(dd->dd_myname, tail);
			} else {
				/* Recover our name from the parent's zap. */
				err = zap_value_search(dp->dp_meta_objset,
				    dd->dd_parent->dd_phys->dd_child_dir_zapobj,
				    ddobj, 0, dd->dd_myname);
			}
			if (err)
				goto errout;
		} else {
			/* Root dir: its name is the pool's name. */
			(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
		}

		if (dsl_dir_is_clone(dd)) {
			dmu_buf_t *origin_bonus;
			dsl_dataset_phys_t *origin_phys;

			/*
			 * We can't open the origin dataset, because
			 * that would require opening this dsl_dir.
			 * Just look at its phys directly instead.
			 */
			err = dmu_bonus_hold(dp->dp_meta_objset,
			    dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
			if (err)
				goto errout;
			origin_phys = origin_bonus->db_data;
			dd->dd_origin_txg =
			    origin_phys->ds_creation_txg;
			dmu_buf_rele(origin_bonus, FTAG);
		}

		/*
		 * Attach dd to the bonus buffer.  If another thread raced
		 * us and attached its copy first, discard ours and use
		 * the winner's.
		 */
		winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
		    dsl_dir_evict);
		if (winner) {
			if (dd->dd_parent)
				dsl_dir_close(dd->dd_parent, dd);
			mutex_destroy(&dd->dd_lock);
			kmem_free(dd, sizeof (dsl_dir_t));
			dd = winner;
		} else {
			/* Instantiate-to-evict hold, dropped in evict. */
			spa_open_ref(dp->dp_spa, dd);
		}
	}

	/*
	 * The dsl_dir_t has both open-to-close and instantiate-to-evict
	 * holds on the spa.  We need the open-to-close holds because
	 * otherwise the spa_refcnt wouldn't change when we open a
	 * dir which the spa also has open, so we could incorrectly
	 * think it was OK to unload/export/destroy the pool.  We need
	 * the instantiate-to-evict hold because the dsl_dir_t has a
	 * pointer to the dd_pool, which has a pointer to the spa_t.
	 */
	spa_open_ref(dp->dp_spa, tag);
	ASSERT3P(dd->dd_pool, ==, dp);
	ASSERT3U(dd->dd_object, ==, ddobj);
	ASSERT3P(dd->dd_dbuf, ==, dbuf);
	*ddp = dd;
	return (0);

errout:
	/* Undo the partial instantiation and drop the bonus hold. */
	if (dd->dd_parent)
		dsl_dir_close(dd->dd_parent, dd);
	mutex_destroy(&dd->dd_lock);
	kmem_free(dd, sizeof (dsl_dir_t));
	dmu_buf_rele(dbuf, tag);
	return (err);

}
195
/*
 * Release a hold obtained from dsl_dir_open_obj()/dsl_dir_open():
 * drops the open-to-close spa reference and the bonus-buffer hold
 * registered under "tag".
 */
void
dsl_dir_close(dsl_dir_t *dd, void *tag)
{
	dprintf_dd(dd, "%s\n", "");
	spa_close(dd->dd_pool->dp_spa, tag);
	dmu_buf_rele(dd->dd_dbuf, tag);
}
203
204 /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
205 void
206 dsl_dir_name(dsl_dir_t *dd, char *buf)
207 {
208         if (dd->dd_parent) {
209                 dsl_dir_name(dd->dd_parent, buf);
210                 (void) strcat(buf, "/");
211         } else {
212                 buf[0] = '\0';
213         }
214         if (!MUTEX_HELD(&dd->dd_lock)) {
215                 /*
216                  * recursive mutex so that we can use
217                  * dprintf_dd() with dd_lock held
218                  */
219                 mutex_enter(&dd->dd_lock);
220                 (void) strcat(buf, dd->dd_myname);
221                 mutex_exit(&dd->dd_lock);
222         } else {
223                 (void) strcat(buf, dd->dd_myname);
224         }
225 }
226
227 /* Calculate name legnth, avoiding all the strcat calls of dsl_dir_name */
228 int
229 dsl_dir_namelen(dsl_dir_t *dd)
230 {
231         int result = 0;
232
233         if (dd->dd_parent) {
234                 /* parent's name + 1 for the "/" */
235                 result = dsl_dir_namelen(dd->dd_parent) + 1;
236         }
237
238         if (!MUTEX_HELD(&dd->dd_lock)) {
239                 /* see dsl_dir_name */
240                 mutex_enter(&dd->dd_lock);
241                 result += strlen(dd->dd_myname);
242                 mutex_exit(&dd->dd_lock);
243         } else {
244                 result += strlen(dd->dd_myname);
245         }
246
247         return (result);
248 }
249
250 static int
251 getcomponent(const char *path, char *component, const char **nextp)
252 {
253         char *p;
254         if ((path == NULL) || (path[0] == '\0'))
255                 return (ENOENT);
256         /* This would be a good place to reserve some namespace... */
257         p = strpbrk(path, "/@");
258         if (p && (p[1] == '/' || p[1] == '@')) {
259                 /* two separators in a row */
260                 return (EINVAL);
261         }
262         if (p == NULL || p == path) {
263                 /*
264                  * if the first thing is an @ or /, it had better be an
265                  * @ and it had better not have any more ats or slashes,
266                  * and it had better have something after the @.
267                  */
268                 if (p != NULL &&
269                     (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
270                         return (EINVAL);
271                 if (strlen(path) >= MAXNAMELEN)
272                         return (ENAMETOOLONG);
273                 (void) strcpy(component, path);
274                 p = NULL;
275         } else if (p[0] == '/') {
276                 if (p-path >= MAXNAMELEN)
277                         return (ENAMETOOLONG);
278                 (void) strncpy(component, path, p - path);
279                 component[p-path] = '\0';
280                 p++;
281         } else if (p[0] == '@') {
282                 /*
283                  * if the next separator is an @, there better not be
284                  * any more slashes.
285                  */
286                 if (strchr(path, '/'))
287                         return (EINVAL);
288                 if (p-path >= MAXNAMELEN)
289                         return (ENAMETOOLONG);
290                 (void) strncpy(component, path, p - path);
291                 component[p-path] = '\0';
292         } else {
293                 ASSERT(!"invalid p");
294         }
295         *nextp = p;
296         return (0);
297 }
298
/*
 * Same as dsl_dir_open(), but if "spa" is non-NULL use it rather than
 * opening the pool named by the first component of "name".  Walks the
 * '/'-separated components down from the root dir, stopping at a
 * snapshot ('@') component or at the first name not found; what's
 * left is returned in *tailp (see dsl_dir_open() for the contract).
 */
int
dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
    dsl_dir_t **ddp, const char **tailp)
{
	char buf[MAXNAMELEN];
	const char *next, *nextnext = NULL;
	int err;
	dsl_dir_t *dd;
	dsl_pool_t *dp;
	uint64_t ddobj;
	int openedspa = FALSE;

	dprintf("%s\n", name);

	/* The first component of "name" is the pool name. */
	err = getcomponent(name, buf, &next);
	if (err)
		return (err);
	if (spa == NULL) {
		err = spa_open(buf, &spa, FTAG);
		if (err) {
			dprintf("spa_open(%s) failed\n", buf);
			return (err);
		}
		openedspa = TRUE;

		/* XXX this assertion belongs in spa_open */
		ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa)));
	}

	dp = spa_get_dsl(spa);

	/* Start at the root dir and descend one component at a time. */
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
	if (err) {
		rw_exit(&dp->dp_config_rwlock);
		if (openedspa)
			spa_close(spa, FTAG);
		return (err);
	}

	while (next != NULL) {
		dsl_dir_t *child_ds;
		err = getcomponent(next, buf, &nextnext);
		if (err)
			break;
		ASSERT(next[0] != '\0');
		if (next[0] == '@')
			break;
		dprintf("looking up %s in obj%lld\n",
		    buf, dd->dd_phys->dd_child_dir_zapobj);

		err = zap_lookup(dp->dp_meta_objset,
		    dd->dd_phys->dd_child_dir_zapobj,
		    buf, sizeof (ddobj), 1, &ddobj);
		if (err) {
			/*
			 * ENOENT just means we stop descending here;
			 * the unmatched remainder goes to *tailp below.
			 */
			if (err == ENOENT)
				err = 0;
			break;
		}

		/* Hold the child before dropping the current dir. */
		err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds);
		if (err)
			break;
		dsl_dir_close(dd, tag);
		dd = child_ds;
		next = nextnext;
	}
	rw_exit(&dp->dp_config_rwlock);

	if (err) {
		dsl_dir_close(dd, tag);
		if (openedspa)
			spa_close(spa, FTAG);
		return (err);
	}

	/*
	 * It's an error if there's more than one component left, or
	 * tailp==NULL and there's any component left.
	 */
	if (next != NULL &&
	    (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
		/* bad path name */
		dsl_dir_close(dd, tag);
		dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
		err = ENOENT;
	}
	if (tailp)
		*tailp = next;
	if (openedspa)
		spa_close(spa, FTAG);
	*ddp = dd;
	return (err);
}
397
/*
 * Return the dsl_dir_t, and possibly the last component which couldn't
 * be found in *tail.  Return NULL if the path is bogus, or if
 * tail==NULL and we couldn't parse the whole name.  (*tail)[0] == '@'
 * means that the last component is a snapshot.
 */
int
dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
{
	/* "name" starts with a pool name; dsl_dir_open_spa opens it. */
	return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
}
409
/*
 * Create a new dsl_dir object named "name" under parent "pds" (NULL
 * means create the pool's root dir), link it into the parent's
 * child-dir zap (or the MOS directory object for the root), and
 * initialize its phys block.  Syncing context only.  Returns the
 * new dir's object number.
 */
uint64_t
dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
    dmu_tx_t *tx)
{
	objset_t *mos = dp->dp_meta_objset;
	uint64_t ddobj;
	dsl_dir_phys_t *ddphys;
	dmu_buf_t *dbuf;

	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
	    DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
	if (pds) {
		VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
		    name, sizeof (uint64_t), 1, &ddobj, tx));
	} else {
		/* it's the root dir */
		VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
	}
	VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	ddphys = dbuf->db_data;

	ddphys->dd_creation_time = gethrestime_sec();
	if (pds)
		ddphys->dd_parent_obj = pds->dd_object;
	/* Per-dir property zap and child-directory zap. */
	ddphys->dd_props_zapobj = zap_create(mos,
	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
	ddphys->dd_child_dir_zapobj = zap_create(mos,
	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
	/* Newer pool versions track used space broken down by category. */
	if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
		ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
	dmu_buf_rele(dbuf, FTAG);

	return (ddobj);
}
446
/*
 * dsl_sync_task check function for destroying a dsl_dir: fail with
 * EBUSY if anyone besides the destroyer holds the dir, or EEXIST if
 * it still has child dirs.  arg1 is the head dataset being destroyed.
 */
/* ARGSUSED */
int
dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dsl_dir_t *dd = ds->ds_dir;
	dsl_pool_t *dp = dd->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	int err;
	uint64_t count;

	/*
	 * There should be exactly two holds, both from
	 * dsl_dataset_destroy: one on the dd directory, and one on its
	 * head ds.  Otherwise, someone is trying to lookup something
	 * inside this dir while we want to destroy it.  The
	 * config_rwlock ensures that nobody else opens it after we
	 * check.
	 */
	if (dmu_buf_refcount(dd->dd_dbuf) > 2)
		return (EBUSY);

	err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count);
	if (err)
		return (err);
	if (count != 0)
		return (EEXIST);

	return (0);
}
477
/*
 * dsl_sync_task sync function for destroying a dsl_dir: drop its
 * reservation, destroy its zaps and delegation data, unlink it from
 * its parent, and free the object.  Consumes the caller's hold on
 * the dir (registered under "tag").
 */
void
dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dsl_dir_t *dd = ds->ds_dir;
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	dsl_prop_setarg_t psa;
	uint64_t value = 0;
	uint64_t obj;
	dd_used_t t;

	ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);

	/* Remove our reservation. */
	dsl_prop_setarg_init_uint64(&psa, "reservation",
	    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
	    &value);
	psa.psa_effective_value = 0;	/* predict default value */

	dsl_dir_set_reservation_sync(ds, &psa, tx);

	/* With the reservation gone, all accounting should be zero. */
	ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0);
	ASSERT3U(dd->dd_phys->dd_reserved, ==, 0);
	for (t = 0; t < DD_USED_NUM; t++)
		ASSERT3U(dd->dd_phys->dd_used_breakdown[t], ==, 0);

	VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
	VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
	VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
	VERIFY(0 == zap_remove(mos,
	    dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));

	/* Drop our hold before freeing the object it pins. */
	obj = dd->dd_object;
	dsl_dir_close(dd, tag);
	VERIFY(0 == dmu_object_free(mos, obj, tx));
}
515
516 boolean_t
517 dsl_dir_is_clone(dsl_dir_t *dd)
518 {
519         return (dd->dd_phys->dd_origin_obj &&
520             (dd->dd_pool->dp_origin_snap == NULL ||
521             dd->dd_phys->dd_origin_obj !=
522             dd->dd_pool->dp_origin_snap->ds_object));
523 }
524
/*
 * Add this dir's space-accounting properties (and origin, for clones)
 * to the nvlist "nv" for reporting to userland.
 */
void
dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
{
	/* dd_lock keeps the phys accounting values consistent. */
	mutex_enter(&dd->dd_lock);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
	    dd->dd_phys->dd_used_bytes);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
	    dd->dd_phys->dd_reserved);
	/* Ratio is reported as a percentage; 100 when nothing compressed. */
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
	    dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
	    (dd->dd_phys->dd_uncompressed_bytes * 100 /
	    dd->dd_phys->dd_compressed_bytes));
	if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
		    dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]);
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
		    dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]);
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
		    dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]);
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
		    dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] +
		    dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]);
	}
	mutex_exit(&dd->dd_lock);

	/* Resolving the origin's name requires the config lock. */
	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
	if (dsl_dir_is_clone(dd)) {
		dsl_dataset_t *ds;
		char buf[MAXNAMELEN];

		VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
		    dd->dd_phys->dd_origin_obj, FTAG, &ds));
		dsl_dataset_name(ds, buf);
		dsl_dataset_rele(ds, FTAG);
		dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
	}
	rw_exit(&dd->dd_pool->dp_config_rwlock);
}
564
/*
 * Mark this dir dirty in tx's txg.  The first time a dir is dirtied
 * in a txg it is added to the pool's dirty-dir list and an extra
 * dbuf hold is taken; dsl_dir_sync() drops that hold when the dir
 * is written out.
 */
void
dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dd->dd_pool;

	ASSERT(dd->dd_phys);

	if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) {
		/* up the hold count until we can be written out */
		dmu_buf_add_ref(dd->dd_dbuf, dd);
	}
}
577
578 static int64_t
579 parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
580 {
581         uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
582         uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
583         return (new_accounted - old_accounted);
584 }
585
/*
 * Write out this dir's dirty state for tx's txg: dirty the dbuf,
 * clear the per-txg estimated-write counter, and drop the extra
 * hold taken by dsl_dir_dirty().
 */
void
dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	mutex_enter(&dd->dd_lock);
	/* All tempreserves for this txg must have been cleared by now. */
	ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0);
	dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
	    dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
	dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
	mutex_exit(&dd->dd_lock);

	/* release the hold from dsl_dir_dirty */
	dmu_buf_rele(dd->dd_dbuf, dd);
}
603
604 static uint64_t
605 dsl_dir_space_towrite(dsl_dir_t *dd)
606 {
607         uint64_t space = 0;
608         int i;
609
610         ASSERT(MUTEX_HELD(&dd->dd_lock));
611
612         for (i = 0; i < TXG_SIZE; i++) {
613                 space += dd->dd_space_towrite[i&TXG_MASK];
614                 ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
615         }
616         return (space);
617 }
618
/*
 * How much space would dd have available if ancestor had delta applied
 * to it?  If ondiskonly is set, we're only interested in what's
 * on-disk, not estimated pending changes.
 */
uint64_t
dsl_dir_space_available(dsl_dir_t *dd,
    dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
{
	uint64_t parentspace, myspace, quota, used;

	/*
	 * If there are no restrictions otherwise, assume we have
	 * unlimited space available.
	 */
	quota = UINT64_MAX;
	parentspace = UINT64_MAX;

	/* Recurse to the root: our space is bounded by every ancestor. */
	if (dd->dd_parent != NULL) {
		parentspace = dsl_dir_space_available(dd->dd_parent,
		    ancestor, delta, ondiskonly);
	}

	mutex_enter(&dd->dd_lock);
	if (dd->dd_phys->dd_quota != 0)
		quota = dd->dd_phys->dd_quota;
	used = dd->dd_phys->dd_used_bytes;
	if (!ondiskonly)
		used += dsl_dir_space_towrite(dd);

	/* At the root, the pool size itself is the ultimate quota. */
	if (dd->dd_parent == NULL) {
		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
		quota = MIN(quota, poolsize);
	}

	if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
		/*
		 * We have some space reserved, in addition to what our
		 * parent gave us.
		 */
		parentspace += dd->dd_phys->dd_reserved - used;
	}

	/* Apply the hypothetical delta at the requested ancestor. */
	if (dd == ancestor) {
		ASSERT(delta <= 0);
		ASSERT(used >= -delta);
		used += delta;
		if (parentspace != UINT64_MAX)
			parentspace -= delta;
	}

	if (used > quota) {
		/* over quota */
		myspace = 0;
	} else {
		/*
		 * the lesser of the space provided by our parent and
		 * the space left in our quota
		 */
		myspace = MIN(parentspace, quota - used);
	}

	mutex_exit(&dd->dd_lock);

	return (myspace);
}
685
/*
 * One entry on a tempreserve list: a reservation charged against
 * the pool (tr_dp set), a dsl_dir (tr_ds set), or the ARC (neither
 * set).  See dsl_dir_tempreserve_space()/_clear().
 */
struct tempreserve {
	list_node_t tr_node;	/* linkage on the caller's tr_list */
	dsl_pool_t *tr_dp;	/* non-NULL: pool-wide reservation */
	dsl_dir_t *tr_ds;	/* non-NULL: per-dir reservation */
	uint64_t tr_size;	/* bytes reserved */
};
692
/*
 * Reserve "asize" bytes in this dir for tx's txg, checking quota and
 * recursing up to the parent with the parent-visible delta.  Each
 * successful per-dir reservation is recorded on tr_list so it can be
 * undone by dsl_dir_tempreserve_clear().  Returns 0 on success,
 * ERESTART when the caller should wait for pending frees and retry,
 * or EDQUOT/ENOSPC when hard over the dir quota / pool size.
 */
static int
dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
    boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
    dmu_tx_t *tx, boolean_t first)
{
	uint64_t txg = tx->tx_txg;
	uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
	uint64_t deferred = 0;
	struct tempreserve *tr;
	int retval = EDQUOT;
	int txgidx = txg & TXG_MASK;
	int i;
	uint64_t ref_rsrv = 0;

	ASSERT3U(txg, !=, 0);
	ASSERT3S(asize, >, 0);

	mutex_enter(&dd->dd_lock);

	/*
	 * Check against the dsl_dir's quota.  We don't add in the delta
	 * when checking for over-quota because they get one free hit.
	 */
	est_inflight = dsl_dir_space_towrite(dd);
	for (i = 0; i < TXG_SIZE; i++)
		est_inflight += dd->dd_tempreserved[i];
	used_on_disk = dd->dd_phys->dd_used_bytes;

	/*
	 * On the first iteration, fetch the dataset's used-on-disk and
	 * refreservation values. Also, if checkrefquota is set, test if
	 * allocating this space would exceed the dataset's refquota.
	 */
	if (first && tx->tx_objset) {
		int error;
		dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;

		error = dsl_dataset_check_quota(ds, checkrefquota,
		    asize, est_inflight, &used_on_disk, &ref_rsrv);
		if (error) {
			mutex_exit(&dd->dd_lock);
			return (error);
		}
	}

	/*
	 * If this transaction will result in a net free of space,
	 * we want to let it through.
	 */
	if (ignorequota || netfree || dd->dd_phys->dd_quota == 0)
		quota = UINT64_MAX;
	else
		quota = dd->dd_phys->dd_quota;

	/*
	 * Adjust the quota against the actual pool size at the root
	 * minus any outstanding deferred frees.
	 * To ensure that it's possible to remove files from a full
	 * pool without inducing transient overcommits, we throttle
	 * netfree transactions against a quota that is slightly larger,
	 * but still within the pool's allocation slop.  In cases where
	 * we're very close to full, this will allow a steady trickle of
	 * removes to get through.
	 */
	if (dd->dd_parent == NULL) {
		spa_t *spa = dd->dd_pool->dp_spa;
		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
		deferred = metaslab_class_get_deferred(spa_normal_class(spa));
		if (poolsize - deferred < quota) {
			quota = poolsize - deferred;
			retval = ENOSPC;
		}
	}

	/*
	 * If they are requesting more space, and our current estimate
	 * is over quota, they get to try again unless the actual
	 * on-disk is over quota and there are no pending changes (which
	 * may free up space for us).
	 */
	if (used_on_disk + est_inflight >= quota) {
		if (est_inflight > 0 || used_on_disk < quota ||
		    (retval == ENOSPC && used_on_disk < quota + deferred))
			retval = ERESTART;
		dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
		    "quota=%lluK tr=%lluK err=%d\n",
		    used_on_disk>>10, est_inflight>>10,
		    quota>>10, asize>>10, retval);
		mutex_exit(&dd->dd_lock);
		return (retval);
	}

	/* We need to up our estimated delta before dropping dd_lock */
	dd->dd_tempreserved[txgidx] += asize;

	parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
	    asize - ref_rsrv);
	mutex_exit(&dd->dd_lock);

	/* Record this reservation so _clear() can undo it. */
	tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
	tr->tr_ds = dd;
	tr->tr_size = asize;
	list_insert_tail(tr_list, tr);

	/* see if it's OK with our parent */
	if (dd->dd_parent && parent_rsrv) {
		boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);

		return (dsl_dir_tempreserve_impl(dd->dd_parent,
		    parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
	} else {
		return (0);
	}
}
807
/*
 * Reserve space in this dsl_dir, to be used in this tx's txg.
 * After the space has been dirtied (and dsl_dir_willuse_space()
 * has been called), the reservation should be canceled, using
 * dsl_dir_tempreserve_clear().
 *
 * Reservations are taken in three stages -- ARC (lsize), pool
 * (asize), then the dir hierarchy (asize) -- each recorded on
 * tr_list; on any failure the partial list is cleared before
 * returning.  On success *tr_cookiep holds the list for the
 * later _clear() call.
 */
int
dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
    uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx)
{
	int err;
	list_t *tr_list;

	/* Nothing to reserve. */
	if (asize == 0) {
		*tr_cookiep = NULL;
		return (0);
	}

	tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
	list_create(tr_list, sizeof (struct tempreserve),
	    offsetof(struct tempreserve, tr_node));
	ASSERT3S(asize, >, 0);
	ASSERT3S(fsize, >=, 0);

	/* Stage 1: ARC throttle (entry has neither tr_dp nor tr_ds set). */
	err = arc_tempreserve_space(lsize, tx->tx_txg);
	if (err == 0) {
		struct tempreserve *tr;

		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
		tr->tr_size = lsize;
		list_insert_tail(tr_list, tr);

		/* Stage 2: pool-wide write throttle. */
		err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
	} else {
		if (err == EAGAIN) {
			/* Back off a txg and have the caller retry. */
			txg_delay(dd->dd_pool, tx->tx_txg, 1);
			err = ERESTART;
		}
		dsl_pool_memory_pressure(dd->dd_pool);
	}

	if (err == 0) {
		struct tempreserve *tr;

		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
		tr->tr_dp = dd->dd_pool;
		tr->tr_size = asize;
		list_insert_tail(tr_list, tr);

		/* Stage 3: this dir and its ancestors (quota checks). */
		err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
		    FALSE, asize > usize, tr_list, tx, TRUE);
	}

	/* On failure, undo whatever partial reservations we made. */
	if (err)
		dsl_dir_tempreserve_clear(tr_list, tx);
	else
		*tr_cookiep = tr_list;

	return (err);
}
868
869 /*
870  * Clear a temporary reservation that we previously made with
871  * dsl_dir_tempreserve_space().
872  */
873 void
874 dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
875 {
876         int txgidx = tx->tx_txg & TXG_MASK;
877         list_t *tr_list = tr_cookie;
878         struct tempreserve *tr;
879
880         ASSERT3U(tx->tx_txg, !=, 0);
881
882         if (tr_cookie == NULL)
883                 return;
884
885         while (tr = list_head(tr_list)) {
886                 if (tr->tr_dp) {
887                         dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx);
888                 } else if (tr->tr_ds) {
889                         mutex_enter(&tr->tr_ds->dd_lock);
890                         ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
891                             tr->tr_size);
892                         tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
893                         mutex_exit(&tr->tr_ds->dd_lock);
894                 } else {
895                         arc_tempreserve_clear(tr->tr_size);
896                 }
897                 list_remove(tr_list, tr);
898                 kmem_free(tr, sizeof (struct tempreserve));
899         }
900
901         kmem_free(tr_list, sizeof (list_t));
902 }
903
904 static void
905 dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
906 {
907         int64_t parent_space;
908         uint64_t est_used;
909
910         mutex_enter(&dd->dd_lock);
911         if (space > 0)
912                 dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
913
914         est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes;
915         parent_space = parent_delta(dd, est_used, space);
916         mutex_exit(&dd->dd_lock);
917
918         /* Make sure that we clean up dd_space_to* */
919         dsl_dir_dirty(dd, tx);
920
921         /* XXX this is potentially expensive and unnecessary... */
922         if (parent_space && dd->dd_parent)
923                 dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx);
924 }
925
/*
 * Call in open context when we think we're going to write/free space,
 * eg. when dirtying data.  Be conservative (ie. OK to write less than
 * this or free more than this, but don't write more or free less).
 */
void
dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
{
	/* Record the estimate pool-wide, then per-dir up the ancestry. */
	dsl_pool_willuse_space(dd->dd_pool, space, tx);
	dsl_dir_willuse_space_impl(dd, space, tx);
}
937
/*
 * Call from syncing context when we actually write/free space for this
 * dd.  Updates the on-disk used/compressed/uncompressed totals (and the
 * per-type breakdown, if enabled) and recursively rolls the change up
 * through the ancestors.
 */
void
dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
    int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
{
	int64_t accounted_delta;
	/* Callers may already hold dd_lock; take it only if they don't. */
	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(type < DD_USED_NUM);

	dsl_dir_dirty(dd, tx);

	if (needlock)
		mutex_enter(&dd->dd_lock);
	/* Portion of `used` to charge to the parent (see parent_delta()). */
	accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used);
	/* Freeing (negative deltas) must never underflow the counters. */
	ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used);
	ASSERT(compressed >= 0 ||
	    dd->dd_phys->dd_compressed_bytes >= -compressed);
	ASSERT(uncompressed >= 0 ||
	    dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
	dd->dd_phys->dd_used_bytes += used;
	dd->dd_phys->dd_uncompressed_bytes += uncompressed;
	dd->dd_phys->dd_compressed_bytes += compressed;

	if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		ASSERT(used > 0 ||
		    dd->dd_phys->dd_used_breakdown[type] >= -used);
		dd->dd_phys->dd_used_breakdown[type] += used;
#ifdef DEBUG
		/* The breakdown entries must always sum to dd_used_bytes. */
		dd_used_t t;
		uint64_t u = 0;
		for (t = 0; t < DD_USED_NUM; t++)
			u += dd->dd_phys->dd_used_breakdown[t];
		ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes);
#endif
	}
	if (needlock)
		mutex_exit(&dd->dd_lock);

	if (dd->dd_parent != NULL) {
		/* Charge the parent-visible portion as CHILD usage... */
		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
		    accounted_delta, compressed, uncompressed, tx);
		/* ...and move the remainder between CHILD_RSRV and CHILD. */
		dsl_dir_transfer_space(dd->dd_parent,
		    used - accounted_delta,
		    DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
	}
}
986
/*
 * Move `delta` bytes of this dir's accounted usage from oldtype to
 * newtype in the on-disk per-type breakdown (syncing context only).
 * Total dd_used_bytes is unchanged; a negative delta moves space the
 * other way.  No-op if delta is zero or the breakdown feature is not
 * enabled on this dir.
 */
void
dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
    dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
{
	/* Callers may already hold dd_lock; take it only if they don't. */
	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(oldtype < DD_USED_NUM);
	ASSERT(newtype < DD_USED_NUM);

	if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN))
		return;

	dsl_dir_dirty(dd, tx);
	if (needlock)
		mutex_enter(&dd->dd_lock);
	/* The source bucket must hold at least the amount being moved. */
	ASSERT(delta > 0 ?
	    dd->dd_phys->dd_used_breakdown[oldtype] >= delta :
	    dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
	ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta));
	dd->dd_phys->dd_used_breakdown[oldtype] -= delta;
	dd->dd_phys->dd_used_breakdown[newtype] += delta;
	if (needlock)
		mutex_exit(&dd->dd_lock);
}
1012
/*
 * Sync-task check function for setting the "quota" property.  Fails
 * with ENOSPC if the proposed quota would fall below the current
 * reservation, or below the space already used plus pending writes.
 */
static int
dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dsl_dir_t *dd = ds->ds_dir;
	dsl_prop_setarg_t *psa = arg2;
	int err;
	uint64_t towrite;

	/* Compute the effective value the property change would have. */
	if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
		return (err);

	/* A quota of 0 means "no quota"; always allowed. */
	if (psa->psa_effective_value == 0)
		return (0);

	mutex_enter(&dd->dd_lock);
	/*
	 * If we are doing the preliminary check in open context, and
	 * there are pending changes, then don't fail it, since the
	 * pending changes could under-estimate the amount of space to be
	 * freed up.
	 */
	towrite = dsl_dir_space_towrite(dd);
	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
	    (psa->psa_effective_value < dd->dd_phys->dd_reserved ||
	    psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) {
		err = ENOSPC;
	}
	mutex_exit(&dd->dd_lock);
	return (err);
}
1044
1045 extern dsl_syncfunc_t dsl_prop_set_sync;
1046
/*
 * Sync-task function for setting the "quota" property: persist the
 * property, record the effective value in the on-disk dsl_dir, and
 * log the change to the pool history.
 */
static void
dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dsl_dir_t *dd = ds->ds_dir;
	dsl_prop_setarg_t *psa = arg2;
	uint64_t effective_value = psa->psa_effective_value;

	/* Write the property itself, then verify the earlier prediction. */
	dsl_prop_set_sync(ds, psa, tx);
	DSL_PROP_CHECK_PREDICTION(dd, psa);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	mutex_enter(&dd->dd_lock);
	dd->dd_phys->dd_quota = effective_value;
	mutex_exit(&dd->dd_lock);

	spa_history_log_internal(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
	    tx, "%lld dataset = %llu ",
	    (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj);
}
1068
1069 int
1070 dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
1071 {
1072         dsl_dir_t *dd;
1073         dsl_dataset_t *ds;
1074         dsl_prop_setarg_t psa;
1075         int err;
1076
1077         dsl_prop_setarg_init_uint64(&psa, "quota", source, &quota);
1078
1079         err = dsl_dataset_hold(ddname, FTAG, &ds);
1080         if (err)
1081                 return (err);
1082
1083         err = dsl_dir_open(ddname, FTAG, &dd, NULL);
1084         if (err) {
1085                 dsl_dataset_rele(ds, FTAG);
1086                 return (err);
1087         }
1088
1089         ASSERT(ds->ds_dir == dd);
1090
1091         /*
1092          * If someone removes a file, then tries to set the quota, we want to
1093          * make sure the file freeing takes effect.
1094          */
1095         txg_wait_open(dd->dd_pool, 0);
1096
1097         err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
1098             dsl_dir_set_quota_sync, ds, &psa, 0);
1099
1100         dsl_dir_close(dd, FTAG);
1101         dsl_dataset_rele(ds, FTAG);
1102         return (err);
1103 }
1104
/*
 * Sync-task check function for setting the "reservation" property.
 * Rejects an increase that would not fit in the parent's available
 * space or would exceed this dir's own quota.
 */
int
dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dsl_dir_t *dd = ds->ds_dir;
	dsl_prop_setarg_t *psa = arg2;
	uint64_t effective_value;
	uint64_t used, avail;
	int err;

	/* Compute the effective value the property change would have. */
	if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
		return (err);

	effective_value = psa->psa_effective_value;

	/*
	 * If we are doing the preliminary check in open context, the
	 * space estimates may be inaccurate.
	 */
	if (!dmu_tx_is_syncing(tx))
		return (0);

	mutex_enter(&dd->dd_lock);
	used = dd->dd_phys->dd_used_bytes;
	mutex_exit(&dd->dd_lock);

	if (dd->dd_parent) {
		avail = dsl_dir_space_available(dd->dd_parent,
		    NULL, 0, FALSE);
	} else {
		/* Root dir: available space comes from the pool itself. */
		avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
	}

	/*
	 * Only the part of the reservation above current usage consumes
	 * extra space, hence the MAX(used, ...) on both sides.
	 */
	if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) {
		uint64_t delta = MAX(used, effective_value) -
		    MAX(used, dd->dd_phys->dd_reserved);

		if (delta > avail)
			return (ENOSPC);
		if (dd->dd_phys->dd_quota > 0 &&
		    effective_value > dd->dd_phys->dd_quota)
			return (ENOSPC);
	}

	return (0);
}
1151
/*
 * Sync-task function for setting the "reservation" property: persist
 * the property, update the on-disk reservation, and roll the change in
 * consumed reservation space up into the ancestors.
 */
static void
dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dsl_dir_t *dd = ds->ds_dir;
	dsl_prop_setarg_t *psa = arg2;
	uint64_t effective_value = psa->psa_effective_value;
	uint64_t used;
	int64_t delta;

	/* Write the property itself, then verify the earlier prediction. */
	dsl_prop_set_sync(ds, psa, tx);
	DSL_PROP_CHECK_PREDICTION(dd, psa);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	mutex_enter(&dd->dd_lock);
	used = dd->dd_phys->dd_used_bytes;
	/* Only the part of the reservation above current usage matters. */
	delta = MAX(used, effective_value) -
	    MAX(used, dd->dd_phys->dd_reserved);
	dd->dd_phys->dd_reserved = effective_value;

	if (dd->dd_parent != NULL) {
		/* Roll up this additional usage into our ancestors */
		/*
		 * NOTE(review): dd_lock is held across this call; it takes
		 * the parent's dd_lock, not ours -- confirm lock ordering.
		 */
		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
		    delta, 0, 0, tx);
	}
	mutex_exit(&dd->dd_lock);

	spa_history_log_internal(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
	    tx, "%lld dataset = %llu",
	    (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj);
}
1184
1185 int
1186 dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
1187     uint64_t reservation)
1188 {
1189         dsl_dir_t *dd;
1190         dsl_dataset_t *ds;
1191         dsl_prop_setarg_t psa;
1192         int err;
1193
1194         dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation);
1195
1196         err = dsl_dataset_hold(ddname, FTAG, &ds);
1197         if (err)
1198                 return (err);
1199
1200         err = dsl_dir_open(ddname, FTAG, &dd, NULL);
1201         if (err) {
1202                 dsl_dataset_rele(ds, FTAG);
1203                 return (err);
1204         }
1205
1206         ASSERT(ds->ds_dir == dd);
1207
1208         err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check,
1209             dsl_dir_set_reservation_sync, ds, &psa, 0);
1210
1211         dsl_dir_close(dd, FTAG);
1212         dsl_dataset_rele(ds, FTAG);
1213         return (err);
1214 }
1215
1216 static dsl_dir_t *
1217 closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
1218 {
1219         for (; ds1; ds1 = ds1->dd_parent) {
1220                 dsl_dir_t *dd;
1221                 for (dd = ds2; dd; dd = dd->dd_parent) {
1222                         if (ds1 == dd)
1223                                 return (dd);
1224                 }
1225         }
1226         return (NULL);
1227 }
1228
1229 /*
1230  * If delta is applied to dd, how much of that delta would be applied to
1231  * ancestor?  Syncing context only.
1232  */
1233 static int64_t
1234 would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
1235 {
1236         if (dd == ancestor)
1237                 return (delta);
1238
1239         mutex_enter(&dd->dd_lock);
1240         delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta);
1241         mutex_exit(&dd->dd_lock);
1242         return (would_change(dd->dd_parent, delta, ancestor));
1243 }
1244
/* Arguments passed from dsl_dir_rename() to its check/sync functions. */
struct renamearg {
	dsl_dir_t *newparent;		/* directory to move into */
	const char *mynewname;		/* new final path component */
};
1249
1250 static int
1251 dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
1252 {
1253         dsl_dir_t *dd = arg1;
1254         struct renamearg *ra = arg2;
1255         dsl_pool_t *dp = dd->dd_pool;
1256         objset_t *mos = dp->dp_meta_objset;
1257         int err;
1258         uint64_t val;
1259
1260         /*
1261          * There should only be one reference, from dmu_objset_rename().
1262          * Fleeting holds are also possible (eg, from "zfs list" getting
1263          * stats), but any that are present in open context will likely
1264          * be gone by syncing context, so only fail from syncing
1265          * context.
1266          */
1267         if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 1)
1268                 return (EBUSY);
1269
1270         /* check for existing name */
1271         err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
1272             ra->mynewname, 8, 1, &val);
1273         if (err == 0)
1274                 return (EEXIST);
1275         if (err != ENOENT)
1276                 return (err);
1277
1278         if (ra->newparent != dd->dd_parent) {
1279                 /* is there enough space? */
1280                 uint64_t myspace =
1281                     MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
1282
1283                 /* no rename into our descendant */
1284                 if (closest_common_ancestor(dd, ra->newparent) == dd)
1285                         return (EINVAL);
1286
1287                 if (err = dsl_dir_transfer_possible(dd->dd_parent,
1288                     ra->newparent, myspace))
1289                         return (err);
1290         }
1291
1292         return (0);
1293 }
1294
/*
 * Sync-task function for dsl_dir_rename(): re-charges accounted space
 * when moving between parents, updates the child ZAP entries, and
 * repoints dd at its new parent.
 */
static void
dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	/*
	 * NOTE(review): two MAXPATHLEN buffers on the kernel stack is
	 * sizable; confirm the stack budget of sync-task context.
	 */
	char oldname[MAXPATHLEN], newname[MAXPATHLEN];
	dsl_dir_t *dd = arg1;
	struct renamearg *ra = arg2;
	dsl_pool_t *dp = dd->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	int err;

	/* One hold from dmu_objset_rename(), plus at most our own. */
	ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2);

	if (ra->newparent != dd->dd_parent) {
		/* Move our accounted usage from the old parent chain... */
		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
		    -dd->dd_phys->dd_used_bytes,
		    -dd->dd_phys->dd_compressed_bytes,
		    -dd->dd_phys->dd_uncompressed_bytes, tx);
		/* ...to the new one. */
		dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD,
		    dd->dd_phys->dd_used_bytes,
		    dd->dd_phys->dd_compressed_bytes,
		    dd->dd_phys->dd_uncompressed_bytes, tx);

		if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
			/* The unused part of our reservation moves too. */
			uint64_t unused_rsrv = dd->dd_phys->dd_reserved -
			    dd->dd_phys->dd_used_bytes;

			dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
			    -unused_rsrv, 0, 0, tx);
			dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV,
			    unused_rsrv, 0, 0, tx);
		}
	}

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	/* remove from old parent zapobj */
	dsl_dir_name(dd, oldname);
	err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
	    dd->dd_myname, tx);
	ASSERT3U(err, ==, 0);

	/* Adopt the new name and reparent the in-core dsl_dir. */
	(void) strcpy(dd->dd_myname, ra->mynewname);
	dsl_dir_close(dd->dd_parent, dd);
	dd->dd_phys->dd_parent_obj = ra->newparent->dd_object;
	VERIFY(0 == dsl_dir_open_obj(dd->dd_pool,
	    ra->newparent->dd_object, NULL, dd, &dd->dd_parent));

	/* add to new parent zapobj */
	err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
	    dd->dd_myname, 8, 1, &dd->dd_object, tx);
	ASSERT3U(err, ==, 0);
	dsl_dir_name(dd, newname);
#ifdef _KERNEL
	/* Keep zvol device node names in sync with the rename. */
	zvol_rename_minors(oldname, newname);
#endif

	spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa,
	    tx, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj);
}
1354
1355 int
1356 dsl_dir_rename(dsl_dir_t *dd, const char *newname)
1357 {
1358         struct renamearg ra;
1359         int err;
1360
1361         /* new parent should exist */
1362         err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname);
1363         if (err)
1364                 return (err);
1365
1366         /* can't rename to different pool */
1367         if (dd->dd_pool != ra.newparent->dd_pool) {
1368                 err = ENXIO;
1369                 goto out;
1370         }
1371
1372         /* new name should not already exist */
1373         if (ra.mynewname == NULL) {
1374                 err = EEXIST;
1375                 goto out;
1376         }
1377
1378         err = dsl_sync_task_do(dd->dd_pool,
1379             dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);
1380
1381 out:
1382         dsl_dir_close(ra.newparent, FTAG);
1383         return (err);
1384 }
1385
1386 int
1387 dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
1388 {
1389         dsl_dir_t *ancestor;
1390         int64_t adelta;
1391         uint64_t avail;
1392
1393         ancestor = closest_common_ancestor(sdd, tdd);
1394         adelta = would_change(sdd, -space, ancestor);
1395         avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
1396         if (avail < space)
1397                 return (ENOSPC);
1398
1399         return (0);
1400 }
1401
1402 timestruc_t
1403 dsl_dir_snap_cmtime(dsl_dir_t *dd)
1404 {
1405         timestruc_t t;
1406
1407         mutex_enter(&dd->dd_lock);
1408         t = dd->dd_snap_cmtime;
1409         mutex_exit(&dd->dd_lock);
1410
1411         return (t);
1412 }
1413
1414 void
1415 dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
1416 {
1417         timestruc_t t;
1418
1419         gethrestime(&t);
1420         mutex_enter(&dd->dd_lock);
1421         dd->dd_snap_cmtime = t;
1422         mutex_exit(&dd->dd_lock);
1423 }