]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - module/zfs/dsl_dir.c
Merge branch 'kmem-rework'
[FreeBSD/FreeBSD.git] / module / zfs / dsl_dir.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2013 by Delphix. All rights reserved.
24  * Copyright (c) 2013 Martin Matuska. All rights reserved.
25  */
26
27 #include <sys/dmu.h>
28 #include <sys/dmu_objset.h>
29 #include <sys/dmu_tx.h>
30 #include <sys/dsl_dataset.h>
31 #include <sys/dsl_dir.h>
32 #include <sys/dsl_prop.h>
33 #include <sys/dsl_synctask.h>
34 #include <sys/dsl_deleg.h>
35 #include <sys/dmu_impl.h>
36 #include <sys/spa.h>
37 #include <sys/metaslab.h>
38 #include <sys/zap.h>
39 #include <sys/zio.h>
40 #include <sys/arc.h>
41 #include <sys/sunddi.h>
42 #include <sys/zvol.h>
43 #include "zfs_namecheck.h"
44
45 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
46
47 /* ARGSUSED */
48 static void
49 dsl_dir_evict(dmu_buf_t *db, void *arg)
50 {
51         dsl_dir_t *dd = arg;
52         int t;
53         ASSERTV(dsl_pool_t *dp = dd->dd_pool);
54
55         for (t = 0; t < TXG_SIZE; t++) {
56                 ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
57                 ASSERT(dd->dd_tempreserved[t] == 0);
58                 ASSERT(dd->dd_space_towrite[t] == 0);
59         }
60
61         if (dd->dd_parent)
62                 dsl_dir_rele(dd->dd_parent, dd);
63
64         spa_close(dd->dd_pool->dp_spa, dd);
65
66         /*
67          * The props callback list should have been cleaned up by
68          * objset_evict().
69          */
70         list_destroy(&dd->dd_prop_cbs);
71         mutex_destroy(&dd->dd_lock);
72         kmem_free(dd, sizeof (dsl_dir_t));
73 }
74
75 int
76 dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
77     const char *tail, void *tag, dsl_dir_t **ddp)
78 {
79         dmu_buf_t *dbuf;
80         dsl_dir_t *dd;
81         int err;
82
83         ASSERT(dsl_pool_config_held(dp));
84
85         err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
86         if (err != 0)
87                 return (err);
88         dd = dmu_buf_get_user(dbuf);
89 #ifdef ZFS_DEBUG
90         {
91                 dmu_object_info_t doi;
92                 dmu_object_info_from_db(dbuf, &doi);
93                 ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
94                 ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
95         }
96 #endif
97         if (dd == NULL) {
98                 dsl_dir_t *winner;
99
100                 dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
101                 dd->dd_object = ddobj;
102                 dd->dd_dbuf = dbuf;
103                 dd->dd_pool = dp;
104                 dd->dd_phys = dbuf->db_data;
105                 mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
106
107                 list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
108                     offsetof(dsl_prop_cb_record_t, cbr_node));
109
110                 dsl_dir_snap_cmtime_update(dd);
111
112                 if (dd->dd_phys->dd_parent_obj) {
113                         err = dsl_dir_hold_obj(dp, dd->dd_phys->dd_parent_obj,
114                             NULL, dd, &dd->dd_parent);
115                         if (err != 0)
116                                 goto errout;
117                         if (tail) {
118 #ifdef ZFS_DEBUG
119                                 uint64_t foundobj;
120
121                                 err = zap_lookup(dp->dp_meta_objset,
122                                     dd->dd_parent->dd_phys->dd_child_dir_zapobj,
123                                     tail, sizeof (foundobj), 1, &foundobj);
124                                 ASSERT(err || foundobj == ddobj);
125 #endif
126                                 (void) strcpy(dd->dd_myname, tail);
127                         } else {
128                                 err = zap_value_search(dp->dp_meta_objset,
129                                     dd->dd_parent->dd_phys->dd_child_dir_zapobj,
130                                     ddobj, 0, dd->dd_myname);
131                         }
132                         if (err != 0)
133                                 goto errout;
134                 } else {
135                         (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
136                 }
137
138                 if (dsl_dir_is_clone(dd)) {
139                         dmu_buf_t *origin_bonus;
140                         dsl_dataset_phys_t *origin_phys;
141
142                         /*
143                          * We can't open the origin dataset, because
144                          * that would require opening this dsl_dir.
145                          * Just look at its phys directly instead.
146                          */
147                         err = dmu_bonus_hold(dp->dp_meta_objset,
148                             dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
149                         if (err != 0)
150                                 goto errout;
151                         origin_phys = origin_bonus->db_data;
152                         dd->dd_origin_txg =
153                             origin_phys->ds_creation_txg;
154                         dmu_buf_rele(origin_bonus, FTAG);
155                 }
156
157                 winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
158                     dsl_dir_evict);
159                 if (winner) {
160                         if (dd->dd_parent)
161                                 dsl_dir_rele(dd->dd_parent, dd);
162                         mutex_destroy(&dd->dd_lock);
163                         kmem_free(dd, sizeof (dsl_dir_t));
164                         dd = winner;
165                 } else {
166                         spa_open_ref(dp->dp_spa, dd);
167                 }
168         }
169
170         /*
171          * The dsl_dir_t has both open-to-close and instantiate-to-evict
172          * holds on the spa.  We need the open-to-close holds because
173          * otherwise the spa_refcnt wouldn't change when we open a
174          * dir which the spa also has open, so we could incorrectly
175          * think it was OK to unload/export/destroy the pool.  We need
176          * the instantiate-to-evict hold because the dsl_dir_t has a
177          * pointer to the dd_pool, which has a pointer to the spa_t.
178          */
179         spa_open_ref(dp->dp_spa, tag);
180         ASSERT3P(dd->dd_pool, ==, dp);
181         ASSERT3U(dd->dd_object, ==, ddobj);
182         ASSERT3P(dd->dd_dbuf, ==, dbuf);
183         *ddp = dd;
184         return (0);
185
186 errout:
187         if (dd->dd_parent)
188                 dsl_dir_rele(dd->dd_parent, dd);
189         mutex_destroy(&dd->dd_lock);
190         kmem_free(dd, sizeof (dsl_dir_t));
191         dmu_buf_rele(dbuf, tag);
192         return (err);
193 }
194
195 void
196 dsl_dir_rele(dsl_dir_t *dd, void *tag)
197 {
198         dprintf_dd(dd, "%s\n", "");
199         spa_close(dd->dd_pool->dp_spa, tag);
200         dmu_buf_rele(dd->dd_dbuf, tag);
201 }
202
203 /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
204 void
205 dsl_dir_name(dsl_dir_t *dd, char *buf)
206 {
207         if (dd->dd_parent) {
208                 dsl_dir_name(dd->dd_parent, buf);
209                 (void) strcat(buf, "/");
210         } else {
211                 buf[0] = '\0';
212         }
213         if (!MUTEX_HELD(&dd->dd_lock)) {
214                 /*
215                  * recursive mutex so that we can use
216                  * dprintf_dd() with dd_lock held
217                  */
218                 mutex_enter(&dd->dd_lock);
219                 (void) strcat(buf, dd->dd_myname);
220                 mutex_exit(&dd->dd_lock);
221         } else {
222                 (void) strcat(buf, dd->dd_myname);
223         }
224 }
225
226 /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
227 int
228 dsl_dir_namelen(dsl_dir_t *dd)
229 {
230         int result = 0;
231
232         if (dd->dd_parent) {
233                 /* parent's name + 1 for the "/" */
234                 result = dsl_dir_namelen(dd->dd_parent) + 1;
235         }
236
237         if (!MUTEX_HELD(&dd->dd_lock)) {
238                 /* see dsl_dir_name */
239                 mutex_enter(&dd->dd_lock);
240                 result += strlen(dd->dd_myname);
241                 mutex_exit(&dd->dd_lock);
242         } else {
243                 result += strlen(dd->dd_myname);
244         }
245
246         return (result);
247 }
248
249 static int
250 getcomponent(const char *path, char *component, const char **nextp)
251 {
252         char *p;
253
254         if ((path == NULL) || (path[0] == '\0'))
255                 return (SET_ERROR(ENOENT));
256         /* This would be a good place to reserve some namespace... */
257         p = strpbrk(path, "/@");
258         if (p && (p[1] == '/' || p[1] == '@')) {
259                 /* two separators in a row */
260                 return (SET_ERROR(EINVAL));
261         }
262         if (p == NULL || p == path) {
263                 /*
264                  * if the first thing is an @ or /, it had better be an
265                  * @ and it had better not have any more ats or slashes,
266                  * and it had better have something after the @.
267                  */
268                 if (p != NULL &&
269                     (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
270                         return (SET_ERROR(EINVAL));
271                 if (strlen(path) >= MAXNAMELEN)
272                         return (SET_ERROR(ENAMETOOLONG));
273                 (void) strcpy(component, path);
274                 p = NULL;
275         } else if (p[0] == '/') {
276                 if (p - path >= MAXNAMELEN)
277                         return (SET_ERROR(ENAMETOOLONG));
278                 (void) strncpy(component, path, p - path);
279                 component[p - path] = '\0';
280                 p++;
281         } else if (p[0] == '@') {
282                 /*
283                  * if the next separator is an @, there better not be
284                  * any more slashes.
285                  */
286                 if (strchr(path, '/'))
287                         return (SET_ERROR(EINVAL));
288                 if (p - path >= MAXNAMELEN)
289                         return (SET_ERROR(ENAMETOOLONG));
290                 (void) strncpy(component, path, p - path);
291                 component[p - path] = '\0';
292         } else {
293                 panic("invalid p=%p", (void *)p);
294         }
295         *nextp = p;
296         return (0);
297 }
298
299 /*
300  * Return the dsl_dir_t, and possibly the last component which couldn't
301  * be found in *tail.  The name must be in the specified dsl_pool_t.  This
302  * thread must hold the dp_config_rwlock for the pool.  Returns NULL if the
303  * path is bogus, or if tail==NULL and we couldn't parse the whole name.
304  * (*tail)[0] == '@' means that the last component is a snapshot.
305  */
306 int
307 dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
308     dsl_dir_t **ddp, const char **tailp)
309 {
310         char *buf;
311         const char *spaname, *next, *nextnext = NULL;
312         int err;
313         dsl_dir_t *dd;
314         uint64_t ddobj;
315
316         buf = kmem_alloc(MAXNAMELEN, KM_SLEEP);
317         err = getcomponent(name, buf, &next);
318         if (err != 0)
319                 goto error;
320
321         /* Make sure the name is in the specified pool. */
322         spaname = spa_name(dp->dp_spa);
323         if (strcmp(buf, spaname) != 0) {
324                 err = SET_ERROR(EXDEV);
325                 goto error;
326         }
327
328         ASSERT(dsl_pool_config_held(dp));
329
330         err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
331         if (err != 0) {
332                 goto error;
333         }
334
335         while (next != NULL) {
336                 dsl_dir_t *child_ds;
337                 err = getcomponent(next, buf, &nextnext);
338                 if (err != 0)
339                         break;
340                 ASSERT(next[0] != '\0');
341                 if (next[0] == '@')
342                         break;
343                 dprintf("looking up %s in obj%lld\n",
344                     buf, dd->dd_phys->dd_child_dir_zapobj);
345
346                 err = zap_lookup(dp->dp_meta_objset,
347                     dd->dd_phys->dd_child_dir_zapobj,
348                     buf, sizeof (ddobj), 1, &ddobj);
349                 if (err != 0) {
350                         if (err == ENOENT)
351                                 err = 0;
352                         break;
353                 }
354
355                 err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds);
356                 if (err != 0)
357                         break;
358                 dsl_dir_rele(dd, tag);
359                 dd = child_ds;
360                 next = nextnext;
361         }
362
363         if (err != 0) {
364                 dsl_dir_rele(dd, tag);
365                 goto error;
366         }
367
368         /*
369          * It's an error if there's more than one component left, or
370          * tailp==NULL and there's any component left.
371          */
372         if (next != NULL &&
373             (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
374                 /* bad path name */
375                 dsl_dir_rele(dd, tag);
376                 dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
377                 err = SET_ERROR(ENOENT);
378         }
379         if (tailp != NULL)
380                 *tailp = next;
381         *ddp = dd;
382 error:
383         kmem_free(buf, MAXNAMELEN);
384         return (err);
385 }
386
387 uint64_t
388 dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
389     dmu_tx_t *tx)
390 {
391         objset_t *mos = dp->dp_meta_objset;
392         uint64_t ddobj;
393         dsl_dir_phys_t *ddphys;
394         dmu_buf_t *dbuf;
395
396         ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
397             DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
398         if (pds) {
399                 VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
400                     name, sizeof (uint64_t), 1, &ddobj, tx));
401         } else {
402                 /* it's the root dir */
403                 VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
404                     DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
405         }
406         VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
407         dmu_buf_will_dirty(dbuf, tx);
408         ddphys = dbuf->db_data;
409
410         ddphys->dd_creation_time = gethrestime_sec();
411         if (pds)
412                 ddphys->dd_parent_obj = pds->dd_object;
413         ddphys->dd_props_zapobj = zap_create(mos,
414             DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
415         ddphys->dd_child_dir_zapobj = zap_create(mos,
416             DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
417         if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
418                 ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
419         dmu_buf_rele(dbuf, FTAG);
420
421         return (ddobj);
422 }
423
424 boolean_t
425 dsl_dir_is_clone(dsl_dir_t *dd)
426 {
427         return (dd->dd_phys->dd_origin_obj &&
428             (dd->dd_pool->dp_origin_snap == NULL ||
429             dd->dd_phys->dd_origin_obj !=
430             dd->dd_pool->dp_origin_snap->ds_object));
431 }
432
433 void
434 dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
435 {
436         mutex_enter(&dd->dd_lock);
437         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
438             dd->dd_phys->dd_used_bytes);
439         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota);
440         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
441             dd->dd_phys->dd_reserved);
442         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
443             dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
444             (dd->dd_phys->dd_uncompressed_bytes * 100 /
445             dd->dd_phys->dd_compressed_bytes));
446         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
447             dd->dd_phys->dd_uncompressed_bytes);
448         if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
449                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
450                     dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]);
451                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
452                     dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]);
453                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
454                     dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]);
455                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
456                     dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] +
457                     dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]);
458         }
459         mutex_exit(&dd->dd_lock);
460
461         if (dsl_dir_is_clone(dd)) {
462                 dsl_dataset_t *ds;
463                 char buf[MAXNAMELEN];
464
465                 VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
466                     dd->dd_phys->dd_origin_obj, FTAG, &ds));
467                 dsl_dataset_name(ds, buf);
468                 dsl_dataset_rele(ds, FTAG);
469                 dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
470         }
471 }
472
473 void
474 dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
475 {
476         dsl_pool_t *dp = dd->dd_pool;
477
478         ASSERT(dd->dd_phys);
479
480         if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
481                 /* up the hold count until we can be written out */
482                 dmu_buf_add_ref(dd->dd_dbuf, dd);
483         }
484 }
485
486 static int64_t
487 parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
488 {
489         uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
490         uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
491         return (new_accounted - old_accounted);
492 }
493
494 void
495 dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
496 {
497         ASSERT(dmu_tx_is_syncing(tx));
498
499         mutex_enter(&dd->dd_lock);
500         ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]);
501         dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
502             dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
503         dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
504         mutex_exit(&dd->dd_lock);
505
506         /* release the hold from dsl_dir_dirty */
507         dmu_buf_rele(dd->dd_dbuf, dd);
508 }
509
510 static uint64_t
511 dsl_dir_space_towrite(dsl_dir_t *dd)
512 {
513         uint64_t space = 0;
514         int i;
515
516         ASSERT(MUTEX_HELD(&dd->dd_lock));
517
518         for (i = 0; i < TXG_SIZE; i++) {
519                 space += dd->dd_space_towrite[i&TXG_MASK];
520                 ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
521         }
522         return (space);
523 }
524
525 /*
526  * How much space would dd have available if ancestor had delta applied
527  * to it?  If ondiskonly is set, we're only interested in what's
528  * on-disk, not estimated pending changes.
529  */
530 uint64_t
531 dsl_dir_space_available(dsl_dir_t *dd,
532     dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
533 {
534         uint64_t parentspace, myspace, quota, used;
535
536         /*
537          * If there are no restrictions otherwise, assume we have
538          * unlimited space available.
539          */
540         quota = UINT64_MAX;
541         parentspace = UINT64_MAX;
542
543         if (dd->dd_parent != NULL) {
544                 parentspace = dsl_dir_space_available(dd->dd_parent,
545                     ancestor, delta, ondiskonly);
546         }
547
548         mutex_enter(&dd->dd_lock);
549         if (dd->dd_phys->dd_quota != 0)
550                 quota = dd->dd_phys->dd_quota;
551         used = dd->dd_phys->dd_used_bytes;
552         if (!ondiskonly)
553                 used += dsl_dir_space_towrite(dd);
554
555         if (dd->dd_parent == NULL) {
556                 uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
557                 quota = MIN(quota, poolsize);
558         }
559
560         if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
561                 /*
562                  * We have some space reserved, in addition to what our
563                  * parent gave us.
564                  */
565                 parentspace += dd->dd_phys->dd_reserved - used;
566         }
567
568         if (dd == ancestor) {
569                 ASSERT(delta <= 0);
570                 ASSERT(used >= -delta);
571                 used += delta;
572                 if (parentspace != UINT64_MAX)
573                         parentspace -= delta;
574         }
575
576         if (used > quota) {
577                 /* over quota */
578                 myspace = 0;
579         } else {
580                 /*
581                  * the lesser of the space provided by our parent and
582                  * the space left in our quota
583                  */
584                 myspace = MIN(parentspace, quota - used);
585         }
586
587         mutex_exit(&dd->dd_lock);
588
589         return (myspace);
590 }
591
/*
 * One temporary reservation recorded on the list that
 * dsl_dir_tempreserve_space() hands back as its cookie and that
 * dsl_dir_tempreserve_clear() later walks to undo the reservations.
 */
struct tempreserve {
	list_node_t tr_node;	/* linkage on the reservation list */
	dsl_dir_t *tr_ds;	/* dir charged; NULL for the ARC reservation */
	uint64_t tr_size;	/* number of bytes reserved */
};
597
598 static int
599 dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
600     boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
601     dmu_tx_t *tx, boolean_t first)
602 {
603         uint64_t txg = tx->tx_txg;
604         uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
605         uint64_t deferred = 0;
606         struct tempreserve *tr;
607         int retval = EDQUOT;
608         int txgidx = txg & TXG_MASK;
609         int i;
610         uint64_t ref_rsrv = 0;
611
612         ASSERT3U(txg, !=, 0);
613         ASSERT3S(asize, >, 0);
614
615         mutex_enter(&dd->dd_lock);
616
617         /*
618          * Check against the dsl_dir's quota.  We don't add in the delta
619          * when checking for over-quota because they get one free hit.
620          */
621         est_inflight = dsl_dir_space_towrite(dd);
622         for (i = 0; i < TXG_SIZE; i++)
623                 est_inflight += dd->dd_tempreserved[i];
624         used_on_disk = dd->dd_phys->dd_used_bytes;
625
626         /*
627          * On the first iteration, fetch the dataset's used-on-disk and
628          * refreservation values. Also, if checkrefquota is set, test if
629          * allocating this space would exceed the dataset's refquota.
630          */
631         if (first && tx->tx_objset) {
632                 int error;
633                 dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
634
635                 error = dsl_dataset_check_quota(ds, checkrefquota,
636                     asize, est_inflight, &used_on_disk, &ref_rsrv);
637                 if (error) {
638                         mutex_exit(&dd->dd_lock);
639                         DMU_TX_STAT_BUMP(dmu_tx_quota);
640                         return (error);
641                 }
642         }
643
644         /*
645          * If this transaction will result in a net free of space,
646          * we want to let it through.
647          */
648         if (ignorequota || netfree || dd->dd_phys->dd_quota == 0)
649                 quota = UINT64_MAX;
650         else
651                 quota = dd->dd_phys->dd_quota;
652
653         /*
654          * Adjust the quota against the actual pool size at the root
655          * minus any outstanding deferred frees.
656          * To ensure that it's possible to remove files from a full
657          * pool without inducing transient overcommits, we throttle
658          * netfree transactions against a quota that is slightly larger,
659          * but still within the pool's allocation slop.  In cases where
660          * we're very close to full, this will allow a steady trickle of
661          * removes to get through.
662          */
663         if (dd->dd_parent == NULL) {
664                 spa_t *spa = dd->dd_pool->dp_spa;
665                 uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
666                 deferred = metaslab_class_get_deferred(spa_normal_class(spa));
667                 if (poolsize - deferred < quota) {
668                         quota = poolsize - deferred;
669                         retval = ENOSPC;
670                 }
671         }
672
673         /*
674          * If they are requesting more space, and our current estimate
675          * is over quota, they get to try again unless the actual
676          * on-disk is over quota and there are no pending changes (which
677          * may free up space for us).
678          */
679         if (used_on_disk + est_inflight >= quota) {
680                 if (est_inflight > 0 || used_on_disk < quota ||
681                     (retval == ENOSPC && used_on_disk < quota + deferred))
682                         retval = ERESTART;
683                 dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
684                     "quota=%lluK tr=%lluK err=%d\n",
685                     used_on_disk>>10, est_inflight>>10,
686                     quota>>10, asize>>10, retval);
687                 mutex_exit(&dd->dd_lock);
688                 DMU_TX_STAT_BUMP(dmu_tx_quota);
689                 return (SET_ERROR(retval));
690         }
691
692         /* We need to up our estimated delta before dropping dd_lock */
693         dd->dd_tempreserved[txgidx] += asize;
694
695         parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
696             asize - ref_rsrv);
697         mutex_exit(&dd->dd_lock);
698
699         tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
700         tr->tr_ds = dd;
701         tr->tr_size = asize;
702         list_insert_tail(tr_list, tr);
703
704         /* see if it's OK with our parent */
705         if (dd->dd_parent && parent_rsrv) {
706                 boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
707
708                 return (dsl_dir_tempreserve_impl(dd->dd_parent,
709                     parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
710         } else {
711                 return (0);
712         }
713 }
714
715 /*
716  * Reserve space in this dsl_dir, to be used in this tx's txg.
717  * After the space has been dirtied (and dsl_dir_willuse_space()
718  * has been called), the reservation should be canceled, using
719  * dsl_dir_tempreserve_clear().
720  */
721 int
722 dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
723     uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx)
724 {
725         int err;
726         list_t *tr_list;
727
728         if (asize == 0) {
729                 *tr_cookiep = NULL;
730                 return (0);
731         }
732
733         tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
734         list_create(tr_list, sizeof (struct tempreserve),
735             offsetof(struct tempreserve, tr_node));
736         ASSERT3S(asize, >, 0);
737         ASSERT3S(fsize, >=, 0);
738
739         err = arc_tempreserve_space(lsize, tx->tx_txg);
740         if (err == 0) {
741                 struct tempreserve *tr;
742
743                 tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
744                 tr->tr_size = lsize;
745                 list_insert_tail(tr_list, tr);
746         } else {
747                 if (err == EAGAIN) {
748                         /*
749                          * If arc_memory_throttle() detected that pageout
750                          * is running and we are low on memory, we delay new
751                          * non-pageout transactions to give pageout an
752                          * advantage.
753                          *
754                          * It is unfortunate to be delaying while the caller's
755                          * locks are held.
756                          */
757                         txg_delay(dd->dd_pool, tx->tx_txg,
758                             MSEC2NSEC(10), MSEC2NSEC(10));
759                         err = SET_ERROR(ERESTART);
760                 }
761         }
762
763         if (err == 0) {
764                 err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
765                     FALSE, asize > usize, tr_list, tx, TRUE);
766         }
767
768         if (err != 0)
769                 dsl_dir_tempreserve_clear(tr_list, tx);
770         else
771                 *tr_cookiep = tr_list;
772
773         return (err);
774 }
775
776 /*
777  * Clear a temporary reservation that we previously made with
778  * dsl_dir_tempreserve_space().
779  */
780 void
781 dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
782 {
783         int txgidx = tx->tx_txg & TXG_MASK;
784         list_t *tr_list = tr_cookie;
785         struct tempreserve *tr;
786
787         ASSERT3U(tx->tx_txg, !=, 0);
788
789         if (tr_cookie == NULL)
790                 return;
791
792         while ((tr = list_head(tr_list)) != NULL) {
793                 if (tr->tr_ds) {
794                         mutex_enter(&tr->tr_ds->dd_lock);
795                         ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
796                             tr->tr_size);
797                         tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
798                         mutex_exit(&tr->tr_ds->dd_lock);
799                 } else {
800                         arc_tempreserve_clear(tr->tr_size);
801                 }
802                 list_remove(tr_list, tr);
803                 kmem_free(tr, sizeof (struct tempreserve));
804         }
805
806         kmem_free(tr_list, sizeof (list_t));
807 }
808
809 /*
810  * This should be called from open context when we think we're going to write
811  * or free space, for example when dirtying data. Be conservative; it's okay
812  * to write less space or free more, but we don't want to write more or free
813  * less than the amount specified.
814  *
815  * NOTE: The behavior of this function is identical to the Illumos / FreeBSD
816  * version however it has been adjusted to use an iterative rather then
817  * recursive algorithm to minimize stack usage.
818  */
819 void
820 dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
821 {
822         int64_t parent_space;
823         uint64_t est_used;
824
825         do {
826                 mutex_enter(&dd->dd_lock);
827                 if (space > 0)
828                         dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
829
830                 est_used = dsl_dir_space_towrite(dd) +
831                     dd->dd_phys->dd_used_bytes;
832                 parent_space = parent_delta(dd, est_used, space);
833                 mutex_exit(&dd->dd_lock);
834
835                 /* Make sure that we clean up dd_space_to* */
836                 dsl_dir_dirty(dd, tx);
837
838                 dd = dd->dd_parent;
839                 space = parent_space;
840         } while (space && dd);
841 }
842
843 /* call from syncing context when we actually write/free space for this dd */
844 void
845 dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
846     int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
847 {
848         int64_t accounted_delta;
849
850         /*
851          * dsl_dataset_set_refreservation_sync_impl() calls this with
852          * dd_lock held, so that it can atomically update
853          * ds->ds_reserved and the dsl_dir accounting, so that
854          * dsl_dataset_check_quota() can see dataset and dir accounting
855          * consistently.
856          */
857         boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
858
859         ASSERT(dmu_tx_is_syncing(tx));
860         ASSERT(type < DD_USED_NUM);
861
862         dmu_buf_will_dirty(dd->dd_dbuf, tx);
863
864         if (needlock)
865                 mutex_enter(&dd->dd_lock);
866         accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used);
867         ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used);
868         ASSERT(compressed >= 0 ||
869             dd->dd_phys->dd_compressed_bytes >= -compressed);
870         ASSERT(uncompressed >= 0 ||
871             dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
872         dd->dd_phys->dd_used_bytes += used;
873         dd->dd_phys->dd_uncompressed_bytes += uncompressed;
874         dd->dd_phys->dd_compressed_bytes += compressed;
875
876         if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
877                 ASSERT(used > 0 ||
878                     dd->dd_phys->dd_used_breakdown[type] >= -used);
879                 dd->dd_phys->dd_used_breakdown[type] += used;
880 #ifdef DEBUG
881                 {
882                         dd_used_t t;
883                         uint64_t u = 0;
884                         for (t = 0; t < DD_USED_NUM; t++)
885                                 u += dd->dd_phys->dd_used_breakdown[t];
886                         ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes);
887                 }
888 #endif
889         }
890         if (needlock)
891                 mutex_exit(&dd->dd_lock);
892
893         if (dd->dd_parent != NULL) {
894                 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
895                     accounted_delta, compressed, uncompressed, tx);
896                 dsl_dir_transfer_space(dd->dd_parent,
897                     used - accounted_delta,
898                     DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
899         }
900 }
901
902 void
903 dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
904     dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
905 {
906         ASSERT(dmu_tx_is_syncing(tx));
907         ASSERT(oldtype < DD_USED_NUM);
908         ASSERT(newtype < DD_USED_NUM);
909
910         if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN))
911                 return;
912
913         dmu_buf_will_dirty(dd->dd_dbuf, tx);
914         mutex_enter(&dd->dd_lock);
915         ASSERT(delta > 0 ?
916             dd->dd_phys->dd_used_breakdown[oldtype] >= delta :
917             dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
918         ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta));
919         dd->dd_phys->dd_used_breakdown[oldtype] -= delta;
920         dd->dd_phys->dd_used_breakdown[newtype] += delta;
921         mutex_exit(&dd->dd_lock);
922 }
923
924 typedef struct dsl_dir_set_qr_arg {
925         const char *ddsqra_name;
926         zprop_source_t ddsqra_source;
927         uint64_t ddsqra_value;
928 } dsl_dir_set_qr_arg_t;
929
930 static int
931 dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
932 {
933         dsl_dir_set_qr_arg_t *ddsqra = arg;
934         dsl_pool_t *dp = dmu_tx_pool(tx);
935         dsl_dataset_t *ds;
936         int error;
937         uint64_t towrite, newval;
938
939         error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
940         if (error != 0)
941                 return (error);
942
943         error = dsl_prop_predict(ds->ds_dir, "quota",
944             ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
945         if (error != 0) {
946                 dsl_dataset_rele(ds, FTAG);
947                 return (error);
948         }
949
950         if (newval == 0) {
951                 dsl_dataset_rele(ds, FTAG);
952                 return (0);
953         }
954
955         mutex_enter(&ds->ds_dir->dd_lock);
956         /*
957          * If we are doing the preliminary check in open context, and
958          * there are pending changes, then don't fail it, since the
959          * pending changes could under-estimate the amount of space to be
960          * freed up.
961          */
962         towrite = dsl_dir_space_towrite(ds->ds_dir);
963         if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
964             (newval < ds->ds_dir->dd_phys->dd_reserved ||
965             newval < ds->ds_dir->dd_phys->dd_used_bytes + towrite)) {
966                 error = SET_ERROR(ENOSPC);
967         }
968         mutex_exit(&ds->ds_dir->dd_lock);
969         dsl_dataset_rele(ds, FTAG);
970         return (error);
971 }
972
973 static void
974 dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
975 {
976         dsl_dir_set_qr_arg_t *ddsqra = arg;
977         dsl_pool_t *dp = dmu_tx_pool(tx);
978         dsl_dataset_t *ds;
979         uint64_t newval;
980
981         VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
982
983         if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
984                 dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
985                     ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
986                     &ddsqra->ddsqra_value, tx);
987
988                 VERIFY0(dsl_prop_get_int_ds(ds,
989                     zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
990         } else {
991                 newval = ddsqra->ddsqra_value;
992                 spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
993                     zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
994         }
995
996         dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
997         mutex_enter(&ds->ds_dir->dd_lock);
998         ds->ds_dir->dd_phys->dd_quota = newval;
999         mutex_exit(&ds->ds_dir->dd_lock);
1000         dsl_dataset_rele(ds, FTAG);
1001 }
1002
1003 int
1004 dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
1005 {
1006         dsl_dir_set_qr_arg_t ddsqra;
1007
1008         ddsqra.ddsqra_name = ddname;
1009         ddsqra.ddsqra_source = source;
1010         ddsqra.ddsqra_value = quota;
1011
1012         return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
1013             dsl_dir_set_quota_sync, &ddsqra, 0));
1014 }
1015
1016 int
1017 dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
1018 {
1019         dsl_dir_set_qr_arg_t *ddsqra = arg;
1020         dsl_pool_t *dp = dmu_tx_pool(tx);
1021         dsl_dataset_t *ds;
1022         dsl_dir_t *dd;
1023         uint64_t newval, used, avail;
1024         int error;
1025
1026         error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
1027         if (error != 0)
1028                 return (error);
1029         dd = ds->ds_dir;
1030
1031         /*
1032          * If we are doing the preliminary check in open context, the
1033          * space estimates may be inaccurate.
1034          */
1035         if (!dmu_tx_is_syncing(tx)) {
1036                 dsl_dataset_rele(ds, FTAG);
1037                 return (0);
1038         }
1039
1040         error = dsl_prop_predict(ds->ds_dir,
1041             zfs_prop_to_name(ZFS_PROP_RESERVATION),
1042             ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
1043         if (error != 0) {
1044                 dsl_dataset_rele(ds, FTAG);
1045                 return (error);
1046         }
1047
1048         mutex_enter(&dd->dd_lock);
1049         used = dd->dd_phys->dd_used_bytes;
1050         mutex_exit(&dd->dd_lock);
1051
1052         if (dd->dd_parent) {
1053                 avail = dsl_dir_space_available(dd->dd_parent,
1054                     NULL, 0, FALSE);
1055         } else {
1056                 avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
1057         }
1058
1059         if (MAX(used, newval) > MAX(used, dd->dd_phys->dd_reserved)) {
1060                 uint64_t delta = MAX(used, newval) -
1061                     MAX(used, dd->dd_phys->dd_reserved);
1062
1063                 if (delta > avail ||
1064                     (dd->dd_phys->dd_quota > 0 &&
1065                     newval > dd->dd_phys->dd_quota))
1066                         error = SET_ERROR(ENOSPC);
1067         }
1068
1069         dsl_dataset_rele(ds, FTAG);
1070         return (error);
1071 }
1072
1073 void
1074 dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
1075 {
1076         uint64_t used;
1077         int64_t delta;
1078
1079         dmu_buf_will_dirty(dd->dd_dbuf, tx);
1080
1081         mutex_enter(&dd->dd_lock);
1082         used = dd->dd_phys->dd_used_bytes;
1083         delta = MAX(used, value) - MAX(used, dd->dd_phys->dd_reserved);
1084         dd->dd_phys->dd_reserved = value;
1085
1086         if (dd->dd_parent != NULL) {
1087                 /* Roll up this additional usage into our ancestors */
1088                 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1089                     delta, 0, 0, tx);
1090         }
1091         mutex_exit(&dd->dd_lock);
1092 }
1093
1094 static void
1095 dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
1096 {
1097         dsl_dir_set_qr_arg_t *ddsqra = arg;
1098         dsl_pool_t *dp = dmu_tx_pool(tx);
1099         dsl_dataset_t *ds;
1100         uint64_t newval;
1101
1102         VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
1103
1104         if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
1105                 dsl_prop_set_sync_impl(ds,
1106                     zfs_prop_to_name(ZFS_PROP_RESERVATION),
1107                     ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
1108                     &ddsqra->ddsqra_value, tx);
1109
1110                 VERIFY0(dsl_prop_get_int_ds(ds,
1111                     zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
1112         } else {
1113                 newval = ddsqra->ddsqra_value;
1114                 spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
1115                     zfs_prop_to_name(ZFS_PROP_RESERVATION),
1116                     (longlong_t)newval);
1117         }
1118
1119         dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
1120         dsl_dataset_rele(ds, FTAG);
1121 }
1122
1123 int
1124 dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
1125     uint64_t reservation)
1126 {
1127         dsl_dir_set_qr_arg_t ddsqra;
1128
1129         ddsqra.ddsqra_name = ddname;
1130         ddsqra.ddsqra_source = source;
1131         ddsqra.ddsqra_value = reservation;
1132
1133         return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
1134             dsl_dir_set_reservation_sync, &ddsqra, 0));
1135 }
1136
1137 static dsl_dir_t *
1138 closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
1139 {
1140         for (; ds1; ds1 = ds1->dd_parent) {
1141                 dsl_dir_t *dd;
1142                 for (dd = ds2; dd; dd = dd->dd_parent) {
1143                         if (ds1 == dd)
1144                                 return (dd);
1145                 }
1146         }
1147         return (NULL);
1148 }
1149
1150 /*
1151  * If delta is applied to dd, how much of that delta would be applied to
1152  * ancestor?  Syncing context only.
1153  */
1154 static int64_t
1155 would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
1156 {
1157         if (dd == ancestor)
1158                 return (delta);
1159
1160         mutex_enter(&dd->dd_lock);
1161         delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta);
1162         mutex_exit(&dd->dd_lock);
1163         return (would_change(dd->dd_parent, delta, ancestor));
1164 }
1165
/*
 * Argument bundle handed to the rename check/sync callbacks via
 * dsl_sync_task().
 */
typedef struct dsl_dir_rename_arg {
        /* current name of the dir being renamed */
        const char *ddra_oldname;
        /* requested new name */
        const char *ddra_newname;
} dsl_dir_rename_arg_t;
1170
1171 /* ARGSUSED */
1172 static int
1173 dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
1174 {
1175         int *deltap = arg;
1176         char namebuf[MAXNAMELEN];
1177
1178         dsl_dataset_name(ds, namebuf);
1179
1180         if (strlen(namebuf) + *deltap >= MAXNAMELEN)
1181                 return (SET_ERROR(ENAMETOOLONG));
1182         return (0);
1183 }
1184
1185 static int
1186 dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
1187 {
1188         dsl_dir_rename_arg_t *ddra = arg;
1189         dsl_pool_t *dp = dmu_tx_pool(tx);
1190         dsl_dir_t *dd, *newparent;
1191         const char *mynewname;
1192         int error;
1193         int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname);
1194
1195         /* target dir should exist */
1196         error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
1197         if (error != 0)
1198                 return (error);
1199
1200         /* new parent should exist */
1201         error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
1202             &newparent, &mynewname);
1203         if (error != 0) {
1204                 dsl_dir_rele(dd, FTAG);
1205                 return (error);
1206         }
1207
1208         /* can't rename to different pool */
1209         if (dd->dd_pool != newparent->dd_pool) {
1210                 dsl_dir_rele(newparent, FTAG);
1211                 dsl_dir_rele(dd, FTAG);
1212                 return (SET_ERROR(EXDEV));
1213         }
1214
1215         /* new name should not already exist */
1216         if (mynewname == NULL) {
1217                 dsl_dir_rele(newparent, FTAG);
1218                 dsl_dir_rele(dd, FTAG);
1219                 return (SET_ERROR(EEXIST));
1220         }
1221
1222         /* if the name length is growing, validate child name lengths */
1223         if (delta > 0) {
1224                 error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
1225                     &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
1226                 if (error != 0) {
1227                         dsl_dir_rele(newparent, FTAG);
1228                         dsl_dir_rele(dd, FTAG);
1229                         return (error);
1230                 }
1231         }
1232
1233         if (newparent != dd->dd_parent) {
1234                 /* is there enough space? */
1235                 uint64_t myspace =
1236                     MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
1237
1238                 /* no rename into our descendant */
1239                 if (closest_common_ancestor(dd, newparent) == dd) {
1240                         dsl_dir_rele(newparent, FTAG);
1241                         dsl_dir_rele(dd, FTAG);
1242                         return (SET_ERROR(EINVAL));
1243                 }
1244
1245                 error = dsl_dir_transfer_possible(dd->dd_parent,
1246                     newparent, myspace);
1247                 if (error != 0) {
1248                         dsl_dir_rele(newparent, FTAG);
1249                         dsl_dir_rele(dd, FTAG);
1250                         return (error);
1251                 }
1252         }
1253
1254         dsl_dir_rele(newparent, FTAG);
1255         dsl_dir_rele(dd, FTAG);
1256         return (0);
1257 }
1258
1259 static void
1260 dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
1261 {
1262         dsl_dir_rename_arg_t *ddra = arg;
1263         dsl_pool_t *dp = dmu_tx_pool(tx);
1264         dsl_dir_t *dd, *newparent;
1265         const char *mynewname;
1266         int error;
1267         objset_t *mos = dp->dp_meta_objset;
1268
1269         VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
1270         VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
1271             &mynewname));
1272
1273         /* Log this before we change the name. */
1274         spa_history_log_internal_dd(dd, "rename", tx,
1275             "-> %s", ddra->ddra_newname);
1276
1277         if (newparent != dd->dd_parent) {
1278                 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
1279                     -dd->dd_phys->dd_used_bytes,
1280                     -dd->dd_phys->dd_compressed_bytes,
1281                     -dd->dd_phys->dd_uncompressed_bytes, tx);
1282                 dsl_dir_diduse_space(newparent, DD_USED_CHILD,
1283                     dd->dd_phys->dd_used_bytes,
1284                     dd->dd_phys->dd_compressed_bytes,
1285                     dd->dd_phys->dd_uncompressed_bytes, tx);
1286
1287                 if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
1288                         uint64_t unused_rsrv = dd->dd_phys->dd_reserved -
1289                             dd->dd_phys->dd_used_bytes;
1290
1291                         dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1292                             -unused_rsrv, 0, 0, tx);
1293                         dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
1294                             unused_rsrv, 0, 0, tx);
1295                 }
1296         }
1297
1298         dmu_buf_will_dirty(dd->dd_dbuf, tx);
1299
1300         /* remove from old parent zapobj */
1301         error = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
1302             dd->dd_myname, tx);
1303         ASSERT0(error);
1304
1305         (void) strcpy(dd->dd_myname, mynewname);
1306         dsl_dir_rele(dd->dd_parent, dd);
1307         dd->dd_phys->dd_parent_obj = newparent->dd_object;
1308         VERIFY0(dsl_dir_hold_obj(dp,
1309             newparent->dd_object, NULL, dd, &dd->dd_parent));
1310
1311         /* add to new parent zapobj */
1312         VERIFY0(zap_add(mos, newparent->dd_phys->dd_child_dir_zapobj,
1313             dd->dd_myname, 8, 1, &dd->dd_object, tx));
1314
1315 #ifdef _KERNEL
1316         zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname);
1317 #endif
1318
1319         dsl_prop_notify_all(dd);
1320
1321         dsl_dir_rele(newparent, FTAG);
1322         dsl_dir_rele(dd, FTAG);
1323 }
1324
1325 int
1326 dsl_dir_rename(const char *oldname, const char *newname)
1327 {
1328         dsl_dir_rename_arg_t ddra;
1329
1330         ddra.ddra_oldname = oldname;
1331         ddra.ddra_newname = newname;
1332
1333         return (dsl_sync_task(oldname,
1334             dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3));
1335 }
1336
1337 int
1338 dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
1339 {
1340         dsl_dir_t *ancestor;
1341         int64_t adelta;
1342         uint64_t avail;
1343
1344         ancestor = closest_common_ancestor(sdd, tdd);
1345         adelta = would_change(sdd, -space, ancestor);
1346         avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
1347         if (avail < space)
1348                 return (SET_ERROR(ENOSPC));
1349
1350         return (0);
1351 }
1352
1353 timestruc_t
1354 dsl_dir_snap_cmtime(dsl_dir_t *dd)
1355 {
1356         timestruc_t t;
1357
1358         mutex_enter(&dd->dd_lock);
1359         t = dd->dd_snap_cmtime;
1360         mutex_exit(&dd->dd_lock);
1361
1362         return (t);
1363 }
1364
1365 void
1366 dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
1367 {
1368         timestruc_t t;
1369
1370         gethrestime(&t);
1371         mutex_enter(&dd->dd_lock);
1372         dd->dd_snap_cmtime = t;
1373         mutex_exit(&dd->dd_lock);
1374 }
1375
1376 void
1377 dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
1378 {
1379         objset_t *mos = dd->dd_pool->dp_meta_objset;
1380         dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
1381 }
1382
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Symbols exported from this file when built as a kernel module with SPL. */
EXPORT_SYMBOL(dsl_dir_set_reservation);
EXPORT_SYMBOL(dsl_dir_set_quota);
#endif /* _KERNEL && HAVE_SPL */