/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>
37 dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
38 int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
39 int dnodesize, dmu_tx_t *tx)
42 uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
43 (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
45 int dn_slots = dnodesize >> DNODE_SHIFT;
46 boolean_t restarted = B_FALSE;
49 dn_slots = DNODE_MIN_SLOTS;
51 ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
52 ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
55 mutex_enter(&os->os_obj_lock);
57 object = os->os_obj_next;
59 * Each time we polish off a L1 bp worth of dnodes (2^12
60 * objects), move to another L1 bp that's still
61 * reasonably sparse (at most 1/4 full). Look from the
62 * beginning at most once per txg. If we still can't
63 * allocate from that L1 block, search for an empty L0
64 * block, which will quickly skip to the end of the
65 * metadnode if the no nearby L0 blocks are empty. This
66 * fallback avoids a pathology where full dnode blocks
67 * containing large dnodes appear sparse because they
68 * have a low blk_fill, leading to many failed
69 * allocation attempts. In the long term a better
70 * mechanism to search for sparse metadnode regions,
71 * such as spacemaps, could be implemented.
73 * os_scan_dnodes is set during txg sync if enough objects
74 * have been freed since the previous rescan to justify
77 * Note that dmu_traverse depends on the behavior that we use
78 * multiple blocks of the dnode object before going back to
79 * reuse objects. Any change to this algorithm should preserve
80 * that property or find another solution to the issues
81 * described in traverse_visitbp.
83 if (P2PHASE(object, L1_dnode_count) == 0) {
88 if (os->os_rescan_dnodes) {
90 os->os_rescan_dnodes = B_FALSE;
92 offset = object << DNODE_SHIFT;
94 blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
95 minlvl = restarted ? 1 : 2;
97 error = dnode_next_offset(DMU_META_DNODE(os),
98 DNODE_FIND_HOLE, &offset, minlvl, blkfill, 0);
100 object = offset >> DNODE_SHIFT;
102 os->os_obj_next = object + dn_slots;
105 * XXX We should check for an i/o error here and return
106 * up to our caller. Actually we should pre-read it in
107 * dmu_tx_assign(), but there is currently no mechanism
110 (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
115 if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
116 os->os_obj_next = object;
119 * Skip to next known valid starting point for a dnode.
121 os->os_obj_next = P2ROUNDUP(object + 1,
125 dnode_allocate(dn, ot, blocksize, indirect_blockshift,
126 bonustype, bonuslen, dn_slots, tx);
127 mutex_exit(&os->os_obj_lock);
129 dmu_tx_add_new_object(tx, dn);
130 dnode_rele(dn, FTAG);
136 dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
137 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
139 return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
144 dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
145 int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
148 return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
149 bonustype, bonuslen, 0, tx);
153 dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
154 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
156 return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
157 bonuslen, dnodesize, tx));
161 dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
162 int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
164 return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
169 dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
170 int blocksize, dmu_object_type_t bonustype, int bonuslen,
171 int dnodesize, dmu_tx_t *tx)
174 int dn_slots = dnodesize >> DNODE_SHIFT;
178 dn_slots = DNODE_MIN_SLOTS;
179 ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
180 ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
182 if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
183 return (SET_ERROR(EBADF));
185 err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
189 dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
190 dmu_tx_add_new_object(tx, dn);
192 dnode_rele(dn, FTAG);
198 dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
199 int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
201 return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
206 dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
207 int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
211 int dn_slots = dnodesize >> DNODE_SHIFT;
214 if (object == DMU_META_DNODE_OBJECT)
215 return (SET_ERROR(EBADF));
217 err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
222 dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);
224 dnode_rele(dn, FTAG);
230 dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
235 ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
237 err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
242 ASSERT(dn->dn_type != DMU_OT_NONE);
244 * If we don't create this free range, we'll leak indirect blocks when
245 * we get to freeing the dnode in syncing context.
247 dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
249 dnode_rele(dn, FTAG);
255 * Return (in *objectp) the next object which is allocated (or a hole)
256 * after *object, taking into account only objects that may have been modified
257 * after the specified txg.
260 dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
263 dmu_object_info_t doi;
264 struct dsl_dataset *ds = os->os_dsl_dataset;
269 * Avoid expensive dnode hold if this dataset doesn't use large dnodes.
271 if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
272 error = dmu_object_info(os, *objectp, &doi);
273 if (error && !(error == EINVAL && *objectp == 0))
274 return (SET_ERROR(error));
276 dnodesize = doi.doi_dnodesize;
278 dnodesize = DNODE_MIN_SIZE;
282 offset = 1 << DNODE_SHIFT;
284 offset = (*objectp << DNODE_SHIFT) + dnodesize;
286 error = dnode_next_offset(DMU_META_DNODE(os),
287 (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
289 *objectp = offset >> DNODE_SHIFT;
295 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
296 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
298 * Only for use from syncing context, on MOS objects.
301 dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
306 ASSERT(dmu_tx_is_syncing(tx));
308 VERIFY0(dnode_hold(mos, object, FTAG, &dn));
309 if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
310 dnode_rele(dn, FTAG);
313 ASSERT3U(dn->dn_type, ==, old_type);
314 ASSERT0(dn->dn_maxblkid);
317 * We must initialize the ZAP data before changing the type,
318 * so that concurrent calls to *_is_zapified() can determine if
319 * the object has been completely zapified by checking the type.
321 mzap_create_impl(mos, object, 0, 0, tx);
323 dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
324 DMU_OTN_ZAP_METADATA;
325 dnode_setdirty(dn, tx);
326 dnode_rele(dn, FTAG);
328 spa_feature_incr(dmu_objset_spa(mos),
329 SPA_FEATURE_EXTENSIBLE_DATASET, tx);
333 dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
338 ASSERT(dmu_tx_is_syncing(tx));
340 VERIFY0(dnode_hold(mos, object, FTAG, &dn));
342 dnode_rele(dn, FTAG);
344 if (t == DMU_OTN_ZAP_METADATA) {
345 spa_feature_decr(dmu_objset_spa(mos),
346 SPA_FEATURE_EXTENSIBLE_DATASET, tx);
348 VERIFY0(dmu_object_free(mos, object, tx));