/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dbuf.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>

/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
 * grab 128 slots, which is 4 blocks worth.  This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
int dmu_object_alloc_chunk_shift = 7;
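
/*
 * For example (illustrative only): with the default shift of 7 each
 * allocator claims 2^7 = 128 slots, and at 32 dnode slots per 16K dnode
 * block that is the "4 blocks worth" noted above.  A larger shift means
 * fewer visits to the global allocator (and os_obj_lock), at the cost of
 * reserving longer runs of object numbers per CPU.
 */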

static uint64_t
dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	kpreempt_disable();
	cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
	    os->os_obj_next_percpu_len];
	kpreempt_enable();

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}
74 * The "chunk" of dnodes that is assigned to a CPU-specific
75 * allocator needs to be at least one block's worth, to avoid
76 * lock contention on the dbuf. It can be at most one L1 block's
77 * worth, so that the "rescan after polishing off a L1's worth"
78 * logic below will be sure to kick in.
80 if (dnodes_per_chunk < DNODES_PER_BLOCK)
81 dnodes_per_chunk = DNODES_PER_BLOCK;
82 if (dnodes_per_chunk > L1_dnode_count)
83 dnodes_per_chunk = L1_dnode_count;

	/*
	 * The caller requested the dnode be returned as a performance
	 * optimization in order to avoid releasing the hold only to
	 * immediately reacquire it.  Since the caller is responsible
	 * for releasing the hold they must provide the tag.
	 */
	if (allocated_dnode != NULL) {
		ASSERT3P(tag, !=, NULL);
	} else {
		ASSERT3P(tag, ==, NULL);
		tag = FTAG;
	}

	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off an L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full).  Look
			 * from the beginning at most once per txg.  If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty.  This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts.  In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_rescan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects.  Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;

				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}

				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find an L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us.  The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;
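
		/*
		 * For example (illustrative numbers only): if *cpuobj was
		 * 192 and dn_slots is 2, this thread is assigned object 192
		 * and *cpuobj becomes 194 for the next allocation.
		 */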

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, tag, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize,
				    indirect_blockshift, bonustype,
				    bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);

				/*
				 * Caller requested the allocated dnode be
				 * returned and is responsible for the hold.
				 */
				if (allocated_dnode != NULL)
					*allocated_dnode = dn;
				else
					dnode_rele(dn, tag);

				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, tag);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error.  This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}

uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, 0, NULL, NULL, tx));
}
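
/*
 * Example usage (illustrative sketch; the object type, blocksize and bonus
 * values are placeholders): a caller typically wraps the allocation in a
 * transaction that holds a bonus buffer for the new object:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 *	if ((err = dmu_tx_assign(tx, TXG_WAIT)) != 0) {
 *		dmu_tx_abort(tx);
 *		return (err);
 *	}
 *	obj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
 *	    DMU_OT_NONE, 0, tx);
 *	dmu_tx_commit(tx);
 */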

uint64_t
dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, 0, NULL, NULL, tx));
}

uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, dnodesize, NULL, NULL, tx));
}

/*
 * Allocate a new object and return a pointer to the newly allocated dnode
 * via the allocated_dnode argument.  The returned dnode will be held and
 * the caller is responsible for releasing the hold by calling dnode_rele().
 */
uint64_t
dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx));
}
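
/*
 * Example usage (illustrative sketch; type and sizes are placeholders):
 *
 *	dnode_t *dn;
 *	uint64_t obj = dmu_object_alloc_hold(os, DMU_OT_UINT64_OTHER, 0, 0,
 *	    DMU_OT_NONE, 0, 0, &dn, FTAG, tx);
 *	... use dn directly, then drop the hold ...
 *	dnode_rele(dn, FTAG);
 */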

int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}
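
/*
 * Unlike dmu_object_alloc(), the claim interfaces let the caller pick the
 * object number, e.g. when re-instantiating an object received from a send
 * stream.  Illustrative sketch (object number and types are placeholders):
 *
 *	err = dmu_object_claim(os, 123, DMU_OT_PLAIN_FILE_CONTENTS, 0,
 *	    DMU_OT_NONE, 0, tx);
 *
 * An error is returned if the requested dnode slots are not actually free.
 */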

int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, DNODE_MIN_SIZE, B_FALSE, tx));
}

int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    boolean_t keep_spill, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;

	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
	    keep_spill, tx);

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		dbuf_rm_spill(dn, tx);
		dnode_rm_spill(dn, tx);
	}
	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	/*
	 * If we don't create this free range, we'll leak indirect blocks when
	 * we get to freeing the dnode in syncing context.
	 */
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}

/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *object, taking into account only objects that may have been modified
 * after the specified txg.
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && dsl_dataset_feature_is_active(ds,
	    SPA_FEATURE_LARGE_DNODE)) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block.  The contents
		 * of each slot in the block are known so it can be quickly
		 * checked.  If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				i++;
			} else if (error == 0) {
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}
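
/*
 * Example usage (illustrative sketch): walking every allocated object in an
 * objset, starting from object 0:
 *
 *	uint64_t obj = 0;
 *	while (dmu_object_next(os, &obj, B_FALSE, 0) == 0) {
 *		... examine object "obj" ...
 *	}
 *
 * Passing hole == B_TRUE searches for free object numbers instead, and a
 * nonzero txg restricts the search to objects modified after that txg.
 */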

/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);

	/*
	 * We must initialize the ZAP data before changing the type,
	 * so that concurrent calls to *_is_zapified() can determine if
	 * the object has been completely zapified by checking the type.
	 */
	mzap_create_impl(dn, 0, 0, tx);

	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}
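
/*
 * Illustrative sketch (names and types are placeholders): once zapified, the
 * MOS object can be used with the regular ZAP interfaces, e.g.:
 *
 *	dmu_object_zapify(mos, object, DMU_OT_DSL_DATASET, tx);
 *	VERIFY0(zap_add(mos, object, "example_prop", sizeof (uint64_t), 1,
 *	    &value, tx));
 */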

void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}

EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_ibs);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_alloc_hold);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_rm_spill);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, INT, ZMOD_RW,
	"CPU-specific allocator grabs 2^N objects at once");