From d2723c06afc44acea415dddcaf2af31caca1fcdf Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Thu, 31 Oct 2019 09:14:50 +0000 Subject: [PATCH] MFC r353176,r353304,r353556,r353559: large_dnode improvements and fixes r353176: MFV r350898, r351075: 8423 8199 7432 Implement large_dnode pool feature This updates FreeBSD large_dnode code (that was imported from ZoL) to a version that was committed to illumos. It has some cleanups, improvements and fixes comparing to what we have in FreeBSD now. I think that the most significant update is 8199 multi-threaded dmu_object_alloc(). r353304: zfs: use atomic_load_64 to read atomic variable in dmu_object_alloc_impl r353556: MFV r353551: 10452 ZoL: merge in large dnode feature fixes r353559: MFV r353558: 10572 10579 Fix race in dnode_check_slots_free() --- cddl/contrib/opensolaris/cmd/zdb/zdb.c | 43 +- cddl/contrib/opensolaris/cmd/zdb/zdb_il.c | 18 +- .../opensolaris/cmd/zstreamdump/zstreamdump.c | 10 +- cddl/contrib/opensolaris/cmd/ztest/ztest.c | 56 +- .../contrib/opensolaris/common/zfs/zfs_prop.c | 3 +- .../opensolaris/uts/common/fs/zfs/dbuf.c | 6 +- .../uts/common/fs/zfs/dmu_object.c | 267 +++++--- .../uts/common/fs/zfs/dmu_objset.c | 21 + .../opensolaris/uts/common/fs/zfs/dmu_send.c | 105 ++- .../opensolaris/uts/common/fs/zfs/dmu_tx.c | 10 +- .../opensolaris/uts/common/fs/zfs/dnode.c | 631 ++++++++++++------ .../uts/common/fs/zfs/dnode_sync.c | 2 + .../opensolaris/uts/common/fs/zfs/sa.c | 3 + .../opensolaris/uts/common/fs/zfs/spa_misc.c | 7 +- .../opensolaris/uts/common/fs/zfs/sys/dmu.h | 2 +- .../uts/common/fs/zfs/sys/dmu_impl.h | 2 +- .../uts/common/fs/zfs/sys/dmu_objset.h | 6 +- .../opensolaris/uts/common/fs/zfs/sys/dnode.h | 221 +++++- .../uts/common/fs/zfs/sys/sa_impl.h | 4 +- .../opensolaris/uts/common/fs/zfs/sys/zap.h | 2 + .../uts/common/fs/zfs/sys/zfs_context.h | 1 + .../uts/common/fs/zfs/sys/zfs_ioctl.h | 4 +- .../opensolaris/uts/common/fs/zfs/sys/zil.h | 4 +- .../opensolaris/uts/common/fs/zfs/zap.c | 4 +- .../opensolaris/uts/common/fs/zfs/zap_micro.c | 6 +- .../opensolaris/uts/common/fs/zfs/zfs_acl.c | 16 +- .../uts/common/fs/zfs/zfs_replay.c | 7 +- .../opensolaris/uts/common/fs/zfs/zfs_znode.c | 10 +- .../opensolaris/uts/common/fs/zfs/zil.c | 10 +- 29 files changed, 1050 insertions(+), 431 deletions(-) diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c index 8cf527b6690..2ff94970f26 100644 --- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c +++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c @@ -2131,7 +2131,8 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { }; static void -dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) +dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header, + uint64_t *dnode_slots_used) { dmu_buf_t *db = NULL; dmu_object_info_t doi; @@ -2151,7 +2152,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ); if (*print_header) { - (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n", + (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n", "Object", "lvl", "iblk", "dblk", "dsize", "dnsize", "lsize", "%full", "type"); *print_header = 0; @@ -2170,6 +2171,9 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) } dmu_object_info_from_dnode(dn, &doi); + if (dnode_slots_used != NULL) + *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE; + zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk)); 
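As context for the zdb changes in this hunk: dump_object() now reports how many dnode slots each object consumes, computed as doi_dnodesize / DNODE_MIN_SIZE, and dump_dir() totals those counts into the new slot statistics. A minimal user-space sketch of that accounting, assuming the standard 512-byte minimum dnode (DNODE_SHIFT == 9); the ex_/EX_ names are illustrative stand-ins, not ZFS API:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EX_DNODE_SHIFT    9    /* 512-byte minimum dnode */
#define EX_DNODE_MIN_SIZE (1ULL << EX_DNODE_SHIFT)

/* Slots consumed by one object, as dump_object() now computes it. */
static uint64_t
ex_dnode_slots_used(uint64_t doi_dnodesize)
{
    return (doi_dnodesize / EX_DNODE_MIN_SIZE);
}

int
main(void)
{
    /* A 2K dnode spans four 512-byte slots. */
    assert(ex_dnode_slots_used(2048) == 4);

    /* The summary dump_dir() prints from the per-object counts. */
    uint64_t total_slots_used = 6;  /* e.g. objects of 1, 4 and 1 slots */
    uint64_t max_slot_used = 10;    /* highest slot index seen in use */
    (void) printf("Percent empty: %lf\n",
        (double)(max_slot_used - total_slots_used) * 100 /
        (double)max_slot_used);
    return (0);
}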
zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk)); zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize)); @@ -2192,8 +2196,9 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) ZDB_COMPRESS_NAME(doi.doi_compress)); } - (void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n", - (u_longlong_t)object, doi.doi_indirection, iblk, dblk, + (void) printf("%10" PRIu64 + " %3u %5s %5s %5s %5s %5s %6s %s%s\n", + object, doi.doi_indirection, iblk, dblk, asize, dnsize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux); if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { @@ -2302,6 +2307,9 @@ dump_dir(objset_t *os) int print_header = 1; unsigned i; int error; + uint64_t total_slots_used = 0; + uint64_t max_slot_used = 0; + uint64_t dnode_slots; /* make sure nicenum has enough space */ CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ); @@ -2346,7 +2354,7 @@ dump_dir(objset_t *os) if (zopt_objects != 0) { for (i = 0; i < zopt_objects; i++) dump_object(os, zopt_object[i], verbosity, - &print_header); + &print_header, NULL); (void) printf("\n"); return; } @@ -2371,22 +2379,37 @@ dump_dir(objset_t *os) if (BP_IS_HOLE(os->os_rootbp)) return; - dump_object(os, 0, verbosity, &print_header); + dump_object(os, 0, verbosity, &print_header, NULL); object_count = 0; if (DMU_USERUSED_DNODE(os) != NULL && DMU_USERUSED_DNODE(os)->dn_type != 0) { - dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header); - dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header); + dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, + NULL); + dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header, + NULL); } object = 0; while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { - dump_object(os, object, verbosity, &print_header); + dump_object(os, object, verbosity, &print_header, &dnode_slots); object_count++; + total_slots_used += dnode_slots; + max_slot_used = object + dnode_slots - 1; } (void) printf("\n"); + (void) printf(" Dnode slots:\n"); + (void) printf("\tTotal used: %10llu\n", + (u_longlong_t)total_slots_used); + (void) printf("\tMax used: %10llu\n", + (u_longlong_t)max_slot_used); + (void) printf("\tPercent empty: %10lf\n", + (double)(max_slot_used - total_slots_used)*100 / + (double)max_slot_used); + + (void) printf("\n"); + if (error != ESRCH) { (void) fprintf(stderr, "dmu_object_next() = %d\n", error); abort(); @@ -2578,7 +2601,7 @@ dump_path_impl(objset_t *os, uint64_t obj, char *name) return (dump_path_impl(os, child_obj, s + 1)); /*FALLTHROUGH*/ case DMU_OT_PLAIN_FILE_CONTENTS: - dump_object(os, child_obj, dump_opt['v'], &header); + dump_object(os, child_obj, dump_opt['v'], &header, NULL); return (0); default: (void) fprintf(stderr, "object %llu has non-file/directory " diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c b/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c index 75b0cd91d26..9f3f23f82da 100644 --- a/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c +++ b/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c @@ -84,15 +84,15 @@ zil_prt_rec_create(zilog_t *zilog, int txtype, void *arg) } (void) printf("%s%s", tab_prefix, ctime(&crtime)); - (void) printf("%sdoid %llu, foid %llu, slots %llu, mode %llo\n", tab_prefix, - (u_longlong_t)lr->lr_doid, - (u_longlong_t)LR_FOID_GET_OBJ(lr->lr_foid), - (u_longlong_t)LR_FOID_GET_SLOTS(lr->lr_foid), - (longlong_t)lr->lr_mode); - (void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n", - tab_prefix, - (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid, - (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev); + (void) 
printf("%sdoid %" PRIu64 ", foid %" PRIu64 ", slots %" PRIu64 + ", mode %" PRIo64 "\n", + tab_prefix, lr->lr_doid, + (uint64_t)LR_FOID_GET_OBJ(lr->lr_foid), + (uint64_t)LR_FOID_GET_SLOTS(lr->lr_foid), + lr->lr_mode); + (void) printf("%suid %" PRIu64 ", gid %" PRIu64 ", gen %" PRIu64 + ", rdev %#" PRIx64 "\n", + tab_prefix, lr->lr_uid, lr->lr_gid, lr->lr_gen, lr->lr_rdev); } /* ARGSUSED */ diff --git a/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c b/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c index 54edb566ad2..51c4c8e0e64 100644 --- a/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c +++ b/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c @@ -416,13 +416,15 @@ main(int argc, char *argv[]) drro->drr_toguid = BSWAP_64(drro->drr_toguid); } if (verbose) { - (void) printf("OBJECT object = %llu type = %u " - "bonustype = %u blksz = %u bonuslen = %u\n", - (u_longlong_t)drro->drr_object, + (void) printf("OBJECT object = %" PRIu64 + " type = %u bonustype = %u blksz = %u" + " bonuslen = %u dn_slots = %u\n", + drro->drr_object, drro->drr_type, drro->drr_bonustype, drro->drr_blksz, - drro->drr_bonuslen); + drro->drr_bonuslen, + drro->drr_dn_slots); } if (drro->drr_bonuslen > 0) { (void) ssread(buf, diff --git a/cddl/contrib/opensolaris/cmd/ztest/ztest.c b/cddl/contrib/opensolaris/cmd/ztest/ztest.c index 37acf34ec36..538fd040c95 100644 --- a/cddl/contrib/opensolaris/cmd/ztest/ztest.c +++ b/cddl/contrib/opensolaris/cmd/ztest/ztest.c @@ -196,6 +196,7 @@ extern uint64_t zfs_deadman_synctime_ms; extern int metaslab_preload_limit; extern boolean_t zfs_compressed_arc_enabled; extern boolean_t zfs_abd_scatter_enabled; +extern int dmu_object_alloc_chunk_shift; extern boolean_t zfs_force_some_double_word_sm_entries; static ztest_shared_opts_t *ztest_shared_opts; @@ -322,6 +323,7 @@ static ztest_shared_callstate_t *ztest_shared_callstate; ztest_func_t ztest_dmu_read_write; ztest_func_t ztest_dmu_write_parallel; ztest_func_t ztest_dmu_object_alloc_free; +ztest_func_t ztest_dmu_object_next_chunk; ztest_func_t ztest_dmu_commit_callbacks; ztest_func_t ztest_zap; ztest_func_t ztest_zap_parallel; @@ -363,6 +365,7 @@ ztest_info_t ztest_info[] = { { ztest_dmu_read_write, 1, &zopt_always }, { ztest_dmu_write_parallel, 10, &zopt_always }, { ztest_dmu_object_alloc_free, 1, &zopt_always }, + { ztest_dmu_object_next_chunk, 1, &zopt_sometimes }, { ztest_dmu_commit_callbacks, 1, &zopt_always }, { ztest_zap, 30, &zopt_always }, { ztest_zap_parallel, 100, &zopt_always }, @@ -1366,7 +1369,7 @@ ztest_bt_bonus(dmu_buf_t *db) * it unique to the object, generation, and offset to verify that data * is not getting overwritten by data from other dnodes. 
*/ -#define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ +#define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) /* @@ -1895,6 +1898,7 @@ ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, txg, crtxg); ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); + dmu_buf_rele(db, FTAG); (void) ztest_log_setattr(zd, tx, lr); @@ -3815,8 +3819,10 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) ztest_od_t od[4]; int batchsize = sizeof (od) / sizeof (od[0]); - for (int b = 0; b < batchsize; b++) - ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0, 0); + for (int b = 0; b < batchsize; b++) { + ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, + 0, 0, 0); + } /* * Destroy the previous batch of objects, create a new batch, @@ -3830,6 +3836,26 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); } +/* + * Rewind the global allocator to verify object allocation backfilling. + */ +void +ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; + uint64_t object; + + /* + * Rewind the global allocator randomly back to a lower object number + * to force backfilling and reclamation of recently freed dnodes. + */ + mutex_enter(&os->os_obj_lock); + object = ztest_random(os->os_obj_next_chunk); + os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); + mutex_exit(&os->os_obj_lock); +} + /* * Verify that dmu_{read,write} work as expected. */ @@ -3876,8 +3902,10 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) /* * Read the directory info. If it's the first time, set things up. */ - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); - ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, chunksize); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, + chunksize); + ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, + chunksize); if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; @@ -4146,8 +4174,10 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) /* * Read the directory info. If it's the first time, set things up. */ - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); - ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, chunksize); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, + 0, 0); + ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, + chunksize); if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; @@ -4347,7 +4377,8 @@ ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) * to verify that parallel writes to an object -- even to the * same blocks within the object -- doesn't cause any trouble. 
*/ - ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); + ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, + 0, 0, 0); if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; @@ -4366,7 +4397,8 @@ ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) uint64_t blocksize = ztest_random_blocksize(); void *data; - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, + 0, 0); if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) return; @@ -4590,7 +4622,8 @@ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) char name[20], string_value[20]; void *data; - ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); + ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, + 0, 0, 0); if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; @@ -5411,7 +5444,8 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) blocksize = ztest_random_blocksize(); blocksize = MIN(blocksize, 2048); /* because we write so many */ - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, + 0, 0); if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c index 7b1474edf58..9c103831242 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c @@ -292,10 +292,11 @@ zfs_prop_init(void) ZFS_VOLMODE_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "default | geom | dev | none", "VOLMODE", volmode_table); + zprop_register_index(ZFS_PROP_DNODESIZE, "dnodesize", ZFS_DNSIZE_LEGACY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "legacy | auto | 1k | 2k | 4k | 8k | 16k", "DNSIZE", dnsize_table); - + /* inherit index (boolean) properties */ zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 9012baa0a99..dfaec47017f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -1812,6 +1812,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) FTAG); } } + + if (tx->tx_txg > dn->dn_dirty_txg) + dn->dn_dirty_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); if (db->db_blkid == DMU_SPILL_BLKID) @@ -3757,7 +3760,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) if (dn->dn_type == DMU_OT_DNODE) { i = 0; while (i < db->db.db_size) { - dnode_phys_t *dnp = db->db.db_data + i; + dnode_phys_t *dnp = + (void *)(((char *)db->db.db_data) + i); i += DNODE_MIN_SIZE; if (dnp->dn_type != DMU_OT_NONE) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c index f830076f767..b40ccf4a783 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c @@ -32,6 +32,14 @@ #include #include +/* + * Each of the concurrent object allocators will grab + * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to + * grab 128 slots, which is 4 blocks worth. 
This was experimentally + * determined to be the lowest value that eliminates the measurable effect + * of lock contention from this code path. + */ +int dmu_object_alloc_chunk_shift = 7; static uint64_t dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, @@ -44,6 +52,10 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, dnode_t *dn = NULL; int dn_slots = dnodesize >> DNODE_SHIFT; boolean_t restarted = B_FALSE; + uint64_t *cpuobj = &os->os_obj_next_percpu[CPU_SEQID % + os->os_obj_next_percpu_len]; + int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; + int error; if (dn_slots == 0) { dn_slots = DNODE_MIN_SLOTS; @@ -51,55 +63,103 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); } - - mutex_enter(&os->os_obj_lock); + + /* + * The "chunk" of dnodes that is assigned to a CPU-specific + * allocator needs to be at least one block's worth, to avoid + * lock contention on the dbuf. It can be at most one L1 block's + * worth, so that the "rescan after polishing off a L1's worth" + * logic below will be sure to kick in. + */ + if (dnodes_per_chunk < DNODES_PER_BLOCK) + dnodes_per_chunk = DNODES_PER_BLOCK; + if (dnodes_per_chunk > L1_dnode_count) + dnodes_per_chunk = L1_dnode_count; + +#ifdef __FreeBSD__ + object = atomic_load_64(cpuobj); +#else + object = *cpuobj; +#endif + for (;;) { - object = os->os_obj_next; /* - * Each time we polish off a L1 bp worth of dnodes (2^12 - * objects), move to another L1 bp that's still - * reasonably sparse (at most 1/4 full). Look from the - * beginning at most once per txg. If we still can't - * allocate from that L1 block, search for an empty L0 - * block, which will quickly skip to the end of the - * metadnode if the no nearby L0 blocks are empty. This - * fallback avoids a pathology where full dnode blocks - * containing large dnodes appear sparse because they - * have a low blk_fill, leading to many failed - * allocation attempts. In the long term a better - * mechanism to search for sparse metadnode regions, - * such as spacemaps, could be implemented. - * - * os_scan_dnodes is set during txg sync if enough objects - * have been freed since the previous rescan to justify - * backfilling again. - * - * Note that dmu_traverse depends on the behavior that we use - * multiple blocks of the dnode object before going back to - * reuse objects. Any change to this algorithm should preserve - * that property or find another solution to the issues - * described in traverse_visitbp. + * If we finished a chunk of dnodes, get a new one from + * the global allocator. */ - if (P2PHASE(object, L1_dnode_count) == 0) { - uint64_t offset; - uint64_t blkfill; - int minlvl; - int error; - if (os->os_rescan_dnodes) { - offset = 0; - os->os_rescan_dnodes = B_FALSE; - } else { - offset = object << DNODE_SHIFT; + if ((P2PHASE(object, dnodes_per_chunk) == 0) || + (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) < + dn_slots)) { + DNODE_STAT_BUMP(dnode_alloc_next_chunk); + mutex_enter(&os->os_obj_lock); + ASSERT0(P2PHASE(os->os_obj_next_chunk, + dnodes_per_chunk)); + object = os->os_obj_next_chunk; + + /* + * Each time we polish off a L1 bp worth of dnodes + * (2^12 objects), move to another L1 bp that's + * still reasonably sparse (at most 1/4 full). Look + * from the beginning at most once per txg. 
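The chunk sizing described at the top of this hunk can be exercised in isolation. Here is a sketch of the boundary arithmetic, assuming P2PHASE/P2ALIGN are the usual illumos power-of-two helpers from sys/sysmacros.h and that a 16K dnode block holds 32 minimum-size dnodes (so one L1 indirect covers 2^12 objects, per the comment above); the EX_ names are illustrative:

#include <assert.h>
#include <stdint.h>

/* Power-of-two helpers, as in illumos sys/sysmacros.h. */
#define EX_P2ALIGN(x, a)    ((x) & -(a))
#define EX_P2PHASE(x, a)    ((x) & ((a) - 1))

#define EX_DNODES_PER_BLOCK 32      /* 16K dnode block / 512B dnodes */
#define EX_L1_DNODE_COUNT   4096    /* one L1 bp worth: 2^12 objects */

int
main(void)
{
    int chunk = 1 << 7;     /* dmu_object_alloc_chunk_shift == 7 */

    /* Clamp exactly as dmu_object_alloc_impl() does. */
    if (chunk < EX_DNODES_PER_BLOCK)
        chunk = EX_DNODES_PER_BLOCK;
    if (chunk > EX_L1_DNODE_COUNT)
        chunk = EX_L1_DNODE_COUNT;
    assert(chunk == 128);

    /*
     * A 4-slot allocation at object 126 would straddle the chunk
     * boundary at 128, so the allocator must fetch a new chunk:
     * P2PHASE(126 + 4 - 1, 128) == 1, which is < 4.
     */
    uint64_t object = 126, dn_slots = 4;
    assert(EX_P2PHASE(object + dn_slots - 1, chunk) < dn_slots);

    /* The next chunk handed out starts at the aligned boundary. */
    assert(EX_P2ALIGN(object, chunk) + chunk == 128);
    return (0);
}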
If we + * still can't allocate from that L1 block, search + * for an empty L0 block, which will quickly skip + * to the end of the metadnode if the no nearby L0 + * blocks are empty. This fallback avoids a + * pathology where full dnode blocks containing + * large dnodes appear sparse because they have a + * low blk_fill, leading to many failed allocation + * attempts. In the long term a better mechanism to + * search for sparse metadnode regions, such as + * spacemaps, could be implemented. + * + * os_scan_dnodes is set during txg sync if enough + * objects have been freed since the previous + * rescan to justify backfilling again. + * + * Note that dmu_traverse depends on the behavior + * that we use multiple blocks of the dnode object + * before going back to reuse objects. Any change + * to this algorithm should preserve that property + * or find another solution to the issues described + * in traverse_visitbp. + */ + if (P2PHASE(object, L1_dnode_count) == 0) { + uint64_t offset; + uint64_t blkfill; + int minlvl; + if (os->os_rescan_dnodes) { + offset = 0; + os->os_rescan_dnodes = B_FALSE; + } else { + offset = object << DNODE_SHIFT; + } + blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2; + minlvl = restarted ? 1 : 2; + restarted = B_TRUE; + error = dnode_next_offset(DMU_META_DNODE(os), + DNODE_FIND_HOLE, &offset, minlvl, + blkfill, 0); + if (error == 0) { + object = offset >> DNODE_SHIFT; + } } - blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2; - minlvl = restarted ? 1 : 2; - restarted = B_TRUE; - error = dnode_next_offset(DMU_META_DNODE(os), - DNODE_FIND_HOLE, &offset, minlvl, blkfill, 0); - if (error == 0) - object = offset >> DNODE_SHIFT; + /* + * Note: if "restarted", we may find a L0 that + * is not suitably aligned. + */ + os->os_obj_next_chunk = + P2ALIGN(object, dnodes_per_chunk) + + dnodes_per_chunk; + (void) atomic_swap_64(cpuobj, object); + mutex_exit(&os->os_obj_lock); } - os->os_obj_next = object + dn_slots; + + /* + * The value of (*cpuobj) before adding dn_slots is the object + * ID assigned to us. The value afterwards is the object ID + * assigned to whoever wants to do an allocation next. + */ + object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots; /* * XXX We should check for an i/o error here and return @@ -107,37 +167,45 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, * dmu_tx_assign(), but there is currently no mechanism * to do so. */ - (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots, - FTAG, &dn); - if (dn) - break; - - if (dmu_object_next(os, &object, B_TRUE, 0) == 0) - os->os_obj_next = object; - else + error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, + dn_slots, FTAG, &dn); + if (error == 0) { + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); /* - * Skip to next known valid starting point for a dnode. + * Another thread could have allocated it; check + * again now that we have the struct lock. 
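The fast path above never takes os_obj_lock: each thread reserves dn_slots object numbers with an atomic add on its per-CPU cursor, then re-checks under dn_struct_rwlock that the slot is still DMU_OT_NONE in case a racing thread claimed the same object (the dnode_alloc_race counter below tracks that case). A user-space model of just the reservation step, using C11 atomics; the ex_ names are stand-ins, not ZFS API:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Per-CPU allocation cursor; one entry of os_obj_next_percpu[]. */
static _Atomic uint64_t ex_cpuobj;

/*
 * Reserve dn_slots object numbers. The value before the add is ours;
 * the value after is the next caller's, matching the patch's
 * atomic_add_64_nv(cpuobj, dn_slots) - dn_slots.
 */
static uint64_t
ex_reserve(uint64_t dn_slots)
{
    return (atomic_fetch_add(&ex_cpuobj, dn_slots));
}

int
main(void)
{
    atomic_store(&ex_cpuobj, 128); /* start of a fresh chunk */
    uint64_t a = ex_reserve(4);    /* this thread gets objects 128..131 */
    uint64_t b = ex_reserve(1);    /* the next caller gets object 132 */
    (void) printf("a=%ju b=%ju\n", (uintmax_t)a, (uintmax_t)b);
    return (0);
}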
*/ - os->os_obj_next = P2ROUNDUP(object + 1, - DNODES_PER_BLOCK); - } - - dnode_allocate(dn, ot, blocksize, indirect_blockshift, - bonustype, bonuslen, dn_slots, tx); - mutex_exit(&os->os_obj_lock); - - dmu_tx_add_new_object(tx, dn); - dnode_rele(dn, FTAG); + if (dn->dn_type == DMU_OT_NONE) { + dnode_allocate(dn, ot, blocksize, 0, + bonustype, bonuslen, dn_slots, tx); + rw_exit(&dn->dn_struct_rwlock); + dmu_tx_add_new_object(tx, dn); + dnode_rele(dn, FTAG); + return (object); + } + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + DNODE_STAT_BUMP(dnode_alloc_race); + } - return (object); + /* + * Skip to next known valid starting point on error. This + * is the start of the next block of dnodes. + */ + if (dmu_object_next(os, &object, B_TRUE, 0) != 0) { + object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK); + DNODE_STAT_BUMP(dnode_alloc_next_block); + } + (void) atomic_swap_64(cpuobj, object); + } } uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, - bonuslen, 0, tx); + return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, + bonuslen, 0, tx)); } uint64_t @@ -145,8 +213,8 @@ dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift, - bonustype, bonuslen, 0, tx); + return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift, + bonustype, bonuslen, 0, tx)); } uint64_t @@ -178,7 +246,7 @@ dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, dn_slots = DNODE_MIN_SLOTS; ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); - + if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) return (SET_ERROR(EBADF)); @@ -199,7 +267,7 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype, - bonuslen, 0, tx)); + bonuslen, DNODE_MIN_SIZE, tx)); } int @@ -211,6 +279,9 @@ dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int dn_slots = dnodesize >> DNODE_SHIFT; int err; + if (dn_slots == 0) + dn_slots = DNODE_MIN_SLOTS; + if (object == DMU_META_DNODE_OBJECT) return (SET_ERROR(EBADF)); @@ -260,28 +331,52 @@ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) { uint64_t offset; - dmu_object_info_t doi; + uint64_t start_obj; struct dsl_dataset *ds = os->os_dsl_dataset; - int dnodesize; int error; - /* - * Avoid expensive dnode hold if this dataset doesn't use large dnodes. - */ - if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) { - error = dmu_object_info(os, *objectp, &doi); - if (error && !(error == EINVAL && *objectp == 0)) - return (SET_ERROR(error)); - else - dnodesize = doi.doi_dnodesize; + if (*objectp == 0) { + start_obj = 1; + } else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) { + uint64_t i = *objectp + 1; + uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1); + dmu_object_info_t doi; + + /* + * Scan through the remaining meta dnode block. The contents + * of each slot in the block are known so it can be quickly + * checked. If the block is exhausted without a match then + * hand off to dnode_next_offset() for further scanning. 
+ */ + while (i <= last_obj) { + error = dmu_object_info(os, i, &doi); + if (error == ENOENT) { + if (hole) { + *objectp = i; + return (0); + } else { + i++; + } + } else if (error == EEXIST) { + i++; + } else if (error == 0) { + if (hole) { + i += doi.doi_dnodesize >> DNODE_SHIFT; + } else { + *objectp = i; + return (0); + } + } else { + return (error); + } + } + + start_obj = i; } else { - dnodesize = DNODE_MIN_SIZE; + start_obj = *objectp + 1; } - if (*objectp == 0) - offset = 1 << DNODE_SHIFT; - else - offset = (*objectp << DNODE_SHIFT) + dnodesize; + offset = start_obj << DNODE_SHIFT; error = dnode_next_offset(DMU_META_DNODE(os), (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c index 2e36e0e5eec..5585ceea4e1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -566,6 +566,9 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); + os->os_obj_next_percpu_len = boot_ncpus; + os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len * + sizeof (os->os_obj_next_percpu[0]), KM_SLEEP); dnode_special_open(os, &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, &os->os_meta_dnode); @@ -844,6 +847,9 @@ dmu_objset_evict_done(objset_t *os) rw_enter(&os_lock, RW_READER); rw_exit(&os_lock); + kmem_free(os->os_obj_next_percpu, + os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0])); + mutex_destroy(&os->os_lock); mutex_destroy(&os->os_userused_lock); mutex_destroy(&os->os_obj_lock); @@ -1243,10 +1249,23 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx) ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); multilist_sublist_remove(list, dn); + /* + * If we are not doing useraccounting (os_synced_dnodes == NULL) + * we are done with this dnode for this txg. Unset dn_dirty_txg + * if later txgs aren't dirtying it so that future holders do + * not get a stale value. Otherwise, we will do this in + * userquota_updates_task() when processing has completely + * finished for this txg. + */ multilist_t *newlist = dn->dn_objset->os_synced_dnodes; if (newlist != NULL) { (void) dnode_add_ref(dn, newlist); multilist_insert(newlist, dn); + } else { + mutex_enter(&dn->dn_mtx); + if (dn->dn_dirty_txg == tx->tx_txg) + dn->dn_dirty_txg = 0; + mutex_exit(&dn->dn_mtx); } dnode_sync(dn, tx); @@ -1606,6 +1625,8 @@ userquota_updates_task(void *arg) dn->dn_id_flags |= DN_ID_CHKED_BONUS; } dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); + if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa)) + dn->dn_dirty_txg = 0; mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index 25614d018cc..9fd0483ffa3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -1441,17 +1441,12 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) /* * The receiving code doesn't know how to translate large blocks * to smaller ones, so the pool must have the LARGE_BLOCKS - * feature enabled if the stream has LARGE_BLOCKS. + * feature enabled if the stream has LARGE_BLOCKS. 
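The rewritten dmu_object_next() above probes each remaining slot of the current metadnode block via dmu_object_info(), interpreting 0/ENOENT/EEXIST per slot, before handing off to dnode_next_offset(). A self-contained toy model of that probe loop, with a fake slot map standing in for the on-disk block; the ex_ names and map layout are illustrative:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define EX_DNODES_PER_BLOCK 32

/* Toy slot map: 0 = hole, N > 0 = allocated object spanning N slots. */
static int ex_slots[EX_DNODES_PER_BLOCK] = { [0] = 1, [1] = 4, [8] = 1 };

/* Stand-in for dmu_object_info(): 0, ENOENT (hole) or EEXIST (interior). */
static int
ex_object_info(uint64_t obj, int *nslots)
{
    for (uint64_t i = 0; i < EX_DNODES_PER_BLOCK; i++) {
        if (ex_slots[i] == 0)
            continue;
        if (obj == i) {
            *nslots = ex_slots[i];
            return (0);
        }
        if (obj > i && obj < i + ex_slots[i])
            return (EEXIST);    /* interior of a large dnode */
    }
    return (ENOENT);
}

/* Probe forward from *objectp + 1 for the next hole or allocated object. */
static int
ex_object_next(uint64_t *objectp, int hole)
{
    uint64_t last_obj = *objectp | (EX_DNODES_PER_BLOCK - 1);

    for (uint64_t i = *objectp + 1; i <= last_obj; i++) {
        int nslots = 0;
        int err = ex_object_info(i, &nslots);

        if (err == ENOENT && hole) {
            *objectp = i;
            return (0);
        }
        if (err == 0) {
            if (hole) {
                i += nslots - 1;   /* skip the whole large dnode */
                continue;
            }
            *objectp = i;
            return (0);
        }
        /* EEXIST (interior slot) or unwanted hole: keep scanning. */
    }
    return (ESRCH);    /* block exhausted: hand off to dnode_next_offset() */
}

int
main(void)
{
    uint64_t obj = 0;
    while (ex_object_next(&obj, 0) == 0)    /* prints 1, then 8 */
        (void) printf("next allocated object: %ju\n", (uintmax_t)obj);
    return (0);
}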
Same with + * large dnodes. */ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) return (SET_ERROR(ENOTSUP)); - - /* - * The receiving code doesn't know how to translate large dnodes - * to smaller ones, so the pool must have the LARGE_DNODE - * feature enabled if the stream has LARGE_DNODE. - */ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE)) return (SET_ERROR(ENOTSUP)); @@ -1659,6 +1654,9 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) dsl_dataset_t *ds; const char *tofs = drba->drba_cookie->drc_tofs; + /* 6 extra bytes for /%recv */ + char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + /* already checked */ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING); @@ -1686,8 +1684,18 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); - /* 6 extra bytes for /%recv */ - char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + /* + * The receiving code doesn't know how to translate large blocks + * to smaller ones, so the pool must have the LARGE_BLOCKS + * feature enabled if the stream has LARGE_BLOCKS. Same with + * large dnodes. + */ + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE)) + return (SET_ERROR(ENOTSUP)); (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs, recv_clone_name); @@ -2149,6 +2157,8 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, dmu_tx_t *tx; uint64_t object; int err; + uint8_t dn_slots = drro->drr_dn_slots != 0 ? + drro->drr_dn_slots : DNODE_MIN_SLOTS; if (drro->drr_type == DMU_OT_NONE || !DMU_OT_IS_VALID(drro->drr_type) || @@ -2159,15 +2169,16 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, drro->drr_blksz < SPA_MINBLOCKSIZE || drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || drro->drr_bonuslen > - DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os)))) { + DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) || + dn_slots > + (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) { return (SET_ERROR(EINVAL)); } err = dmu_object_info(rwa->os, drro->drr_object, &doi); - if (err != 0 && err != ENOENT) + if (err != 0 && err != ENOENT && err != EEXIST) return (SET_ERROR(EINVAL)); - object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT; if (drro->drr_object > rwa->max_object) rwa->max_object = drro->drr_object; @@ -2180,16 +2191,64 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, if (err == 0) { int nblkptr; + object = drro->drr_object; + nblkptr = deduce_nblkptr(drro->drr_bonustype, drro->drr_bonuslen); if (drro->drr_blksz != doi.doi_data_block_size || - nblkptr < doi.doi_nblkptr) { + nblkptr < doi.doi_nblkptr || + dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) { err = dmu_free_long_range(rwa->os, drro->drr_object, 0, DMU_OBJECT_END); if (err != 0) return (SET_ERROR(EINVAL)); } + } else if (err == EEXIST) { + /* + * The object requested is currently an interior slot of a + * multi-slot dnode. This will be resolved when the next txg + * is synced out, since the send stream will have told us + * to free this slot when we freed the associated dnode + * earlier in the stream. 
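Beyond the EEXIST case described above, receive_object() must handle the converse hazard: a new multi-slot dnode expanding into slots still occupied by objects from the previous snapshot. The loop just below frees those objects and syncs a txg before the dnode is claimed; here is a compile-and-run sketch of that scan with stubbed-out DMU calls (the ex_ stubs are hypothetical, not ZFS API):

#include <errno.h>
#include <stdint.h>

/* Minimal stubs for dmu_object_info()/dmu_free_long_object()/txg sync. */
static int ex_object_info(uint64_t obj) { return (obj == 3 ? 0 : ENOENT); }
static int ex_free_long_object(uint64_t obj) { (void)obj; return (0); }
static void ex_txg_wait_synced(void) { }

/*
 * Free any object occupying slots [obj + 1, obj + dn_slots) so that a
 * dn_slots-wide dnode can be claimed at obj. Returns 0 or the first
 * hard error.
 */
static int
ex_clear_spanned_slots(uint64_t obj, int dn_slots)
{
    int need_sync = 0;

    for (uint64_t slot = obj + 1; slot < obj + (uint64_t)dn_slots; slot++) {
        int err = ex_object_info(slot);

        /* ENOENT: already free; EEXIST: freed along with its head. */
        if (err == ENOENT || err == EEXIST)
            continue;
        else if (err != 0)
            return (err);

        if ((err = ex_free_long_object(slot)) != 0)
            return (err);
        need_sync = 1;
    }
    if (need_sync)
        ex_txg_wait_synced();    /* make the freed slots claimable */
    return (0);
}

int
main(void)
{
    /* Claiming a 4-slot dnode at object 1 must first free object 3. */
    return (ex_clear_spanned_slots(1, 4));
}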
+ */ + txg_wait_synced(dmu_objset_pool(rwa->os), 0); + object = drro->drr_object; + } else { + /* object is free and we are about to allocate a new one */ + object = DMU_NEW_OBJECT; + } + + /* + * If this is a multi-slot dnode there is a chance that this + * object will expand into a slot that is already used by + * another object from the previous snapshot. We must free + * these objects before we attempt to allocate the new dnode. + */ + if (dn_slots > 1) { + boolean_t need_sync = B_FALSE; + + for (uint64_t slot = drro->drr_object + 1; + slot < drro->drr_object + dn_slots; + slot++) { + dmu_object_info_t slot_doi; + + err = dmu_object_info(rwa->os, slot, &slot_doi); + if (err == ENOENT || err == EEXIST) + continue; + else if (err != 0) + return (err); + + err = dmu_free_long_object(rwa->os, slot); + + if (err != 0) + return (err); + + need_sync = B_TRUE; + } + + if (need_sync) + txg_wait_synced(dmu_objset_pool(rwa->os), 0); } tx = dmu_tx_create(rwa->os); @@ -2205,15 +2264,17 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, - drro->drr_dn_slots << DNODE_SHIFT, tx); + dn_slots << DNODE_SHIFT, tx); } else if (drro->drr_type != doi.doi_type || drro->drr_blksz != doi.doi_data_block_size || drro->drr_bonustype != doi.doi_bonus_type || - drro->drr_bonuslen != doi.doi_bonus_size) { + drro->drr_bonuslen != doi.doi_bonus_size || + drro->drr_dn_slots != (doi.doi_dnodesize >> DNODE_SHIFT)) { /* currently allocated, but with different properties */ - err = dmu_object_reclaim(rwa->os, drro->drr_object, + err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object, drro->drr_type, drro->drr_blksz, - drro->drr_bonustype, drro->drr_bonuslen, tx); + drro->drr_bonustype, drro->drr_bonuslen, + drro->drr_dn_slots << DNODE_SHIFT, tx); } if (err != 0) { dmu_tx_commit(tx); @@ -2263,13 +2324,11 @@ receive_freeobjects(struct receive_writer_arg *rwa, dmu_object_info_t doi; int err; - err = dmu_object_info(rwa->os, obj, &doi); - if (err == ENOENT) { - obj++; - continue; - } else if (err != 0) { + err = dmu_object_info(rwa->os, obj, NULL); + if (err == ENOENT) + continue; + else if (err != 0) return (err); - } err = dmu_free_long_object(rwa->os, obj); if (err != 0) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c index e9f1f4ac19c..849a4fea28e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -1252,11 +1252,13 @@ dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) { - dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, - tx->tx_objset, object, THT_SPILL, 0, 0); + dmu_tx_hold_t *txh; - (void) refcount_add_many(&txh->txh_space_towrite, - SPA_OLD_MAXBLOCKSIZE, FTAG); + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, + THT_SPILL, 0, 0); + if (txh != NULL) + (void) refcount_add_many(&txh->txh_space_towrite, + SPA_OLD_MAXBLOCKSIZE, FTAG); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c index 91b3a041483..ae37928da50 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -40,20 +40,40 @@ #include #include +dnode_stats_t dnode_stats = { + { "dnode_hold_dbuf_hold", KSTAT_DATA_UINT64 }, + { 
"dnode_hold_dbuf_read", KSTAT_DATA_UINT64 }, + { "dnode_hold_alloc_hits", KSTAT_DATA_UINT64 }, + { "dnode_hold_alloc_misses", KSTAT_DATA_UINT64 }, + { "dnode_hold_alloc_interior", KSTAT_DATA_UINT64 }, + { "dnode_hold_alloc_lock_retry", KSTAT_DATA_UINT64 }, + { "dnode_hold_alloc_lock_misses", KSTAT_DATA_UINT64 }, + { "dnode_hold_alloc_type_none", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_hits", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_misses", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_lock_misses", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_overflow", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_refcount", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_txg", KSTAT_DATA_UINT64 }, + { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 }, + { "dnode_allocate", KSTAT_DATA_UINT64 }, + { "dnode_reallocate", KSTAT_DATA_UINT64 }, + { "dnode_buf_evict", KSTAT_DATA_UINT64 }, + { "dnode_alloc_next_chunk", KSTAT_DATA_UINT64 }, + { "dnode_alloc_race", KSTAT_DATA_UINT64 }, + { "dnode_alloc_next_block", KSTAT_DATA_UINT64 }, + { "dnode_move_invalid", KSTAT_DATA_UINT64 }, + { "dnode_move_recheck1", KSTAT_DATA_UINT64 }, + { "dnode_move_recheck2", KSTAT_DATA_UINT64 }, + { "dnode_move_special", KSTAT_DATA_UINT64 }, + { "dnode_move_handle", KSTAT_DATA_UINT64 }, + { "dnode_move_rwlock", KSTAT_DATA_UINT64 }, + { "dnode_move_active", KSTAT_DATA_UINT64 }, +}; + +static kstat_t *dnode_ksp; static kmem_cache_t *dnode_cache; -/* - * Define DNODE_STATS to turn on statistic gathering. By default, it is only - * turned on when DEBUG is also defined. - */ -#ifdef DEBUG -#define DNODE_STATS -#endif /* DEBUG */ - -#ifdef DNODE_STATS -#define DNODE_STAT_ADD(stat) ((stat)++) -#else -#define DNODE_STAT_ADD(stat) /* nothing */ -#endif /* DNODE_STATS */ static dnode_phys_t dnode_phys_zero; @@ -126,7 +146,7 @@ dnode_cons(void *arg, void *unused, int kmflag) bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); for (i = 0; i < TXG_SIZE; i++) { - list_link_init(&dn->dn_dirty_link[i]); + multilist_link_init(&dn->dn_dirty_link[i]); dn->dn_free_ranges[i] = NULL; list_create(&dn->dn_dirty_records[i], sizeof (dbuf_dirty_record_t), @@ -136,6 +156,7 @@ dnode_cons(void *arg, void *unused, int kmflag) dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; + dn->dn_dirty_txg = 0; dn->dn_dirtyctx = 0; dn->dn_dirtyctx_firstset = NULL; dn->dn_bonus = NULL; @@ -174,7 +195,7 @@ dnode_dest(void *arg, void *unused) ASSERT(!list_link_active(&dn->dn_link)); for (i = 0; i < TXG_SIZE; i++) { - ASSERT(!list_link_active(&dn->dn_dirty_link[i])); + ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); list_destroy(&dn->dn_dirty_records[i]); ASSERT0(dn->dn_next_nblkptr[i]); @@ -189,6 +210,7 @@ dnode_dest(void *arg, void *unused) ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_free_txg); ASSERT0(dn->dn_assigned_txg); + ASSERT0(dn->dn_dirty_txg); ASSERT0(dn->dn_dirtyctx); ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); ASSERT3P(dn->dn_bonus, ==, NULL); @@ -215,12 +237,25 @@ dnode_init(void) 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); #ifdef _KERNEL kmem_cache_set_move(dnode_cache, dnode_move); + + dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc", + KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (dnode_ksp != NULL) { + dnode_ksp->ks_data = &dnode_stats; + kstat_install(dnode_ksp); + } #endif /* _KERNEL */ } void dnode_fini(void) { + if (dnode_ksp != NULL) { + kstat_delete(dnode_ksp); + dnode_ksp 
= NULL; + } + kmem_cache_destroy(dnode_cache); dnode_cache = NULL; } @@ -333,6 +368,7 @@ dnode_byteswap(dnode_phys_t *dnp) /* Swap SPILL block if we have one */ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t)); + } void @@ -344,7 +380,7 @@ dnode_buf_byteswap(void *vbuf, size_t size) ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0); while (i < size) { - dnode_phys_t *dnp = vbuf + i; + dnode_phys_t *dnp = (void *)(((char *)vbuf) + i); dnode_byteswap(dnp); i += DNODE_MIN_SIZE; @@ -448,14 +484,10 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, dmu_zfetch_init(&dn->dn_zfetch, dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); + ASSERT(zrl_is_locked(&dnh->dnh_zrlock)); + ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode)); mutex_enter(&os->os_lock); - if (dnh->dnh_dnode != NULL) { - /* Lost the allocation race. */ - mutex_exit(&os->os_lock); - kmem_cache_free(dnode_cache, dn); - return (dnh->dnh_dnode); - } /* * Exclude special dnodes from os_dnodes so an empty os_dnodes @@ -478,6 +510,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE); + return (dn); } @@ -503,11 +536,13 @@ dnode_destroy(dnode_t *dn) mutex_exit(&os->os_lock); /* the dnode can no longer move, so we can release the handle */ - zrl_remove(&dn->dn_handle->dnh_zrlock); + if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock)) + zrl_remove(&dn->dn_handle->dnh_zrlock); dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; + dn->dn_dirty_txg = 0; dn->dn_dirtyctx = 0; if (dn->dn_dirtyctx_firstset != NULL) { @@ -559,8 +594,10 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); - dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n", + dprintf("os=%p obj=%" PRIu64 " txg=%" PRIu64 + " blocksize=%d ibs=%d dn_slots=%d\n", dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots); + DNODE_STAT_BUMP(dnode_allocate); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); @@ -575,6 +612,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT0(dn->dn_maxblkid); ASSERT0(dn->dn_allocated_txg); + ASSERT0(dn->dn_dirty_txg); ASSERT0(dn->dn_assigned_txg); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); @@ -588,7 +626,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT0(dn->dn_next_bonustype[i]); ASSERT0(dn->dn_rm_spillblk[i]); ASSERT0(dn->dn_next_blksz[i]); - ASSERT(!list_link_active(&dn->dn_dirty_link[i])); + ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); } @@ -645,9 +683,11 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, (bonustype == DMU_OT_SA && bonuslen == 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, - DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)))); + DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)))); + ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT)); - dn_slots = dn_slots > 0 ? 
dn_slots : DNODE_MIN_SLOTS; + dnode_free_interior_slots(dn); + DNODE_STAT_BUMP(dnode_reallocate); /* clean up any unreferenced dbufs */ dnode_evict_dbufs(dn); @@ -700,7 +740,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, if (dn->dn_bonus) { dn->dn_bonus->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - - (dn->dn_nblkptr-1) * sizeof (blkptr_t); + (dn->dn_nblkptr - 1) * sizeof (blkptr_t); ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); } @@ -708,18 +748,6 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, mutex_exit(&dn->dn_mtx); } -#ifdef DNODE_STATS -static struct { - uint64_t dms_dnode_invalid; - uint64_t dms_dnode_recheck1; - uint64_t dms_dnode_recheck2; - uint64_t dms_dnode_special; - uint64_t dms_dnode_handle; - uint64_t dms_dnode_rwlock; - uint64_t dms_dnode_active; -} dnode_move_stats; -#endif /* DNODE_STATS */ - #ifdef _KERNEL static void dnode_move_impl(dnode_t *odn, dnode_t *ndn) @@ -749,6 +777,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ndn->dn_datablkszsec = odn->dn_datablkszsec; ndn->dn_datablksz = odn->dn_datablksz; ndn->dn_maxblkid = odn->dn_maxblkid; + ndn->dn_num_slots = odn->dn_num_slots; bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0], sizeof (odn->dn_next_type)); bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], @@ -774,6 +803,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ndn->dn_allocated_txg = odn->dn_allocated_txg; ndn->dn_free_txg = odn->dn_free_txg; ndn->dn_assigned_txg = odn->dn_assigned_txg; + ndn->dn_dirty_txg = odn->dn_dirty_txg; ndn->dn_dirtyctx = odn->dn_dirtyctx; ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; ASSERT(refcount_count(&odn->dn_tx_holds) == 0); @@ -840,6 +870,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) odn->dn_allocated_txg = 0; odn->dn_free_txg = 0; odn->dn_assigned_txg = 0; + odn->dn_dirty_txg = 0; odn->dn_dirtyctx = 0; odn->dn_dirtyctx_firstset = NULL; odn->dn_have_spill = B_FALSE; @@ -880,7 +911,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) */ os = odn->dn_objset; if (!POINTER_IS_VALID(os)) { - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid); + DNODE_STAT_BUMP(dnode_move_invalid); return (KMEM_CBRC_DONT_KNOW); } @@ -890,7 +921,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) rw_enter(&os_lock, RW_WRITER); if (os != odn->dn_objset) { rw_exit(&os_lock); - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1); + DNODE_STAT_BUMP(dnode_move_recheck1); return (KMEM_CBRC_DONT_KNOW); } @@ -908,7 +939,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) if (os != odn->dn_objset) { mutex_exit(&os->os_lock); rw_exit(&os_lock); - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2); + DNODE_STAT_BUMP(dnode_move_recheck2); return (KMEM_CBRC_DONT_KNOW); } @@ -921,7 +952,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) rw_exit(&os_lock); if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { mutex_exit(&os->os_lock); - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special); + DNODE_STAT_BUMP(dnode_move_special); return (KMEM_CBRC_NO); } ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ @@ -936,7 +967,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) */ if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { mutex_exit(&os->os_lock); - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle); + DNODE_STAT_BUMP(dnode_move_handle); return (KMEM_CBRC_LATER); } @@ -952,7 +983,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { 
zrl_exit(&odn->dn_handle->dnh_zrlock); mutex_exit(&os->os_lock); - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock); + DNODE_STAT_BUMP(dnode_move_rwlock); return (KMEM_CBRC_LATER); } @@ -978,7 +1009,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) rw_exit(&odn->dn_struct_rwlock); zrl_exit(&odn->dn_handle->dnh_zrlock); mutex_exit(&os->os_lock); - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active); + DNODE_STAT_BUMP(dnode_move_active); return (KMEM_CBRC_LATER); } @@ -1003,6 +1034,136 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) #endif /* illumos */ #endif /* _KERNEL */ +static void +dnode_slots_hold(dnode_children_t *children, int idx, int slots) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + zrl_add(&dnh->dnh_zrlock); + } +} + +static void +dnode_slots_rele(dnode_children_t *children, int idx, int slots) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + + if (zrl_is_locked(&dnh->dnh_zrlock)) + zrl_exit(&dnh->dnh_zrlock); + else + zrl_remove(&dnh->dnh_zrlock); + } +} + +static int +dnode_slots_tryenter(dnode_children_t *children, int idx, int slots) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + + if (!zrl_tryenter(&dnh->dnh_zrlock)) { + for (int j = idx; j < i; j++) { + dnh = &children->dnc_children[j]; + zrl_exit(&dnh->dnh_zrlock); + } + + return (0); + } + } + + return (1); +} + +static void +dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + dnh->dnh_dnode = ptr; + } +} + +static boolean_t +dnode_check_slots_free(dnode_children_t *children, int idx, int slots) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + /* + * If all dnode slots are either already free or + * evictable return B_TRUE. 
+ */ + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + dnode_t *dn = dnh->dnh_dnode; + + if (dn == DN_SLOT_FREE) { + continue; + } else if (DN_SLOT_IS_PTR(dn)) { + mutex_enter(&dn->dn_mtx); + boolean_t can_free = (dn->dn_type == DMU_OT_NONE && + refcount_is_zero(&dn->dn_holds) && + !DNODE_IS_DIRTY(dn)); + mutex_exit(&dn->dn_mtx); + + if (!can_free) + return (B_FALSE); + else + continue; + } else { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static void +dnode_reclaim_slots(dnode_children_t *children, int idx, int slots) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + + ASSERT(zrl_is_locked(&dnh->dnh_zrlock)); + + if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { + ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE); + dnode_destroy(dnh->dnh_dnode); + dnh->dnh_dnode = DN_SLOT_FREE; + } + } +} + +void +dnode_free_interior_slots(dnode_t *dn) +{ + dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db); + int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT; + int idx = (dn->dn_object & (epb - 1)) + 1; + int slots = dn->dn_num_slots - 1; + + if (slots == 0) + return; + + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + while (!dnode_slots_tryenter(children, idx, slots)) + DNODE_STAT_BUMP(dnode_free_interior_lock_retry); + + dnode_set_slots(children, idx, slots, DN_SLOT_FREE); + dnode_slots_rele(children, idx, slots); +} + void dnode_special_close(dnode_handle_t *dnh) { @@ -1010,7 +1171,7 @@ dnode_special_close(dnode_handle_t *dnh) /* * Wait for final references to the dnode to clear. This can - * only happen if the arc is asyncronously evicting state that + * only happen if the arc is asynchronously evicting state that * has a hold on this dnode while we are trying to evict this * dnode. */ @@ -1030,19 +1191,24 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, { dnode_t *dn; - dn = dnode_create(os, dnp, NULL, object, dnh); zrl_init(&dnh->dnh_zrlock); + zrl_tryenter(&dnh->dnh_zrlock); + + dn = dnode_create(os, dnp, NULL, object, dnh); DNODE_VERIFY(dn); + + zrl_exit(&dnh->dnh_zrlock); } static void dnode_buf_evict_async(void *dbu) { - dnode_children_t *children_dnodes = dbu; - int i; + dnode_children_t *dnc = dbu; + + DNODE_STAT_BUMP(dnode_buf_evict); - for (i = 0; i < children_dnodes->dnc_count; i++) { - dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; + for (int i = 0; i < dnc->dnc_count; i++) { + dnode_handle_t *dnh = &dnc->dnc_children[i]; dnode_t *dn; /* @@ -1050,8 +1216,9 @@ dnode_buf_evict_async(void *dbu) * another valid address, so there is no need here to guard * against changes to or from NULL. */ - if (dnh->dnh_dnode == NULL) { + if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) { zrl_destroy(&dnh->dnh_zrlock); + dnh->dnh_dnode = DN_SLOT_UNINIT; continue; } @@ -1066,140 +1233,36 @@ dnode_buf_evict_async(void *dbu) ASSERT(refcount_is_zero(&dn->dn_holds)); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); - dnode_destroy(dn); /* implicit zrl_remove() */ + dnode_destroy(dn); /* implicit zrl_remove() for first slot */ zrl_destroy(&dnh->dnh_zrlock); - dnh->dnh_dnode = NULL; - } - kmem_free(children_dnodes, sizeof (dnode_children_t) + - children_dnodes->dnc_count * sizeof (dnode_handle_t)); -} - -/* - * Return true if the given index is interior to a dnode already - * allocated in the block. That is, the index is neither free nor - * allocated, but is consumed by a large dnode. 
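The dirtiness test above is the core of the 10572/10579 fix: dbuf_dirty() now publishes dn_dirty_txg while holding dn_mtx (see the dbuf.c hunk earlier), so dnode_check_slots_free() can take the same mutex and observe a consistent dirty state instead of racing with the dirty-list insert. A pthread model of the pattern, assuming DNODE_IS_DIRTY() reduces to comparing dn_dirty_txg against the syncing txg; the ex_ names are illustrative:

#include <pthread.h>
#include <stdint.h>

typedef struct ex_dnode {
    pthread_mutex_t dn_mtx;
    uint64_t        dn_dirty_txg;   /* 0 = not dirty in any open txg */
    int             dn_holds;
    int             dn_type;        /* 0 = DMU_OT_NONE */
} ex_dnode_t;

/* dbuf_dirty(): record the dirtying txg while dn_mtx is held. */
static void
ex_mark_dirty(ex_dnode_t *dn, uint64_t txg)
{
    (void) pthread_mutex_lock(&dn->dn_mtx);
    if (txg > dn->dn_dirty_txg)
        dn->dn_dirty_txg = txg;
    (void) pthread_mutex_unlock(&dn->dn_mtx);
}

/* dnode_check_slots_free(): decide evictability under the same lock. */
static int
ex_can_free(ex_dnode_t *dn, uint64_t syncing_txg)
{
    (void) pthread_mutex_lock(&dn->dn_mtx);
    int can_free = (dn->dn_type == 0 && dn->dn_holds == 0 &&
        dn->dn_dirty_txg < syncing_txg);    /* ~ !DNODE_IS_DIRTY(dn) */
    (void) pthread_mutex_unlock(&dn->dn_mtx);
    return (can_free);
}

int
main(void)
{
    ex_dnode_t dn = { PTHREAD_MUTEX_INITIALIZER, 0, 0, 0 };

    ex_mark_dirty(&dn, 7);
    /* Dirty in txg 7, still unsynced at txg 5: must not be freed. */
    return (ex_can_free(&dn, 5) ? 1 : 0);
}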
- * - * The dnode_phys_t buffer may not be in sync with the in-core dnode - * structure, so we try to check the dnode structure first and fall back - * to the dnode_phys_t buffer it doesn't exist. - */ -static boolean_t -dnode_is_consumed(dmu_buf_impl_t *db, int idx) -{ - dnode_handle_t *dnh; - dmu_object_type_t ot; - dnode_children_t *children_dnodes; - dnode_phys_t *dn_block; - int skip; - int i; - - children_dnodes = dmu_buf_get_user(&db->db); - dn_block = (dnode_phys_t *)db->db.db_data; - - for (i = 0; i < idx; i += skip) { - dnh = &children_dnodes->dnc_children[i]; - - zrl_add(&dnh->dnh_zrlock); - if (dnh->dnh_dnode != NULL) { - ot = dnh->dnh_dnode->dn_type; - skip = dnh->dnh_dnode->dn_num_slots; - } else { - ot = dn_block[i].dn_type; - skip = dn_block[i].dn_extra_slots + 1; - } - zrl_remove(&dnh->dnh_zrlock); - - if (ot == DMU_OT_NONE) - skip = 1; + dnh->dnh_dnode = DN_SLOT_UNINIT; } - - return (i > idx); + kmem_free(dnc, sizeof (dnode_children_t) + + dnc->dnc_count * sizeof (dnode_handle_t)); } /* - * Return true if the given index in the dnode block is a valid - * allocated dnode. That is, the index is not consumed by a large - * dnode and is not free. + * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used + * to ensure the hole at the specified object offset is large enough to + * hold the dnode being created. The slots parameter is also used to ensure + * a dnode does not span multiple dnode blocks. In both of these cases, if + * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases + * are only possible when using DNODE_MUST_BE_FREE. * - * The dnode_phys_t buffer may not be in sync with the in-core dnode - * structure, so we try to check the dnode structure first and fall back - * to the dnode_phys_t buffer it doesn't exist. - */ -static boolean_t -dnode_is_allocated(dmu_buf_impl_t *db, int idx) -{ - dnode_handle_t *dnh; - dmu_object_type_t ot; - dnode_children_t *children_dnodes; - dnode_phys_t *dn_block; - - if (dnode_is_consumed(db, idx)) - return (B_FALSE); - - children_dnodes = dmu_buf_get_user(&db->db); - dn_block = (dnode_phys_t *)db->db.db_data; - - dnh = &children_dnodes->dnc_children[idx]; - - zrl_add(&dnh->dnh_zrlock); - if (dnh->dnh_dnode != NULL) - ot = dnh->dnh_dnode->dn_type; - else - ot = dn_block[idx].dn_type; - zrl_remove(&dnh->dnh_zrlock); - - return (ot != DMU_OT_NONE); -} - -/* - * Return true if the given range of indices in the dnode block are - * free. That is, the starting index is not consumed by a large dnode - * and none of the indices are allocated. + * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0. + * dnode_hold_impl() will check if the requested dnode is already consumed + * as an extra dnode slot by an large dnode, in which case it returns + * ENOENT. * - * The dnode_phys_t buffer may not be in sync with the in-core dnode - * structure, so we try to check the dnode structure first and fall back - * to the dnode_phys_t buffer it doesn't exist. 
- */ -static boolean_t -dnode_is_free(dmu_buf_impl_t *db, int idx, int slots) -{ - dnode_handle_t *dnh; - dmu_object_type_t ot; - dnode_children_t *children_dnodes; - dnode_phys_t *dn_block; - int i; - - if (idx + slots > DNODES_PER_BLOCK) - return (B_FALSE); - - children_dnodes = dmu_buf_get_user(&db->db); - dn_block = (dnode_phys_t *)db->db.db_data; - - if (dnode_is_consumed(db, idx)) - return (B_FALSE); - - for (i = idx; i < idx + slots; i++) { - dnh = &children_dnodes->dnc_children[i]; - - zrl_add(&dnh->dnh_zrlock); - if (dnh->dnh_dnode != NULL) - ot = dnh->dnh_dnode->dn_type; - else - ot = dn_block[i].dn_type; - zrl_remove(&dnh->dnh_zrlock); - - if (ot != DMU_OT_NONE) - return (B_FALSE); - } - - return (B_TRUE); -} - -/* * errors: - * EINVAL - invalid object number. - * ENOSPC - hole too small to fulfill "slots" request - * EIO - i/o error. + * EINVAL - invalid object number or flags. + * ENOSPC - hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE) + * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE) + * - Refers to a freeing dnode (DNODE_MUST_BE_FREE) + * - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED) + * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED) + * - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED) + * EIO - i/o error error when reading the meta dnode dbuf. * succeeds even for free dnodes. */ int @@ -1212,7 +1275,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, uint64_t blk; dnode_t *mdn, *dn; dmu_buf_impl_t *db; - dnode_children_t *children_dnodes; + dnode_children_t *dnc; + dnode_phys_t *dn_block; dnode_phys_t *dn_block_begin; dnode_handle_t *dnh; @@ -1265,10 +1329,13 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, db = dbuf_hold(mdn, blk, FTAG); if (drop_struct_lock) rw_exit(&mdn->dn_struct_rwlock); - if (db == NULL) + if (db == NULL) { + DNODE_STAT_BUMP(dnode_hold_dbuf_hold); return (SET_ERROR(EIO)); + } err = dbuf_read(db, NULL, DB_RF_CANFAIL); if (err) { + DNODE_STAT_BUMP(dnode_hold_dbuf_read); dbuf_rele(db, FTAG); return (err); } @@ -1276,68 +1343,194 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, ASSERT3U(db->db.db_size, >=, 1<db.db_size >> DNODE_SHIFT; + idx = object & (epb - 1); + dn_block = (dnode_phys_t *)db->db.db_data; + ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); - children_dnodes = dmu_buf_get_user(&db->db); - if (children_dnodes == NULL) { + dnc = dmu_buf_get_user(&db->db); + dnh = NULL; + if (dnc == NULL) { dnode_children_t *winner; - children_dnodes = kmem_zalloc(sizeof (dnode_children_t) + + int skip = 0; + + dnc = kmem_zalloc(sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t), KM_SLEEP); - children_dnodes->dnc_count = epb; - dnh = &children_dnodes->dnc_children[0]; - for (i = 0; i < epb; i++) { + dnc->dnc_count = epb; + dnh = &dnc->dnc_children[0]; + + /* Initialize dnode slot status from dnode_phys_t */ + for (int i = 0; i < epb; i++) { zrl_init(&dnh[i].dnh_zrlock); + + if (skip) { + skip--; + continue; + } + + if (dn_block[i].dn_type != DMU_OT_NONE) { + int interior = dn_block[i].dn_extra_slots; + + dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED); + dnode_set_slots(dnc, i + 1, interior, + DN_SLOT_INTERIOR); + skip = interior; + } else { + dnh[i].dnh_dnode = DN_SLOT_FREE; + skip = 0; + } } - dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL, + + dmu_buf_init_user(&dnc->dnc_dbu, NULL, dnode_buf_evict_async, NULL); - winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu); + winner = 
dmu_buf_set_user(&db->db, &dnc->dnc_dbu); if (winner != NULL) { - for (i = 0; i < epb; i++) { + for (int i = 0; i < epb; i++) zrl_destroy(&dnh[i].dnh_zrlock); - } - kmem_free(children_dnodes, sizeof (dnode_children_t) + + kmem_free(dnc, sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t)); - children_dnodes = winner; + dnc = winner; } } - ASSERT(children_dnodes->dnc_count == epb); - idx = object & (epb - 1); - dn_block_begin = (dnode_phys_t *)db->db.db_data; + ASSERT(dnc->dnc_count == epb); dn = DN_SLOT_UNINIT; - if ((flag & DNODE_MUST_BE_FREE) && !dnode_is_free(db, idx, slots)) { - dbuf_rele(db, FTAG); - return (ENOSPC); - } else if ((flag & DNODE_MUST_BE_ALLOCATED) && - !dnode_is_allocated(db, idx)) { + if (flag & DNODE_MUST_BE_ALLOCATED) { + slots = 1; + + while (dn == DN_SLOT_UNINIT) { + dnode_slots_hold(dnc, idx, slots); + dnh = &dnc->dnc_children[idx]; + + if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { + dn = dnh->dnh_dnode; + break; + } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) { + DNODE_STAT_BUMP(dnode_hold_alloc_interior); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(EEXIST)); + } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) { + DNODE_STAT_BUMP(dnode_hold_alloc_misses); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(ENOENT)); + } + + dnode_slots_rele(dnc, idx, slots); + if (!dnode_slots_tryenter(dnc, idx, slots)) { + DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry); + continue; + } + + /* + * Someone else won the race and called dnode_create() + * after we checked DN_SLOT_IS_PTR() above but before + * we acquired the lock. + */ + if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { + DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses); + dn = dnh->dnh_dnode; + } else { + dn = dnode_create(os, dn_block + idx, db, + object, dnh); + } + } + + mutex_enter(&dn->dn_mtx); + if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) { + DNODE_STAT_BUMP(dnode_hold_alloc_type_none); + mutex_exit(&dn->dn_mtx); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(ENOENT)); + } + + DNODE_STAT_BUMP(dnode_hold_alloc_hits); + } else if (flag & DNODE_MUST_BE_FREE) { + + if (idx + slots - 1 >= DNODES_PER_BLOCK) { + DNODE_STAT_BUMP(dnode_hold_free_overflow); + dbuf_rele(db, FTAG); + return (SET_ERROR(ENOSPC)); + } + + while (dn == DN_SLOT_UNINIT) { + dnode_slots_hold(dnc, idx, slots); + + if (!dnode_check_slots_free(dnc, idx, slots)) { + DNODE_STAT_BUMP(dnode_hold_free_misses); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(ENOSPC)); + } + + dnode_slots_rele(dnc, idx, slots); + if (!dnode_slots_tryenter(dnc, idx, slots)) { + DNODE_STAT_BUMP(dnode_hold_free_lock_retry); + continue; + } + + if (!dnode_check_slots_free(dnc, idx, slots)) { + DNODE_STAT_BUMP(dnode_hold_free_lock_misses); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(ENOSPC)); + } + + /* + * Allocated but otherwise free dnodes which would + * be in the interior of a multi-slot dnode need + * to be freed. Single slot dnodes can be safely + * re-purposed as a performance optimization.
+ */ + if (slots > 1) + dnode_reclaim_slots(dnc, idx + 1, slots - 1); + + dnh = &dnc->dnc_children[idx]; + if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { + dn = dnh->dnh_dnode; + } else { + dn = dnode_create(os, dn_block + idx, db, + object, dnh); + } + } + + mutex_enter(&dn->dn_mtx); + if (!refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) { + DNODE_STAT_BUMP(dnode_hold_free_refcount); + mutex_exit(&dn->dn_mtx); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(EEXIST)); + } + + dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR); + DNODE_STAT_BUMP(dnode_hold_free_hits); + } else { dbuf_rele(db, FTAG); - return (ENOENT); + return (SET_ERROR(EINVAL)); } - dnh = &children_dnodes->dnc_children[idx]; - zrl_add(&dnh->dnh_zrlock); - dn = dnh->dnh_dnode; - if (dn == NULL) - dn = dnode_create(os, dn_block_begin + idx, db, object, dnh); - - mutex_enter(&dn->dn_mtx); - type = dn->dn_type; - if (dn->dn_free_txg || - ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || - ((flag & DNODE_MUST_BE_FREE) && - (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) { + if (dn->dn_free_txg) { + DNODE_STAT_BUMP(dnode_hold_free_txg); + type = dn->dn_type; mutex_exit(&dn->dn_mtx); - zrl_remove(&dnh->dnh_zrlock); + dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); - return ((flag & DNODE_MUST_BE_ALLOCATED) ? ENOENT : EEXIST); + return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ? + ENOENT : EEXIST)); } + if (refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dnh); + mutex_exit(&dn->dn_mtx); /* Now we can rely on the hold to prevent the dnode from moving. */ - zrl_remove(&dnh->dnh_zrlock); + dnode_slots_rele(dnc, idx, slots); DNODE_VERIFY(dn); ASSERT3P(dn->dn_dbuf, ==, db); @@ -1451,7 +1644,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) /* * If we are already marked dirty, we're done. 
*/ - if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { + if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { multilist_sublist_unlock(mls); return; } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c index 551c44aa3f2..a37607e0e30 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -554,6 +554,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) if (dn->dn_allocated_txg != dn->dn_free_txg) dmu_buf_will_dirty(&dn->dn_dbuf->db, tx); bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots); + dnode_free_interior_slots(dn); mutex_enter(&dn->dn_mtx); dn->dn_type = DMU_OT_NONE; @@ -561,6 +562,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_have_spill = B_FALSE; + dn->dn_num_slots = 1; mutex_exit(&dn->dn_mtx); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c index ee7852a0df0..9f9cdce8fbd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c @@ -660,6 +660,9 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, + dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize); + bonuslen = DN_BONUS_SIZE(dnodesize); + /* first determine bonus header size and sum of all attributes */ hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus, SA_BONUS, bonuslen, &i, &used, &spilling); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c index ff8eeb1b2db..6ba5bed27b2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -1109,10 +1109,10 @@ spa_aux_activate(vdev_t *vd, avl_tree_t *avl) /* * Spares are tracked globally due to the following constraints: * - * - A spare may be part of multiple pools. - * - A spare may be added to a pool even if it's actively in use within + * - A spare may be part of multiple pools. + * - A spare may be added to a pool even if it's actively in use within * another pool. - * - A spare in use in any pool can only be the source of a replacement if + * - A spare in use in any pool can only be the source of a replacement if * the target is a spare in the same pool. * * We keep track of all spares on the system through the use of a reference @@ -2255,7 +2255,6 @@ spa_maxdnodesize(spa_t *spa) return (DNODE_MIN_SIZE); } - /* * Returns the txg that the last device removal completed. No indirect mappings * have been added since this txg.
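
The reworked dnode_hold_impl() above keys everything off the DN_SLOT_* sentinel values that the sys/dnode.h hunk below introduces: slot state is stored directly in the dnode_handle_t pointer field, with small integer tags for uninitialized, free, allocated, and interior slots, and any value above DN_SLOT_INTERIOR treated as a real dnode_t pointer. The standalone C sketch below is an editorial illustration, not code from the patch; the DN_SLOT_* macros are copied verbatim from the diff, while slot_state_name() and the fake dnode are simplified stand-ins for the kernel structures.

#include <stdio.h>

#define	DN_SLOT_UNINIT		((void *)NULL)	/* Uninitialized */
#define	DN_SLOT_FREE		((void *)1UL)	/* Free slot */
#define	DN_SLOT_ALLOCATED	((void *)2UL)	/* Allocated slot */
#define	DN_SLOT_INTERIOR	((void *)3UL)	/* Interior allocated slot */
#define	DN_SLOT_IS_PTR(dn)	((void *)dn > DN_SLOT_INTERIOR)
#define	DN_SLOT_IS_VALID(dn)	((void *)dn != NULL)

static const char *
slot_state_name(void *slot)
{
	if (slot == DN_SLOT_UNINIT)
		return ("UNINIT");
	if (slot == DN_SLOT_FREE)
		return ("FREE");
	if (slot == DN_SLOT_ALLOCATED)
		return ("ALLOCATED");
	if (slot == DN_SLOT_INTERIOR)
		return ("INTERIOR");
	return ("in-core dnode_t");
}

int
main(void)
{
	int fake_dnode;		/* stand-in for a real dnode_t */
	void *slots[] = { DN_SLOT_UNINIT, DN_SLOT_FREE, DN_SLOT_ALLOCATED,
	    DN_SLOT_INTERIOR, &fake_dnode };

	for (int i = 0; i < 5; i++) {
		printf("%-16s valid=%d is_ptr=%d\n",
		    slot_state_name(slots[i]),
		    DN_SLOT_IS_VALID(slots[i]) ? 1 : 0,
		    DN_SLOT_IS_PTR(slots[i]) ? 1 : 0);
	}
	return (0);
}

Packing the state tags below any valid kernel pointer lets dnode_hold_impl() classify a slot with a single comparison, without adding a separate state field to dnode_handle_t.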
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h index b23ce019437..744914a3058 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -823,7 +823,7 @@ typedef struct dmu_object_info { uint8_t doi_checksum; uint8_t doi_compress; uint8_t doi_nblkptr; - uint8_t doi_pad[4]; + uint8_t doi_pad[4]; uint64_t doi_dnodesize; uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ uint64_t doi_max_offset; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h index 44bd6a35177..5cf7aea4711 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h @@ -163,7 +163,7 @@ extern "C" { * dn_allocated_txg * dn_free_txg * dn_assigned_txg - * dd_assigned_tx + * dn_dirty_txg * dn_notxholds * dn_dirtyctx * dn_dirtyctx_firstset diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h index f692dae90fe..3028f043656 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h @@ -130,7 +130,11 @@ struct objset { /* Protected by os_obj_lock */ kmutex_t os_obj_lock; - uint64_t os_obj_next; + uint64_t os_obj_next_chunk; + + /* Per-CPU next object to allocate, protected by atomic ops. */ + uint64_t *os_obj_next_percpu; + int os_obj_next_percpu_len; /* Protected by os_lock */ kmutex_t os_lock; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h index 74acef0ae19..561bcdb29a8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h @@ -96,9 +96,16 @@ extern "C" { #define DN_SLOTS_TO_BONUSLEN(slots) DN_BONUS_SIZE((slots) << DNODE_SHIFT) #define DN_OLD_MAX_BONUSLEN (DN_BONUS_SIZE(DNODE_MIN_SIZE)) #define DN_MAX_NBLKPTR ((DNODE_MIN_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) -#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) +#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) #define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1) -#define DN_KILL_SPILLBLK (1) +#define DN_KILL_SPILLBLK (1) + +#define DN_SLOT_UNINIT ((void *)NULL) /* Uninitialized */ +#define DN_SLOT_FREE ((void *)1UL) /* Free slot */ +#define DN_SLOT_ALLOCATED ((void *)2UL) /* Allocated slot */ +#define DN_SLOT_INTERIOR ((void *)3UL) /* Interior allocated slot */ +#define DN_SLOT_IS_PTR(dn) ((void *)dn > DN_SLOT_INTERIOR) +#define DN_SLOT_IS_VALID(dn) ((void *)dn != NULL) #define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) #define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) @@ -120,7 +127,7 @@ extern "C" { ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? \ (uint8_t *)DN_SPILL_BLKPTR(dnp) - (uint8_t *)DN_BONUS(dnp) : \ (uint8_t *)(dnp + (dnp->dn_extra_slots + 1)) - (uint8_t *)DN_BONUS(dnp)) - + #define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \ (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT) @@ -143,6 +150,57 @@ enum dnode_dirtycontext { /* Does dnode have a SA spill blkptr in bonus?
*/ #define DNODE_FLAG_SPILL_BLKPTR (1<<2) +/* + * VARIABLE-LENGTH (LARGE) DNODES + * + * The motivation for variable-length dnodes is to eliminate the overhead + * associated with using spill blocks. Spill blocks are used to store + * system attribute data (i.e. file metadata) that does not fit in the + * dnode's bonus buffer. By allowing a larger bonus buffer area the use of + * a spill block can be avoided. Spill blocks potentially incur an + * additional read I/O for every dnode in a dnode block. As a worst case + * example, reading 32 dnodes from a 16k dnode block and all of the spill + * blocks could issue 33 separate reads. Now suppose those dnodes have size + * 1024 and therefore don't need spill blocks. Then the worst case number + * of blocks read is reduced from 33 to two--one per dnode block. + * + * ZFS-on-Linux systems that make heavy use of extended attributes benefit + * from this feature. In particular, ZFS-on-Linux supports the xattr=sa + * dataset property which allows file extended attribute data to be stored + * in the dnode bonus buffer as an alternative to the traditional + * directory-based format. Workloads such as SELinux and the Lustre + * distributed filesystem often store enough xattr data to force spill + * blocks when xattr=sa is in effect. Large dnodes may therefore provide a + * performance benefit to such systems. Other use cases that benefit from + * this feature include files with large ACLs and symbolic links with long + * target names. + * + * The size of a dnode may be a multiple of 512 bytes up to the size of a + * dnode block (currently 16384 bytes). The dn_extra_slots field of the + * on-disk dnode_phys_t structure describes the size of the physical dnode + * on disk. The field represents how many "extra" dnode_phys_t slots a + * dnode consumes in its dnode block. This convention results in a value of + * 0 for 512 byte dnodes which preserves on-disk format compatibility with + * older software which doesn't support large dnodes. + * + * Similarly, the in-memory dnode_t structure has a dn_num_slots field + * to represent the total number of dnode_phys_t slots consumed on disk. + * Thus dn->dn_num_slots is 1 greater than the corresponding + * dnp->dn_extra_slots. This difference in convention was adopted + * because, unlike on-disk structures, backward compatibility is not a + * concern for in-memory objects, so we used a more natural way to + * represent size for a dnode_t. + * + * The default size for newly created dnodes is determined by the value of + * the "dnodesize" dataset property. By default the property is set to + * "legacy" which is compatible with older software. Setting the property + * to "auto" will allow the filesystem to choose the most suitable dnode + * size. Currently this just sets the default dnode size to 1k, but future + * code improvements could dynamically choose a size based on observed + * workload patterns. Dnodes of varying sizes can coexist within the same + * dataset and even within the same dnode block. + */ + typedef struct dnode_phys { uint8_t dn_type; /* dmu_object_type_t */ uint8_t dn_indblkshift; /* ln2(indirect block size) */ @@ -169,22 +227,6 @@ typedef struct dnode_phys { * protected properly. */ uint64_t dn_pad3[4]; - /* - * The tail region is 448 bytes for a 512 byte dnode, and - * correspondingly larger for larger dnode sizes. The spill - * block pointer, when present, is always at the end of the tail - * region.
There are three ways this space may be used, using - a 512 byte dnode for this diagram: - * - * 0 64 128 192 256 320 384 448 (offset) - * +---------------+---------------+---------------+-------+ - * | dn_blkptr[0] | dn_blkptr[1] | dn_blkptr[2] | / | - * +---------------+---------------+---------------+-------+ - * | dn_blkptr[0] | dn_bonus[0..319] | - * +---------------+-----------------------+---------------+ - * | dn_blkptr[0] | dn_bonus[0..191] | dn_spill | - * +---------------+-----------------------+---------------+ - */ union { blkptr_t dn_blkptr[1+DN_OLD_MAX_BONUSLEN/sizeof (blkptr_t)]; struct { blkptr_t __dn_ignore1; uint8_t __dn_ignore2[DN_OLD_MAX_BONUSLEN]; }; struct { blkptr_t __dn_ignore3; uint8_t __dn_ignore4[DN_OLD_MAX_BONUSLEN - sizeof (blkptr_t)]; blkptr_t dn_spill; - }; + }; }; } dnode_phys_t; @@ -261,6 +303,7 @@ struct dnode { uint64_t dn_allocated_txg; uint64_t dn_free_txg; uint64_t dn_assigned_txg; + uint64_t dn_dirty_txg; /* txg dnode was last dirtied */ kcondvar_t dn_notxholds; enum dnode_dirtycontext dn_dirtyctx; uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */ @@ -361,8 +404,12 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off, int minlvl, uint64_t blkfill, uint64_t txg); void dnode_evict_dbufs(dnode_t *dn); void dnode_evict_bonus(dnode_t *dn); +void dnode_free_interior_slots(dnode_t *dn); boolean_t dnode_needs_remap(const dnode_t *dn); +#define DNODE_IS_DIRTY(_dn) \ + ((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa)) + #define DNODE_IS_CACHEABLE(_dn) \ ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ (DMU_OT_IS_METADATA((_dn)->dn_type) && \ @@ -372,6 +419,140 @@ boolean_t dnode_needs_remap(const dnode_t *dn); ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA) +/* + * Used for dnodestats kstat. + */ +typedef struct dnode_stats { + /* + * Number of failed attempts to hold a meta dnode dbuf. + */ + kstat_named_t dnode_hold_dbuf_hold; + /* + * Number of failed attempts to read a meta dnode dbuf. + */ + kstat_named_t dnode_hold_dbuf_read; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was able + * to hold the requested object number which was allocated. This is + * the common case when looking up any allocated object number. + */ + kstat_named_t dnode_hold_alloc_hits; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not + * able to hold the requested object number because it was not allocated. + */ + kstat_named_t dnode_hold_alloc_misses; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not + * able to hold the requested object number because the object number + * refers to an interior large dnode slot. + */ + kstat_named_t dnode_hold_alloc_interior; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) needed + * to retry acquiring slot zrl locks due to contention. + */ + kstat_named_t dnode_hold_alloc_lock_retry; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) did not + * need to create the dnode because another thread did so after + * dropping the read lock but before acquiring the write lock. + */ + kstat_named_t dnode_hold_alloc_lock_misses; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) found + * a free dnode instantiated by dnode_create() but not yet allocated + * by dnode_allocate(). + */ + kstat_named_t dnode_hold_alloc_type_none; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was able + * to hold the requested range of free dnode slots.
+ */ + kstat_named_t dnode_hold_free_hits; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not + * able to hold the requested range of free dnode slots because + * at least one slot was allocated. + */ + kstat_named_t dnode_hold_free_misses; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not + * able to hold the requested range of free dnode slots because + * after acquiring the zrl lock at least one slot was allocated. + */ + kstat_named_t dnode_hold_free_lock_misses; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) needed + * to retry acquiring slot zrl locks due to contention. + */ + kstat_named_t dnode_hold_free_lock_retry; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested + * a range of dnode slots which were held by another thread. + */ + kstat_named_t dnode_hold_free_refcount; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested + * a range of dnode slots which would overflow the dnode_phys_t. + */ + kstat_named_t dnode_hold_free_overflow; + /* + * Number of times a dnode_hold(...) was attempted on a dnode + * which had already been unlinked in an earlier txg. + */ + kstat_named_t dnode_hold_free_txg; + /* + * Number of times dnode_free_interior_slots() needed to retry + * acquiring a slot zrl lock due to contention. + */ + kstat_named_t dnode_free_interior_lock_retry; + /* + * Number of new dnodes allocated by dnode_allocate(). + */ + kstat_named_t dnode_allocate; + /* + * Number of dnodes re-allocated by dnode_reallocate(). + */ + kstat_named_t dnode_reallocate; + /* + * Number of meta dnode dbufs evicted. + */ + kstat_named_t dnode_buf_evict; + /* + * Number of times dmu_object_alloc*() reached the end of the existing + * object ID chunk and advanced to a new one. + */ + kstat_named_t dnode_alloc_next_chunk; + /* + * Number of times multiple threads attempted to allocate a dnode + * from the same block of free dnodes. + */ + kstat_named_t dnode_alloc_race; + /* + * Number of times dmu_object_alloc*() was forced to advance to the + * next meta dnode dbuf due to an error from dmu_object_next(). + */ + kstat_named_t dnode_alloc_next_block; + /* + * Statistics for tracking dnodes which have been moved. + */ + kstat_named_t dnode_move_invalid; + kstat_named_t dnode_move_recheck1; + kstat_named_t dnode_move_recheck2; + kstat_named_t dnode_move_special; + kstat_named_t dnode_move_handle; + kstat_named_t dnode_move_rwlock; + kstat_named_t dnode_move_active; +} dnode_stats_t; + +extern dnode_stats_t dnode_stats; + +#define DNODE_STAT_INCR(stat, val) \ + atomic_add_64(&dnode_stats.stat.value.ui64, (val)); +#define DNODE_STAT_BUMP(stat) \ + DNODE_STAT_INCR(stat, 1); + #ifdef ZFS_DEBUG /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h index e444e2fb572..4bea074b545 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h @@ -101,7 +101,7 @@ typedef struct sa_lot { sa_attr_type_t *lot_attrs; /* array of attr #'s */ uint32_t lot_var_sizes; /* how many aren't fixed size */ uint32_t lot_attr_count; /* total attr count */ - list_t lot_idx_tab; /* should be only a couple of entries */ + list_t lot_idx_tab; /* should be only a couple of entries */ int lot_instance; /* used with lot_hash to identify entry */ } sa_lot_t; @@ -134,7 +134,7 @@ typedef struct sa_idx_tab { * adding a completely new attribute is a very rare operation. 
*/ struct sa_os { - kmutex_t sa_lock; + kmutex_t sa_lock; boolean_t sa_need_attr_registration; boolean_t sa_force_spill; uint64_t sa_master_obj; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h index 3ce99a4f912..fa66788ae70 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h @@ -139,6 +139,8 @@ uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, dmu_tx_t *tx); +uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, + uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx); /* * Initialize an already-allocated object. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h index a3c0e4c31d0..38fda1d4058 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h @@ -97,6 +97,7 @@ extern "C" { #endif #include #include +#include #include #include diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h index 7cd29431698..756800f8afd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -94,7 +94,7 @@ typedef enum drr_headertype { /* flag #21 is reserved for a Delphix feature */ #define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22) #define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 23) -/* flag #24 is reserved for the raw send (encryption) feature */ +/* flag #24 is reserved for the raw send feature */ /* flag #25 is reserved for the ZSTD compression feature */ /* @@ -120,7 +120,7 @@ typedef enum dmu_send_resume_token_version { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * | reserved | feature-flags |C|S| + * | reserved | feature-flags |C|S| * +-------+-------+-------+-------+-------+-------+-------+-------+ * * The low order two bits indicate the header type: SUBSTREAM (0x1) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h index 3f0b771df48..040dfaa29a9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h @@ -157,7 +157,7 @@ typedef enum zil_create { #define TX_ACL 13 /* Set ACL */ #define TX_CREATE_ACL 14 /* create with ACL */ #define TX_CREATE_ATTR 15 /* create + attrs */ -#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */ +#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */ #define TX_MKDIR_ACL 17 /* mkdir with ACL */ #define TX_MKDIR_ATTR 18 /* mkdir with attr */ #define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ @@ -436,7 +436,7 @@ extern void zil_commit_impl(zilog_t *zilog, uint64_t oid); extern int zil_reset(const char *osname, void *txarg); extern int zil_claim(struct dsl_pool *dp, struct dsl_dataset *ds, void *txarg); -extern int zil_check_log_chain(struct dsl_pool *dp, +extern int zil_check_log_chain(struct dsl_pool *dp, struct dsl_dataset *ds, void *tx); extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); extern void zil_clean(zilog_t *zilog,
uint64_t synced_txg); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c index 2e746f9398f..a0cadaae949 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c @@ -985,8 +985,8 @@ uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx) { - uint64_t new_obj; - + uint64_t new_obj; + VERIFY((new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0, dnodesize, tx)) > 0); VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c index 486415e7bf2..133989eca32 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c @@ -726,9 +726,9 @@ int zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) - { - int err; - +{ + int err; + err = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen, dnodesize, tx); if (err != 0) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c index e0bc7422c5a..6332559edd7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c @@ -892,7 +892,7 @@ zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, int entry_type; mode_t mode; mode_t seen = 0; - zfs_ace_hdr_t *acep = NULL; + zfs_ace_hdr_t *acep = NULL; uint64_t who; uint16_t iflags, type; uint32_t access_mask; @@ -1320,12 +1320,12 @@ zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim, uint64_t who; int new_count, new_bytes; int ace_size; - int entry_type; + int entry_type; uint16_t iflags, type; uint32_t access_mask; zfs_acl_node_t *newnode; - size_t abstract_size = aclp->z_ops.ace_abstract_size(); - void *zacep; + size_t abstract_size = aclp->z_ops.ace_abstract_size(); + void *zacep; boolean_t isdir; trivial_acl_t masks; @@ -1773,7 +1773,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) zfs_acl_t *aclp; ulong_t mask; int error; - int count = 0; + int count = 0; int largeace = 0; mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | @@ -2104,7 +2104,7 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, zfs_acl_t *aclp; int error; uid_t uid = crgetuid(cr); - uint64_t who; + uint64_t who; uint16_t type, iflags; uint16_t entry_type; uint32_t access_mask; @@ -2378,9 +2378,9 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) uint32_t working_mode; int error; int is_attr; - boolean_t check_privs; + boolean_t check_privs; znode_t *xzp; - znode_t *check_zp = zp; + znode_t *check_zp = zp; mode_t needed_bits; uid_t owner; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c index 9bf7643258c..7e4e9cf85f5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c @@ -310,7 +310,7 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) objid = LR_FOID_GET_OBJ(lr->lr_foid); dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; - + xva_init(&xva); 
zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); @@ -322,7 +322,6 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) * zfs_create() has no concept of these attributes, so we smuggle * the values inside the vattr's otherwise unused va_ctime, * va_nblocks, and va_fsid fields. */ - ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); xva.xva_vattr.va_nblocks = lr->lr_gen; @@ -464,8 +463,8 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) * eventually end up in zfs_mknode(), which assigns the object's * creation time, generation number, and dnode slot count. The * generic zfs_create() has no concept of these attributes, so - * we smuggle the values inside * the vattr's otherwise unused - * va_ctime, va_nblocks, and va_nlink fields. + * we smuggle the values inside the vattr's otherwise unused + * va_ctime, va_nblocks, and va_fsid fields. */ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); xva.xva_vattr.va_nblocks = lr->lr_gen; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c index c94cef7d456..40a7798149a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c @@ -832,7 +832,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, } ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); - VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); + VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); /* * If this is the root, fix up the half-initialized parent pointer @@ -1862,14 +1862,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) DMU_OT_NONE, 0, tx); ASSERT(error == 0); - /* - * Give dmu_object_alloc() a hint about where to start - * allocating new objects. Otherwise, since the metadnode's - * dnode_phys_t structure isn't initialized yet, dmu_object_next() - * would fail and we'd have to skip to the next dnode block. - */ - os->os_obj_next = moid + 1; - /* * Set starting attributes. */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c index 2f84c7a4a3b..d8b7498a71a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c @@ -63,9 +63,9 @@ * representation, and the on-disk representation). The on-disk format * consists of 3 parts: * - * - a single, per-dataset, ZIL header; which points to a chain of - * - zero or more ZIL blocks; each of which contains - * - zero or more ZIL records + * - a single, per-dataset, ZIL header; which points to a chain of - * - zero or more ZIL blocks; each of which contains + * - zero or more ZIL records * * A ZIL record holds the information necessary to replay a single * system call transaction. A ZIL block can hold many ZIL records, and @@ -3097,8 +3097,10 @@ zil_close(zilog_t *zilog) if (txg) txg_wait_synced(zilog->zl_dmu_pool, txg); + if (zilog_is_dirty(zilog)) + zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg); if (txg < spa_freeze_txg(zilog->zl_spa)) - ASSERT(!zilog_is_dirty(zilog)); + VERIFY(!zilog_is_dirty(zilog)); zilog->zl_get_data = NULL; -- 2.45.0
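
As a closing illustration (not part of the patch), the standalone program below works through the size arithmetic described in the VARIABLE-LENGTH (LARGE) DNODES comment added to sys/dnode.h above: dn_extra_slots counts the extra on-disk 512-byte slots, dn_num_slots is the in-core total, and the bonus buffer grows with the dnode. The constant values mirror the ZFS headers (DNODE_SHIFT, DNODE_CORE_SIZE, SPA_BLKPTRSHIFT, and the DN_BONUS_SIZE macro); the loop itself is purely illustrative.

#include <stdio.h>

#define	DNODE_SHIFT		9	/* 512 bytes, the minimum dnode size */
#define	DNODE_CORE_SIZE		64	/* fixed dnode header */
#define	SPA_BLKPTRSHIFT		7	/* 128-byte block pointer */
#define	DN_BONUS_SIZE(dnsize) \
	((dnsize) - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))

int
main(void)
{
	for (int dnodesize = 512; dnodesize <= 16384; dnodesize *= 2) {
		int num_slots = dnodesize >> DNODE_SHIFT;	/* dn_num_slots */
		int extra_slots = num_slots - 1;		/* dn_extra_slots */

		printf("dnodesize %5d: dn_num_slots %2d dn_extra_slots %2d "
		    "max bonus %5d bytes\n", dnodesize, num_slots,
		    extra_slots, DN_BONUS_SIZE(dnodesize));
	}
	return (0);
}

A 512-byte dnode yields the legacy 320-byte bonus buffer (DN_OLD_MAX_BONUSLEN), and each additional slot adds a full 512 bytes of bonus space, which is how xattr=sa workloads can avoid spill blocks entirely with, for example, 1k dnodes.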