From ec8d66f68c7a6b7b425f785a043d320c0cb706a6 Mon Sep 17 00:00:00 2001 From: delphij Date: Fri, 5 Jul 2013 03:01:35 +0000 Subject: [PATCH] MFC r251629: 3741 zfs needs better comments Embellish the comments in various components of ZFS. Move some comments around closer to what they describe. Specifically, answer the questions: - What are some of the edge cases of the dbuf state machine? - What does a txg quiesce do? - When does the DMU notify threads waiting on txg's that they may proceed? - How do the calculations for RAIDZ map allocations work? - What process do the RAIDZ I/O start and done callbacks follow? While here, adjust the function prototype of dmu_zfetch.c:dmu_zfetch_colinear() to match its comment which describes its return as a boolean. Submitted by: asomers, gibbs, will Reviewed by: Matthew Ahrens , Eric Schrock , Christopher Siden Sponsored by: Spectra Logic git-svn-id: svn://svn.freebsd.org/base/stable/9@252749 ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f --- .../lib/libzfs/common/libzfs_dataset.c | 5 ++ .../opensolaris/uts/common/fs/zfs/arc.c | 19 +++++- .../opensolaris/uts/common/fs/zfs/dbuf.c | 24 ++++++- .../opensolaris/uts/common/fs/zfs/dmu.c | 2 +- .../opensolaris/uts/common/fs/zfs/dmu_tx.c | 8 +++ .../uts/common/fs/zfs/dmu_zfetch.c | 15 +++-- .../opensolaris/uts/common/fs/zfs/spa.c | 2 + .../opensolaris/uts/common/fs/zfs/sys/dmu.h | 8 +++ .../opensolaris/uts/common/fs/zfs/txg.c | 14 +++- .../uts/common/fs/zfs/vdev_label.c | 1 + .../uts/common/fs/zfs/vdev_raidz.c | 65 +++++++++++++++++++ .../uts/common/fs/zfs/zfs_ctldir.c | 5 ++ 12 files changed, 156 insertions(+), 12 deletions(-) diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c index 945290f00..b06a78ded 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c @@ -4538,6 +4538,11 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) 
return (err); } +/* + * Convert the zvol's volume size to an appropriate reservation. + * Note: If this routine is updated, it is necessary to update the ZFS test + * suite's shell version in reservation.kshlib. + */ uint64_t zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index f7a0179ed..31f02be7f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -289,7 +289,18 @@ typedef struct arc_stats { kstat_named_t arcstat_deleted; kstat_named_t arcstat_stolen; kstat_named_t arcstat_recycle_miss; + /* + * Number of buffers that could not be evicted because the hash lock + * was held by another thread. The lock may not necessarily be held + * by something using the same buffer, since hash locks are shared + * by multiple buffers. + */ kstat_named_t arcstat_mutex_miss; + /* + * Number of buffers skipped because they have I/O in progress, are + * indirect prefetch buffers that have not lived long enough, or are + * not from the spa we're trying to evict from. + */ kstat_named_t arcstat_evict_skip; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; @@ -3247,6 +3258,10 @@ top: mutex_exit(hash_lock); + /* + * At this point, we have a level 1 cache miss. Try again in + * L2ARC if possible. + */ ASSERT3U(hdr->b_size, ==, size); DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, size, zbookmark_t *, zb); @@ -3488,8 +3503,8 @@ arc_buf_evict(arc_buf_t *buf) } /* - * Release this buffer from the cache. This must be done - * after a read and prior to modifying the buffer contents. + * Release this buffer from the cache, making it an anonymous buffer. This + * must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. 
*/ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index cda8c17e6..28aa33049 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -641,6 +641,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) if (!havepzio) err = zio_wait(zio); } else { + /* + * Another reader came in while the dbuf was in flight + * between UNCACHED and CACHED. Either a writer will finish + * writing the buffer (sending the dbuf to CACHED) or the + * first reader's request will reach the read_done callback + * and send the dbuf to CACHED. Otherwise, a failure + * occurred and the dbuf went to UNCACHED. + */ mutex_exit(&db->db_mtx); if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, @@ -649,6 +657,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); + /* Skip the wait per the caller's request. */ mutex_enter(&db->db_mtx); if ((flags & DB_RF_NEVERWAIT) == 0) { while (db->db_state == DB_READ || @@ -1264,7 +1273,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } /* - * Return TRUE if this evicted the dbuf. + * Undirty a buffer in the transaction group referenced by the given + * transaction. Return whether this evicted the dbuf. */ static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) @@ -2225,6 +2235,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(db->db_level > 0); DBUF_VERIFY(db); + /* Read the block if it hasn't been read yet. */ if (db->db_buf == NULL) { mutex_exit(&db->db_mtx); (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); @@ -2235,10 +2246,12 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) DB_DNODE_ENTER(db); dn = DB_DNODE(db); + /* Indirect block size must match what the dnode thinks it is. 
*/ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); DB_DNODE_EXIT(db); + /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; mutex_exit(&db->db_mtx); @@ -2629,6 +2642,7 @@ dbuf_write_override_done(zio_t *zio) dbuf_write_done(zio, NULL, db); } +/* Issue I/O to commit a dirty buffer to disk. */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { @@ -2663,11 +2677,19 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) } if (parent != dn->dn_dbuf) { + /* Our parent is an indirect block. */ + /* We have a dirty parent that has been scheduled for write. */ ASSERT(parent && parent->db_data_pending); + /* Our parent's buffer is one level closer to the dnode. */ ASSERT(db->db_level == parent->db_level-1); + /* + * We're about to modify our parent's db_data by modifying + * our block pointer, so the parent must be released. + */ ASSERT(arc_released(parent->db_buf)); zio = parent->db_data_pending->dr_zio; } else { + /* Our parent is the dnode itself. 
*/ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && db->db_blkid != DMU_SPILL_BLKID) || (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index 75716ed3d..fd0464e61 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -1839,7 +1839,7 @@ dmu_init(void) void dmu_fini(void) { - arc_fini(); + arc_fini(); /* arc depends on l2arc, so arc must go first */ l2arc_fini(); zfetch_fini(); dbuf_fini(); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c index 3eeaca64e..533842532 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -1014,6 +1014,10 @@ dmu_tx_unassign(dmu_tx_t *tx) txg_rele_to_quiesce(&tx->tx_txgh); + /* + * Walk the transaction's hold list, removing the hold on the + * associated dnode, and notifying waiters if the refcount drops to 0. + */ for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; @@ -1126,6 +1130,10 @@ dmu_tx_commit(dmu_tx_t *tx) ASSERT(tx->tx_txg != 0); + /* + * Go through the transaction's hold list and remove holds on + * associated dnodes, notifying waiters if no holds remain. 
+ */ while (txh = list_head(&tx->tx_holds)) { dnode_t *dn = txh->txh_dnode; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c index b5ca66628..8ab5e1032 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c @@ -66,11 +66,11 @@ SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RDTUN, "Number of bytes in a array_read at which we stop prefetching"); /* forward decls for static routines */ -static int dmu_zfetch_colinear(zfetch_t *, zstream_t *); +static boolean_t dmu_zfetch_colinear(zfetch_t *, zstream_t *); static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *); static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t); static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t); -static int dmu_zfetch_find(zfetch_t *, zstream_t *, int); +static boolean_t dmu_zfetch_find(zfetch_t *, zstream_t *, int); static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *); static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *); static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *); @@ -122,9 +122,9 @@ kstat_t *zfetch_ksp; * last stream, then we are probably in a strided access pattern. So * combine the two sequential streams into a single strided stream. * - * If no co-linear streams are found, return NULL. + * Returns whether co-linear streams were found. */ -static int +static boolean_t dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) { zstream_t *z_walk; @@ -344,7 +344,7 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks) * for this block read. 
If so, it starts a prefetch for the stream it * located and returns true, otherwise it returns false */ -static int +static boolean_t dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) { zstream_t *zs; @@ -669,7 +669,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) { zstream_t zst; zstream_t *newstream; - int fetched; + boolean_t fetched; int inserted; unsigned int blkshft; uint64_t blksz; @@ -695,7 +695,8 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) ZFETCHSTAT_BUMP(zfetchstat_hits); } else { ZFETCHSTAT_BUMP(zfetchstat_misses); - if (fetched = dmu_zfetch_colinear(zf, &zst)) { + fetched = dmu_zfetch_colinear(zf, &zst); + if (fetched) { ZFETCHSTAT_BUMP(zfetchstat_colinear_hits); } else { ZFETCHSTAT_BUMP(zfetchstat_colinear_misses); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index c23fa0a7d..374005656 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -27,6 +27,8 @@ */ /* + * SPA: Storage Pool Allocator + * * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a * pool. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h index b67ff2d5a..eb7ed2416 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -411,6 +411,8 @@ void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp, * object must be held in an assigned transaction before calling * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus * buffer as well. You must release your hold with dmu_buf_rele(). + * + * Returns ENOENT, EIO, or 0. 
*/ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); @@ -666,8 +668,14 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); +/* Like dmu_object_info, but faster if you have a held dnode in hand. */ void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); +/* Like dmu_object_info, but faster if you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); +/* + * Like dmu_object_info_from_db, but faster still when you only care about + * the size. This is specifically optimized for zfs_getattr(). + */ void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c index 46b5c34d3..5a83ee2df 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c @@ -348,6 +348,12 @@ txg_rele_to_sync(txg_handle_t *th) th->th_cpu = NULL; /* defensive */ } +/* + * Blocks until all transactions in the group are committed. + * + * On return, the transaction group has reached a stable state in which it can + * then be passed off to the syncing context. + */ static void txg_quiesce(dsl_pool_t *dp, uint64_t txg) { @@ -397,6 +403,9 @@ txg_do_callbacks(void *arg) /* * Dispatch the commit callbacks registered on this txg to worker threads. + * + * If no callbacks are registered for a given TXG, nothing happens. + * This function creates a taskq for the associated pool, if needed. 
*/ static void txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) @@ -407,7 +416,10 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) for (c = 0; c < max_ncpus; c++) { tx_cpu_t *tc = &tx->tx_cpu[c]; - /* No need to lock tx_cpu_t at this point */ + /* + * No need to lock tx_cpu_t at this point, since this can + * only be called once a txg has been synced. + */ int g = txg & TXG_MASK; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c index 92ae0ed7f..f4e86a12d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c @@ -1044,6 +1044,7 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); } +/* Sync the uberblocks to all vdevs in svd[] */ int vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c index 1cc343a70..0a107debc 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c @@ -433,23 +433,50 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = { vdev_raidz_cksum_report }; +/* + * Divides the IO evenly across all child vdevs; usually, dcols is + * the number of children in the target vdev. + */ static raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, uint64_t nparity) { raidz_map_t *rm; + /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = zio->io_offset >> unit_shift; + /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = zio->io_size >> unit_shift; + /* The first column for this stripe. */ uint64_t f = b % dcols; + /* The starting byte offset on each child vdev. 
*/ uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + */ q = s / (dcols - nparity); + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ r = s - q * (dcols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ tot = s + nparity * (q + (r == 0 ? 0 : 1)); + /* acols: The columns that will be accessed. */ + /* scols: The columns that will be accessed or skipped. */ if (q == 0) { + /* Our I/O request doesn't span all child vdevs. */ acols = bc; scols = MIN(dcols, roundup(bc, nparity + 1)); } else { @@ -1529,6 +1556,23 @@ vdev_raidz_child_done(zio_t *zio) rc->rc_skipped = 0; } +/* + * Start an IO operation on a RAIDZ VDev + * + * Outline: + * - For write operations: + * 1. Generate the parity data + * 2. Create child zio write operations to each column's vdev, for both + * data and parity. + * 3. If the column skips any sectors for padding, create optional dummy + * write zio children for those areas to improve aggregation continuity. + * - For read operations: + * 1. Create child zio read operations to each data column's vdev to read + * the range of data required for zio. + * 2. If this is a scrub or resilver operation, or if any of the data + * vdevs have had errors, then create zio read operations to the parity + * columns' VDevs as well. + */ static int vdev_raidz_io_start(zio_t *zio) { @@ -1881,6 +1925,27 @@ done: return (ret); } +/* + * Complete an IO operation on a RAIDZ VDev + * + * Outline: + * - For write operations: + * 1. Check for errors on the child IOs. + * 2. 
Return, setting an error code if too few child VDevs were written + * to reconstruct the data later. Note that partial writes are + * considered successful if they can be reconstructed at all. + * - For read operations: + * 1. Check for errors on the child IOs. + * 2. If data errors occurred: + * a. Try to reassemble the data from the parity available. + * b. If we haven't yet read the parity drives, read them now. + * c. If all parity drives have been read but the data still doesn't + * reassemble with a correct checksum, then try combinatorial + * reconstruction. + * d. If that doesn't work, return an error. + * 3. If there were unexpected errors or this is a resilver operation, + * rewrite the vdevs that had errors. + */ static void vdev_raidz_io_done(zio_t *zio) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c index c155e4764..3961c48f3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c @@ -633,6 +633,11 @@ static struct vop_vector zfsctl_ops_root = { .vop_fid = zfsctl_common_fid, }; +/* + * Gets the full dataset name that corresponds to the given snapshot name + * Example: + * zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1" + */ static int zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) { -- 2.45.0