From 674c99a47f97818d17f51c0c548a3a603717c9e4 Mon Sep 17 00:00:00 2001 From: delphij Date: Wed, 19 Mar 2014 23:46:59 +0000 Subject: [PATCH] MFC r260138: MFV r242733: 3306 zdb should be able to issue reads in parallel 3321 'zpool reopen' command should be documented in the man page and help message illumos/illumos-gate@31d7e8fa33fae995f558673adb22641b5aa8b6e1 FreeBSD porting notes: the kernel part of this changeset depends on Solaris buf(9S) interfaces and are not really applicable for our use. vdev_disk.c is patched as-is to reduce diverge from upstream, but vdev_file.c is left intact. git-svn-id: svn://svn.freebsd.org/base/stable/9@263394 ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f --- cddl/contrib/opensolaris/cmd/zdb/zdb.8 | 20 ++- cddl/contrib/opensolaris/cmd/zdb/zdb.c | 120 +++++++++++++----- cddl/contrib/opensolaris/cmd/zpool/zpool.8 | 12 +- .../opensolaris/cmd/zpool/zpool_main.c | 27 +++- .../opensolaris/lib/libzpool/common/kernel.c | 49 ++++++- .../lib/libzpool/common/sys/zfs_context.h | 32 +++++ .../uts/common/fs/zfs/sys/vdev_impl.h | 10 ++ .../opensolaris/uts/common/fs/zfs/vdev_disk.c | 19 +-- 8 files changed, 230 insertions(+), 59 deletions(-) diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.8 b/cddl/contrib/opensolaris/cmd/zdb/zdb.8 index e036b964f..fbf1fce4b 100644 --- a/cddl/contrib/opensolaris/cmd/zdb/zdb.8 +++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.8 @@ -14,11 +14,12 @@ .\" .\" Copyright 2012, Richard Lowe. .\" Copyright (c) 2012, Marcelo Araujo . +.\" Copyright (c) 2012 by Delphix. All rights reserved. .\" All Rights Reserved. .\" .\" $FreeBSD$ .\" -.Dd May 10, 2012 +.Dd December 31, 2013 .Dt ZDB 8 .Os .Sh NAME @@ -29,27 +30,35 @@ .Op Fl CumdibcsDvhLXFPA .Op Fl e Op Fl p Ar path... .Op Fl t Ar txg +.Op Fl U Ar cache +.Op Fl M Ar inflight I/Os .Ar poolname .Op Ar object ... .Nm .Op Fl divPA .Op Fl e Op Fl p Ar path... +.Op Fl U Ar cache .Ar dataset .Op Ar object ... .Nm .Fl m Op Fl LXFPA .Op Fl t Ar txg .Op Fl e Op Fl p Ar path... +.Op Fl U Ar cache .Ar poolname .Nm .Fl R Op Fl A .Op Fl e Op Fl p Ar path... +.Op Fl U Ar cache +.Ar poolname .Ar poolname .Ar vdev Ns : Ns Ar offset Ns : Ns Ar size Ns Op Ns : Ns Ar flags .Nm .Fl S .Op Fl AP .Op Fl e Op Fl p Ar path... +.Op Fl U Ar cache +.Ar poolname .Ar poolname .Nm .Fl l @@ -205,6 +214,11 @@ flag specifies the path under which devices are to be searched. .It Fl F Attempt to make an unreadable pool readable by trying progressively older transactions. +.It Fl M Ar inflight I/Os +Limit the number of outstanding checksum I/Os to the specified value. +The default value is 200. This option affects the performance of the +.Fl c +option. .It Fl P Print numbers in an unscaled form more amenable to parsing, eg. 1000000 rather than 1M. @@ -218,9 +232,7 @@ options for a means to see the available uberblocks and their associated transaction numbers. .It Fl U Ar cachefile Use a cache file other than -.Pa /etc/zfs/zpool.cache . -This option is only valid with -.Fl C +.Pa /boot/zfs/zpool.cache . .It Fl v Enable verbosity. Specify multiple times for increased verbosity. diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c index 02a62653f..9294cd643 100644 --- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c +++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c @@ -89,6 +89,7 @@ extern void dump_intent_log(zilog_t *); uint64_t *zopt_object = NULL; int zopt_objects = 0; libzfs_handle_t *g_zfs; +uint64_t max_inflight = 200; /* * These libumem hooks provide a reasonable set of defaults for the allocator's @@ -110,16 +111,17 @@ static void usage(void) { (void) fprintf(stderr, - "Usage: %s [-CumdibcsDvhLXFPA] [-t txg] [-e [-p path...]]" - "poolname [object...]\n" - " %s [-divPA] [-e -p path...] dataset [object...]\n" - " %s -m [-LXFPA] [-t txg] [-e [-p path...]]" - "poolname [vdev [metaslab...]]\n" - " %s -R [-A] [-e [-p path...]] poolname " - "vdev:offset:size[:flags]\n" - " %s -S [-PA] [-e [-p path...]] poolname\n" - " %s -l [-uA] device\n" - " %s -C [-A] [-U config]\n\n", + "Usage: %s [-CumdibcsDvhLXFPA] [-t txg] [-e [-p path...]] " + "[-U config] [-M inflight I/Os] poolname [object...]\n" + " %s [-divPA] [-e -p path...] [-U config] dataset " + "[object...]\n" + " %s -m [-LXFPA] [-t txg] [-e [-p path...]] [-U config] " + "poolname [vdev [metaslab...]]\n" + " %s -R [-A] [-e [-p path...]] poolname " + "vdev:offset:size[:flags]\n" + " %s -S [-PA] [-e [-p path...]] [-U config] poolname\n" + " %s -l [-uA] device\n" + " %s -C [-A] [-U config]\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " @@ -164,6 +166,8 @@ usage(void) (void) fprintf(stderr, " -P print numbers in parseable form\n"); (void) fprintf(stderr, " -t -- highest txg to use when " "searching for uberblocks\n"); + (void) fprintf(stderr, " -M -- " + "specify the maximum number of checksumming I/Os [default is 200]"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); @@ -2154,6 +2158,47 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); } +/* ARGSUSED */ +static void +zdb_blkptr_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + int ioerr = zio->io_error; + zdb_cb_t *zcb = zio->io_private; + zbookmark_t *zb = &zio->io_bookmark; + + zio_data_buf_free(zio->io_data, zio->io_size); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_inflight--; + cv_broadcast(&spa->spa_scrub_io_cv); + + if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + char blkbuf[BP_SPRINTF_LEN]; + + zcb->zcb_haderrors = 1; + zcb->zcb_errors[ioerr]++; + + if (dump_opt['b'] >= 2) + sprintf_blkptr(blkbuf, bp); + else + blkbuf[0] = '\0'; + + (void) printf("zdb_blkptr_cb: " + "Got error %d reading " + "<%llu, %llu, %lld, %llx> %s -- skipping\n", + ioerr, + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (u_longlong_t)zb->zb_level, + (u_longlong_t)zb->zb_blkid, + blkbuf); + } + mutex_exit(&spa->spa_scrub_lock); +} + +/* ARGSUSED */ static int zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) @@ -2174,38 +2219,22 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) { - int ioerr; size_t size = BP_GET_PSIZE(bp); - void *data = malloc(size); + void *data = zio_data_buf_alloc(size); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) flags |= ZIO_FLAG_SPECULATIVE; - ioerr = zio_wait(zio_read(NULL, spa, bp, data, size, - NULL, NULL, ZIO_PRIORITY_ASYNC_READ, flags, zb)); + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > max_inflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_scrub_inflight++; + mutex_exit(&spa->spa_scrub_lock); - free(data); - if (ioerr && !(flags & ZIO_FLAG_SPECULATIVE)) { - zcb->zcb_haderrors = 1; - zcb->zcb_errors[ioerr]++; - - if (dump_opt['b'] >= 2) - sprintf_blkptr(blkbuf, bp); - else - blkbuf[0] = '\0'; - - (void) printf("zdb_blkptr_cb: " - "Got error %d reading " - "<%llu, %llu, %lld, %llx> %s -- skipping\n", - ioerr, - (u_longlong_t)zb->zb_objset, - (u_longlong_t)zb->zb_object, - (u_longlong_t)zb->zb_level, - (u_longlong_t)zb->zb_blkid, - blkbuf); - } + zio_nowait(zio_read(NULL, spa, bp, data, size, + zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); } zcb->zcb_readfails = 0; @@ -2433,6 +2462,18 @@ dump_block_stats(spa_t *spa) zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); + /* + * If we've traversed the data blocks then we need to wait for those + * I/Os to complete. We leverage "The Godfather" zio to wait on + * all async I/Os to complete. + */ + if (dump_opt['c']) { + (void) zio_wait(spa->spa_async_zio_root); + spa->spa_async_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); + } + if (zcb.zcb_haderrors) { (void) printf("\nError counts:\n\n"); (void) printf("\t%5s %s\n", "errno", "count"); @@ -3203,7 +3244,7 @@ main(int argc, char **argv) dprintf_setup(&argc, argv); - while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:P")) != -1) { + while ((c = getopt(argc, argv, "bcdhilmM:suCDRSAFLXevp:t:U:P")) != -1) { switch (c) { case 'b': case 'c': @@ -3232,6 +3273,15 @@ main(int argc, char **argv) case 'v': verbose++; break; + case 'M': + max_inflight = strtoull(optarg, NULL, 0); + if (max_inflight == 0) { + (void) fprintf(stderr, "maximum number " + "of inflight I/Os must be greater " + "than 0\n"); + usage(); + } + break; case 'p': if (searchdirs == NULL) { searchdirs = umem_alloc(sizeof (char *), diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool.8 b/cddl/contrib/opensolaris/cmd/zpool/zpool.8 index ebd22c50d..0f4f4eaeb 100644 --- a/cddl/contrib/opensolaris/cmd/zpool/zpool.8 +++ b/cddl/contrib/opensolaris/cmd/zpool/zpool.8 @@ -25,7 +25,7 @@ .\" .\" $FreeBSD$ .\" -.Dd March 14, 2013 +.Dd December 31, 2013 .Dt ZPOOL 8 .Os .Sh NAME @@ -141,6 +141,9 @@ .Cm remove .Ar pool device ... .Nm +.Cm reopen +.Ar pool +.Nm .Cm replace .Op Fl f .Ar pool device @@ -1431,6 +1434,13 @@ command. Non-redundant and devices cannot be removed from a pool. .It Xo .Nm +.Cm reopen +.Ar pool +.Xc +.Pp +Reopen all the vdevs associated with the pool. +.It Xo +.Nm .Cm replace .Op Fl f .Ar pool device diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c b/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c index d236c2994..63efd5479 100644 --- a/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c +++ b/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c @@ -247,7 +247,7 @@ get_usage(zpool_help_t idx) { case HELP_REMOVE: return (gettext("\tremove ...\n")); case HELP_REOPEN: - return (""); /* Undocumented command */ + return (gettext("\treopen \n")); case HELP_SCRUB: return (gettext("\tscrub [-s] ...\n")); case HELP_STATUS: @@ -3710,22 +3710,37 @@ zpool_do_reguid(int argc, char **argv) * zpool reopen * * Reopen the pool so that the kernel can update the sizes of all vdevs. - * - * NOTE: This command is currently undocumented. If the command is ever - * exposed then the appropriate usage() messages will need to be made. */ int zpool_do_reopen(int argc, char **argv) { + int c; int ret = 0; zpool_handle_t *zhp; char *pool; + /* check options */ + while ((c = getopt(argc, argv, "")) != -1) { + switch (c) { + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + argc--; argv++; - if (argc != 1) - return (2); + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name\n")); + usage(B_FALSE); + } + + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } pool = argv[0]; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c b/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c index c4e2d2e78..19fbf4fb0 100644 --- a/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c +++ b/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c @@ -661,7 +661,7 @@ __dprintf(const char *file, const char *func, int line, const char *fmt, ...) if (dprintf_find_string("pid")) (void) printf("%d ", getpid()); if (dprintf_find_string("tid")) - (void) printf("%u ", thr_self()); + (void) printf("%ul ", thr_self()); #if 0 if (dprintf_find_string("cpu")) (void) printf("%u ", getcpuid()); @@ -1125,3 +1125,50 @@ zvol_create_minors(const char *name) return (0); } #endif + +#ifdef illumos +void +bioinit(buf_t *bp) +{ + bzero(bp, sizeof (buf_t)); +} + +void +biodone(buf_t *bp) +{ + if (bp->b_iodone != NULL) { + (*(bp->b_iodone))(bp); + return; + } + ASSERT((bp->b_flags & B_DONE) == 0); + bp->b_flags |= B_DONE; +} + +void +bioerror(buf_t *bp, int error) +{ + ASSERT(bp != NULL); + ASSERT(error >= 0); + + if (error != 0) { + bp->b_flags |= B_ERROR; + } else { + bp->b_flags &= ~B_ERROR; + } + bp->b_error = error; +} + + +int +geterror(struct buf *bp) +{ + int error = 0; + + if (bp->b_flags & B_ERROR) { + error = bp->b_error; + if (!error) + error = EIO; + } + return (error); +} +#endif diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h b/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h index 87ed1cace..6b7a56b7c 100644 --- a/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h +++ b/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h @@ -786,6 +786,38 @@ extern void cyclic_remove(cyclic_id_t); extern int cyclic_reprogram(cyclic_id_t, hrtime_t); #endif /* illumos */ +#ifdef illumos +/* + * Buf structure + */ +#define B_BUSY 0x0001 +#define B_DONE 0x0002 +#define B_ERROR 0x0004 +#define B_READ 0x0040 /* read when I/O occurs */ +#define B_WRITE 0x0100 /* non-read pseudo-flag */ + +typedef struct buf { + int b_flags; + size_t b_bcount; + union { + caddr_t b_addr; + } b_un; + + lldaddr_t _b_blkno; +#define b_lblkno _b_blkno._f + size_t b_resid; + size_t b_bufsize; + int (*b_iodone)(struct buf *); + int b_error; + void *b_private; +} buf_t; + +extern void bioinit(buf_t *); +extern void biodone(buf_t *); +extern void bioerror(buf_t *, int); +extern int geterror(buf_t *); +#endif + #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h index 307e557c0..518ebc46e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -359,6 +359,16 @@ extern void vdev_set_min_asize(vdev_t *vd); /* zdb uses this tunable, so it must be declared here to make lint happy. */ extern int zfs_vdev_cache_size; +#ifdef illumos +/* + * The vdev_buf_t is used to translate between zio_t and buf_t, and back again. + */ +typedef struct vdev_buf { + buf_t vb_buf; /* buffer that describes the io */ + zio_t *vb_io; /* pointer back to the original zio_t */ +} vdev_buf_t; +#endif + #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c index 2fada25a0..42612ec82 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c @@ -41,11 +41,6 @@ extern ldi_ident_t zfs_li; -typedef struct vdev_disk_buf { - buf_t vdb_buf; - zio_t *vdb_io; -} vdev_disk_buf_t; - static void vdev_disk_hold(vdev_t *vd) { @@ -443,8 +438,8 @@ vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data, static void vdev_disk_io_intr(buf_t *bp) { - vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp; - zio_t *zio = vdb->vdb_io; + vdev_buf_t *vb = (vdev_buf_t *)bp; + zio_t *zio = vb->vb_io; /* * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. @@ -456,7 +451,7 @@ vdev_disk_io_intr(buf_t *bp) if (zio->io_error == 0 && bp->b_resid != 0) zio->io_error = SET_ERROR(EIO); - kmem_free(vdb, sizeof (vdev_disk_buf_t)); + kmem_free(vb, sizeof (vdev_buf_t)); zio_interrupt(zio); } @@ -487,7 +482,7 @@ vdev_disk_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_disk_t *dvd = vd->vdev_tsd; - vdev_disk_buf_t *vdb; + vdev_buf_t *vb; struct dk_callback *dkc; buf_t *bp; int error; @@ -551,10 +546,10 @@ vdev_disk_io_start(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } - vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP); + vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); - vdb->vdb_io = zio; - bp = &vdb->vdb_buf; + vb->vb_io = zio; + bp = &vb->vb_buf; bioinit(bp); bp->b_flags = B_BUSY | B_NOCACHE | -- 2.45.0