From 6e6598e71254167a813a4515a98bf20d32ceded6 Mon Sep 17 00:00:00 2001 From: mm Date: Sun, 31 Mar 2013 19:03:25 +0000 Subject: [PATCH] MFC r242845 (delphij): Illumos r13840:97fd5cdf328a: 3145 single-copy arc 3212 ztest: race condition between vdev_online() and spa_vdev_remove() Illumos r13849:3468a95b27cd: 3258 ztest's use of file descriptors is unstable git-svn-id: svn://svn.freebsd.org/base/stable/8@248959 ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f --- cddl/contrib/opensolaris/cmd/ztest/ztest.c | 112 +++++++++++------- .../opensolaris/uts/common/fs/zfs/arc.c | 90 +++++++++++++- .../opensolaris/uts/common/fs/zfs/dbuf.c | 19 ++- .../opensolaris/uts/common/fs/zfs/sys/arc.h | 1 + 4 files changed, 171 insertions(+), 51 deletions(-) diff --git a/cddl/contrib/opensolaris/cmd/ztest/ztest.c b/cddl/contrib/opensolaris/cmd/ztest/ztest.c index 640ab6872..9c38b3f34 100644 --- a/cddl/contrib/opensolaris/cmd/ztest/ztest.c +++ b/cddl/contrib/opensolaris/cmd/ztest/ztest.c @@ -121,8 +121,8 @@ #include #include -#define ZTEST_FD_DATA 3 -#define ZTEST_FD_RAND 4 +static int ztest_fd_data = -1; +static int ztest_fd_rand = -1; typedef struct ztest_shared_hdr { uint64_t zh_hdr_size; @@ -713,14 +713,17 @@ process_options(int argc, char **argv) UINT64_MAX >> 2); if (strlen(altdir) > 0) { - char cmd[MAXNAMELEN]; - char realaltdir[MAXNAMELEN]; + char *cmd; + char *realaltdir; char *bin; char *ztest; char *isa; int isalen; - (void) realpath(getexecname(), cmd); + cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + + VERIFY(NULL != realpath(getexecname(), cmd)); if (0 != access(altdir, F_OK)) { ztest_dump_core = B_FALSE; fatal(B_TRUE, "invalid alternate ztest path: %s", @@ -751,6 +754,9 @@ process_options(int argc, char **argv) fatal(B_TRUE, "invalid alternate lib directory %s", zo->zo_alt_libpath); } + + umem_free(cmd, MAXPATHLEN); + umem_free(realaltdir, MAXPATHLEN); } } @@ -767,10 +773,12 @@ ztest_random(uint64_t range) { uint64_t r; + ASSERT3S(ztest_fd_rand, >=, 0); + if (range == 0) return (0); - if (read(ZTEST_FD_RAND, &r, sizeof (r)) != sizeof (r)) + if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) fatal(1, "short read from /dev/urandom"); return (r % range); @@ -4836,7 +4844,18 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) if (islog) (void) rw_unlock(&ztest_name_lock); } else { + /* + * Ideally we would like to be able to randomly + * call vdev_[on|off]line without holding locks + * to force unpredictable failures but the side + * effects of vdev_[on|off]line prevent us from + * doing so. We grab the ztest_vdev_lock here to + * prevent a race between injection testing and + * aux_vdev removal. + */ + VERIFY(mutex_lock(&ztest_vdev_lock) == 0); (void) vdev_online(spa, guid0, 0, NULL); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); } } @@ -5795,29 +5814,16 @@ ztest_init(ztest_shared_t *zs) } static void -setup_fds(void) +setup_data_fd(void) { - int fd; -#ifdef illumos - - char *tmp = tempnam(NULL, NULL); - fd = open(tmp, O_RDWR | O_CREAT, 0700); - ASSERT3U(fd, ==, ZTEST_FD_DATA); - (void) unlink(tmp); - free(tmp); -#else - char tmp[MAXPATHLEN]; - - strlcpy(tmp, ztest_opts.zo_dir, MAXPATHLEN); - strlcat(tmp, "/ztest.XXXXXX", MAXPATHLEN); - fd = mkstemp(tmp); - ASSERT3U(fd, ==, ZTEST_FD_DATA); -#endif + static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; - fd = open("/dev/urandom", O_RDONLY); - ASSERT3U(fd, ==, ZTEST_FD_RAND); + ztest_fd_data = mkstemp(ztest_name_data); + ASSERT3S(ztest_fd_data, >=, 0); + (void) unlink(ztest_name_data); } + static int shared_data_size(ztest_shared_hdr_t *hdr) { @@ -5838,15 +5844,11 @@ setup_hdr(void) int size; ztest_shared_hdr_t *hdr; -#ifndef illumos - pwrite(ZTEST_FD_DATA, "", 1, 0); -#endif - hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), - PROT_READ | PROT_WRITE, MAP_SHARED, ZTEST_FD_DATA, 0); + PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); ASSERT(hdr != MAP_FAILED); - VERIFY3U(0, ==, ftruncate(ZTEST_FD_DATA, sizeof (ztest_shared_hdr_t))); + VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); hdr->zh_opts_size = sizeof (ztest_shared_opts_t); @@ -5857,7 +5859,7 @@ setup_hdr(void) hdr->zh_ds_count = ztest_opts.zo_datasets; size = shared_data_size(hdr); - VERIFY3U(0, ==, ftruncate(ZTEST_FD_DATA, size)); + VERIFY3U(0, ==, ftruncate(ztest_fd_data, size)); (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); } @@ -5870,14 +5872,14 @@ setup_data(void) uint8_t *buf; hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), - PROT_READ, MAP_SHARED, ZTEST_FD_DATA, 0); + PROT_READ, MAP_SHARED, ztest_fd_data, 0); ASSERT(hdr != MAP_FAILED); size = shared_data_size(hdr); (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), - PROT_READ | PROT_WRITE, MAP_SHARED, ZTEST_FD_DATA, 0); + PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); ASSERT(hdr != MAP_FAILED); buf = (uint8_t *)hdr; @@ -5896,12 +5898,13 @@ exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) { pid_t pid; int status; - char cmdbuf[MAXPATHLEN]; + char *cmdbuf = NULL; pid = fork(); if (cmd == NULL) { - (void) strlcpy(cmdbuf, getexecname(), sizeof (cmdbuf)); + cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); cmd = cmdbuf; } @@ -5910,9 +5913,16 @@ exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) if (pid == 0) { /* child */ char *emptyargv[2] = { cmd, NULL }; + char fd_data_str[12]; struct rlimit rl = { 1024, 1024 }; (void) setrlimit(RLIMIT_NOFILE, &rl); + + (void) close(ztest_fd_rand); + VERIFY3U(11, >=, + snprintf(fd_data_str, 12, "%d", ztest_fd_data)); + VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); + (void) enable_extended_FILE_stdio(-1, -1); if (libpath != NULL) VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1)); @@ -5925,6 +5935,11 @@ exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) fatal(B_TRUE, "exec failed: %s", cmd); } + if (cmdbuf != NULL) { + umem_free(cmdbuf, MAXPATHLEN); + cmd = NULL; + } + while (waitpid(pid, &status, 0) != pid) continue; if (statusp != NULL) @@ -5989,39 +6004,41 @@ main(int argc, char **argv) char timebuf[100]; char numbuf[6]; spa_t *spa; - char cmd[MAXNAMELEN]; + char *cmd; boolean_t hasalt; - - boolean_t ischild = (0 == lseek(ZTEST_FD_DATA, 0, SEEK_CUR)); - ASSERT(ischild || errno == EBADF); + char *fd_data_str = getenv("ZTEST_FD_DATA"); (void) setvbuf(stdout, NULL, _IOLBF, 0); dprintf_setup(&argc, argv); - if (!ischild) { + ztest_fd_rand = open("/dev/urandom", O_RDONLY); + ASSERT3S(ztest_fd_rand, >=, 0); + + if (!fd_data_str) { process_options(argc, argv); - setup_fds(); + setup_data_fd(); setup_hdr(); setup_data(); bcopy(&ztest_opts, ztest_shared_opts, sizeof (*ztest_shared_opts)); } else { + ztest_fd_data = atoi(fd_data_str); setup_data(); bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts)); } ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); /* Override location of zpool.cache */ - (void) asprintf((char **)&spa_config_path, "%s/zpool.cache", - ztest_opts.zo_dir); + VERIFY3U(asprintf((char **)&spa_config_path, "%s/zpool.cache", + ztest_opts.zo_dir), !=, -1); ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), UMEM_NOFAIL); zs = ztest_shared; - if (ischild) { + if (fd_data_str) { metaslab_gang_bang = ztest_opts.zo_metaslab_gang_bang; metaslab_df_alloc_threshold = zs->zs_metaslab_df_alloc_threshold; @@ -6044,7 +6061,8 @@ main(int argc, char **argv) (u_longlong_t)ztest_opts.zo_time); } - (void) strlcpy(cmd, getexecname(), sizeof (cmd)); + cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); + (void) strlcpy(cmd, getexecname(), MAXNAMELEN); zs->zs_do_init = B_TRUE; if (strlen(ztest_opts.zo_alt_ztest) != 0) { @@ -6185,5 +6203,7 @@ main(int argc, char **argv) kills, iters - kills, (100.0 * kills) / MAX(1, iters)); } + umem_free(cmd, MAXNAMELEN); + return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 1693cf5e2..35fe06c01 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -191,6 +191,7 @@ uint64_t zfs_arc_meta_limit = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; +int zfs_disable_dup_eviction = 0; TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); @@ -321,7 +322,6 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_hdr_size; - kstat_named_t arcstat_memory_throttle_count; kstat_named_t arcstat_l2_write_trylock_fail; kstat_named_t arcstat_l2_write_passed_headroom; kstat_named_t arcstat_l2_write_spa_mismatch; @@ -334,6 +334,10 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_write_buffer_bytes_scanned; kstat_named_t arcstat_l2_write_buffer_list_iter; kstat_named_t arcstat_l2_write_buffer_list_null_iter; + kstat_named_t arcstat_memory_throttle_count; + kstat_named_t arcstat_duplicate_buffers; + kstat_named_t arcstat_duplicate_buffers_size; + kstat_named_t arcstat_duplicate_reads; } arc_stats_t; static arc_stats_t arc_stats = { @@ -391,7 +395,6 @@ static arc_stats_t arc_stats = { { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, @@ -403,7 +406,11 @@ static arc_stats_t arc_stats = { { "l2_write_pios", KSTAT_DATA_UINT64 }, { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, - { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 } + { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, + { "memory_throttle_count", KSTAT_DATA_UINT64 }, + { "duplicate_buffers", KSTAT_DATA_UINT64 }, + { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, + { "duplicate_reads", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -1516,6 +1523,17 @@ arc_buf_clone(arc_buf_t *from) hdr->b_buf = buf; arc_get_data_buf(buf); bcopy(from->b_data, buf->b_data, size); + + /* + * This buffer already exists in the arc so create a duplicate + * copy for the caller. If the buffer is associated with user data + * then track the size and number of duplicates. These stats will be + * updated as duplicate buffers are created and destroyed. + */ + if (hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMP(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); + } hdr->b_datacnt += 1; return (buf); } @@ -1616,6 +1634,16 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) ASSERT3U(state->arcs_size, >=, size); atomic_add_64(&state->arcs_size, -size); buf->b_data = NULL; + + /* + * If we're destroying a duplicate buffer make sure + * that the appropriate statistics are updated. + */ + if (buf->b_hdr->b_datacnt > 1 && + buf->b_hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); + } ASSERT(buf->b_hdr->b_datacnt > 0); buf->b_hdr->b_datacnt -= 1; } @@ -1799,6 +1827,48 @@ arc_buf_size(arc_buf_t *buf) return (buf->b_hdr->b_size); } +/* + * Called from the DMU to determine if the current buffer should be + * evicted. In order to ensure proper locking, the eviction must be initiated + * from the DMU. Return true if the buffer is associated with user data and + * duplicate buffers still exist. + */ +boolean_t +arc_buf_eviction_needed(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr; + boolean_t evict_needed = B_FALSE; + + if (zfs_disable_dup_eviction) + return (B_FALSE); + + mutex_enter(&buf->b_evict_lock); + hdr = buf->b_hdr; + if (hdr == NULL) { + /* + * We are in arc_do_user_evicts(); let that function + * perform the eviction. + */ + ASSERT(buf->b_data == NULL); + mutex_exit(&buf->b_evict_lock); + return (B_FALSE); + } else if (buf->b_data == NULL) { + /* + * We have already been added to the arc eviction list; + * recommend eviction. + */ + ASSERT3P(hdr, ==, &arc_eviction_hdr); + mutex_exit(&buf->b_evict_lock); + return (B_TRUE); + } + + if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) + evict_needed = B_TRUE; + + mutex_exit(&buf->b_evict_lock); + return (evict_needed); +} + /* * Evict buffers from list until we've removed the specified number of * bytes. Move the removed buffers to the appropriate evict state. @@ -2885,8 +2955,10 @@ arc_read_done(zio_t *zio) abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { if (acb->acb_done) { - if (abuf == NULL) + if (abuf == NULL) { + ARCSTAT_BUMP(arcstat_duplicate_reads); abuf = arc_buf_clone(buf); + } acb->acb_buf = abuf; abuf = NULL; } @@ -3401,6 +3473,16 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT3U(*size, >=, hdr->b_size); atomic_add_64(size, -hdr->b_size); } + + /* + * We're releasing a duplicate user data buffer, update + * our statistics accordingly. + */ + if (hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, + -hdr->b_size); + } hdr->b_datacnt -= 1; arc_cksum_verify(buf); #ifdef illumos diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index c61bc4b03..2dc614745 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -2071,7 +2071,24 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) dbuf_evict(db); } else { VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); - if (!DBUF_IS_CACHEABLE(db)) + + /* + * A dbuf will be eligible for eviction if either the + * 'primarycache' property is set or a duplicate + * copy of this buffer is already cached in the arc. + * + * In the case of the 'primarycache' a buffer + * is considered for eviction if it matches the + * criteria set in the property. + * + * To decide if our buffer is considered a + * duplicate, we must call into the arc to determine + * if multiple buffers are referencing the same + * block on-disk. If so, then we simply evict + * ourselves. + */ + if (!DBUF_IS_CACHEABLE(db) || + arc_buf_eviction_needed(db->db_buf)) dbuf_clear(db); else mutex_exit(&db->db_mtx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h index f00adae06..deb57f26e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h @@ -96,6 +96,7 @@ int arc_released(arc_buf_t *buf); int arc_has_callback(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); +boolean_t arc_buf_eviction_needed(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #endif -- 2.45.0