From 40e23b8dab708df5dda6ef199e3377e8e4bd2ddc Mon Sep 17 00:00:00 2001 From: mav Date: Sat, 3 Oct 2015 08:03:36 +0000 Subject: [PATCH] MFC r286705: 5960 zfs recv should prefetch indirect blocks 5925 zfs receive -o origin= Reviewed by: Prakash Surya Reviewed by: Matthew Ahrens Author: Paul Dagnelie While running 'zfs recv' we noticed that every 128th 8K block required a read. We were seeing that restore_write() was calling dmu_tx_hold_write() and the indirect block was not cached. We should prefetch upcoming indirect blocks to avoid having to go to disk and blocking the restore_write(). Allow an incremental send stream to be received as a clone, even if the stream does not mark it as a clone. git-svn-id: svn://svn.freebsd.org/base/stable/10@288571 ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f --- cddl/contrib/opensolaris/cmd/zdb/zdb.c | 5 +- cddl/contrib/opensolaris/cmd/zfs/zfs.8 | 8 + cddl/contrib/opensolaris/cmd/zfs/zfs_main.c | 29 +- cddl/contrib/opensolaris/cmd/ztest/ztest.c | 9 +- .../opensolaris/lib/libzfs/common/libzfs.h | 4 +- .../lib/libzfs/common/libzfs_pool.c | 4 +- .../lib/libzfs/common/libzfs_sendrecv.c | 55 +- .../lib/libzpool/common/sys/zfs_context.h | 12 +- .../opensolaris/uts/common/Makefile.files | 5 +- .../opensolaris/uts/common/fs/zfs/bptree.c | 2 +- .../opensolaris/uts/common/fs/zfs/bqueue.c | 111 +++ .../opensolaris/uts/common/fs/zfs/dbuf.c | 261 +++++- .../opensolaris/uts/common/fs/zfs/dmu.c | 44 +- .../opensolaris/uts/common/fs/zfs/dmu_diff.c | 2 +- .../uts/common/fs/zfs/dmu_object.c | 5 + .../opensolaris/uts/common/fs/zfs/dmu_send.c | 813 +++++++++++++----- .../uts/common/fs/zfs/dmu_traverse.c | 28 +- .../opensolaris/uts/common/fs/zfs/dmu_tx.c | 6 +- .../uts/common/fs/zfs/dmu_zfetch.c | 3 +- .../opensolaris/uts/common/fs/zfs/dnode.c | 18 +- .../uts/common/fs/zfs/dnode_sync.c | 6 +- .../uts/common/fs/zfs/dsl_dataset.c | 28 +- .../uts/common/fs/zfs/dsl_destroy.c | 2 +- .../opensolaris/uts/common/fs/zfs/dsl_scan.c | 3 +- .../opensolaris/uts/common/fs/zfs/spa.c | 2 +- .../opensolaris/uts/common/fs/zfs/space_map.c | 4 +- .../uts/common/fs/zfs/sys/bqueue.h | 54 ++ .../opensolaris/uts/common/fs/zfs/sys/dbuf.h | 9 +- .../opensolaris/uts/common/fs/zfs/sys/dmu.h | 5 +- .../uts/common/fs/zfs/sys/dsl_dataset.h | 2 +- .../opensolaris/uts/common/fs/zfs/sys/zio.h | 23 +- .../uts/common/fs/zfs/sys/zio_checksum.h | 2 +- .../uts/common/fs/zfs/sys/zio_priority.h | 41 + .../opensolaris/uts/common/fs/zfs/zap.c | 13 +- .../uts/common/fs/zfs/zfs_vfsops.c | 4 +- .../opensolaris/uts/common/fs/zfs/zfs_vnops.c | 3 +- .../opensolaris/uts/common/fs/zfs/zio.c | 141 ++- .../opensolaris/uts/common/fs/zfs/zvol.c | 2 +- 38 files changed, 1388 insertions(+), 380 deletions(-) create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c index 60f7d9a79..cb209fa87 100644 --- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c +++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c @@ -2429,6 +2429,9 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type; boolean_t is_metadata; + if (bp == NULL) + return (0); + if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); @@ -2918,7 +2921,7 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, avl_index_t where; zdb_ddt_entry_t *zdde, zdde_search; - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs.8 b/cddl/contrib/opensolaris/cmd/zfs/zfs.8 index a937a2c4f..e534ae9a0 100644 --- a/cddl/contrib/opensolaris/cmd/zfs/zfs.8 +++ b/cddl/contrib/opensolaris/cmd/zfs/zfs.8 @@ -191,11 +191,13 @@ .Nm .Cm receive Ns | Ns Cm recv .Op Fl vnFu +.Op Fl o Sy origin Ns = Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm .Cm receive Ns | Ns Cm recv .Op Fl vnFu .Op Fl d | e +.Op Fl o Sy origin Ns = Ns Ar snapshot .Ar filesystem .Nm .Cm allow @@ -2711,6 +2713,7 @@ feature. .Nm .Cm receive Ns | Ns Cm recv .Op Fl vnFu +.Op Fl o Sy origin Ns = Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc .It Xo @@ -2718,6 +2721,7 @@ feature. .Cm receive Ns | Ns Cm recv .Op Fl vnFu .Op Fl d | e +.Op Fl o Sy origin Ns = Ns Ar snapshot .Ar filesystem .Xc .Pp @@ -2802,6 +2806,10 @@ receive operation. Do not actually receive the stream. This can be useful in conjunction with the .Fl v option to verify the name the receive operation would use. +.It Fl o Sy origin Ns = Ns Ar snapshot +Forces the stream to be received as a clone of the given snapshot. +This is only valid if the stream is an incremental stream whose source +is the same as the provided origin. .It Fl F Force a rollback of the file system to the most recent snapshot before performing the receive operation. If receiving an incremental replication diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c b/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c index 6a4334ed4..e35ce0131 100644 --- a/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c +++ b/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c @@ -264,8 +264,9 @@ get_usage(zfs_help_t idx) return (gettext("\tpromote \n")); case HELP_RECEIVE: return (gettext("\treceive|recv [-vnFu] \n" - "\treceive|recv [-vnFu] [-d | -e] \n")); + "snapshot>\n" + "\treceive|recv [-vnFu] [-o origin=] [-d | -e] " + "\n")); case HELP_RENAME: return (gettext("\trename [-f] " "\n" @@ -791,7 +792,7 @@ zfs_do_create(int argc, char **argv) nomem(); break; case 'o': - if (parseprop(props, optarg)) + if (parseprop(props, optarg) != 0) goto error; break; case 's': @@ -3663,7 +3664,7 @@ zfs_do_snapshot(int argc, char **argv) while ((c = getopt(argc, argv, "ro:")) != -1) { switch (c) { case 'o': - if (parseprop(props, optarg)) + if (parseprop(props, optarg) != 0) return (1); break; case 'r': @@ -3922,10 +3923,19 @@ zfs_do_receive(int argc, char **argv) { int c, err; recvflags_t flags = { 0 }; + nvlist_t *props; + nvpair_t *nvp = NULL; + + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + nomem(); /* check options */ - while ((c = getopt(argc, argv, ":denuvF")) != -1) { + while ((c = getopt(argc, argv, ":o:denuvF")) != -1) { switch (c) { + case 'o': + if (parseprop(props, optarg) != 0) + return (1); + break; case 'd': flags.isprefix = B_TRUE; break; @@ -3970,6 +3980,13 @@ zfs_do_receive(int argc, char **argv) usage(B_FALSE); } + while ((nvp = nvlist_next_nvpair(props, nvp))) { + if (strcmp(nvpair_name(nvp), "origin") != 0) { + (void) fprintf(stderr, gettext("invalid option")); + usage(B_FALSE); + } + } + if (isatty(STDIN_FILENO)) { (void) fprintf(stderr, gettext("Error: Backup stream can not be read " @@ -3978,7 +3995,7 @@ zfs_do_receive(int argc, char **argv) return (1); } - err = zfs_receive(g_zfs, argv[0], &flags, STDIN_FILENO, NULL); + err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL); return (err != 0); } diff --git a/cddl/contrib/opensolaris/cmd/ztest/ztest.c b/cddl/contrib/opensolaris/cmd/ztest/ztest.c index f76c8eafa..7cc8d5f36 100644 --- a/cddl/contrib/opensolaris/cmd/ztest/ztest.c +++ b/cddl/contrib/opensolaris/cmd/ztest/ztest.c @@ -3586,7 +3586,8 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(2 * width - 1); - dmu_prefetch(os, bigobj, n * chunksize, s * chunksize); + dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, + ZIO_PRIORITY_SYNC_READ); /* * Pick a random index and compute the offsets into packobj and bigobj. @@ -5705,8 +5706,10 @@ ztest_run(ztest_shared_t *zs) * Right before closing the pool, kick off a bunch of async I/O; * spa_close() should wait for it to complete. */ - for (uint64_t object = 1; object < 50; object++) - dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20); + for (uint64_t object = 1; object < 50; object++) { + dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, + ZIO_PRIORITY_SYNC_READ); + } spa_close(spa, FTAG); diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h index 8a707d1f7..44bd58b9f 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h @@ -668,8 +668,8 @@ typedef struct recvflags { boolean_t nomount; } recvflags_t; -extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t *, - int, avl_tree_t *); +extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *, + recvflags_t *, int, avl_tree_t *); typedef enum diff_flags { ZFS_DIFF_PARSEABLE = 0x1, diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c index 67514b129..c677822cd 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c @@ -3535,7 +3535,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, } static int -zbookmark_compare(const void *a, const void *b) +zbookmark_mem_compare(const void *a, const void *b) { return (memcmp(a, b, sizeof (zbookmark_phys_t))); } @@ -3598,7 +3598,7 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp) zc.zc_nvlist_dst_size; count -= zc.zc_nvlist_dst_size; - qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_compare); + qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare); verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0); diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c index 455578978..e44ccfd77 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c @@ -64,8 +64,9 @@ extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); /* We need to use something for ENODATA. */ #define ENODATA EIDRM -static int zfs_receive_impl(libzfs_handle_t *, const char *, recvflags_t *, - int, const char *, nvlist_t *, avl_tree_t *, char **, int, uint64_t *); +static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *, + recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int, + uint64_t *); static const zio_cksum_t zero_cksum = { 0 }; @@ -2498,7 +2499,7 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, * zfs_receive_one() will take care of it (ie, * recv_skip() and return 0). */ - error = zfs_receive_impl(hdl, destname, flags, fd, + error = zfs_receive_impl(hdl, destname, NULL, flags, fd, sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd, action_handlep); if (error == ENODATA) { @@ -2631,9 +2632,9 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) */ static int zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, - recvflags_t *flags, dmu_replay_record_t *drr, - dmu_replay_record_t *drr_noswap, const char *sendfs, - nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, + const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr, + dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv, + avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, uint64_t *action_handlep) { zfs_cmd_t zc = { 0 }; @@ -2798,10 +2799,15 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, } if (flags->verbose) (void) printf("found clone origin %s\n", zc.zc_string); + } else if (originsnap) { + (void) strncpy(zc.zc_string, originsnap, ZFS_MAXNAMELEN); + if (flags->verbose) + (void) printf("using provided clone origin %s\n", + zc.zc_string); } stream_wantsnewfs = (drrb->drr_fromguid == 0 || - (drrb->drr_flags & DRR_FLAG_CLONE)); + (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap); if (stream_wantsnewfs) { /* @@ -3179,9 +3185,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, } static int -zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, - int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, - char **top_zfs, int cleanup_fd, uint64_t *action_handlep) +zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, + const char *originsnap, recvflags_t *flags, int infd, const char *sendfs, + nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, + uint64_t *action_handlep) { int err; dmu_replay_record_t drr, drr_noswap; @@ -3200,6 +3207,12 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, "(%s) does not exist"), tosnap); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } + if (originsnap && + !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs " + "(%s) does not exist"), originsnap); + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + } /* read in the BEGIN record */ if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE, @@ -3272,14 +3285,14 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, *cp = '\0'; sendfs = nonpackage_sendfs; } - return (zfs_receive_one(hdl, infd, tosnap, flags, - &drr, &drr_noswap, sendfs, stream_nv, stream_avl, - top_zfs, cleanup_fd, action_handlep)); + return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags, + &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs, + cleanup_fd, action_handlep)); } else { assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM); - return (zfs_receive_package(hdl, infd, tosnap, flags, - &drr, &zcksum, top_zfs, cleanup_fd, action_handlep)); + return (zfs_receive_package(hdl, infd, tosnap, flags, &drr, + &zcksum, top_zfs, cleanup_fd, action_handlep)); } } @@ -3290,18 +3303,24 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, * (-1 will override -2). */ int -zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, - int infd, avl_tree_t *stream_avl) +zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props, + recvflags_t *flags, int infd, avl_tree_t *stream_avl) { char *top_zfs = NULL; int err; int cleanup_fd; uint64_t action_handle = 0; + char *originsnap = NULL; + if (props) { + err = nvlist_lookup_string(props, "origin", &originsnap); + if (err && err != ENOENT) + return (err); + } cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL); VERIFY(cleanup_fd >= 0); - err = zfs_receive_impl(hdl, tosnap, flags, infd, NULL, NULL, + err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL, stream_avl, &top_zfs, cleanup_fd, &action_handle); VERIFY(0 == close(cleanup_fd)); diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h b/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h index 03027c35d..a1b197b50 100644 --- a/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h +++ b/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h @@ -135,8 +135,18 @@ extern int aok; /* * DTrace SDT probes have different signatures in userland than they do in - * kernel. If they're being used in kernel code, re-define them out of + * the kernel. If they're being used in kernel code, re-define them out of * existence for their counterparts in libzpool. + * + * Here's an example of how to use the set-error probes in userland: + * zfs$target:::set-error /arg0 == EBUSY/ {stack();} + * + * Here's an example of how to use DTRACE_PROBE probes in userland: + * If there is a probe declared as follows: + * DTRACE_PROBE2(zfs__probe_name, uint64_t, blkid, dnode_t *, dn); + * Then you can use it as follows: + * zfs$target:::probe2 /copyinstr(arg0) == "zfs__probe_name"/ + * {printf("%u %p\n", arg1, arg2);} */ #ifdef DTRACE_PROBE diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files index 4c7e225f0..286a75d83 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files +++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files @@ -22,7 +22,9 @@ # # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved. -# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved. +# Copyright (c) 2012 Joyent, Inc. All rights reserved. +# Copyright (c) 2011, 2014 by Delphix. All rights reserved. # Copyright (c) 2013 by Saso Kiselkov. All rights reserved. # # @@ -36,6 +38,7 @@ ZFS_COMMON_OBJS += \ blkptr.o \ bpobj.o \ bptree.o \ + bqueue.o \ dbuf.o \ ddt.o \ ddt_zap.o \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c index 5f7d76f0e..b2b988774 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c @@ -154,7 +154,7 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, int err; struct bptree_args *ba = arg; - if (BP_IS_HOLE(bp)) + if (bp == NULL || BP_IS_HOLE(bp)) return (0); err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c new file mode 100644 index 000000000..1ddc697b5 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c @@ -0,0 +1,111 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + +#include +#include + +static inline bqueue_node_t * +obj2node(bqueue_t *q, void *data) +{ + return ((bqueue_node_t *)((char *)data + q->bq_node_offset)); +} + +/* + * Initialize a blocking queue The maximum capacity of the queue is set to + * size. Types that want to be stored in a bqueue must contain a bqueue_node_t, + * and offset should give its offset from the start of the struct. Return 0 on + * success, or -1 on failure. + */ +int +bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset) +{ + list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t), + node_offset + offsetof(bqueue_node_t, bqn_node)); + cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL); + cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL); + q->bq_node_offset = node_offset; + q->bq_size = 0; + q->bq_maxsize = size; + return (0); +} + +/* + * Destroy a blocking queue. This function asserts that there are no + * elements in the queue, and no one is blocked on the condition + * variables. + */ +void +bqueue_destroy(bqueue_t *q) +{ + ASSERT0(q->bq_size); + cv_destroy(&q->bq_add_cv); + cv_destroy(&q->bq_pop_cv); + mutex_destroy(&q->bq_lock); + list_destroy(&q->bq_list); +} + +/* + * Add data to q, consuming size units of capacity. If there is insufficient + * capacity to consume size units, block until capacity exists. Asserts size is + * > 0. + */ +void +bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) +{ + ASSERT3U(item_size, >, 0); + ASSERT3U(item_size, <, q->bq_maxsize); + mutex_enter(&q->bq_lock); + obj2node(q, data)->bqn_size = item_size; + while (q->bq_size + item_size > q->bq_maxsize) { + cv_wait(&q->bq_add_cv, &q->bq_lock); + } + q->bq_size += item_size; + list_insert_tail(&q->bq_list, data); + cv_signal(&q->bq_pop_cv); + mutex_exit(&q->bq_lock); +} +/* + * Take the first element off of q. If there are no elements on the queue, wait + * until one is put there. Return the removed element. + */ +void * +bqueue_dequeue(bqueue_t *q) +{ + void *ret; + uint64_t item_size; + mutex_enter(&q->bq_lock); + while (q->bq_size == 0) { + cv_wait(&q->bq_pop_cv, &q->bq_lock); + } + ret = list_remove_head(&q->bq_list); + item_size = obj2node(q, ret)->bqn_size; + q->bq_size -= item_size; + mutex_exit(&q->bq_lock); + cv_signal(&q->bq_add_cv); + return (ret); +} + +/* + * Returns true if the space used is 0. + */ +boolean_t +bqueue_empty(bqueue_t *q) +{ + return (q->bq_size == 0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 79b6aed4d..63a03ba0d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -548,11 +548,35 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) return (abuf); } +/* + * Calculate which level n block references the data at the level 0 offset + * provided. + */ uint64_t -dbuf_whichblock(dnode_t *dn, uint64_t offset) +dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset) { - if (dn->dn_datablkshift) { - return (offset >> dn->dn_datablkshift); + if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { + /* + * The level n blkid is equal to the level 0 blkid divided by + * the number of level 0s in a level n block. + * + * The level 0 blkid is offset >> datablkshift = + * offset / 2^datablkshift. + * + * The number of level 0s in a level n is the number of block + * pointers in an indirect block, raised to the power of level. + * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = + * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). + * + * Thus, the level n blkid is: offset / + * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))) + * = offset / 2^(datablkshift + level * + * (indblkshift - SPA_BLKPTRSHIFT)) + * = offset >> (datablkshift + level * + * (indblkshift - SPA_BLKPTRSHIFT)) + */ + return (offset >> (dn->dn_datablkshift + level * + (dn->dn_indblkshift - SPA_BLKPTRSHIFT))); } else { ASSERT3U(offset, <, dn->dn_datablksz); return (0); @@ -1715,6 +1739,12 @@ dbuf_clear(dmu_buf_impl_t *db) dbuf_rele(parent, db); } +/* + * Note: While bpp will always be updated if the function returns success, + * parentp will not be updated if the dnode does not have dn_dbuf filled in; + * this happens when the dnode is the meta-dnode, or a userused or groupused + * object. + */ static int dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, dmu_buf_impl_t **parentp, blkptr_t **bpp) @@ -1755,7 +1785,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, } else if (level < nlevels-1) { /* this block is referenced from an indirect block */ int err = dbuf_hold_impl(dn, level+1, - blkid >> epbs, fail_sparse, NULL, parentp); + blkid >> epbs, fail_sparse, FALSE, NULL, parentp); if (err) return (err); err = dbuf_read(*parentp, NULL, @@ -1930,11 +1960,96 @@ dbuf_destroy(dmu_buf_impl_t *db) arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); } +typedef struct dbuf_prefetch_arg { + spa_t *dpa_spa; /* The spa to issue the prefetch in. */ + zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ + int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ + int dpa_curlevel; /* The current level that we're reading */ + zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ + zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ + arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ +} dbuf_prefetch_arg_t; + +/* + * Actually issue the prefetch read for the block given. + */ +static void +dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) +{ + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + return; + + arc_flags_t aflags = + dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + + ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); + ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); + ASSERT(dpa->dpa_zio != NULL); + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, + dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, &dpa->dpa_zb); +} + +/* + * Called when an indirect block above our prefetch target is read in. This + * will either read in the next indirect block down the tree or issue the actual + * prefetch if the next block down is our target. + */ +static void +dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) +{ + dbuf_prefetch_arg_t *dpa = private; + + ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); + ASSERT3S(dpa->dpa_curlevel, >, 0); + if (zio != NULL) { + ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); + ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); + ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); + } + + dpa->dpa_curlevel--; + + uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> + (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); + blkptr_t *bp = ((blkptr_t *)abuf->b_data) + + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); + if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) { + kmem_free(dpa, sizeof (*dpa)); + } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { + ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); + dbuf_issue_final_prefetch(dpa, bp); + kmem_free(dpa, sizeof (*dpa)); + } else { + arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; + zbookmark_phys_t zb; + + ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); + + SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, + dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); + + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, + bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &iter_aflags, &zb); + } + (void) arc_buf_remove_ref(abuf, private); +} + +/* + * Issue prefetch reads for the given block on the given level. If the indirect + * blocks above that block are not in memory, we will read them in + * asynchronously. As a result, this call never blocks waiting for a read to + * complete. + */ void -dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) +dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, + arc_flags_t aflags) { - dmu_buf_impl_t *db = NULL; - blkptr_t *bp = NULL; + blkptr_t bp; + int epbs, nlevels, curlevel; + uint64_t curblkid; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); @@ -1942,35 +2057,104 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) if (dnode_block_freed(dn, blkid)) return; - /* dbuf_find() returns with db_mtx held */ - if (db = dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid)) { + /* + * This dnode hasn't been written to disk yet, so there's nothing to + * prefetch. + */ + nlevels = dn->dn_phys->dn_nlevels; + if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) + return; + + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) + return; + + dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, + level, blkid); + if (db != NULL) { + mutex_exit(&db->db_mtx); /* - * This dbuf is already in the cache. We assume that - * it is already CACHED, or else about to be either - * read or filled. + * This dbuf already exists. It is either CACHED, or + * (we assume) about to be read or filled. */ - mutex_exit(&db->db_mtx); return; } - if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { - if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - arc_flags_t aflags = - ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; - zbookmark_phys_t zb; + /* + * Find the closest ancestor (indirect block) of the target block + * that is present in the cache. In this indirect block, we will + * find the bp that is at curlevel, curblkid. + */ + curlevel = level; + curblkid = blkid; + while (curlevel < nlevels - 1) { + int parent_level = curlevel + 1; + uint64_t parent_blkid = curblkid >> epbs; + dmu_buf_impl_t *db; + + if (dbuf_hold_impl(dn, parent_level, parent_blkid, + FALSE, TRUE, FTAG, &db) == 0) { + blkptr_t *bpp = db->db_buf->b_data; + bp = bpp[P2PHASE(curblkid, 1 << epbs)]; + dbuf_rele(db, FTAG); + break; + } - SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, - dn->dn_object, 0, blkid); + curlevel = parent_level; + curblkid = parent_blkid; + } - (void) arc_read(NULL, dn->dn_objset->os_spa, - bp, NULL, NULL, prio, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &aflags, &zb); - } - if (db) - dbuf_rele(db, NULL); + if (curlevel == nlevels - 1) { + /* No cached indirect blocks found. */ + ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); + bp = dn->dn_phys->dn_blkptr[curblkid]; } + if (BP_IS_HOLE(&bp)) + return; + + ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); + + zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, + ZIO_FLAG_CANFAIL); + + dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, + dn->dn_object, level, blkid); + dpa->dpa_curlevel = curlevel; + dpa->dpa_prio = prio; + dpa->dpa_aflags = aflags; + dpa->dpa_spa = dn->dn_objset->os_spa; + dpa->dpa_epbs = epbs; + dpa->dpa_zio = pio; + + /* + * If we have the indirect just above us, no need to do the asynchronous + * prefetch chain; we'll just run the last step ourselves. If we're at + * a higher level, though, we want to issue the prefetches for all the + * indirect blocks asynchronously, so we can go on with whatever we were + * doing. + */ + if (curlevel == level) { + ASSERT3U(curblkid, ==, blkid); + dbuf_issue_final_prefetch(dpa, &bp); + kmem_free(dpa, sizeof (*dpa)); + } else { + arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; + zbookmark_phys_t zb; + + SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, + dn->dn_object, curlevel, curblkid); + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, + &bp, dbuf_prefetch_indirect_done, dpa, prio, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &iter_aflags, &zb); + } + /* + * We use pio here instead of dpa_zio since it's possible that + * dpa may have already been freed. + */ + zio_nowait(pio); } /* @@ -1978,7 +2162,8 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) * Note: dn_struct_rwlock must be held. */ int -dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, +dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, + boolean_t fail_sparse, boolean_t fail_uncached, void *tag, dmu_buf_impl_t **dbp) { dmu_buf_impl_t *db, *parent = NULL; @@ -1996,6 +2181,9 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, blkptr_t *bp = NULL; int err; + if (fail_uncached) + return (SET_ERROR(ENOENT)); + ASSERT3P(parent, ==, NULL); err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); if (fail_sparse) { @@ -2012,6 +2200,11 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, db = dbuf_create(dn, level, blkid, parent, bp); } + if (fail_uncached && db->db_state != DB_CACHED) { + mutex_exit(&db->db_mtx); + return (SET_ERROR(ENOENT)); + } + if (db->db_buf && refcount_is_zero(&db->db_holds)) { arc_buf_add_ref(db->db_buf, db); if (db->db_buf->b_data == NULL) { @@ -2067,16 +2260,14 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, dmu_buf_impl_t * dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) { - dmu_buf_impl_t *db; - int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); - return (err ? NULL : db); + return (dbuf_hold_level(dn, 0, blkid, tag)); } dmu_buf_impl_t * dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) { dmu_buf_impl_t *db; - int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); + int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); return (err ? NULL : db); } @@ -2429,8 +2620,8 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) if (parent == NULL) { mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_READER); - (void) dbuf_hold_impl(dn, db->db_level+1, - db->db_blkid >> epbs, FALSE, db, &parent); + parent = dbuf_hold_level(dn, db->db_level + 1, + db->db_blkid >> epbs, db); rw_exit(&dn->dn_struct_rwlock); mutex_enter(&db->db_mtx); db->db_parent = parent; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index 518b6fee4..840bd7c72 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -142,7 +142,7 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); - blkid = dbuf_whichblock(dn, offset); + blkid = dbuf_whichblock(dn, 0, offset); rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); @@ -425,7 +425,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); - blkid = dbuf_whichblock(dn, offset); + blkid = dbuf_whichblock(dn, 0, offset); for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); if (db == NULL) { @@ -529,17 +529,16 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) } /* - * Issue prefetch i/os for the given blocks. + * Issue prefetch i/os for the given blocks. If level is greater than 0, the + * indirect blocks prefeteched will be those that point to the blocks containing + * the data starting at offset, and continuing to offset + len. * - * Note: The assumption is that we *know* these blocks will be needed - * almost immediately. Therefore, the prefetch i/os will be issued at - * ZIO_PRIORITY_SYNC_READ - * - * Note: indirect blocks and other metadata will be read synchronously, - * causing this function to block if they are not already cached. + * Note that if the indirect blocks above the blocks being prefetched are not in + * cache, they will be asychronously read in. */ void -dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) +dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, + uint64_t len, zio_priority_t pri) { dnode_t *dn; uint64_t blkid; @@ -555,8 +554,9 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); - dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ); + blkid = dbuf_whichblock(dn, level, + object * sizeof (dnode_phys_t)); + dbuf_prefetch(dn, level, blkid, pri, 0); rw_exit(&dn->dn_struct_rwlock); return; } @@ -571,18 +571,24 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_datablkshift) { - int blkshift = dn->dn_datablkshift; - nblks = (P2ROUNDUP(offset + len, 1 << blkshift) - - P2ALIGN(offset, 1 << blkshift)) >> blkshift; + /* + * offset + len - 1 is the last byte we want to prefetch for, and offset + * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the + * last block we want to prefetch, and dbuf_whichblock(dn, level, + * offset) is the first. Then the number we need to prefetch is the + * last - first + 1. + */ + if (level > 0 || dn->dn_datablkshift != 0) { + nblks = dbuf_whichblock(dn, level, offset + len - 1) - + dbuf_whichblock(dn, level, offset) + 1; } else { nblks = (offset < dn->dn_datablksz); } if (nblks != 0) { - blkid = dbuf_whichblock(dn, offset); + blkid = dbuf_whichblock(dn, level, offset); for (int i = 0; i < nblks; i++) - dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ); + dbuf_prefetch(dn, level, blkid + i, pri, 0); } rw_exit(&dn->dn_struct_rwlock); @@ -1394,7 +1400,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, DB_DNODE_ENTER(dbuf); dn = DB_DNODE(dbuf); rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, offset); + blkid = dbuf_whichblock(dn, 0, offset); VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(dbuf); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c index bd9e89440..e88968b7b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c @@ -138,7 +138,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (issig(JUSTLOOKING) && issig(FORREAL)) return (SET_ERROR(EINTR)); - if (zb->zb_object != DMU_META_DNODE_OBJECT) + if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT) return (0); if (BP_IS_HOLE(bp)) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c index 808864a57..6ca021eec 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c @@ -148,6 +148,11 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) return (0); } +/* + * Return (in *objectp) the next object which is allocated (or a hole) + * after *object, taking into account only objects that may have been modified + * after the specified txg. + */ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index be1f46d7b..0b624806b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -53,6 +53,7 @@ #include #include #include +#include #ifdef __FreeBSD__ #undef dump_write @@ -61,10 +62,34 @@ /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ int zfs_send_corrupt_data = B_FALSE; +int zfs_send_queue_length = 16 * 1024 * 1024; +int zfs_recv_queue_length = 16 * 1024 * 1024; static char *dmu_recv_tag = "dmu_recv_tag"; static const char *recv_clone_name = "%recv"; +#define BP_SPAN(datablkszsec, indblkshift, level) \ + (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ + (level) * (indblkshift - SPA_BLKPTRSHIFT))) + +struct send_thread_arg { + bqueue_t q; + dsl_dataset_t *ds; /* Dataset to traverse */ + uint64_t fromtxg; /* Traverse from this txg */ + int flags; /* flags to pass to traverse_dataset */ + int error_code; + boolean_t cancel; +}; + +struct send_block_record { + boolean_t eos_marker; /* Marks the end of the stream */ + blkptr_t bp; + zbookmark_phys_t zb; + uint8_t indblkshift; + uint16_t datablkszsec; + bqueue_node_t ln; +}; + static int dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) { @@ -455,58 +480,116 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) return (B_FALSE); } -#define BP_SPAN(dnp, level) \ - (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ - (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) +/* + * This is the callback function to traverse_dataset that acts as the worker + * thread for dmu_send_impl. + */ +/*ARGSUSED*/ +static int +send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) +{ + struct send_thread_arg *sta = arg; + struct send_block_record *record; + uint64_t record_size; + int err = 0; -/* ARGSUSED */ + if (sta->cancel) + return (SET_ERROR(EINTR)); + + if (bp == NULL) { + ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL); + return (0); + } else if (zb->zb_level < 0) { + return (0); + } + + record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP); + record->eos_marker = B_FALSE; + record->bp = *bp; + record->zb = *zb; + record->indblkshift = dnp->dn_indblkshift; + record->datablkszsec = dnp->dn_datablkszsec; + record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; + bqueue_enqueue(&sta->q, record, record_size); + + return (err); +} + +/* + * This function kicks off the traverse_dataset. It also handles setting the + * error code of the thread in case something goes wrong, and pushes the End of + * Stream record when the traverse_dataset call has finished. If there is no + * dataset to traverse, the thread immediately pushes End of Stream marker. + */ +static void +send_traverse_thread(void *arg) +{ + struct send_thread_arg *st_arg = arg; + int err; + struct send_block_record *data; + + if (st_arg->ds != NULL) { + err = traverse_dataset(st_arg->ds, st_arg->fromtxg, + st_arg->flags, send_cb, arg); + if (err != EINTR) + st_arg->error_code = err; + } + data = kmem_zalloc(sizeof (*data), KM_SLEEP); + data->eos_marker = B_TRUE; + bqueue_enqueue(&st_arg->q, data, 1); + thread_exit(); +} + +/* + * This function actually handles figuring out what kind of record needs to be + * dumped, reading the data (which has hopefully been prefetched), and calling + * the appropriate helper function. + */ static int -backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) { - dmu_sendarg_t *dsp = arg; + dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os); + const blkptr_t *bp = &data->bp; + const zbookmark_phys_t *zb = &data->zb; + uint8_t indblkshift = data->indblkshift; + uint16_t dblkszsec = data->datablkszsec; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; int err = 0; - if (issig(JUSTLOOKING) && issig(FORREAL)) - return (SET_ERROR(EINTR)); + ASSERT3U(zb->zb_level, >=, 0); if (zb->zb_object != DMU_META_DNODE_OBJECT && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { return (0); - } else if (zb->zb_level == ZB_ZIL_LEVEL) { - /* - * If we are sending a non-snapshot (which is allowed on - * read-only pools), it may have a ZIL, which must be ignored. - */ - return (0); } else if (BP_IS_HOLE(bp) && zb->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t span = BP_SPAN(dnp, zb->zb_level); + uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; - err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); + err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT); } else if (BP_IS_HOLE(bp)) { - uint64_t span = BP_SPAN(dnp, zb->zb_level); - err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); + uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); + uint64_t offset = zb->zb_blkid * span; + err = dump_free(dsa, zb->zb_object, offset, span); } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { return (0); } else if (type == DMU_OT_DNODE) { - dnode_phys_t *blk; - int i; int blksz = BP_GET_LSIZE(bp); arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; + ASSERT0(zb->zb_level); + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (SET_ERROR(EIO)); - blk = abuf->b_data; - for (i = 0; i < blksz >> DNODE_SHIFT; i++) { - uint64_t dnobj = (zb->zb_blkid << - (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; - err = dump_dnode(dsp, dnobj, blk+i); + dnode_phys_t *blk = abuf->b_data; + uint64_t dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT); + for (int i = 0; i < blksz >> DNODE_SHIFT; i++) { + err = dump_dnode(dsa, dnobj + i, blk + i); if (err != 0) break; } @@ -521,20 +604,21 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, &aflags, zb) != 0) return (SET_ERROR(EIO)); - err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); + err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data); (void) arc_buf_remove_ref(abuf, &abuf); - } else if (backup_do_embed(dsp, bp)) { + } else if (backup_do_embed(dsa, bp)) { /* it's an embedded level-0 block of a regular object */ - int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; - err = dump_write_embedded(dsp, zb->zb_object, + int blksz = dblkszsec << SPA_MINBLOCKSHIFT; + ASSERT0(zb->zb_level); + err = dump_write_embedded(dsa, zb->zb_object, zb->zb_blkid * blksz, blksz, bp); - } else { /* it's a level-0 block of a regular object */ + } else { + /* it's a level-0 block of a regular object */ arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; - int blksz = BP_GET_LSIZE(bp); + int blksz = dblkszsec << SPA_MINBLOCKSHIFT; uint64_t offset; - ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT0(zb->zb_level); if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, @@ -555,20 +639,20 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, offset = zb->zb_blkid * blksz; - if (!(dsp->dsa_featureflags & + if (!(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && blksz > SPA_OLD_MAXBLOCKSIZE) { char *buf = abuf->b_data; while (blksz > 0 && err == 0) { int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); - err = dump_write(dsp, type, zb->zb_object, + err = dump_write(dsa, type, zb->zb_object, offset, n, NULL, buf); offset += n; buf += n; blksz -= n; } } else { - err = dump_write(dsp, type, zb->zb_object, + err = dump_write(dsa, type, zb->zb_object, offset, blksz, bp, abuf->b_data); } (void) arc_buf_remove_ref(abuf, &abuf); @@ -579,11 +663,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } /* - * Releases dp using the specified tag. + * Pop the new data off the queue, and free the old data. + */ +static struct send_block_record * +get_next_record(bqueue_t *bq, struct send_block_record *data) +{ + struct send_block_record *tmp = bqueue_dequeue(bq); + kmem_free(data, sizeof (*data)); + return (tmp); +} + +/* + * Actually do the bulk of the work in a zfs send. + * + * Note: Releases dp using the specified tag. */ static int -dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, - zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok, +dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, + zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok, #ifdef illumos boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off) #else @@ -596,8 +693,9 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, int err; uint64_t fromtxg = 0; uint64_t featureflags = 0; + struct send_thread_arg to_arg; - err = dmu_objset_from_ds(ds, &os); + err = dmu_objset_from_ds(to_ds, &os); if (err != 0) { dsl_pool_rele(dp, tag); return (err); @@ -623,35 +721,34 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, } #endif - if (large_block_ok && ds->ds_large_blocks) + if (large_block_ok && to_ds->ds_large_blocks) featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; if (embedok && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; - } else { - embedok = B_FALSE; } DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, featureflags); drr->drr_u.drr_begin.drr_creation_time = - dsl_dataset_phys(ds)->ds_creation_time; + dsl_dataset_phys(to_ds)->ds_creation_time; drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); if (is_clone) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; - drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid; - if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) + drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; + if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; - if (fromzb != NULL) { - drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid; - fromtxg = fromzb->zbm_creation_txg; + if (ancestor_zb != NULL) { + drr->drr_u.drr_begin.drr_fromguid = + ancestor_zb->zbm_guid; + fromtxg = ancestor_zb->zbm_creation_txg; } - dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); - if (!ds->ds_is_snapshot) { + dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname); + if (!to_ds->ds_is_snapshot) { (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", sizeof (drr->drr_u.drr_begin.drr_toname)); } @@ -665,16 +762,16 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, dsp->dsa_fp = fp; dsp->dsa_os = os; dsp->dsa_off = off; - dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid; + dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid; dsp->dsa_pending_op = PENDING_NONE; - dsp->dsa_incremental = (fromzb != NULL); + dsp->dsa_incremental = (ancestor_zb != NULL); dsp->dsa_featureflags = featureflags; - mutex_enter(&ds->ds_sendstream_lock); - list_insert_head(&ds->ds_sendstreams, dsp); - mutex_exit(&ds->ds_sendstream_lock); + mutex_enter(&to_ds->ds_sendstream_lock); + list_insert_head(&to_ds->ds_sendstreams, dsp); + mutex_exit(&to_ds->ds_sendstream_lock); - dsl_dataset_long_hold(ds, FTAG); + dsl_dataset_long_hold(to_ds, FTAG); dsl_pool_rele(dp, tag); if (dump_record(dsp, NULL, 0) != 0) { @@ -682,8 +779,41 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, goto out; } - err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, - backup_cb, dsp); + err = bqueue_init(&to_arg.q, zfs_send_queue_length, + offsetof(struct send_block_record, ln)); + to_arg.error_code = 0; + to_arg.cancel = B_FALSE; + to_arg.ds = to_ds; + to_arg.fromtxg = fromtxg; + to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH; + (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc, + TS_RUN, minclsyspri); + + struct send_block_record *to_data; + to_data = bqueue_dequeue(&to_arg.q); + + while (!to_data->eos_marker && err == 0) { + err = do_dump(dsp, to_data); + to_data = get_next_record(&to_arg.q, to_data); + if (issig(JUSTLOOKING) && issig(FORREAL)) + err = EINTR; + } + + if (err != 0) { + to_arg.cancel = B_TRUE; + while (!to_data->eos_marker) { + to_data = get_next_record(&to_arg.q, to_data); + } + } + kmem_free(to_data, sizeof (*to_data)); + + bqueue_destroy(&to_arg.q); + + if (err == 0 && to_arg.error_code != 0) + err = to_arg.error_code; + + if (err != 0) + goto out; if (dsp->dsa_pending_op != PENDING_NONE) if (dump_record(dsp, NULL, 0) != 0) @@ -700,20 +830,18 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; - if (dump_record(dsp, NULL, 0) != 0) { + if (dump_record(dsp, NULL, 0) != 0) err = dsp->dsa_err; - goto out; - } out: - mutex_enter(&ds->ds_sendstream_lock); - list_remove(&ds->ds_sendstreams, dsp); - mutex_exit(&ds->ds_sendstream_lock); + mutex_enter(&to_ds->ds_sendstream_lock); + list_remove(&to_ds->ds_sendstreams, dsp); + mutex_exit(&to_ds->ds_sendstream_lock); kmem_free(drr, sizeof (dmu_replay_record_t)); kmem_free(dsp, sizeof (dmu_sendarg_t)); - dsl_dataset_long_rele(ds, FTAG); + dsl_dataset_long_rele(to_ds, FTAG); return (err); } @@ -1144,7 +1272,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) * If it's a non-clone incremental, we are missing the * target fs, so fail the recv. */ - if (fromguid != 0 && !(flags & DRR_FLAG_CLONE)) + if (fromguid != 0 && !(flags & DRR_FLAG_CLONE || + drba->drba_origin)) return (SET_ERROR(ENOENT)); /* Open the parent of tofs */ @@ -1326,22 +1455,58 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, &drba, 5, ZFS_SPACE_CHECK_NORMAL)); } -struct restorearg { +struct receive_record_arg { + dmu_replay_record_t header; + void *payload; /* Pointer to a buffer containing the payload */ + /* + * If the record is a write, pointer to the arc_buf_t containing the + * payload. + */ + arc_buf_t *write_buf; + int payload_size; + boolean_t eos_marker; /* Marks the end of the stream */ + bqueue_node_t node; +}; + +struct receive_writer_arg { objset_t *os; - int err; boolean_t byteswap; + bqueue_t q; + /* + * These three args are used to signal to the main thread that we're + * done. + */ + kmutex_t mutex; + kcondvar_t cv; + boolean_t done; + int err; + /* A map from guid to dataset to help handle dedup'd streams. */ + avl_tree_t *guid_to_ds_map; +}; + +struct receive_arg { + objset_t *os; kthread_t *td; struct file *fp; - uint64_t voff; - int bufsize; /* amount of memory allocated for buf */ - - dmu_replay_record_t *drr; - dmu_replay_record_t *next_drr; - char *buf; + uint64_t voff; /* The current offset in the stream */ + /* + * A record that has had its payload read in, but hasn't yet been handed + * off to the worker thread. + */ + struct receive_record_arg *rrd; + /* A record that has had its header read in, but not its payload. */ + struct receive_record_arg *next_rrd; zio_cksum_t cksum; zio_cksum_t prev_cksum; + int err; + boolean_t byteswap; + /* Sorted list of objects not to issue prefetches for. */ + list_t ignore_obj_list; +}; - avl_tree_t *guid_to_ds_map; +struct receive_ign_obj_node { + list_node_t node; + uint64_t object; }; typedef struct guid_map_entry { @@ -1380,7 +1545,7 @@ free_guid_map_onexit(void *arg) } static int -restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid) +restore_bytes(struct receive_arg *ra, void *buf, int len, off_t off, ssize_t *resid) { struct uio auio; struct iovec aiov; @@ -1406,13 +1571,12 @@ restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *res } static int -restore_read(struct restorearg *ra, int len, void *buf) +receive_read(struct receive_arg *ra, int len, void *buf) { int done = 0; /* some things will require 8-byte alignment, so everything must */ ASSERT0(len % 8); - ASSERT3U(len, <=, ra->bufsize); while (done < len) { ssize_t resid; @@ -1529,7 +1693,8 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) } static int -restore_object(struct restorearg *ra, struct drr_object *drro, void *data) +receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, + void *data) { dmu_object_info_t doi; dmu_tx_t *tx; @@ -1543,12 +1708,12 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || - drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(ra->os)) || + drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || drro->drr_bonuslen > DN_MAX_BONUSLEN) { return (SET_ERROR(EINVAL)); } - err = dmu_object_info(ra->os, drro->drr_object, &doi); + err = dmu_object_info(rwa->os, drro->drr_object, &doi); if (err != 0 && err != ENOENT) return (SET_ERROR(EINVAL)); @@ -1567,14 +1732,14 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) if (drro->drr_blksz != doi.doi_data_block_size || nblkptr < doi.doi_nblkptr) { - err = dmu_free_long_range(ra->os, drro->drr_object, + err = dmu_free_long_range(rwa->os, drro->drr_object, 0, DMU_OBJECT_END); if (err != 0) return (SET_ERROR(EINVAL)); } } - tx = dmu_tx_create(ra->os); + tx = dmu_tx_create(rwa->os); dmu_tx_hold_bonus(tx, object); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { @@ -1584,7 +1749,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) if (object == DMU_NEW_OBJECT) { /* currently free, want to be allocated */ - err = dmu_object_claim(ra->os, drro->drr_object, + err = dmu_object_claim(rwa->os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, tx); } else if (drro->drr_type != doi.doi_type || @@ -1592,7 +1757,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) drro->drr_bonustype != doi.doi_bonus_type || drro->drr_bonuslen != doi.doi_bonus_size) { /* currently allocated, but with different properties */ - err = dmu_object_reclaim(ra->os, drro->drr_object, + err = dmu_object_reclaim(rwa->os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, tx); } @@ -1601,20 +1766,20 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) return (SET_ERROR(EINVAL)); } - dmu_object_set_checksum(ra->os, drro->drr_object, + dmu_object_set_checksum(rwa->os, drro->drr_object, drro->drr_checksumtype, tx); - dmu_object_set_compress(ra->os, drro->drr_object, + dmu_object_set_compress(rwa->os, drro->drr_object, drro->drr_compress, tx); if (data != NULL) { dmu_buf_t *db; - VERIFY0(dmu_bonus_hold(ra->os, drro->drr_object, FTAG, &db)); + VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); bcopy(data, db->db_data, drro->drr_bonuslen); - if (ra->byteswap) { + if (rwa->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drro->drr_bonustype); dmu_ot_byteswap[byteswap].ob_func(db->db_data, @@ -1628,7 +1793,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) /* ARGSUSED */ static int -restore_freeobjects(struct restorearg *ra, +receive_freeobjects(struct receive_writer_arg *rwa, struct drr_freeobjects *drrfo) { uint64_t obj; @@ -1638,13 +1803,13 @@ restore_freeobjects(struct restorearg *ra, for (obj = drrfo->drr_firstobj; obj < drrfo->drr_firstobj + drrfo->drr_numobjs; - (void) dmu_object_next(ra->os, &obj, FALSE, 0)) { + (void) dmu_object_next(rwa->os, &obj, FALSE, 0)) { int err; - if (dmu_object_info(ra->os, obj, NULL) != 0) + if (dmu_object_info(rwa->os, obj, NULL) != 0) continue; - err = dmu_free_long_object(ra->os, obj); + err = dmu_free_long_object(rwa->os, obj); if (err != 0) return (err); } @@ -1652,7 +1817,8 @@ restore_freeobjects(struct restorearg *ra, } static int -restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) +receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, + arc_buf_t *abuf) { dmu_tx_t *tx; int err; @@ -1661,10 +1827,10 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); - if (dmu_object_info(ra->os, drrw->drr_object, NULL) != 0) + if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); - tx = dmu_tx_create(ra->os); + tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrw->drr_object, drrw->drr_offset, drrw->drr_length); @@ -1673,7 +1839,7 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) dmu_tx_abort(tx); return (err); } - if (ra->byteswap) { + if (rwa->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drrw->drr_type); dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, @@ -1681,7 +1847,7 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) } dmu_buf_t *bonus; - if (dmu_bonus_hold(ra->os, drrw->drr_object, FTAG, &bonus) != 0) + if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) return (SET_ERROR(EINVAL)); dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); dmu_tx_commit(tx); @@ -1697,7 +1863,8 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) * data from the stream to fulfill this write. */ static int -restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) +receive_write_byref(struct receive_writer_arg *rwa, + struct drr_write_byref *drrwbr) { dmu_tx_t *tx; int err; @@ -1716,14 +1883,14 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) */ if (drrwbr->drr_toguid != drrwbr->drr_refguid) { gmesrch.guid = drrwbr->drr_refguid; - if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, + if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch, &where)) == NULL) { return (SET_ERROR(EINVAL)); } if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) return (SET_ERROR(EINVAL)); } else { - ref_os = ra->os; + ref_os = rwa->os; } err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, @@ -1731,7 +1898,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) if (err != 0) return (err); - tx = dmu_tx_create(ra->os); + tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length); @@ -1740,7 +1907,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) dmu_tx_abort(tx); return (err); } - dmu_write(ra->os, drrwbr->drr_object, + dmu_write(rwa->os, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); dmu_buf_rele(dbp, FTAG); dmu_tx_commit(tx); @@ -1748,7 +1915,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) } static int -restore_write_embedded(struct restorearg *ra, +receive_write_embedded(struct receive_writer_arg *rwa, struct drr_write_embedded *drrwnp, void *data) { dmu_tx_t *tx; @@ -1765,7 +1932,7 @@ restore_write_embedded(struct restorearg *ra, if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (EINVAL); - tx = dmu_tx_create(ra->os); + tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrwnp->drr_object, drrwnp->drr_offset, drrwnp->drr_length); @@ -1775,36 +1942,37 @@ restore_write_embedded(struct restorearg *ra, return (err); } - dmu_write_embedded(ra->os, drrwnp->drr_object, + dmu_write_embedded(rwa->os, drrwnp->drr_object, drrwnp->drr_offset, data, drrwnp->drr_etype, drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, - ra->byteswap ^ ZFS_HOST_BYTEORDER, tx); + rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); dmu_tx_commit(tx); return (0); } static int -restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data) +receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, + void *data) { dmu_tx_t *tx; dmu_buf_t *db, *db_spill; int err; if (drrs->drr_length < SPA_MINBLOCKSIZE || - drrs->drr_length > spa_maxblocksize(dmu_objset_spa(ra->os))) + drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) return (SET_ERROR(EINVAL)); - if (dmu_object_info(ra->os, drrs->drr_object, NULL) != 0) + if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); - VERIFY0(dmu_bonus_hold(ra->os, drrs->drr_object, FTAG, &db)); + VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { dmu_buf_rele(db, FTAG); return (err); } - tx = dmu_tx_create(ra->os); + tx = dmu_tx_create(rwa->os); dmu_tx_hold_spill(tx, db->db_object); @@ -1831,7 +1999,7 @@ restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data) /* ARGSUSED */ static int -restore_free(struct restorearg *ra, struct drr_free *drrf) +receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) { int err; @@ -1839,11 +2007,12 @@ restore_free(struct restorearg *ra, struct drr_free *drrf) drrf->drr_offset + drrf->drr_length < drrf->drr_offset) return (SET_ERROR(EINVAL)); - if (dmu_object_info(ra->os, drrf->drr_object, NULL) != 0) + if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); - err = dmu_free_long_range(ra->os, drrf->drr_object, + err = dmu_free_long_range(rwa->os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); + return (err); } @@ -1858,7 +2027,7 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) } static void -restore_cksum(struct restorearg *ra, int len, void *buf) +receive_cksum(struct receive_arg *ra, int len, void *buf) { if (ra->byteswap) { fletcher_4_incremental_byteswap(buf, len, &ra->cksum); @@ -1868,30 +2037,42 @@ restore_cksum(struct restorearg *ra, int len, void *buf) } /* - * If len != 0, read payload into buf. - * Read next record's header into ra->next_drr. + * Read the payload into a buffer of size len, and update the current record's + * payload field. + * Allocate ra->next_rrd and read the next record's header into + * ra->next_rrd->header. * Verify checksum of payload and next record. */ static int -restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf) +receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) { int err; if (len != 0) { - ASSERT3U(len, <=, ra->bufsize); - err = restore_read(ra, len, buf); + ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); + ra->rrd->payload = buf; + ra->rrd->payload_size = len; + err = receive_read(ra, len, ra->rrd->payload); if (err != 0) return (err); - restore_cksum(ra, len, buf); + receive_cksum(ra, len, ra->rrd->payload); } ra->prev_cksum = ra->cksum; - err = restore_read(ra, sizeof (*ra->next_drr), ra->next_drr); - if (err != 0) + ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); + err = receive_read(ra, sizeof (ra->next_rrd->header), + &ra->next_rrd->header); + if (err != 0) { + kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); + ra->next_rrd = NULL; return (err); - if (ra->next_drr->drr_type == DRR_BEGIN) + } + if (ra->next_rrd->header.drr_type == DRR_BEGIN) { + kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); + ra->next_rrd = NULL; return (SET_ERROR(EINVAL)); + } /* * Note: checksum is of everything up to but not including the @@ -1899,107 +2080,180 @@ restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf) */ ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); - restore_cksum(ra, + receive_cksum(ra, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), - ra->next_drr); + &ra->next_rrd->header); - zio_cksum_t cksum_orig = ra->next_drr->drr_u.drr_checksum.drr_checksum; - zio_cksum_t *cksump = &ra->next_drr->drr_u.drr_checksum.drr_checksum; + zio_cksum_t cksum_orig = + ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; + zio_cksum_t *cksump = + &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; if (ra->byteswap) - byteswap_record(ra->next_drr); + byteswap_record(&ra->next_rrd->header); if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && - !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) + !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) { + kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); + ra->next_rrd = NULL; return (SET_ERROR(ECKSUM)); + } - restore_cksum(ra, sizeof (cksum_orig), &cksum_orig); + receive_cksum(ra, sizeof (cksum_orig), &cksum_orig); return (0); } +/* + * Issue the prefetch reads for any necessary indirect blocks. + * + * We use the object ignore list to tell us whether or not to issue prefetches + * for a given object. We do this for both correctness (in case the blocksize + * of an object has changed) and performance (if the object doesn't exist, don't + * needlessly try to issue prefetches). We also trim the list as we go through + * the stream to prevent it from growing to an unbounded size. + * + * The object numbers within will always be in sorted order, and any write + * records we see will also be in sorted order, but they're not sorted with + * respect to each other (i.e. we can get several object records before + * receiving each object's write records). As a result, once we've reached a + * given object number, we can safely remove any reference to lower object + * numbers in the ignore list. In practice, we receive up to 32 object records + * before receiving write records, so the list can have up to 32 nodes in it. + */ +/* ARGSUSED */ +static void +receive_read_prefetch(struct receive_arg *ra, + uint64_t object, uint64_t offset, uint64_t length) +{ + struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list); + while (node != NULL && node->object < object) { + VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list)); + kmem_free(node, sizeof (*node)); + node = list_head(&ra->ignore_obj_list); + } + if (node == NULL || node->object > object) { + dmu_prefetch(ra->os, object, 1, offset, length, + ZIO_PRIORITY_SYNC_READ); + } +} + +/* + * Read records off the stream, issuing any necessary prefetches. + */ static int -restore_process_record(struct restorearg *ra) +receive_read_record(struct receive_arg *ra) { int err; - switch (ra->drr->drr_type) { + switch (ra->rrd->header.drr_type) { case DRR_OBJECT: { - struct drr_object *drro = &ra->drr->drr_u.drr_object; - err = restore_read_payload_and_next_header(ra, - P2ROUNDUP(drro->drr_bonuslen, 8), ra->buf); - if (err != 0) + struct drr_object *drro = &ra->rrd->header.drr_u.drr_object; + uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8); + void *buf = kmem_zalloc(size, KM_SLEEP); + dmu_object_info_t doi; + err = receive_read_payload_and_next_header(ra, size, buf); + if (err != 0) { + kmem_free(buf, size); return (err); - return (restore_object(ra, drro, ra->buf)); + } + err = dmu_object_info(ra->os, drro->drr_object, &doi); + /* + * See receive_read_prefetch for an explanation why we're + * storing this object in the ignore_obj_list. + */ + if (err == ENOENT || + (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { + struct receive_ign_obj_node *node = + kmem_zalloc(sizeof (*node), + KM_SLEEP); + node->object = drro->drr_object; +#ifdef ZFS_DEBUG + struct receive_ign_obj_node *last_object = + list_tail(&ra->ignore_obj_list); + uint64_t last_objnum = (last_object != NULL ? + last_object->object : 0); + ASSERT3U(node->object, >, last_objnum); +#endif + list_insert_tail(&ra->ignore_obj_list, node); + err = 0; + } + return (err); } case DRR_FREEOBJECTS: { - struct drr_freeobjects *drrfo = - &ra->drr->drr_u.drr_freeobjects; - err = restore_read_payload_and_next_header(ra, 0, NULL); - if (err != 0) - return (err); - return (restore_freeobjects(ra, drrfo)); + err = receive_read_payload_and_next_header(ra, 0, NULL); + return (err); } case DRR_WRITE: { - struct drr_write *drrw = &ra->drr->drr_u.drr_write; + struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write; arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os), drrw->drr_length); - err = restore_read_payload_and_next_header(ra, + err = receive_read_payload_and_next_header(ra, drrw->drr_length, abuf->b_data); - if (err != 0) - return (err); - err = restore_write(ra, drrw, abuf); - /* if restore_write() is successful, it consumes the arc_buf */ - if (err != 0) + if (err != 0) { dmu_return_arcbuf(abuf); + return (err); + } + ra->rrd->write_buf = abuf; + receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset, + drrw->drr_length); return (err); } case DRR_WRITE_BYREF: { - struct drr_write_byref *drrwbr = - &ra->drr->drr_u.drr_write_byref; - err = restore_read_payload_and_next_header(ra, 0, NULL); - if (err != 0) - return (err); - return (restore_write_byref(ra, drrwbr)); + struct drr_write_byref *drrwb = + &ra->rrd->header.drr_u.drr_write_byref; + err = receive_read_payload_and_next_header(ra, 0, NULL); + receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset, + drrwb->drr_length); + return (err); } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = - &ra->drr->drr_u.drr_write_embedded; - err = restore_read_payload_and_next_header(ra, - P2ROUNDUP(drrwe->drr_psize, 8), ra->buf); - if (err != 0) + &ra->rrd->header.drr_u.drr_write_embedded; + uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8); + void *buf = kmem_zalloc(size, KM_SLEEP); + + err = receive_read_payload_and_next_header(ra, size, buf); + if (err != 0) { + kmem_free(buf, size); return (err); - return (restore_write_embedded(ra, drrwe, ra->buf)); + } + + receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset, + drrwe->drr_length); + return (err); } case DRR_FREE: { - struct drr_free *drrf = &ra->drr->drr_u.drr_free; - err = restore_read_payload_and_next_header(ra, 0, NULL); - if (err != 0) - return (err); - return (restore_free(ra, drrf)); + /* + * It might be beneficial to prefetch indirect blocks here, but + * we don't really have the data to decide for sure. + */ + err = receive_read_payload_and_next_header(ra, 0, NULL); + return (err); } case DRR_END: { - struct drr_end *drre = &ra->drr->drr_u.drr_end; + struct drr_end *drre = &ra->rrd->header.drr_u.drr_end; if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) return (SET_ERROR(EINVAL)); return (0); } case DRR_SPILL: { - struct drr_spill *drrs = &ra->drr->drr_u.drr_spill; - err = restore_read_payload_and_next_header(ra, - drrs->drr_length, ra->buf); + struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill; + void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP); + err = receive_read_payload_and_next_header(ra, drrs->drr_length, + buf); if (err != 0) - return (err); - return (restore_spill(ra, drrs, ra->buf)); + kmem_free(buf, drrs->drr_length); + return (err); } default: return (SET_ERROR(EINVAL)); @@ -2007,6 +2261,119 @@ restore_process_record(struct restorearg *ra) } /* + * Commit the records to the pool. + */ +static int +receive_process_record(struct receive_writer_arg *rwa, + struct receive_record_arg *rrd) +{ + int err; + + switch (rrd->header.drr_type) { + case DRR_OBJECT: + { + struct drr_object *drro = &rrd->header.drr_u.drr_object; + err = receive_object(rwa, drro, rrd->payload); + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + return (err); + } + case DRR_FREEOBJECTS: + { + struct drr_freeobjects *drrfo = + &rrd->header.drr_u.drr_freeobjects; + return (receive_freeobjects(rwa, drrfo)); + } + case DRR_WRITE: + { + struct drr_write *drrw = &rrd->header.drr_u.drr_write; + err = receive_write(rwa, drrw, rrd->write_buf); + /* if receive_write() is successful, it consumes the arc_buf */ + if (err != 0) + dmu_return_arcbuf(rrd->write_buf); + rrd->write_buf = NULL; + rrd->payload = NULL; + return (err); + } + case DRR_WRITE_BYREF: + { + struct drr_write_byref *drrwbr = + &rrd->header.drr_u.drr_write_byref; + return (receive_write_byref(rwa, drrwbr)); + } + case DRR_WRITE_EMBEDDED: + { + struct drr_write_embedded *drrwe = + &rrd->header.drr_u.drr_write_embedded; + err = receive_write_embedded(rwa, drrwe, rrd->payload); + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + return (err); + } + case DRR_FREE: + { + struct drr_free *drrf = &rrd->header.drr_u.drr_free; + return (receive_free(rwa, drrf)); + } + case DRR_SPILL: + { + struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; + err = receive_spill(rwa, drrs, rrd->payload); + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + return (err); + } + default: + return (SET_ERROR(EINVAL)); + } +} + +/* + * dmu_recv_stream's worker thread; pull records off the queue, and then call + * receive_process_record When we're done, signal the main thread and exit. + */ +static void +receive_writer_thread(void *arg) +{ + struct receive_writer_arg *rwa = arg; + struct receive_record_arg *rrd; + for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker; + rrd = bqueue_dequeue(&rwa->q)) { + /* + * If there's an error, the main thread will stop putting things + * on the queue, but we need to clear everything in it before we + * can exit. + */ + if (rwa->err == 0) { + rwa->err = receive_process_record(rwa, rrd); + } else if (rrd->write_buf != NULL) { + dmu_return_arcbuf(rrd->write_buf); + rrd->write_buf = NULL; + rrd->payload = NULL; + } else if (rrd->payload != NULL) { + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + } + kmem_free(rrd, sizeof (*rrd)); + } + kmem_free(rrd, sizeof (*rrd)); + mutex_enter(&rwa->mutex); + rwa->done = B_TRUE; + cv_signal(&rwa->cv); + mutex_exit(&rwa->mutex); + thread_exit(); +} + +/* + * Read in the stream's records, one by one, and apply them to the pool. There + * are two threads involved; the thread that calls this function will spin up a + * worker thread, read the records off the stream one by one, and issue + * prefetches for any necessary indirect blocks. It will then push the records + * onto an internal blocking queue. The worker thread will pull the records off + * the queue, and actually write the data into the DMU. This way, the worker + * thread doesn't have to wait for reads to complete, since everything it needs + * (the indirect blocks) will be prefetched. + * * NB: callers *must* call dmu_recv_end() if this succeeds. */ int @@ -2014,7 +2381,8 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, int cleanup_fd, uint64_t *action_handlep) { int err = 0; - struct restorearg ra = { 0 }; + struct receive_arg ra = { 0 }; + struct receive_writer_arg rwa = { 0 }; int featureflags; ra.byteswap = drc->drc_byteswap; @@ -2022,10 +2390,8 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, ra.td = curthread; ra.fp = fp; ra.voff = *voffp; - ra.bufsize = SPA_MAXBLOCKSIZE; - ra.drr = kmem_alloc(sizeof (*ra.drr), KM_SLEEP); - ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); - ra.next_drr = kmem_alloc(sizeof (*ra.next_drr), KM_SLEEP); + list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node), + offsetof(struct receive_ign_obj_node, node)); /* these were verified in dmu_recv_begin */ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, @@ -2056,48 +2422,92 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, } if (*action_handlep == 0) { - ra.guid_to_ds_map = + rwa.guid_to_ds_map = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); - avl_create(ra.guid_to_ds_map, guid_compare, + avl_create(rwa.guid_to_ds_map, guid_compare, sizeof (guid_map_entry_t), offsetof(guid_map_entry_t, avlnode)); err = zfs_onexit_add_cb(minor, - free_guid_map_onexit, ra.guid_to_ds_map, + free_guid_map_onexit, rwa.guid_to_ds_map, action_handlep); if (ra.err != 0) goto out; } else { err = zfs_onexit_cb_data(minor, *action_handlep, - (void **)&ra.guid_to_ds_map); + (void **)&rwa.guid_to_ds_map); if (ra.err != 0) goto out; } - drc->drc_guid_to_ds_map = ra.guid_to_ds_map; + drc->drc_guid_to_ds_map = rwa.guid_to_ds_map; } - err = restore_read_payload_and_next_header(&ra, 0, NULL); - if (err != 0) + err = receive_read_payload_and_next_header(&ra, 0, NULL); + if (err) goto out; - for (;;) { - void *tmp; + (void) bqueue_init(&rwa.q, zfs_recv_queue_length, + offsetof(struct receive_record_arg, node)); + cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL); + mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL); + rwa.os = ra.os; + rwa.byteswap = drc->drc_byteswap; + + (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, curproc, + TS_RUN, minclsyspri); + /* + * We're reading rwa.err without locks, which is safe since we are the + * only reader, and the worker thread is the only writer. It's ok if we + * miss a write for an iteration or two of the loop, since the writer + * thread will keep freeing records we send it until we send it an eos + * marker. + * + * We can leave this loop in 3 ways: First, if rwa.err is + * non-zero. In that case, the writer thread will free the rrd we just + * pushed. Second, if we're interrupted; in that case, either it's the + * first loop and ra.rrd was never allocated, or it's later, and ra.rrd + * has been handed off to the writer thread who will free it. Finally, + * if receive_read_record fails or we're at the end of the stream, then + * we free ra.rrd and exit. + */ + while (rwa.err == 0) { if (issig(JUSTLOOKING) && issig(FORREAL)) { err = SET_ERROR(EINTR); break; } - tmp = ra.next_drr; - ra.next_drr = ra.drr; - ra.drr = tmp; + ASSERT3P(ra.rrd, ==, NULL); + ra.rrd = ra.next_rrd; + ra.next_rrd = NULL; + /* Allocates and loads header into ra.next_rrd */ + err = receive_read_record(&ra); - /* process ra.drr, read in ra.next_drr */ - err = restore_process_record(&ra); - if (err != 0) - break; - if (ra.drr->drr_type == DRR_END) + if (ra.rrd->header.drr_type == DRR_END || err != 0) { + kmem_free(ra.rrd, sizeof (*ra.rrd)); + ra.rrd = NULL; break; + } + + bqueue_enqueue(&rwa.q, ra.rrd, + sizeof (struct receive_record_arg) + ra.rrd->payload_size); + ra.rrd = NULL; + } + if (ra.next_rrd == NULL) + ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP); + ra.next_rrd->eos_marker = B_TRUE; + bqueue_enqueue(&rwa.q, ra.next_rrd, 1); + + mutex_enter(&rwa.mutex); + while (!rwa.done) { + cv_wait(&rwa.cv, &rwa.mutex); } + mutex_exit(&rwa.mutex); + + cv_destroy(&rwa.cv); + mutex_destroy(&rwa.mutex); + bqueue_destroy(&rwa.q); + if (err == 0) + err = rwa.err; out: if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) @@ -2111,10 +2521,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, dmu_recv_cleanup_ds(drc); } - kmem_free(ra.drr, sizeof (*ra.drr)); - kmem_free(ra.buf, ra.bufsize); - kmem_free(ra.next_drr, sizeof (*ra.next_drr)); *voffp = ra.voff; + for (struct receive_ign_obj_node *n = + list_remove_head(&ra.ignore_obj_list); n != NULL; + n = list_remove_head(&ra.ignore_obj_list)) { + kmem_free(n, sizeof (*n)); + } + list_destroy(&ra.ignore_obj_list); return (err); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c index e246c493a..151d04c0f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -158,7 +158,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, * If we already visited this bp & everything below, * don't bother doing it again. */ - if (zbookmark_is_before(dnp, zb, td->td_resume)) + if (zbookmark_subtree_completed(dnp, zb, td->td_resume)) return (RESUME_SKIP_ALL); /* @@ -425,6 +425,17 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, int j, err = 0; zbookmark_phys_t czb; + if (td->td_flags & TRAVERSE_PRE) { + SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, + ZB_DNODE_BLKID); + err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, + td->td_arg); + if (err == TRAVERSE_VISIT_NO_CHILDREN) + return (0); + if (err != 0) + return (err); + } + for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); @@ -432,10 +443,21 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, break; } - if (err == 0 && dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); } + + if (err == 0 && (td->td_flags & TRAVERSE_POST)) { + SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, + ZB_DNODE_BLKID); + err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, + td->td_arg); + if (err == TRAVERSE_VISIT_NO_CHILDREN) + return (0); + if (err != 0) + return (err); + } return (err); } @@ -448,6 +470,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; ASSERT(pfd->pd_bytes_fetched >= 0); + if (bp == NULL) + return (0); if (pfd->pd_cancel) return (SET_ERROR(EINTR)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c index dff9fabd1..65a017f36 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -315,7 +315,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db); + err = dbuf_hold_impl(dn, 0, start, + FALSE, FALSE, FTAG, &db); rw_exit(&dn->dn_struct_rwlock); if (err) { @@ -516,7 +517,8 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) blkoff = P2PHASE(blkid, epb); tochk = MIN(epb - blkoff, nblks); - err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf); + err = dbuf_hold_impl(dn, 1, blkid >> epbs, + FALSE, FALSE, FTAG, &dbuf); if (err) { txh->txh_tx->tx_err = err; break; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c index 2804d7a28..97bdcc09f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c @@ -309,7 +309,8 @@ dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks) fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); for (i = 0; i < fetchsz; i++) { - dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ); + dbuf_prefetch(dn, 0, blkid + i, ZIO_PRIORITY_ASYNC_READ, + ARC_FLAG_PREFETCH); } return (fetchsz); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c index 4af50f56a..697a45d4e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -1116,7 +1116,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, drop_struct_lock = TRUE; } - blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t)); + blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t)); db = dbuf_hold(mdn, blk, FTAG); if (drop_struct_lock) @@ -1413,7 +1413,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) goto fail; /* resize the old block */ - err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db); + err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); if (err == 0) dbuf_new_size(db, size, tx); else if (err != ENOENT) @@ -1586,8 +1586,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) ASSERT3U(blkoff + head, ==, blksz); if (len < head) head = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, - FTAG, &db) == 0) { + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), + TRUE, FALSE, FTAG, &db) == 0) { caddr_t data; /* don't dirty if it isn't on disk and isn't dirty */ @@ -1624,8 +1624,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) if (tail) { if (len < tail) tail = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), - TRUE, FTAG, &db) == 0) { + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), + TRUE, FALSE, FTAG, &db) == 0) { /* don't dirty if not on disk and not dirty */ if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { @@ -1854,7 +1854,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) */ static int dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, - int lvl, uint64_t blkfill, uint64_t txg) + int lvl, uint64_t blkfill, uint64_t txg) { dmu_buf_impl_t *db = NULL; void *data = NULL; @@ -1876,8 +1876,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, epb = dn->dn_phys->dn_nblkptr; data = dn->dn_phys->dn_blkptr; } else { - uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); - error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); + uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); + error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); if (error) { if (error != ENOENT) return (error); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c index 06336045f..078788522 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -188,7 +188,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) rw_enter(&dn->dn_struct_rwlock, RW_READER); err = dbuf_hold_impl(dn, db->db_level-1, - (db->db_blkid << epbs) + i, TRUE, FTAG, &child); + (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child); rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) continue; @@ -284,7 +284,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); VERIFY0(dbuf_hold_impl(dn, db->db_level - 1, - i, B_TRUE, FTAG, &subdb)); + i, TRUE, FALSE, FTAG, &subdb)); rw_exit(&dn->dn_struct_rwlock); ASSERT3P(bp, ==, subdb->db_blkptr); @@ -357,7 +357,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, - TRUE, FTAG, &db)); + TRUE, FALSE, FTAG, &db)); rw_exit(&dn->dn_struct_rwlock); free_children(db, blkid, nblks, tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index 551e35b1c..47cac8b15 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -540,6 +540,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, const char *snapname; uint64_t obj; int err = 0; + dsl_dataset_t *ds; err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); if (err != 0) @@ -548,36 +549,37 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, ASSERT(dsl_pool_config_held(dp)); obj = dsl_dir_phys(dd)->dd_head_dataset_obj; if (obj != 0) - err = dsl_dataset_hold_obj(dp, obj, tag, dsp); + err = dsl_dataset_hold_obj(dp, obj, tag, &ds); else err = SET_ERROR(ENOENT); /* we may be looking for a snapshot */ if (err == 0 && snapname != NULL) { - dsl_dataset_t *ds; + dsl_dataset_t *snap_ds; if (*snapname++ != '@') { - dsl_dataset_rele(*dsp, tag); + dsl_dataset_rele(ds, tag); dsl_dir_rele(dd, FTAG); return (SET_ERROR(ENOENT)); } dprintf("looking for snapshot '%s'\n", snapname); - err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); + err = dsl_dataset_snap_lookup(ds, snapname, &obj); if (err == 0) - err = dsl_dataset_hold_obj(dp, obj, tag, &ds); - dsl_dataset_rele(*dsp, tag); + err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds); + dsl_dataset_rele(ds, tag); if (err == 0) { - mutex_enter(&ds->ds_lock); - if (ds->ds_snapname[0] == 0) - (void) strlcpy(ds->ds_snapname, snapname, - sizeof (ds->ds_snapname)); - mutex_exit(&ds->ds_lock); - *dsp = ds; + mutex_enter(&snap_ds->ds_lock); + if (snap_ds->ds_snapname[0] == 0) + (void) strlcpy(snap_ds->ds_snapname, snapname, + sizeof (snap_ds->ds_snapname)); + mutex_exit(&snap_ds->ds_lock); + ds = snap_ds; } } - + if (err == 0) + *dsp = ds; dsl_dir_rele(dd, FTAG); return (err); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c index 7f9046957..40e34cbf9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c @@ -552,7 +552,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, struct killarg *ka = arg; dmu_tx_t *tx = ka->tx; - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (zb->zb_level == ZB_ZIL_LEVEL) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c index b11452137..06cfcede0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c @@ -609,7 +609,8 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, * If we already visited this bp & everything below (in * a prior txg sync), don't bother doing it again. */ - if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) + if (zbookmark_subtree_completed(dnp, zb, + &scn->scn_phys.scn_bookmark)) return (B_TRUE); /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index a998de177..019aadaa8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -1940,7 +1940,7 @@ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); /* * Note: normally this routine will not be called if diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c index aeac124df..1ea829f38 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c @@ -80,8 +80,8 @@ space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) mutex_exit(sm->sm_lock); if (end > bufsize) { - dmu_prefetch(sm->sm_os, space_map_object(sm), bufsize, - end - bufsize); + dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize, + end - bufsize, ZIO_PRIORITY_SYNC_READ); } mutex_enter(sm->sm_lock); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h new file mode 100644 index 000000000..63722df1b --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h @@ -0,0 +1,54 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + +#ifndef _BQUEUE_H +#define _BQUEUE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +typedef struct bqueue { + list_t bq_list; + kmutex_t bq_lock; + kcondvar_t bq_add_cv; + kcondvar_t bq_pop_cv; + uint64_t bq_size; + uint64_t bq_maxsize; + size_t bq_node_offset; +} bqueue_t; + +typedef struct bqueue_node { + list_node_t bqn_node; + uint64_t bqn_size; +} bqueue_node_t; + + +int bqueue_init(bqueue_t *, uint64_t, size_t); +void bqueue_destroy(bqueue_t *); +void bqueue_enqueue(bqueue_t *, void *, uint64_t); +void *bqueue_dequeue(bqueue_t *); +boolean_t bqueue_empty(bqueue_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _BQUEUE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h index 2e07185f4..482ccb01d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h @@ -245,8 +245,7 @@ typedef struct dbuf_hash_table { kmutex_t hash_mutexes[DBUF_MUTEXES]; } dbuf_hash_table_t; - -uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); +uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset); dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); void dbuf_create_bonus(struct dnode *dn); @@ -258,10 +257,12 @@ void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx); dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, void *tag); -int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, +int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, + boolean_t fail_sparse, boolean_t fail_uncached, void *tag, dmu_buf_impl_t **dbp); -void dbuf_prefetch(struct dnode *dn, uint64_t blkid, zio_priority_t prio); +void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid, + zio_priority_t prio, arc_flags_t aflags); void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h index 07886faf0..af68eb146 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -45,6 +45,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -748,8 +749,8 @@ extern int zfs_max_recordsize; /* * Asynchronously try to read in the data. */ -void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, - uint64_t len); +void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, + uint64_t len, enum zio_priority pri); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h index 7d490ec60..81757b521 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index fea7db574..ad352734e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -29,6 +29,7 @@ #ifndef _ZIO_H #define _ZIO_H +#include #include #include #include @@ -144,18 +145,6 @@ enum zio_compress { #define ZIO_FAILURE_MODE_CONTINUE 1 #define ZIO_FAILURE_MODE_PANIC 2 -typedef enum zio_priority { - ZIO_PRIORITY_SYNC_READ, - ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ - ZIO_PRIORITY_ASYNC_READ, /* prefetch */ - ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ - ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ - ZIO_PRIORITY_TRIM, /* free requests used for TRIM */ - ZIO_PRIORITY_NUM_QUEUEABLE, - - ZIO_PRIORITY_NOW /* non-queued I/Os (e.g. ioctl) */ -} zio_priority_t; - #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 @@ -263,6 +252,7 @@ extern const char *zio_type_name[ZIO_TYPES]; * Root blocks (objset_phys_t) are object 0, level -1: . * ZIL blocks are bookmarked . * dmu_sync()ed ZIL data blocks are bookmarked . + * dnode visit bookmarks are . * * Note: this structure is called a bookmark because its original purpose * was to remember where to resume a pool-wide traverse. @@ -295,6 +285,9 @@ typedef struct zbookmark_phys { #define ZB_ZIL_OBJECT (0ULL) #define ZB_ZIL_LEVEL (-2LL) +#define ZB_DNODE_LEVEL (-3LL) +#define ZB_DNODE_BLKID (0ULL) + #define ZB_IS_ZERO(zb) \ ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \ (zb)->zb_level == 0 && (zb)->zb_blkid == 0) @@ -636,8 +629,10 @@ extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, extern void spa_handle_ignored_writes(spa_t *spa); /* zbookmark_phys functions */ -boolean_t zbookmark_is_before(const struct dnode_phys *dnp, - const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); +boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp, + const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); +int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, + uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h index a921a2f13..0c293ab20 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h @@ -44,7 +44,7 @@ typedef struct zio_checksum_info { zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */ int ci_correctable; /* number of correctable bits */ int ci_eck; /* uses zio embedded checksum? */ - int ci_dedup; /* strong enough for dedup? */ + boolean_t ci_dedup; /* strong enough for dedup? */ char *ci_name; /* descriptive name */ } zio_checksum_info_t; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h new file mode 100644 index 000000000..32e90e2fb --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ +#ifndef _ZIO_PRIORITY_H +#define _ZIO_PRIORITY_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum zio_priority { + ZIO_PRIORITY_SYNC_READ, + ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ + ZIO_PRIORITY_ASYNC_READ, /* prefetch */ + ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ + ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ + ZIO_PRIORITY_TRIM, /* free requests used for TRIM */ + ZIO_PRIORITY_NUM_QUEUEABLE, + + ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */ +} zio_priority_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _ZIO_PRIORITY_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c index 36969e857..44919d247 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c @@ -162,8 +162,9 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); tbl->zt_nextblk = newblk; ASSERT0(tbl->zt_blks_copied); - dmu_prefetch(zap->zap_objset, zap->zap_object, - tbl->zt_blk << bs, tbl->zt_numblks << bs); + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + tbl->zt_blk << bs, tbl->zt_numblks << bs, + ZIO_PRIORITY_SYNC_READ); } /* @@ -939,7 +940,8 @@ fzap_prefetch(zap_name_t *zn) if (zap_idx_to_blk(zap, idx, &blk) != 0) return; bs = FZAP_BLOCK_SHIFT(zap); - dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs); + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, + ZIO_PRIORITY_SYNC_READ); } /* @@ -1310,9 +1312,10 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) } else { int b; - dmu_prefetch(zap->zap_objset, zap->zap_object, + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, - zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs); + zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, + ZIO_PRIORITY_SYNC_READ); for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; b++) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index b690b6923..201627a32 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -22,7 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -939,7 +939,7 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp) error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error) - return (error); + goto out; } else { /* * Pre SA versions file systems should never touch diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index d90c830d6..00fcb7a2f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -2665,7 +2665,8 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon /* Prefetch znode */ if (prefetch) - dmu_prefetch(os, objnum, 0, 0); + dmu_prefetch(os, objnum, 0, 0, 0, + ZIO_PRIORITY_SYNC_READ); skip_entry: /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index e561b46b1..8548b2d23 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -93,6 +93,9 @@ kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; extern vmem_t *zio_alloc_arena; #endif +#define BP_SPANB(indblkshift, level) \ + (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) +#define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly effect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. @@ -3453,37 +3456,127 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_done }; -/* dnp is the dnode for zb1->zb_object */ -boolean_t -zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1, - const zbookmark_phys_t *zb2) -{ - uint64_t zb1nextL0, zb2thisobj; - ASSERT(zb1->zb_objset == zb2->zb_objset); - ASSERT(zb2->zb_level == 0); - /* The objset_phys_t isn't before anything. */ - if (dnp == NULL) - return (B_FALSE); - zb1nextL0 = (zb1->zb_blkid + 1) << - ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); +/* + * Compare two zbookmark_phys_t's to see which we would reach first in a + * pre-order traversal of the object tree. + * + * This is simple in every case aside from the meta-dnode object. For all other + * objects, we traverse them in order (object 1 before object 2, and so on). + * However, all of these objects are traversed while traversing object 0, since + * the data it points to is the list of objects. Thus, we need to convert to a + * canonical representation so we can compare meta-dnode bookmarks to + * non-meta-dnode bookmarks. + * + * We do this by calculating "equivalents" for each field of the zbookmark. + * zbookmarks outside of the meta-dnode use their own object and level, and + * calculate the level 0 equivalent (the first L0 blkid that is contained in the + * blocks this bookmark refers to) by multiplying their blkid by their span + * (the number of L0 blocks contained within one block at their level). + * zbookmarks inside the meta-dnode calculate their object equivalent + * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use + * level + 1<<31 (any value larger than a level could ever be) for their level. + * This causes them to always compare before a bookmark in their object + * equivalent, compare appropriately to bookmarks in other objects, and to + * compare appropriately to other bookmarks in the meta-dnode. + */ +int +zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, + const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) +{ + /* + * These variables represent the "equivalent" values for the zbookmark, + * after converting zbookmarks inside the meta dnode to their + * normal-object equivalents. + */ + uint64_t zb1obj, zb2obj; + uint64_t zb1L0, zb2L0; + uint64_t zb1level, zb2level; - zb2thisobj = zb2->zb_object ? zb2->zb_object : - zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); + if (zb1->zb_object == zb2->zb_object && + zb1->zb_level == zb2->zb_level && + zb1->zb_blkid == zb2->zb_blkid) + return (0); + + /* + * BP_SPANB calculates the span in blocks. + */ + zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); + zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); if (zb1->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t nextobj = zb1nextL0 * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; - return (nextobj <= zb2thisobj); + zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); + zb1L0 = 0; + zb1level = zb1->zb_level + COMPARE_META_LEVEL; + } else { + zb1obj = zb1->zb_object; + zb1level = zb1->zb_level; } - if (zb1->zb_object < zb2thisobj) - return (B_TRUE); - if (zb1->zb_object > zb2thisobj) - return (B_FALSE); - if (zb2->zb_object == DMU_META_DNODE_OBJECT) + if (zb2->zb_object == DMU_META_DNODE_OBJECT) { + zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); + zb2L0 = 0; + zb2level = zb2->zb_level + COMPARE_META_LEVEL; + } else { + zb2obj = zb2->zb_object; + zb2level = zb2->zb_level; + } + + /* Now that we have a canonical representation, do the comparison. */ + if (zb1obj != zb2obj) + return (zb1obj < zb2obj ? -1 : 1); + else if (zb1L0 != zb2L0) + return (zb1L0 < zb2L0 ? -1 : 1); + else if (zb1level != zb2level) + return (zb1level > zb2level ? -1 : 1); + /* + * This can (theoretically) happen if the bookmarks have the same object + * and level, but different blkids, if the block sizes are not the same. + * There is presently no way to change the indirect block sizes + */ + return (0); +} + +/* + * This function checks the following: given that last_block is the place that + * our traversal stopped last time, does that guarantee that we've visited + * every node under subtree_root? Therefore, we can't just use the raw output + * of zbookmark_compare. We have to pass in a modified version of + * subtree_root; by incrementing the block id, and then checking whether + * last_block is before or equal to that, we can tell whether or not having + * visited last_block implies that all of subtree_root's children have been + * visited. + */ +boolean_t +zbookmark_subtree_completed(const dnode_phys_t *dnp, + const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) +{ + zbookmark_phys_t mod_zb = *subtree_root; + mod_zb.zb_blkid++; + ASSERT(last_block->zb_level == 0); + + /* The objset_phys_t isn't before anything. */ + if (dnp == NULL) return (B_FALSE); - return (zb1nextL0 <= zb2->zb_blkid); + + /* + * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the + * data block size in sectors, because that variable is only used if + * the bookmark refers to a block in the meta-dnode. Since we don't + * know without examining it what object it refers to, and there's no + * harm in passing in this value in other cases, we always pass it in. + * + * We pass in 0 for the indirect block size shift because zb2 must be + * level 0. The indirect block size is only used to calculate the span + * of the bookmark, but since the bookmark must be level 0, the span is + * always 1, so the math works out. + * + * If you make changes to how the zbookmark_compare code works, be sure + * to make sure that this code still works afterwards. + */ + return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, + 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, + last_block) <= 0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c index 554fbd1b8..491c3655a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c @@ -359,7 +359,7 @@ zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, zvol_extent_t *ze; int bs = ma->ma_zv->zv_volblocksize; - if (BP_IS_HOLE(bp) || + if (bp == NULL || BP_IS_HOLE(bp) || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0) return (0); -- 2.45.0