From f4e9fe6d3ea7a432396d668bde9d5126609261d2 Mon Sep 17 00:00:00 2001 From: mav Date: Fri, 13 Nov 2015 09:30:17 +0000 Subject: [PATCH] MFC r289362, r289445: 2605 want to resume interrupted zfs send Reviewed by: George Wilson Reviewed by: Paul Dagnelie Reviewed by: Richard Elling Reviewed by: Xin Li Reviewed by: Arne Jansen Approved by: Dan McDonald Author: Matthew Ahrens illumos/illumos-gate@9c3fd1216fa7fb02cfbc78a2518a686d54b48ab8 For more info, see: - slides http://www.slideshare.net/MatthewAhrens/openzfs-send-and-receive - video https://www.youtube.com/watch?v=iY44jPMvxog - manpage changes (for zfs resume -s and zfs send -t) - upcoming talk at the OpenZFS Developer Summit The TL;DR is: Use "zfs receive -s" to save the partially received state on failure. On failure, get the receive token with "zfs get receive_resume_token " Resume the send with "zfs send -t " Relnotes: yes git-svn-id: svn://svn.freebsd.org/base/stable/10@290756 ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f --- cddl/contrib/opensolaris/cmd/zfs/zfs.8 | 69 ++- cddl/contrib/opensolaris/cmd/zfs/zfs_main.c | 120 +++- .../opensolaris/cmd/zstreamdump/zstreamdump.c | 5 +- .../opensolaris/lib/libzfs/common/libzfs.h | 10 + .../lib/libzfs/common/libzfs_compat.c | 3 + .../lib/libzfs/common/libzfs_dataset.c | 27 +- .../lib/libzfs/common/libzfs_mount.c | 11 + .../lib/libzfs/common/libzfs_sendrecv.c | 445 ++++++++++++--- .../lib/libzfs_core/common/libzfs_core.c | 69 ++- .../lib/libzfs_core/common/libzfs_core.h | 6 +- cddl/lib/libzfs/Makefile | 4 +- .../opensolaris/common/zfs/zfs_ioctl_compat.c | 135 ++++- .../opensolaris/common/zfs/zfs_ioctl_compat.h | 47 +- .../contrib/opensolaris/common/zfs/zfs_prop.c | 4 + .../uts/common/fs/zfs/dmu_objset.c | 20 + .../opensolaris/uts/common/fs/zfs/dmu_send.c | 538 +++++++++++++++--- .../uts/common/fs/zfs/dmu_traverse.c | 82 +-- .../uts/common/fs/zfs/dsl_dataset.c | 139 +++++ .../uts/common/fs/zfs/dsl_destroy.c | 12 +- .../uts/common/fs/zfs/sys/dmu_impl.h | 4 +- .../uts/common/fs/zfs/sys/dmu_send.h | 18 +- .../uts/common/fs/zfs/sys/dmu_traverse.h | 2 + .../uts/common/fs/zfs/sys/dsl_dataset.h | 23 + .../uts/common/fs/zfs/sys/zfs_ioctl.h | 22 +- .../opensolaris/uts/common/fs/zfs/zfs_ioctl.c | 26 +- .../opensolaris/uts/common/sys/fs/zfs.h | 1 + 26 files changed, 1548 insertions(+), 294 deletions(-) diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs.8 b/cddl/contrib/opensolaris/cmd/zfs/zfs.8 index b9dd26bda..9cfa4dec3 100644 --- a/cddl/contrib/opensolaris/cmd/zfs/zfs.8 +++ b/cddl/contrib/opensolaris/cmd/zfs/zfs.8 @@ -189,17 +189,25 @@ .Op Fl i Ar snapshot Ns | Ns bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm +.Cm send +.Op Fl Penv +.Fl t Ar receive_resume_token +.Nm .Cm receive Ns | Ns Cm recv -.Op Fl vnFu +.Op Fl vnsFu .Op Fl o Sy origin Ns = Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm .Cm receive Ns | Ns Cm recv -.Op Fl vnFu +.Op Fl vnsFu .Op Fl d | e .Op Fl o Sy origin Ns = Ns Ar snapshot .Ar filesystem .Nm +.Cm receive Ns | Ns Cm recv +.Fl A +.Ar filesystem Ns | Ns Ar volume +.Nm .Cm allow .Ar filesystem Ns | Ns Ar volume .Nm @@ -597,6 +605,13 @@ For cloned file systems or volumes, the snapshot from which the clone was created. See also the .Sy clones property. +.It Sy receive_resume_token +For filesystems or volumes which have saved partially-completed state from +.Sy zfs receive -s , +this opaque token can be provided to +.Sy zfs send -t +to resume and complete the +.Sy zfs receive . .It Sy referenced The amount of data that is accessible by this dataset, which may or may not be shared with other datasets in the pool. When a snapshot or clone is created, it @@ -2714,15 +2729,28 @@ feature. .El .It Xo .Nm +.Cm send +.Op Fl Penv +.Fl t +.Ar receive_resume_token +.Xc +Creates a send stream which resumes an interrupted receive. The +.Ar receive_resume_token +is the value of this property on the filesystem +or volume that was being received into. See the documentation for +.Sy zfs receive -s +for more details. +.It Xo +.Nm .Cm receive Ns | Ns Cm recv -.Op Fl vnFu +.Op Fl vnsFu .Op Fl o Sy origin Ns = Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc .It Xo .Nm .Cm receive Ns | Ns Cm recv -.Op Fl vnFu +.Op Fl vnsFu .Op Fl d | e .Op Fl o Sy origin Ns = Ns Ar snapshot .Ar filesystem @@ -2819,9 +2847,42 @@ performing the receive operation. If receiving an incremental replication stream (for example, one generated by .Qq Nm Cm send Fl R Bro Fl i | Fl I Brc ) , destroy snapshots and file systems that do not exist on the sending side. +.It Fl s +If the receive is interrupted, save the partially received state, rather +than deleting it. Interruption may be due to premature termination of +the stream +.Po e.g. due to network failure or failure of the remote system +if the stream is being read over a network connection +.Pc , +a checksum error in the stream, termination of the +.Nm zfs Cm receive +process, or unclean shutdown of the system. +.Pp +The receive can be resumed with a stream generated by +.Nm zfs Cm send Fl t Ar token , +where the +.Ar token +is the value of the +.Sy receive_resume_token +property of the filesystem or volume which is received into. +.Pp +To use this flag, the storage pool must have the +.Sy extensible_dataset +feature enabled. See +.Xr zpool-features 5 +for details on ZFS feature flags. .El .It Xo .Nm +.Cm receive Ns | Ns Cm recv +.Fl A +.Ar filesystem Ns | Ns Ar volume +.Xc +Abort an interrupted +.Nm zfs Cm receive Fl s , +deleting its saved partially received state. +.It Xo +.Nm .Cm allow .Ar filesystem Ns | Ns Ar volume .Xc diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c b/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c index e35ce0131..11f1114c0 100644 --- a/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c +++ b/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c @@ -263,10 +263,11 @@ get_usage(zfs_help_t idx) case HELP_PROMOTE: return (gettext("\tpromote \n")); case HELP_RECEIVE: - return (gettext("\treceive|recv [-vnFu] \n" - "\treceive|recv [-vnFu] [-o origin=] [-d | -e] " - "\n")); + "\treceive|recv [-vnsFu] [-o origin=] [-d | -e] " + "\n" + "\treceive|recv -A \n")); case HELP_RENAME: return (gettext("\trename [-f] " "\n" @@ -279,7 +280,8 @@ get_usage(zfs_help_t idx) return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] " "\n" "\tsend [-Le] [-i snapshot|bookmark] " - "\n")); + "\n" + "\tsend [-nvPe] -t \n")); case HELP_SET: return (gettext("\tset " " ...\n")); @@ -3728,6 +3730,7 @@ zfs_do_send(int argc, char **argv) { char *fromname = NULL; char *toname = NULL; + char *resume_token = NULL; char *cp; zfs_handle_t *zhp; sendflags_t flags = { 0 }; @@ -3736,7 +3739,7 @@ zfs_do_send(int argc, char **argv) boolean_t extraverbose = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, ":i:I:RDpvnPLe")) != -1) { + while ((c = getopt(argc, argv, ":i:I:RDpvnPLet:")) != -1) { switch (c) { case 'i': if (fromname) @@ -3777,6 +3780,9 @@ zfs_do_send(int argc, char **argv) case 'e': flags.embed_data = B_TRUE; break; + case 't': + resume_token = optarg; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); @@ -3792,14 +3798,28 @@ zfs_do_send(int argc, char **argv) argc -= optind; argv += optind; - /* check number of arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing snapshot argument\n")); - usage(B_FALSE); - } - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); + if (resume_token != NULL) { + if (fromname != NULL || flags.replicate || flags.props || + flags.dedup) { + (void) fprintf(stderr, + gettext("invalid flags combined with -t\n")); + usage(B_FALSE); + } + if (argc != 0) { + (void) fprintf(stderr, gettext("no additional " + "arguments are permitted with -t\n")); + usage(B_FALSE); + } + } else { + if (argc < 1) { + (void) fprintf(stderr, + gettext("missing snapshot argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } } if (!flags.dryrun && isatty(STDOUT_FILENO)) { @@ -3809,6 +3829,11 @@ zfs_do_send(int argc, char **argv) return (1); } + if (resume_token != NULL) { + return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO, + resume_token)); + } + /* * Special case sending a filesystem, or from a bookmark. */ @@ -3914,8 +3939,6 @@ zfs_do_send(int argc, char **argv) } /* - * zfs receive [-vnFu] [-d | -e] - * * Restore a backup stream from stdin. */ static int @@ -3923,6 +3946,8 @@ zfs_do_receive(int argc, char **argv) { int c, err; recvflags_t flags = { 0 }; + boolean_t abort_resumable = B_FALSE; + nvlist_t *props; nvpair_t *nvp = NULL; @@ -3930,7 +3955,7 @@ zfs_do_receive(int argc, char **argv) nomem(); /* check options */ - while ((c = getopt(argc, argv, ":o:denuvF")) != -1) { + while ((c = getopt(argc, argv, ":o:denuvFsA")) != -1) { switch (c) { case 'o': if (parseprop(props, optarg) != 0) @@ -3952,9 +3977,15 @@ zfs_do_receive(int argc, char **argv) case 'v': flags.verbose = B_TRUE; break; + case 's': + flags.resumable = B_TRUE; + break; case 'F': flags.force = B_TRUE; break; + case 'A': + abort_resumable = B_TRUE; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); @@ -3987,6 +4018,44 @@ zfs_do_receive(int argc, char **argv) } } + if (abort_resumable) { + if (flags.isprefix || flags.istail || flags.dryrun || + flags.resumable || flags.nomount) { + (void) fprintf(stderr, gettext("invalid option")); + usage(B_FALSE); + } + + char namebuf[ZFS_MAXNAMELEN]; + (void) snprintf(namebuf, sizeof (namebuf), + "%s/%%recv", argv[0]); + + if (zfs_dataset_exists(g_zfs, namebuf, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { + zfs_handle_t *zhp = zfs_open(g_zfs, + namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + return (1); + err = zfs_destroy(zhp, B_FALSE); + } else { + zfs_handle_t *zhp = zfs_open(g_zfs, + argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + usage(B_FALSE); + if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) || + zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, + NULL, 0, NULL, NULL, 0, B_TRUE) == -1) { + (void) fprintf(stderr, + gettext("'%s' does not have any " + "resumable receive state to abort\n"), + argv[0]); + return (1); + } + err = zfs_destroy(zhp, B_FALSE); + } + + return (err != 0); + } + if (isatty(STDIN_FILENO)) { (void) fprintf(stderr, gettext("Error: Backup stream can not be read " @@ -3994,7 +4063,6 @@ zfs_do_receive(int argc, char **argv) "You must redirect standard input.\n")); return (1); } - err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL); return (err != 0); @@ -5815,6 +5883,24 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, return (0); } + /* + * If this filesystem is inconsistent and has a receive resume + * token, we can not mount it. + */ + if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && + zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, + NULL, 0, NULL, NULL, 0, B_TRUE) == 0) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "Contains partially-completed state from " + "\"zfs receive -r\", which can be resumed with " + "\"zfs send -t\"\n"), + cmdname, zfs_get_name(zhp)); + return (1); + } + /* * At this point, we have verified that the mountpoint and/or * shareopts are appropriate for auto management. If the diff --git a/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c b/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c index f6dedc2fa..83a5b54b0 100644 --- a/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c +++ b/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c @@ -125,7 +125,7 @@ read_hdr(dmu_replay_record_t *drr, zio_cksum_t *cksum) saved_cksum.zc_word[1], saved_cksum.zc_word[2], saved_cksum.zc_word[3]); - exit(1); + return (0); } return (sizeof (*drr)); } @@ -346,8 +346,7 @@ main(int argc, char *argv[]) if (verbose) (void) printf("\n"); - if ((DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == - DMU_COMPOUNDSTREAM) && drr->drr_payloadlen != 0) { + if (drr->drr_payloadlen != 0) { nvlist_t *nv; int sz = drr->drr_payloadlen; diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h index 44bd58b9f..c7b208f89 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h @@ -621,6 +621,10 @@ typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); extern int zfs_send(zfs_handle_t *, const char *, const char *, sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **); extern int zfs_send_one(zfs_handle_t *, const char *, int, enum lzc_send_flags); +extern int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd, + const char *); +extern nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, + const char *token); extern int zfs_promote(zfs_handle_t *); extern int zfs_hold(zfs_handle_t *, const char *, const char *, @@ -661,6 +665,12 @@ typedef struct recvflags { /* set "canmount=off" on all modified filesystems */ boolean_t canmountoff; + /* + * Mark the file systems as "resumable" and do not destroy them if the + * receive is interrupted + */ + boolean_t resumable; + /* byteswap flag is used internally; callers need not specify */ boolean_t byteswap; diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.c index a3f612987..833e1b675 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.c @@ -74,6 +74,9 @@ zcmd_ioctl(int fd, int request, zfs_cmd_t *zc) if (zfs_ioctl_version >= ZFS_IOCVER_DEADMAN) { switch (zfs_ioctl_version) { + case ZFS_IOCVER_EDBP: + cflag = ZFS_CMD_COMPAT_EDBP; + break; case ZFS_IOCVER_ZCMD: cflag = ZFS_CMD_COMPAT_ZCMD; break; diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c index 343f258b2..27bbcf4e4 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c @@ -1772,22 +1772,21 @@ getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source) return (value); } -static char * +static const char * getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source) { nvlist_t *nv; - char *value; + const char *value; *source = NULL; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop), &nv) == 0) { - verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0); + value = fnvlist_lookup_string(nv, ZPROP_VALUE); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { verify(!zhp->zfs_props_table || zhp->zfs_props_table[prop] == B_TRUE); - if ((value = (char *)zfs_prop_default_string(prop)) == NULL) - value = ""; + value = zfs_prop_default_string(prop); *source = ""; } @@ -2189,7 +2188,7 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, { char *source = NULL; uint64_t val; - char *str; + const char *str; const char *strval; boolean_t received = zfs_is_recvd_props_mode(zhp); @@ -2294,14 +2293,10 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, break; case ZFS_PROP_ORIGIN: - (void) strlcpy(propbuf, getprop_string(zhp, prop, &source), - proplen); - /* - * If there is no parent at all, return failure to indicate that - * it doesn't apply to this dataset. - */ - if (propbuf[0] == '\0') + str = getprop_string(zhp, prop, &source); + if (str == NULL) return (-1); + (void) strlcpy(propbuf, str, proplen); break; case ZFS_PROP_CLONES: @@ -2482,8 +2477,10 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, break; case PROP_TYPE_STRING: - (void) strlcpy(propbuf, - getprop_string(zhp, prop, &source), proplen); + str = getprop_string(zhp, prop, &source); + if (str == NULL) + return (-1); + (void) strlcpy(propbuf, str, proplen); break; case PROP_TYPE_INDEX: diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c index f8596ede3..b94af5bf2 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c @@ -1072,6 +1072,17 @@ mount_cb(zfs_handle_t *zhp, void *data) return (0); } + /* + * If this filesystem is inconsistent and has a receive resume + * token, we can not mount it. + */ + if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && + zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, + NULL, 0, NULL, NULL, 0, B_TRUE) == 0) { + zfs_close(zhp); + return (0); + } + libzfs_add_handle(cbp, zhp); if (zfs_iter_filesystems(zhp, mount_cb, cbp) != 0) { zfs_close(zhp); diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c index 1f9007cbd..79afb738c 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . * All rights reserved. @@ -51,6 +51,7 @@ #include "zfs_prop.h" #include "zfs_fletcher.h" #include "libzfs_impl.h" +#include #include #include #include @@ -67,6 +68,8 @@ extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *, recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int, uint64_t *); +static int guid_to_name(libzfs_handle_t *, const char *, + uint64_t, boolean_t, char *); static const zio_cksum_t zero_cksum = { 0 }; @@ -284,8 +287,7 @@ cksummer(void *arg) DMU_BACKUP_FEATURE_DEDUPPROPS); DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); - if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == - DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) { + if (drr->drr_payloadlen != 0) { sz = drr->drr_payloadlen; if (sz > SPA_MAXBLOCKSIZE) { @@ -995,17 +997,14 @@ static void * send_progress_thread(void *arg) { progress_arg_t *pa = arg; - zfs_cmd_t zc = { 0 }; zfs_handle_t *zhp = pa->pa_zhp; libzfs_handle_t *hdl = zhp->zfs_hdl; unsigned long long bytes; char buf[16]; - time_t t; struct tm *tm; - assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (!pa->pa_parsable) @@ -1038,6 +1037,51 @@ send_progress_thread(void *arg) } } +static void +send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap, + uint64_t size, boolean_t parsable) +{ + if (parsable) { + if (fromsnap != NULL) { + (void) fprintf(fout, "incremental\t%s\t%s", + fromsnap, tosnap); + } else { + (void) fprintf(fout, "full\t%s", + tosnap); + } + } else { + if (fromsnap != NULL) { + if (strchr(fromsnap, '@') == NULL && + strchr(fromsnap, '#') == NULL) { + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "send from @%s to %s"), + fromsnap, tosnap); + } else { + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "send from %s to %s"), + fromsnap, tosnap); + } + } else { + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "full send of %s"), + tosnap); + } + } + + if (size != 0) { + if (parsable) { + (void) fprintf(fout, "\t%llu", + (longlong_t)size); + } else { + char buf[16]; + zfs_nicenum(size, buf, sizeof (buf)); + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + " estimated size is %s"), buf); + } + } + (void) fprintf(fout, "\n"); +} + static int dump_snapshot(zfs_handle_t *zhp, void *arg) { @@ -1117,37 +1161,14 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) (sdd->fromorigin || sdd->replicate); if (sdd->verbose) { - uint64_t size; - err = estimate_ioctl(zhp, sdd->prevsnap_obj, + uint64_t size = 0; + (void) estimate_ioctl(zhp, sdd->prevsnap_obj, fromorigin, &size); - if (sdd->parsable) { - if (sdd->prevsnap[0] != '\0') { - (void) fprintf(fout, "incremental\t%s\t%s", - sdd->prevsnap, zhp->zfs_name); - } else { - (void) fprintf(fout, "full\t%s", - zhp->zfs_name); - } - } else { - (void) fprintf(fout, dgettext(TEXT_DOMAIN, - "send from @%s to %s"), - sdd->prevsnap, zhp->zfs_name); - } - if (err == 0) { - if (sdd->parsable) { - (void) fprintf(fout, "\t%llu\n", - (longlong_t)size); - } else { - char buf[16]; - zfs_nicenum(size, buf, sizeof (buf)); - (void) fprintf(fout, dgettext(TEXT_DOMAIN, - " estimated size is %s\n"), buf); - } - sdd->size += size; - } else { - (void) fprintf(fout, "\n"); - } + send_print_verbose(fout, zhp->zfs_name, + sdd->prevsnap[0] ? sdd->prevsnap : NULL, + size, sdd->parsable); + sdd->size += size; } if (!sdd->dryrun) { @@ -1358,6 +1379,233 @@ dump_filesystems(zfs_handle_t *rzhp, void *arg) return (0); } +nvlist_t * +zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token) +{ + unsigned int version; + int nread; + unsigned long long checksum, packed_len; + + /* + * Decode token header, which is: + * -- + * Note that the only supported token version is 1. + */ + nread = sscanf(token, "%u-%llx-%llx-", + &version, &checksum, &packed_len); + if (nread != 3) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (invalid format)")); + return (NULL); + } + + if (version != ZFS_SEND_RESUME_TOKEN_VERSION) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (invalid version %u)"), + version); + return (NULL); + } + + /* convert hexadecimal representation to binary */ + token = strrchr(token, '-') + 1; + int len = strlen(token) / 2; + unsigned char *compressed = zfs_alloc(hdl, len); + for (int i = 0; i < len; i++) { + nread = sscanf(token + i * 2, "%2hhx", compressed + i); + if (nread != 1) { + free(compressed); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt " + "(payload is not hex-encoded)")); + return (NULL); + } + } + + /* verify checksum */ + zio_cksum_t cksum; + fletcher_4_native(compressed, len, &cksum); + if (cksum.zc_word[0] != checksum) { + free(compressed); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (incorrect checksum)")); + return (NULL); + } + + /* uncompress */ + void *packed = zfs_alloc(hdl, packed_len); + uLongf packed_len_long = packed_len; + if (uncompress(packed, &packed_len_long, compressed, len) != Z_OK || + packed_len_long != packed_len) { + free(packed); + free(compressed); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (decompression failed)")); + return (NULL); + } + + /* unpack nvlist */ + nvlist_t *nv; + int error = nvlist_unpack(packed, packed_len, &nv, KM_SLEEP); + free(packed); + free(compressed); + if (error != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (nvlist_unpack failed)")); + return (NULL); + } + return (nv); +} + +int +zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, + const char *resume_token) +{ + char errbuf[1024]; + char *toname; + char *fromname = NULL; + uint64_t resumeobj, resumeoff, toguid, fromguid, bytes; + zfs_handle_t *zhp; + int error = 0; + char name[ZFS_MAXNAMELEN]; + enum lzc_send_flags lzc_flags = 0; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot resume send")); + + nvlist_t *resume_nvl = + zfs_send_resume_token_to_nvlist(hdl, resume_token); + if (resume_nvl == NULL) { + /* + * zfs_error_aux has already been set by + * zfs_send_resume_token_to_nvlist + */ + return (zfs_error(hdl, EZFS_FAULT, errbuf)); + } + if (flags->verbose) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "resume token contents:\n")); + nvlist_print(stderr, resume_nvl); + } + + if (nvlist_lookup_string(resume_nvl, "toname", &toname) != 0 || + nvlist_lookup_uint64(resume_nvl, "object", &resumeobj) != 0 || + nvlist_lookup_uint64(resume_nvl, "offset", &resumeoff) != 0 || + nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || + nvlist_lookup_uint64(resume_nvl, "toguid", &toguid) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt")); + return (zfs_error(hdl, EZFS_FAULT, errbuf)); + } + fromguid = 0; + (void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid); + + if (flags->embed_data || nvlist_exists(resume_nvl, "embedok")) + lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; + + if (guid_to_name(hdl, toname, toguid, B_FALSE, name) != 0) { + if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' is no longer the same snapshot used in " + "the initial send"), toname); + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' used in the initial send no longer exists"), + toname); + } + return (zfs_error(hdl, EZFS_BADPATH, errbuf)); + } + zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); + if (zhp == NULL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "unable to access '%s'"), name); + return (zfs_error(hdl, EZFS_BADPATH, errbuf)); + } + + if (fromguid != 0) { + if (guid_to_name(hdl, toname, fromguid, B_TRUE, name) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incremental source %#llx no longer exists"), + (longlong_t)fromguid); + return (zfs_error(hdl, EZFS_BADPATH, errbuf)); + } + fromname = name; + } + + if (flags->verbose) { + uint64_t size = 0; + error = lzc_send_space(zhp->zfs_name, fromname, &size); + if (error == 0) + size = MAX(0, (int64_t)(size - bytes)); + send_print_verbose(stderr, zhp->zfs_name, fromname, + size, flags->parsable); + } + + if (!flags->dryrun) { + progress_arg_t pa = { 0 }; + pthread_t tid; + /* + * If progress reporting is requested, spawn a new thread to + * poll ZFS_IOC_SEND_PROGRESS at a regular interval. + */ + if (flags->progress) { + pa.pa_zhp = zhp; + pa.pa_fd = outfd; + pa.pa_parsable = flags->parsable; + + error = pthread_create(&tid, NULL, + send_progress_thread, &pa); + if (error != 0) { + zfs_close(zhp); + return (error); + } + } + + error = lzc_send_resume(zhp->zfs_name, fromname, outfd, + lzc_flags, resumeobj, resumeoff); + + if (flags->progress) { + (void) pthread_cancel(tid); + (void) pthread_join(tid, NULL); + } + + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "warning: cannot send '%s'"), zhp->zfs_name); + + zfs_close(zhp); + + switch (error) { + case 0: + return (0); + case EXDEV: + case ENOENT: + case EDQUOT: + case EFBIG: + case EIO: + case ENOLINK: + case ENOSPC: +#ifdef illumos + case ENOSTR: +#endif + case ENXIO: + case EPIPE: + case ERANGE: + case EFAULT: + case EROFS: + zfs_error_aux(hdl, strerror(errno)); + return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); + + default: + return (zfs_standard_error(hdl, errno, errbuf)); + } + } + + + zfs_close(zhp); + + return (error); +} + /* * Generate a send stream for the dataset identified by the argument zhp. * @@ -1897,6 +2145,7 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, typedef struct guid_to_name_data { uint64_t guid; + boolean_t bookmark_ok; char *name; char *skip; } guid_to_name_data_t; @@ -1905,20 +2154,25 @@ static int guid_to_name_cb(zfs_handle_t *zhp, void *arg) { guid_to_name_data_t *gtnd = arg; + const char *slash; int err; if (gtnd->skip != NULL && - strcmp(zhp->zfs_name, gtnd->skip) == 0) { + (slash = strrchr(zhp->zfs_name, '/')) != NULL && + strcmp(slash + 1, gtnd->skip) == 0) { + zfs_close(zhp); return (0); } - if (zhp->zfs_dmustats.dds_guid == gtnd->guid) { + if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid) { (void) strcpy(gtnd->name, zhp->zfs_name); zfs_close(zhp); return (EEXIST); } err = zfs_iter_children(zhp, guid_to_name_cb, gtnd); + if (err != EEXIST && gtnd->bookmark_ok) + err = zfs_iter_bookmarks(zhp, guid_to_name_cb, gtnd); zfs_close(zhp); return (err); } @@ -1932,45 +2186,48 @@ guid_to_name_cb(zfs_handle_t *zhp, void *arg) */ static int guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, - char *name) + boolean_t bookmark_ok, char *name) { - /* exhaustive search all local snapshots */ char pname[ZFS_MAXNAMELEN]; guid_to_name_data_t gtnd; - int err = 0; - zfs_handle_t *zhp; - char *cp; gtnd.guid = guid; + gtnd.bookmark_ok = bookmark_ok; gtnd.name = name; gtnd.skip = NULL; - (void) strlcpy(pname, parent, sizeof (pname)); - /* - * Search progressively larger portions of the hierarchy. This will + * Search progressively larger portions of the hierarchy, starting + * with the filesystem specified by 'parent'. This will * select the "most local" version of the origin snapshot in the case * that there are multiple matching snapshots in the system. */ - while ((cp = strrchr(pname, '/')) != NULL) { - + (void) strlcpy(pname, parent, sizeof (pname)); + char *cp = strrchr(pname, '@'); + if (cp == NULL) + cp = strchr(pname, '\0'); + for (; cp != NULL; cp = strrchr(pname, '/')) { /* Chop off the last component and open the parent */ *cp = '\0'; - zhp = make_dataset_handle(hdl, pname); + zfs_handle_t *zhp = make_dataset_handle(hdl, pname); if (zhp == NULL) continue; - - err = zfs_iter_children(zhp, guid_to_name_cb, >nd); + int err = guid_to_name_cb(zfs_handle_dup(zhp), >nd); + if (err != EEXIST) + err = zfs_iter_children(zhp, guid_to_name_cb, >nd); + if (err != EEXIST && bookmark_ok) + err = zfs_iter_bookmarks(zhp, guid_to_name_cb, >nd); zfs_close(zhp); if (err == EEXIST) return (0); /* - * Remember the dataset that we already searched, so we - * skip it next time through. + * Remember the last portion of the dataset so we skip it next + * time through (as we've already searched that portion of the + * hierarchy). */ - gtnd.skip = pname; + gtnd.skip = strrchr(pname, '/') + 1; } return (ENOENT); @@ -2568,11 +2825,9 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) switch (drr->drr_type) { case DRR_BEGIN: - /* NB: not to be used on v2 stream packages */ if (drr->drr_payloadlen != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid substream header")); - return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); + (void) recv_read(hdl, fd, buf, + drr->drr_payloadlen, B_FALSE, NULL); } break; @@ -2633,6 +2888,40 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) return (-1); } +static void +recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap, + boolean_t resumable) +{ + char target_fs[ZFS_MAXNAMELEN]; + + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "checksum mismatch or incomplete stream")); + + if (!resumable) + return; + (void) strlcpy(target_fs, target_snap, sizeof (target_fs)); + *strchr(target_fs, '@') = '\0'; + zfs_handle_t *zhp = zfs_open(hdl, target_fs, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + return; + + char token_buf[ZFS_MAXPROPLEN]; + int error = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, + token_buf, sizeof (token_buf), + NULL, NULL, 0, B_TRUE); + if (error == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "checksum mismatch or incomplete stream.\n" + "Partially received snapshot is saved.\n" + "A resuming stream can be generated on the sending " + "system by running:\n" + " zfs send -t %s"), + token_buf); + } + zfs_close(zhp); +} + /* * Restores a backup of tosnap from the file descriptor specified by infd. */ @@ -2796,7 +3085,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, */ if (drrb->drr_flags & DRR_FLAG_CLONE) { if (guid_to_name(hdl, zc.zc_value, - drrb->drr_fromguid, zc.zc_string) != 0) { + drrb->drr_fromguid, B_FALSE, zc.zc_string) != 0) { zcmd_free_nvlists(&zc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "local origin for clone %s does not exist"), @@ -2812,8 +3101,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, zc.zc_string); } + boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_RESUMING; stream_wantsnewfs = (drrb->drr_fromguid == 0 || - (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap); + (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming; if (stream_wantsnewfs) { /* @@ -2832,7 +3123,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, char suffix[ZFS_MAXNAMELEN]; (void) strcpy(suffix, strrchr(zc.zc_value, '/')); if (guid_to_name(hdl, zc.zc_name, parent_snapguid, - zc.zc_value) == 0) { + B_FALSE, zc.zc_value) == 0) { *strchr(zc.zc_value, '@') = '\0'; (void) strcat(zc.zc_value, suffix); } @@ -2859,7 +3150,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, char snap[ZFS_MAXNAMELEN]; (void) strcpy(snap, strchr(zc.zc_value, '@')); if (guid_to_name(hdl, zc.zc_name, drrb->drr_fromguid, - zc.zc_value) == 0) { + B_FALSE, zc.zc_value) == 0) { *strchr(zc.zc_value, '@') = '\0'; (void) strcat(zc.zc_value, snap); } @@ -2873,11 +3164,12 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, zfs_handle_t *zhp; /* - * Destination fs exists. Therefore this should either - * be an incremental, or the stream specifies a new fs - * (full stream or clone) and they want us to blow it - * away (and have therefore specified -F and removed any - * snapshots). + * Destination fs exists. It must be one of these cases: + * - an incremental send stream + * - the stream specifies a new fs (full stream or clone) + * and they want us to blow away the existing fs (and + * have therefore specified -F and removed any snapshots) + * - we are resuming a failed receive. */ if (stream_wantsnewfs) { if (!flags->force) { @@ -2932,6 +3224,18 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, return (-1); } } + + /* + * If we are resuming a newfs, set newfs here so that we will + * mount it if the recv succeeds this time. We can tell + * that it was a newfs on the first recv because the fs + * itself will be inconsistent (if the fs existed when we + * did the first recv, we would have received it into + * .../%recv). + */ + if (resuming && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT)) + newfs = B_TRUE; + zfs_close(zhp); } else { /* @@ -2964,9 +3268,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, newfs = B_TRUE; } - zc.zc_begin_record = drr_noswap->drr_u.drr_begin; + zc.zc_begin_record = *drr_noswap; zc.zc_cookie = infd; zc.zc_guid = flags->force; + zc.zc_resumable = flags->resumable; if (flags->verbose) { (void) printf("%s %s stream of %s into %s\n", flags->dryrun ? "would receive" : "receiving", @@ -3103,8 +3408,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ECKSUM: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid stream (checksum mismatch)")); + recv_ecksum_set_aux(hdl, zc.zc_value, flags->resumable); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ENOTSUP: @@ -3306,7 +3610,8 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, * Restores a backup of tosnap from the file descriptor specified by infd. * Return 0 on total success, -2 if some things couldn't be * destroyed/renamed/promoted, -1 if some things couldn't be received. - * (-1 will override -2). + * (-1 will override -2, if -1 and the resumable flag was specified the + * transfer can be resumed if the sending side supports it). */ int zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props, diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c index 5ba660de6..69d332a5f 100644 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c @@ -514,6 +514,13 @@ lzc_get_holds(const char *snapname, nvlist_t **holdsp) int lzc_send(const char *snapname, const char *from, int fd, enum lzc_send_flags flags) +{ + return (lzc_send_resume(snapname, from, fd, flags, 0, 0)); +} + +int +lzc_send_resume(const char *snapname, const char *from, int fd, + enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff) { nvlist_t *args; int err; @@ -526,6 +533,10 @@ lzc_send(const char *snapname, const char *from, int fd, fnvlist_add_boolean(args, "largeblockok"); if (flags & LZC_SEND_FLAG_EMBED_DATA) fnvlist_add_boolean(args, "embedok"); + if (resumeobj != 0 || resumeoff != 0) { + fnvlist_add_uint64(args, "resume_object", resumeobj); + fnvlist_add_uint64(args, "resume_offset", resumeoff); + } err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL); nvlist_free(args); return (err); @@ -583,22 +594,9 @@ recv_read(int fd, void *buf, int ilen) return (0); } -/* - * The simplest receive case: receive from the specified fd, creating the - * specified snapshot. Apply the specified properties a "received" properties - * (which can be overridden by locally-set properties). If the stream is a - * clone, its origin snapshot must be specified by 'origin'. The 'force' - * flag will cause the target filesystem to be rolled back or destroyed if - * necessary to receive. - * - * Return 0 on success or an errno on failure. - * - * Note: this interface does not work on dedup'd streams - * (those with DMU_BACKUP_FEATURE_DEDUP). - */ -int -lzc_receive(const char *snapname, nvlist_t *props, const char *origin, - boolean_t force, int fd) +static int +lzc_receive_impl(const char *snapname, nvlist_t *props, const char *origin, + boolean_t force, boolean_t resumable, int fd) { /* * The receive ioctl is still legacy, so we need to construct our own @@ -608,7 +606,6 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin, char *atp; char *packed = NULL; size_t size; - dmu_replay_record_t drr; int error; ASSERT3S(g_refcount, >, 0); @@ -644,10 +641,9 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin, (void) strlcpy(zc.zc_string, origin, sizeof (zc.zc_string)); /* zc_begin_record is non-byteswapped BEGIN record */ - error = recv_read(fd, &drr, sizeof (drr)); + error = recv_read(fd, &zc.zc_begin_record, sizeof (zc.zc_begin_record)); if (error != 0) goto out; - zc.zc_begin_record = drr.drr_u.drr_begin; /* zc_cookie is fd to read from */ zc.zc_cookie = fd; @@ -655,6 +651,8 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin, /* zc guid is force flag */ zc.zc_guid = force; + zc.zc_resumable = resumable; + /* zc_cleanup_fd is unused */ zc.zc_cleanup_fd = -1; @@ -669,6 +667,39 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin, return (error); } +/* + * The simplest receive case: receive from the specified fd, creating the + * specified snapshot. Apply the specified properties as "received" properties + * (which can be overridden by locally-set properties). If the stream is a + * clone, its origin snapshot must be specified by 'origin'. The 'force' + * flag will cause the target filesystem to be rolled back or destroyed if + * necessary to receive. + * + * Return 0 on success or an errno on failure. + * + * Note: this interface does not work on dedup'd streams + * (those with DMU_BACKUP_FEATURE_DEDUP). + */ +int +lzc_receive(const char *snapname, nvlist_t *props, const char *origin, + boolean_t force, int fd) +{ + return (lzc_receive_impl(snapname, props, origin, force, B_FALSE, fd)); +} + +/* + * Like lzc_receive, but if the receive fails due to premature stream + * termination, the intermediate state will be preserved on disk. In this + * case, ECKSUM will be returned. The receive may subsequently be resumed + * with a resuming send stream generated by lzc_send_resume(). + */ +int +lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin, + boolean_t force, int fd) +{ + return (lzc_receive_impl(snapname, props, origin, force, B_TRUE, fd)); +} + /* * Roll back this filesystem or volume to its most recent snapshot. * If snapnamebuf is not NULL, it will be filled in with the name diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h index b6a4c12f2..b9235d1eb 100644 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Martin Matuska . All rights reserved. */ @@ -59,7 +59,11 @@ enum lzc_send_flags { }; int lzc_send(const char *, const char *, int, enum lzc_send_flags); +int lzc_send_resume(const char *, const char *, int, + enum lzc_send_flags, uint64_t, uint64_t); int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, int); +int lzc_receive_resumable(const char *, nvlist_t *, const char *, + boolean_t, int); int lzc_send_space(const char *, const char *, uint64_t *); boolean_t lzc_exists(const char *); diff --git a/cddl/lib/libzfs/Makefile b/cddl/lib/libzfs/Makefile index 19a3f2e1b..b960c9e97 100644 --- a/cddl/lib/libzfs/Makefile +++ b/cddl/lib/libzfs/Makefile @@ -8,10 +8,10 @@ LIB= zfs DPADD= ${LIBMD} ${LIBPTHREAD} ${LIBUMEM} ${LIBUTIL} ${LIBM} ${LIBNVPAIR} \ ${LIBAVL} ${LIBZFS_CORE} ${LIBUUTIL} ${LIBBSDXML} ${LIBGEOM} \ - ${LIBNVPAIR} + ${LIBNVPAIR} ${LIBZ} LDADD= -lmd -lpthread -lumem -lutil -luutil -lm -lnvpair -lavl \ - -lbsdxml -lgeom -lnvpair -lzfs_core + -lbsdxml -lgeom -lnvpair -lz -lzfs_core SRCS= deviceid.c \ fsshare.c \ diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c index 62fed7b6e..39ad7ce5f 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c @@ -53,8 +53,52 @@ zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag) zfs_cmd_v28_t *zc28_c; zfs_cmd_deadman_t *zcdm_c; zfs_cmd_zcmd_t *zcmd_c; + zfs_cmd_edbp_t *edbp_c; switch (cflag) { + case ZFS_CMD_COMPAT_EDBP: + edbp_c = (void *)addr; + /* zc */ + strlcpy(zc->zc_name, edbp_c->zc_name, MAXPATHLEN); + strlcpy(zc->zc_value, edbp_c->zc_value, MAXPATHLEN * 2); + strlcpy(zc->zc_string, edbp_c->zc_string, MAXPATHLEN); + +#define ZCMD_COPY(field) zc->field = edbp_c->field + ZCMD_COPY(zc_nvlist_src); + ZCMD_COPY(zc_nvlist_src_size); + ZCMD_COPY(zc_nvlist_dst); + ZCMD_COPY(zc_nvlist_dst_size); + ZCMD_COPY(zc_nvlist_dst_filled); + ZCMD_COPY(zc_pad2); + ZCMD_COPY(zc_history); + ZCMD_COPY(zc_guid); + ZCMD_COPY(zc_nvlist_conf); + ZCMD_COPY(zc_nvlist_conf_size); + ZCMD_COPY(zc_cookie); + ZCMD_COPY(zc_objset_type); + ZCMD_COPY(zc_perm_action); + ZCMD_COPY(zc_history_len); + ZCMD_COPY(zc_history_offset); + ZCMD_COPY(zc_obj); + ZCMD_COPY(zc_iflags); + ZCMD_COPY(zc_share); + ZCMD_COPY(zc_jailid); + ZCMD_COPY(zc_objset_stats); + zc->zc_begin_record.drr_u.drr_begin = edbp_c->zc_begin_record; + ZCMD_COPY(zc_inject_record); + ZCMD_COPY(zc_defer_destroy); + ZCMD_COPY(zc_flags); + ZCMD_COPY(zc_action_handle); + ZCMD_COPY(zc_cleanup_fd); + ZCMD_COPY(zc_simple); + zc->zc_resumable = B_FALSE; + ZCMD_COPY(zc_sendobj); + ZCMD_COPY(zc_fromobj); + ZCMD_COPY(zc_createtxg); + ZCMD_COPY(zc_stat); +#undef ZCMD_COPY + break; + case ZFS_CMD_COMPAT_ZCMD: zcmd_c = (void *)addr; /* zc */ @@ -83,14 +127,7 @@ zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag) ZCMD_COPY(zc_share); ZCMD_COPY(zc_jailid); ZCMD_COPY(zc_objset_stats); - - /* - * zc_begin_record, zc_inject_record didn't change in embedeed-data - * block pointers - * - * TODO: CTASSERT? - */ - ZCMD_COPY(zc_begin_record); + zc->zc_begin_record.drr_u.drr_begin = zcmd_c->zc_begin_record; ZCMD_COPY(zc_inject_record); /* boolean_t -> uint32_t */ @@ -100,7 +137,7 @@ zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag) ZCMD_COPY(zc_action_handle); ZCMD_COPY(zc_cleanup_fd); ZCMD_COPY(zc_simple); - bcopy(zcmd_c->zc_pad, zc->zc_pad, sizeof(zc->zc_pad)); + zc->zc_resumable = B_FALSE; ZCMD_COPY(zc_sendobj); ZCMD_COPY(zc_fromobj); ZCMD_COPY(zc_createtxg); @@ -133,13 +170,13 @@ zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag) zc->zc_share = zcdm_c->zc_share; zc->zc_jailid = zcdm_c->zc_jailid; zc->zc_objset_stats = zcdm_c->zc_objset_stats; - zc->zc_begin_record = zcdm_c->zc_begin_record; + zc->zc_begin_record.drr_u.drr_begin = zcdm_c->zc_begin_record; zc->zc_defer_destroy = zcdm_c->zc_defer_destroy; (void)zcdm_c->zc_temphold; zc->zc_action_handle = zcdm_c->zc_action_handle; zc->zc_cleanup_fd = zcdm_c->zc_cleanup_fd; zc->zc_simple = zcdm_c->zc_simple; - bcopy(zcdm_c->zc_pad, zc->zc_pad, sizeof(zc->zc_pad)); + zc->zc_resumable = B_FALSE; zc->zc_sendobj = zcdm_c->zc_sendobj; zc->zc_fromobj = zcdm_c->zc_fromobj; zc->zc_createtxg = zcdm_c->zc_createtxg; @@ -177,13 +214,13 @@ zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag) zc->zc_share = zc28_c->zc_share; zc->zc_jailid = zc28_c->zc_jailid; zc->zc_objset_stats = zc28_c->zc_objset_stats; - zc->zc_begin_record = zc28_c->zc_begin_record; + zc->zc_begin_record.drr_u.drr_begin = zc28_c->zc_begin_record; zc->zc_defer_destroy = zc28_c->zc_defer_destroy; (void)zc28_c->zc_temphold; zc->zc_action_handle = zc28_c->zc_action_handle; zc->zc_cleanup_fd = zc28_c->zc_cleanup_fd; zc->zc_simple = zc28_c->zc_simple; - bcopy(zc28_c->zc_pad, zc->zc_pad, sizeof(zc->zc_pad)); + zc->zc_resumable = B_FALSE; zc->zc_sendobj = zc28_c->zc_sendobj; zc->zc_fromobj = zc28_c->zc_fromobj; zc->zc_createtxg = zc28_c->zc_createtxg; @@ -246,7 +283,7 @@ zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag) zc->zc_share = zc_c->zc_share; zc->zc_jailid = zc_c->zc_jailid; zc->zc_objset_stats = zc_c->zc_objset_stats; - zc->zc_begin_record = zc_c->zc_begin_record; + zc->zc_begin_record.drr_u.drr_begin = zc_c->zc_begin_record; /* zc->zc_inject_record */ zc->zc_inject_record.zi_objset = @@ -281,8 +318,50 @@ zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request, zfs_cmd_v28_t *zc28_c; zfs_cmd_deadman_t *zcdm_c; zfs_cmd_zcmd_t *zcmd_c; + zfs_cmd_edbp_t *edbp_c; switch (cflag) { + case ZFS_CMD_COMPAT_EDBP: + edbp_c = (void *)addr; + strlcpy(edbp_c->zc_name, zc->zc_name, MAXPATHLEN); + strlcpy(edbp_c->zc_value, zc->zc_value, MAXPATHLEN * 2); + strlcpy(edbp_c->zc_string, zc->zc_string, MAXPATHLEN); + +#define ZCMD_COPY(field) edbp_c->field = zc->field + ZCMD_COPY(zc_nvlist_src); + ZCMD_COPY(zc_nvlist_src_size); + ZCMD_COPY(zc_nvlist_dst); + ZCMD_COPY(zc_nvlist_dst_size); + ZCMD_COPY(zc_nvlist_dst_filled); + ZCMD_COPY(zc_pad2); + ZCMD_COPY(zc_history); + ZCMD_COPY(zc_guid); + ZCMD_COPY(zc_nvlist_conf); + ZCMD_COPY(zc_nvlist_conf_size); + ZCMD_COPY(zc_cookie); + ZCMD_COPY(zc_objset_type); + ZCMD_COPY(zc_perm_action); + ZCMD_COPY(zc_history_len); + ZCMD_COPY(zc_history_offset); + ZCMD_COPY(zc_obj); + ZCMD_COPY(zc_iflags); + ZCMD_COPY(zc_share); + ZCMD_COPY(zc_jailid); + ZCMD_COPY(zc_objset_stats); + edbp_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; + ZCMD_COPY(zc_inject_record); + ZCMD_COPY(zc_defer_destroy); + ZCMD_COPY(zc_flags); + ZCMD_COPY(zc_action_handle); + ZCMD_COPY(zc_cleanup_fd); + ZCMD_COPY(zc_simple); + ZCMD_COPY(zc_sendobj); + ZCMD_COPY(zc_fromobj); + ZCMD_COPY(zc_createtxg); + ZCMD_COPY(zc_stat); +#undef ZCMD_COPY + break; + case ZFS_CMD_COMPAT_ZCMD: zcmd_c = (void *)addr; /* zc */ @@ -311,14 +390,7 @@ zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request, ZCMD_COPY(zc_share); ZCMD_COPY(zc_jailid); ZCMD_COPY(zc_objset_stats); - - /* - * zc_begin_record, zc_inject_record didn't change in embedeed-data - * block pointers - * - * TODO: CTASSERT? - */ - ZCMD_COPY(zc_begin_record); + zcmd_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; ZCMD_COPY(zc_inject_record); /* boolean_t -> uint32_t */ @@ -328,7 +400,6 @@ zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request, ZCMD_COPY(zc_action_handle); ZCMD_COPY(zc_cleanup_fd); ZCMD_COPY(zc_simple); - bcopy(zc->zc_pad, zcmd_c->zc_pad, sizeof(zcmd_c->zc_pad)); ZCMD_COPY(zc_sendobj); ZCMD_COPY(zc_fromobj); ZCMD_COPY(zc_createtxg); @@ -361,13 +432,12 @@ zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request, zcdm_c->zc_share = zc->zc_share; zcdm_c->zc_jailid = zc->zc_jailid; zcdm_c->zc_objset_stats = zc->zc_objset_stats; - zcdm_c->zc_begin_record = zc->zc_begin_record; + zcdm_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; zcdm_c->zc_defer_destroy = zc->zc_defer_destroy; zcdm_c->zc_temphold = 0; zcdm_c->zc_action_handle = zc->zc_action_handle; zcdm_c->zc_cleanup_fd = zc->zc_cleanup_fd; zcdm_c->zc_simple = zc->zc_simple; - bcopy(zc->zc_pad, zcdm_c->zc_pad, sizeof(zcdm_c->zc_pad)); zcdm_c->zc_sendobj = zc->zc_sendobj; zcdm_c->zc_fromobj = zc->zc_fromobj; zcdm_c->zc_createtxg = zc->zc_createtxg; @@ -407,13 +477,12 @@ zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request, zc28_c->zc_share = zc->zc_share; zc28_c->zc_jailid = zc->zc_jailid; zc28_c->zc_objset_stats = zc->zc_objset_stats; - zc28_c->zc_begin_record = zc->zc_begin_record; + zc28_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; zc28_c->zc_defer_destroy = zc->zc_defer_destroy; zc28_c->zc_temphold = 0; zc28_c->zc_action_handle = zc->zc_action_handle; zc28_c->zc_cleanup_fd = zc->zc_cleanup_fd; zc28_c->zc_simple = zc->zc_simple; - bcopy(zc->zc_pad, zc28_c->zc_pad, sizeof(zc28_c->zc_pad)); zc28_c->zc_sendobj = zc->zc_sendobj; zc28_c->zc_fromobj = zc->zc_fromobj; zc28_c->zc_createtxg = zc->zc_createtxg; @@ -479,7 +548,7 @@ zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request, zc_c->zc_share = zc->zc_share; zc_c->zc_jailid = zc->zc_jailid; zc_c->zc_objset_stats = zc->zc_objset_stats; - zc_c->zc_begin_record = zc->zc_begin_record; + zc_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; /* zc_inject_record */ zc_c->zc_inject_record.zi_objset = @@ -697,6 +766,12 @@ zcmd_ioctl_compat(int fd, int request, zfs_cmd_t *zc, const int cflag) zp.zfs_cmd_size = sizeof(zfs_cmd_t); zp.zfs_ioctl_version = ZFS_IOCVER_CURRENT; return (ioctl(fd, ncmd, &zp)); + case ZFS_CMD_COMPAT_EDBP: + ncmd = _IOWR('Z', request, struct zfs_iocparm); + zp.zfs_cmd = (uint64_t)zc; + zp.zfs_cmd_size = sizeof(zfs_cmd_edbp_t); + zp.zfs_ioctl_version = ZFS_IOCVER_EDBP; + return (ioctl(fd, ncmd, &zp)); case ZFS_CMD_COMPAT_ZCMD: ncmd = _IOWR('Z', request, struct zfs_iocparm); zp.zfs_cmd = (uint64_t)zc; @@ -801,7 +876,7 @@ zfs_ioctl_compat_innvl(zfs_cmd_t *zc, nvlist_t * innvl, const int vec, int err; if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC || - cflag == ZFS_CMD_COMPAT_ZCMD) + cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP) goto out; switch (vec) { @@ -953,7 +1028,7 @@ zfs_ioctl_compat_outnvl(zfs_cmd_t *zc, nvlist_t * outnvl, const int vec, nvlist_t *tmpnvl; if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC || - cflag == ZFS_CMD_COMPAT_ZCMD) + cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP) return (outnvl); switch (vec) { diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h index bdcac6f9c..8361aa32b 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h @@ -52,7 +52,8 @@ extern "C" { #define ZFS_IOCVER_LZC 2 #define ZFS_IOCVER_ZCMD 3 #define ZFS_IOCVER_EDBP 4 -#define ZFS_IOCVER_CURRENT ZFS_IOCVER_EDBP +#define ZFS_IOCVER_RESUME 5 +#define ZFS_IOCVER_CURRENT ZFS_IOCVER_RESUME /* compatibility conversion flag */ #define ZFS_CMD_COMPAT_NONE 0 @@ -61,6 +62,7 @@ extern "C" { #define ZFS_CMD_COMPAT_DEADMAN 3 #define ZFS_CMD_COMPAT_LZC 4 #define ZFS_CMD_COMPAT_ZCMD 5 +#define ZFS_CMD_COMPAT_EDBP 6 #define ZFS_IOC_COMPAT_PASS 254 #define ZFS_IOC_COMPAT_FAIL 255 @@ -246,6 +248,49 @@ typedef struct zfs_cmd_zcmd { zfs_stat_t zc_stat; } zfs_cmd_zcmd_t; +typedef struct zfs_cmd_edbp { + char zc_name[MAXPATHLEN]; /* name of pool or dataset */ + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ + int zc_pad2; + + /* + * The following members are for legacy ioctls which haven't been + * converted to the new method. + */ + uint64_t zc_history; /* really (char *) */ + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + zinject_record_t zc_inject_record; + uint32_t zc_defer_destroy; + uint32_t zc_flags; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + uint8_t zc_pad[3]; /* alignment */ + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_edbp_t; + #ifdef _KERNEL unsigned static long zfs_ioctl_v15_to_v28[] = { 0, /* 0 ZFS_IOC_POOL_CREATE */ diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c index dda72de87..9e4fd25f1 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c @@ -343,6 +343,10 @@ zfs_prop_init(void) zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel", ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET, "", "MLSLABEL"); + zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN, + "receive_resume_token", + NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "", "RESUMETOK"); /* readonly number properties */ zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c index 6846572f8..79de1d127 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -362,6 +362,17 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, * checksum/compression/copies. */ if (ds != NULL) { + boolean_t needlock = B_FALSE; + + /* + * Note: it's valid to open the objset if the dataset is + * long-held, in which case the pool_config lock will not + * be held. + */ + if (!dsl_pool_config_held(dmu_objset_pool(os))) { + needlock = B_TRUE; + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + } err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os); @@ -413,6 +424,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, recordsize_changed_cb, os); } } + if (needlock) + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (err != 0) { VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); @@ -469,6 +482,13 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) { int err = 0; + /* + * We shouldn't be doing anything with dsl_dataset_t's unless the + * pool_config lock is held, or the dataset is long-held. + */ + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) || + dsl_dataset_long_held(ds)); + mutex_enter(&ds->ds_opening_lock); if (ds->ds_objset == NULL) { objset_t *os; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index ef139616d..024db77d0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -66,12 +66,14 @@ int zfs_send_queue_length = 16 * 1024 * 1024; int zfs_recv_queue_length = 16 * 1024 * 1024; static char *dmu_recv_tag = "dmu_recv_tag"; -static const char *recv_clone_name = "%recv"; +const char *recv_clone_name = "%recv"; #define BP_SPAN(datablkszsec, indblkshift, level) \ (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (indblkshift - SPA_BLKPTRSHIFT))) +static void byteswap_record(dmu_replay_record_t *drr); + struct send_thread_arg { bqueue_t q; dsl_dataset_t *ds; /* Dataset to traverse */ @@ -79,6 +81,7 @@ struct send_thread_arg { int flags; /* flags to pass to traverse_dataset */ int error_code; boolean_t cancel; + zbookmark_phys_t resume; }; struct send_block_record { @@ -93,7 +96,7 @@ struct send_block_record { static int dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) { - dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset; + dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os); struct uio auio; struct iovec aiov; ASSERT0(len % 8); @@ -166,7 +169,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, * that the receiving system doesn't have any dbufs in the range * being freed. This is always true because there is a one-record * constraint: we only send one WRITE record for any given - * object+offset. We know that the one-record constraint is + * object,offset. We know that the one-record constraint is * true because we always send data in increasing order by * object,offset. * @@ -414,6 +417,19 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) { struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); + if (object < dsp->dsa_resume_object) { + /* + * Note: when resuming, we will visit all the dnodes in + * the block of dnodes that we are resuming from. In + * this case it's unnecessary to send the dnodes prior to + * the one we are resuming from. We should be at most one + * block's worth of dnodes behind the resume point. + */ + ASSERT3U(dsp->dsa_resume_object - object, <, + 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)); + return (0); + } + if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) return (dump_freeobjects(dsp, object, 1)); @@ -494,6 +510,9 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, uint64_t record_size; int err = 0; + ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || + zb->zb_object >= sta->resume.zb_object); + if (sta->cancel) return (SET_ERROR(EINTR)); @@ -530,8 +549,10 @@ send_traverse_thread(void *arg) struct send_block_record *data; if (st_arg->ds != NULL) { - err = traverse_dataset(st_arg->ds, st_arg->fromtxg, - st_arg->flags, send_cb, arg); + err = traverse_dataset_resume(st_arg->ds, + st_arg->fromtxg, &st_arg->resume, + st_arg->flags, send_cb, st_arg); + if (err != EINTR) st_arg->error_code = err; } @@ -560,6 +581,9 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) ASSERT3U(zb->zb_level, >=, 0); + ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || + zb->zb_object >= dsa->dsa_resume_object); + if (zb->zb_object != DMU_META_DNODE_OBJECT && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { return (0); @@ -620,6 +644,10 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) uint64_t offset; ASSERT0(zb->zb_level); + ASSERT(zb->zb_object > dsa->dsa_resume_object || + (zb->zb_object == dsa->dsa_resume_object && + zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) { @@ -680,11 +708,13 @@ get_next_record(bqueue_t *bq, struct send_block_record *data) */ static int dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, - zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok, + zfs_bookmark_phys_t *ancestor_zb, + boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, int outfd, + uint64_t resumeobj, uint64_t resumeoff, #ifdef illumos - boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off) + vnode_t *vp, offset_t *off) #else - boolean_t large_block_ok, int outfd, struct file *fp, offset_t *off) + struct file *fp, offset_t *off) #endif { objset_t *os; @@ -693,7 +723,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, int err; uint64_t fromtxg = 0; uint64_t featureflags = 0; - struct send_thread_arg to_arg; + struct send_thread_arg to_arg = { 0 }; err = dmu_objset_from_ds(to_ds, &os); if (err != 0) { @@ -730,6 +760,10 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; } + if (resumeobj != 0 || resumeoff != 0) { + featureflags |= DMU_BACKUP_FEATURE_RESUMING; + } + DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, featureflags); @@ -766,6 +800,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, dsp->dsa_pending_op = PENDING_NONE; dsp->dsa_incremental = (ancestor_zb != NULL); dsp->dsa_featureflags = featureflags; + dsp->dsa_resume_object = resumeobj; + dsp->dsa_resume_offset = resumeoff; mutex_enter(&to_ds->ds_sendstream_lock); list_insert_head(&to_ds->ds_sendstreams, dsp); @@ -774,7 +810,27 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, dsl_dataset_long_hold(to_ds, FTAG); dsl_pool_rele(dp, tag); - if (dump_record(dsp, NULL, 0) != 0) { + void *payload = NULL; + size_t payload_len = 0; + if (resumeobj != 0 || resumeoff != 0) { + dmu_object_info_t to_doi; + err = dmu_object_info(os, resumeobj, &to_doi); + if (err != 0) + goto out; + SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0, + resumeoff / to_doi.doi_data_block_size); + + nvlist_t *nvl = fnvlist_alloc(); + fnvlist_add_uint64(nvl, "resume_object", resumeobj); + fnvlist_add_uint64(nvl, "resume_offset", resumeoff); + payload = fnvlist_pack(nvl, &payload_len); + drr->drr_payloadlen = payload_len; + fnvlist_free(nvl); + } + + err = dump_record(dsp, payload, payload_len); + fnvlist_pack_free(payload, payload_len); + if (err != 0) { err = dsp->dsa_err; goto out; } @@ -889,22 +945,22 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, is_clone = (fromds->ds_dir != ds->ds_dir); dsl_dataset_rele(fromds, FTAG); err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, outfd, fp, off); + embedok, large_block_ok, outfd, 0, 0, fp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, outfd, fp, off); + embedok, large_block_ok, outfd, 0, 0, fp, off); } dsl_dataset_rele(ds, FTAG); return (err); } int -dmu_send(const char *tosnap, const char *fromsnap, - boolean_t embedok, boolean_t large_block_ok, +dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, + boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff, #ifdef illumos - int outfd, vnode_t *vp, offset_t *off) + vnode_t *vp, offset_t *off) #else - int outfd, struct file *fp, offset_t *off) + struct file *fp, offset_t *off) #endif { dsl_pool_t *dp; @@ -972,10 +1028,12 @@ dmu_send(const char *tosnap, const char *fromsnap, return (err); } err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, outfd, fp, off); + embedok, large_block_ok, + outfd, resumeobj, resumeoff, fp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, outfd, fp, off); + embedok, large_block_ok, + outfd, resumeobj, resumeoff, fp, off); } if (owned) dsl_dataset_disown(ds, FTAG); @@ -1218,6 +1276,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) /* already checked */ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING)); if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM || @@ -1230,6 +1289,10 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) spa_version(dp->dp_spa) < SPA_VERSION_SA) return (SET_ERROR(ENOTSUP)); + if (drba->drba_cookie->drc_resumable && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)) + return (SET_ERROR(ENOTSUP)); + /* * The receiving code doesn't know how to translate a WRITE_EMBEDDED * record to a plan WRITE record, so the pool must have the @@ -1333,15 +1396,16 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; struct drr_begin *drrb = drba->drba_cookie->drc_drrb; const char *tofs = drba->drba_cookie->drc_tofs; dsl_dataset_t *ds, *newds; uint64_t dsobj; int error; - uint64_t crflags; + uint64_t crflags = 0; - crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ? - DS_FLAG_CI_DATASET : 0; + if (drrb->drr_flags & DRR_FLAG_CI_DATA) + crflags |= DS_FLAG_CI_DATASET; error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error == 0) { @@ -1379,6 +1443,31 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) } VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); + if (drba->drba_cookie->drc_resumable) { + dsl_dataset_zapify(newds, tx); + if (drrb->drr_fromguid != 0) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID, + 8, 1, &drrb->drr_fromguid, tx)); + } + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID, + 8, 1, &drrb->drr_toguid, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME, + 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx)); + uint64_t one = 1; + uint64_t zero = 0; + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT, + 8, 1, &one, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET, + 8, 1, &zero, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, + 8, 1, &zero, tx)); + if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_EMBED_DATA) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, + 8, 1, &one, tx)); + } + } + dmu_buf_will_dirty(newds->ds_dbuf, tx); dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; @@ -1396,56 +1485,192 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) spa_history_log_internal_ds(newds, "receive", tx, ""); } +static int +dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + struct drr_begin *drrb = drba->drba_cookie->drc_drrb; + int error; + uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + dsl_dataset_t *ds; + const char *tofs = drba->drba_cookie->drc_tofs; + + /* already checked */ + ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING); + + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM || + drrb->drr_type >= DMU_OST_NUMTYPES) + return (SET_ERROR(EINVAL)); + + /* Verify pool version supports SA if SA_SPILL feature set */ + if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && + spa_version(dp->dp_spa) < SPA_VERSION_SA) + return (SET_ERROR(ENOTSUP)); + + /* + * The receiving code doesn't know how to translate a WRITE_EMBEDDED + * record to a plain WRITE record, so the pool must have the + * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED + * records. Same with WRITE_EMBEDDED records that use LZ4 compression. + */ + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) + return (SET_ERROR(ENOTSUP)); + + char recvname[ZFS_MAXNAMELEN]; + + (void) snprintf(recvname, sizeof (recvname), "%s/%s", + tofs, recv_clone_name); + + if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { + /* %recv does not exist; continue in tofs */ + error = dsl_dataset_hold(dp, tofs, FTAG, &ds); + if (error != 0) + return (error); + } + + /* check that ds is marked inconsistent */ + if (!DS_IS_INCONSISTENT(ds)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* check that there is resuming data, and that the toguid matches */ + if (!dsl_dataset_is_zapified(ds)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + uint64_t val; + error = zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val); + if (error != 0 || drrb->drr_toguid != val) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * Check if the receive is still running. If so, it will be owned. + * Note that nothing else can own the dataset (e.g. after the receive + * fails) because it will be marked inconsistent. + */ + if (dsl_dataset_has_owner(ds)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EBUSY)); + } + + /* There should not be any snapshots of this fs yet. */ + if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * Note: resume point will be checked when we process the first WRITE + * record. + */ + + /* check that the origin matches */ + val = 0; + (void) zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val); + if (drrb->drr_fromguid != val) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + const char *tofs = drba->drba_cookie->drc_tofs; + dsl_dataset_t *ds; + uint64_t dsobj; + char recvname[ZFS_MAXNAMELEN]; + + (void) snprintf(recvname, sizeof (recvname), "%s/%s", + tofs, recv_clone_name); + + if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { + /* %recv does not exist; continue in tofs */ + VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds)); + drba->drba_cookie->drc_newfs = B_TRUE; + } + + /* clear the inconsistent flag so that we can own it */ + ASSERT(DS_IS_INCONSISTENT(ds)); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; + dsobj = ds->ds_object; + dsl_dataset_rele(ds, FTAG); + + VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds)); + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; + + ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds))); + + drba->drba_cookie->drc_ds = ds; + + spa_history_log_internal_ds(ds, "resume receive", tx, ""); +} + /* * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() * succeeds; otherwise we will leak the holds on the datasets. */ int -dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, - boolean_t force, char *origin, dmu_recv_cookie_t *drc) +dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, + boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc) { dmu_recv_begin_arg_t drba = { 0 }; - dmu_replay_record_t *drr; bzero(drc, sizeof (dmu_recv_cookie_t)); - drc->drc_drrb = drrb; + drc->drc_drr_begin = drr_begin; + drc->drc_drrb = &drr_begin->drr_u.drr_begin; drc->drc_tosnap = tosnap; drc->drc_tofs = tofs; drc->drc_force = force; + drc->drc_resumable = resumable; drc->drc_cred = CRED(); - if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) + if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { drc->drc_byteswap = B_TRUE; - else if (drrb->drr_magic != DMU_BACKUP_MAGIC) - return (SET_ERROR(EINVAL)); - - drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); - drr->drr_type = DRR_BEGIN; - drr->drr_u.drr_begin = *drc->drc_drrb; - if (drc->drc_byteswap) { - fletcher_4_incremental_byteswap(drr, + fletcher_4_incremental_byteswap(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); - } else { - fletcher_4_incremental_native(drr, + byteswap_record(drr_begin); + } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { + fletcher_4_incremental_native(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); - } - kmem_free(drr, sizeof (dmu_replay_record_t)); - - if (drc->drc_byteswap) { - drrb->drr_magic = BSWAP_64(drrb->drr_magic); - drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); - drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); - drrb->drr_type = BSWAP_32(drrb->drr_type); - drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); - drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); + } else { + return (SET_ERROR(EINVAL)); } drba.drba_origin = origin; drba.drba_cookie = drc; drba.drba_cred = CRED(); - return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, - &drba, 5, ZFS_SPACE_CHECK_NORMAL)); + if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_RESUMING) { + return (dsl_sync_task(tofs, + dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync, + &drba, 5, ZFS_SPACE_CHECK_NORMAL)); + } else { + return (dsl_sync_task(tofs, + dmu_recv_begin_check, dmu_recv_begin_sync, + &drba, 5, ZFS_SPACE_CHECK_NORMAL)); + } } struct receive_record_arg { @@ -1457,6 +1682,7 @@ struct receive_record_arg { */ arc_buf_t *write_buf; int payload_size; + uint64_t bytes_read; /* bytes read from stream when record created */ boolean_t eos_marker; /* Marks the end of the stream */ bqueue_node_t node; }; @@ -1465,6 +1691,7 @@ struct receive_writer_arg { objset_t *os; boolean_t byteswap; bqueue_t q; + /* * These three args are used to signal to the main thread that we're * done. @@ -1472,9 +1699,13 @@ struct receive_writer_arg { kmutex_t mutex; kcondvar_t cv; boolean_t done; + int err; /* A map from guid to dataset to help handle dedup'd streams. */ avl_tree_t *guid_to_ds_map; + boolean_t resumable; + uint64_t last_object, last_offset; + uint64_t bytes_read; /* bytes read when current record created */ }; struct receive_arg { @@ -1482,6 +1713,7 @@ struct receive_arg { kthread_t *td; struct file *fp; uint64_t voff; /* The current offset in the stream */ + uint64_t bytes_read; /* * A record that has had its payload read in, but hasn't yet been handed * off to the worker thread. @@ -1577,14 +1809,21 @@ receive_read(struct receive_arg *ra, int len, void *buf) ra->err = restore_bytes(ra, buf + done, len - done, ra->voff, &resid); - if (resid == len - done) - ra->err = SET_ERROR(EINVAL); + if (resid == len - done) { + /* + * Note: ECKSUM indicates that the receive + * was interrupted and can potentially be resumed. + */ + ra->err = SET_ERROR(ECKSUM); + } ra->voff += len - done - resid; done = len - resid; if (ra->err != 0) return (ra->err); } + ra->bytes_read += len; + ASSERT3U(done, ==, len); return (0); } @@ -1685,6 +1924,43 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) } } +static void +save_resume_state(struct receive_writer_arg *rwa, + uint64_t object, uint64_t offset, dmu_tx_t *tx) +{ + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + if (!rwa->resumable) + return; + + /* + * We use ds_resume_bytes[] != 0 to indicate that we need to + * update this on disk, so it must not be 0. + */ + ASSERT(rwa->bytes_read != 0); + + /* + * We only resume from write records, which have a valid + * (non-meta-dnode) object number. + */ + ASSERT(object != 0); + + /* + * For resuming to work correctly, we must receive records in order, + * sorted by object,offset. This is checked by the callers, but + * assert it here for good measure. + */ + ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]); + ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] || + offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]); + ASSERT3U(rwa->bytes_read, >=, + rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]); + + rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object; + rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset; + rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; +} + static int receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, void *data) @@ -1781,6 +2057,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, dmu_buf_rele(db, FTAG); } dmu_tx_commit(tx); + return (0); } @@ -1806,6 +2083,7 @@ receive_freeobjects(struct receive_writer_arg *rwa, if (err != 0) return (err); } + return (0); } @@ -1820,6 +2098,18 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); + /* + * For resuming to work, records must be in increasing order + * by (object, offset). + */ + if (drrw->drr_object < rwa->last_object || + (drrw->drr_object == rwa->last_object && + drrw->drr_offset < rwa->last_offset)) { + return (SET_ERROR(EINVAL)); + } + rwa->last_object = drrw->drr_object; + rwa->last_offset = drrw->drr_offset; + if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); @@ -1843,8 +2133,17 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) return (SET_ERROR(EINVAL)); dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); + + /* + * Note: If the receive fails, we want the resume stream to start + * with the same record that we last successfully received (as opposed + * to the next record), so that we can verify that we are + * resuming from the correct location. + */ + save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); dmu_tx_commit(tx); dmu_buf_rele(bonus, FTAG); + return (0); } @@ -1903,43 +2202,48 @@ receive_write_byref(struct receive_writer_arg *rwa, dmu_write(rwa->os, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); dmu_buf_rele(dbp, FTAG); + + /* See comment in restore_write. */ + save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx); dmu_tx_commit(tx); return (0); } static int receive_write_embedded(struct receive_writer_arg *rwa, - struct drr_write_embedded *drrwnp, void *data) + struct drr_write_embedded *drrwe, void *data) { dmu_tx_t *tx; int err; - if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset) + if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset) return (EINVAL); - if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE) + if (drrwe->drr_psize > BPE_PAYLOAD_SIZE) return (EINVAL); - if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES) + if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES) return (EINVAL); - if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) + if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (EINVAL); tx = dmu_tx_create(rwa->os); - dmu_tx_hold_write(tx, drrwnp->drr_object, - drrwnp->drr_offset, drrwnp->drr_length); + dmu_tx_hold_write(tx, drrwe->drr_object, + drrwe->drr_offset, drrwe->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } - dmu_write_embedded(rwa->os, drrwnp->drr_object, - drrwnp->drr_offset, data, drrwnp->drr_etype, - drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, + dmu_write_embedded(rwa->os, drrwe->drr_object, + drrwe->drr_offset, data, drrwe->drr_etype, + drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize, rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); + /* See comment in restore_write. */ + save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx); dmu_tx_commit(tx); return (0); } @@ -2013,10 +2317,16 @@ receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) static void dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) { - char name[MAXNAMELEN]; - dsl_dataset_name(drc->drc_ds, name); - dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); - (void) dsl_destroy_head(name); + if (drc->drc_resumable) { + /* wait for our resume state to be written to disk */ + txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0); + dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + } else { + char name[MAXNAMELEN]; + dsl_dataset_name(drc->drc_ds, name); + dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + (void) dsl_destroy_head(name); + } } static void @@ -2043,12 +2353,17 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) if (len != 0) { ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); - ra->rrd->payload = buf; - ra->rrd->payload_size = len; - err = receive_read(ra, len, ra->rrd->payload); + err = receive_read(ra, len, buf); if (err != 0) return (err); - receive_cksum(ra, len, ra->rrd->payload); + receive_cksum(ra, len, buf); + + /* note: rrd is NULL when reading the begin record's payload */ + if (ra->rrd != NULL) { + ra->rrd->payload = buf; + ra->rrd->payload_size = len; + ra->rrd->bytes_read = ra->bytes_read; + } } ra->prev_cksum = ra->cksum; @@ -2056,6 +2371,7 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); err = receive_read(ra, sizeof (ra->next_rrd->header), &ra->next_rrd->header); + ra->next_rrd->bytes_read = ra->bytes_read; if (err != 0) { kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); ra->next_rrd = NULL; @@ -2235,7 +2551,7 @@ receive_read_record(struct receive_arg *ra) { struct drr_end *drre = &ra->rrd->header.drr_u.drr_end; if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) - return (SET_ERROR(EINVAL)); + return (SET_ERROR(ECKSUM)); return (0); } case DRR_SPILL: @@ -2262,6 +2578,10 @@ receive_process_record(struct receive_writer_arg *rwa, { int err; + /* Processing in order, therefore bytes_read should be increasing. */ + ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); + rwa->bytes_read = rrd->bytes_read; + switch (rrd->header.drr_type) { case DRR_OBJECT: { @@ -2357,6 +2677,33 @@ receive_writer_thread(void *arg) thread_exit(); } +static int +resume_check(struct receive_arg *ra, nvlist_t *begin_nvl) +{ + uint64_t val; + objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset; + uint64_t dsobj = dmu_objset_id(ra->os); + uint64_t resume_obj, resume_off; + + if (nvlist_lookup_uint64(begin_nvl, + "resume_object", &resume_obj) != 0 || + nvlist_lookup_uint64(begin_nvl, + "resume_offset", &resume_off) != 0) { + return (SET_ERROR(EINVAL)); + } + VERIFY0(zap_lookup(mos, dsobj, + DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val)); + if (resume_obj != val) + return (SET_ERROR(EINVAL)); + VERIFY0(zap_lookup(mos, dsobj, + DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val)); + if (resume_off != val) + return (SET_ERROR(EINVAL)); + + return (0); +} + + /* * Read in the stream's records, one by one, and apply them to the pool. There * are two threads involved; the thread that calls this function will spin up a @@ -2377,12 +2724,20 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, struct receive_arg ra = { 0 }; struct receive_writer_arg rwa = { 0 }; int featureflags; + nvlist_t *begin_nvl = NULL; ra.byteswap = drc->drc_byteswap; ra.cksum = drc->drc_cksum; ra.td = curthread; ra.fp = fp; ra.voff = *voffp; + + if (dsl_dataset_is_zapified(drc->drc_ds)) { + (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset, + drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES, + sizeof (ra.bytes_read), 1, &ra.bytes_read); + } + list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node), offsetof(struct receive_ign_obj_node, node)); @@ -2435,9 +2790,29 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, drc->drc_guid_to_ds_map = rwa.guid_to_ds_map; } - err = receive_read_payload_and_next_header(&ra, 0, NULL); - if (err) + uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen; + void *payload = NULL; + if (payloadlen != 0) + payload = kmem_alloc(payloadlen, KM_SLEEP); + + err = receive_read_payload_and_next_header(&ra, payloadlen, payload); + if (err != 0) { + if (payloadlen != 0) + kmem_free(payload, payloadlen); goto out; + } + if (payloadlen != 0) { + err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP); + kmem_free(payload, payloadlen); + if (err != 0) + goto out; + } + + if (featureflags & DMU_BACKUP_FEATURE_RESUMING) { + err = resume_check(&ra, begin_nvl); + if (err != 0) + goto out; + } (void) bqueue_init(&rwa.q, zfs_recv_queue_length, offsetof(struct receive_record_arg, node)); @@ -2445,6 +2820,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL); rwa.os = ra.os; rwa.byteswap = drc->drc_byteswap; + rwa.resumable = drc->drc_resumable; (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, &p0, TS_RUN, minclsyspri); @@ -2503,13 +2879,15 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, err = rwa.err; out: + nvlist_free(begin_nvl); if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) zfs_onexit_fd_rele(cleanup_fd); if (err != 0) { /* - * destroy what we created, so we don't leave it in the - * inconsistent restoring state. + * Clean up references. If receive is not resumable, + * destroy what we created, so we don't leave it in + * the inconsistent state. */ dmu_recv_cleanup_ds(drc); } @@ -2669,6 +3047,20 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; + if (dsl_dataset_has_resume_receive_state(ds)) { + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OBJECT, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OFFSET, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_BYTES, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TONAME, tx); + } } drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c index 151d04c0f..2c718df04 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -48,6 +48,7 @@ typedef struct prefetch_data { int pd_flags; boolean_t pd_cancel; boolean_t pd_exited; + zbookmark_phys_t pd_resume; } prefetch_data_t; typedef struct traverse_data { @@ -307,59 +308,52 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, arc_flags_t flags = ARC_FLAG_WAIT; int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - dnode_phys_t *cdnp; err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) goto post; - cdnp = buf->b_data; + dnode_phys_t *child_dnp = buf->b_data; for (i = 0; i < epb; i++) { - prefetch_dnode_metadata(td, &cdnp[i], zb->zb_objset, - zb->zb_blkid * epb + i); + prefetch_dnode_metadata(td, &child_dnp[i], + zb->zb_objset, zb->zb_blkid * epb + i); } /* recursively visitbp() blocks below this */ for (i = 0; i < epb; i++) { - err = traverse_dnode(td, &cdnp[i], zb->zb_objset, - zb->zb_blkid * epb + i); + err = traverse_dnode(td, &child_dnp[i], + zb->zb_objset, zb->zb_blkid * epb + i); if (err != 0) break; } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { arc_flags_t flags = ARC_FLAG_WAIT; - objset_phys_t *osp; - dnode_phys_t *mdnp, *gdnp, *udnp; err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) goto post; - osp = buf->b_data; - mdnp = &osp->os_meta_dnode; - gdnp = &osp->os_groupused_dnode; - udnp = &osp->os_userused_dnode; - - prefetch_dnode_metadata(td, mdnp, zb->zb_objset, + objset_phys_t *osp = buf->b_data; + prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset, DMU_META_DNODE_OBJECT); if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { - prefetch_dnode_metadata(td, gdnp, zb->zb_objset, - DMU_GROUPUSED_OBJECT); - prefetch_dnode_metadata(td, udnp, zb->zb_objset, - DMU_USERUSED_OBJECT); + prefetch_dnode_metadata(td, &osp->os_groupused_dnode, + zb->zb_objset, DMU_GROUPUSED_OBJECT); + prefetch_dnode_metadata(td, &osp->os_userused_dnode, + zb->zb_objset, DMU_USERUSED_OBJECT); } - err = traverse_dnode(td, mdnp, zb->zb_objset, + err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset, DMU_META_DNODE_OBJECT); if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { - err = traverse_dnode(td, gdnp, zb->zb_objset, - DMU_GROUPUSED_OBJECT); + err = traverse_dnode(td, &osp->os_groupused_dnode, + zb->zb_objset, DMU_GROUPUSED_OBJECT); } if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { - err = traverse_dnode(td, udnp, zb->zb_objset, - DMU_USERUSED_OBJECT); + err = traverse_dnode(td, &osp->os_userused_dnode, + zb->zb_objset, DMU_USERUSED_OBJECT); } } @@ -391,9 +385,15 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, * Set the bookmark to the first level-0 block that we need * to visit. This way, the resuming code does not need to * deal with resuming from indirect blocks. + * + * Note, if zb_level <= 0, dnp may be NULL, so we don't want + * to dereference it. */ - td->td_resume->zb_blkid = zb->zb_blkid << - (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); + td->td_resume->zb_blkid = zb->zb_blkid; + if (zb->zb_level > 0) { + td->td_resume->zb_blkid <<= zb->zb_level * + (dnp->dn_indblkshift - SPA_BLKPTRSHIFT); + } td->td_paused = B_TRUE; } @@ -425,6 +425,10 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, int j, err = 0; zbookmark_phys_t czb; + if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL && + object < td->td_resume->zb_object) + return (0); + if (td->td_flags & TRAVERSE_PRE) { SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, ZB_DNODE_BLKID); @@ -501,6 +505,7 @@ traverse_prefetch_thread(void *arg) td.td_func = traverse_prefetcher; td.td_arg = td_main->td_pfd; td.td_pfd = NULL; + td.td_resume = &td_main->td_pfd->pd_resume; SET_BOOKMARK(&czb, td.td_objset, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); @@ -529,12 +534,6 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, ASSERT(ds == NULL || objset == ds->ds_object); ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); - /* - * The data prefetching mechanism (the prefetch thread) is incompatible - * with resuming from a bookmark. - */ - ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA)); - td.td_spa = spa; td.td_objset = objset; td.td_rootbp = rootbp; @@ -554,6 +553,8 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, } pd.pd_flags = flags; + if (resume != NULL) + pd.pd_resume = *resume; mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); @@ -601,11 +602,19 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, * in syncing context). */ int -traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, - blkptr_cb_t func, void *arg) +traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start, + zbookmark_phys_t *resume, + int flags, blkptr_cb_t func, void *arg) { return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, - &dsl_dataset_phys(ds)->ds_bp, txg_start, NULL, flags, func, arg)); + &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg)); +} + +int +traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, + int flags, blkptr_cb_t func, void *arg) +{ + return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg)); } int @@ -625,7 +634,6 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { int err; - uint64_t obj; dsl_pool_t *dp = spa_get_dsl(spa); objset_t *mos = dp->dp_meta_objset; boolean_t hard = (flags & TRAVERSE_HARD); @@ -637,8 +645,8 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, return (err); /* visit each dataset */ - for (obj = 1; err == 0; - err = dmu_object_next(mos, &obj, FALSE, txg_start)) { + for (uint64_t obj = 1; err == 0; + err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) { dmu_object_info_t doi; err = dmu_object_info(mos, obj, &doi); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index 4fbbe7c4e..47ec6d52c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,8 @@ #include #include #include +#include +#include SYSCTL_DECL(_vfs_zfs); @@ -701,6 +704,7 @@ dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) { boolean_t gotit = FALSE; + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); mutex_enter(&ds->ds_lock); if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) { ds->ds_owner = tag; @@ -711,6 +715,16 @@ dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) return (gotit); } +boolean_t +dsl_dataset_has_owner(dsl_dataset_t *ds) +{ + boolean_t rv; + mutex_enter(&ds->ds_lock); + rv = (ds->ds_owner != NULL); + mutex_exit(&ds->ds_lock); + return (rv); +} + static void dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx) { @@ -1657,6 +1671,21 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid; + if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) { + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1, + &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx)); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1, + &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx)); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1, + &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx)); + ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0; + ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0; + ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0; + } + dmu_objset_sync(ds->ds_objset, zio, tx); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { @@ -1712,6 +1741,76 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) nvlist_free(propval); } +static void +get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (dsl_dataset_has_resume_receive_state(ds)) { + char *str; + void *packed; + uint8_t *compressed; + uint64_t val; + nvlist_t *token_nv = fnvlist_alloc(); + size_t packed_size, compressed_size; + + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "fromguid", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "object", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "offset", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "bytes", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "toguid", val); + } + char buf[256]; + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) { + fnvlist_add_string(token_nv, "toname", buf); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_EMBEDOK) == 0) { + fnvlist_add_boolean(token_nv, "embedok"); + } + packed = fnvlist_pack(token_nv, &packed_size); + fnvlist_free(token_nv); + compressed = kmem_alloc(packed_size, KM_SLEEP); + + compressed_size = gzip_compress(packed, compressed, + packed_size, packed_size, 6); + + zio_cksum_t cksum; + fletcher_4_native(compressed, compressed_size, &cksum); + + str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP); + for (int i = 0; i < compressed_size; i++) { + (void) sprintf(str + i * 2, "%02x", compressed[i]); + } + str[compressed_size * 2] = '\0'; + char *propval = kmem_asprintf("%u-%llx-%llx-%s", + ZFS_SEND_RESUME_TOKEN_VERSION, + (longlong_t)cksum.zc_word[0], + (longlong_t)packed_size, str); + dsl_prop_nvlist_add_string(nv, + ZFS_PROP_RECEIVE_RESUME_TOKEN, propval); + kmem_free(packed, packed_size); + kmem_free(str, compressed_size * 2 + 1); + kmem_free(compressed, packed_size); + strfree(propval); + } +} + void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { @@ -1783,6 +1882,29 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) } } } + + if (!dsl_dataset_is_snapshot(ds)) { + /* + * A failed "newfs" (e.g. full) resumable receive leaves + * the stats set on this dataset. Check here for the prop. + */ + get_receive_resume_stats(ds, nv); + + /* + * A failed incremental resumable receive leaves the + * stats set on our child named "%recv". Check the child + * for the prop. + */ + char recvname[ZFS_MAXNAMELEN]; + dsl_dataset_t *recv_ds; + dsl_dataset_name(ds, recvname); + (void) strcat(recvname, "/"); + (void) strcat(recvname, recv_clone_name); + if (dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) { + get_receive_resume_stats(recv_ds, nv); + dsl_dataset_rele(recv_ds, FTAG); + } + } } void @@ -3467,3 +3589,20 @@ dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx) objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx); } + +boolean_t +dsl_dataset_is_zapified(dsl_dataset_t *ds) +{ + dmu_object_info_t doi; + + dmu_object_info_from_db(ds->ds_dbuf, &doi); + return (doi.doi_type == DMU_OTN_ZAP_METADATA); +} + +boolean_t +dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds) +{ + return (dsl_dataset_is_zapified(ds) && + zap_contains(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c index c7a623c82..7de984507 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c @@ -968,9 +968,17 @@ dsl_destroy_inconsistent(const char *dsname, void *arg) objset_t *os; if (dmu_objset_hold(dsname, FTAG, &os) == 0) { - boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os)); + boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os)); + + /* + * If the dataset is inconsistent because a resumable receive + * has failed, then do not destroy it. + */ + if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os))) + need_destroy = B_FALSE; + dmu_objset_rele(os, FTAG); - if (inconsistent) + if (need_destroy) (void) dsl_destroy_head(dsname); } return (0); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h index 6f67b5a0b..e8d6294b9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h @@ -25,7 +25,7 @@ /* * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012, Martin Matuska . All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_IMPL_H @@ -300,6 +300,8 @@ typedef struct dmu_sendarg { uint64_t dsa_featureflags; uint64_t dsa_last_data_object; uint64_t dsa_last_data_offset; + uint64_t dsa_resume_object; + uint64_t dsa_resume_offset; } dmu_sendarg_t; void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h index 143d43f03..2865e8291 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h @@ -35,13 +35,16 @@ struct vnode; struct dsl_dataset; struct drr_begin; struct avl_tree; +struct dmu_replay_record; -int dmu_send(const char *tosnap, const char *fromsnap, - boolean_t embedok, boolean_t large_block_ok, +extern const char *recv_clone_name; + +int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, + boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff, #ifdef illumos - int outfd, struct vnode *vp, offset_t *off); + struct vnode *vp, offset_t *off); #else - int outfd, struct file *fp, offset_t *off); + struct file *fp, offset_t *off); #endif int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds, uint64_t *sizep); @@ -57,12 +60,14 @@ int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, typedef struct dmu_recv_cookie { struct dsl_dataset *drc_ds; + struct dmu_replay_record *drc_drr_begin; struct drr_begin *drc_drrb; const char *drc_tofs; const char *drc_tosnap; boolean_t drc_newfs; boolean_t drc_byteswap; boolean_t drc_force; + boolean_t drc_resumable; struct avl_tree *drc_guid_to_ds_map; zio_cksum_t drc_cksum; uint64_t drc_newsnapobj; @@ -70,8 +75,9 @@ typedef struct dmu_recv_cookie { cred_t *drc_cred; } dmu_recv_cookie_t; -int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, - boolean_t force, char *origin, dmu_recv_cookie_t *drc); +int dmu_recv_begin(char *tofs, char *tosnap, + struct dmu_replay_record *drr_begin, + boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc); #ifdef illumos int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp, #else diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h index 544b721e4..c010edd44 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h @@ -54,6 +54,8 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); +int traverse_dataset_resume(struct dsl_dataset *ds, uint64_t txg_start, + zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg); int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, uint64_t txg_start, zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h index c9cd5890a..d7df05b30 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h @@ -91,6 +91,18 @@ struct dsl_pool; */ #define DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks" +/* + * These fields are set on datasets that are in the middle of a resumable + * receive, and allow the sender to resume the send if it is interrupted. + */ +#define DS_FIELD_RESUME_FROMGUID "com.delphix:resume_fromguid" +#define DS_FIELD_RESUME_TONAME "com.delphix:resume_toname" +#define DS_FIELD_RESUME_TOGUID "com.delphix:resume_toguid" +#define DS_FIELD_RESUME_OBJECT "com.delphix:resume_object" +#define DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset" +#define DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes" +#define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok" + /* * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose * name lookups should be performed case-insensitively. @@ -184,6 +196,14 @@ typedef struct dsl_dataset { kmutex_t ds_sendstream_lock; list_t ds_sendstreams; + /* + * When in the middle of a resumable receive, tracks how much + * progress we have made. + */ + uint64_t ds_resume_object[TXG_SIZE]; + uint64_t ds_resume_offset[TXG_SIZE]; + uint64_t ds_resume_bytes[TXG_SIZE]; + /* Protected by our dsl_dir's dd_lock */ list_t ds_prop_cbs; @@ -235,6 +255,7 @@ int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, void dsl_dataset_disown(dsl_dataset_t *ds, void *tag); void dsl_dataset_name(dsl_dataset_t *ds, char *name); boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag); +boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds); uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, @@ -315,6 +336,8 @@ int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, zprop_source_t source, uint64_t value, dmu_tx_t *tx); void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx); +boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds); +boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds); int dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result); void dsl_dataset_deactivate_feature(uint64_t dsobj, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h index f9eca2707..20bf54553 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -79,14 +79,15 @@ typedef enum drr_headertype { * Feature flags for zfs send streams (flags in drr_versioninfo) */ -#define DMU_BACKUP_FEATURE_DEDUP (1<<0) -#define DMU_BACKUP_FEATURE_DEDUPPROPS (1<<1) -#define DMU_BACKUP_FEATURE_SA_SPILL (1<<2) +#define DMU_BACKUP_FEATURE_DEDUP (1 << 0) +#define DMU_BACKUP_FEATURE_DEDUPPROPS (1 << 1) +#define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2) /* flags #3 - #15 are reserved for incompatible closed-source implementations */ -#define DMU_BACKUP_FEATURE_EMBED_DATA (1<<16) -#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1<<17) +#define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16) +#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1 << 17) /* flag #18 is reserved for a Delphix feature */ -#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1<<19) +#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19) +#define DMU_BACKUP_FEATURE_RESUMING (1 << 20) /* * Mask of all supported backup features @@ -94,11 +95,16 @@ typedef enum drr_headertype { #define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \ + DMU_BACKUP_FEATURE_RESUMING | \ DMU_BACKUP_FEATURE_LARGE_BLOCKS) /* Are all features in the given flag word currently supported? */ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) +typedef enum dmu_send_resume_token_version { + ZFS_SEND_RESUME_TOKEN_VERSION = 1 +} dmu_send_resume_token_version_t; + /* * The drr_versioninfo field of the dmu_replay_record has the * following layout: @@ -358,14 +364,14 @@ typedef struct zfs_cmd { zfs_share_t zc_share; uint64_t zc_jailid; dmu_objset_stats_t zc_objset_stats; - struct drr_begin zc_begin_record; + dmu_replay_record_t zc_begin_record; zinject_record_t zc_inject_record; uint32_t zc_defer_destroy; uint32_t zc_flags; uint64_t zc_action_handle; int zc_cleanup_fd; uint8_t zc_simple; - uint8_t zc_pad[3]; /* alignment */ + boolean_t zc_resumable; uint64_t zc_sendobj; uint64_t zc_fromobj; uint64_t zc_createtxg; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index 42c02af81..012196903 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -4155,6 +4155,7 @@ static boolean_t zfs_ioc_recv_inject_err; * zc_guid force flag * zc_cleanup_fd cleanup-on-exit file descriptor * zc_action_handle handle for this guid/ds mapping (or zero on first call) + * zc_resumable if data is incomplete assume sender will resume * * outputs: * zc_cookie number of bytes read @@ -4206,13 +4207,13 @@ zfs_ioc_recv(zfs_cmd_t *zc) return (SET_ERROR(EBADF)); } - VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); + errors = fnvlist_alloc(); if (zc->zc_string[0]) origin = zc->zc_string; error = dmu_recv_begin(tofs, tosnap, - &zc->zc_begin_record, force, origin, &drc); + &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc); if (error != 0) goto out; @@ -5431,6 +5432,8 @@ zfs_ioc_unjail(zfs_cmd_t *zc) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted + * (optional) "resume_object" and "resume_offset" -> (uint64) + * if present, resume send stream from specified object and offset. * } * * outnvl is unused @@ -5447,6 +5450,8 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) int fd; boolean_t largeblockok; boolean_t embedok; + uint64_t resumeobj = 0; + uint64_t resumeoff = 0; error = nvlist_lookup_int32(innvl, "fd", &fd); if (error != 0) @@ -5457,6 +5462,9 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); + (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); + (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); + #ifdef illumos file_t *fp = getf(fd); #else @@ -5466,11 +5474,11 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) return (SET_ERROR(EBADF)); off = fp->f_offset; - error = dmu_send(snapname, fromname, embedok, largeblockok, + error = dmu_send(snapname, fromname, embedok, largeblockok, fd, #ifdef illumos - fd, fp->f_vnode, &off); + resumeobj, resumeoff, fp->f_vnode, &off); #else - fd, fp, &off); + resumeobj, resumeoff, fp, &off); #endif #ifdef illumos @@ -6099,6 +6107,14 @@ zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, goto out; } break; + case ZFS_IOCVER_EDBP: + if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_edbp_t)) { + error = SET_ERROR(EFAULT); + goto out; + } + compat = B_TRUE; + cflag = ZFS_CMD_COMPAT_EDBP; + break; case ZFS_IOCVER_ZCMD: if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) || zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h index 16d528e02..809ce9bc5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h @@ -152,6 +152,7 @@ typedef enum { ZFS_PROP_SNAPSHOT_COUNT, ZFS_PROP_REDUNDANT_METADATA, ZFS_PROP_PREV_SNAP, + ZFS_PROP_RECEIVE_RESUME_TOKEN, ZFS_NUM_PROPS } zfs_prop_t; -- 2.45.0