From 6e8cf419295d738789d1427943bd5317302e34ce Mon Sep 17 00:00:00 2001 From: delphij Date: Tue, 15 Jul 2014 04:53:34 +0000 Subject: [PATCH] MFC r268075: MFV r267565: 4757 ZFS embedded-data block pointers ("zero block compression") 4913 zfs release should not be subject to space checks git-svn-id: svn://svn.freebsd.org/base/stable/10@268649 ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f --- cddl/contrib/opensolaris/cmd/zdb/zdb.c | 65 ++++- cddl/contrib/opensolaris/cmd/zfs/zfs.8 | 54 ++++- cddl/contrib/opensolaris/cmd/zfs/zfs_main.c | 17 +- .../opensolaris/cmd/zpool/zpool-features.7 | 30 ++- .../opensolaris/cmd/zstreamdump/zstreamdump.c | 55 ++++- cddl/contrib/opensolaris/cmd/ztest/ztest.c | 48 ++-- .../opensolaris/lib/libzfs/common/libzfs.h | 6 +- .../lib/libzfs/common/libzfs_sendrecv.c | 46 +++- .../lib/libzfs_core/common/libzfs_core.c | 12 +- .../lib/libzfs_core/common/libzfs_core.h | 6 +- cddl/sbin/zpool/Makefile | 1 + cddl/usr.bin/zinject/Makefile | 1 + cddl/usr.sbin/zdb/Makefile | 1 + cddl/usr.sbin/zhack/Makefile | 1 + sys/boot/zfs/zfsimpl.c | 29 +++ sys/cddl/boot/zfs/README | 7 +- sys/cddl/boot/zfs/blkptr.c | 73 ++++++ sys/cddl/boot/zfs/zfsimpl.h | 127 +++++++++- sys/cddl/boot/zfs/zfssubr.c | 10 +- .../opensolaris/common/zfs/zfeature_common.c | 5 + .../opensolaris/common/zfs/zfeature_common.h | 7 +- .../opensolaris/common/zfs/zfs_ioctl_compat.c | 123 +++++++++- .../opensolaris/common/zfs/zfs_ioctl_compat.h | 48 +++- .../opensolaris/uts/common/Makefile.files | 1 + .../opensolaris/uts/common/fs/zfs/arc.c | 93 ++++--- .../opensolaris/uts/common/fs/zfs/blkptr.c | 119 +++++++++ .../opensolaris/uts/common/fs/zfs/bpobj.c | 41 +++- .../opensolaris/uts/common/fs/zfs/dbuf.c | 61 ++++- .../opensolaris/uts/common/fs/zfs/dmu.c | 85 +++++-- .../uts/common/fs/zfs/dmu_objset.c | 32 ++- .../opensolaris/uts/common/fs/zfs/dmu_send.c | 228 +++++++++++++++--- .../uts/common/fs/zfs/dmu_traverse.c | 2 +- .../opensolaris/uts/common/fs/zfs/dnode.c | 4 +- .../uts/common/fs/zfs/dnode_sync.c | 8 
+- .../uts/common/fs/zfs/dsl_dataset.c | 2 +- .../uts/common/fs/zfs/dsl_destroy.c | 5 +- .../opensolaris/uts/common/fs/zfs/dsl_scan.c | 7 + .../uts/common/fs/zfs/dsl_userhold.c | 3 +- .../opensolaris/uts/common/fs/zfs/spa.c | 12 +- .../opensolaris/uts/common/fs/zfs/spa_misc.c | 9 +- .../uts/common/fs/zfs/sys/blkptr.h | 38 +++ .../opensolaris/uts/common/fs/zfs/sys/dbuf.h | 3 + .../opensolaris/uts/common/fs/zfs/sys/dmu.h | 13 + .../uts/common/fs/zfs/sys/dmu_impl.h | 3 + .../uts/common/fs/zfs/sys/dmu_send.h | 10 +- .../opensolaris/uts/common/fs/zfs/sys/spa.h | 179 ++++++++++++-- .../uts/common/fs/zfs/sys/spa_impl.h | 1 + .../uts/common/fs/zfs/sys/zfs_ioctl.h | 33 ++- .../opensolaris/uts/common/fs/zfs/sys/zio.h | 12 + .../opensolaris/uts/common/fs/zfs/zfs_ioctl.c | 27 ++- .../opensolaris/uts/common/fs/zfs/zil.c | 9 +- .../opensolaris/uts/common/fs/zfs/zio.c | 84 ++++++- .../uts/common/fs/zfs/zio_compress.c | 23 +- .../opensolaris/uts/common/fs/zfs/zvol.c | 2 + sys/conf/files | 1 + 55 files changed, 1630 insertions(+), 292 deletions(-) create mode 100644 sys/cddl/boot/zfs/blkptr.c create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c index ac4ea040f..634663590 100644 --- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c +++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c @@ -1059,8 +1059,17 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) return; } - blkbuf[0] = '\0'; + if (BP_IS_EMBEDDED(bp)) { + (void) sprintf(blkbuf, + "EMBEDDED et=%u %llxL/%llxP B=%llu", + (int)BPE_GET_ETYPE(bp), + (u_longlong_t)BPE_GET_LSIZE(bp), + (u_longlong_t)BPE_GET_PSIZE(bp), + (u_longlong_t)bp->blk_birth); + return; + } + blkbuf[0] = '\0'; for (int i = 0; i < ndvas; i++) (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llu:%llx:%llx ", @@ -1078,7 +1087,7 @@ 
snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) "%llxL/%llxP F=%llu B=%llu/%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)BP_GET_PSIZE(bp), - (u_longlong_t)bp->blk_fill, + (u_longlong_t)BP_GET_FILL(bp), (u_longlong_t)bp->blk_birth, (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); } @@ -1091,8 +1100,10 @@ print_indirect(blkptr_t *bp, const zbookmark_t *zb, char blkbuf[BP_SPRINTF_LEN]; int l; - ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); - ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); + if (!BP_IS_EMBEDDED(bp)) { + ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); + ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); + } (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); @@ -1146,10 +1157,10 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, err = visit_indirect(spa, dnp, cbp, &czb); if (err) break; - fill += cbp->blk_fill; + fill += BP_GET_FILL(cbp); } if (!err) - ASSERT3U(fill, ==, bp->blk_fill); + ASSERT3U(fill, ==, BP_GET_FILL(bp)); (void) arc_buf_remove_ref(buf, &buf); } @@ -1816,14 +1827,14 @@ dump_dir(objset_t *os) if (dds.dds_type == DMU_OST_META) { dds.dds_creation_txg = TXG_INITIAL; - usedobjs = os->os_rootbp->blk_fill; + usedobjs = BP_GET_FILL(os->os_rootbp); refdbytes = os->os_spa->spa_dsl_pool-> dp_mos_dir->dd_phys->dd_used_bytes; } else { dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); } - ASSERT3U(usedobjs, ==, os->os_rootbp->blk_fill); + ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); zdb_nicenum(refdbytes, numbuf); @@ -2134,6 +2145,9 @@ typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; + uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; + uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] + [BPE_PAYLOAD_SIZE]; uint64_t zcb_start; uint64_t zcb_lastprint; uint64_t zcb_totalasize; @@ -2188,6 +2202,13 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, } + if (BP_IS_EMBEDDED(bp)) { + 
zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; + zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] + [BPE_GET_PSIZE(bp)]++; + return; + } + if (dump_opt['L']) return; @@ -2287,7 +2308,8 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); - if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) { + if (!BP_IS_EMBEDDED(bp) && + (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); void *data = zio_data_buf_alloc(size); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; @@ -2479,7 +2501,7 @@ dump_block_stats(spa_t *spa) zdb_blkstats_t *zb, *tzb; uint64_t norm_alloc, norm_space, total_alloc, total_found; int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD; - int leaks = 0; + boolean_t leaks = B_FALSE; (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", @@ -2567,7 +2589,7 @@ dump_block_stats(spa_t *spa) (u_longlong_t)total_alloc, (dump_opt['L']) ? 
"unreachable" : "leaked", (longlong_t)(total_alloc - total_found)); - leaks = 1; + leaks = B_TRUE; } if (tzb->zb_count == 0) @@ -2599,6 +2621,23 @@ dump_block_stats(spa_t *spa) (void) printf("\tSPA allocated: %10llu used: %5.2f%%\n", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); + for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { + if (zcb.zcb_embedded_blocks[i] == 0) + continue; + (void) printf("\n"); + (void) printf("\tadditional, non-pointer bps of type %u: " + "%10llu\n", + i, (u_longlong_t)zcb.zcb_embedded_blocks[i]); + + if (dump_opt['b'] >= 3) { + (void) printf("\t number of (compressed) bytes: " + "number of bps\n"); + dump_histogram(zcb.zcb_embedded_histogram[i], + sizeof (zcb.zcb_embedded_histogram[i]) / + sizeof (zcb.zcb_embedded_histogram[i][0]), 0); + } + } + if (tzb->zb_ditto_samevdev != 0) { (void) printf("\tDittoed blocks on same vdev: %llu\n", (longlong_t)tzb->zb_ditto_samevdev); @@ -2711,14 +2750,14 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, avl_index_t where; zdb_ddt_entry_t *zdde, zdde_search; - if (BP_IS_HOLE(bp)) + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { (void) printf("traversing objset %llu, %llu objects, " "%lu blocks so far\n", (u_longlong_t)zb->zb_objset, - (u_longlong_t)bp->blk_fill, + (u_longlong_t)BP_GET_FILL(bp), avl_numnodes(t)); } diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs.8 b/cddl/contrib/opensolaris/cmd/zfs/zfs.8 index d0f21d3b1..4ae5fbee6 100644 --- a/cddl/contrib/opensolaris/cmd/zfs/zfs.8 +++ b/cddl/contrib/opensolaris/cmd/zfs/zfs.8 @@ -30,7 +30,7 @@ .\" .\" $FreeBSD$ .\" -.Dd May 27, 2014 +.Dd June 30, 2014 .Dt ZFS 8 .Os .Sh NAME @@ -179,11 +179,12 @@ .Ar bookmark .Nm .Cm send -.Op Fl DnPpRv +.Op Fl DnPpRve .Op Fl i Ar snapshot | Fl I Ar snapshot .Ar snapshot .Nm .Cm send +.Op Fl e .Op Fl i Ar snapshot Ns | Ns bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm @@ -2476,7 +2477,7 @@ 
feature. .It Xo .Nm .Cm send -.Op Fl DnPpRv +.Op Fl DnPpRve .Op Fl i Ar snapshot | Fl I Ar snapshot .Ar snapshot .Xc @@ -2548,6 +2549,29 @@ be used regardless of the dataset's property, but performance will be much better if the filesystem uses a dedup-capable checksum (eg. .Sy sha256 ) . +.It Fl e +Generate a more compact stream by using WRITE_EMBEDDED records for blocks +which are stored more compactly on disk by the +.Sy embedded_data +pool +feature. +This flag has no effect if the +.Sy embedded_data +feature is +disabled. +The receiving system must have the +.Sy embedded_data +feature +enabled. +If the +.Sy lz4_compress +feature is active on the sending system, +then the receiving system must have that feature enabled as well. +See +.Xr zpool-features 7 +for details on ZFS feature flags and the +.Sy embedded_data +feature. .It Fl p Include the dataset's properties in the stream. This flag is implicit when .Fl R @@ -2572,6 +2596,7 @@ on future versions of .It Xo .Nm .Cm send +.Op Fl e .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc @@ -2597,6 +2622,29 @@ specified as the last component of the name If the incremental target is a clone, the incremental source can be the origin snapshot, or an earlier snapshot in the origin's filesystem, or the origin's origin, etc. +.It Fl e +Generate a more compact stream by using WRITE_EMBEDDED records for blocks +which are stored more compactly on disk by the +.Sy embedded_data +pool +feature. +This flag has no effect if the +.Sy embedded_data +feature is +disabled. +The receiving system must have the +.Sy embedded_data +feature +enabled. +If the +.Sy lz4_compress +feature is active on the sending system, +then the receiving system must have that feature enabled as well. +See +.Xr zpool-features 7 +for details on ZFS feature flags and the +.Sy embedded_data +feature. 
.El .It Xo .Nm diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c b/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c index 4464b6ad9..1049c394b 100644 --- a/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c +++ b/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c @@ -274,9 +274,9 @@ get_usage(zfs_help_t idx) case HELP_ROLLBACK: return (gettext("\trollback [-rRf] \n")); case HELP_SEND: - return (gettext("\tsend [-DnPpRv] [-[iI] snapshot] " + return (gettext("\tsend [-DnPpRve] [-[iI] snapshot] " "\n" - "\tsend [-i snapshot|bookmark] " + "\tsend [-e] [-i snapshot|bookmark] " "\n")); case HELP_SET: return (gettext("\tset " @@ -590,6 +590,7 @@ finish_progress(char *done) free(pt_header); pt_header = NULL; } + /* * zfs clone [-p] [-o prop=value] ... * @@ -3368,6 +3369,7 @@ rollback_check_dependent(zfs_handle_t *zhp, void *data) zfs_close(zhp); return (0); } + /* * Report any snapshots more recent than the one specified. Used when '-r' is * not specified. We reuse this same callback for the snapshot dependents - if @@ -3707,7 +3709,7 @@ zfs_do_send(int argc, char **argv) boolean_t extraverbose = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, ":i:I:RDpvnP")) != -1) { + while ((c = getopt(argc, argv, ":i:I:RDpvnPe")) != -1) { switch (c) { case 'i': if (fromname) @@ -3742,6 +3744,9 @@ zfs_do_send(int argc, char **argv) case 'n': flags.dryrun = B_TRUE; break; + case 'e': + flags.embed_data = B_TRUE; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); @@ -3780,6 +3785,7 @@ zfs_do_send(int argc, char **argv) if (strchr(argv[0], '@') == NULL || (fromname && strchr(fromname, '#') != NULL)) { char frombuf[ZFS_MAXNAMELEN]; + enum lzc_send_flags lzc_flags = 0; if (flags.replicate || flags.doall || flags.props || flags.dedup || flags.dryrun || flags.verbose || @@ -3794,6 +3800,9 @@ zfs_do_send(int argc, char **argv) if (zhp == NULL) return (1); + if (flags.embed_data) + lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; + if (fromname != NULL 
&& (fromname[0] == '#' || fromname[0] == '@')) { /* @@ -3807,7 +3816,7 @@ zfs_do_send(int argc, char **argv) (void) strlcat(frombuf, fromname, sizeof (frombuf)); fromname = frombuf; } - err = zfs_send_one(zhp, fromname, STDOUT_FILENO); + err = zfs_send_one(zhp, fromname, STDOUT_FILENO, lzc_flags); zfs_close(zhp); return (err != 0); } diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7 b/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7 index f9d866b37..e2caa647c 100644 --- a/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7 +++ b/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7 @@ -23,7 +23,7 @@ .\" .\" $FreeBSD$ .\" -.Dd April 23, 2014 +.Dd June 30, 2014 .Dt ZPOOL-FEATURES 7 .Os .Sh NAME @@ -396,6 +396,34 @@ This feature becomes as soon as it is enabled and will never return to being .Sy enabled . +.It Sy embedded_data +.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:embedded_data" +.It GUID Ta com.delphix:embedded_data +.It READ\-ONLY COMPATIBLE Ta no +.It DEPENDENCIES Ta none +.El +.Pp +This feature improves the performance and compression ratio of +highly-compressible blocks. +Blocks whose contents can compress to 112 bytes +or smaller can take advantage of this feature. +.Pp +When this feature is enabled, the contents of highly-compressible blocks are +stored in the block "pointer" itself +.Po a misnomer in this case, as it contains +the compressed data, rather than a pointer to its location on disk +.Pc . +Thus +the space of the block +.Pq one sector, typically 512 bytes or 4KB +is saved, +and no additional i/o is needed to read and write the data block. +.Pp +This feature becomes +.Sy active +as soon as it is enabled and will +never return to being +.Sy enabled . 
.El .Sh SEE ALSO .Xr zpool 8 diff --git a/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c b/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c index f7a409162..dce1cb3d7 100644 --- a/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c +++ b/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c @@ -49,7 +49,6 @@ */ #define DUMP_GROUPING 4 -uint64_t drr_record_count[DRR_NUMTYPES]; uint64_t total_write_size = 0; uint64_t total_stream_len = 0; FILE *send_stream = 0; @@ -123,7 +122,7 @@ print_block(char *buf, int length) * Start printing ASCII characters at a constant offset, after * the hex prints. Leave 3 characters per byte on a line (2 digit * hex number plus 1 space) plus spaces between characters and - * groupings + * groupings. */ int ascii_start = BYTES_PER_LINE * 3 + BYTES_PER_LINE / DUMP_GROUPING + 2; @@ -160,6 +159,8 @@ int main(int argc, char *argv[]) { char *buf = malloc(INITIAL_BUFLEN); + uint64_t drr_record_count[DRR_NUMTYPES] = { 0 }; + uint64_t total_records = 0; dmu_replay_record_t thedrr; dmu_replay_record_t *drr = &thedrr; struct drr_begin *drrb = &thedrr.drr_u.drr_begin; @@ -170,6 +171,7 @@ main(int argc, char *argv[]) struct drr_write_byref *drrwbr = &thedrr.drr_u.drr_write_byref; struct drr_free *drrf = &thedrr.drr_u.drr_free; struct drr_spill *drrs = &thedrr.drr_u.drr_spill; + struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded; char c; boolean_t verbose = B_FALSE; boolean_t first = B_TRUE; @@ -264,6 +266,7 @@ main(int argc, char *argv[]) } drr_record_count[drr->drr_type]++; + total_records++; switch (drr->drr_type) { case DRR_BEGIN: @@ -376,8 +379,8 @@ main(int argc, char *argv[]) drro->drr_bonuslen); } if (drro->drr_bonuslen > 0) { - (void) ssread(buf, P2ROUNDUP(drro->drr_bonuslen, - 8), &zc); + (void) ssread(buf, + P2ROUNDUP(drro->drr_bonuslen, 8), &zc); if (dump) { print_block(buf, P2ROUNDUP(drro->drr_bonuslen, 8)); @@ -506,6 +509,38 @@ main(int argc, char *argv[]) print_block(buf, drrs->drr_length); } break; + 
case DRR_WRITE_EMBEDDED: + if (do_byteswap) { + drrwe->drr_object = + BSWAP_64(drrwe->drr_object); + drrwe->drr_offset = + BSWAP_64(drrwe->drr_offset); + drrwe->drr_length = + BSWAP_64(drrwe->drr_length); + drrwe->drr_toguid = + BSWAP_64(drrwe->drr_toguid); + drrwe->drr_lsize = + BSWAP_32(drrwe->drr_lsize); + drrwe->drr_psize = + BSWAP_32(drrwe->drr_psize); + } + if (verbose) { + (void) printf("WRITE_EMBEDDED object = %llu " + "offset = %llu length = %llu\n" + "toguid = %llx comp = %u etype = %u " + "lsize = %u psize = %u\n", + (u_longlong_t)drrwe->drr_object, + (u_longlong_t)drrwe->drr_offset, + (u_longlong_t)drrwe->drr_length, + (u_longlong_t)drrwe->drr_toguid, + drrwe->drr_compression, + drrwe->drr_etype, + drrwe->drr_lsize, + drrwe->drr_psize); + } + (void) ssread(buf, + P2ROUNDUP(drrwe->drr_psize, 8), &zc); + break; } pcksum = zc; } @@ -524,18 +559,16 @@ main(int argc, char *argv[]) (u_longlong_t)drr_record_count[DRR_FREEOBJECTS]); (void) printf("\tTotal DRR_WRITE records = %lld\n", (u_longlong_t)drr_record_count[DRR_WRITE]); + (void) printf("\tTotal DRR_WRITE_BYREF records = %lld\n", + (u_longlong_t)drr_record_count[DRR_WRITE_BYREF]); + (void) printf("\tTotal DRR_WRITE_EMBEDDED records = %lld\n", + (u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED]); (void) printf("\tTotal DRR_FREE records = %lld\n", (u_longlong_t)drr_record_count[DRR_FREE]); (void) printf("\tTotal DRR_SPILL records = %lld\n", (u_longlong_t)drr_record_count[DRR_SPILL]); (void) printf("\tTotal records = %lld\n", - (u_longlong_t)(drr_record_count[DRR_BEGIN] + - drr_record_count[DRR_OBJECT] + - drr_record_count[DRR_FREEOBJECTS] + - drr_record_count[DRR_WRITE] + - drr_record_count[DRR_FREE] + - drr_record_count[DRR_SPILL] + - drr_record_count[DRR_END])); + (u_longlong_t)total_records); (void) printf("\tTotal write size = %lld (0x%llx)\n", (u_longlong_t)total_write_size, (u_longlong_t)total_write_size); (void) printf("\tTotal stream length = %lld (0x%llx)\n", diff --git 
a/cddl/contrib/opensolaris/cmd/ztest/ztest.c b/cddl/contrib/opensolaris/cmd/ztest/ztest.c index 6e028c3f5..c0ece9848 100644 --- a/cddl/contrib/opensolaris/cmd/ztest/ztest.c +++ b/cddl/contrib/opensolaris/cmd/ztest/ztest.c @@ -53,7 +53,7 @@ * At random times, the child self-immolates with a SIGKILL. * This is the software equivalent of pulling the power cord. * The parent then runs the test again, using the existing - * storage pool, as many times as desired. If backwards compatability + * storage pool, as many times as desired. If backwards compatibility * testing is enabled ztest will sometimes run the "older" version * of ztest after a SIGKILL. * @@ -1267,13 +1267,13 @@ static void ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) { - ASSERT(bt->bt_magic == BT_MAGIC); - ASSERT(bt->bt_objset == dmu_objset_id(os)); - ASSERT(bt->bt_object == object); - ASSERT(bt->bt_offset == offset); - ASSERT(bt->bt_gen <= gen); - ASSERT(bt->bt_txg <= txg); - ASSERT(bt->bt_crtxg == crtxg); + ASSERT3U(bt->bt_magic, ==, BT_MAGIC); + ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); + ASSERT3U(bt->bt_object, ==, object); + ASSERT3U(bt->bt_offset, ==, offset); + ASSERT3U(bt->bt_gen, <=, gen); + ASSERT3U(bt->bt_txg, <=, txg); + ASSERT3U(bt->bt_crtxg, ==, crtxg); } static ztest_block_tag_t * @@ -3472,6 +3472,11 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) if (error) fatal(0, "dmu_objset_own(%s) = %d", snap2name, error); error = dsl_dataset_promote(clone2name, NULL); + if (error == ENOSPC) { + dmu_objset_disown(os, FTAG); + ztest_record_enospc(FTAG); + goto out; + } if (error != EBUSY) fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, error); @@ -3627,11 +3632,19 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) return; } - dmu_object_set_checksum(os, bigobj, - (enum zio_checksum)ztest_random_dsl_prop(ZFS_PROP_CHECKSUM), tx); + enum zio_checksum cksum; + do { + cksum = (enum 
zio_checksum) + ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); + } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); + dmu_object_set_checksum(os, bigobj, cksum, tx); - dmu_object_set_compress(os, bigobj, - (enum zio_compress)ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), tx); + enum zio_compress comp; + do { + comp = (enum zio_compress) + ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); + } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); + dmu_object_set_compress(os, bigobj, comp, tx); /* * For each index from n to n + s, verify that the existing bufwad @@ -4711,8 +4724,13 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) error = dsl_dataset_user_hold(holds, 0, NULL); fnvlist_free(holds); - if (error) - fatal(0, "dsl_dataset_user_hold(%s)", fullname, tag); + if (error == ENOSPC) { + ztest_record_enospc("dsl_dataset_user_hold"); + goto out; + } else if (error) { + fatal(0, "dsl_dataset_user_hold(%s, %s) = %u", + fullname, tag, error); + } error = dsl_destroy_snapshot(fullname, B_FALSE); if (error != EBUSY) { @@ -5165,7 +5183,7 @@ ztest_run_zdb(char *pool) isa = strdup(isa); /* LINTED */ (void) sprintf(bin, - "/usr/sbin%.*s/zdb -bcc%s%s -U %s %s", + "/usr/sbin%.*s/zdb -bcc%s%s -d -U %s %s", isalen, isa, ztest_opts.zo_verbose >= 3 ? "s" : "", diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h index 24131d68e..ef18b457e 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h @@ -42,6 +42,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -607,13 +608,16 @@ typedef struct sendflags { /* show progress (ie. 
-v) */ boolean_t progress; + + /* WRITE_EMBEDDED records of type DATA are permitted */ + boolean_t embed_data; } sendflags_t; typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); extern int zfs_send(zfs_handle_t *, const char *, const char *, sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **); -extern int zfs_send_one(zfs_handle_t *, const char *, int); +extern int zfs_send_one(zfs_handle_t *, const char *, int, enum lzc_send_flags); extern int zfs_promote(zfs_handle_t *); extern int zfs_hold(zfs_handle_t *, const char *, const char *, diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c index feddb694c..97f18d7bb 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . * All rights reserved. 
@@ -45,6 +45,7 @@ #include #include +#include #include "zfs_namecheck.h" #include "zfs_prop.h" @@ -222,6 +223,7 @@ cksummer(void *arg) struct drr_object *drro = &thedrr.drr_u.drr_object; struct drr_write *drrw = &thedrr.drr_u.drr_write; struct drr_spill *drrs = &thedrr.drr_u.drr_spill; + struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded; FILE *ofp; int outfd; dmu_replay_record_t wbr_drr = {0}; @@ -418,6 +420,20 @@ cksummer(void *arg) break; } + case DRR_WRITE_EMBEDDED: + { + if (cksum_and_write(drr, sizeof (dmu_replay_record_t), + &stream_cksum, outfd) == -1) + goto out; + (void) ssread(buf, + P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), ofp); + if (cksum_and_write(buf, + P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), + &stream_cksum, outfd) == -1) + goto out; + break; + } + case DRR_FREE: { if (cksum_and_write(drr, sizeof (dmu_replay_record_t), @@ -799,7 +815,7 @@ typedef struct send_dump_data { char prevsnap[ZFS_MAXNAMELEN]; uint64_t prevsnap_obj; boolean_t seenfrom, seento, replicate, doall, fromorigin; - boolean_t verbose, dryrun, parsable, progress; + boolean_t verbose, dryrun, parsable, progress, embed_data; int outfd; boolean_t err; nvlist_t *fss; @@ -878,7 +894,8 @@ estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj, */ static int dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, - boolean_t fromorigin, int outfd, nvlist_t *debugnv) + boolean_t fromorigin, int outfd, enum lzc_send_flags flags, + nvlist_t *debugnv) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zfs_hdl; @@ -892,6 +909,7 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, zc.zc_obj = fromorigin; zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zc.zc_fromobj = fromsnap_obj; + zc.zc_flags = flags; VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0)); if (fromsnap && fromsnap[0] != '\0') { @@ -1144,8 +1162,12 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) } } + enum lzc_send_flags flags = 0; + if (sdd->embed_data) 
+ flags |= LZC_SEND_FLAG_EMBED_DATA; + err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, - fromorigin, sdd->outfd, sdd->debugnv); + fromorigin, sdd->outfd, flags, sdd->debugnv); if (sdd->progress) { (void) pthread_cancel(tid); @@ -1489,6 +1511,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sdd.parsable = flags->parsable; sdd.progress = flags->progress; sdd.dryrun = flags->dryrun; + sdd.embed_data = flags->embed_data; sdd.filter_cb = filter_func; sdd.filter_cb_arg = cb_arg; if (debugnvp) @@ -1620,7 +1643,8 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, } int -zfs_send_one(zfs_handle_t *zhp, const char *from, int fd) +zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, + enum lzc_send_flags flags) { int err; libzfs_handle_t *hdl = zhp->zfs_hdl; @@ -1629,7 +1653,7 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd) (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); - err = lzc_send(zhp->zfs_name, from, fd); + err = lzc_send(zhp->zfs_name, from, fd, flags); if (err != 0) { switch (errno) { case EXDEV: @@ -2576,6 +2600,16 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) (void) recv_read(hdl, fd, buf, drr->drr_u.drr_spill.drr_length, B_FALSE, NULL); break; + case DRR_WRITE_EMBEDDED: + if (byteswap) { + drr->drr_u.drr_write_embedded.drr_psize = + BSWAP_32(drr->drr_u.drr_write_embedded. 
+ drr_psize); + } + (void) recv_read(hdl, fd, buf, + P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize, + 8), B_FALSE, NULL); + break; case DRR_WRITE_BYREF: case DRR_FREEOBJECTS: case DRR_FREE: diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c index 1c8722337..cb38dc2d1 100644 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c @@ -486,6 +486,8 @@ lzc_get_holds(const char *snapname, nvlist_t **holdsp) } /* + * Generate a zfs send stream for the specified snapshot and write it to + * the specified file descriptor. * * "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap") * @@ -499,9 +501,15 @@ lzc_get_holds(const char *snapname, nvlist_t **holdsp) * snapshot in the origin, etc. * * "fd" is the file descriptor to write the send stream to. + * + * If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted + * to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA, + * which the receiving system must support (as indicated by support + * for the "embedded_data" feature). 
*/ int -lzc_send(const char *snapname, const char *from, int fd) +lzc_send(const char *snapname, const char *from, int fd, + enum lzc_send_flags flags) { nvlist_t *args; int err; @@ -510,6 +518,8 @@ lzc_send(const char *snapname, const char *from, int fd) fnvlist_add_int32(args, "fd", fd); if (from != NULL) fnvlist_add_string(args, "fromsnap", from); + if (flags & LZC_SEND_FLAG_EMBED_DATA) + fnvlist_add_boolean(args, "embedok"); err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL); nvlist_free(args); return (err); diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h index 380560ff3..99883fecc 100644 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h @@ -53,7 +53,11 @@ int lzc_hold(nvlist_t *, int, nvlist_t **); int lzc_release(nvlist_t *, nvlist_t **); int lzc_get_holds(const char *, nvlist_t **); -int lzc_send(const char *, const char *, int); +enum lzc_send_flags { + LZC_SEND_FLAG_EMBED_DATA = 1 << 0 +}; + +int lzc_send(const char *, const char *, int, enum lzc_send_flags); int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, int); int lzc_send_space(const char *, const char *, uint64_t *); diff --git a/cddl/sbin/zpool/Makefile b/cddl/sbin/zpool/Makefile index 1884d249d..bfcb01741 100644 --- a/cddl/sbin/zpool/Makefile +++ b/cddl/sbin/zpool/Makefile @@ -18,6 +18,7 @@ CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/head CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libuutil/common CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libumem/common CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzfs/common +CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzfs_core/common CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libnvpair CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/common/zfs CFLAGS+= 
-I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common diff --git a/cddl/usr.bin/zinject/Makefile b/cddl/usr.bin/zinject/Makefile index 56251490b..20d49323d 100644 --- a/cddl/usr.bin/zinject/Makefile +++ b/cddl/usr.bin/zinject/Makefile @@ -11,6 +11,7 @@ CFLAGS+= -I${.CURDIR}/../../../sys/cddl/compat/opensolaris CFLAGS+= -I${.CURDIR}/../../compat/opensolaris/include CFLAGS+= -I${.CURDIR}/../../compat/opensolaris/lib/libumem CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/lib/libzfs/common +CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/lib/libzfs_core/common CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/lib/libzpool/common CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/lib/libnvpair CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/fs/zfs diff --git a/cddl/usr.sbin/zdb/Makefile b/cddl/usr.sbin/zdb/Makefile index fa9ab98f5..0e2ec8f82 100644 --- a/cddl/usr.sbin/zdb/Makefile +++ b/cddl/usr.sbin/zdb/Makefile @@ -15,6 +15,7 @@ CFLAGS+= -I${.CURDIR}/../../../cddl/compat/opensolaris/lib/libumem CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libnvpair CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libuutil/common CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzfs/common +CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzfs_core/common CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzpool/common CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/fs/zfs CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common diff --git a/cddl/usr.sbin/zhack/Makefile b/cddl/usr.sbin/zhack/Makefile index f09d2d827..b8ad1b4b1 100644 --- a/cddl/usr.sbin/zhack/Makefile +++ b/cddl/usr.sbin/zhack/Makefile @@ -14,6 +14,7 @@ CFLAGS+= -I${.CURDIR}/../../../cddl/compat/opensolaris/lib/libumem CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libnvpair CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libuutil/common CFLAGS+= 
-I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzfs/common +CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzfs_core/common CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzpool/common CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/fs/zfs CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common diff --git a/sys/boot/zfs/zfsimpl.c b/sys/boot/zfs/zfsimpl.c index b240b9662..16b57c7a6 100644 --- a/sys/boot/zfs/zfsimpl.c +++ b/sys/boot/zfs/zfsimpl.c @@ -56,6 +56,7 @@ static const char *features_for_read[] = { "org.illumos:lz4_compress", "com.delphix:hole_birth", "com.delphix:extensible_dataset", + "com.delphix:embedded_data", NULL }; @@ -1133,6 +1134,34 @@ zio_read(const spa_t *spa, const blkptr_t *bp, void *buf) void *pbuf; int i, error; + /* + * Process data embedded in block pointer + */ + if (BP_IS_EMBEDDED(bp)) { + ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); + + size = BPE_GET_PSIZE(bp); + ASSERT(size <= BPE_PAYLOAD_SIZE); + + if (cpfunc != ZIO_COMPRESS_OFF) + pbuf = zfs_alloc(size); + else + pbuf = buf; + + decode_embedded_bp_compressed(bp, pbuf); + error = 0; + + if (cpfunc != ZIO_COMPRESS_OFF) { + error = zio_decompress_data(cpfunc, pbuf, + size, buf, BP_GET_LSIZE(bp)); + zfs_free(pbuf, size); + } + if (error != 0) + printf("ZFS: i/o error - unable to decompress block pointer data, error %d\n", + error); + return (error); + } + error = EIO; for (i = 0; i < SPA_DVAS_PER_BP; i++) { diff --git a/sys/cddl/boot/zfs/README b/sys/cddl/boot/zfs/README index f7c045dab..5f16d5cdd 100644 --- a/sys/cddl/boot/zfs/README +++ b/sys/cddl/boot/zfs/README @@ -7,9 +7,10 @@ are used by the ZFS bootstrap: sha256.c checksum support lz4.c compression support lzjb.c compression support + blkptr.c ZFS embedded-data block pointers support zfssubr.c checksum, compression and raidz support zfsimpl.h mostly describing the physical layout -The files fletcher.c, lzjb.c and sha256.c are largely identical to the 
-ZFS base code (with write support removed) and could be shared but -that might complicate future imports from OpenSolaris. +The files fletcher.c, lzjb.c, lz4.c, sha256.c and blkptr.c are largely identical +to the ZFS base code (with write support removed) and could be shared but that +might complicate future imports from Illumos. diff --git a/sys/cddl/boot/zfs/blkptr.c b/sys/cddl/boot/zfs/blkptr.c new file mode 100644 index 000000000..c36c59bbe --- /dev/null +++ b/sys/cddl/boot/zfs/blkptr.c @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +/* + * Embedded-data Block Pointers + * + * Normally, block pointers point (via their DVAs) to a block which holds data. + * If the data that we need to store is very small, this is an inefficient + * use of space, because a block must be at minimum 1 sector (typically 512 + * bytes or 4KB). Additionally, reading these small blocks tends to generate + * more random reads. + * + * Embedded-data Block Pointers allow small pieces of data (the "payload", + * up to 112 bytes) to be stored in the block pointer itself, instead of + * being pointed to. The "Pointer" part of this name is a bit of a + * misnomer, as nothing is pointed to. + * + * BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to + * be embedded in the block pointer. The logic for this is handled in + * the SPA, by the zio pipeline. Therefore most code outside the zio + * pipeline doesn't need special-cases to handle these block pointers. 
+ * + * See spa.h for details on the exact layout of embedded block pointers. + */ + +/* + * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be + * more than BPE_PAYLOAD_SIZE bytes). + */ +void +decode_embedded_bp_compressed(const blkptr_t *bp, void *buf) +{ + int psize; + uint8_t *buf8 = buf; + uint64_t w = 0; + const uint64_t *bp64 = (const uint64_t *)bp; + + ASSERT(BP_IS_EMBEDDED(bp)); + + psize = BPE_GET_PSIZE(bp); + + /* + * Decode the words of the block pointer into the byte array. + * Low bits of first word are the first byte (little endian). + */ + for (int i = 0; i < psize; i++) { + if (i % sizeof (w) == 0) { + /* beginning of a word */ + ASSERT3P(bp64, <, bp + 1); + w = *bp64; + bp64++; + if (!BPE_IS_PAYLOADWORD(bp, bp64)) + bp64++; + } + buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY); + } +} diff --git a/sys/cddl/boot/zfs/zfsimpl.h b/sys/cddl/boot/zfs/zfsimpl.h index 7bc4c433d..98f54791a 100644 --- a/sys/cddl/boot/zfs/zfsimpl.h +++ b/sys/cddl/boot/zfs/zfsimpl.h @@ -55,9 +55,14 @@ /* * Copyright 2013 by Saso Kiselkov. All rights reserved. */ +/* + * Copyright (c) 2013 by Delphix. All rights reserved. 
+ */ #define MAXNAMELEN 256 +#define _NOTE(s) + /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ @@ -163,7 +168,7 @@ typedef struct zio_cksum { * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE | + * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -197,7 +202,8 @@ typedef struct zio_cksum { * G gang block indicator * B byteorder (endianness) * D dedup - * X unused + * X encryption (on version 30, which is not supported) + * E blkptr_t contains embedded data (see below) * lvl level of indirection * type DMU object type * phys birth txg of block allocation; zero if same as logical birth txg @@ -205,6 +211,100 @@ typedef struct zio_cksum { * fill count number of non-zero blocks under this bp * checksum[4] 256-bit checksum of the data this bp describes */ + +/* + * "Embedded" blkptr_t's don't actually point to a block, instead they + * have a data payload embedded in the blkptr_t itself. See the comment + * in blkptr.c for more details. 
+ * + * The blkptr_t is laid out as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | payload | + * 1 | payload | + * 2 | payload | + * 3 | payload | + * 4 | payload | + * 5 | payload | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 7 | payload | + * 8 | payload | + * 9 | payload | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * a | logical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * b | payload | + * c | payload | + * d | payload | + * e | payload | + * f | payload | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Legend: + * + * payload contains the embedded data + * B (byteorder) byteorder (endianness) + * D (dedup) padding (set to zero) + * X encryption (set to zero; see above) + * E (embedded) set to one + * lvl indirection level + * type DMU object type + * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*) + * comp compression function of payload + * PSIZE size of payload after compression, in bytes + * LSIZE logical size of payload, in bytes + * note that 25 bits is enough to store the largest + * "normal" BP's LSIZE (2^16 * 2^9) in bytes + * log. birth transaction group in which the block was logically born + * + * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded + * bp's they are stored in units of SPA_MINBLOCKSHIFT. + * Generally, the generic BP_GET_*() macros can be used on embedded BP's. + * The B, D, X, lvl, type, and comp fields are stored the same as with normal + * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must + * be set with the BPE_SET_* macros. 
BP_SET_EMBEDDED() should be called before + * other macros, as they assert that they are only used on BP's of the correct + * "embedded-ness". + */ + +#define BPE_GET_ETYPE(bp) \ + (ASSERT(BP_IS_EMBEDDED(bp)), \ + BF64_GET((bp)->blk_prop, 40, 8)) +#define BPE_SET_ETYPE(bp, t) do { \ + ASSERT(BP_IS_EMBEDDED(bp)); \ + BF64_SET((bp)->blk_prop, 40, 8, t); \ +_NOTE(CONSTCOND) } while (0) + +#define BPE_GET_LSIZE(bp) \ + (ASSERT(BP_IS_EMBEDDED(bp)), \ + BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1)) +#define BPE_SET_LSIZE(bp, x) do { \ + ASSERT(BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +#define BPE_GET_PSIZE(bp) \ + (ASSERT(BP_IS_EMBEDDED(bp)), \ + BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)) +#define BPE_SET_PSIZE(bp, x) do { \ + ASSERT(BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +typedef enum bp_embedded_type { + BP_EMBEDDED_TYPE_DATA, + BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */ + NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED +} bp_embedded_type_t; + +#define BPE_NUM_WORDS 14 +#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) +#define BPE_IS_PAYLOADWORD(bp, wp) \ + ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) + #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ @@ -242,18 +342,22 @@ typedef struct blkptr { #define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) #define BP_GET_LSIZE(bp) \ - (BP_IS_HOLE(bp) ? 0 : \ + (BP_IS_EMBEDDED(bp) ? \ + (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? 
BPE_GET_LSIZE(bp) : 0): \ BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)) -#define BP_SET_LSIZE(bp, x) \ - BF64_SET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) +#define BP_SET_LSIZE(bp, x) do { \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, \ + 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ +_NOTE(CONSTCOND) } while (0) #define BP_GET_PSIZE(bp) \ BF64_GET_SB((bp)->blk_prop, 16, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1) #define BP_SET_PSIZE(bp, x) \ BF64_SET_SB((bp)->blk_prop, 16, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) -#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) -#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) +#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7) +#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x) #define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) #define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) @@ -264,6 +368,8 @@ typedef struct blkptr { #define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) #define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) +#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) + #define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) #define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) @@ -331,6 +437,11 @@ typedef struct blkptr { ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } +#define BPE_NUM_WORDS 14 +#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) +#define BPE_IS_PAYLOADWORD(bp, wp) \ + ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) + /* * Embedded checksum */ @@ -1341,3 +1452,5 @@ typedef struct spa { objset_phys_t spa_mos; /* MOS for this pool */ int spa_inited; /* initialized */ } spa_t; + +static void decode_embedded_bp_compressed(const blkptr_t *, void *); diff --git a/sys/cddl/boot/zfs/zfssubr.c b/sys/cddl/boot/zfs/zfssubr.c index 0f018487b..a64f065ad 100644 --- a/sys/cddl/boot/zfs/zfssubr.c +++ b/sys/cddl/boot/zfs/zfssubr.c @@ -30,9 
+30,11 @@ static uint64_t zfs_crc64_table[256]; #define ECKSUM 666 -#define ASSERT(...) do { } while (0) -#define ASSERT3U(...) do { } while (0) -#define ASSERT3S(...) do { } while (0) +#define ASSERT3S(x, y, z) ((void)0) +#define ASSERT3U(x, y, z) ((void)0) +#define ASSERT3P(x, y, z) ((void)0) +#define ASSERT0(x) ((void)0) +#define ASSERT(x) ((void)0) #define panic(...) do { \ printf(__VA_ARGS__); \ @@ -82,6 +84,8 @@ typedef struct zio_checksum_info { const char *ci_name; /* descriptive name */ } zio_checksum_info_t; +#include "blkptr.c" + #include "fletcher.c" #include "sha256.c" diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c index 5aa9d2b8c..163dffea4 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c @@ -214,4 +214,9 @@ zpool_feature_init(void) "com.joyent:filesystem_limits", "filesystem_limits", "Filesystem and snapshot limits.", B_TRUE, B_FALSE, B_FALSE, filesystem_limits_deps); + + zfeature_register(SPA_FEATURE_EMBEDDED_DATA, + "com.delphix:embedded_data", "embedded_data", + "Blocks which compress very well use even less space.", + B_FALSE, B_TRUE, B_TRUE, NULL); } diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h index 1f668a5b5..65016f1b5 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h @@ -47,6 +47,7 @@ typedef enum spa_feature { SPA_FEATURE_ENABLED_TXG, SPA_FEATURE_HOLE_BIRTH, SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_EMBEDDED_DATA, SPA_FEATURE_BOOKMARKS, SPA_FEATURE_FS_SS_LIMIT, SPA_FEATURES @@ -67,7 +68,7 @@ typedef struct zfeature_info { const spa_feature_t *fi_depends; } zfeature_info_t; -typedef int (zfeature_func_t)(zfeature_info_t *fi, void *arg); +typedef int (zfeature_func_t)(zfeature_info_t *, void *); #define 
ZFS_FEATURE_DEBUG @@ -76,8 +77,8 @@ extern zfeature_info_t spa_feature_table[SPA_FEATURES]; extern boolean_t zfeature_is_valid_guid(const char *); extern boolean_t zfeature_is_supported(const char *); -extern int zfeature_lookup_name(const char *name, spa_feature_t *res); -extern boolean_t zfeature_depends_on(spa_feature_t fid, spa_feature_t check); +extern int zfeature_lookup_name(const char *, spa_feature_t *); +extern boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t); extern void zpool_feature_init(void); diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c index d44d6f408..bdb6a9900 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c @@ -19,6 +19,7 @@ * CDDL HEADER END */ /* + * Copyright 2013 Xin Li . All rights reserved. * Copyright 2013 Martin Matuska . All rights reserved. * Portions Copyright 2005, 2010, Oracle and/or its affiliates. * All rights reserved. 
@@ -51,8 +52,63 @@ zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag) zfs_cmd_v15_t *zc_c; zfs_cmd_v28_t *zc28_c; zfs_cmd_deadman_t *zcdm_c; + zfs_cmd_zcmd_t *zcmd_c; switch (cflag) { + case ZFS_CMD_COMPAT_ZCMD: + zcmd_c = (void *)addr; + /* zc */ + strlcpy(zc->zc_name, zcmd_c->zc_name, MAXPATHLEN); + strlcpy(zc->zc_value, zcmd_c->zc_value, MAXPATHLEN * 2); + strlcpy(zc->zc_string, zcmd_c->zc_string, MAXPATHLEN); + +#define ZCMD_COPY(field) zc->field = zcmd_c->field + ZCMD_COPY(zc_nvlist_src); + ZCMD_COPY(zc_nvlist_src_size); + ZCMD_COPY(zc_nvlist_dst); + ZCMD_COPY(zc_nvlist_dst_size); + ZCMD_COPY(zc_nvlist_dst_filled); + ZCMD_COPY(zc_pad2); + ZCMD_COPY(zc_history); + ZCMD_COPY(zc_guid); + ZCMD_COPY(zc_nvlist_conf); + ZCMD_COPY(zc_nvlist_conf_size); + ZCMD_COPY(zc_cookie); + ZCMD_COPY(zc_objset_type); + ZCMD_COPY(zc_perm_action); + ZCMD_COPY(zc_history_len); + ZCMD_COPY(zc_history_offset); + ZCMD_COPY(zc_obj); + ZCMD_COPY(zc_iflags); + ZCMD_COPY(zc_share); + ZCMD_COPY(zc_jailid); + ZCMD_COPY(zc_objset_stats); + + /* + * zc_begin_record, zc_inject_record didn't change in embedded-data + * block pointers + * + * TODO: CTASSERT?
+ */ + ZCMD_COPY(zc_begin_record); + ZCMD_COPY(zc_inject_record); + + /* boolean_t -> uint32_t */ + zc->zc_defer_destroy = (uint32_t)(zcmd_c->zc_defer_destroy); + zc->zc_flags = 0; + + ZCMD_COPY(zc_action_handle); + ZCMD_COPY(zc_cleanup_fd); + ZCMD_COPY(zc_simple); + bcopy(zcmd_c->zc_pad, zc->zc_pad, sizeof(zc->zc_pad)); + ZCMD_COPY(zc_sendobj); + ZCMD_COPY(zc_fromobj); + ZCMD_COPY(zc_createtxg); + ZCMD_COPY(zc_stat); +#undef ZCMD_COPY + + break; + case ZFS_CMD_COMPAT_DEADMAN: zcdm_c = (void *)addr; /* zc */ @@ -79,7 +135,7 @@ zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag) zc->zc_objset_stats = zcdm_c->zc_objset_stats; zc->zc_begin_record = zcdm_c->zc_begin_record; zc->zc_defer_destroy = zcdm_c->zc_defer_destroy; - zc->zc_temphold = zcdm_c->zc_temphold; + (void)zcdm_c->zc_temphold; zc->zc_action_handle = zcdm_c->zc_action_handle; zc->zc_cleanup_fd = zcdm_c->zc_cleanup_fd; zc->zc_simple = zcdm_c->zc_simple; @@ -94,7 +150,7 @@ zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag) /* we always assume zc_nvlist_dst_filled is true */ zc->zc_nvlist_dst_filled = B_TRUE; - break; + break; case ZFS_CMD_COMPAT_V28: zc28_c = (void *)addr; @@ -123,7 +179,7 @@ zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag) zc->zc_objset_stats = zc28_c->zc_objset_stats; zc->zc_begin_record = zc28_c->zc_begin_record; zc->zc_defer_destroy = zc28_c->zc_defer_destroy; - zc->zc_temphold = zc28_c->zc_temphold; + (void)zc28_c->zc_temphold; zc->zc_action_handle = zc28_c->zc_action_handle; zc->zc_cleanup_fd = zc28_c->zc_cleanup_fd; zc->zc_simple = zc28_c->zc_simple; @@ -224,8 +280,63 @@ zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request, zfs_cmd_v15_t *zc_c; zfs_cmd_v28_t *zc28_c; zfs_cmd_deadman_t *zcdm_c; + zfs_cmd_zcmd_t *zcmd_c; switch (cflag) { + case ZFS_CMD_COMPAT_ZCMD: + zcmd_c = (void *)addr; + /* zc */ + strlcpy(zcmd_c->zc_name, zc->zc_name, MAXPATHLEN); + strlcpy(zcmd_c->zc_value, zc->zc_value, MAXPATHLEN * 2); + 
strlcpy(zcmd_c->zc_string, zc->zc_string, MAXPATHLEN); + +#define ZCMD_COPY(field) zcmd_c->field = zc->field + ZCMD_COPY(zc_nvlist_src); + ZCMD_COPY(zc_nvlist_src_size); + ZCMD_COPY(zc_nvlist_dst); + ZCMD_COPY(zc_nvlist_dst_size); + ZCMD_COPY(zc_nvlist_dst_filled); + ZCMD_COPY(zc_pad2); + ZCMD_COPY(zc_history); + ZCMD_COPY(zc_guid); + ZCMD_COPY(zc_nvlist_conf); + ZCMD_COPY(zc_nvlist_conf_size); + ZCMD_COPY(zc_cookie); + ZCMD_COPY(zc_objset_type); + ZCMD_COPY(zc_perm_action); + ZCMD_COPY(zc_history_len); + ZCMD_COPY(zc_history_offset); + ZCMD_COPY(zc_obj); + ZCMD_COPY(zc_iflags); + ZCMD_COPY(zc_share); + ZCMD_COPY(zc_jailid); + ZCMD_COPY(zc_objset_stats); + + /* + * zc_begin_record, zc_inject_record didn't change in embedded-data + * block pointers + * + * TODO: CTASSERT? + */ + ZCMD_COPY(zc_begin_record); + ZCMD_COPY(zc_inject_record); + + /* boolean_t -> uint32_t */ + zcmd_c->zc_defer_destroy = (uint32_t)(zc->zc_defer_destroy); + zcmd_c->zc_temphold = 0; + + ZCMD_COPY(zc_action_handle); + ZCMD_COPY(zc_cleanup_fd); + ZCMD_COPY(zc_simple); + bcopy(zc->zc_pad, zcmd_c->zc_pad, sizeof(zcmd_c->zc_pad)); + ZCMD_COPY(zc_sendobj); + ZCMD_COPY(zc_fromobj); + ZCMD_COPY(zc_createtxg); + ZCMD_COPY(zc_stat); +#undef ZCMD_COPY + + break; + case ZFS_CMD_COMPAT_DEADMAN: zcdm_c = (void *)addr; @@ -252,7 +363,7 @@ zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request, zcdm_c->zc_objset_stats = zc->zc_objset_stats; zcdm_c->zc_begin_record = zc->zc_begin_record; zcdm_c->zc_defer_destroy = zc->zc_defer_destroy; - zcdm_c->zc_temphold = zc->zc_temphold; + zcdm_c->zc_temphold = 0; zcdm_c->zc_action_handle = zc->zc_action_handle; zcdm_c->zc_cleanup_fd = zc->zc_cleanup_fd; zcdm_c->zc_simple = zc->zc_simple; @@ -270,7 +381,7 @@ zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request, zc->zc_value + strlen(zc->zc_value) + 1, (MAXPATHLEN * 2) - strlen(zc->zc_value) - 1); #endif - break; + break; case ZFS_CMD_COMPAT_V28: zc28_c = (void *)addr; @@ -298,7 +409,7 @@
zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request, zc28_c->zc_objset_stats = zc->zc_objset_stats; zc28_c->zc_begin_record = zc->zc_begin_record; zc28_c->zc_defer_destroy = zc->zc_defer_destroy; - zc28_c->zc_temphold = zc->zc_temphold; + zc28_c->zc_temphold = 0; zc28_c->zc_action_handle = zc->zc_action_handle; zc28_c->zc_cleanup_fd = zc->zc_cleanup_fd; zc28_c->zc_simple = zc->zc_simple; diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h index 6e8d51d37..bdcac6f9c 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h @@ -19,6 +19,7 @@ * CDDL HEADER END */ /* + * Copyright 2014 Xin Li . All rights reserved. * Copyright 2013 Martin Matuska . All rights reserved. * Use is subject to license terms. */ @@ -50,7 +51,8 @@ extern "C" { #define ZFS_IOCVER_DEADMAN 1 #define ZFS_IOCVER_LZC 2 #define ZFS_IOCVER_ZCMD 3 -#define ZFS_IOCVER_CURRENT ZFS_IOCVER_ZCMD +#define ZFS_IOCVER_EDBP 4 +#define ZFS_IOCVER_CURRENT ZFS_IOCVER_EDBP /* compatibility conversion flag */ #define ZFS_CMD_COMPAT_NONE 0 @@ -58,6 +60,7 @@ extern "C" { #define ZFS_CMD_COMPAT_V28 2 #define ZFS_CMD_COMPAT_DEADMAN 3 #define ZFS_CMD_COMPAT_LZC 4 +#define ZFS_CMD_COMPAT_ZCMD 5 #define ZFS_IOC_COMPAT_PASS 254 #define ZFS_IOC_COMPAT_FAIL 255 @@ -200,6 +203,49 @@ typedef struct zfs_cmd_deadman { zfs_stat_t zc_stat; } zfs_cmd_deadman_t; +typedef struct zfs_cmd_zcmd { + char zc_name[MAXPATHLEN]; /* name of pool or dataset */ + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ + int zc_pad2; + + /* + * The following members are for legacy ioctls which haven't been + * converted to the new method. 
+ */ + uint64_t zc_history; /* really (char *) */ + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + zinject_record_t zc_inject_record; + boolean_t zc_defer_destroy; + boolean_t zc_temphold; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + uint8_t zc_pad[3]; /* alignment */ + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_zcmd_t; + #ifdef _KERNEL unsigned static long zfs_ioctl_v15_to_v28[] = { 0, /* 0 ZFS_IOC_POOL_CREATE */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files index a652218b9..4c7e225f0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files +++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files @@ -33,6 +33,7 @@ ZFS_COMMON_OBJS += \ arc.o \ bplist.o \ + blkptr.o \ bpobj.o \ bptree.o \ dbuf.o \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 05529050c..366ca714c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -862,8 +862,10 @@ buf_discard_identity(arc_buf_hdr_t *hdr) } static arc_buf_hdr_t * -buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) +buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { + const dva_t *dva = BP_IDENTITY(bp); + uint64_t birth = BP_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = 
BUF_HASH_LOCK(idx); arc_buf_hdr_t *buf; @@ -895,6 +897,8 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) arc_buf_hdr_t *fbuf; uint32_t i; + ASSERT(!DVA_IS_EMPTY(&buf->b_dva)); + ASSERT(buf->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(buf)); *lockp = hash_lock; mutex_enter(hash_lock); @@ -2983,10 +2987,10 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) static void arc_read_done(zio_t *zio) { - arc_buf_hdr_t *hdr, *found; + arc_buf_hdr_t *hdr; arc_buf_t *buf; arc_buf_t *abuf; /* buffer we're assigning to callback */ - kmutex_t *hash_lock; + kmutex_t *hash_lock = NULL; arc_callback_t *callback_list, *acb; int freeable = FALSE; @@ -3001,12 +3005,22 @@ arc_read_done(zio_t *zio) * reason for it not to be found is if we were freed during the * read. */ - found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, - &hash_lock); - - ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || - (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || - (found == hdr && HDR_L2_READING(hdr))); + if (HDR_IN_HASH_TABLE(hdr)) { + ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); + ASSERT3U(hdr->b_dva.dva_word[0], ==, + BP_IDENTITY(zio->io_bp)->dva_word[0]); + ASSERT3U(hdr->b_dva.dva_word[1], ==, + BP_IDENTITY(zio->io_bp)->dva_word[1]); + + arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, + &hash_lock); + + ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && + hash_lock == NULL) || + (found == hdr && + DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || + (found == hdr && HDR_L2_READING(hdr))); + } hdr->b_flags &= ~ARC_L2_EVICTED; if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) @@ -3132,16 +3146,25 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb) { - arc_buf_hdr_t *hdr; + arc_buf_hdr_t *hdr = NULL; arc_buf_t *buf = NULL; - kmutex_t *hash_lock; + kmutex_t *hash_lock = NULL; zio_t *rzio; 
uint64_t guid = spa_load_guid(spa); + ASSERT(!BP_IS_EMBEDDED(bp) || + BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); + top: - hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), - &hash_lock); - if (hdr && hdr->b_datacnt > 0) { + if (!BP_IS_EMBEDDED(bp)) { + /* + * Embedded BP's have no DVA and require no I/O to "read". + * Create an anonymous arc buf to back it. + */ + hdr = buf_hash_find(guid, bp, &hash_lock); + } + + if (hdr != NULL && hdr->b_datacnt > 0) { *arc_flags |= ARC_CACHED; @@ -3215,7 +3238,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, done(NULL, buf, private); } else { uint64_t size = BP_GET_LSIZE(bp); - arc_callback_t *acb; + arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; @@ -3224,15 +3247,17 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, if (hdr == NULL) { /* this block is not in the cache */ - arc_buf_hdr_t *exists; + arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); buf = arc_buf_alloc(spa, size, private, type); hdr = buf->b_hdr; - hdr->b_dva = *BP_IDENTITY(bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(bp); - hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; - exists = buf_hash_insert(hdr, &hash_lock); - if (exists) { + if (!BP_IS_EMBEDDED(bp)) { + hdr->b_dva = *BP_IDENTITY(bp); + hdr->b_birth = BP_PHYSICAL_BIRTH(bp); + hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; + exists = buf_hash_insert(hdr, &hash_lock); + } + if (exists != NULL) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); @@ -3304,7 +3329,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, vd = NULL; } - mutex_exit(hash_lock); + if (hash_lock != NULL) + mutex_exit(hash_lock); /* * At this point, we have a level 1 cache miss. 
Try again in @@ -3442,8 +3468,9 @@ arc_freed(spa_t *spa, const blkptr_t *bp) kmutex_t *hash_lock; uint64_t guid = spa_load_guid(spa); - hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), - &hash_lock); + ASSERT(!BP_IS_EMBEDDED(bp)); + + hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; if (HDR_BUF_AVAILABLE(hdr)) { @@ -3767,7 +3794,7 @@ arc_write_done(zio_t *zio) ASSERT(hdr->b_acb == NULL); if (zio->io_error == 0) { - if (BP_IS_HOLE(zio->io_bp)) { + if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); @@ -3779,10 +3806,10 @@ arc_write_done(zio_t *zio) } /* - * If the block to be written was all-zero, we may have - * compressed it away. In this case no write was performed - * so there will be no dva/birth/checksum. The buffer must - * therefore remain anonymous (and uncached). + * If the block to be written was all-zero or compressed enough to be + * embedded in the BP, no write was performed so there will be no + * dva/birth/checksum. The buffer must therefore remain anonymous + * (and uncached). 
*/ if (!BUF_EMPTY(hdr)) { arc_buf_hdr_t *exists; @@ -5192,7 +5219,7 @@ static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) { void *cdata; - size_t csize, len; + size_t csize, len, rounded; ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF); ASSERT(l2hdr->b_tmp_cdata != NULL); @@ -5202,6 +5229,12 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata, cdata, l2hdr->b_asize, (size_t)(1ULL << l2hdr->b_dev->l2ad_vdev->vdev_ashift)); + rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE); + if (rounded > csize) { + bzero((char *)cdata + csize, rounded - csize); + csize = rounded; + } + if (csize == 0) { /* zero block, indicate that there's nothing to write */ zio_data_buf_free(cdata, len); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c new file mode 100644 index 000000000..7e61dc96f --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c @@ -0,0 +1,119 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#include +#include +#include + +/* + * Embedded-data Block Pointers + * + * Normally, block pointers point (via their DVAs) to a block which holds data. + * If the data that we need to store is very small, this is an inefficient + * use of space, because a block must be at minimum 1 sector (typically 512 + * bytes or 4KB). Additionally, reading these small blocks tends to generate + * more random reads. 
+ * + * Embedded-data Block Pointers allow small pieces of data (the "payload", + * up to 112 bytes) to be stored in the block pointer itself, instead of + * being pointed to. The "Pointer" part of this name is a bit of a + * misnomer, as nothing is pointed to. + * + * BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to + * be embedded in the block pointer. The logic for this is handled in + * the SPA, by the zio pipeline. Therefore most code outside the zio + * pipeline doesn't need special-cases to handle these block pointers. + * + * See spa.h for details on the exact layout of embedded block pointers. + */ + +void +encode_embedded_bp_compressed(blkptr_t *bp, void *data, + enum zio_compress comp, int uncompressed_size, int compressed_size) +{ + uint64_t *bp64 = (uint64_t *)bp; + uint64_t w = 0; + uint8_t *data8 = data; + + ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE); + ASSERT(uncompressed_size == compressed_size || + comp != ZIO_COMPRESS_OFF); + ASSERT3U(comp, >=, ZIO_COMPRESS_OFF); + ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); + + bzero(bp, sizeof (*bp)); + BP_SET_EMBEDDED(bp, B_TRUE); + BP_SET_COMPRESS(bp, comp); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + BPE_SET_LSIZE(bp, uncompressed_size); + BPE_SET_PSIZE(bp, compressed_size); + + /* + * Encode the byte array into the words of the block pointer. + * First byte goes into low bits of first word (little endian). + */ + for (int i = 0; i < compressed_size; i++) { + BF64_SET(w, (i % sizeof (w)) * NBBY, NBBY, data8[i]); + if (i % sizeof (w) == sizeof (w) - 1) { + /* we've reached the end of a word */ + ASSERT3P(bp64, <, bp + 1); + *bp64 = w; + bp64++; + if (!BPE_IS_PAYLOADWORD(bp, bp64)) + bp64++; + w = 0; + } + } + /* write last partial word */ + if (bp64 < (uint64_t *)(bp + 1)) + *bp64 = w; +} + +/* + * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be + * more than BPE_PAYLOAD_SIZE bytes). 
+ */ +void +decode_embedded_bp_compressed(const blkptr_t *bp, void *buf) +{ + int psize; + uint8_t *buf8 = buf; + uint64_t w = 0; + const uint64_t *bp64 = (const uint64_t *)bp; + + ASSERT(BP_IS_EMBEDDED(bp)); + + psize = BPE_GET_PSIZE(bp); + + /* + * Decode the words of the block pointer into the byte array. + * Low bits of first word are the first byte (little endian). + */ + for (int i = 0; i < psize; i++) { + if (i % sizeof (w) == 0) { + /* beginning of a word */ + ASSERT3P(bp64, <, bp + 1); + w = *bp64; + bp64++; + if (!BPE_IS_PAYLOADWORD(bp, bp64)) + bp64++; + } + buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY); + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c index 0fb597ba9..e75ae72f9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c @@ -192,6 +192,13 @@ bpobj_close(bpobj_t *bpo) mutex_destroy(&bpo->bpo_lock); } +static boolean_t +bpobj_hasentries(bpobj_t *bpo) +{ + return (bpo->bpo_phys->bpo_num_blkptrs != 0 || + (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs != 0)); +} + static int bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, boolean_t free) @@ -332,9 +339,11 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, out: /* If there are no entries, there should be no bytes. 
*/ - ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 || - (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) || - bpo->bpo_phys->bpo_bytes == 0); + if (!bpobj_hasentries(bpo)) { + ASSERT0(bpo->bpo_phys->bpo_bytes); + ASSERT0(bpo->bpo_phys->bpo_comp); + ASSERT0(bpo->bpo_phys->bpo_uncomp); + } mutex_exit(&bpo->bpo_lock); return (err); @@ -377,7 +386,7 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); - if (used == 0) { + if (!bpobj_hasentries(&subbpo)) { /* No point in having an empty subobj. */ bpobj_close(&subbpo); bpobj_free(bpo->bpo_os, subobj, tx); @@ -453,13 +462,29 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) ASSERT(!BP_IS_HOLE(bp)); ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); + if (BP_IS_EMBEDDED(bp)) { + /* + * The bpobj will compress better without the payload. + * + * Note that we store EMBEDDED bp's because they have an + * uncompressed size, which must be accounted for. An + * alternative would be to add their size to bpo_uncomp + * without storing the bp, but that would create additional + * complications: bpo_uncomp would be inconsistent with the + * set of BP's stored, and bpobj_iterate() wouldn't visit + * all the space accounted for in the bpobj. + */ + bzero(&stored_bp, sizeof (stored_bp)); + stored_bp.blk_prop = bp->blk_prop; + stored_bp.blk_birth = bp->blk_birth; + } else if (!BP_GET_DEDUP(bp)) { + /* The bpobj will compress better without the checksum */ + bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); + } + /* We never need the fill count. 
*/ stored_bp.blk_fill = 0; - /* The bpobj will compress better if we can leave off the checksum */ - if (!BP_GET_DEDUP(bp)) - bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); - mutex_enter(&bpo->bpo_lock); offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 7215194e0..50e40c0d0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -40,6 +40,8 @@ #include #include #include +#include +#include #include /* @@ -1435,6 +1437,38 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) mutex_exit(&db->db_mtx); } +void +dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, + bp_embedded_type_t etype, enum zio_compress comp, + int uncompressed_size, int compressed_size, int byteorder, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; + struct dirty_leaf *dl; + dmu_object_type_t type; + + DB_DNODE_ENTER(db); + type = DB_DNODE(db)->dn_type; + DB_DNODE_EXIT(db); + + ASSERT0(db->db_level); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + + dmu_buf_will_not_fill(dbuf, tx); + + ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); + dl = &db->db_last_dirty->dt.dl; + encode_embedded_bp_compressed(&dl->dr_overridden_by, + data, comp, uncompressed_size, compressed_size); + BPE_SET_ETYPE(&dl->dr_overridden_by, etype); + BP_SET_TYPE(&dl->dr_overridden_by, type); + BP_SET_LEVEL(&dl->dr_overridden_by, 0); + BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); + + dl->dr_override_state = DR_OVERRIDDEN; + dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; +} + /* * Directly assign a provided arc buf to a given dbuf if it's not referenced * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 
@@ -1819,7 +1853,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) } if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { - if (bp && !BP_IS_HOLE(bp)) { + if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; zbookmark_t zb; @@ -2455,7 +2489,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) uint64_t fill = 0; int i; - ASSERT(db->db_blkptr == bp); + ASSERT3P(db->db_blkptr, ==, bp); DB_DNODE_ENTER(db); dn = DB_DNODE(db); @@ -2467,7 +2501,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && - BP_GET_TYPE(bp) == dn->dn_bonustype)); + BP_GET_TYPE(bp) == dn->dn_bonustype) || + BP_IS_EMBEDDED(bp)); ASSERT(BP_GET_LEVEL(bp) == db->db_level); } @@ -2508,12 +2543,13 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; - fill += ibp->blk_fill; + fill += BP_GET_FILL(ibp); } } DB_DNODE_EXIT(db); - bp->blk_fill = fill; + if (!BP_IS_EMBEDDED(bp)) + bp->blk_fill = fill; mutex_exit(&db->db_mtx); } @@ -2625,7 +2661,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); - arc_set_callback(db->db_buf, dbuf_do_evict, db); + if (!arc_released(db->db_buf)) + arc_set_callback(db->db_buf, dbuf_do_evict, db); } DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); @@ -2751,10 +2788,16 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); DB_DNODE_EXIT(db); - if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { - ASSERT(db->db_state != DB_NOFILL); + if (db->db_level == 0 && + dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + /* + * The BP for this 
block has been provided by open context + * (by dmu_sync() or dmu_buf_write_embedded()). + */ + void *contents = (data != NULL) ? data->b_data : NULL; + dr->dr_zio = zio_write(zio, os->os_spa, txg, - db->db_blkptr, data->b_data, arc_buf_size(data), &zp, + db->db_blkptr, contents, db->db.db_size, &zp, dbuf_write_override_ready, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index 4aa1c88d0..b7152d7a8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -129,17 +129,13 @@ const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { }; int -dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, - void *tag, dmu_buf_t **dbp, int flags) +dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **dbp) { dnode_t *dn; uint64_t blkid; dmu_buf_impl_t *db; int err; - int db_flags = DB_RF_CANFAIL; - - if (flags & DMU_READ_NO_PREFETCH) - db_flags |= DB_RF_NOPREFETCH; err = dnode_hold(os, object, FTAG, &dn); if (err) @@ -148,18 +144,37 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + if (db == NULL) { - err = SET_ERROR(EIO); - } else { + *dbp = NULL; + return (SET_ERROR(EIO)); + } + + *dbp = &db->db; + return (err); +} + +int +dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **dbp, int flags) +{ + int err; + int db_flags = DB_RF_CANFAIL; + + if (flags & DMU_READ_NO_PREFETCH) + db_flags |= DB_RF_NOPREFETCH; + + err = dmu_buf_hold_noread(os, object, offset, tag, dbp); + if (err == 0) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); - if (err) { + if 
(err != 0) { dbuf_rele(db, tag); - db = NULL; + *dbp = NULL; } } - dnode_rele(dn, FTAG); - *dbp = &db->db; /* NULL db plus first field offset is NULL */ return (err); } @@ -855,6 +870,25 @@ dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); } +void +dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, + void *data, uint8_t etype, uint8_t comp, int uncompressed_size, + int compressed_size, int byteorder, dmu_tx_t *tx) +{ + dmu_buf_t *db; + + ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); + ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); + VERIFY0(dmu_buf_hold_noread(os, object, offset, + FTAG, &db)); + + dmu_buf_write_embedded(db, + data, (bp_embedded_type_t)etype, (enum zio_compress)comp, + uncompressed_size, compressed_size, byteorder, tx); + + dmu_buf_rele(db, FTAG); +} + /* * DMU support for xuio */ @@ -1332,7 +1366,7 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) * block size still needs to be known for replay. */ BP_SET_LSIZE(bp, db->db_size); - } else { + } else if (!BP_IS_EMBEDDED(bp)) { ASSERT(BP_GET_LEVEL(bp) == 0); bp->blk_fill = 1; } @@ -1603,9 +1637,15 @@ dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, { dnode_t *dn; - /* XXX assumes dnode_hold will not get an i/o error */ - (void) dnode_hold(os, object, FTAG, &dn); - ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); + /* + * Send streams include each object's checksum function. This + * check ensures that the receiving system can understand the + * checksum function transmitted. 
+ */ + ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); + + VERIFY0(dnode_hold(os, object, FTAG, &dn)); + ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); @@ -1617,9 +1657,14 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, { dnode_t *dn; - /* XXX assumes dnode_hold will not get an i/o error */ - (void) dnode_hold(os, object, FTAG, &dn); - ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); + /* + * Send streams include each object's compression function. This + * check ensures that the receiving system can understand the + * compression function transmitted. + */ + ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); + + VERIFY0(dnode_hold(os, object, FTAG, &dn)); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); @@ -1789,7 +1834,7 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; doi->doi_fill_count = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) - doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill; + doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c index ef82f1768..efed341d6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -338,7 +338,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, * default (fletcher2/off). Snapshots don't need to know about * checksum/compression/copies. 
*/ - if (ds) { + if (ds != NULL) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os); @@ -391,7 +391,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, kmem_free(os, sizeof (objset_t)); return (err); } - } else if (ds == NULL) { + } else { /* It's the meta-objset. */ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; os->os_compress = ZIO_COMPRESS_LZJB; @@ -435,17 +435,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, &os->os_groupused_dnode); } - /* - * We should be the only thread trying to do this because we - * have ds_opening_lock - */ - if (ds) { - mutex_enter(&ds->ds_lock); - ASSERT(ds->ds_objset == NULL); - ds->ds_objset = os; - mutex_exit(&ds->ds_lock); - } - *osp = os; return (0); } @@ -456,11 +445,19 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) int err = 0; mutex_enter(&ds->ds_opening_lock); - *osp = ds->ds_objset; - if (*osp == NULL) { + if (ds->ds_objset == NULL) { + objset_t *os; err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), - ds, dsl_dataset_get_blkptr(ds), osp); + ds, dsl_dataset_get_blkptr(ds), &os); + + if (err == 0) { + mutex_enter(&ds->ds_lock); + ASSERT(ds->ds_objset == NULL); + ds->ds_objset = os; + mutex_exit(&ds->ds_lock); + } } + *osp = ds->ds_objset; mutex_exit(&ds->ds_opening_lock); return (err); } @@ -986,6 +983,7 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; + ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT3P(bp, ==, os->os_rootbp); ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); ASSERT0(BP_GET_LEVEL(bp)); @@ -998,7 +996,7 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) */ bp->blk_fill = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) - bp->blk_fill += dnp->dn_blkptr[i].blk_fill; + bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]); } /* ARGSUSED */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index d3efa553e..1b246c8b9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -49,7 +49,14 @@ #include #include #include +#include #include +#include + +#ifdef __FreeBSD__ +#undef dump_write +#define dump_write dmu_dump_write +#endif /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ int zfs_send_corrupt_data = B_FALSE; @@ -184,7 +191,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, } static int -dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, +dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) { struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); @@ -219,13 +226,22 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, drrw->drr_offset = offset; drrw->drr_length = blksz; drrw->drr_toguid = dsp->dsa_toguid; - drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); - if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) - drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; - DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); - DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); - DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); - drrw->drr_key.ddk_cksum = bp->blk_cksum; + if (BP_IS_EMBEDDED(bp)) { + /* + * There's no pre-computed checksum of embedded BP's, so + * (like fletcher4-checksummed blocks) userland will have + * to compute a dedup-capable checksum itself. 
+ */ + drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; + } else { + drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); + if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) + drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; + DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); + DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); + DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); + drrw->drr_key.ddk_cksum = bp->blk_cksum; + } if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); @@ -234,6 +250,43 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, return (0); } +static int +dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, + int blksz, const blkptr_t *bp) +{ + char buf[BPE_PAYLOAD_SIZE]; + struct drr_write_embedded *drrw = + &(dsp->dsa_drr->drr_u.drr_write_embedded); + + if (dsp->dsa_pending_op != PENDING_NONE) { + if (dump_bytes(dsp, dsp->dsa_drr, + sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + dsp->dsa_pending_op = PENDING_NONE; + } + + ASSERT(BP_IS_EMBEDDED(bp)); + + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; + drrw->drr_object = object; + drrw->drr_offset = offset; + drrw->drr_length = blksz; + drrw->drr_toguid = dsp->dsa_toguid; + drrw->drr_compression = BP_GET_COMPRESS(bp); + drrw->drr_etype = BPE_GET_ETYPE(bp); + drrw->drr_lsize = BPE_GET_LSIZE(bp); + drrw->drr_psize = BPE_GET_PSIZE(bp); + + decode_embedded_bp_compressed(bp, buf); + + if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) + return (EINTR); + return (0); +} + static int dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) { @@ -354,6 +407,33 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) return (0); } +static boolean_t +backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) +{ + if (!BP_IS_EMBEDDED(bp)) + return (B_FALSE); + + 
/* + * Compression function must be legacy, or explicitly enabled. + */ + if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && + !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4))) + return (B_FALSE); + + /* + * Embed type must be explicitly enabled. + */ + switch (BPE_GET_ETYPE(bp)) { + case BP_EMBEDDED_TYPE_DATA: + if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) + return (B_TRUE); + break; + default: + return (B_FALSE); + } + return (B_FALSE); +} + #define BP_SPAN(dnp, level) \ (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) @@ -422,11 +502,17 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); (void) arc_buf_remove_ref(abuf, &abuf); + } else if (backup_do_embed(dsp, bp)) { + /* it's an embedded level-0 block of a regular object */ + int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; + err = dump_write_embedded(dsp, zb->zb_object, + zb->zb_blkid * blksz, blksz, bp); } else { /* it's a level-0 block of a regular object */ uint32_t aflags = ARC_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); + ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT0(zb->zb_level); if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, @@ -445,7 +531,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } } - err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz, + err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz, blksz, bp, abuf->b_data); (void) arc_buf_remove_ref(abuf, &abuf); } @@ -459,12 +545,11 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, */ static int dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, + zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok, #ifdef illumos - zfs_bookmark_phys_t *fromzb, boolean_t is_clone, int outfd, - vnode_t *vp, offset_t *off) + int outfd, vnode_t 
*vp, offset_t *off) #else - zfs_bookmark_phys_t *fromzb, boolean_t is_clone, int outfd, - struct file *fp, offset_t *off) + int outfd, struct file *fp, offset_t *off) #endif { objset_t *os; @@ -472,6 +557,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, dmu_sendarg_t *dsp; int err; uint64_t fromtxg = 0; + uint64_t featureflags = 0; err = dmu_objset_from_ds(ds, &os); if (err != 0) { @@ -494,13 +580,23 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, return (SET_ERROR(EINVAL)); } if (version >= ZPL_VERSION_SA) { - DMU_SET_FEATUREFLAGS( - drr->drr_u.drr_begin.drr_versioninfo, - DMU_BACKUP_FEATURE_SA_SPILL); + featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } } #endif + if (embedok && + spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { + featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) + featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; + } else { + embedok = B_FALSE; + } + + DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, + featureflags); + drr->drr_u.drr_begin.drr_creation_time = ds->ds_phys->ds_creation_time; drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); @@ -533,6 +629,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); dsp->dsa_pending_op = PENDING_NONE; dsp->dsa_incremental = (fromzb != NULL); + dsp->dsa_featureflags = featureflags; mutex_enter(&ds->ds_sendstream_lock); list_insert_head(&ds->ds_sendstreams, dsp); @@ -585,9 +682,9 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, #ifdef illumos - int outfd, vnode_t *vp, offset_t *off) + boolean_t embedok, int outfd, vnode_t *vp, offset_t *off) #else - int outfd, struct file *fp, offset_t *off) + boolean_t embedok, int outfd, struct file *fp, offset_t *off) #endif { dsl_pool_t *dp; @@ -622,10 +719,10 @@ dmu_send_obj(const char *pool, uint64_t tosnap, 
uint64_t fromsnap, zb.zbm_guid = fromds->ds_phys->ds_guid; is_clone = (fromds->ds_dir != ds->ds_dir); dsl_dataset_rele(fromds, FTAG); - err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, + err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, outfd, fp, off); } else { - err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, + err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, outfd, fp, off); } dsl_dataset_rele(ds, FTAG); @@ -633,7 +730,7 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, } int -dmu_send(const char *tosnap, const char *fromsnap, +dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, #ifdef illumos int outfd, vnode_t *vp, offset_t *off) #else @@ -704,10 +801,10 @@ dmu_send(const char *tosnap, const char *fromsnap, dsl_pool_rele(dp, FTAG); return (err); } - err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, + err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, outfd, fp, off); } else { - err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, + err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, outfd, fp, off); } if (owned) @@ -877,6 +974,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) uint64_t fromguid = drrb->drr_fromguid; int flags = drrb->drr_flags; int error; + uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); dsl_dataset_t *ds; const char *tofs = drba->drba_cookie->drc_tofs; @@ -890,11 +988,22 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) return (SET_ERROR(EINVAL)); /* Verify pool version supports SA if SA_SPILL feature set */ - if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_SA_SPILL) && - spa_version(dp->dp_spa) < SPA_VERSION_SA) { + if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && + spa_version(dp->dp_spa) < SPA_VERSION_SA) + return (SET_ERROR(ENOTSUP)); + + /* + * The receiving code doesn't know how to translate a WRITE_EMBEDDED + * record to a plain WRITE record, so the pool must have the + * EMBEDDED_DATA feature enabled if the stream has 
WRITE_EMBEDDED + * records. Same with WRITE_EMBEDDED records that use LZ4 compression. + */ + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); - } error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error == 0) { @@ -1214,7 +1323,6 @@ backup_byteswap(dmu_replay_record_t *drr) break; case DRR_OBJECT: DO64(drr_object.drr_object); - /* DO64(drr_object.drr_allocation_txg); */ DO32(drr_object.drr_type); DO32(drr_object.drr_bonustype); DO32(drr_object.drr_blksz); @@ -1252,6 +1360,14 @@ backup_byteswap(dmu_replay_record_t *drr) DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); DO64(drr_write_byref.drr_key.ddk_prop); break; + case DRR_WRITE_EMBEDDED: + DO64(drr_write_embedded.drr_object); + DO64(drr_write_embedded.drr_offset); + DO64(drr_write_embedded.drr_length); + DO64(drr_write_embedded.drr_toguid); + DO32(drr_write_embedded.drr_lsize); + DO32(drr_write_embedded.drr_psize); + break; case DRR_FREE: DO64(drr_free.drr_object); DO64(drr_free.drr_offset); @@ -1439,7 +1555,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os, int err; guid_map_entry_t gmesrch; guid_map_entry_t *gmep; - avl_index_t where; + avl_index_t where; objset_t *ref_os = NULL; dmu_buf_t *dbp; @@ -1462,8 +1578,9 @@ restore_write_byref(struct restorearg *ra, objset_t *os, ref_os = os; } - if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, - drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH)) + err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, + drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); + if (err != 0) return (err); tx = dmu_tx_create(os); @@ -1482,6 +1599,48 @@ restore_write_byref(struct restorearg *ra, objset_t *os, return (0); } +static int +restore_write_embedded(struct restorearg *ra, objset_t *os, + struct 
drr_write_embedded *drrwnp) +{ + dmu_tx_t *tx; + int err; + void *data; + + if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset) + return (EINVAL); + + if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE) + return (EINVAL); + + if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES) + return (EINVAL); + if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) + return (EINVAL); + + data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8)); + if (data == NULL) + return (ra->err); + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, drrwnp->drr_object, + drrwnp->drr_offset, drrwnp->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + return (err); + } + + dmu_write_embedded(os, drrwnp->drr_object, + drrwnp->drr_offset, data, drrwnp->drr_etype, + drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, + ra->byteswap ^ ZFS_HOST_BYTEORDER, tx); + + dmu_tx_commit(tx); + return (0); +} + static int restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) { @@ -1677,6 +1836,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, ra.err = restore_write_byref(&ra, os, &drrwbr); break; } + case DRR_WRITE_EMBEDDED: + { + struct drr_write_embedded drrwe = + drr->drr_u.drr_write_embedded; + ra.err = restore_write_embedded(&ra, os, &drrwe); + break; + } case DRR_FREE: { struct drr_free drrf = drr->drr_u.drr_free; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c index 1f64d73ac..db0869ca6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -458,7 +458,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (pfd->pd_cancel) return (SET_ERROR(EINTR)); - if (BP_IS_HOLE(bp) || + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || BP_GET_TYPE(bp) == DMU_OT_DNODE || 
BP_GET_LEVEL(bp) > 0) || BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c index fa976d097..ffcf524f1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -1816,8 +1816,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, *offset = *offset >> span; for (i = BF64_GET(*offset, 0, epbs); i >= 0 && i < epb; i += inc) { - if (bp[i].blk_fill >= minfill && - bp[i].blk_fill <= maxfill && + if (BP_GET_FILL(&bp[i]) >= minfill && + BP_GET_FILL(&bp[i]) <= maxfill && (hole || bp[i].blk_birth > txg)) break; if (inc > 0 || *offset > 0) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c index 326e96cf9..684d64f02 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -233,8 +233,6 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) } #endif -#define ALL -1 - static void free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) @@ -362,7 +360,6 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, free_children(db, blkid, nblks, tx); dbuf_rele(db, FTAG); - } } @@ -594,11 +591,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnp->dn_bonustype = dn->dn_bonustype; dnp->dn_bonuslen = dn->dn_bonuslen; } - ASSERT(dnp->dn_nlevels > 1 || BP_IS_HOLE(&dnp->dn_blkptr[0]) || + BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) || BP_GET_LSIZE(&dnp->dn_blkptr[0]) == dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + ASSERT(dnp->dn_nlevels < 2 || + BP_IS_HOLE(&dnp->dn_blkptr[0]) || + BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift); if (dn->dn_next_type[txgoff] != 0) { dnp->dn_type = dn->dn_type; diff --git 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index 417239722..f21af21eb 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -1701,7 +1701,7 @@ dsl_dataset_space(dsl_dataset_t *ds, else *availbytesp = 0; } - *usedobjsp = ds->ds_phys->ds_bp.blk_fill; + *usedobjsp = BP_GET_FILL(&ds->ds_phys->ds_bp); *availobjsp = DN_MAX_OBJECT - *usedobjsp; } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c index 639412cde..441036c25 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c @@ -539,7 +539,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, struct killarg *ka = arg; dmu_tx_t *tx = ka->tx; - if (BP_IS_HOLE(bp)) + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (zb->zb_level == ZB_ZIL_LEVEL) { @@ -589,6 +589,7 @@ dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) uint64_t count; objset_t *mos; + ASSERT(!dsl_dataset_is_snapshot(ds)); if (dsl_dataset_is_snapshot(ds)) return (SET_ERROR(EINVAL)); @@ -711,7 +712,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) ds->ds_prev->ds_phys->ds_num_children == 2 && ds->ds_prev->ds_userrefs == 0); - /* Remove our reservation */ + /* Remove our reservation. 
*/ if (ds->ds_reserved != 0) { dsl_dataset_set_refreservation_sync_impl(ds, (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c index c62be1add..50c56f9eb 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c @@ -1487,6 +1487,10 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) } if (err == ERESTART) return; + /* finished; verify that space accounting went to zero */ + ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes); + ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes); + ASSERT0(dp->dp_free_dir->dd_phys->dd_uncompressed_bytes); } if (scn->scn_phys.scn_state != DSS_SCANNING) @@ -1669,6 +1673,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, count_block(dp->dp_blkstats, bp); + if (BP_IS_EMBEDDED(bp)) + return (0); + ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { zio_flags |= ZIO_FLAG_SCRUB; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c index 7f2c26f76..be5b7102c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c @@ -600,8 +600,7 @@ dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist, ddura.ddura_chkholds = fnvlist_alloc(); error = dsl_sync_task(pool, dsl_dataset_user_release_check, - dsl_dataset_user_release_sync, &ddura, - fnvlist_num_pairs(holds)); + dsl_dataset_user_release_sync, &ddura, 0); fnvlist_free(ddura.ddura_todelete); fnvlist_free(ddura.ddura_chkholds); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index adee7b6ba..ad85d9c5b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -1883,7 +1883,7 @@ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { - if (!BP_IS_HOLE(bp)) { + if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { zio_t *rio = arg; size_t size = BP_GET_PSIZE(bp); void *data = zio_data_buf_alloc(size); @@ -2431,9 +2431,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, - &spa->spa_feat_enabled_txg_obj) != 0) { + &spa->spa_feat_enabled_txg_obj) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } } spa->spa_is_initializing = B_TRUE; @@ -5539,11 +5538,6 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) ASSERT(!locked); ASSERT(vd == vd->vdev_top); - /* - * XXX - Once we have bp-rewrite this should - * become the common case. - */ - mg = vd->vdev_mg; /* @@ -6775,7 +6769,7 @@ spa_upgrade(spa_t *spa, uint64_t version) * possible. 
*/ ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); - ASSERT(version >= spa->spa_uberblock.ub_version); + ASSERT3U(version, >=, spa->spa_uberblock.ub_version); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c index dbcb72916..a1d715a7b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -1391,7 +1391,10 @@ snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, sizeof (type)); } - checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; + if (!BP_IS_EMBEDDED(bp)) { + checksum = + zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; + } compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } @@ -1693,7 +1696,7 @@ bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; - for (int d = 0; d < SPA_DVAS_PER_BP; d++) + for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); return (dsize); @@ -1706,7 +1709,7 @@ bp_get_dsize(spa_t *spa, const blkptr_t *bp) spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - for (int d = 0; d < SPA_DVAS_PER_BP; d++) + for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); spa_config_exit(spa, SCL_VDEV, FTAG); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h new file mode 100644 index 000000000..b720482a7 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h @@ -0,0 +1,38 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#ifndef _SYS_BLKPTR_H +#define _SYS_BLKPTR_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void encode_embedded_bp_compressed(blkptr_t *, void *, + enum zio_compress, int, int); +void decode_embedded_bp_compressed(const blkptr_t *, void *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BLKPTR_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h index 207834dfd..c80b7ffba 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h @@ -274,6 +274,9 @@ void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); +void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, + bp_embedded_type_t etype, enum zio_compress comp, + int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); void dbuf_clear(dmu_buf_impl_t *db); void dbuf_evict(dmu_buf_impl_t *db); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h index e359a83e2..43b34e072 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -119,6 +119,14 @@ typedef enum dmu_object_byteswap { ((ot) & DMU_OT_METADATA) : \ dmu_ot[(ot)].ot_metadata) +/* + * These object types use bp_fill != 1 for their L0 bp's. 
Therefore they can't + * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill + * is repurposed for embedded BPs. + */ +#define DMU_OT_HAS_FILL(ot) \ + ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET) + #define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) : \ dmu_ot[(ot)].ot_byteswap) @@ -396,6 +404,11 @@ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); +void +dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, + void *data, uint8_t etype, uint8_t comp, int uncompressed_size, + int compressed_size, int byteorder, dmu_tx_t *tx); + /* * Decide how to write a block: checksum, compression, number of copies, etc. */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h index 63440646a..6f67b5a0b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h @@ -297,12 +297,15 @@ typedef struct dmu_sendarg { int dsa_err; dmu_pendop_t dsa_pending_op; boolean_t dsa_incremental; + uint64_t dsa_featureflags; uint64_t dsa_last_data_object; uint64_t dsa_last_data_offset; } dmu_sendarg_t; void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); +int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t, + void *, dmu_buf_t **); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h index 2b95b8cf7..b5d617025 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h @@ -36,19 +36,19 @@ struct dsl_dataset; struct drr_begin; struct avl_tree; -int dmu_send(const 
char *tosnap, const char *fromsnap, int outfd, +int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, #ifdef illumos - struct vnode *vp, offset_t *off); + int outfd, struct vnode *vp, offset_t *off); #else - struct file *fp, offset_t *off); + int outfd, struct file *fp, offset_t *off); #endif int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds, uint64_t *sizep); int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, #ifdef illumos - int outfd, struct vnode *vp, offset_t *off); + boolean_t embedok, int outfd, vnode_t *vp, offset_t *off); #else - int outfd, struct file *fp, offset_t *off); + boolean_t embedok, int outfd, struct file *fp, offset_t *off); #endif typedef struct dmu_recv_cookie { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h index 64609c632..783166e12 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -170,7 +170,7 @@ typedef struct zio_cksum { * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE | + * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -204,7 +204,8 @@ typedef struct zio_cksum { * G gang block indicator * B byteorder (endianness) * D dedup - * X unused + * X encryption (on version 30, which is not supported) + * E blkptr_t contains embedded data (see below) * lvl level of indirection * type DMU object type * phys birth txg of block allocation; zero if same as logical birth txg @@ -212,6 +213,100 @@ typedef struct zio_cksum { * fill count number of non-zero blocks under this bp * checksum[4] 256-bit checksum of the 
data this bp describes */ + +/* + * "Embedded" blkptr_t's don't actually point to a block, instead they + * have a data payload embedded in the blkptr_t itself. See the comment + * in blkptr.c for more details. + * + * The blkptr_t is laid out as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | payload | + * 1 | payload | + * 2 | payload | + * 3 | payload | + * 4 | payload | + * 5 | payload | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 7 | payload | + * 8 | payload | + * 9 | payload | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * a | logical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * b | payload | + * c | payload | + * d | payload | + * e | payload | + * f | payload | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Legend: + * + * payload contains the embedded data + * B (byteorder) byteorder (endianness) + * D (dedup) padding (set to zero) + * X encryption (set to zero; see above) + * E (embedded) set to one + * lvl indirection level + * type DMU object type + * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*) + * comp compression function of payload + * PSIZE size of payload after compression, in bytes + * LSIZE logical size of payload, in bytes + * note that 25 bits is enough to store the largest + * "normal" BP's LSIZE (2^16 * 2^9) in bytes + * log. birth transaction group in which the block was logically born + * + * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded + * bp's they are stored in units of SPA_MINBLOCKSHIFT. + * Generally, the generic BP_GET_*() macros can be used on embedded BP's. 
+ * The B, D, X, lvl, type, and comp fields are stored the same as with normal + * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must + * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before + * other macros, as they assert that they are only used on BP's of the correct + * "embedded-ness". + */ + +#define BPE_GET_ETYPE(bp) \ + (ASSERT(BP_IS_EMBEDDED(bp)), \ + BF64_GET((bp)->blk_prop, 40, 8)) +#define BPE_SET_ETYPE(bp, t) do { \ + ASSERT(BP_IS_EMBEDDED(bp)); \ + BF64_SET((bp)->blk_prop, 40, 8, t); \ +_NOTE(CONSTCOND) } while (0) + +#define BPE_GET_LSIZE(bp) \ + (ASSERT(BP_IS_EMBEDDED(bp)), \ + BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1)) +#define BPE_SET_LSIZE(bp, x) do { \ + ASSERT(BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +#define BPE_GET_PSIZE(bp) \ + (ASSERT(BP_IS_EMBEDDED(bp)), \ + BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)) +#define BPE_SET_PSIZE(bp, x) do { \ + ASSERT(BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +typedef enum bp_embedded_type { + BP_EMBEDDED_TYPE_DATA, + BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */ + NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED +} bp_embedded_type_t; + +#define BPE_NUM_WORDS 14 +#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) +#define BPE_IS_PAYLOADWORD(bp, wp) \ + ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) + #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ @@ -258,20 +353,37 @@ typedef struct blkptr { #define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) #define BP_GET_LSIZE(bp) \ - BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1) -#define BP_SET_LSIZE(bp, x) \ - BF64_SET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) + (BP_IS_EMBEDDED(bp) ? \ + (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? 
BPE_GET_LSIZE(bp) : 0): \ + BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)) +#define BP_SET_LSIZE(bp, x) do { \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, \ + 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ +_NOTE(CONSTCOND) } while (0) #define BP_GET_PSIZE(bp) \ - BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1) -#define BP_SET_PSIZE(bp, x) \ - BF64_SET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) + (BP_IS_EMBEDDED(bp) ? 0 : \ + BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)) +#define BP_SET_PSIZE(bp, x) do { \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, \ + 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7) +#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x) -#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) -#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) +#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) +#define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) -#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) -#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) +#define BP_GET_CHECKSUM(bp) \ + (BP_IS_EMBEDDED(bp) ? 
ZIO_CHECKSUM_OFF : \ + BF64_GET((bp)->blk_prop, 40, 8)) +#define BP_SET_CHECKSUM(bp, x) do { \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + BF64_SET((bp)->blk_prop, 40, 8, x); \ +_NOTE(CONSTCOND) } while (0) #define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) #define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) @@ -279,9 +391,6 @@ typedef struct blkptr { #define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) #define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) -#define BP_GET_PROP_BIT_61(bp) BF64_GET((bp)->blk_prop, 61, 1) -#define BP_SET_PROP_BIT_61(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x) - #define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) #define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) @@ -289,31 +398,39 @@ typedef struct blkptr { #define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) #define BP_PHYSICAL_BIRTH(bp) \ - ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) + (BP_IS_EMBEDDED(bp) ? 0 : \ + (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) #define BP_SET_BIRTH(bp, logical, physical) \ { \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ (bp)->blk_birth = (logical); \ (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ } +#define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill) + #define BP_GET_ASIZE(bp) \ - (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ - DVA_GET_ASIZE(&(bp)->blk_dva[2])) + (BP_IS_EMBEDDED(bp) ? 0 : \ + DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + DVA_GET_ASIZE(&(bp)->blk_dva[2])) #define BP_GET_UCSIZE(bp) \ ((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \ BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ - (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ + (BP_IS_EMBEDDED(bp) ? 0 : \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ !!DVA_GET_ASIZE(&(bp)->blk_dva[2])) #define BP_COUNT_GANG(bp) \ + (BP_IS_EMBEDDED(bp) ? 
0 : \ (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ DVA_GET_GANG(&(bp)->blk_dva[1]) + \ - DVA_GET_GANG(&(bp)->blk_dva[2])) + DVA_GET_GANG(&(bp)->blk_dva[2]))) #define DVA_EQUAL(dva1, dva2) \ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ @@ -321,6 +438,7 @@ typedef struct blkptr { #define BP_EQUAL(bp1, bp2) \ (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ + (bp1)->blk_birth == (bp2)->blk_birth && \ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) @@ -341,11 +459,13 @@ typedef struct blkptr { (zcp)->zc_word[3] = w3; \ } -#define BP_IDENTITY(bp) (&(bp)->blk_dva[0]) -#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp)) +#define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0]) +#define BP_IS_GANG(bp) \ + (BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp))) #define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \ (dva)->dva_word[1] == 0ULL) -#define BP_IS_HOLE(bp) DVA_IS_EMPTY(BP_IDENTITY(bp)) +#define BP_IS_HOLE(bp) \ + (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp))) /* BP_IS_RAIDZ(bp) assumes no block compression */ #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ @@ -399,6 +519,17 @@ typedef struct blkptr { " birth=%lluL", \ (u_longlong_t)bp->blk_birth); \ } \ + } else if (BP_IS_EMBEDDED(bp)) { \ + len = func(buf + len, size - len, \ + "EMBEDDED [L%llu %s] et=%u %s " \ + "size=%llxL/%llxP birth=%lluL", \ + (u_longlong_t)BP_GET_LEVEL(bp), \ + type, \ + (int)BPE_GET_ETYPE(bp), \ + compress, \ + (u_longlong_t)BPE_GET_LSIZE(bp), \ + (u_longlong_t)BPE_GET_PSIZE(bp), \ + (u_longlong_t)bp->blk_birth); \ } else { \ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ const dva_t *dva = &bp->blk_dva[d]; \ @@ -432,7 +563,7 @@ typedef struct blkptr { (u_longlong_t)BP_GET_PSIZE(bp), \ (u_longlong_t)bp->blk_birth, \ (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ - (u_longlong_t)bp->blk_fill, \ + (u_longlong_t)BP_GET_FILL(bp), \ ws, \ 
(u_longlong_t)bp->blk_cksum.zc_word[0], \ (u_longlong_t)bp->blk_cksum.zc_word[1], \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h index 6fd44c89d..2e3a1428e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -39,6 +39,7 @@ #include #include #include +#include #include #ifdef __cplusplus diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h index af2def2da..73fbf3c9c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_IOCTL_H @@ -79,15 +79,19 @@ typedef enum drr_headertype { * Feature flags for zfs send streams (flags in drr_versioninfo) */ -#define DMU_BACKUP_FEATURE_DEDUP (0x1) -#define DMU_BACKUP_FEATURE_DEDUPPROPS (0x2) -#define DMU_BACKUP_FEATURE_SA_SPILL (0x4) +#define DMU_BACKUP_FEATURE_DEDUP (1<<0) +#define DMU_BACKUP_FEATURE_DEDUPPROPS (1<<1) +#define DMU_BACKUP_FEATURE_SA_SPILL (1<<2) +/* flags #3 - #15 are reserved for incompatible closed-source implementations */ +#define DMU_BACKUP_FEATURE_EMBED_DATA (1<<16) +#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1<<17) /* * Mask of all supported backup features */ #define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ - DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL) + DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \ + DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) /* Are all features in the given flag word currently supported? 
*/ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) @@ -210,7 +214,7 @@ typedef struct dmu_replay_record { enum { DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, - DRR_SPILL, DRR_NUMTYPES + DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES } drr_type; uint32_t drr_payloadlen; union { @@ -222,6 +226,19 @@ typedef struct dmu_replay_record { struct drr_free drr_free; struct drr_write_byref drr_write_byref; struct drr_spill drr_spill; + struct drr_write_embedded { + uint64_t drr_object; + uint64_t drr_offset; + /* logical length, should equal blocksize */ + uint64_t drr_length; + uint64_t drr_toguid; + uint8_t drr_compression; + uint8_t drr_etype; + uint8_t drr_pad[6]; + uint32_t drr_lsize; /* uncompressed size of payload */ + uint32_t drr_psize; /* compr. (real) size of payload */ + /* (possibly compressed) content follows */ + } drr_write_embedded; } drr_u; } dmu_replay_record_t; @@ -324,8 +341,8 @@ typedef struct zfs_cmd { dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; zinject_record_t zc_inject_record; - boolean_t zc_defer_destroy; - boolean_t zc_temphold; + uint32_t zc_defer_destroy; + uint32_t zc_flags; uint64_t zc_action_handle; int zc_cleanup_fd; uint8_t zc_simple; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index 8b4e83451..c35585c1a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -85,6 +85,12 @@ enum zio_checksum { ZIO_CHECKSUM_FUNCTIONS }; +/* + * The number of "legacy" checksum functions which can be set on individual + * objects.
+ */ +#define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2 + #define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 #define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON @@ -114,6 +120,12 @@ enum zio_compress { ZIO_COMPRESS_FUNCTIONS }; +/* + * The number of "legacy" compression functions which can be set on individual + * objects. + */ +#define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4 + /* N.B. when altering this value, also change BOOTFS_COMPRESS_VALID below */ #define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB #define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index 4f7c082cb..fb925be2f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -4354,6 +4354,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) * zc_fromobj objsetid of incremental fromsnap (may be zero) * zc_guid if set, estimate size of stream only. zc_cookie is ignored. * output size in zc_objset_type. 
+ * zc_flags if =1, WRITE_EMBEDDED records are permitted * * outputs: * zc_objset_type estimated size, if zc_guid is set @@ -4364,6 +4365,7 @@ zfs_ioc_send(zfs_cmd_t *zc) int error; offset_t off; boolean_t estimate = (zc->zc_guid != 0); + boolean_t embedok = (zc->zc_flags & 0x1); if (zc->zc_obj != 0) { dsl_pool_t *dp; @@ -4429,9 +4431,9 @@ zfs_ioc_send(zfs_cmd_t *zc) off = fp->f_offset; error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, #ifdef illumos - zc->zc_fromobj, zc->zc_cookie, fp->f_vnode, &off); + zc->zc_fromobj, embedok, zc->zc_cookie, fp->f_vnode, &off); #else - zc->zc_fromobj, zc->zc_cookie, fp, &off); + zc->zc_fromobj, embedok, zc->zc_cookie, fp, &off); #endif if (off >= 0 && off <= MAXOFFSET_T) @@ -5369,6 +5371,8 @@ zfs_ioc_unjail(zfs_cmd_t *zc) * innvl: { * "fd" -> file descriptor to write stream to (int32) * (optional) "fromsnap" -> full snap name to send an incremental from + * (optional) "embedok" -> (value ignored) + * presence indicates DRR_WRITE_EMBEDDED records are permitted * } * * outnvl is unused @@ -5382,6 +5386,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) offset_t off; char *fromname = NULL; int fd; + boolean_t embedok; error = nvlist_lookup_int32(innvl, "fd", &fd); if (error != 0) @@ -5389,15 +5394,17 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); + embedok = nvlist_exists(innvl, "embedok"); + file_t *fp = getf(fd, cap_rights_init(&rights, CAP_READ)); if (fp == NULL) return (SET_ERROR(EBADF)); off = fp->f_offset; #ifdef illumos - error = dmu_send(snapname, fromname, fd, fp->f_vnode, &off); + error = dmu_send(snapname, fromname, embedok, fd, fp->f_vnode, &off); #else - error = dmu_send(snapname, fromname, fd, fp, &off); + error = dmu_send(snapname, fromname, embedok, fd, fp, &off); #endif #ifdef illumos @@ -6009,6 +6016,18 @@ zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, error = SET_ERROR(EFAULT); 
goto out; } + if (zc_iocparm->zfs_ioctl_version != ZFS_IOCVER_CURRENT) { + compat = B_TRUE; + + switch (zc_iocparm->zfs_ioctl_version) { + case ZFS_IOCVER_ZCMD: + cflag = ZFS_CMD_COMPAT_ZCMD; + break; + default: + error = SET_ERROR(EINVAL); + goto out; + } + } } if (compat) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c index ea5bb13a7..1b899d171 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c @@ -149,10 +149,15 @@ int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) { avl_tree_t *t = &zilog->zl_bp_tree; - const dva_t *dva = BP_IDENTITY(bp); + const dva_t *dva; zil_bp_node_t *zn; avl_index_t where; + if (BP_IS_EMBEDDED(bp)) + return (0); + + dva = BP_IDENTITY(bp); + if (avl_find(t, dva, &where) != NULL) return (SET_ERROR(EEXIST)); @@ -843,7 +848,7 @@ zil_lwb_write_done(zio_t *zio) ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); ASSERT(!BP_IS_GANG(zio->io_bp)); ASSERT(!BP_IS_HOLE(zio->io_bp)); - ASSERT(zio->io_bp->blk_fill == 0); + ASSERT(BP_GET_FILL(zio->io_bp) == 0); /* * Ensure the lwb buffer pointer is cleared before releasing diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index 95f6bbd6e..be30626a7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -37,6 +37,7 @@ #include #include #include +#include #include SYSCTL_DECL(_vfs_zfs); @@ -268,7 +269,7 @@ zio_buf_alloc(size_t size) size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; int flags = zio_exclude_metadata ? 
KM_NODEBUG : 0; - ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + ASSERT3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); @@ -703,6 +704,16 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio->io_physdone = physdone; zio->io_prop = *zp; + /* + * Data can be NULL if we are going to call zio_write_override() to + * provide the already-allocated BP. But we may need the data to + * verify a dedup hit (if requested). In this case, don't try to + * dedup (just take the already-allocated BP verbatim). + */ + if (data == NULL && zio->io_prop.zp_dedup_verify) { + zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; + } + return (zio); } @@ -742,6 +753,14 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { + + /* + * The check for EMBEDDED is a performance optimization. We + * process the free here (by ignoring it) rather than + * putting it on the list and then processing it in zio_free_sync(). + */ + if (BP_IS_EMBEDDED(bp)) + return; metaslab_check_free(spa, bp); /* @@ -767,13 +786,13 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_t *zio; enum zio_stage stage = ZIO_FREE_PIPELINE; - dprintf_bp(bp, "freeing in txg %llu, pass %u", - (longlong_t)txg, spa->spa_sync_pass); - ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); + if (BP_IS_EMBEDDED(bp)) + return (zio_null(pio, spa, NULL, NULL, NULL, 0)); + metaslab_check_free(spa, bp); arc_freed(spa, bp); @@ -801,6 +820,11 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, { zio_t *zio; + dprintf_bp(bp, "claiming in txg %llu", txg); + + if (BP_IS_EMBEDDED(bp)) + return (zio_null(pio, spa, NULL, NULL, NULL, 0)); + /* * A claim is an allocation of a specific block. 
Claims are needed * to support immediate writes in the intent log. The issue is that @@ -1020,12 +1044,20 @@ zio_read_bp_init(zio_t **ziop) if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW)) { - uint64_t psize = BP_GET_PSIZE(bp); + uint64_t psize = + BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); void *cbuf = zio_buf_alloc(psize); zio_push_transform(zio, cbuf, psize, psize, zio_decompress); } + if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + decode_embedded_bp_compressed(bp, zio->io_data); + } else { + ASSERT(!BP_IS_EMBEDDED(bp)); + } + if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; @@ -1070,6 +1102,9 @@ zio_write_bp_init(zio_t **ziop) *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + if (BP_IS_EMBEDDED(bp)) + return (ZIO_PIPELINE_CONTINUE); + /* * If we've been overridden and nopwrite is set then * set the flag accordingly to indicate that a nopwrite @@ -1118,7 +1153,7 @@ zio_write_bp_init(zio_t **ziop) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ - ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp), + ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } @@ -1130,9 +1165,38 @@ zio_write_bp_init(zio_t **ziop) if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); + } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && + zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && + spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { + encode_embedded_bp_compressed(bp, + cbuf, compress, lsize, psize); + BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); + BP_SET_TYPE(bp, zio->io_prop.zp_type); + BP_SET_LEVEL(bp, zio->io_prop.zp_level); + zio_buf_free(cbuf, lsize); + bp->blk_birth = zio->io_txg; + 
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + ASSERT(spa_feature_is_active(spa, + SPA_FEATURE_EMBEDDED_DATA)); + return (ZIO_PIPELINE_CONTINUE); } else { - ASSERT(psize < lsize); - zio_push_transform(zio, cbuf, psize, lsize, NULL); + /* + * Round up compressed size to MINBLOCKSIZE and + * zero the tail. + */ + size_t rounded = + P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE); + if (rounded > psize) { + bzero((char *)cbuf + psize, rounded - psize); + psize = rounded; + } + if (psize == lsize) { + compress = ZIO_COMPRESS_OFF; + zio_buf_free(cbuf, lsize); + } else { + zio_push_transform(zio, cbuf, + psize, lsize, NULL); + } } } @@ -2897,7 +2961,7 @@ zio_checksum_verified(zio_t *zio) /* * ========================================================================== * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. - * An error of 0 indictes success. ENXIO indicates whole-device failure, + * An error of 0 indicates success. ENXIO indicates whole-device failure, * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO * indicate errors that are specific to one I/O, and most likely permanent. * Any other error is presumed to be worse because we weren't expecting it. 
@@ -3009,7 +3073,7 @@ zio_done(zio_t **ziop) for (int w = 0; w < ZIO_WAIT_TYPES; w++) ASSERT(zio->io_children[c][w] == 0); - if (bp != NULL) { + if (bp != NULL && !BP_IS_EMBEDDED(bp)) { ASSERT(bp->blk_pad[0] == 0); ASSERT(bp->blk_pad[1] == 0); ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c index 30994a4ef..502335b22 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c @@ -41,14 +41,12 @@ typedef struct zcomp_stats { kstat_named_t zcompstat_attempts; kstat_named_t zcompstat_empty; - kstat_named_t zcompstat_skipped_minblocksize; kstat_named_t zcompstat_skipped_insufficient_gain; } zcomp_stats_t; static zcomp_stats_t zcomp_stats = { { "attempts", KSTAT_DATA_UINT64 }, { "empty", KSTAT_DATA_UINT64 }, - { "skipped_minblocksize", KSTAT_DATA_UINT64 }, { "skipped_insufficient_gain", KSTAT_DATA_UINT64 } }; @@ -103,7 +101,7 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len, size_t minblocksize) { uint64_t *word, *word_end; - size_t c_len, d_len, r_len; + size_t c_len, d_len; zio_compress_info_t *ci = &zio_compress_table[c]; ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); @@ -129,12 +127,7 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len, return (s_len); /* Compress at least 12.5% */ - d_len = P2ALIGN(s_len - (s_len >> 3), minblocksize); - if (d_len == 0) { - ZCOMPSTAT_BUMP(zcompstat_skipped_minblocksize); - return (s_len); - } - + d_len = s_len - (s_len >> 3); c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level); if (c_len > d_len) { @@ -142,19 +135,7 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len, return (s_len); } - /* - * Cool. We compressed at least as much as we were hoping to. - * For both security and repeatability, pad out the last sector. 
- */ - r_len = P2ROUNDUP(c_len, minblocksize); - if (r_len > c_len) { - bzero((char *)dst + c_len, r_len - c_len); - c_len = r_len; - } - ASSERT3U(c_len, <=, d_len); - ASSERT(P2PHASE(c_len, minblocksize) == 0); - return (c_len); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c index cc19e9b8e..6ae7cd9c2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c @@ -315,6 +315,8 @@ zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, zb->zb_object != ZVOL_OBJ || zb->zb_level != 0) return (0); + VERIFY(!BP_IS_EMBEDDED(bp)); + VERIFY3U(ma->ma_blks, ==, zb->zb_blkid); ma->ma_blks++; diff --git a/sys/conf/files b/sys/conf/files index c5519879b..f73dbac1b 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -179,6 +179,7 @@ cddl/contrib/opensolaris/common/zfs/zprop_common.c optional zfs compile-with " cddl/contrib/opensolaris/uts/common/fs/gfs.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/vnode.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c optional zfs compile-with "${ZFS_C}" +cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c optional zfs compile-with "${ZFS_C}" -- 2.45.0