From f8e901c41205ef80f720b268f402b6d0e1cb0317 Mon Sep 17 00:00:00 2001 From: delphij Date: Fri, 9 May 2014 08:10:33 +0000 Subject: [PATCH] MFC r264835: MFV r264829: 3897 zfs filesystem and snapshot limits git-svn-id: svn://svn.freebsd.org/base/stable/9@265754 ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f --- cddl/contrib/opensolaris/cmd/zfs/zfs.8 | 59 +- .../opensolaris/cmd/zpool/zpool-features.7 | 19 +- .../lib/libzfs/common/libzfs_dataset.c | 29 + .../lib/libzfs/common/libzfs_util.c | 11 + .../opensolaris/common/zfs/zfeature_common.c | 9 + .../opensolaris/common/zfs/zfeature_common.h | 1 + .../contrib/opensolaris/common/zfs/zfs_prop.c | 12 + .../uts/common/fs/zfs/dmu_objset.c | 11 +- .../opensolaris/uts/common/fs/zfs/dmu_send.c | 40 +- .../uts/common/fs/zfs/dsl_dataset.c | 159 ++++- .../uts/common/fs/zfs/dsl_destroy.c | 14 +- .../opensolaris/uts/common/fs/zfs/dsl_dir.c | 595 +++++++++++++++++- .../uts/common/fs/zfs/sys/dmu_send.h | 3 +- .../uts/common/fs/zfs/sys/dsl_dataset.h | 7 +- .../uts/common/fs/zfs/sys/dsl_dir.h | 18 +- .../opensolaris/uts/common/fs/zfs/zfs_ioctl.c | 21 +- .../opensolaris/uts/common/sys/fs/zfs.h | 6 +- 17 files changed, 986 insertions(+), 28 deletions(-) diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs.8 b/cddl/contrib/opensolaris/cmd/zfs/zfs.8 index 2948fd736..474a55e57 100644 --- a/cddl/contrib/opensolaris/cmd/zfs/zfs.8 +++ b/cddl/contrib/opensolaris/cmd/zfs/zfs.8 @@ -24,13 +24,13 @@ .\" Copyright (c) 2012, Glen Barber .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2013 Nexenta Systems, Inc. All Rights Reserved. -.\" Copyright (c) 2013, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2013, Steven Hartland .\" Copyright (c) 2014, Xin LI .\" .\" $FreeBSD$ .\" -.Dd March 20, 2014 +.Dd April 23, 2014 .Dt ZFS 8 .Os .Sh NAME @@ -536,6 +536,13 @@ if the snapshot has been marked for deferred destroy by using the .Qq Nm Cm destroy -d command. Otherwise, the property is .Cm off . +.It Sy filesystem_count +The total number of filesystems and volumes that exist under this location in the +dataset tree. +This value is only available when a +.Sy filesystem_limit +has +been set somewhere in the tree under which the dataset resides. .It Sy logicalreferenced The amount of space that is .Qq logically @@ -594,6 +601,12 @@ The compression ratio achieved for the space of this dataset, expressed as a multiplier. See also the .Sy compressratio property. +.It Sy snapshot_count +The total number of snapshots that exist under this location in the dataset tree. +This value is only available when a +.Sy snapshot_limit +has been set somewhere +in the tree under which the dataset resides. .It Sy type The type of dataset: .Sy filesystem , volume , No or Sy snapshot . @@ -1014,6 +1027,23 @@ The .Sy mlslabel property is currently not supported on .Fx . +.It Sy filesystem_limit Ns = Ns Ar count | Cm none +Limits the number of filesystems and volumes that can exist under this point in +the dataset tree. +The limit is not enforced if the user is allowed to change +the limit. +Setting a +.Sy filesystem_limit +on a descendent of a filesystem that +already has a +.Sy filesystem_limit +does not override the ancestor's +.Sy filesystem_limit , +but rather imposes an additional limit. +This feature must be enabled to be used +.Po see +.Xr zpool-features 7 +.Pc . .It Sy mountpoint Ns = Ns Ar path | Cm none | legacy Controls the mount point used for this file system. See the .Qq Sx Mount Points @@ -1055,6 +1085,27 @@ the ancestor's quota, but rather imposes an additional limit. Quotas cannot be set on volumes, as the .Sy volsize property acts as an implicit quota. +.It Sy snapshot_limit Ns = Ns Ar count | Cm none +Limits the number of snapshots that can be created on a dataset and its +descendents. +Setting a +.Sy snapshot_limit +on a descendent of a dataset that already +has a +.Sy snapshot_limit +does not override the ancestor's +.Sy snapshot_limit , +but +rather imposes an additional limit. +The limit is not enforced if the user is +allowed to change the limit. +For example, this means that recursive snapshots +taken from the global zone are counted against each delegated dataset within +a jail. +This feature must be enabled to be used +.Po see +.Xr zpool-features 7 +.Pc . .It Sy userquota@ Ns Ar user Ns = Ns Ar size | Cm none Limits the amount of space consumed by the specified user. Similar to the @@ -2698,6 +2749,7 @@ protocol .It dedup Ta property .It devices Ta property .It exec Ta property +.It filesystem_limit Ta property .It logbias Ta property .It jailed Ta property .It mlslabel Ta property @@ -2716,6 +2768,7 @@ protocol .It sharenfs Ta property .It sharesmb Ta property .It snapdir Ta property +.It snapshot_limit Ta property .It sync Ta property .It utf8only Ta property .It version Ta property @@ -3063,10 +3116,12 @@ pool/home/bob compression on local pool/home/bob atime on default pool/home/bob devices on default pool/home/bob exec on default +pool/home/bob filesystem_limit none default pool/home/bob setuid on default pool/home/bob readonly off default pool/home/bob jailed off default pool/home/bob snapdir hidden default +pool/home/bob snapshot_limit none default pool/home/bob aclmode discard default pool/home/bob aclinherit restricted default pool/home/bob canmount on default diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7 b/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7 index 5c6c47a52..f9d866b37 100644 --- a/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7 +++ b/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7 @@ -23,7 +23,7 @@ .\" .\" $FreeBSD$ .\" -.Dd January 2, 2014 +.Dd April 23, 2014 .Dt ZPOOL-FEATURES 7 .Os .Sh NAME @@ -187,6 +187,23 @@ This feature is .Sy active while there are any filesystems, volumes, or snapshots which were created after enabling this feature. +.It Sy filesystem_limits +.Bl -column "READ\-ONLY COMPATIBLE" "com.joyent:filesystem_limits" +.It GUID Ta com.joyent:filesystem_limits +.It READ\-ONLY COMPATIBLE Ta yes +.It DEPENDENCIES Ta extensible_dataset +.El +.Pp +This feature enables filesystem and snapshot limits. +These limits can be used +to control how many filesystems and/or snapshots can be created at the point in +the tree on which the limits are set. +.Pp +This feature is +.Sy active +once either of the limit properties has been +set on a dataset. +Once activated the feature is never deactivated. .It Sy lz4_compress .Bl -column "READ\-ONLY COMPATIBLE" "org.illumos:lz4_compress" .It GUID Ta org.illumos:lz4_compress diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c index b2813f467..bb46fe71e 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright (c) 2011-2012 Pawel Jakub Dawidek . @@ -1910,6 +1911,10 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, case ZFS_PROP_REFQUOTA: case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: + case ZFS_PROP_FILESYSTEM_LIMIT: + case ZFS_PROP_SNAPSHOT_LIMIT: + case ZFS_PROP_FILESYSTEM_COUNT: + case ZFS_PROP_SNAPSHOT_COUNT: *val = getprop_uint64(zhp, prop, source); if (*source == NULL) { @@ -2315,6 +2320,30 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, } break; + case ZFS_PROP_FILESYSTEM_LIMIT: + case ZFS_PROP_SNAPSHOT_LIMIT: + case ZFS_PROP_FILESYSTEM_COUNT: + case ZFS_PROP_SNAPSHOT_COUNT: + + if (get_numeric_property(zhp, prop, src, &source, &val) != 0) + return (-1); + + /* + * If limit is UINT64_MAX, we translate this into 'none' (unless + * literal is set), and indicate that it's the default value. + * Otherwise, we print the number nicely and indicate that it's + * set locally. + */ + if (literal) { + (void) snprintf(propbuf, proplen, "%llu", + (u_longlong_t)val); + } else if (val == UINT64_MAX) { + (void) strlcpy(propbuf, "none", proplen); + } else { + zfs_nicenum(val, propbuf, proplen); + } + break; + case ZFS_PROP_REFRATIO: case ZFS_PROP_COMPRESSRATIO: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c index 7b2f1c7e9..9878b6816 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. */ @@ -1266,6 +1267,16 @@ zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop, "use 'none' to disable quota/refquota")); goto error; } + + /* + * Special handling for "*_limit=none". In this case it's not + * 0 but UINT64_MAX. + */ + if ((type & ZFS_TYPE_DATASET) && isnone && + (prop == ZFS_PROP_FILESYSTEM_LIMIT || + prop == ZFS_PROP_SNAPSHOT_LIMIT)) { + *ivalp = UINT64_MAX; + } break; case PROP_TYPE_INDEX: diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c index 28b0c9e35..5aa9d2b8c 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c @@ -205,4 +205,13 @@ zpool_feature_init(void) "com.delphix:bookmarks", "bookmarks", "\"zfs bookmark\" command", B_TRUE, B_FALSE, B_FALSE, bookmarks_deps); + + static const spa_feature_t filesystem_limits_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_FS_SS_LIMIT, + "com.joyent:filesystem_limits", "filesystem_limits", + "Filesystem and snapshot limits.", B_TRUE, B_FALSE, B_FALSE, + filesystem_limits_deps); } diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h index fc5bdc574..1f668a5b5 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h @@ -48,6 +48,7 @@ typedef enum spa_feature { SPA_FEATURE_HOLE_BIRTH, SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_BOOKMARKS, + SPA_FEATURE_FS_SS_LIMIT, SPA_FEATURES } spa_feature_t; diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c index 84037321f..84cb82ff0 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c @@ -371,6 +371,18 @@ zfs_prop_init(void) zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, " | none", "REFRESERV"); + zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit", + UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, + " | none", "FSLIMIT"); + zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit", + UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + " | none", "SSLIMIT"); + zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count", + UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, + "", "FSCOUNT"); + zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count", + UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "", "SSCOUNT"); /* inherit number properties */ zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize", diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c index b5c0f5f5b..0c4fe26e3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -758,9 +759,11 @@ dmu_objset_create_check(void *arg, dmu_tx_t *tx) dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EEXIST)); } + error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, + doca->doca_cred); dsl_dir_rele(pdd, FTAG); - return (0); + return (error); } static void @@ -844,6 +847,12 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx) dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EXDEV)); } + error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, + doca->doca_cred); + if (error != 0) { + dsl_dir_rele(pdd, FTAG); + return (SET_ERROR(EDQUOT)); + } dsl_dir_rele(pdd, FTAG); error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index ea6d3b4dd..d3efa553e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -22,7 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2012, Martin Matuska . All rights reserved. */ @@ -806,6 +806,20 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, if (error != ENOENT) return (error == 0 ? EEXIST : error); + /* + * Check snapshot limit before receiving. We'll recheck again at the + * end, but might as well abort before receiving if we're already over + * the limit. + * + * Note that we do not check the file system limit with + * dsl_dir_fscount_check because the temporary %clones don't count + * against that limit. + */ + error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, + NULL, drba->drba_cred); + if (error != 0) + return (error); + if (fromguid != 0) { dsl_dataset_t *snap; uint64_t obj = ds->ds_phys->ds_prev_snap_obj; @@ -912,6 +926,25 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) if (error != 0) return (error); + /* + * Check filesystem and snapshot limits before receiving. We'll + * recheck snapshot limits again at the end (we create the + * filesystems and increment those counts during begin_sync). + */ + error = dsl_fs_ss_limit_check(ds->ds_dir, 1, + ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + error = dsl_fs_ss_limit_check(ds->ds_dir, 1, + ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + if (drba->drba_origin != NULL) { dsl_dataset_t *origin; error = dsl_dataset_hold(dp, drba->drba_origin, @@ -1021,6 +1054,7 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, drc->drc_tosnap = tosnap; drc->drc_tofs = tofs; drc->drc_force = force; + drc->drc_cred = CRED(); if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) drc->drc_byteswap = B_TRUE; @@ -1740,7 +1774,7 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx) return (error); } error = dsl_dataset_snapshot_check_impl(origin_head, - drc->drc_tosnap, tx, B_TRUE); + drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); dsl_dataset_rele(origin_head, FTAG); if (error != 0) return (error); @@ -1748,7 +1782,7 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx) error = dsl_destroy_head_check_impl(drc->drc_ds, 1); } else { error = dsl_dataset_snapshot_check_impl(drc->drc_ds, - drc->drc_tosnap, tx, B_TRUE); + drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); } return (error); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index 6efbfb718..417239722 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -22,7 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright (c) 2011 Martin Matuska * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 RackTop Systems. */ @@ -321,7 +321,8 @@ dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) } int -dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) +dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, + boolean_t adj_cnt) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; @@ -338,6 +339,11 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) err = zap_remove_norm(mos, snapobj, name, mt, tx); if (err == ENOTSUP && mt == MT_FIRST) err = zap_remove(mos, snapobj, name, tx); + + if (err == 0 && adj_cnt) + dsl_fs_ss_count_adjust(ds->ds_dir, -1, + DD_FIELD_SNAPSHOT_COUNT, tx); + return (err); } @@ -765,6 +771,21 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, dsl_deleg_set_create_perms(dd, tx, cr); + /* + * Since we're creating a new node we know it's a leaf, so we can + * initialize the counts if the limit feature is active. + */ + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { + uint64_t cnt = 0; + objset_t *os = dd->dd_pool->dp_meta_objset; + + dsl_dir_zapify(dd, tx); + VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, + sizeof (cnt), 1, &cnt, tx)); + VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, + sizeof (cnt), 1, &cnt, tx)); + } + dsl_dir_rele(dd, FTAG); /* @@ -971,11 +992,12 @@ typedef struct dsl_dataset_snapshot_arg { nvlist_t *ddsa_snaps; nvlist_t *ddsa_props; nvlist_t *ddsa_errors; + cred_t *ddsa_cr; } dsl_dataset_snapshot_arg_t; int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, - dmu_tx_t *tx, boolean_t recv) + dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr) { int error; uint64_t value; @@ -1013,6 +1035,18 @@ dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, if (!recv && DS_IS_INCONSISTENT(ds)) return (SET_ERROR(EBUSY)); + /* + * Skip the check for temporary snapshots or if we have already checked + * the counts in dsl_dataset_snapshot_check. This means we really only + * check the count here when we're receiving a stream. + */ + if (cnt != 0 && cr != NULL) { + error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, + ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr); + if (error != 0) + return (error); + } + error = dsl_dataset_snapshot_reserve_space(ds, tx); if (error != 0) return (error); @@ -1028,6 +1062,99 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) nvpair_t *pair; int rv = 0; + /* + * Pre-compute how many total new snapshots will be created for each + * level in the tree and below. This is needed for validating the + * snapshot limit when either taking a recursive snapshot or when + * taking multiple snapshots. + * + * The problem is that the counts are not actually adjusted when + * we are checking, only when we finally sync. For a single snapshot, + * this is easy, the count will increase by 1 at each node up the tree, + * but its more complicated for the recursive/multiple snapshot case. + * + * The dsl_fs_ss_limit_check function does recursively check the count + * at each level up the tree but since it is validating each snapshot + * independently we need to be sure that we are validating the complete + * count for the entire set of snapshots. We do this by rolling up the + * counts for each component of the name into an nvlist and then + * checking each of those cases with the aggregated count. + * + * This approach properly handles not only the recursive snapshot + * case (where we get all of those on the ddsa_snaps list) but also + * the sibling case (e.g. snapshot a/b and a/c so that we will also + * validate the limit on 'a' using a count of 2). + * + * We validate the snapshot names in the third loop and only report + * name errors once. + */ + if (dmu_tx_is_syncing(tx)) { + nvlist_t *cnt_track = NULL; + cnt_track = fnvlist_alloc(); + + /* Rollup aggregated counts into the cnt_track list */ + for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); + pair != NULL; + pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { + char *pdelim; + uint64_t val; + char nm[MAXPATHLEN]; + + (void) strlcpy(nm, nvpair_name(pair), sizeof (nm)); + pdelim = strchr(nm, '@'); + if (pdelim == NULL) + continue; + *pdelim = '\0'; + + do { + if (nvlist_lookup_uint64(cnt_track, nm, + &val) == 0) { + /* update existing entry */ + fnvlist_add_uint64(cnt_track, nm, + val + 1); + } else { + /* add to list */ + fnvlist_add_uint64(cnt_track, nm, 1); + } + + pdelim = strrchr(nm, '/'); + if (pdelim != NULL) + *pdelim = '\0'; + } while (pdelim != NULL); + } + + /* Check aggregated counts at each level */ + for (pair = nvlist_next_nvpair(cnt_track, NULL); + pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) { + int error = 0; + char *name; + uint64_t cnt = 0; + dsl_dataset_t *ds; + + name = nvpair_name(pair); + cnt = fnvpair_value_uint64(pair); + ASSERT(cnt > 0); + + error = dsl_dataset_hold(dp, name, FTAG, &ds); + if (error == 0) { + error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, + ZFS_PROP_SNAPSHOT_LIMIT, NULL, + ddsa->ddsa_cr); + dsl_dataset_rele(ds, FTAG); + } + + if (error != 0) { + if (ddsa->ddsa_errors != NULL) + fnvlist_add_int32(ddsa->ddsa_errors, + name, error); + rv = error; + /* only report one error for this check */ + break; + } + } + nvlist_free(cnt_track); + } + for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { int error = 0; @@ -1048,8 +1175,9 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) if (error == 0) error = dsl_dataset_hold(dp, dsname, FTAG, &ds); if (error == 0) { + /* passing 0/NULL skips dsl_fs_ss_limit_check */ error = dsl_dataset_snapshot_check_impl(ds, - atp + 1, tx, B_FALSE); + atp + 1, tx, B_FALSE, 0, NULL); dsl_dataset_rele(ds, FTAG); } @@ -1061,6 +1189,7 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) rv = error; } } + return (rv); } @@ -1088,6 +1217,7 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, bcmp(&os->os_phys->os_zil_header, &zero_zil, sizeof (zero_zil)) == 0); + dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx); /* * The origin's ds_creation_txg has to be < TXG_INITIAL @@ -1266,6 +1396,7 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) ddsa.ddsa_snaps = snaps; ddsa.ddsa_props = props; ddsa.ddsa_errors = errors; + ddsa.ddsa_cr = CRED(); if (error == 0) { error = dsl_sync_task(firstname, dsl_dataset_snapshot_check, @@ -1315,8 +1446,9 @@ dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx) if (error != 0) return (error); + /* NULL cred means no limit check for tmp snapshot */ error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, - tx, B_FALSE); + tx, B_FALSE, 0, NULL); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); @@ -1689,7 +1821,8 @@ dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, spa_history_log_internal_ds(ds, "rename", tx, "-> @%s", ddrsa->ddrsa_newsnapname); - VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx)); + VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx, + B_FALSE)); mutex_enter(&ds->ds_lock); (void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname); mutex_exit(&ds->ds_lock); @@ -1936,6 +2069,7 @@ typedef struct dsl_dataset_promote_arg { dsl_dataset_t *origin_origin; /* origin of the origin */ uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; char *err_ds; + cred_t *cr; } dsl_dataset_promote_arg_t; static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); @@ -1953,6 +2087,7 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) dsl_dataset_t *origin_ds; int err; uint64_t unused; + uint64_t ss_mv_cnt; err = promote_hold(ddpa, dp, FTAG); if (err != 0) @@ -1999,6 +2134,7 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) * Note however, if we stop before we reach the ORIGIN we get: * uN + kN + kN-1 + ... + kM - uM-1 */ + ss_mv_cnt = 0; ddpa->used = origin_ds->ds_phys->ds_referenced_bytes; ddpa->comp = origin_ds->ds_phys->ds_compressed_bytes; ddpa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; @@ -2007,6 +2143,8 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) uint64_t val, dlused, dlcomp, dluncomp; dsl_dataset_t *ds = snap->ds; + ss_mv_cnt++; + /* * If there are long holds, we won't be able to evict * the objset. @@ -2049,9 +2187,9 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) ddpa->origin_origin->ds_phys->ds_uncompressed_bytes; } - /* Check that there is enough space here */ + /* Check that there is enough space and limit headroom here */ err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, - ddpa->used); + 0, ss_mv_cnt, ddpa->used, ddpa->cr); if (err != 0) goto out; @@ -2191,10 +2329,12 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) /* move snap name entry */ VERIFY0(dsl_dataset_get_snapname(ds)); VERIFY0(dsl_dataset_snap_remove(origin_head, - ds->ds_snapname, tx)); + ds->ds_snapname, tx, B_TRUE)); VERIFY0(zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); + dsl_fs_ss_count_adjust(hds->ds_dir, 1, + DD_FIELD_SNAPSHOT_COUNT, tx); /* change containing dsl_dir */ dmu_buf_will_dirty(ds->ds_dbuf, tx); @@ -2432,6 +2572,7 @@ dsl_dataset_promote(const char *name, char *conflsnap) ddpa.ddpa_clonename = name; ddpa.err_ds = conflsnap; + ddpa.cr = CRED(); return (dsl_sync_task(name, dsl_dataset_promote_check, dsl_dataset_promote_sync, &ddpa, 2 + numsnaps)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c index 9e695cdec..639412cde 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2013 by Joyent, Inc. All rights reserved. */ #include @@ -430,7 +431,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) ASSERT3U(val, ==, obj); } #endif - VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx)); + VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE)); dsl_dataset_rele(ds_head, FTAG); if (ds_prev != NULL) @@ -656,6 +657,17 @@ dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) ASSERT0(dd->dd_phys->dd_head_dataset_obj); + /* + * Decrement the filesystem count for all parent filesystems. + * + * When we receive an incremental stream into a filesystem that already + * exists, a temporary clone is created. We never count this temporary + * clone, whose name begins with a '%'. + */ + if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL) + dsl_fs_ss_count_adjust(dd->dd_parent, -1, + DD_FIELD_FILESYSTEM_COUNT, tx); + /* * Remove our reservation. The impl() routine avoids setting the * actual property, which would require the (already destroyed) ds. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c index c8de009d1..d61dbc177 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c @@ -23,6 +23,7 @@ * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2014 Joyent, Inc. All rights reserved. */ #include @@ -44,7 +45,86 @@ #ifdef _KERNEL #include #endif +#include +#include +#include #include "zfs_namecheck.h" +#include "zfs_prop.h" + +/* + * Filesystem and Snapshot Limits + * ------------------------------ + * + * These limits are used to restrict the number of filesystems and/or snapshots + * that can be created at a given level in the tree or below. A typical + * use-case is with a delegated dataset where the administrator wants to ensure + * that a user within the zone is not creating too many additional filesystems + * or snapshots, even though they're not exceeding their space quota. + * + * The filesystem and snapshot counts are stored as extensible properties. This + * capability is controlled by a feature flag and must be enabled to be used. + * Once enabled, the feature is not active until the first limit is set. At + * that point, future operations to create/destroy filesystems or snapshots + * will validate and update the counts. + * + * Because the count properties will not exist before the feature is active, + * the counts are updated when a limit is first set on an uninitialized + * dsl_dir node in the tree (The filesystem/snapshot count on a node includes + * all of the nested filesystems/snapshots. Thus, a new leaf node has a + * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and + * snapshot count properties on a node indicate uninitialized counts on that + * node.) When first setting a limit on an uninitialized node, the code starts + * at the filesystem with the new limit and descends into all sub-filesystems + * to add the count properties. + * + * In practice this is lightweight since a limit is typically set when the + * filesystem is created and thus has no children. Once valid, changing the + * limit value won't require a re-traversal since the counts are already valid. + * When recursively fixing the counts, if a node with a limit is encountered + * during the descent, the counts are known to be valid and there is no need to + * descend into that filesystem's children. The counts on filesystems above the + * one with the new limit will still be uninitialized, unless a limit is + * eventually set on one of those filesystems. The counts are always recursively + * updated when a limit is set on a dataset, unless there is already a limit. + * When a new limit value is set on a filesystem with an existing limit, it is + * possible for the new limit to be less than the current count at that level + * since a user who can change the limit is also allowed to exceed the limit. + * + * Once the feature is active, then whenever a filesystem or snapshot is + * created, the code recurses up the tree, validating the new count against the + * limit at each initialized level. In practice, most levels will not have a + * limit set. If there is a limit at any initialized level up the tree, the + * check must pass or the creation will fail. Likewise, when a filesystem or + * snapshot is destroyed, the counts are recursively adjusted all the way up + * the initizized nodes in the tree. Renaming a filesystem into different point + * in the tree will first validate, then update the counts on each branch up to + * the common ancestor. A receive will also validate the counts and then update + * them. + * + * An exception to the above behavior is that the limit is not enforced if the + * user has permission to modify the limit. This is primarily so that + * recursive snapshots in the global zone always work. We want to prevent a + * denial-of-service in which a lower level delegated dataset could max out its + * limit and thus block recursive snapshots from being taken in the global zone. + * Because of this, it is possible for the snapshot count to be over the limit + * and snapshots taken in the global zone could cause a lower level dataset to + * hit or exceed its limit. The administrator taking the global zone recursive + * snapshot should be aware of this side-effect and behave accordingly. + * For consistency, the filesystem limit is also not enforced if the user can + * modify the limit. + * + * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check() + * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in + * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by + * dsl_dir_init_fs_ss_count(). + * + * There is a special case when we receive a filesystem that already exists. In + * this case a temporary clone name of %X is created (see dmu_recv_begin). We + * never update the filesystem counts for temporary clones. + * + * Likewise, we do not update the snapshot counts for temporary snapshots, + * such as those created by zfs diff. + */ static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); @@ -383,6 +463,402 @@ dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, return (err); } +/* + * If the counts are already initialized for this filesystem and its + * descendants then do nothing, otherwise initialize the counts. + * + * The counts on this filesystem, and those below, may be uninitialized due to + * either the use of a pre-existing pool which did not support the + * filesystem/snapshot limit feature, or one in which the feature had not yet + * been enabled. + * + * Recursively descend the filesystem tree and update the filesystem/snapshot + * counts on each filesystem below, then update the cumulative count on the + * current filesystem. If the filesystem already has a count set on it, + * then we know that its counts, and the counts on the filesystems below it, + * are already correct, so we don't have to update this filesystem. + */ +static void +dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx) +{ + uint64_t my_fs_cnt = 0; + uint64_t my_ss_cnt = 0; + dsl_pool_t *dp = dd->dd_pool; + objset_t *os = dp->dp_meta_objset; + zap_cursor_t *zc; + zap_attribute_t *za; + dsl_dataset_t *ds; + + ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)); + ASSERT(dsl_pool_config_held(dp)); + ASSERT(dmu_tx_is_syncing(tx)); + + dsl_dir_zapify(dd, tx); + + /* + * If the filesystem count has already been initialized then we + * don't need to recurse down any further. + */ + if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) + return; + + zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + + /* Iterate my child dirs */ + for (zap_cursor_init(zc, os, dd->dd_phys->dd_child_dir_zapobj); + zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { + dsl_dir_t *chld_dd; + uint64_t count; + + VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG, + &chld_dd)); + + /* + * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and + * temporary datasets. + */ + if (chld_dd->dd_myname[0] == '$' || + chld_dd->dd_myname[0] == '%') { + dsl_dir_rele(chld_dd, FTAG); + continue; + } + + my_fs_cnt++; /* count this child */ + + dsl_dir_init_fs_ss_count(chld_dd, tx); + + VERIFY0(zap_lookup(os, chld_dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count)); + my_fs_cnt += count; + VERIFY0(zap_lookup(os, chld_dd->dd_object, + DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count)); + my_ss_cnt += count; + + dsl_dir_rele(chld_dd, FTAG); + } + zap_cursor_fini(zc); + /* Count my snapshots (we counted children's snapshots above) */ + VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, FTAG, &ds)); + + for (zap_cursor_init(zc, os, ds->ds_phys->ds_snapnames_zapobj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + /* Don't count temporary snapshots */ + if (za->za_name[0] != '%') + my_ss_cnt++; + } + + dsl_dataset_rele(ds, FTAG); + + kmem_free(zc, sizeof (zap_cursor_t)); + kmem_free(za, sizeof (zap_attribute_t)); + + /* we're in a sync task, update counts */ + dmu_buf_will_dirty(dd->dd_dbuf, tx); + VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, + sizeof (my_fs_cnt), 1, &my_fs_cnt, tx)); + VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, + sizeof (my_ss_cnt), 1, &my_ss_cnt, tx)); +} + +static int +dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx) +{ + char *ddname = (char *)arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + dsl_dir_t *dd; + int error; + + error = dsl_dataset_hold(dp, ddname, FTAG, &ds); + if (error != 0) + return (error); + + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + dd = ds->ds_dir; + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) && + dsl_dir_is_zapified(dd) && + zap_contains(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT) == 0) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EALREADY)); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx) +{ + char *ddname = (char *)arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + spa_t *spa; + + VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds)); + + spa = dsl_dataset_get_spa(ds); + + if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) { + /* + * Since the feature was not active and we're now setting a + * limit, increment the feature-active counter so that the + * feature becomes active for the first time. + * + * We are already in a sync task so we can update the MOS. + */ + spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx); + } + + /* + * Since we are now setting a non-UINT64_MAX limit on the filesystem, + * we need to ensure the counts are correct. Descend down the tree from + * this point and update all of the counts to be accurate. + */ + dsl_dir_init_fs_ss_count(ds->ds_dir, tx); + + dsl_dataset_rele(ds, FTAG); +} + +/* + * Make sure the feature is enabled and activate it if necessary. + * Since we're setting a limit, ensure the on-disk counts are valid. + * This is only called by the ioctl path when setting a limit value. + * + * We do not need to validate the new limit, since users who can change the + * limit are also allowed to exceed the limit. + */ +int +dsl_dir_activate_fs_ss_limit(const char *ddname) +{ + int error; + + error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check, + dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0); + + if (error == EALREADY) + error = 0; + + return (error); +} + +/* + * Used to determine if the filesystem_limit or snapshot_limit should be + * enforced. We allow the limit to be exceeded if the user has permission to + * write the property value. We pass in the creds that we got in the open + * context since we will always be the GZ root in syncing context. We also have + * to handle the case where we are allowed to change the limit on the current + * dataset, but there may be another limit in the tree above. + * + * We can never modify these two properties within a non-global zone. In + * addition, the other checks are modeled on zfs_secpolicy_write_perms. We + * can't use that function since we are already holding the dp_config_rwlock. + * In addition, we already have the dd and dealing with snapshots is simplified + * in this code. + */ + +typedef enum { + ENFORCE_ALWAYS, + ENFORCE_NEVER, + ENFORCE_ABOVE +} enforce_res_t; + +static enforce_res_t +dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr) +{ + enforce_res_t enforce = ENFORCE_ALWAYS; + uint64_t obj; + dsl_dataset_t *ds; + uint64_t zoned; + + ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || + prop == ZFS_PROP_SNAPSHOT_LIMIT); + +#ifdef _KERNEL +#ifdef __FreeBSD__ + if (jailed(cr)) +#else + if (crgetzoneid(cr) != GLOBAL_ZONEID) +#endif + return (ENFORCE_ALWAYS); + + if (secpolicy_zfs(cr) == 0) + return (ENFORCE_NEVER); +#endif + + if ((obj = dd->dd_phys->dd_head_dataset_obj) == 0) + return (ENFORCE_ALWAYS); + + ASSERT(dsl_pool_config_held(dd->dd_pool)); + + if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0) + return (ENFORCE_ALWAYS); + + if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) { + /* Only root can access zoned fs's from the GZ */ + enforce = ENFORCE_ALWAYS; + } else { + if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0) + enforce = ENFORCE_ABOVE; + } + + dsl_dataset_rele(ds, FTAG); + return (enforce); +} + +/* + * Check if adding additional child filesystem(s) would exceed any filesystem + * limits or adding additional snapshot(s) would exceed any snapshot limits. + * The prop argument indicates which limit to check. + * + * Note that all filesystem limits up to the root (or the highest + * initialized) filesystem or the given ancestor must be satisfied. + */ +int +dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, + dsl_dir_t *ancestor, cred_t *cr) +{ + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t limit, count; + char *count_prop; + enforce_res_t enforce; + int err = 0; + + ASSERT(dsl_pool_config_held(dd->dd_pool)); + ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || + prop == ZFS_PROP_SNAPSHOT_LIMIT); + + /* + * If we're allowed to change the limit, don't enforce the limit + * e.g. this can happen if a snapshot is taken by an administrative + * user in the global zone (i.e. a recursive snapshot by root). + * However, we must handle the case of delegated permissions where we + * are allowed to change the limit on the current dataset, but there + * is another limit in the tree above. + */ + enforce = dsl_enforce_ds_ss_limits(dd, prop, cr); + if (enforce == ENFORCE_NEVER) + return (0); + + /* + * e.g. if renaming a dataset with no snapshots, count adjustment + * is 0. + */ + if (delta == 0) + return (0); + + if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { + /* + * We don't enforce the limit for temporary snapshots. This is + * indicated by a NULL cred_t argument. + */ + if (cr == NULL) + return (0); + + count_prop = DD_FIELD_SNAPSHOT_COUNT; + } else { + count_prop = DD_FIELD_FILESYSTEM_COUNT; + } + + /* + * If an ancestor has been provided, stop checking the limit once we + * hit that dir. We need this during rename so that we don't overcount + * the check once we recurse up to the common ancestor. + */ + if (ancestor == dd) + return (0); + + /* + * If we hit an uninitialized node while recursing up the tree, we can + * stop since we know there is no limit here (or above). The counts are + * not valid on this node and we know we won't touch this node's counts. + */ + if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object, + count_prop, sizeof (count), 1, &count) == ENOENT) + return (0); + + err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL, + B_FALSE); + if (err != 0) + return (err); + + /* Is there a limit which we've hit? */ + if (enforce == ENFORCE_ALWAYS && (count + delta) > limit) + return (SET_ERROR(EDQUOT)); + + if (dd->dd_parent != NULL) + err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop, + ancestor, cr); + + return (err); +} + +/* + * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all + * parents. When a new filesystem/snapshot is created, increment the count on + * all parents, and when a filesystem/snapshot is destroyed, decrement the + * count. + */ +void +dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop, + dmu_tx_t *tx) +{ + int err; + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t count; + + ASSERT(dsl_pool_config_held(dd->dd_pool)); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 || + strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0); + + /* + * When we receive an incremental stream into a filesystem that already + * exists, a temporary clone is created. We don't count this temporary + * clone, whose name begins with a '%'. We also ignore hidden ($FREE, + * $MOS & $ORIGIN) objsets. + */ + if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') && + strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0) + return; + + /* + * e.g. if renaming a dataset with no snapshots, count adjustment is 0 + */ + if (delta == 0) + return; + + /* + * If we hit an uninitialized node while recursing up the tree, we can + * stop since we know the counts are not valid on this node and we + * know we shouldn't touch this node's counts. An uninitialized count + * on the node indicates that either the feature has not yet been + * activated or there are no limits on this part of the tree. + */ + if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object, + prop, sizeof (count), 1, &count)) == ENOENT) + return; + VERIFY0(err); + + count += delta; + /* Use a signed verify to make sure we're not neg. */ + VERIFY3S(count, >=, 0); + + VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count, + tx)); + + /* Roll up this additional count into our ancestors */ + if (dd->dd_parent != NULL) + dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx); +} + uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx) @@ -407,8 +883,12 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, ddphys = dbuf->db_data; ddphys->dd_creation_time = gethrestime_sec(); - if (pds) + if (pds) { ddphys->dd_parent_obj = pds->dd_object; + + /* update the filesystem counts */ + dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx); + } ddphys->dd_props_zapobj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); ddphys->dd_child_dir_zapobj = zap_create(mos, @@ -457,6 +937,22 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) } mutex_exit(&dd->dd_lock); + if (dsl_dir_is_zapified(dd)) { + uint64_t count; + objset_t *os = dd->dd_pool->dp_meta_objset; + + if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, + sizeof (count), 1, &count) == 0) { + dsl_prop_nvlist_add_uint64(nv, + ZFS_PROP_FILESYSTEM_COUNT, count); + } + if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, + sizeof (count), 1, &count) == 0) { + dsl_prop_nvlist_add_uint64(nv, + ZFS_PROP_SNAPSHOT_COUNT, count); + } + } + if (dsl_dir_is_clone(dd)) { dsl_dataset_t *ds; char buf[MAXNAMELEN]; @@ -1155,6 +1651,7 @@ would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) typedef struct dsl_dir_rename_arg { const char *ddra_oldname; const char *ddra_newname; + cred_t *ddra_cred; } dsl_dir_rename_arg_t; /* ARGSUSED */ @@ -1219,10 +1716,50 @@ dsl_dir_rename_check(void *arg, dmu_tx_t *tx) } } + if (dmu_tx_is_syncing(tx)) { + if (spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_FS_SS_LIMIT)) { + /* + * Although this is the check function and we don't + * normally make on-disk changes in check functions, + * we need to do that here. + * + * Ensure this portion of the tree's counts have been + * initialized in case the new parent has limits set. + */ + dsl_dir_init_fs_ss_count(dd, tx); + } + } + if (newparent != dd->dd_parent) { /* is there enough space? */ uint64_t myspace = MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved); + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t fs_cnt = 0; + uint64_t ss_cnt = 0; + + if (dsl_dir_is_zapified(dd)) { + int err; + + err = zap_lookup(os, dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, + &fs_cnt); + if (err != ENOENT && err != 0) + return (err); + + /* + * have to add 1 for the filesystem itself that we're + * moving + */ + fs_cnt++; + + err = zap_lookup(os, dd->dd_object, + DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, + &ss_cnt); + if (err != ENOENT && err != 0) + return (err); + } /* no rename into our descendant */ if (closest_common_ancestor(dd, newparent) == dd) { @@ -1232,7 +1769,7 @@ dsl_dir_rename_check(void *arg, dmu_tx_t *tx) } error = dsl_dir_transfer_possible(dd->dd_parent, - newparent, myspace); + newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); @@ -1264,6 +1801,37 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) "-> %s", ddra->ddra_newname); if (newparent != dd->dd_parent) { + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t fs_cnt = 0; + uint64_t ss_cnt = 0; + + /* + * We already made sure the dd counts were initialized in the + * check function. + */ + if (spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_FS_SS_LIMIT)) { + VERIFY0(zap_lookup(os, dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, + &fs_cnt)); + /* add 1 for the filesystem itself that we're moving */ + fs_cnt++; + + VERIFY0(zap_lookup(os, dd->dd_object, + DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, + &ss_cnt)); + } + + dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt, + DD_FIELD_FILESYSTEM_COUNT, tx); + dsl_fs_ss_count_adjust(newparent, fs_cnt, + DD_FIELD_FILESYSTEM_COUNT, tx); + + dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt, + DD_FIELD_SNAPSHOT_COUNT, tx); + dsl_fs_ss_count_adjust(newparent, ss_cnt, + DD_FIELD_SNAPSHOT_COUNT, tx); + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, -dd->dd_phys->dd_used_bytes, -dd->dd_phys->dd_compressed_bytes, @@ -1321,17 +1889,20 @@ dsl_dir_rename(const char *oldname, const char *newname) ddra.ddra_oldname = oldname; ddra.ddra_newname = newname; + ddra.ddra_cred = CRED(); return (dsl_sync_task(oldname, dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3)); } int -dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space) +dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, + uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr) { dsl_dir_t *ancestor; int64_t adelta; uint64_t avail; + int err; ancestor = closest_common_ancestor(sdd, tdd); adelta = would_change(sdd, -space, ancestor); @@ -1339,6 +1910,15 @@ dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space) if (avail < space) return (SET_ERROR(ENOSPC)); + err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT, + ancestor, cr); + if (err != 0) + return (err); + err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT, + ancestor, cr); + if (err != 0) + return (err); + return (0); } @@ -1371,3 +1951,12 @@ dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx) objset_t *mos = dd->dd_pool->dp_meta_objset; dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx); } + +boolean_t +dsl_dir_is_zapified(dsl_dir_t *dd) +{ + dmu_object_info_t doi; + + dmu_object_info_from_db(dd->dd_dbuf, &doi); + return (doi.doi_type == DMU_OTN_ZAP_METADATA); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h index 13ef5115b..2b95b8cf7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. */ #ifndef _DMU_SEND_H @@ -63,6 +63,7 @@ typedef struct dmu_recv_cookie { zio_cksum_t drc_cksum; uint64_t drc_newsnapobj; void *drc_owner; + cred_t *drc_cred; } dmu_recv_cookie_t; int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h index 5132ef9ce..d9552b226 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. */ @@ -266,7 +266,7 @@ int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_dataset_t *origin_head, dmu_tx_t *tx); int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, - dmu_tx_t *tx, boolean_t recv); + dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr); void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dmu_tx_t *tx); @@ -276,7 +276,8 @@ void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds); int dsl_dataset_get_snapname(dsl_dataset_t *ds); int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value); -int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx); +int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, + boolean_t adj_cnt); void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, zprop_source_t source, uint64_t value, dmu_tx_t *tx); void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h index 87e2ce831..752ccadb7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ #ifndef _SYS_DSL_DIR_H @@ -38,6 +39,14 @@ extern "C" { struct dsl_dataset; +/* + * DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object. + * They should be of the format :. + */ + +#define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count" +#define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count" + typedef enum dd_used { DD_USED_HEAD, DD_USED_SNAP, @@ -129,8 +138,13 @@ int dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota); int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, uint64_t reservation); +int dsl_dir_activate_fs_ss_limit(const char *); +int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *, + cred_t *); +void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *); int dsl_dir_rename(const char *oldname, const char *newname); -int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space); +int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, + uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *); boolean_t dsl_dir_is_clone(dsl_dir_t *dd); void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, uint64_t reservation, cred_t *cr, dmu_tx_t *tx); @@ -138,6 +152,8 @@ void dsl_dir_snap_cmtime_update(dsl_dir_t *dd); timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd); void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx); +void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx); +boolean_t dsl_dir_is_zapified(dsl_dir_t *dd); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index 6bf36f32b..83f9369a4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -25,7 +25,7 @@ * All rights reserved. * Copyright 2013 Martin Matuska . All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. @@ -634,12 +634,14 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, break; case ZFS_PROP_QUOTA: + case ZFS_PROP_FILESYSTEM_LIMIT: + case ZFS_PROP_SNAPSHOT_LIMIT: if (!INGLOBALZONE(curthread)) { uint64_t zoned; char setpoint[MAXNAMELEN]; /* * Unprivileged users are allowed to modify the - * quota on things *under* (ie. contained by) + * limit on things *under* (ie. contained by) * the thing they own. */ if (dsl_prop_get_integer(dsname, "jailed", &zoned, @@ -2457,6 +2459,21 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, case ZFS_PROP_REFQUOTA: err = dsl_dataset_set_refquota(dsname, source, intval); break; + case ZFS_PROP_FILESYSTEM_LIMIT: + case ZFS_PROP_SNAPSHOT_LIMIT: + if (intval == UINT64_MAX) { + /* clearing the limit, just do it */ + err = 0; + } else { + err = dsl_dir_activate_fs_ss_limit(dsname); + } + /* + * Set err to -1 to force the zfs_set_prop_nvlist code down the + * default path to set the value in the nvlist. + */ + if (err == 0) + err = -1; + break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h index 47450661e..84d7af6ed 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2012, Martin Matuska . All rights reserved. */ @@ -145,6 +145,10 @@ typedef enum { ZFS_PROP_LOGICALUSED, ZFS_PROP_LOGICALREFERENCED, ZFS_PROP_INCONSISTENT, /* not exposed to the user */ + ZFS_PROP_FILESYSTEM_LIMIT, + ZFS_PROP_SNAPSHOT_LIMIT, + ZFS_PROP_FILESYSTEM_COUNT, + ZFS_PROP_SNAPSHOT_COUNT, ZFS_NUM_PROPS } zfs_prop_t; -- 2.45.0