From ecbb9492636dc9b3d7178a16b65d3b8731360951 Mon Sep 17 00:00:00 2001 From: smh Date: Wed, 5 Jun 2013 13:03:47 +0000 Subject: [PATCH] Added ZFS TRIM support which is enabled by default. To disable ZFS TRIM support set vfs.zfs.trim.enabled=0 in loader.conf. Creating new ZFS pools and adding new devices to existing pools first performs a full device level TRIM which can take a significant amount of time. The sysctl vfs.zfs.vdev.trim_on_init can be set to 0 to disable this behaviour. ZFS TRIM requires the underlying device support BIO_DELETE which is currently provided by methods such as ATA TRIM and SCSI UNMAP via CAM, which are typically supported by SSD's. Stats for ZFS TRIM can be monitored by looking at the sysctl's under kstat.zfs.misc.zio_trim. MFC r240868: Add TRIM support MFC r244155: Renamed zfs trim stats MFC r244187: Upgrade TRIM free request sizes optimisation MFC r244188: Added vfs.zfs.vdev.trim_on_init sysctl MFC r248572: Add TRIM support for L2ARC MFC r248574: Improve TXG handling in the TRIM module MFC r248575: TRIM cache devices based on time instead of TXGs MFC r248576: Names the ZFS TRIM thread MFC r248577: Optimisation of TRIM processing MFC r248602: Fix for building libzpool under i386 MFC r249921: Enabled ZFS TRIM by default git-svn-id: svn://svn.freebsd.org/base/stable/9@251419 ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f --- UPDATING | 16 + cddl/lib/libzpool/Makefile | 2 +- .../opensolaris/kern/opensolaris_kstat.c | 2 +- sys/cddl/compat/opensolaris/sys/dkio.h | 2 + sys/cddl/compat/opensolaris/sys/kstat.h | 2 + sys/cddl/compat/opensolaris/sys/time.h | 1 + .../opensolaris/uts/common/fs/zfs/arc.c | 7 + .../opensolaris/uts/common/fs/zfs/dsl_scan.c | 5 +- .../opensolaris/uts/common/fs/zfs/spa.c | 19 +- .../uts/common/fs/zfs/sys/spa_impl.h | 3 + .../uts/common/fs/zfs/sys/trim_map.h | 51 ++ .../opensolaris/uts/common/fs/zfs/sys/vdev.h | 1 + .../uts/common/fs/zfs/sys/vdev_impl.h | 2 + .../opensolaris/uts/common/fs/zfs/sys/zio.h | 47 +- .../uts/common/fs/zfs/sys/zio_impl.h | 10 +- .../opensolaris/uts/common/fs/zfs/trim_map.c | 638 ++++++++++++++++++ .../opensolaris/uts/common/fs/zfs/vdev.c | 9 + .../opensolaris/uts/common/fs/zfs/vdev_geom.c | 56 +- .../uts/common/fs/zfs/vdev_label.c | 23 +- .../uts/common/fs/zfs/vdev_mirror.c | 7 +- .../uts/common/fs/zfs/vdev_raidz.c | 36 +- .../opensolaris/uts/common/fs/zfs/zil.c | 5 + .../opensolaris/uts/common/fs/zfs/zio.c | 110 ++- sys/modules/zfs/Makefile | 1 + 24 files changed, 1001 insertions(+), 54 deletions(-) create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h create mode 100644 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c diff --git a/UPDATING b/UPDATING index c1dae5f7f..21549c5f6 100644 --- a/UPDATING +++ b/UPDATING @@ -11,6 +11,22 @@ handbook: Items affecting the ports and packages system can be found in /usr/ports/UPDATING. Please read that file before running portupgrade. +20130605: + Added ZFS TRIM support which is enabled by default. To disable + ZFS TRIM support set vfs.zfs.trim.enabled=0 in loader.conf. + + Creating new ZFS pools and adding new devices to existing pools + first performs a full device level TRIM which can take a significant + amount of time. The sysctl vfs.zfs.vdev.trim_on_init can be set to 0 + to disable this behaviour. + + ZFS TRIM requires the underlying device support BIO_DELETE which + is currently provided by methods such as ATA TRIM and SCSI UNMAP + via CAM, which are typically supported by SSD's. + + Stats for ZFS TRIM can be monitored by looking at the sysctl's + under kstat.zfs.misc.zio_trim. + 20130524: `list' command has been added to hastctl(8). For now, it is full equivalent of `status' command. diff --git a/cddl/lib/libzpool/Makefile b/cddl/lib/libzpool/Makefile index 417c1ccb1..b159d3af7 100644 --- a/cddl/lib/libzpool/Makefile +++ b/cddl/lib/libzpool/Makefile @@ -26,7 +26,7 @@ ATOMIC_SRCS= opensolaris_atomic.c LIB= zpool -ZFS_COMMON_SRCS= ${ZFS_COMMON_OBJS:C/.o$/.c/} vdev_file.c +ZFS_COMMON_SRCS= ${ZFS_COMMON_OBJS:C/.o$/.c/} vdev_file.c trim_map.c ZFS_SHARED_SRCS= ${ZFS_SHARED_OBJS:C/.o$/.c/} KERNEL_SRCS= kernel.c taskq.c util.c LIST_SRCS= list.c diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_kstat.c b/sys/cddl/compat/opensolaris/kern/opensolaris_kstat.c index 621aec09b..ba6510992 100644 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_kstat.c +++ b/sys/cddl/compat/opensolaris/kern/opensolaris_kstat.c @@ -118,7 +118,7 @@ kstat_install(kstat_t *ksp) SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, ksent->name, CTLTYPE_U64 | CTLFLAG_RD, ksent, sizeof(*ksent), - kstat_sysctl, "QU", ""); + kstat_sysctl, "QU", ksent->desc); } } diff --git a/sys/cddl/compat/opensolaris/sys/dkio.h b/sys/cddl/compat/opensolaris/sys/dkio.h index a365c335c..50cafabb1 100644 --- a/sys/cddl/compat/opensolaris/sys/dkio.h +++ b/sys/cddl/compat/opensolaris/sys/dkio.h @@ -75,6 +75,8 @@ extern "C" { */ #define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */ +#define DKIOCTRIM (DKIOC|35) /* TRIM a block */ + struct dk_callback { void (*dkc_callback)(void *dkc_cookie, int error); void *dkc_cookie; diff --git a/sys/cddl/compat/opensolaris/sys/kstat.h b/sys/cddl/compat/opensolaris/sys/kstat.h index d73bd220f..acf6626cc 100644 --- a/sys/cddl/compat/opensolaris/sys/kstat.h +++ b/sys/cddl/compat/opensolaris/sys/kstat.h @@ -53,6 +53,8 @@ typedef struct kstat_named { #define KSTAT_DATA_INT64 3 #define KSTAT_DATA_UINT64 4 uchar_t data_type; +#define KSTAT_DESCLEN 128 + char desc[KSTAT_DESCLEN]; union { uint64_t ui64; } value; diff --git a/sys/cddl/compat/opensolaris/sys/time.h b/sys/cddl/compat/opensolaris/sys/time.h index 97c82fc62..2e8344721 100644 --- a/sys/cddl/compat/opensolaris/sys/time.h +++ b/sys/cddl/compat/opensolaris/sys/time.h @@ -35,6 +35,7 @@ #define MILLISEC 1000 #define MICROSEC 1000000 #define NANOSEC 1000000000 +#define TIME_MAX LLONG_MAX typedef longlong_t hrtime_t; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 8d8d7d06b..accf2a569 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -130,6 +130,7 @@ #endif #include #include +#include #include #include @@ -1691,6 +1692,8 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) } if (l2hdr != NULL) { + trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr, + hdr->b_size, 0); list_remove(l2hdr->b_dev->l2ad_buflist, hdr); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); @@ -3528,6 +3531,8 @@ arc_release(arc_buf_t *buf, void *tag) buf->b_private = NULL; if (l2hdr) { + trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr, + hdr->b_size, 0); list_remove(l2hdr->b_dev->l2ad_buflist, hdr); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); ARCSTAT_INCR(arcstat_l2_size, -buf_size); @@ -4442,6 +4447,8 @@ l2arc_write_done(zio_t *zio) list_remove(buflist, ab); abl2 = ab->b_l2hdr; ab->b_l2hdr = NULL; + trim_map_free(abl2->b_dev->l2ad_vdev, abl2->b_daddr, + ab->b_size, 0); kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c index b7569d82e..8fe24f54e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c @@ -397,7 +397,8 @@ void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) { ASSERT(dsl_pool_sync_context(dp)); - zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags)); + zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp), + pio->io_flags)); } static uint64_t @@ -1364,7 +1365,7 @@ dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) } zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, - dmu_tx_get_txg(tx), bp, 0)); + dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0)); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index f0ba5980b..c23fa0a7d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -67,6 +67,7 @@ #include #include #include +#include #ifdef _KERNEL #include @@ -1001,6 +1002,11 @@ spa_activate(spa_t *spa, int mode) spa_create_zio_taskqs(spa); } + /* + * Start TRIM thread. + */ + trim_thread_create(spa); + list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), @@ -1029,6 +1035,12 @@ spa_deactivate(spa_t *spa) ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); + /* + * Stop TRIM thread in case spa_unload() wasn't called directly + * before spa_deactivate(). + */ + trim_thread_destroy(spa); + txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); @@ -1144,6 +1156,11 @@ spa_unload(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); + /* + * Stop TRIM thread. + */ + trim_thread_destroy(spa); + /* * Stop async tasks. */ @@ -5875,7 +5892,7 @@ spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) zio_t *zio = arg; zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, - zio->io_flags)); + BP_GET_PSIZE(bp), zio->io_flags)); return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h index cfc57e4c0..e9526a298 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -221,6 +221,9 @@ struct spa { spa_proc_state_t spa_proc_state; /* see definition */ struct proc *spa_proc; /* "zpool-poolname" process */ uint64_t spa_did; /* if procp != p0, did of t1 */ + kthread_t *spa_trim_thread; /* thread sending TRIM I/Os */ + kmutex_t spa_trim_lock; /* protects spa_trim_cv */ + kcondvar_t spa_trim_cv; /* used to notify TRIM thread */ boolean_t spa_autoreplace; /* autoreplace set in open */ int spa_vdev_locks; /* locks grabbed */ uint64_t spa_creation_version; /* version at pool creation */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h new file mode 100644 index 000000000..f228d0766 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h @@ -0,0 +1,51 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012 Pawel Jakub Dawidek . + * All rights reserved. + */ + +#ifndef _SYS_TRIM_MAP_H +#define _SYS_TRIM_MAP_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern void trim_map_create(vdev_t *vd); +extern void trim_map_destroy(vdev_t *vd); +extern void trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg); +extern boolean_t trim_map_write_start(zio_t *zio); +extern void trim_map_write_done(zio_t *zio); + +extern void trim_thread_create(spa_t *spa); +extern void trim_thread_destroy(spa_t *spa); +extern void trim_thread_wakeup(spa_t *spa); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TRIM_MAP_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h index 5a7836612..920b1b0b3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h @@ -46,6 +46,7 @@ typedef enum vdev_dtl_type { } vdev_dtl_type_t; extern boolean_t zfs_nocacheflush; +extern boolean_t zfs_trim_enabled; extern int vdev_open(vdev_t *); extern void vdev_open_children(vdev_t *); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h index 9825d06ae..ba9704bac 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -183,6 +183,7 @@ struct vdev { uint64_t vdev_unspare; /* unspare when resilvering done */ hrtime_t vdev_last_try; /* last reopen time */ boolean_t vdev_nowritecache; /* true if flushwritecache failed */ + boolean_t vdev_notrim; /* true if trim failed */ boolean_t vdev_checkremove; /* temporary online test */ boolean_t vdev_forcefault; /* force online fault */ boolean_t vdev_splitting; /* split or repair in progress */ @@ -198,6 +199,7 @@ struct vdev { spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ vdev_aux_t vdev_label_aux; /* on-disk aux state */ + struct trim_map *vdev_trimmap; /* * For DTrace to work in userland (libzpool) context, these fields must diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index 13c901ead..342e7c1a6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -137,7 +138,8 @@ enum zio_compress { #define ZIO_PRIORITY_RESILVER (zio_priority_table[9]) #define ZIO_PRIORITY_SCRUB (zio_priority_table[10]) #define ZIO_PRIORITY_DDT_PREFETCH (zio_priority_table[11]) -#define ZIO_PRIORITY_TABLE_SIZE 12 +#define ZIO_PRIORITY_TRIM (zio_priority_table[12]) +#define ZIO_PRIORITY_TABLE_SIZE 13 #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 @@ -367,6 +369,39 @@ typedef struct zio_link { list_node_t zl_child_node; } zio_link_t; +/* + * Used for TRIM kstat. + */ +typedef struct zio_trim_stats { + /* + * Number of bytes successfully TRIMmed. + */ + kstat_named_t bytes; + + /* + * Number of successful TRIM requests. + */ + kstat_named_t success; + + /* + * Number of TRIM requests that failed because TRIM is not + * supported. + */ + kstat_named_t unsupported; + + /* + * Number of TRIM requests that failed for other reasons. + */ + kstat_named_t failed; +} zio_trim_stats_t; + +extern zio_trim_stats_t zio_trim_stats; + +#define ZIO_TRIM_STAT_INCR(stat, val) \ + atomic_add_64(&zio_trim_stats.stat.value.ui64, (val)); +#define ZIO_TRIM_STAT_BUMP(stat) \ + ZIO_TRIM_STAT_INCR(stat, 1); + struct zio { /* Core information about this I/O */ zbookmark_t io_bookmark; @@ -441,6 +476,8 @@ struct zio { /* FreeBSD only. */ struct ostask io_task; #endif + avl_node_t io_trim_node; + list_node_t io_trim_link; }; extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, @@ -472,8 +509,8 @@ extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, zio_done_func_t *done, void *priv, enum zio_flag flags); extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *priv, int priority, - enum zio_flag flags); + uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, + int priority, enum zio_flag flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, @@ -486,12 +523,14 @@ extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, boolean_t labels); extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, - const blkptr_t *bp, enum zio_flag flags); + const blkptr_t *bp, uint64_t size, enum zio_flag flags); extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t use_slog); extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp); extern void zio_flush(zio_t *zio, vdev_t *vd); +extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, + uint64_t size); extern void zio_shrink(zio_t *zio, uint64_t size); extern int zio_wait(zio_t *zio); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h index 6e355419f..aa44744e5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h @@ -130,9 +130,9 @@ enum zio_stage { ZIO_STAGE_READY = 1 << 16, /* RWFCI */ - ZIO_STAGE_VDEV_IO_START = 1 << 17, /* RW--I */ - ZIO_STAGE_VDEV_IO_DONE = 1 << 18, /* RW--- */ - ZIO_STAGE_VDEV_IO_ASSESS = 1 << 19, /* RW--I */ + ZIO_STAGE_VDEV_IO_START = 1 << 17, /* RWF-I */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 18, /* RWF-- */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 19, /* RWF-I */ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 20, /* R---- */ @@ -214,7 +214,9 @@ enum zio_stage { (ZIO_INTERLOCK_STAGES | \ ZIO_STAGE_FREE_BP_INIT | \ ZIO_STAGE_ISSUE_ASYNC | \ - ZIO_STAGE_DVA_FREE) + ZIO_STAGE_DVA_FREE | \ + ZIO_STAGE_VDEV_IO_START | \ + ZIO_STAGE_VDEV_IO_ASSESS) #define ZIO_DDT_FREE_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c new file mode 100644 index 000000000..fd7394a93 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c @@ -0,0 +1,638 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012 Pawel Jakub Dawidek . + * All rights reserved. + */ + +#include +#include +#include +#include +#include + +/* + * Calculate the zio end, upgrading based on ashift which would be + * done by zio_vdev_io_start. + * + * This makes free range consolidation much more effective + * than it would otherwise be as well as ensuring that entire + * blocks are invalidated by writes. + */ +#define TRIM_ZIO_END(vd, offset, size) (offset + \ + P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift)) + +#define TRIM_MAP_SINC(tm, size) \ + atomic_add_64(&(tm)->tm_bytes, (size)) + +#define TRIM_MAP_SDEC(tm, size) \ + atomic_add_64(&(tm)->tm_bytes, -(size)) + +#define TRIM_MAP_QINC(tm) \ + atomic_inc_64(&(tm)->tm_pending); \ + +#define TRIM_MAP_QDEC(tm) \ + atomic_dec_64(&(tm)->tm_pending); + +typedef struct trim_map { + list_t tm_head; /* List of segments sorted by txg. */ + avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */ + avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */ + avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */ + list_t tm_pending_writes; /* Writes blocked on in-flight frees. */ + kmutex_t tm_lock; + uint64_t tm_pending; /* Count of pending TRIMs. */ + uint64_t tm_bytes; /* Total size in bytes of queued TRIMs. */ +} trim_map_t; + +typedef struct trim_seg { + avl_node_t ts_node; /* AVL node. */ + list_node_t ts_next; /* List element. */ + uint64_t ts_start; /* Starting offset of this segment. */ + uint64_t ts_end; /* Ending offset (non-inclusive). */ + uint64_t ts_txg; /* Segment creation txg. */ + hrtime_t ts_time; /* Segment creation time. */ +} trim_seg_t; + +extern boolean_t zfs_trim_enabled; + +static u_int trim_txg_delay = 32; +static u_int trim_timeout = 30; +static u_int trim_max_interval = 1; +/* Limit outstanding TRIMs to 2G (max size for a single TRIM request) */ +static uint64_t trim_vdev_max_bytes = 2147483648; +/* Limit outstanding TRIMs to 64 (max ranges for a single TRIM request) */ +static u_int trim_vdev_max_pending = 64; + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD, 0, "ZFS TRIM"); + +TUNABLE_INT("vfs.zfs.trim.txg_delay", &trim_txg_delay); +SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay, + 0, "Delay TRIMs by up to this many TXGs"); + +TUNABLE_INT("vfs.zfs.trim.timeout", &trim_timeout); +SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0, + "Delay TRIMs by up to this many seconds"); + +TUNABLE_INT("vfs.zfs.trim.max_interval", &trim_max_interval); +SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN, + &trim_max_interval, 0, + "Maximum interval between TRIM queue processing (seconds)"); + +SYSCTL_DECL(_vfs_zfs_vdev); +TUNABLE_QUAD("vfs.zfs.vdev.trim_max_bytes", &trim_vdev_max_bytes); +SYSCTL_QUAD(_vfs_zfs_vdev, OID_AUTO, trim_max_bytes, CTLFLAG_RWTUN, + &trim_vdev_max_bytes, 0, + "Maximum pending TRIM bytes for a vdev"); + +TUNABLE_INT("vfs.zfs.vdev.trim_max_pending", &trim_vdev_max_pending); +SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN, + &trim_vdev_max_pending, 0, + "Maximum pending TRIM segments for a vdev"); + + +static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd); + +static int +trim_map_seg_compare(const void *x1, const void *x2) +{ + const trim_seg_t *s1 = x1; + const trim_seg_t *s2 = x2; + + if (s1->ts_start < s2->ts_start) { + if (s1->ts_end > s2->ts_start) + return (0); + return (-1); + } + if (s1->ts_start > s2->ts_start) { + if (s1->ts_start < s2->ts_end) + return (0); + return (1); + } + return (0); +} + +static int +trim_map_zio_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = x1; + const zio_t *z2 = x2; + + if (z1->io_offset < z2->io_offset) { + if (z1->io_offset + z1->io_size > z2->io_offset) + return (0); + return (-1); + } + if (z1->io_offset > z2->io_offset) { + if (z1->io_offset < z2->io_offset + z2->io_size) + return (0); + return (1); + } + return (0); +} + +void +trim_map_create(vdev_t *vd) +{ + trim_map_t *tm; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (!zfs_trim_enabled) + return; + + tm = kmem_zalloc(sizeof (*tm), KM_SLEEP); + mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&tm->tm_head, sizeof (trim_seg_t), + offsetof(trim_seg_t, ts_next)); + list_create(&tm->tm_pending_writes, sizeof (zio_t), + offsetof(zio_t, io_trim_link)); + avl_create(&tm->tm_queued_frees, trim_map_seg_compare, + sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); + avl_create(&tm->tm_inflight_frees, trim_map_seg_compare, + sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); + avl_create(&tm->tm_inflight_writes, trim_map_zio_compare, + sizeof (zio_t), offsetof(zio_t, io_trim_node)); + vd->vdev_trimmap = tm; +} + +void +trim_map_destroy(vdev_t *vd) +{ + trim_map_t *tm; + trim_seg_t *ts; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (!zfs_trim_enabled) + return; + + tm = vd->vdev_trimmap; + if (tm == NULL) + return; + + /* + * We may have been called before trim_map_vdev_commit_done() + * had a chance to run, so do it now to prune the remaining + * inflight frees. + */ + trim_map_vdev_commit_done(vd->vdev_spa, vd); + + mutex_enter(&tm->tm_lock); + while ((ts = list_head(&tm->tm_head)) != NULL) { + avl_remove(&tm->tm_queued_frees, ts); + list_remove(&tm->tm_head, ts); + kmem_free(ts, sizeof (*ts)); + TRIM_MAP_SDEC(tm, ts->ts_end - ts->ts_start); + TRIM_MAP_QDEC(tm); + } + mutex_exit(&tm->tm_lock); + + avl_destroy(&tm->tm_queued_frees); + avl_destroy(&tm->tm_inflight_frees); + avl_destroy(&tm->tm_inflight_writes); + list_destroy(&tm->tm_pending_writes); + list_destroy(&tm->tm_head); + mutex_destroy(&tm->tm_lock); + kmem_free(tm, sizeof (*tm)); + vd->vdev_trimmap = NULL; +} + +static void +trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) +{ + avl_index_t where; + trim_seg_t tsearch, *ts_before, *ts_after, *ts; + boolean_t merge_before, merge_after; + hrtime_t time; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + VERIFY(start < end); + + time = gethrtime(); + tsearch.ts_start = start; + tsearch.ts_end = end; + + ts = avl_find(&tm->tm_queued_frees, &tsearch, &where); + if (ts != NULL) { + if (start < ts->ts_start) + trim_map_segment_add(tm, start, ts->ts_start, txg); + if (end > ts->ts_end) + trim_map_segment_add(tm, ts->ts_end, end, txg); + return; + } + + ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE); + ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER); + + merge_before = (ts_before != NULL && ts_before->ts_end == start); + merge_after = (ts_after != NULL && ts_after->ts_start == end); + + if (merge_before && merge_after) { + TRIM_MAP_SINC(tm, ts_after->ts_start - ts_before->ts_end); + TRIM_MAP_QDEC(tm); + avl_remove(&tm->tm_queued_frees, ts_before); + list_remove(&tm->tm_head, ts_before); + ts_after->ts_start = ts_before->ts_start; + ts_after->ts_txg = txg; + ts_after->ts_time = time; + kmem_free(ts_before, sizeof (*ts_before)); + } else if (merge_before) { + TRIM_MAP_SINC(tm, end - ts_before->ts_end); + ts_before->ts_end = end; + ts_before->ts_txg = txg; + ts_before->ts_time = time; + } else if (merge_after) { + TRIM_MAP_SINC(tm, ts_after->ts_start - start); + ts_after->ts_start = start; + ts_after->ts_txg = txg; + ts_after->ts_time = time; + } else { + TRIM_MAP_SINC(tm, end - start); + TRIM_MAP_QINC(tm); + ts = kmem_alloc(sizeof (*ts), KM_SLEEP); + ts->ts_start = start; + ts->ts_end = end; + ts->ts_txg = txg; + ts->ts_time = time; + avl_insert(&tm->tm_queued_frees, ts, where); + list_insert_tail(&tm->tm_head, ts); + } +} + +static void +trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start, + uint64_t end) +{ + trim_seg_t *nts; + boolean_t left_over, right_over; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + + left_over = (ts->ts_start < start); + right_over = (ts->ts_end > end); + + TRIM_MAP_SDEC(tm, end - start); + if (left_over && right_over) { + nts = kmem_alloc(sizeof (*nts), KM_SLEEP); + nts->ts_start = end; + nts->ts_end = ts->ts_end; + nts->ts_txg = ts->ts_txg; + nts->ts_time = ts->ts_time; + ts->ts_end = start; + avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER); + list_insert_after(&tm->tm_head, ts, nts); + TRIM_MAP_QINC(tm); + } else if (left_over) { + ts->ts_end = start; + } else if (right_over) { + ts->ts_start = end; + } else { + avl_remove(&tm->tm_queued_frees, ts); + list_remove(&tm->tm_head, ts); + TRIM_MAP_QDEC(tm); + kmem_free(ts, sizeof (*ts)); + } +} + +static void +trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) +{ + zio_t zsearch, *zs; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + + zsearch.io_offset = start; + zsearch.io_size = end - start; + + zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL); + if (zs == NULL) { + trim_map_segment_add(tm, start, end, txg); + return; + } + if (start < zs->io_offset) + trim_map_free_locked(tm, start, zs->io_offset, txg); + if (zs->io_offset + zs->io_size < end) + trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg); +} + +void +trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) +{ + trim_map_t *tm = vd->vdev_trimmap; + + if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL) + return; + + mutex_enter(&tm->tm_lock); + trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg); + mutex_exit(&tm->tm_lock); +} + +boolean_t +trim_map_write_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + trim_map_t *tm = vd->vdev_trimmap; + trim_seg_t tsearch, *ts; + boolean_t left_over, right_over; + uint64_t start, end; + + if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL) + return (B_TRUE); + + start = zio->io_offset; + end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size); + tsearch.ts_start = start; + tsearch.ts_end = end; + + mutex_enter(&tm->tm_lock); + + /* + * Checking for colliding in-flight frees. + */ + ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL); + if (ts != NULL) { + list_insert_tail(&tm->tm_pending_writes, zio); + mutex_exit(&tm->tm_lock); + return (B_FALSE); + } + + ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); + if (ts != NULL) { + /* + * Loop until all overlapping segments are removed. + */ + do { + trim_map_segment_remove(tm, ts, start, end); + ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); + } while (ts != NULL); + } + avl_add(&tm->tm_inflight_writes, zio); + + mutex_exit(&tm->tm_lock); + + return (B_TRUE); +} + +void +trim_map_write_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + trim_map_t *tm = vd->vdev_trimmap; + + /* + * Don't check for vdev_notrim, since the write could have + * started before vdev_notrim was set. + */ + if (!zfs_trim_enabled || tm == NULL) + return; + + mutex_enter(&tm->tm_lock); + /* + * Don't fail if the write isn't in the tree, since the write + * could have started after vdev_notrim was set. + */ + if (zio->io_trim_node.avl_child[0] || + zio->io_trim_node.avl_child[1] || + AVL_XPARENT(&zio->io_trim_node) || + tm->tm_inflight_writes.avl_root == &zio->io_trim_node) + avl_remove(&tm->tm_inflight_writes, zio); + mutex_exit(&tm->tm_lock); +} + +/* + * Return the oldest segment (the one with the lowest txg / time) or NULL if: + * 1. The list is empty + * 2. The first element's txg is greater than txgsafe + * 3. The first element's txg is not greater than the txg argument and the + * the first element's time is not greater than time argument + */ +static trim_seg_t * +trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time) +{ + trim_seg_t *ts; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + VERIFY(txgsafe >= txg); + + ts = list_head(&tm->tm_head); + if (ts != NULL && ts->ts_txg <= txgsafe && + (ts->ts_txg <= txg || ts->ts_time <= time || + tm->tm_bytes > trim_vdev_max_bytes || + tm->tm_pending > trim_vdev_max_pending)) + return (ts); + return (NULL); +} + +static void +trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd) +{ + trim_map_t *tm = vd->vdev_trimmap; + trim_seg_t *ts; + uint64_t size, txgtarget, txgsafe; + hrtime_t timelimit; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (tm == NULL) + return; + + timelimit = gethrtime() - trim_timeout * NANOSEC; + if (vd->vdev_isl2cache) { + txgsafe = UINT64_MAX; + txgtarget = UINT64_MAX; + } else { + txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa)); + if (txgsafe > trim_txg_delay) + txgtarget = txgsafe - trim_txg_delay; + else + txgtarget = 0; + } + + mutex_enter(&tm->tm_lock); + /* Loop until we have sent all outstanding free's */ + while ((ts = trim_map_first(tm, txgtarget, txgsafe, timelimit)) + != NULL) { + list_remove(&tm->tm_head, ts); + avl_remove(&tm->tm_queued_frees, ts); + avl_add(&tm->tm_inflight_frees, ts); + size = ts->ts_end - ts->ts_start; + zio_nowait(zio_trim(zio, spa, vd, ts->ts_start, size)); + TRIM_MAP_SDEC(tm, size); + TRIM_MAP_QDEC(tm); + } + mutex_exit(&tm->tm_lock); +} + +static void +trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd) +{ + trim_map_t *tm = vd->vdev_trimmap; + trim_seg_t *ts; + list_t pending_writes; + zio_t *zio; + uint64_t start, size; + void *cookie; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (tm == NULL) + return; + + mutex_enter(&tm->tm_lock); + if (!avl_is_empty(&tm->tm_inflight_frees)) { + cookie = NULL; + while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees, + &cookie)) != NULL) { + kmem_free(ts, sizeof (*ts)); + } + } + list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t, + io_trim_link)); + list_move_tail(&pending_writes, &tm->tm_pending_writes); + mutex_exit(&tm->tm_lock); + + while ((zio = list_remove_head(&pending_writes)) != NULL) { + zio_vdev_io_reissue(zio); + zio_execute(zio); + } + list_destroy(&pending_writes); +} + +static void +trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd) +{ + int c; + + if (vd == NULL) + return; + + if (vd->vdev_ops->vdev_op_leaf) { + trim_map_vdev_commit(spa, zio, vd); + } else { + for (c = 0; c < vd->vdev_children; c++) + trim_map_commit(spa, zio, vd->vdev_child[c]); + } +} + +static void +trim_map_commit_done(spa_t *spa, vdev_t *vd) +{ + int c; + + if (vd == NULL) + return; + + if (vd->vdev_ops->vdev_op_leaf) { + trim_map_vdev_commit_done(spa, vd); + } else { + for (c = 0; c < vd->vdev_children; c++) + trim_map_commit_done(spa, vd->vdev_child[c]); + } +} + +static void +trim_thread(void *arg) +{ + spa_t *spa = arg; + zio_t *zio; + +#ifdef _KERNEL + (void) snprintf(curthread->td_name, sizeof(curthread->td_name), + "trim %s", spa_name(spa)); +#endif + + for (;;) { + mutex_enter(&spa->spa_trim_lock); + if (spa->spa_trim_thread == NULL) { + spa->spa_trim_thread = curthread; + cv_signal(&spa->spa_trim_cv); + mutex_exit(&spa->spa_trim_lock); + thread_exit(); + } + + (void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock, + hz * trim_max_interval); + mutex_exit(&spa->spa_trim_lock); + + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + trim_map_commit(spa, zio, spa->spa_root_vdev); + (void) zio_wait(zio); + trim_map_commit_done(spa, spa->spa_root_vdev); + spa_config_exit(spa, SCL_STATE, FTAG); + } +} + +void +trim_thread_create(spa_t *spa) +{ + + if (!zfs_trim_enabled) + return; + + mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL); + mutex_enter(&spa->spa_trim_lock); + spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0, + TS_RUN, minclsyspri); + mutex_exit(&spa->spa_trim_lock); +} + +void +trim_thread_destroy(spa_t *spa) +{ + + if (!zfs_trim_enabled) + return; + if (spa->spa_trim_thread == NULL) + return; + + mutex_enter(&spa->spa_trim_lock); + /* Setting spa_trim_thread to NULL tells the thread to stop. */ + spa->spa_trim_thread = NULL; + cv_signal(&spa->spa_trim_cv); + /* The thread will set it back to != NULL on exit. */ + while (spa->spa_trim_thread == NULL) + cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); + spa->spa_trim_thread = NULL; + mutex_exit(&spa->spa_trim_lock); + + cv_destroy(&spa->spa_trim_cv); + mutex_destroy(&spa->spa_trim_lock); +} + +void +trim_thread_wakeup(spa_t *spa) +{ + + if (!zfs_trim_enabled) + return; + if (spa->spa_trim_thread == NULL) + return; + + mutex_enter(&spa->spa_trim_lock); + cv_signal(&spa->spa_trim_cv); + mutex_exit(&spa->spa_trim_lock); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c index 631254cb7..bdbff31cd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -43,6 +43,7 @@ #include #include #include +#include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); @@ -1196,6 +1197,11 @@ vdev_open(vdev_t *vd) if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); + if (vd->vdev_ops->vdev_op_leaf) { + vd->vdev_notrim = B_FALSE; + trim_map_create(vd); + } + for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, @@ -1441,6 +1447,9 @@ vdev_close(vdev_t *vd) vdev_cache_purge(vd); + if (vd->vdev_ops->vdev_op_leaf) + trim_map_destroy(vd); + /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c index c972038d0..fd0382ac8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -49,14 +49,17 @@ struct g_class zfs_vdev_class = { DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); -/* - * Don't send BIO_FLUSH. - */ +SYSCTL_DECL(_vfs_zfs_vdev); +/* Don't send BIO_FLUSH. */ static int vdev_geom_bio_flush_disable = 0; TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable); -SYSCTL_DECL(_vfs_zfs_vdev); SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW, &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); +/* Don't send BIO_DELETE. */ +static int vdev_geom_bio_delete_disable = 0; +TUNABLE_INT("vfs.zfs.vdev.bio_delete_disable", &vdev_geom_bio_delete_disable); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW, + &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); static void vdev_geom_orphan(struct g_consumer *cp) @@ -663,8 +666,8 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, *ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; /* - * Clear the nowritecache bit, so that on a vdev_reopen() we will - * try again. + * Clear the nowritecache settings, so that on a vdev_reopen() + * we will try again. */ vd->vdev_nowritecache = B_FALSE; @@ -710,6 +713,15 @@ vdev_geom_io_intr(struct bio *bp) */ vd->vdev_nowritecache = B_TRUE; } + if (bp->bio_cmd == BIO_DELETE && bp->bio_error == ENOTSUP) { + /* + * If we get ENOTSUP, we know that no future + * attempts will ever succeed. In this case we + * set a persistent bit so that we don't bother + * with the ioctl in the future. + */ + vd->vdev_notrim = B_TRUE; + } if (zio->io_error == EIO && !vd->vdev_remove_wanted) { /* * If provider's error is set we assume it is being @@ -752,17 +764,21 @@ vdev_geom_io_start(zio_t *zio) } switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - if (zfs_nocacheflush || vdev_geom_bio_flush_disable) break; - if (vd->vdev_nowritecache) { zio->io_error = ENOTSUP; break; } - + goto sendreq; + case DKIOCTRIM: + if (vdev_geom_bio_delete_disable) + break; + if (vd->vdev_notrim) { + zio->io_error = ENOTSUP; + break; + } goto sendreq; default: zio->io_error = ENOTSUP; @@ -787,11 +803,21 @@ sendreq: bp->bio_length = zio->io_size; break; case ZIO_TYPE_IOCTL: - bp->bio_cmd = BIO_FLUSH; - bp->bio_flags |= BIO_ORDERED; - bp->bio_data = NULL; - bp->bio_offset = cp->provider->mediasize; - bp->bio_length = 0; + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + bp->bio_cmd = BIO_FLUSH; + bp->bio_flags |= BIO_ORDERED; + bp->bio_data = NULL; + bp->bio_offset = cp->provider->mediasize; + bp->bio_length = 0; + break; + case DKIOCTRIM: + bp->bio_cmd = BIO_DELETE; + bp->bio_data = NULL; + bp->bio_offset = zio->io_offset; + bp->bio_length = zio->io_size; + break; + } break; } bp->bio_done = vdev_geom_io_intr; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c index 5ee7c7d15..92ae0ed7f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c @@ -145,8 +145,14 @@ #include #include #include +#include #include +static boolean_t vdev_trim_on_init = B_TRUE; +SYSCTL_DECL(_vfs_zfs_vdev); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RW, + &vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation"); + /* * Basic routines to read and write from a vdev label. * Used throughout the rest of this file. @@ -717,6 +723,16 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) ASSERT(reason == VDEV_LABEL_REPLACE); } + /* + * TRIM the whole thing so that we start with a clean slate. + * It's just an optimization, so we don't care if it fails. + * Don't TRIM if removing so that we don't interfere with zpool + * disaster recovery. + */ + if (zfs_trim_enabled && vdev_trim_on_init && (reason == VDEV_LABEL_CREATE || + reason == VDEV_LABEL_SPARE || reason == VDEV_LABEL_L2CACHE)) + zio_wait(zio_trim(NULL, spa, vd, 0, vd->vdev_psize)); + /* * Initialize its label. */ @@ -1282,5 +1298,10 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard) * to disk to ensure that all odd-label updates are committed to * stable storage before the next transaction group begins. */ - return (vdev_label_sync_list(spa, 1, txg, flags)); + if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) + return (error); + + trim_thread_wakeup(spa); + + return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c index fa3bc02a2..d4eaa3b09 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c @@ -293,10 +293,11 @@ vdev_mirror_io_start(zio_t *zio) c = vdev_mirror_child_select(zio); children = (c >= 0); } else { - ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_type == ZIO_TYPE_WRITE || + zio->io_type == ZIO_TYPE_FREE); /* - * Writes go to all children. + * Writes and frees go to all children. */ c = 0; children = mm->mm_children; @@ -377,6 +378,8 @@ vdev_mirror_io_done(zio_t *zio) zio->io_error = vdev_mirror_worst_error(mm); } return; + } else if (zio->io_type == ZIO_TYPE_FREE) { + return; } ASSERT(zio->io_type == ZIO_TYPE_READ); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c index 1d6cac0b6..1cc343a70 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c @@ -259,7 +259,9 @@ vdev_raidz_map_free(raidz_map_t *rm) size_t size; for (c = 0; c < rm->rm_firstdatacol; c++) { - zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); + if (rm->rm_col[c].rc_data != NULL) + zio_buf_free(rm->rm_col[c].rc_data, + rm->rm_col[c].rc_size); if (rm->rm_col[c].rc_gdata != NULL) zio_buf_free(rm->rm_col[c].rc_gdata, @@ -504,14 +506,20 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); ASSERT3U(rm->rm_nskip, <=, nparity); - for (c = 0; c < rm->rm_firstdatacol; c++) - rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); + if (zio->io_type != ZIO_TYPE_FREE) { + for (c = 0; c < rm->rm_firstdatacol; c++) { + rm->rm_col[c].rc_data = + zio_buf_alloc(rm->rm_col[c].rc_size); + } - rm->rm_col[c].rc_data = zio->io_data; + rm->rm_col[c].rc_data = zio->io_data; - for (c = c + 1; c < acols; c++) - rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + - rm->rm_col[c - 1].rc_size; + for (c = c + 1; c < acols; c++) { + rm->rm_col[c].rc_data = + (char *)rm->rm_col[c - 1].rc_data + + rm->rm_col[c - 1].rc_size; + } + } /* * If all data stored spans all columns, there's a danger that parity @@ -1536,6 +1544,18 @@ vdev_raidz_io_start(zio_t *zio) ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); + if (zio->io_type == ZIO_TYPE_FREE) { + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_data, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + return (ZIO_PIPELINE_CONTINUE); + } + if (zio->io_type == ZIO_TYPE_WRITE) { vdev_raidz_generate_parity(rm); @@ -1917,6 +1937,8 @@ vdev_raidz_io_done(zio_t *zio) if (total_errors > rm->rm_firstdatacol) zio->io_error = vdev_raidz_worst_error(rm); + return; + } else if (zio->io_type == ZIO_TYPE_FREE) { return; } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c index 3120b7be0..7bc0e9bee 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c @@ -83,6 +83,11 @@ boolean_t zfs_nocacheflush = B_FALSE; TUNABLE_INT("vfs.zfs.cache_flush_disable", &zfs_nocacheflush); SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN, &zfs_nocacheflush, 0, "Disable cache flush"); +boolean_t zfs_trim_enabled = B_TRUE; +SYSCTL_DECL(_vfs_zfs_trim); +TUNABLE_INT("vfs.zfs.trim.enabled", &zfs_trim_enabled); +SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0, + "Enable ZFS TRIM"); static kmem_cache_t *zil_lwb_cache; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index 76c27cf46..50d41a699 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -35,6 +35,7 @@ #include #include #include +#include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); @@ -43,6 +44,19 @@ TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma); SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, "Use uma(9) for ZIO allocations"); +zio_trim_stats_t zio_trim_stats = { + { "bytes", KSTAT_DATA_UINT64, + "Number of bytes successfully TRIMmed" }, + { "success", KSTAT_DATA_UINT64, + "Number of successful TRIM requests" }, + { "unsupported", KSTAT_DATA_UINT64, + "Number of TRIM requests that failed because TRIM is not supported" }, + { "failed", KSTAT_DATA_UINT64, + "Number of TRIM requests that failed for reasons other than not supported" }, +}; + +static kstat_t *zio_trim_ksp; + /* * ========================================================================== * I/O priority table @@ -61,6 +75,7 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 10, /* ZIO_PRIORITY_RESILVER */ 20, /* ZIO_PRIORITY_SCRUB */ 2, /* ZIO_PRIORITY_DDT_PREFETCH */ + 30, /* ZIO_PRIORITY_TRIM */ }; /* @@ -209,6 +224,16 @@ zio_init(void) zfs_mg_alloc_failures = 8; zio_inject_init(); + + zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", + KSTAT_TYPE_NAMED, + sizeof(zio_trim_stats) / sizeof(kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (zio_trim_ksp != NULL) { + zio_trim_ksp->ks_data = &zio_trim_stats; + kstat_install(zio_trim_ksp); + } } void @@ -236,6 +261,11 @@ zio_fini(void) kmem_cache_destroy(zio_cache); zio_inject_fini(); + + if (zio_trim_ksp != NULL) { + kstat_delete(zio_trim_ksp); + zio_trim_ksp = NULL; + } } /* @@ -543,7 +573,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, { zio_t *zio; - ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); @@ -730,7 +760,7 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - enum zio_flag flags) + uint64_t size, enum zio_flag flags) { zio_t *zio; @@ -743,7 +773,7 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, metaslab_check_free(spa, bp); - zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), + zio = zio_create(pio, spa, txg, bp, NULL, size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); @@ -780,15 +810,16 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, } zio_t * -zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, int priority, enum zio_flag flags) +zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, + uint64_t size, zio_done_func_t *done, void *private, int priority, + enum zio_flag flags) { zio_t *zio; int c; if (vd->vdev_children == 0) { - zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, - ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, + zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private, + ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); zio->io_cmd = cmd; @@ -797,7 +828,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, - done, private, priority, flags)); + offset, size, done, private, priority, flags)); } return (zio); @@ -922,11 +953,22 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, void zio_flush(zio_t *zio, vdev_t *vd) { - zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, + zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, NULL, NULL, ZIO_PRIORITY_NOW, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } +zio_t * +zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) +{ + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + return zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size, + NULL, NULL, ZIO_PRIORITY_TRIM, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); +} + void zio_shrink(zio_t *zio, uint64_t size) { @@ -1549,6 +1591,7 @@ zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, + BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), ZIO_GANG_CHILD_FLAGS(pio))); } @@ -1681,7 +1724,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) } } - if (gn == gio->io_gang_tree) + if (gn == gio->io_gang_tree && gio->io_data != NULL) ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); if (zio != pio) @@ -2403,7 +2446,7 @@ zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) /* * ========================================================================== - * Read and write to physical devices + * Read, write and delete to physical devices * ========================================================================== */ static int @@ -2426,6 +2469,11 @@ zio_vdev_io_start(zio_t *zio) return (vdev_mirror_ops.vdev_op_io_start(zio)); } + if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) { + trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); + return (ZIO_PIPELINE_CONTINUE); + } + /* * We keep track of time-sensitive I/Os so that the scan thread * can quickly react to certain workloads. In particular, we care @@ -2450,18 +2498,22 @@ zio_vdev_io_start(zio_t *zio) if (P2PHASE(zio->io_size, align) != 0) { uint64_t asize = P2ROUNDUP(zio->io_size, align); - char *abuf = zio_buf_alloc(asize); + char *abuf = NULL; + if (zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE) + abuf = zio_buf_alloc(asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { bcopy(zio->io_data, abuf, zio->io_size); bzero(abuf + zio->io_size, asize - zio->io_size); } - zio_push_transform(zio, abuf, asize, asize, zio_subblock); + zio_push_transform(zio, abuf, asize, abuf ? asize : 0, + zio_subblock); } ASSERT(P2PHASE(zio->io_offset, align) == 0); ASSERT(P2PHASE(zio->io_size, align) == 0); - VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); + VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- @@ -2501,6 +2553,11 @@ zio_vdev_io_start(zio_t *zio) } } + if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE) { + if (!trim_map_write_start(zio)) + return (ZIO_PIPELINE_STOP); + } + return (vd->vdev_ops->vdev_op_io_start(zio)); } @@ -2514,9 +2571,16 @@ zio_vdev_io_done(zio_t *zio) if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); - ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); - if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { + if (vd != NULL && vd->vdev_ops->vdev_op_leaf && + zio->io_type == ZIO_TYPE_WRITE) { + trim_map_write_done(zio); + } + + if (vd != NULL && vd->vdev_ops->vdev_op_leaf && + (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { vdev_queue_io_done(zio); @@ -2592,6 +2656,20 @@ zio_vdev_io_assess(zio_t *zio) if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); + if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM) + switch (zio->io_error) { + case 0: + ZIO_TRIM_STAT_INCR(bytes, zio->io_size); + ZIO_TRIM_STAT_BUMP(success); + break; + case EOPNOTSUPP: + ZIO_TRIM_STAT_BUMP(unsupported); + break; + default: + ZIO_TRIM_STAT_BUMP(failed); + break; + } + /* * If the I/O failed, determine whether we should attempt to retry it. * diff --git a/sys/modules/zfs/Makefile b/sys/modules/zfs/Makefile index 002392a57..c1dd6e4ce 100644 --- a/sys/modules/zfs/Makefile +++ b/sys/modules/zfs/Makefile @@ -72,6 +72,7 @@ SRCS+= sha2.c ZFS_SRCS= ${ZFS_OBJS:C/.o$/.c/} SRCS+= ${ZFS_SRCS} SRCS+= vdev_geom.c +SRCS+= trim_map.c # Use FreeBSD's namecache. CFLAGS+=-DFREEBSD_NAMECACHE -- 2.45.0