From e9272225e6bed840b00eef1c817b188c172338ee Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 17 Mar 2021 22:33:47 +0100 Subject: [PATCH] vfs: fix vnlru marker handling for filtered/unfiltered cases The global list has a marker with an invariant that free vnodes are placed somewhere past that. A caller which performs filtering (like ZFS) can move said marker all the way to the end, across free vnodes which don't match. Then a caller which does not perform filtering will fail to find them. This makes vn_alloc_hard sleep for 1 second instead of reclaiming, resulting in significant stalls. Fix the problem by requiring an explicit marker by callers which do filtering. As a temporary measure extend vnlru_free to restart if it fails to reclaim anything. Big thanks go to the reporter for testing several iterations of the patch. Reported by: Yamagi Tested by: Yamagi Reviewed by: kib Differential Revision: https://reviews.freebsd.org/D29324 --- .../openzfs/module/os/freebsd/zfs/arc_os.c | 14 +++- sys/kern/vfs_subr.c | 72 +++++++++++++++++-- sys/sys/vnode.h | 3 + 3 files changed, 80 insertions(+), 9 deletions(-) diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c index 4fc7468bfa4..0d5cffbe8d1 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c @@ -51,6 +51,9 @@ #include #include +static struct sx arc_vnlru_lock; +static struct vnode *arc_vnlru_marker; + extern struct vfsops zfs_vfsops; uint_t zfs_arc_free_target = 0; @@ -157,7 +160,9 @@ arc_prune_task(void *arg) arc_reduce_target_size(ptob(nr_scan)); free(arg, M_TEMP); - vnlru_free(nr_scan, &zfs_vfsops); + sx_xlock(&arc_vnlru_lock); + vnlru_free_vfsops(nr_scan, &zfs_vfsops, arc_vnlru_marker); + sx_xunlock(&arc_vnlru_lock); } /* @@ -234,7 +239,8 @@ arc_lowmem_init(void) { arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, EVENTHANDLER_PRI_FIRST); - + arc_vnlru_marker = vnlru_alloc_marker(); + sx_init(&arc_vnlru_lock, "arc vnlru lock"); } void @@ -242,6 +248,10 @@ arc_lowmem_fini(void) { if (arc_event_lowmem != NULL) EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); + if (arc_vnlru_marker != NULL) { + vnlru_free_marker(arc_vnlru_marker); + sx_destroy(&arc_vnlru_lock); + } } void diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 37e713ee48e..af12252e8a8 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1216,9 +1216,9 @@ SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, * Attempt to reduce the free list by the requested amount. */ static int -vnlru_free_locked(int count, struct vfsops *mnt_op) +vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp) { - struct vnode *vp, *mvp; + struct vnode *vp; struct mount *mp; int ocount; @@ -1226,7 +1226,6 @@ vnlru_free_locked(int count, struct vfsops *mnt_op) if (count > max_vnlru_free) count = max_vnlru_free; ocount = count; - mvp = vnode_list_free_marker; vp = mvp; for (;;) { if (count == 0) { @@ -1268,13 +1267,72 @@ vnlru_free_locked(int count, struct vfsops *mnt_op) return (ocount - count); } +static int +vnlru_free_locked(int count) +{ + + mtx_assert(&vnode_list_mtx, MA_OWNED); + return (vnlru_free_impl(count, NULL, vnode_list_free_marker)); +} + +void +vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp) +{ + + MPASS(mnt_op != NULL); + MPASS(mvp != NULL); + VNPASS(mvp->v_type == VMARKER, mvp); + mtx_lock(&vnode_list_mtx); + vnlru_free_impl(count, mnt_op, mvp); + mtx_unlock(&vnode_list_mtx); +} + +/* + * Temporary binary compat, don't use. Call vnlru_free_vfsops instead. + */ void vnlru_free(int count, struct vfsops *mnt_op) { + struct vnode *mvp; + + if (count == 0) + return; + mtx_lock(&vnode_list_mtx); + mvp = vnode_list_free_marker; + if (vnlru_free_impl(count, mnt_op, mvp) == 0) { + /* + * It is possible the marker was moved over eligible vnodes by + * callers which filtered by different ops. If so, start from + * scratch. + */ + if (vnlru_read_freevnodes() > 0) { + TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); + TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); + } + vnlru_free_impl(count, mnt_op, mvp); + } + mtx_unlock(&vnode_list_mtx); +} + +struct vnode * +vnlru_alloc_marker(void) +{ + struct vnode *mvp; + mvp = vn_alloc_marker(NULL); + mtx_lock(&vnode_list_mtx); + TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist); + mtx_unlock(&vnode_list_mtx); + return (mvp); +} + +void +vnlru_free_marker(struct vnode *mvp) +{ mtx_lock(&vnode_list_mtx); - vnlru_free_locked(count, mnt_op); + TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); + vn_free_marker(mvp); } static void @@ -1423,7 +1481,7 @@ vnlru_proc(void) * try to reduce it by discarding from the free list. */ if (rnumvnodes > desiredvnodes) { - vnlru_free_locked(rnumvnodes - desiredvnodes, NULL); + vnlru_free_locked(rnumvnodes - desiredvnodes); rnumvnodes = atomic_load_long(&numvnodes); } /* @@ -1615,7 +1673,7 @@ vn_alloc_hard(struct mount *mp) * should be chosen so that we never wait or even reclaim from * the free list to below its target minimum. */ - if (vnlru_free_locked(1, NULL) > 0) + if (vnlru_free_locked(1) > 0) goto alloc; if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { /* @@ -1625,7 +1683,7 @@ vn_alloc_hard(struct mount *mp) msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && vnlru_read_freevnodes() > 1) - vnlru_free_locked(1, NULL); + vnlru_free_locked(1); } alloc: rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 0e46bea14b6..f05b03c74c8 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -826,7 +826,10 @@ void vfs_timestamp(struct timespec *); void vfs_write_resume(struct mount *mp, int flags); int vfs_write_suspend(struct mount *mp, int flags); int vfs_write_suspend_umnt(struct mount *mp); +struct vnode *vnlru_alloc_marker(void); +void vnlru_free_marker(struct vnode *); void vnlru_free(int, struct vfsops *); +void vnlru_free_vfsops(int, struct vfsops *, struct vnode *); int vop_stdbmap(struct vop_bmap_args *); int vop_stdfdatasync_buf(struct vop_fdatasync_args *); int vop_stdfsync(struct vop_fsync_args *); -- 2.45.0