From 0c236d3d528d5b6de5d70309841d2cd2405465fd Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Mon, 13 Jan 2020 02:39:41 +0000
Subject: [PATCH] vfs: per-cpu batched requeuing of free vnodes

Constant requeuing adds significant lock contention in certain
workloads. Lessen the problem by batching it.

Per-cpu areas are locked in order to synchronize against UMA freeing
memory.

vnode's v_mflag is converted to short to prevent the struct from
growing.

Sample result from an incremental make -s -j 104 bzImage on tmpfs:

stock:   122.38s user 1780.45s system 6242% cpu 30.480 total
patched: 144.84s user 985.90s system 4856% cpu 23.282 total

Reviewed by:	jeff
Tested by:	pho (in a larger patch, previous version)
Differential Revision:	https://reviews.freebsd.org/D22998
---
 sys/kern/vfs_subr.c | 120 +++++++++++++++++++++++++++++++++++++++++---
 sys/sys/vnode.h     |   3 +-
 2 files changed, 116 insertions(+), 7 deletions(-)

diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 18dc3d1e1e9..c3a6af4efc2 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -295,6 +295,16 @@ static int stat_rush_requests;	/* number of times I/O speeded up */
 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
     "Number of times I/O speeded up (rush requests)");
 
+#define	VDBATCH_SIZE 8
+struct vdbatch {
+	u_int index;
+	struct mtx lock;
+	struct vnode *tab[VDBATCH_SIZE];
+};
+DPCPU_DEFINE_STATIC(struct vdbatch, vd);
+
+static void	vdbatch_dequeue(struct vnode *vp);
+
 /*
  * When shutting down the syncer, run it at four times normal speed.
  */
@@ -552,6 +562,8 @@ vnode_init(void *mem, int size, int flags)
 	 */
 	rangelock_init(&vp->v_rl);
 
+	vp->v_dbatchcpu = NOCPU;
+
 	mtx_lock(&vnode_list_mtx);
 	TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
 	mtx_unlock(&vnode_list_mtx);
@@ -568,6 +580,7 @@ vnode_fini(void *mem, int size)
 	struct bufobj *bo;
 
 	vp = mem;
+	vdbatch_dequeue(vp);
 	mtx_lock(&vnode_list_mtx);
 	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
 	mtx_unlock(&vnode_list_mtx);
@@ -602,8 +615,9 @@ vnode_fini(void *mem, int size)
 static void
 vntblinit(void *dummy __unused)
 {
+	struct vdbatch *vd;
+	int cpu, physvnodes, virtvnodes;
 	u_int i;
-	int physvnodes, virtvnodes;
 
 	/*
 	 * Desiredvnodes is a function of the physical memory size and the
@@ -669,6 +683,12 @@ vntblinit(void *dummy __unused)
 	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
 		vnsz2log++;
 	vnsz2log--;
+
+	CPU_FOREACH(cpu) {
+		vd = DPCPU_ID_PTR((cpu), vd);
+		bzero(vd, sizeof(*vd));
+		mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
+	}
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
 
@@ -3199,6 +3219,98 @@ vholdnz(struct vnode *vp)
 #endif
 }
 
+static void __noinline
+vdbatch_process(struct vdbatch *vd)
+{
+	struct vnode *vp;
+	int i;
+
+	mtx_assert(&vd->lock, MA_OWNED);
+	MPASS(vd->index == VDBATCH_SIZE);
+
+	mtx_lock(&vnode_list_mtx);
+	for (i = 0; i < VDBATCH_SIZE; i++) {
+		vp = vd->tab[i];
+		TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
+		TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
+		MPASS(vp->v_dbatchcpu != NOCPU);
+		vp->v_dbatchcpu = NOCPU;
+	}
+	bzero(vd->tab, sizeof(vd->tab));
+	vd->index = 0;
+	mtx_unlock(&vnode_list_mtx);
+}
+
+static void
+vdbatch_enqueue(struct vnode *vp)
+{
+	struct vdbatch *vd;
+
+	ASSERT_VI_LOCKED(vp, __func__);
+	VNASSERT(!VN_IS_DOOMED(vp), vp,
+	    ("%s: deferring requeue of a doomed vnode", __func__));
+
+	if (vp->v_dbatchcpu != NOCPU) {
+		VI_UNLOCK(vp);
+		return;
+	}
+
+	/*
+	 * A hack: pin us to the current CPU so that we know what to put in
+	 * ->v_dbatchcpu.
+	 */
+	sched_pin();
+	vd = DPCPU_PTR(vd);
+	mtx_lock(&vd->lock);
+	MPASS(vd->index < VDBATCH_SIZE);
+	MPASS(vd->tab[vd->index] == NULL);
+	vp->v_dbatchcpu = curcpu;
+	vd->tab[vd->index] = vp;
+	vd->index++;
+	VI_UNLOCK(vp);
+	if (vd->index == VDBATCH_SIZE)
+		vdbatch_process(vd);
+	mtx_unlock(&vd->lock);
+	sched_unpin();
+}
+
+/*
+ * This routine must only be called for vnodes which are about to be
+ * deallocated. Supporting dequeue for arbitrary vnodes would require
+ * validating that the locked batch matches.
+ */
+static void
+vdbatch_dequeue(struct vnode *vp)
+{
+	struct vdbatch *vd;
+	int i;
+	short cpu;
+
+	VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp,
+	    ("%s: called for a used vnode\n", __func__));
+
+	cpu = vp->v_dbatchcpu;
+	if (cpu == NOCPU)
+		return;
+
+	vd = DPCPU_ID_PTR(cpu, vd);
+	mtx_lock(&vd->lock);
+	for (i = 0; i < vd->index; i++) {
+		if (vd->tab[i] != vp)
+			continue;
+		vp->v_dbatchcpu = NOCPU;
+		vd->index--;
+		vd->tab[i] = vd->tab[vd->index];
+		vd->tab[vd->index] = NULL;
+		break;
+	}
+	mtx_unlock(&vd->lock);
+	/*
+	 * Either we dequeued the vnode above or the target CPU beat us to it.
+	 */
+	MPASS(vp->v_dbatchcpu == NOCPU);
+}
+
 /*
  * Drop the hold count of the vnode. If this is the last reference to
  * the vnode we place it on the free list unless it has been vgone'd
@@ -3236,12 +3348,8 @@ vdrop_deactivate(struct vnode *vp)
 		mp->mnt_lazyvnodelistsize--;
 		mtx_unlock(&mp->mnt_listmtx);
 	}
-	mtx_lock(&vnode_list_mtx);
-	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
-	TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
-	mtx_unlock(&vnode_list_mtx);
 	atomic_add_long(&freevnodes, 1);
-	VI_UNLOCK(vp);
+	vdbatch_enqueue(vp);
 }
 
 void
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 088e81429d7..402e5648241 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -171,7 +171,8 @@ struct vnode {
 	u_int	v_usecount;			/* I ref count of users */
 	u_int	v_iflag;			/* i vnode flags (see below) */
 	u_int	v_vflag;			/* v vnode flags */
-	u_int	v_mflag;			/* l mnt-specific vnode flags */
+	u_short	v_mflag;			/* l mnt-specific vnode flags */
+	short	v_dbatchcpu;			/* i LRU requeue deferral batch */
 	int	v_writecount;			/* I ref count of writers or
 						   (negative) text users */
 	u_int	v_hash;
-- 
2.45.0
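
Appendix (not part of the patch): a minimal, standalone userspace sketch of
the batching idea, under simplified assumptions. All names below (struct
batch, global_lock, requeue_one, flush_batch, defer_requeue, BATCH_SIZE) are
invented for illustration and are not FreeBSD API; the committed code
additionally pins the thread with sched_pin(), records the owning CPU in
v_dbatchcpu so teardown can find the batch, and supports removal via
vdbatch_dequeue(). The sketch only shows the core trade: one acquisition of
the contended global list lock is amortized over BATCH_SIZE deferred
requeues.

	/*
	 * Illustrative sketch only (hypothetical names, not kernel code):
	 * defer LRU requeues into a small per-thread/per-CPU buffer and flush
	 * them under a single acquisition of the contended global lock.
	 */
	#include <pthread.h>
	#include <stddef.h>

	#define	BATCH_SIZE	8

	struct batch {
		pthread_mutex_t	lock;			/* protects index and tab */
		size_t		index;			/* number of buffered entries */
		void		*tab[BATCH_SIZE];	/* deferred requeue requests */
	};

	/* Stand-in for the global lock protecting the LRU list. */
	static pthread_mutex_t global_lock = PTHREAD_MUTEX_INITIALIZER;

	static void
	requeue_one(void *vp)
	{
		/* Placeholder for TAILQ_REMOVE + TAILQ_INSERT_TAIL on the list. */
		(void)vp;
	}

	static void
	flush_batch(struct batch *b)
	{
		size_t i;

		/* Called with b->lock held; one global acquisition per batch. */
		pthread_mutex_lock(&global_lock);
		for (i = 0; i < b->index; i++)
			requeue_one(b->tab[i]);
		pthread_mutex_unlock(&global_lock);
		b->index = 0;
	}

	static void
	defer_requeue(struct batch *b, void *vp)
	{
		pthread_mutex_lock(&b->lock);
		b->tab[b->index++] = vp;
		if (b->index == BATCH_SIZE)
			flush_batch(b);
		pthread_mutex_unlock(&b->lock);
	}

	int
	main(void)
	{
		static struct batch b = { .lock = PTHREAD_MUTEX_INITIALIZER };
		int dummy[BATCH_SIZE];
		size_t i;

		/* The eighth deferral triggers a single batched flush. */
		for (i = 0; i < BATCH_SIZE; i++)
			defer_requeue(&b, &dummy[i]);
		return (0);
	}

In the patch itself the flush threshold is VDBATCH_SIZE (8), the per-CPU
batch is reached through DPCPU_PTR(vd) while the thread is pinned, and the
batch locks also let vnode_fini() synchronize against a concurrent flush.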