From 0c236d3d528d5b6de5d70309841d2cd2405465fd Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Mon, 13 Jan 2020 02:39:41 +0000
Subject: [PATCH] vfs: per-cpu batched requeuing of free vnodes

Constant requeuing adds significant lock contention in certain
workloads. Lessen the problem by batching it.

Per-cpu areas are locked in order to synchronize against UMA freeing
memory.

vnode's v_mflag is converted to short to prevent the struct from
growing.

Sample result from an incremental make -s -j 104 bzImage on tmpfs:

stock:   122.38s user 1780.45s system 6242% cpu 30.480 total
patched: 144.84s user 985.90s system 4856% cpu 23.282 total

Reviewed by:	jeff
Tested by:	pho (in a larger patch, previous version)
Differential Revision:	https://reviews.freebsd.org/D22998
---
 sys/kern/vfs_subr.c | 120 +++++++++++++++++++++++++++++++++++++++++---
 sys/sys/vnode.h     |   3 +-
 2 files changed, 116 insertions(+), 7 deletions(-)

diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 18dc3d1e1e9..c3a6af4efc2 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -295,6 +295,16 @@ static int stat_rush_requests;	/* number of times I/O speeded up */
 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
     "Number of times I/O speeded up (rush requests)");
 
+#define	VDBATCH_SIZE 8
+struct vdbatch {
+	u_int index;
+	struct mtx lock;
+	struct vnode *tab[VDBATCH_SIZE];
+};
+DPCPU_DEFINE_STATIC(struct vdbatch, vd);
+
+static void	vdbatch_dequeue(struct vnode *vp);
+
 /*
  * When shutting down the syncer, run it at four times normal speed.
  */
@@ -552,6 +562,8 @@ vnode_init(void *mem, int size, int flags)
 	 */
 	rangelock_init(&vp->v_rl);
 
+	vp->v_dbatchcpu = NOCPU;
+
 	mtx_lock(&vnode_list_mtx);
 	TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
 	mtx_unlock(&vnode_list_mtx);
@@ -568,6 +580,7 @@ vnode_fini(void *mem, int size)
 	struct bufobj *bo;
 
 	vp = mem;
+	vdbatch_dequeue(vp);
 	mtx_lock(&vnode_list_mtx);
 	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
 	mtx_unlock(&vnode_list_mtx);
@@ -602,8 +615,9 @@ vnode_fini(void *mem, int size)
 static void
 vntblinit(void *dummy __unused)
 {
+	struct vdbatch *vd;
+	int cpu, physvnodes, virtvnodes;
 	u_int i;
-	int physvnodes, virtvnodes;
 
 	/*
 	 * Desiredvnodes is a function of the physical memory size and the
@@ -669,6 +683,12 @@ vntblinit(void *dummy __unused)
 	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
 		vnsz2log++;
 	vnsz2log--;
+
+	CPU_FOREACH(cpu) {
+		vd = DPCPU_ID_PTR((cpu), vd);
+		bzero(vd, sizeof(*vd));
+		mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
+	}
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
 
@@ -3199,6 +3219,98 @@ vholdnz(struct vnode *vp)
 #endif
 }
 
+static void __noinline
+vdbatch_process(struct vdbatch *vd)
+{
+	struct vnode *vp;
+	int i;
+
+	mtx_assert(&vd->lock, MA_OWNED);
+	MPASS(vd->index == VDBATCH_SIZE);
+
+	mtx_lock(&vnode_list_mtx);
+	for (i = 0; i < VDBATCH_SIZE; i++) {
+		vp = vd->tab[i];
+		TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
+		TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
+		MPASS(vp->v_dbatchcpu != NOCPU);
+		vp->v_dbatchcpu = NOCPU;
+	}
+	bzero(vd->tab, sizeof(vd->tab));
+	vd->index = 0;
+	mtx_unlock(&vnode_list_mtx);
+}
+
+static void
+vdbatch_enqueue(struct vnode *vp)
+{
+	struct vdbatch *vd;
+
+	ASSERT_VI_LOCKED(vp, __func__);
+	VNASSERT(!VN_IS_DOOMED(vp), vp,
+	    ("%s: deferring requeue of a doomed vnode", __func__));
+
+	if (vp->v_dbatchcpu != NOCPU) {
+		VI_UNLOCK(vp);
+		return;
+	}
+
+	/*
+	 * A hack: pin us to the current CPU so that we know what to put in
+	 * ->v_dbatchcpu.
+	 */
+	sched_pin();
+	vd = DPCPU_PTR(vd);
+	mtx_lock(&vd->lock);
+	MPASS(vd->index < VDBATCH_SIZE);
+	MPASS(vd->tab[vd->index] == NULL);
+	vp->v_dbatchcpu = curcpu;
+	vd->tab[vd->index] = vp;
+	vd->index++;
+	VI_UNLOCK(vp);
+	if (vd->index == VDBATCH_SIZE)
+		vdbatch_process(vd);
+	mtx_unlock(&vd->lock);
+	sched_unpin();
+}
+
+/*
+ * This routine must only be called for vnodes which are about to be
+ * deallocated. Supporting dequeue for arbitrary vnodes would require
+ * validating that the locked batch matches.
+ */
+static void
+vdbatch_dequeue(struct vnode *vp)
+{
+	struct vdbatch *vd;
+	int i;
+	short cpu;
+
+	VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp,
+	    ("%s: called for a used vnode\n", __func__));
+
+	cpu = vp->v_dbatchcpu;
+	if (cpu == NOCPU)
+		return;
+
+	vd = DPCPU_ID_PTR(cpu, vd);
+	mtx_lock(&vd->lock);
+	for (i = 0; i < vd->index; i++) {
+		if (vd->tab[i] != vp)
+			continue;
+		vp->v_dbatchcpu = NOCPU;
+		vd->index--;
+		vd->tab[i] = vd->tab[vd->index];
+		vd->tab[vd->index] = NULL;
+		break;
+	}
+	mtx_unlock(&vd->lock);
+	/*
+	 * Either we dequeued the vnode above or the target CPU beat us to it.
+	 */
+	MPASS(vp->v_dbatchcpu == NOCPU);
+}
+
 /*
  * Drop the hold count of the vnode. If this is the last reference to
  * the vnode we place it on the free list unless it has been vgone'd
@@ -3236,12 +3348,8 @@ vdrop_deactivate(struct vnode *vp)
 		mp->mnt_lazyvnodelistsize--;
 		mtx_unlock(&mp->mnt_listmtx);
 	}
-	mtx_lock(&vnode_list_mtx);
-	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
-	TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
-	mtx_unlock(&vnode_list_mtx);
 	atomic_add_long(&freevnodes, 1);
-	VI_UNLOCK(vp);
+	vdbatch_enqueue(vp);
 }
 
 void
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 088e81429d7..402e5648241 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -171,7 +171,8 @@ struct vnode {
 	u_int	v_usecount;			/* I ref count of users */
 	u_int	v_iflag;			/* i vnode flags (see below) */
 	u_int	v_vflag;			/* v vnode flags */
-	u_int	v_mflag;			/* l mnt-specific vnode flags */
+	u_short	v_mflag;			/* l mnt-specific vnode flags */
+	short	v_dbatchcpu;			/* i LRU requeue deferral batch */
 	int	v_writecount;			/* I ref count of writers or
 						   (negative) text users */
 	u_int	v_hash;
-- 
2.45.0
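
Appendix (not part of the patch): a minimal, standalone userspace sketch of
the batching idea, under simplified assumptions. All names below (struct
batch, global_lock, requeue_one, flush_batch, defer_requeue, BATCH_SIZE) are
invented for illustration and are not FreeBSD API; the committed code
additionally pins the thread with sched_pin(), records the owning CPU in
v_dbatchcpu so teardown can find the batch, and supports removal via
vdbatch_dequeue(). The sketch only shows the core trade: one acquisition of
the contended global list lock is amortized over BATCH_SIZE deferred
requeues.

	/*
	 * Illustrative sketch only (hypothetical names, not kernel code):
	 * defer LRU requeues into a small per-thread/per-CPU buffer and flush
	 * them under a single acquisition of the contended global lock.
	 */
	#include <pthread.h>
	#include <stddef.h>

	#define	BATCH_SIZE	8

	struct batch {
		pthread_mutex_t	lock;			/* protects index and tab */
		size_t		index;			/* number of buffered entries */
		void		*tab[BATCH_SIZE];	/* deferred requeue requests */
	};

	/* Stand-in for the global lock protecting the LRU list. */
	static pthread_mutex_t global_lock = PTHREAD_MUTEX_INITIALIZER;

	static void
	requeue_one(void *vp)
	{
		/* Placeholder for TAILQ_REMOVE + TAILQ_INSERT_TAIL on the list. */
		(void)vp;
	}

	static void
	flush_batch(struct batch *b)
	{
		size_t i;

		/* Called with b->lock held; one global acquisition per batch. */
		pthread_mutex_lock(&global_lock);
		for (i = 0; i < b->index; i++)
			requeue_one(b->tab[i]);
		pthread_mutex_unlock(&global_lock);
		b->index = 0;
	}

	static void
	defer_requeue(struct batch *b, void *vp)
	{
		pthread_mutex_lock(&b->lock);
		b->tab[b->index++] = vp;
		if (b->index == BATCH_SIZE)
			flush_batch(b);
		pthread_mutex_unlock(&b->lock);
	}

	int
	main(void)
	{
		static struct batch b = { .lock = PTHREAD_MUTEX_INITIALIZER };
		int dummy[BATCH_SIZE];
		size_t i;

		/* The eighth deferral triggers a single batched flush. */
		for (i = 0; i < BATCH_SIZE; i++)
			defer_requeue(&b, &dummy[i]);
		return (0);
	}

In the patch itself the flush threshold is VDBATCH_SIZE (8), the per-CPU
batch is reached through DPCPU_PTR(vd) while the thread is pinned, and the
batch locks also let vnode_fini() synchronize against a concurrent flush.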