2 * Copyright (c) 2015 Netflix, Inc.
4 * Derived from gs_rr.c:
5 * Copyright (c) 2009-2010 Fabio Checconi
6 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * A simple scheduler that just delays certain transactions by a certain
36 * amount. We collect all the transactions that are 'done' and put them on
37 * a queue. The queue is run through every so often and the transactions that
38 * have taken longer than the threshold delay are completed.
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
45 #include <sys/callout.h>
46 #include <sys/malloc.h>
47 #include <sys/module.h>
49 #include <sys/queue.h>
51 #include <sys/sysctl.h>
52 #include "gs_scheduler.h"
54 /* Useful constants */
/*
 * Bintime fraction equivalent to one microsecond (2^64 / 1000000).
 * Multiplied by a microsecond count and passed to bintime_addx()
 * to advance a struct bintime deadline (see g_delay_start()).
 */
55 #define BTFRAC_1US 18446744073709ULL /* 2^64 / 1000000 */
57 /* list of scheduler instances */
58 LIST_HEAD(g_scheds, g_delay_softc);
61 * Per device descriptor, holding the Round Robin list of queues
62 * accessing the disk, a reference to the geom, and the timer.
/*
 * NOTE(review): the "Round Robin list of queues" wording above is
 * inherited from gs_rr.c; this scheduler keeps a single FIFO bio
 * queue plus a self-rearming callout -- confirm and reword.
 */
64 struct g_delay_softc {
65 struct g_geom *sc_geom; /* geom we serve; used for sched lock/dispatch */
67 struct bio_queue_head sc_bioq; /* queue of pending requests */
68 struct callout sc_wait; /* timer for completing with delays */
71 int sc_in_flight; /* requests in the driver */
75 * parameters, config and stats
77 struct g_delay_params {
79 int bypass; /* bypass scheduling */
80 int units; /* how many instances */
81 int latency; /* how big a latency we are hoping for, in microseconds */
/*
 * NOTE(review): a quad "io" counter (I/Os delayed) is exported via
 * sysctl below as &me.io -- presumably declared in the part of this
 * struct elided from this view; verify against the full file.
 */
/*
 * Singleton parameter/statistics instance for this scheduler class,
 * exported below as gs_delay_me for use by the rest of gsched.
 * (Initializer body not visible in this chunk.)
 */
84 static struct g_delay_params me = {
90 struct g_delay_params *gs_delay_me = &me;
92 SYSCTL_DECL(_kern_geom_sched);
/* sysctl subtree: kern.geom.sched.delay.* */
93 static SYSCTL_NODE(_kern_geom_sched, OID_AUTO, delay, CTLFLAG_RW, 0,
94 "GEOM_SCHED DELAY stuff");
/* read-only view of the bypass flag */
95 SYSCTL_INT(_kern_geom_sched_delay, OID_AUTO, bypass, CTLFLAG_RD,
96 &me.bypass, 0, "Scheduler bypass");
/* read-only count of scheduler instances */
97 SYSCTL_INT(_kern_geom_sched_delay, OID_AUTO, units, CTLFLAG_RD,
98 &me.units, 0, "Scheduler instances");
/* tunable: minimum delay imposed on each I/O, microseconds (1/hz resolution) */
99 SYSCTL_INT(_kern_geom_sched_delay, OID_AUTO, latency, CTLFLAG_RW,
100 &me.latency, 0, "Minimum latency for requests, in microseconds (1/hz resolution)");
/*
 * Running count of delayed I/Os.
 * NOTE(review): the trailing "\n" in the description string below is
 * unusual for sysctl descriptions and probably unintended (cannot be
 * changed in a comment-only pass -- it is a runtime string).
 */
101 SYSCTL_QUAD(_kern_geom_sched_delay, OID_AUTO, io, CTLFLAG_RW,
102 &me.io, 0, "I/Os delayed\n");
/*
 * Class-wide initialization hook for the gsched framework.
 * Body not visible in this chunk (presumably a no-op -- verify).
 */
105 g_delay_init_class(void *data, void *priv)
/*
 * Class-wide teardown hook for the gsched framework.
 * Body not visible in this chunk (presumably a no-op -- verify).
 */
111 g_delay_fini_class(void *data, void *priv)
116 * Called on a request arrival, timeout or completion.
117 * Try to serve a request among those queued.
/*
 * Return the next bio whose release deadline has passed.  The
 * deadline is stored in bio_t0 (arrival uptime + me.latency, set in
 * g_delay_start()).  Early-return bodies are elided from this view.
 */
120 g_delay_next(void *data, int force)
122 struct g_delay_softc *sc = data;
/* peek at the oldest queued bio; the queue is kept in arrival order */
126 bp = bioq_first(&sc->sc_bioq);
131 * If the time isn't yet ripe for this bp to be let loose,
132 * then the time isn't ripe for any of its friends either
133 * since we insert in-order. Terminate if the bio hasn't
134 * aged appropriately. Note that there's pathology here
135 * such that we may be up to one tick early in releasing
136 * this I/O. We could implement this up to a tick late too
139 getbinuptime(&bt); /* BIO's bio_t0 is uptime */
/* head's deadline still in the future: release nothing this pass */
140 if (bintime_cmp(&bp->bio_t0, &bt, >))
145 * The bp has mellowed enough, let it through and update stats.
146 * If there's others, we'll catch them next time we get called.
150 bp = bioq_takefirst(&sc->sc_bioq);
155 * Called when a real request for disk I/O arrives.
156 * Locate the queue associated with the client.
157 * If the queue is the one we are anticipating for, reset its timeout;
158 * if the queue is not in the round robin list, insert it in the list.
159 * On any error, do not queue the request and return -1, the caller
160 * will take care of this request.
/*
 * NOTE(review): most of the comment above describes gs_rr.c's
 * anticipation/round-robin logic, not this scheduler.  Here we
 * simply stamp the bio with a release deadline (now + me.latency
 * microseconds) and append it to the single FIFO queue.  Returning
 * -1 hands the bio back to the caller unscheduled.
 */
163 g_delay_start(void *data, struct bio *bp)
165 struct g_delay_softc *sc = data;
/* (guard condition elided from this view -- presumably me.bypass; verify) */
168 return (-1); /* bypass the scheduler */
170 bp->bio_caller1 = sc;
/* bio_t0 becomes the earliest time g_delay_next() may release this bio */
171 getbinuptime(&bp->bio_t0); /* BIO's bio_t0 is uptime */
172 bintime_addx(&bp->bio_t0, BTFRAC_1US * me.latency);
175 * Keep the I/Os ordered. Lower layers will reorder as we release them down.
176 * We rely on this in g_delay_next() so that we delay all things equally. Even
177 * if we move to multiple queues to push stuff down the stack, we'll want to
178 * insert in order and let the lower layers do whatever reordering they want.
180 bioq_insert_tail(&sc->sc_bioq, bp);
/*
 * Periodic tick: run the dispatch loop under the scheduler lock so
 * any bios whose deadline has passed get released, then re-arm.
 */
186 g_delay_timeout(void *data)
188 struct g_delay_softc *sc = data;
190 g_sched_lock(sc->sc_geom);
191 g_sched_dispatch(sc->sc_geom);
192 g_sched_unlock(sc->sc_geom);
/* re-arm one tick out: fires every tick for the life of the instance */
193 callout_reset(&sc->sc_wait, 1, g_delay_timeout, sc);
197 * Module glue: allocate descriptor, initialize its fields.
200 g_delay_init(struct g_geom *geom)
202 struct g_delay_softc *sc;
/* M_WAITOK: may sleep; M_ZERO leaves all fields (incl. counters) zeroed */
204 sc = malloc(sizeof *sc, M_GEOM_SCHED, M_WAITOK | M_ZERO);
/* (sc->sc_geom assignment and the return appear elided from this view) */
206 bioq_init(&sc->sc_bioq);
207 callout_init(&sc->sc_wait, CALLOUT_MPSAFE);
/* start the per-tick dispatch timer immediately */
208 callout_reset(&sc->sc_wait, 1, g_delay_timeout, sc);
215 * Module glue -- drain the callout structure, destroy the
216 * hash table and its element, and free the descriptor.
219 g_delay_fini(void *data)
221 struct g_delay_softc *sc = data;
223 /* We're force drained before getting here */
225 /* Kick out timers */
/* callout_drain() waits for a concurrently-running g_delay_timeout() */
226 callout_drain(&sc->sc_wait);
228 free(sc, M_GEOM_SCHED);
232 * Called when the request under service terminates.
233 * Start the anticipation timer if needed.
/*
 * NOTE(review): the "anticipation timer" wording is inherited from
 * gs_rr.c; here a completion just kicks the dispatcher, since it may
 * have made queued bios eligible for release.
 */
236 g_delay_done(void *data, struct bio *bp)
238 struct g_delay_softc *sc = data;
242 g_sched_dispatch(sc->sc_geom);
/*
 * GEOM config-dump (XML) hook.  Body not visible in this chunk
 * (presumably empty or minimal -- verify).
 */
246 g_delay_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
247 struct g_consumer *cp, struct g_provider *pp)
/*
 * Method table registering this scheduler with the gsched framework;
 * DECLARE_GSCHED_MODULE() below hooks it into module load/unload.
 * (The gs_name/size initializers appear elided from this view.)
 */
251 static struct g_gsched g_delay = {
254 .gs_init = g_delay_init,
255 .gs_fini = g_delay_fini,
256 .gs_start = g_delay_start,
257 .gs_done = g_delay_done,
258 .gs_next = g_delay_next,
259 .gs_dumpconf = g_delay_dumpconf,
260 .gs_init_class = g_delay_init_class,
261 .gs_fini_class = g_delay_fini_class,
264 DECLARE_GSCHED_MODULE(delay, &g_delay);