/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2009-2010 Fabio Checconi
 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * A round-robin (RR) anticipatory scheduler, with per-client queues.
 *
 * The goal of this implementation is to improve throughput compared
 * to the pure elevator algorithm, and to ensure some fairness among
 * clients.
 *
 * Requests coming from the same client are put in the same queue.
 * We use anticipation to help reduce seeks, and each queue
 * is never served continuously for more than a given amount of
 * time or data.  Queues are then served in a round-robin fashion.
 *
 * Each queue can be in any of the following states:
 *     READY	immediately serve the first pending request;
 *     BUSY	one request is under service, wait for completion;
 *     IDLING	do not serve incoming requests immediately, unless
 *		they are "eligible" as defined later.
 *
 * Scheduling decisions are made by looking at the status of all
 * queues; the first queue in round-robin order is privileged.
 */
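/*
 * As a summary of the logic implemented below, the per-queue state
 * machine works roughly as follows (a sketch, not additional behavior):
 *
 *   READY  --(last pending request dispatched, anticipation on)--> BUSY
 *   BUSY   --(the in-flight request completes)-------------------> IDLING
 *   IDLING --(new request from the same client arrives)----------> READY
 *   IDLING --(wait_ms timer fires)--> queue released, RR moves on
 */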
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include "gs_scheduler.h"
/* possible states of the scheduler */
enum g_rr_state {
	G_QUEUE_READY = 0,	/* Ready to dispatch. */
	G_QUEUE_BUSY,		/* Waiting for a completion. */
	G_QUEUE_IDLING		/* Waiting for a new request. */
};
/* possible queue flags */
enum g_rr_flags {
	/* G_FLAG_COMPLETED means that the field q_slice_end is valid. */
	G_FLAG_COMPLETED = 1,	/* Completed a req. in the current budget. */
};
/*
 * Queue descriptor, containing reference count, scheduling
 * state, a queue of pending requests, configuration parameters.
 * Queues with pending request(s) and not under service are also
 * stored in a Round Robin (RR) list.
 */
struct g_rr_queue {
	struct g_rr_softc *q_sc;	/* link to the parent */

	enum g_rr_state	q_status;
	unsigned int	q_service;	/* service received so far */
	int		q_slice_end;	/* actual slice end time, in ticks */
	enum g_rr_flags	q_flags;	/* queue flags */
	struct bio_queue_head q_bioq;

	/* Scheduling parameters */
	unsigned int	q_budget;	/* slice size in bytes */
	unsigned int	q_slice_duration; /* slice size in ticks */
	unsigned int	q_wait_ticks;	/* wait time for anticipation */

	/* Stats to drive the various heuristics. */
	struct g_savg	q_thinktime;	/* Thinktime average. */
	struct g_savg	q_seekdist;	/* Seek distance average. */

	int		q_bionum;	/* Number of requests. */

	off_t		q_lastoff;	/* Last submitted req. offset. */
	int		q_lastsub;	/* Last submitted req. time. */

	/* Expiration deadline for an empty queue. */
	int		q_expire;

	TAILQ_ENTRY(g_rr_queue) q_tailq; /* RR list link field */
};
TAILQ_HEAD(g_rr_tailq, g_rr_queue);

/* list of scheduler instances */
LIST_HEAD(g_scheds, g_rr_softc);

/* Default quantum for RR between queues. */
#define	G_RR_DEFAULT_BUDGET	0x00800000
/*
 * Per device descriptor, holding the Round Robin list of queues
 * accessing the disk, a reference to the geom, and the timer.
 */
struct g_rr_softc {
	struct g_geom	*sc_geom;

	/*
	 * sc_active is the queue we are anticipating for.
	 * It is set only in g_rr_next(), and possibly cleared
	 * only in g_rr_next() or on a timeout.
	 * The active queue is never in the Round Robin list
	 * even if it has requests queued.
	 */
	struct g_rr_queue *sc_active;
	struct callout	sc_wait;	/* timer for sc_active */

	struct g_rr_tailq sc_rr_tailq;	/* the round-robin list */
	int		sc_nqueues;	/* number of queues */

	/* Statistics */
	int		sc_in_flight;	/* requests in the driver */

	LIST_ENTRY(g_rr_softc) sc_next;
};
/* Descriptor for bounded values, min and max are constant. */
struct x_bound {
	const int	x_min;
	int		x_cur;
	const int	x_max;
};
/*
 * parameters, config and stats
 */
struct g_rr_params {
	int	queues;			/* total number of queues */
	int	w_anticipate;		/* anticipate writes */
	int	bypass;			/* bypass the scheduler entirely */

	int	units;			/* how many instances */
	/* sc_head is used for debugging */
	struct g_scheds	sc_head;	/* first scheduler instance */

	struct x_bound queue_depth;	/* max parallel requests */
	struct x_bound wait_ms;		/* wait time, milliseconds */
	struct x_bound quantum_ms;	/* quantum size, milliseconds */
	struct x_bound quantum_kb;	/* quantum size, KiB (1024 bytes) */

	/* statistics */
	int	wait_hit;		/* success in anticipation */
	int	wait_miss;		/* failure in anticipation */
};
/*
 * Default parameters for the scheduler.  The quantum sizes target
 * an 80 MB/s disk; if the hardware is faster or slower, whichever
 * of the two budgets (time or service) runs out first takes effect:
 * the clients will still be isolated, but the fairness may be
 * limited.  A complete solution would involve the on-line
 * measurement of the actual disk throughput to derive these
 * parameters.  Or we may just choose to ignore service domain
 * fairness and accept what can be achieved with time-only budgets.
 */
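/*
 * Worked example (informal): the default service quantum is 8192 KiB,
 * i.e. 8 MiB per slice, which an 80 MB/s disk drains in roughly
 * 8388608 / 80e6 ~= 0.1 s.  That matches the default 100 ms time
 * quantum, so at the target speed both budgets expire together; on
 * faster disks the byte budget dominates, on slower ones the time
 * budget does.
 */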
static struct g_rr_params me = {
	.sc_head = LIST_HEAD_INITIALIZER(&me.sc_head),
	.w_anticipate =	1,
	.queue_depth =	{ 1,	1,	50 },
	.wait_ms =	{ 1,	10,	30 },
	.quantum_ms =	{ 1,	100,	500 },
	.quantum_kb =	{ 16,	8192,	65536 },
};

struct g_rr_params *gs_rr_me = &me;
SYSCTL_DECL(_kern_geom_sched);
static SYSCTL_NODE(_kern_geom_sched, OID_AUTO, rr, CTLFLAG_RW, 0,
    "GEOM_SCHED ROUND ROBIN stuff");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, units, CTLFLAG_RD,
    &me.units, 0, "Scheduler instances");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queues, CTLFLAG_RD,
    &me.queues, 0, "Total rr queues");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_ms, CTLFLAG_RW,
    &me.wait_ms.x_cur, 0, "Wait time milliseconds");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_ms, CTLFLAG_RW,
    &me.quantum_ms.x_cur, 0, "Quantum size milliseconds");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, bypass, CTLFLAG_RW,
    &me.bypass, 0, "Bypass scheduler");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, w_anticipate, CTLFLAG_RW,
    &me.w_anticipate, 0, "Do anticipation on writes");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_kb, CTLFLAG_RW,
    &me.quantum_kb.x_cur, 0, "Quantum size Kbytes");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queue_depth, CTLFLAG_RW,
    &me.queue_depth.x_cur, 0, "Maximum simultaneous requests");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_hit, CTLFLAG_RW,
    &me.wait_hit, 0, "Hits in anticipation");
SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_miss, CTLFLAG_RW,
    &me.wait_miss, 0, "Misses in anticipation");
#ifdef DEBUG_QUEUES
/* print the status of a queue */
static void
gs_rr_dump_q(struct g_rr_queue *qp, int index)
{
	struct bio *bp;
	int l = 0;

	TAILQ_FOREACH(bp, &(qp->q_bioq.queue), bio_queue) {
		l++;
	}
	printf("--- rr queue %d %p status %d len %d ---\n",
	    index, qp, qp->q_status, l);
}

/*
 * Dump the scheduler status when writing to this sysctl variable.
 * XXX right now we only dump the status of the last instance created.
 * Not a severe issue because this is only for debugging.
 */
static int
gs_rr_sysctl_status(SYSCTL_HANDLER_ARGS)
{
	struct g_rr_softc *sc;
	int error, val = 0;

	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);

	printf("called %s\n", __FUNCTION__);

	LIST_FOREACH(sc, &me.sc_head, sc_next) {
		int i, tot = 0;

		printf("--- sc %p active %p nqueues %d "
		    "callout %d in_flight %d ---\n",
		    sc, sc->sc_active, sc->sc_nqueues,
		    callout_active(&sc->sc_wait),
		    sc->sc_in_flight);
		for (i = 0; i < G_RR_HASH_SIZE; i++) {
			struct g_rr_queue *qp;

			LIST_FOREACH(qp, &sc->sc_hash[i], q_hash) {
				gs_rr_dump_q(qp, tot++);
			}
		}
	}
	return (0);
}

SYSCTL_PROC(_kern_geom_sched_rr, OID_AUTO, status,
    CTLTYPE_UINT | CTLFLAG_RW,
    0, sizeof(int), gs_rr_sysctl_status, "I", "status");

#endif	/* DEBUG_QUEUES */
/*
 * Get a bounded value, optionally convert to a min of t_min ticks.
 */
static int
get_bounded(struct x_bound *v, int t_min)
{
	int x;

	x = v->x_cur;
	if (x < v->x_min)
		x = v->x_min;
	else if (x > v->x_max)
		x = v->x_max;
	if (t_min) {
		x = x * hz / 1000;	/* convert to ticks */
		if (x < t_min)
			x = t_min;
	}
	return (x);
}
304 * Get a reference to the queue for bp, using the generic
305 * classification mechanism.
307 static struct g_rr_queue *
308 g_rr_queue_get(struct g_rr_softc *sc, struct bio *bp)
311 return (g_sched_get_class(sc->sc_geom, bp));
static int
g_rr_init_class(void *data, void *priv)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp = priv;

	bioq_init(&qp->q_bioq);

	/*
	 * Set the initial parameters for the client:
	 * slice size in bytes and ticks, and wait ticks.
	 * Right now these are constant, but we could have
	 * autoconfiguration code to adjust the values based on
	 * the actual workload.
	 */
	qp->q_budget = 1024 * get_bounded(&me.quantum_kb, 0);
	qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
	qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);

	qp->q_sc = sc;		/* link to the parent */
	qp->q_sc->sc_nqueues++;
	me.queues++;

	return (0);
}
/*
 * Release a reference to the queue.
 */
static void
g_rr_queue_put(struct g_rr_queue *qp)
{

	g_sched_put_class(qp->q_sc->sc_geom, qp);
}
static void
g_rr_fini_class(void *data, void *priv)
{
	struct g_rr_queue *qp = priv;

	KASSERT(bioq_first(&qp->q_bioq) == NULL,
	    ("released nonempty queue"));
	qp->q_sc->sc_nqueues--;
	me.queues--;
}
/*
 * The queue is expired when it has consumed its byte budget, or when
 * it has completed a request in this slice and the slice end time is
 * past.
 */
static inline int
g_rr_queue_expired(struct g_rr_queue *qp)
{

	if (qp->q_service >= qp->q_budget)
		return (1);

	if ((qp->q_flags & G_FLAG_COMPLETED) &&
	    ticks - qp->q_slice_end >= 0)
		return (1);

	return (0);
}
/*
 * Decide whether it is worth idling (anticipating) on this queue
 * after dispatching its last pending request, bp.
 */
static inline int
g_rr_should_anticipate(struct g_rr_queue *qp, struct bio *bp)
{
	int wait = get_bounded(&me.wait_ms, 2);

	if (!me.w_anticipate && (bp->bio_cmd == BIO_WRITE))
		return (0);

	if (g_savg_valid(&qp->q_thinktime) &&
	    g_savg_read(&qp->q_thinktime) > wait)
		return (0);

	if (g_savg_valid(&qp->q_seekdist) &&
	    g_savg_read(&qp->q_seekdist) > 8192)
		return (0);

	return (1);
}
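/*
 * Informal example of the heuristic above: a client that issues its
 * next read within wait_ms of the previous completion and stays
 * within ~8 KiB of its last request looks sequential and synchronous,
 * so idling for it is likely to pay off; a client with a large
 * measured thinktime or seek distance fails one of the checks and is
 * served without anticipation.
 */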
/*
 * Called on a request arrival, timeout or completion.
 * Try to serve a request among those queued.
 */
static struct bio *
g_rr_next(void *data, int force)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;
	struct bio *bp, *next;
	int expired;

	qp = sc->sc_active;
	if (me.bypass == 0 && !force) {
		if (sc->sc_in_flight >= get_bounded(&me.queue_depth, 0))
			return (NULL);

		/* Try with the queue under service first. */
		if (qp != NULL && qp->q_status != G_QUEUE_READY) {
			/*
			 * Queue is anticipating, ignore request.
			 * We should check that we are not past
			 * the timeout, but in that case the timeout
			 * will fire immediately afterwards so we
			 * don't bother.
			 */
			return (NULL);
		}
	} else if (qp != NULL && qp->q_status != G_QUEUE_READY) {
		g_rr_queue_put(qp);
		sc->sc_active = qp = NULL;
	}

	/*
	 * No queue under service, look for the first in RR order.
	 * If we find it, select it as sc_active, clear its service
	 * and reset the completion flag for the new slice.
	 */
	if (qp == NULL) {
		qp = TAILQ_FIRST(&sc->sc_rr_tailq);
		if (qp == NULL)
			return (NULL);	/* no queues at all, return */
		/* otherwise select the new queue for service. */
		TAILQ_REMOVE(&sc->sc_rr_tailq, qp, q_tailq);
		sc->sc_active = qp;
		qp->q_service = 0;
		qp->q_flags &= ~G_FLAG_COMPLETED;
	}

	bp = bioq_takefirst(&qp->q_bioq);	/* surely not NULL */
	qp->q_service += bp->bio_length;	/* charge the service */

	/*
	 * The request at the head of the active queue is always
	 * dispatched, and g_rr_next() will be called again
	 * immediately.
	 * We need to prepare for what to do next:
	 *
	 * 1. have we reached the end of the (time or service) slice ?
	 *    If so, clear sc_active and possibly requeue the previous
	 *    active queue if it has more requests pending;
	 * 2. do we have more requests in sc_active ?
	 *    If yes, do not anticipate, as g_rr_next() will run again;
	 *    if no, decide whether or not to anticipate depending
	 *    on reads or writes (e.g., anticipate only on reads).
	 */
	expired = g_rr_queue_expired(qp);	/* are we expired ? */
	next = bioq_first(&qp->q_bioq);		/* do we have one more ? */
	if (expired) {
		sc->sc_active = NULL;
		/* Either requeue or release reference. */
		if (next != NULL)
			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
		else
			g_rr_queue_put(qp);
	} else if (next != NULL) {
		qp->q_status = G_QUEUE_READY;
	} else {
		if (!force && g_rr_should_anticipate(qp, bp)) {
			/* anticipate */
			qp->q_status = G_QUEUE_BUSY;
		} else {
			/* do not anticipate, release reference */
			g_rr_queue_put(qp);
			sc->sc_active = NULL;
		}
	}
	/* If sc_active != NULL, its q_status is always correct. */

	sc->sc_in_flight++;

	return (bp);
}
/* Update the average thinktime for the queue. */
static void
g_rr_update_thinktime(struct g_rr_queue *qp)
{
	int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2);

	if (qp->q_sc->sc_active != qp)
		return;

	qp->q_lastsub = ticks;
	delta = (delta > 2 * wait) ? 2 * wait : delta;
	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_thinktime, delta);
}
/* Update the average seek distance for the queue. */
static void
g_rr_update_seekdist(struct g_rr_queue *qp, struct bio *bp)
{
	off_t dist;

	if (qp->q_lastoff > bp->bio_offset)
		dist = qp->q_lastoff - bp->bio_offset;
	else
		dist = bp->bio_offset - qp->q_lastoff;

	if (dist > (8192 * 8))
		dist = 8192 * 8;	/* cap the sample at 64 KiB */

	qp->q_lastoff = bp->bio_offset + bp->bio_length;

	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_seekdist, dist);
}
/*
 * Called when a real request for disk I/O arrives.
 * Locate the queue associated with the client.
 * If the queue is the one we are anticipating for, reset its timeout;
 * if the queue is not in the round robin list, insert it in the list.
 * On any error, do not queue the request and return -1, the caller
 * will take care of this request.
 */
static int
g_rr_start(void *data, struct bio *bp)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;

	if (me.bypass)
		return (-1);	/* bypass the scheduler */

	/* Get the queue for the request. */
	qp = g_rr_queue_get(sc, bp);
	if (qp == NULL)
		return (-1);	/* allocation failed, tell upstream */

	if (bioq_first(&qp->q_bioq) == NULL) {
		/*
		 * We are inserting into an empty queue.
		 * Reset its state if it is sc_active,
		 * otherwise insert it in the RR list.
		 */
		if (qp == sc->sc_active) {
			qp->q_status = G_QUEUE_READY;
			callout_stop(&sc->sc_wait);
		} else {
			g_sched_priv_ref(qp);
			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
		}
	}

	/* Decaying request count; the heuristics kick in above 7. */
	qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3);

	g_rr_update_thinktime(qp);
	g_rr_update_seekdist(qp, bp);

	/* Inherit the reference returned by g_rr_queue_get(). */
	bp->bio_caller1 = qp;
	bioq_disksort(&qp->q_bioq, bp);

	return (0);
}
/*
 * Callout executed when a queue times out anticipating a new request.
 */
static void
g_rr_wait_timeout(void *data)
{
	struct g_rr_softc *sc = data;
	struct g_geom *geom = sc->sc_geom;

	g_sched_lock(geom);
	/*
	 * We can race with other events, so check if
	 * sc_active is still valid.
	 */
	if (sc->sc_active != NULL) {
		/* Release the reference to the queue. */
		g_rr_queue_put(sc->sc_active);
		sc->sc_active = NULL;
		me.wait_hit--;
		me.wait_miss++;	/* record the miss */
	}
	g_sched_dispatch(geom);
	g_sched_unlock(geom);
}
/*
 * Module glue: allocate descriptor, initialize its fields.
 */
static void *
g_rr_init(struct g_geom *geom)
{
	struct g_rr_softc *sc;

	/* XXX check whether we can sleep */
	sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO);
	if (sc == NULL)
		return (NULL);	/* let the caller handle the failure */
	sc->sc_geom = geom;
	TAILQ_INIT(&sc->sc_rr_tailq);
	callout_init(&sc->sc_wait, 1);
	LIST_INSERT_HEAD(&me.sc_head, sc, sc_next);
	me.units++;

	return (sc);
}
/*
 * Module glue -- drain the callout structure and free the
 * per-device descriptor.
 */
static void
g_rr_fini(void *data)
{
	struct g_rr_softc *sc = data;

	callout_drain(&sc->sc_wait);
	KASSERT(sc->sc_active == NULL, ("still a queue under service"));
	KASSERT(TAILQ_EMPTY(&sc->sc_rr_tailq), ("still scheduled queues"));

	LIST_REMOVE(sc, sc_next);
	me.units--;
	free(sc, M_GEOM_SCHED);
}
/*
 * Called when the request under service terminates.
 * Start the anticipation timer if needed.
 */
static void
g_rr_done(void *data, struct bio *bp)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;

	sc->sc_in_flight--;

	qp = bp->bio_caller1;

	/*
	 * When the first request for this queue completes, update the
	 * duration and end of the slice.  We do not do it when the
	 * slice starts to avoid charging to the queue the time of the
	 * first seek.
	 */
	if (!(qp->q_flags & G_FLAG_COMPLETED)) {
		qp->q_flags |= G_FLAG_COMPLETED;
		/*
		 * Recompute the slice duration, in case we want
		 * to make it adaptive.  This is not used right now.
		 * XXX should we do the same for q_budget and q_wait_ticks ?
		 */
		qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
		qp->q_slice_end = ticks + qp->q_slice_duration;
	}

	if (qp == sc->sc_active && qp->q_status == G_QUEUE_BUSY) {
		/* The queue is trying anticipation, start the timer. */
		qp->q_status = G_QUEUE_IDLING;
		/* may make this adaptive */
		qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);
		me.wait_hit++;	/* count a hit; the timeout undoes it */
		callout_reset(&sc->sc_wait, qp->q_wait_ticks,
		    g_rr_wait_timeout, sc);
	} else
		g_sched_dispatch(sc->sc_geom);

	/* Release a reference to the queue. */
	g_rr_queue_put(qp);
}
static void
g_rr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{

	if (indent == NULL) {	/* plaintext */
		sbuf_printf(sb, " units %d queues %d",
		    me.units, me.queues);
	}
}
static struct g_gsched g_rr = {
	.gs_name = "rr",
	.gs_priv_size = sizeof(struct g_rr_queue),
	.gs_init = g_rr_init,
	.gs_fini = g_rr_fini,
	.gs_start = g_rr_start,
	.gs_done = g_rr_done,
	.gs_next = g_rr_next,
	.gs_dumpconf = g_rr_dumpconf,
	.gs_init_class = g_rr_init_class,
	.gs_fini_class = g_rr_fini_class,
};

DECLARE_GSCHED_MODULE(rr, &g_rr);
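/*
 * Usage sketch: once the module is loaded, the algorithm registers
 * under the name "rr" and its tunables appear under the
 * kern.geom.sched.rr sysctl tree declared above, e.g.:
 *
 *	# sysctl kern.geom.sched.rr.wait_ms=10
 *	# sysctl kern.geom.sched.rr.quantum_kb=8192
 *
 * See gsched(8) for how to attach a scheduler to a disk provider.
 */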