/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2009-2010 Fabio Checconi
 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Main control module for geom-based disk schedulers ('sched').
 *
 * A 'sched' node is typically inserted transparently between
 * an existing provider pp and its original geom gp,
 * using the command "geom sched insert <provider>" and
 * resulting in the following topology:
 *
 *	[pp --> sched_gp --> cp]   [new_pp --> gp ... ]
 *
 * Deletion with "geom sched destroy <provider>.sched." restores the
 * original chain. The normal "geom sched create <provider>"
 * is also supported.
 *
 * Internally, the 'sched' node uses the following data structures:
 *
 *   geom{}         g_sched_softc{}      g_gsched{}
 * +----------+    +---------------+   +-------------+
 * |  softc *-|--->| sc_gsched   *-|-->|  gs_init    |
 * |  ...     |    |               |   |  gs_fini    |
 * |          |    | [ hash table] |   |  gs_start   |
 * +----------+    |               |   |  ...        |
 *                 |               |   +-------------+
 *                 |               |
 *                 |               |     g_*_softc{}
 *                 |               |   +-------------+
 *                 | sc_data     *-|-->|             |
 *                 +---------------+   | algorithm-  |
 *                                     | specific    |
 *                                     +-------------+
 *
 * A g_sched_softc{} is created with a "geom sched insert" call.
 * In turn this instantiates a specific scheduling algorithm,
 * which sets sc_gsched to point to the algorithm callbacks,
 * and calls gs_init() to create the g_*_softc{}.
 * The other callbacks (gs_start, gs_next, ...) are invoked
 * as requests flow through the node.
 *
 * g_sched_softc{} is defined in g_sched.h and mostly used here;
 * g_gsched{}, and the gs_* callbacks, are documented in gs_scheduler.h;
 * g_*_softc{} is defined/implemented by each algorithm (gs_*.c).
 *
 * DATA MOVING
 * When a bio is received on the provider, it goes to
 * g_sched_start(), which calls gs_start() to initially queue it;
 * then we call g_sched_dispatch(), which loops around gs_next()
 * to select zero or more bio's to be sent downstream.
 *
 * g_sched_dispatch() can also be called as a result of a timeout,
 * e.g. when doing anticipation or pacing requests.
 *
 * When a bio comes back, it goes to g_sched_done(), which in turn
 * calls gs_done(). The latter does any necessary housekeeping in
 * the scheduling algorithm, and may decide to call g_sched_dispatch()
 * to send more bio's downstream.
 *
 * If an algorithm needs per-flow queues, these are created by
 * calling gs_init_class() and destroyed with gs_fini_class();
 * they are also inserted in the hash table implemented in
 * the g_sched_softc{}.
 *
 * If an algorithm is replaced, or a transparently-inserted node is
 * removed with "geom sched destroy", we need to remove all references
 * to the g_*_softc{} and g_sched_softc{} from the bio's still in
 * the scheduler. g_sched_forced_dispatch() helps doing this.
 * XXX need to explain better.
 */
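/*
 * Example (a sketch, not part of the original file): the skeleton of a
 * scheduling algorithm as seen from this module. A trivial FIFO would
 * provide a g_gsched{} descriptor whose callbacks queue bios in
 * gs_start() and hand them back from gs_next(). All "gs_fifo" names
 * are hypothetical; the authoritative callback contract is documented
 * in gs_scheduler.h.
 */
#if 0
struct gs_fifo_softc {
	struct g_geom		*sc_geom;
	struct bio_queue_head	 sc_bioq;	/* the single FIFO queue */
};

static void *
gs_fifo_init(struct g_geom *gp)
{
	struct gs_fifo_softc *sc;

	sc = malloc(sizeof(*sc), M_GEOM_SCHED, M_WAITOK | M_ZERO);
	sc->sc_geom = gp;
	bioq_init(&sc->sc_bioq);
	return (sc);
}

static void
gs_fifo_fini(void *data)
{

	free(data, M_GEOM_SCHED);
}

static int
gs_fifo_start(void *data, struct bio *bp)
{
	struct gs_fifo_softc *sc = data;

	bioq_insert_tail(&sc->sc_bioq, bp);
	return (0);	/* queued; nonzero would mean "pass it down" */
}

static struct bio *
gs_fifo_next(void *data, int force)
{
	struct gs_fifo_softc *sc = data;

	/* 'force' asks us to drain even requests we would like to hold. */
	return (bioq_takefirst(&sc->sc_bioq));
}

static void
gs_fifo_done(void *data, struct bio *bp)
{
	/* Nothing to do for a FIFO. */
}

static struct g_gsched g_gsched_fifo = {
	.gs_name = "fifo",
	.gs_priv_size = 0,	/* no per-flow state, no hash table */
	.gs_init = gs_fifo_init,
	.gs_fini = gs_fifo_fini,
	.gs_start = gs_fifo_start,
	.gs_next = gs_fifo_next,
	.gs_done = gs_fifo_done,
};
#endif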
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/limits.h>
#include <sys/hash.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/proc.h>		/* we access curthread */
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include "gs_scheduler.h"
#include "g_sched.h"		/* geom hooks */
/*
 * Size of the per-geom hash table storing traffic classes.
 * We may decide to change it at a later time; it has no ABI
 * implications as it is only used for run-time allocations.
 */
#define	G_SCHED_HASH_SIZE	32
static int g_sched_destroy(struct g_geom *gp, boolean_t force);
static int g_sched_destroy_geom(struct gctl_req *req,
    struct g_class *mp, struct g_geom *gp);
static void g_sched_config(struct gctl_req *req, struct g_class *mp,
    const char *verb);
static struct g_geom *g_sched_taste(struct g_class *mp,
    struct g_provider *pp, int flags __unused);
static void g_sched_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_sched_init(struct g_class *mp);
static void g_sched_fini(struct g_class *mp);
static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data,
    int fflag, struct thread *td);
struct g_class g_sched_class = {
	.name = G_SCHED_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_sched_config,
	.taste = g_sched_taste,
	.destroy_geom = g_sched_destroy_geom,
	.init = g_sched_init,
	.ioctl = g_sched_ioctl,
	.fini = g_sched_fini,
};

MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures");
/*
 * Global variables describing the state of the geom_sched module.
 * There is only one static instance of this structure.
 */
LIST_HEAD(gs_list, g_gsched);	/* type, link field */
struct geom_sched_vars {
	struct mtx	gs_mtx;
	struct gs_list	gs_scheds;	/* list of algorithms */
	u_int		gs_debug;
	u_int		gs_sched_count;	/* how many algorithms ? */
	u_int		gs_patched;	/* g_io_request was patched */

	u_int		gs_initialized;
	u_int		gs_expire_secs;	/* expiration of hash entries */

	struct bio_queue_head gs_pending;
	u_int		gs_npending;

	/* The following are for stats, usually protected by gs_mtx. */
	u_long		gs_requests;	/* total requests */
	u_long		gs_done;	/* total done */
	u_int		gs_in_flight;	/* requests in flight */
	u_int		gs_writes_in_flight;
	u_int		gs_bytes_in_flight;
	u_int		gs_write_bytes_in_flight;

	char		gs_names[256];	/* names of schedulers */
};

static struct geom_sched_vars me = {
	.gs_expire_secs = 10,
};
SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
    "GEOM_SCHED stuff");

SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD,
    &me.gs_write_bytes_in_flight, 0, "Write bytes in flight");
SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD,
    &me.gs_bytes_in_flight, 0, "Bytes in flight");
SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD,
    &me.gs_writes_in_flight, 0, "Write requests in flight");
SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD,
    &me.gs_in_flight, 0, "Requests in flight");

SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD,
    &me.gs_done, 0, "Total done");
SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD,
    &me.gs_requests, 0, "Total requests");

SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD,
    &me.gs_names, 0, "Algorithm names");
SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD,
    &me.gs_sched_count, 0, "Number of algorithms");
SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW,
    &me.gs_debug, 0, "Debug level");
SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW,
    &me.gs_expire_secs, 0, "Expire time in seconds");
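/*
 * Illustrative note (not in the original file): the OIDs above live
 * under kern.geom.sched, so e.g. "sysctl kern.geom.sched.algorithms"
 * lists the loaded algorithms and "sysctl kern.geom.sched.debug=1"
 * raises the debug level at run time.
 */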
/*
 * g_sched calls the scheduler algorithms with this lock held.
 * The locking functions are exposed so the scheduler algorithms can also
 * protect themselves e.g. when running a callout handler.
 */
void
g_sched_lock(struct g_geom *gp)
{
	struct g_sched_softc *sc = gp->softc;

	mtx_lock(&sc->sc_mtx);
}

void
g_sched_unlock(struct g_geom *gp)
{
	struct g_sched_softc *sc = gp->softc;

	mtx_unlock(&sc->sc_mtx);
}
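/*
 * Example (a sketch, not part of the original file): an algorithm
 * running a callout handler can use the helpers above to serialize
 * with the main dispatch path. "gs_foo_softc" and "gs_foo_timeout"
 * are hypothetical names.
 */
#if 0
struct gs_foo_softc {
	struct g_geom	*sc_geom;
	struct callout	 sc_wait;
};

static void
gs_foo_timeout(void *data)
{
	struct gs_foo_softc *sc = data;

	g_sched_lock(sc->sc_geom);	/* same lock held around gs_* calls */
	g_sched_dispatch(sc->sc_geom);	/* push out whatever is ready */
	g_sched_unlock(sc->sc_geom);
}
#endif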
/*
 * Support functions to handle references to the module,
 * which are coming from devices using this scheduler.
 */
static inline void
g_gsched_ref(struct g_gsched *gsp)
{

	atomic_add_int(&gsp->gs_refs, 1);
}

static inline void
g_gsched_unref(struct g_gsched *gsp)
{

	atomic_add_int(&gsp->gs_refs, -1);
}
/*
 * Update the stats when this request is done.
 */
static void
g_sched_update_stats(struct bio *bio)
{

	me.gs_done++;
	me.gs_in_flight--;
	me.gs_bytes_in_flight -= bio->bio_length;
	if (bio->bio_cmd == BIO_WRITE) {
		me.gs_writes_in_flight--;
		me.gs_write_bytes_in_flight -= bio->bio_length;
	}
}
/*
 * Dispatch any pending request.
 */
static void
g_sched_forced_dispatch(struct g_geom *gp)
{
	struct g_sched_softc *sc = gp->softc;
	struct g_gsched *gsp = sc->sc_gsched;
	struct bio *bp;

	KASSERT(mtx_owned(&sc->sc_mtx),
	    ("sc_mtx not owned during forced dispatch"));

	while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL)
		g_io_request(bp, LIST_FIRST(&gp->consumer));
}
/*
 * The main dispatch loop, called either here after the start
 * routine, or by scheduling algorithms when they receive a timeout
 * or a 'done' notification. Does not share code with the forced
 * dispatch path, since the gs_done() callback can call us.
 */
void
g_sched_dispatch(struct g_geom *gp)
{
	struct g_sched_softc *sc = gp->softc;
	struct g_gsched *gsp = sc->sc_gsched;
	struct bio *bp;

	KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch"));

	if ((sc->sc_flags & G_SCHED_FLUSHING))
		return;

	while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL)
		g_io_request(bp, LIST_FIRST(&gp->consumer));
}
/*
 * Recent (8.0 and above) versions of FreeBSD have support to
 * register classifiers of disk requests. The classifier is
 * invoked by g_io_request(), and stores the information into
 * bp->bio_classifier1.
 *
 * Support for older versions, which is left here only for
 * documentation purposes, relies on two hacks:
 * 1. classification info is written into the bio_caller1
 *    field of the topmost node in the bio chain. This field
 *    is rarely used, but this module is incompatible with
 *    those that use bio_caller1 for other purposes,
 *    such as ZFS and gjournal;
 * 2. g_io_request() is patched in-memory when the module is
 *    loaded, so that the function calls a classifier as its
 *    first thing. g_io_request() is restored when the module
 *    is unloaded. This functionality is only supported for
 *    x86 and amd64; other architectures need source code changes.
 */
/*
 * Lookup the identity of the issuer of the original request.
 * In the current implementation we use the curthread of the
 * issuer, but different mechanisms may be implemented later,
 * so we do not make assumptions on the return value, which for
 * us is just an opaque identifier.
 */
static inline u_long
g_sched_classify(struct bio *bp)
{

	/* we have classifier fields in the struct bio */
	return ((u_long)bp->bio_classifier1);
}
/* Return the hash chain for the given key. */
static inline struct g_hash *
g_sched_hash(struct g_sched_softc *sc, u_long key)
{

	return (&sc->sc_hash[key & sc->sc_mask]);
}
/*
 * Helper function for the child classes, which takes
 * a geom and a bio and returns the private descriptor
 * associated to the request. This involves fetching
 * the classification field and [al]locating the
 * corresponding entry in the hash table.
 */
void *
g_sched_get_class(struct g_geom *gp, struct bio *bp)
{
	struct g_sched_softc *sc;
	struct g_sched_class *gsc;
	struct g_gsched *gsp;
	struct g_hash *bucket;
	u_long key;

	sc = gp->softc;
	key = g_sched_classify(bp);
	bucket = g_sched_hash(sc, key);
	LIST_FOREACH(gsc, bucket, gsc_clist) {
		if (key == gsc->gsc_key) {
			gsc->gsc_refs++;
			return (gsc->gsc_priv);
		}
	}

	/* Not found, create a new entry. */
	gsp = sc->sc_gsched;
	gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size,
	    M_GEOM_SCHED, M_NOWAIT | M_ZERO);
	if (gsc == NULL)
		return (NULL);

	if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
		free(gsc, M_GEOM_SCHED);
		return (NULL);
	}

	gsc->gsc_refs = 2;	/* 1 for the hash table, 1 for the caller. */
	gsc->gsc_key = key;
	LIST_INSERT_HEAD(bucket, gsc, gsc_clist);

	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;

	return (gsc->gsc_priv);
}
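/*
 * Example (a sketch, not part of the original file): an algorithm's
 * gs_start() callback would typically look up the per-flow queue with
 * g_sched_get_class(), and release the reference from gs_done() via
 * g_sched_put_class(). "gs_foo_queue" and the surrounding names are
 * hypothetical.
 */
#if 0
static int
gs_foo_start(void *data, struct bio *bp)
{
	struct gs_foo_softc *sc = data;
	struct gs_foo_queue *q;

	q = g_sched_get_class(sc->sc_geom, bp);	/* ref'd entry, may alloc */
	if (q == NULL)
		return (-1);	/* tell g_sched_start() to bypass us */
	bp->bio_caller1 = q;	/* remember the queue for gs_done() */
	bioq_insert_tail(&q->q_bioq, bp);
	return (0);
}
#endif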
/*
 * Release a reference to the per-client descriptor,
 * and destroy it if this was the last one.
 */
void
g_sched_put_class(struct g_geom *gp, void *priv)
{
	struct g_sched_class *gsc;
	struct g_sched_softc *sc;

	gsc = g_sched_priv2class(priv);
	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;

	if (--gsc->gsc_refs > 0)
		return;

	sc = gp->softc;
	sc->sc_gsched->gs_fini_class(sc->sc_data, priv);

	LIST_REMOVE(gsc, gsc_clist);
	free(gsc, M_GEOM_SCHED);
}
static void
g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
    struct g_gsched *gsp, void *data)
{
	struct g_sched_class *cp, *cp2;
	int i;

	if (!hp)
		return;

	if (data && gsp->gs_hash_unref)
		gsp->gs_hash_unref(data);

	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
		LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
			g_sched_put_class(gp, cp->gsc_priv);
	}

	hashdestroy(hp, M_GEOM_SCHED, mask);
}
static struct g_hash *
g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags)
{
	struct g_hash *hash;

	if (gsp->gs_priv_size == 0)
		return (NULL);

	hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags);

	return (hash);
}
static void
g_sched_flush_classes(struct g_geom *gp)
{
	struct g_sched_softc *sc;
	struct g_sched_class *cp, *cp2;
	int i;

	sc = gp->softc;

	if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0)
		return;

	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
		LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) {
			if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0)
				g_sched_put_class(gp, cp->gsc_priv);
		}
	}

	sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz;
}
/*
 * Wait for the completion of any outstanding request. To ensure
 * that this does not take forever, the caller has to make sure that
 * no new request enters the scheduler before calling us.
 *
 * Must be called with the gp mutex held and topology locked.
 */
static int
g_sched_wait_pending(struct g_geom *gp)
{
	struct g_sched_softc *sc = gp->softc;
	int endticks = ticks + hz;

	g_topology_assert();

	while (sc->sc_pending && endticks - ticks >= 0)
		msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);

	return (sc->sc_pending ? ETIMEDOUT : 0);
}
static int
g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp)
{
	struct g_sched_softc *sc = gp->softc;
	int error;

	/* Set the flushing flag: new bios will not enter the scheduler. */
	sc->sc_flags |= G_SCHED_FLUSHING;

	g_sched_forced_dispatch(gp);
	error = g_sched_wait_pending(gp);
	if (error)
		goto failed;

	/* No more requests pending or in flight from the old gsp. */

	g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data);
	sc->sc_hash = NULL;

	/*
	 * Avoid deadlock here by releasing the gp mutex and reacquiring
	 * it once done. It should be safe, since no reconfiguration or
	 * destruction can take place due to the geom topology lock; no
	 * new request can use the current sc_data since we flagged the
	 * geom as being flushed.
	 */
	g_sched_unlock(gp);
	gsp->gs_fini(sc->sc_data);
	g_sched_lock(gp);

	sc->sc_gsched = NULL;
	sc->sc_data = NULL;
	g_gsched_unref(gsp);

failed:
	sc->sc_flags &= ~G_SCHED_FLUSHING;

	return (error);
}
static int
g_sched_remove(struct g_geom *gp, struct g_gsched *gsp)
{
	int error;

	g_sched_lock(gp);
	error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */
	g_sched_unlock(gp);

	return (error);
}
/*
 * Support function for create/taste -- locate the desired
 * algorithm and grab a reference to it.
 */
static struct g_gsched *
g_gsched_find(const char *name)
{
	struct g_gsched *gsp = NULL;

	mtx_lock(&me.gs_mtx);
	LIST_FOREACH(gsp, &me.gs_scheds, glist) {
		if (strcmp(name, gsp->gs_name) == 0) {
			g_gsched_ref(gsp);
			break;
		}
	}
	mtx_unlock(&me.gs_mtx);

	return (gsp);
}
/*
 * Rebuild the list of scheduler names.
 * To be called with the me.gs_mtx lock held.
 */
static void
g_gsched_build_names(struct g_gsched *gsp)
{
	int pos, l;
	struct g_gsched *cur;

	pos = 0;
	LIST_FOREACH(cur, &me.gs_scheds, glist) {
		l = strlen(cur->gs_name);
		if (l + pos + 1 + 1 < sizeof(me.gs_names)) {
			if (pos != 0)
				me.gs_names[pos++] = ' ';
			strcpy(me.gs_names + pos, cur->gs_name);
			pos += l;
		}
	}
	me.gs_names[pos] = '\0';
}
/*
 * Register or unregister individual scheduling algorithms.
 */
static int
g_gsched_register(struct g_gsched *gsp)
{
	struct g_gsched *cur;
	int error = 0;

	mtx_lock(&me.gs_mtx);
	LIST_FOREACH(cur, &me.gs_scheds, glist) {
		if (strcmp(gsp->gs_name, cur->gs_name) == 0)
			break;
	}
	if (cur != NULL) {
		G_SCHED_DEBUG(0, "A scheduler named %s already "
		    "exists.", gsp->gs_name);
		error = EEXIST;
	} else {
		LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist);
		gsp->gs_refs = 1;
		me.gs_sched_count++;
		g_gsched_build_names(gsp);
	}
	mtx_unlock(&me.gs_mtx);

	return (error);
}
struct g_gsched_unregparm {
	struct g_gsched *gup_gsp;
	int		gup_error;
};
static void
g_gsched_unregister(void *arg, int flag)
{
	struct g_gsched_unregparm *parm = arg;
	struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp;
	struct g_sched_softc *sc;
	struct g_geom *gp, *gp_tmp;
	int error;

	parm->gup_error = 0;

	g_topology_assert();

	if (flag == EV_CANCEL)
		return;

	mtx_lock(&me.gs_mtx);

	LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) {
		if (gp->class != &g_sched_class)
			continue;	/* Should not happen. */

		sc = gp->softc;
		if (sc->sc_gsched == gsp) {
			error = g_sched_remove(gp, gsp);
			if (error)
				goto failed;
		}
	}

	LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) {
		if (cur != gsp)
			continue;

		if (gsp->gs_refs != 1) {
			G_SCHED_DEBUG(0, "%s still in use.",
			    gsp->gs_name);
			parm->gup_error = EBUSY;
		} else {
			LIST_REMOVE(gsp, glist);
			me.gs_sched_count--;
			g_gsched_build_names(gsp);
		}
		break;
	}

	if (cur == NULL) {
		G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name);
		parm->gup_error = ENOENT;
	}

failed:
	mtx_unlock(&me.gs_mtx);
}
static void
g_gsched_global_init(void)
{

	if (!me.gs_initialized) {
		G_SCHED_DEBUG(0, "Initializing global data.");
		mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF);
		LIST_INIT(&me.gs_scheds);
		bioq_init(&me.gs_pending);
		me.gs_initialized = 1;
	}
}
/*
 * Module event called when a scheduling algorithm module is loaded or
 * unloaded.
 */
int
g_gsched_modevent(module_t mod, int cmd, void *arg)
{
	struct g_gsched *gsp = arg;
	struct g_gsched_unregparm parm;
	int error;

	G_SCHED_DEBUG(0, "Modevent %d.", cmd);

	/*
	 * If the module is loaded at boot, the geom thread that calls
	 * g_sched_init() might actually run after g_gsched_modevent(),
	 * so make sure that the module is properly initialized.
	 */
	g_gsched_global_init();

	error = EOPNOTSUPP;
	switch (cmd) {
	case MOD_LOAD:
		error = g_gsched_register(gsp);
		G_SCHED_DEBUG(0, "Loaded module %s error %d.",
		    gsp->gs_name, error);
		if (error == 0)
			g_retaste(&g_sched_class);
		break;

	case MOD_UNLOAD:
		parm.gup_gsp = gsp;
		parm.gup_error = 0;

		error = g_waitfor_event(g_gsched_unregister,
		    &parm, M_WAITOK, NULL);
		if (error == 0)
			error = parm.gup_error;
		G_SCHED_DEBUG(0, "Unloaded module %s error %d.",
		    gsp->gs_name, error);
		break;

	default:
		break;
	}

	return (error);
}
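/*
 * Example (a sketch, not part of the original file): an algorithm
 * module ties itself to g_gsched_modevent() through a standard
 * moduledata_t. "gs_foo" and "g_gsched_foo" are hypothetical names;
 * gs_scheduler.h provides a DECLARE_GSCHED_MODULE() helper that wraps
 * this boilerplate.
 */
#if 0
static moduledata_t gs_foo_mod = {
	"gs_foo",		/* module name */
	g_gsched_modevent,	/* the event handler above */
	&g_gsched_foo,		/* the algorithm descriptor */
};
DECLARE_MODULE(gs_foo, gs_foo_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
MODULE_DEPEND(gs_foo, geom_sched, 0, 0, 0);
#endif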
#ifdef KTR
#define	TRC_BIO_EVENT(e, bp)	g_sched_trace_bio_ ## e (bp)

static inline char
g_sched_type(struct bio *bp)
{

	if (bp->bio_cmd == BIO_READ)
		return ('R');
	else if (bp->bio_cmd == BIO_WRITE)
		return ('W');
	return ('U');
}

static inline void
g_sched_trace_bio_START(struct bio *bp)
{

	CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp),
	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
	    bp->bio_offset, bp->bio_length);
}

static inline void
g_sched_trace_bio_DONE(struct bio *bp)
{

	CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp),
	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
	    bp->bio_offset, bp->bio_length);
}
#else /* !KTR */
#define	TRC_BIO_EVENT(e, bp)
#endif /* !KTR */
/*
 * g_sched_done() and g_sched_start() dispatch the geom requests to
 * the scheduling algorithm in use.
 */
static void
g_sched_done(struct bio *bio)
{
	struct g_geom *gp = bio->bio_caller2;
	struct g_sched_softc *sc = gp->softc;

	TRC_BIO_EVENT(DONE, bio);

	KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done"));

	g_sched_lock(gp);

	g_sched_update_stats(bio);
	sc->sc_gsched->gs_done(sc->sc_data, bio);
	if (!--sc->sc_pending)
		wakeup(gp);

	g_sched_flush_classes(gp);
	g_sched_unlock(gp);

	g_std_done(bio);
}
static void
g_sched_start(struct bio *bp)
{
	struct g_geom *gp = bp->bio_to->geom;
	struct g_sched_softc *sc = gp->softc;
	struct bio *cbp;

	TRC_BIO_EVENT(START, bp);
	G_SCHED_LOGREQ(bp, "Request received.");

	cbp = g_clone_bio(bp);
	if (cbp == NULL) {
		g_io_deliver(bp, ENOMEM);
		return;
	}
	cbp->bio_done = g_sched_done;
	cbp->bio_to = LIST_FIRST(&gp->provider);
	KASSERT(cbp->bio_to != NULL, ("NULL provider"));

	/* We only schedule reads and writes. */
	if (bp->bio_cmd != BIO_READ && bp->bio_cmd != BIO_WRITE)
		goto bypass;

	G_SCHED_LOGREQ(cbp, "Sending request.");

	g_sched_lock(gp);

	/*
	 * Call the algorithm's gs_start to queue the request in the
	 * scheduler. If gs_start fails then pass the request down,
	 * otherwise call g_sched_dispatch() which tries to push
	 * one or more requests down.
	 */
	if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) ||
	    sc->sc_gsched->gs_start(sc->sc_data, cbp)) {
		g_sched_unlock(gp);
		goto bypass;
	}

	/*
	 * We use bio_caller1 to mark requests that are scheduled,
	 * so make sure it is not NULL.
	 */
	if (cbp->bio_caller1 == NULL)
		cbp->bio_caller1 = &me;	/* anything not NULL */

	cbp->bio_caller2 = gp;
	sc->sc_pending++;

	/* Update general stats. */
	me.gs_in_flight++;
	me.gs_requests++;
	me.gs_bytes_in_flight += bp->bio_length;
	if (bp->bio_cmd == BIO_WRITE) {
		me.gs_writes_in_flight++;
		me.gs_write_bytes_in_flight += bp->bio_length;
	}

	g_sched_dispatch(gp);
	g_sched_unlock(gp);
	return;

bypass:
	cbp->bio_done = g_std_done;
	cbp->bio_caller1 = NULL; /* not scheduled */
	g_io_request(cbp, LIST_FIRST(&gp->consumer));
}
/*
 * The next few functions are the geom glue.
 */
static void
g_sched_orphan(struct g_consumer *cp)
{

	g_topology_assert();
	g_sched_destroy(cp->geom, 1);
}

static int
g_sched_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error;

	gp = pp->geom;
	cp = LIST_FIRST(&gp->consumer);
	error = g_access(cp, dr, dw, de);

	return (error);
}
static void
g_sched_temporary_start(struct bio *bio)
{

	mtx_lock(&me.gs_mtx);
	me.gs_npending++;
	bioq_disksort(&me.gs_pending, bio);
	mtx_unlock(&me.gs_mtx);
}

static void
g_sched_flush_pending(g_start_t *start)
{
	struct bio *bp;

	while ((bp = bioq_takefirst(&me.gs_pending)))
		start(bp);
}
static int
g_insert_proxy(struct g_geom *gp, struct g_provider *newpp,
    struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp)
{
	struct g_sched_softc *sc = gp->softc;
	g_start_t *saved_start, *flush = g_sched_start;
	int error = 0, endticks = ticks + hz;

	g_cancel_event(newpp);	/* prevent taste() */

	/* Copy the private fields. */
	newpp->private = pp->private;
	newpp->index = pp->index;

	/* Queue all the early requests coming for us. */
	me.gs_npending = 0;
	saved_start = pp->geom->start;
	dstgp->start = g_sched_temporary_start;

	while (pp->nstart - pp->nend != me.gs_npending &&
	    endticks - ticks >= 0)
		tsleep(pp, PRIBIO, "-", hz/10);

	if (pp->nstart - pp->nend != me.gs_npending) {
		flush = g_sched_temporary_start;
		error = ETIMEDOUT;
		goto fail;
	}

	/* link pp to this geom */
	LIST_REMOVE(pp, provider);
	pp->geom = gp;
	LIST_INSERT_HEAD(&gp->provider, pp, provider);

	/*
	 * replicate the counts from the parent in the
	 * new provider and consumer nodes
	 */
	cp->acr = newpp->acr = pp->acr;
	cp->acw = newpp->acw = pp->acw;
	cp->ace = newpp->ace = pp->ace;
	sc->sc_flags |= G_SCHED_PROXYING;

fail:
	dstgp->start = saved_start;

	g_sched_flush_pending(flush);

	return (error);
}
/*
 * Create a geom node for the device passed as *pp.
 * If successful, add a reference to this gsp.
 */
static int
g_sched_create(struct gctl_req *req, struct g_class *mp,
    struct g_provider *pp, struct g_gsched *gsp, int proxy)
{
	struct g_sched_softc *sc = NULL;
	struct g_geom *gp, *dstgp;
	struct g_provider *newpp = NULL;
	struct g_consumer *cp = NULL;
	char name[64];
	int error;

	g_topology_assert();

	snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX);
	LIST_FOREACH(gp, &mp->geom, geom) {
		if (strcmp(gp->name, name) == 0) {
			gctl_error(req, "Geom %s already exists.",
			    name);
			return (EEXIST);
		}
	}

	gp = g_new_geomf(mp, "%s", name);
	dstgp = proxy ? pp->geom : gp; /* where do we link the provider */

	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
	sc->sc_gsched = gsp;
	sc->sc_data = gsp->gs_init(gp);
	if (sc->sc_data == NULL) {
		error = ENOMEM;
		goto fail;
	}

	sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK);

	/*
	 * Do not initialize the flush mechanism; it will be initialized
	 * on the first insertion into the hash table.
	 */

	mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF);

	gp->softc = sc;
	gp->start = g_sched_start;
	gp->orphan = g_sched_orphan;
	gp->access = g_sched_access;
	gp->dumpconf = g_sched_dumpconf;

	newpp = g_new_providerf(dstgp, "%s", gp->name);
	newpp->mediasize = pp->mediasize;
	newpp->sectorsize = pp->sectorsize;

	cp = g_new_consumer(gp);
	error = g_attach(cp, proxy ? newpp : pp);
	if (error != 0) {
		gctl_error(req, "Cannot attach to provider %s.",
		    pp->name);
		goto fail;
	}

	g_error_provider(newpp, 0);
	if (proxy) {
		error = g_insert_proxy(gp, newpp, dstgp, pp, cp);
		if (error)
			goto fail;
	}
	G_SCHED_DEBUG(0, "Device %s created.", gp->name);

	g_gsched_ref(gsp);

	return (0);

fail:
	if (cp != NULL) {
		if (cp->provider != NULL)
			g_detach(cp);
		g_destroy_consumer(cp);
	}
	if (newpp != NULL)
		g_destroy_provider(newpp);
	if (sc->sc_hash)
		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
		    gsp, sc->sc_data);
	if (sc->sc_data)
		gsp->gs_fini(sc->sc_data);
	g_free(gp->softc);
	g_destroy_geom(gp);

	return (error);
}
/*
 * Support for dynamic switching of scheduling algorithms.
 * First initialize the data structures for the new algorithm,
 * then call g_sched_remove_locked() to flush all references
 * to the old one, and finally link the new algorithm.
 */
static int
g_sched_change_algo(struct gctl_req *req, struct g_class *mp,
    struct g_provider *pp, struct g_gsched *gsp)
{
	struct g_sched_softc *sc;
	struct g_geom *gp;
	struct g_hash *newh;
	void *data;
	u_long mask;
	int error = 0;

	gp = pp->geom;
	sc = gp->softc;

	data = gsp->gs_init(gp);
	if (data == NULL)
		return (ENOMEM);

	newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK);
	if (gsp->gs_priv_size && !newh) {
		error = ENOMEM;
		goto fail;
	}

	g_sched_lock(gp);
	if (sc->sc_gsched) {	/* can be NULL in some cases */
		error = g_sched_remove_locked(gp, sc->sc_gsched);
		if (error) {
			g_sched_unlock(gp);
			goto fail;
		}
	}

	g_gsched_ref(gsp);
	sc->sc_gsched = gsp;
	sc->sc_data = data;
	sc->sc_hash = newh;
	sc->sc_mask = mask;

	g_sched_unlock(gp);

	return (0);

fail:
	if (newh)
		g_sched_hash_fini(gp, newh, mask, gsp, data);

	if (data)
		gsp->gs_fini(data);

	return (error);
}
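/*
 * Illustrative usage (not in the original file): the switch above is
 * reached through the "configure" verb of the userland tool, e.g.
 *
 *	geom sched configure -a rr ada0.sched.
 *
 * assuming an "rr" algorithm module is loaded.
 */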
/*
 * Stop the request flow directed to the proxy, redirecting the new
 * requests to the me.gs_pending queue.
 */
static struct g_provider *
g_detach_proxy(struct g_geom *gp)
{
	struct g_consumer *cp;
	struct g_provider *pp, *newpp;

	do {
		pp = LIST_FIRST(&gp->provider);
		if (pp == NULL)
			break;
		cp = LIST_FIRST(&gp->consumer);
		if (cp == NULL)
			break;
		newpp = cp->provider;
		if (newpp == NULL)
			break;

		me.gs_npending = 0;
		pp->geom->start = g_sched_temporary_start;

		return (pp);
	} while (0);
	printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name);

	return (NULL);
}
static void
g_sched_blackhole(struct bio *bp)
{

	g_io_deliver(bp, ENXIO);
}
static inline void
g_reparent_provider(struct g_provider *pp, struct g_geom *gp,
    struct g_provider *newpp)
{

	LIST_REMOVE(pp, provider);
	if (newpp) {
		pp->private = newpp->private;
		pp->index = newpp->index;
	}
	pp->geom = gp;
	LIST_INSERT_HEAD(&gp->provider, pp, provider);
}
static inline void
g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp)
{
	struct g_geom *gp = oldpp->geom;

	g_reparent_provider(oldpp, newpp->geom, newpp);

	/*
	 * Hackish: let the system destroy the old provider for us, just
	 * in case someone attached a consumer to it, in which case a
	 * direct call to g_destroy_provider() would not work.
	 */
	g_reparent_provider(newpp, gp, NULL);
	g_wither_provider(newpp, ENXIO);
}
/*
 * Complete the proxy destruction, linking the old provider to its
 * original geom, and destroying the proxy provider. Also take care
 * of issuing the pending requests collected in me.gs_pending (if any).
 */
static int
g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp)
{
	struct g_consumer *cp;
	struct g_provider *newpp;

	do {
		cp = LIST_FIRST(&gp->consumer);
		if (cp == NULL)
			break;
		newpp = cp->provider;
		if (newpp == NULL)
			break;

		/* Relink the provider to its original geom. */
		g_unproxy_provider(oldpp, newpp);

		/* Detach consumer from provider, and destroy provider. */
		cp->acr = newpp->acr = 0;
		cp->acw = newpp->acw = 0;
		cp->ace = newpp->ace = 0;
		g_detach(cp);

		/* Send the pending bios through the right start function. */
		g_sched_flush_pending(oldpp->geom->start);

		return (0);
	} while (0);
	printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name);

	/* We cannot send the pending bios anywhere... */
	g_sched_flush_pending(g_sched_blackhole);

	return (EINVAL);
}
static int
g_sched_destroy(struct g_geom *gp, boolean_t force)
{
	struct g_provider *pp, *oldpp = NULL;
	struct g_sched_softc *sc;
	struct g_gsched *gsp;
	int error = 0;

	g_topology_assert();

	sc = gp->softc;
	if (sc == NULL)
		return (ENXIO);

	if (!(sc->sc_flags & G_SCHED_PROXYING)) {
		pp = LIST_FIRST(&gp->provider);
		if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
			const char *msg = force ?
			    "but we force removal" : "cannot remove";

			G_SCHED_DEBUG(!force,
			    "Device %s is still open (r%dw%de%d), %s.",
			    pp->name, pp->acr, pp->acw, pp->ace, msg);
			if (!force)
				return (EBUSY);
		} else {
			G_SCHED_DEBUG(0, "Device %s removed.", gp->name);
		}
	} else
		oldpp = g_detach_proxy(gp);

	gsp = sc->sc_gsched;
	if (gsp) {
		/*
		 * XXX bad hack here: force a dispatch to release
		 * any reference to the hash table still held by
		 * the scheduler.
		 */
		g_sched_lock(gp);
		/*
		 * We are dying here, no new requests should enter
		 * the scheduler. This is guaranteed by the topology,
		 * either in case we were proxying (new bios are
		 * being redirected) or not (see the access check
		 * above).
		 */
		g_sched_forced_dispatch(gp);
		error = g_sched_wait_pending(gp);

		if (error) {
			/*
			 * Not all the requests came home: this might happen
			 * under heavy load, or if we were waiting for any
			 * bio which is served in the event path (see
			 * geom_slice.c for an example of how this can
			 * happen). Try to restore a working configuration
			 * before failing.
			 */
			if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
				g_sched_flush_pending(force ?
				    g_sched_blackhole : g_sched_start);
			}

			/*
			 * In the forced destroy case there is not so much
			 * we can do; we have pending bios that will call
			 * g_sched_done() somehow, and we don't want them
			 * to crash the system using freed memory. We tell
			 * the user that something went wrong, and leak some
			 * memory here.
			 * Note: the callers using force = 1 ignore the
			 * return value.
			 */
			if (force) {
				G_SCHED_DEBUG(0, "Pending requests while "
				    "destroying geom, some memory leaked.");
			}

			g_sched_unlock(gp);
			return (error);
		}

		g_sched_unlock(gp);
		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
		    gsp, sc->sc_data);
		sc->sc_hash = NULL;
		gsp->gs_fini(sc->sc_data);
		g_gsched_unref(gsp);
		sc->sc_gsched = NULL;
		sc->sc_data = NULL;
	}

	if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
		error = g_destroy_proxy(gp, oldpp);

		if (error) {
			if (force) {
				G_SCHED_DEBUG(0, "Unrecoverable error while "
				    "destroying a proxy geom, leaking some "
				    "memory.");
			}

			return (error);
		}
	}

	mtx_destroy(&sc->sc_mtx);

	g_free(gp->softc);
	gp->softc = NULL;

	g_wither_geom(gp, ENXIO);

	return (error);
}
static int
g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp)
{

	return (g_sched_destroy(gp, 0));
}
/*
 * Functions related to the classification of requests.
 *
 * On recent FreeBSD versions (8.0 and above), we store a reference
 * to the issuer of a request in bp->bio_classifier1 as soon
 * as the bio is posted to the geom queue (and not later, because
 * requests are managed by the g_down thread afterwards).
 */

/*
 * Classifier support for recent FreeBSD versions: we use
 * a very simple classifier, only using curthread to tag a request.
 * The classifier is registered at module load, and unregistered
 * at module unload.
 */
static int
g_sched_tag(void *arg, struct bio *bp)
{

	bp->bio_classifier1 = curthread;

	return (1);
}

static struct g_classifier_hook g_sched_classifier = {
	.func =	g_sched_tag,
};

static inline void
g_classifier_ini(void)
{

	g_register_classifier(&g_sched_classifier);
}

static inline void
g_classifier_fini(void)
{

	g_unregister_classifier(&g_sched_classifier);
}
static void
g_sched_init(struct g_class *mp)
{

	g_gsched_global_init();

	G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.",
	    mp, &g_sched_class);

	/* Patch g_io_request to store classification info in the bio. */
	g_classifier_ini();
}

static void
g_sched_fini(struct g_class *mp)
{

	g_classifier_fini();

	G_SCHED_DEBUG(0, "Unloading...");

	KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers"));
	mtx_destroy(&me.gs_mtx);
}
static int
g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag,
    struct thread *td)
{
	struct g_consumer *cp;
	struct g_geom *gp;

	cp = LIST_FIRST(&pp->geom->consumer);
	if (cp == NULL)
		return (ENOIOCTL);
	gp = cp->provider->geom;
	if (gp->ioctl == NULL)
		return (ENOIOCTL);
	return (gp->ioctl(cp->provider, cmd, data, fflag, td));
}
/*
 * Read the i-th argument for a request, skipping the /dev/
 * prefix if present.
 */
static const char *
g_sched_argi(struct gctl_req *req, int i)
{
	static const char *dev_prefix = "/dev/";
	const char *name;
	char param[16];
	int l = strlen(dev_prefix);

	snprintf(param, sizeof(param), "arg%d", i);
	name = gctl_get_asciiparam(req, param);
	if (name == NULL)
		gctl_error(req, "No 'arg%d' argument", i);
	else if (strncmp(name, dev_prefix, l) == 0)
		name += l;

	return (name);
}
/*
 * Fetch nargs and do appropriate checks.
 */
static int
g_sched_get_nargs(struct gctl_req *req)
{
	int *nargs;

	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
	if (nargs == NULL) {
		gctl_error(req, "No 'nargs' argument");
		return (0);
	}
	if (*nargs <= 0)
		gctl_error(req, "Missing device(s).");

	return (*nargs);
}
/*
 * Check whether we should add the class on certain volumes when
 * this geom is created. Right now this is under control of a kenv
 * variable containing the names of all devices that we care about.
 * Probably we should only support transparent insertion as the
 * preferred mode of operation.
 */
static struct g_geom *
g_sched_taste(struct g_class *mp, struct g_provider *pp,
    int flags __unused)
{
	struct g_gsched *gsp = NULL;	/* the algorithm we want */
	const char *s;			/* generic string pointer */
	const char *taste_names;	/* devices we like */
	int l;

	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__,
	    mp->name, pp->name);
	g_topology_assert();

	G_SCHED_DEBUG(2, "Tasting %s.", pp->name);

	do {
		/* do not taste on ourselves */
		if (pp->geom->class == mp)
			break;

		taste_names = kern_getenv("geom.sched.taste");
		if (taste_names == NULL)
			break;

		l = strlen(pp->name);
		for (s = taste_names; *s &&
		    (s = strstr(s, pp->name)); s++) {
			/* further checks for an exact match */
			if ( (s == taste_names || s[-1] == ' ') &&
			     (s[l] == '\0' || s[l] == ' ') )
				break;
		}
		if (s == NULL || *s == '\0')
			break;
		G_SCHED_DEBUG(0, "Attach device %s match [%s]\n",
		    pp->name, s);

		/* look up the provider name in the list */
		s = kern_getenv("geom.sched.algo");
		if (s == NULL)
			s = "rr";	/* default algorithm */

		gsp = g_gsched_find(s);	/* also get a reference */
		if (gsp == NULL) {
			G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s);
			break;
		}

		/* XXX create with 1 as last argument ? */
		g_sched_create(NULL, mp, pp, gsp, 0);
		g_gsched_unref(gsp);
	} while (0);

	return (NULL);
}
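/*
 * Illustrative usage (not in the original file): the two kenv
 * variables read above can be set in /boot/loader.conf, e.g.
 *
 *	geom.sched.taste="ada0 ada1"
 *	geom.sched.algo="rr"
 *
 * so that the listed providers are picked up at taste time.
 */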
static void
g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy)
{
	struct g_provider *pp;
	struct g_gsched *gsp;
	const char *name;
	int i, nargs;

	g_topology_assert();

	name = gctl_get_asciiparam(req, "algo");
	if (name == NULL) {
		gctl_error(req, "No '%s' argument", "algo");
		return;
	}

	gsp = g_gsched_find(name);	/* also get a reference */
	if (gsp == NULL) {
		gctl_error(req, "Bad algorithm '%s'", name);
		return;
	}

	nargs = g_sched_get_nargs(req);

	/*
	 * Run on the arguments, and break on any error.
	 * We look for a device name, but skip the /dev/ prefix if any.
	 */
	for (i = 0; i < nargs; i++) {
		name = g_sched_argi(req, i);
		if (name == NULL)
			break;
		pp = g_provider_by_name(name);
		if (pp == NULL) {
			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
			gctl_error(req, "Provider %s is invalid.", name);
			break;
		}
		if (g_sched_create(req, mp, pp, gsp, proxy) != 0)
			break;
	}

	g_gsched_unref(gsp);
}
static void
g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp)
{
	struct g_provider *pp;
	struct g_gsched *gsp;
	const char *name;
	int i, nargs;

	g_topology_assert();

	name = gctl_get_asciiparam(req, "algo");
	if (name == NULL) {
		gctl_error(req, "No '%s' argument", "algo");
		return;
	}

	gsp = g_gsched_find(name);	/* also get a reference */
	if (gsp == NULL) {
		gctl_error(req, "Bad algorithm '%s'", name);
		return;
	}

	nargs = g_sched_get_nargs(req);

	/*
	 * Run on the arguments, and break on any error.
	 * We look for a device name, but skip the /dev/ prefix if any.
	 */
	for (i = 0; i < nargs; i++) {
		name = g_sched_argi(req, i);
		if (name == NULL)
			break;
		pp = g_provider_by_name(name);
		if (pp == NULL || pp->geom->class != mp) {
			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
			gctl_error(req, "Provider %s is invalid.", name);
			break;
		}
		if (g_sched_change_algo(req, mp, pp, gsp) != 0)
			break;
	}

	g_gsched_unref(gsp);
}
static struct g_geom *
g_sched_find_geom(struct g_class *mp, const char *name)
{
	struct g_geom *gp;

	LIST_FOREACH(gp, &mp->geom, geom) {
		if (strcmp(gp->name, name) == 0)
			return (gp);
	}
	return (NULL);
}
static void
g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp)
{
	int nargs, *force, error, i;
	struct g_geom *gp;
	const char *name;

	g_topology_assert();

	nargs = g_sched_get_nargs(req);

	force = gctl_get_paraml(req, "force", sizeof(*force));
	if (force == NULL) {
		gctl_error(req, "No 'force' argument");
		return;
	}

	for (i = 0; i < nargs; i++) {
		name = g_sched_argi(req, i);
		if (name == NULL)
			break;

		gp = g_sched_find_geom(mp, name);
		if (gp == NULL) {
			G_SCHED_DEBUG(1, "Device %s is invalid.", name);
			gctl_error(req, "Device %s is invalid.", name);
			break;
		}

		error = g_sched_destroy(gp, *force);
		if (error != 0) {
			gctl_error(req, "Cannot destroy device %s (error=%d).",
			    gp->name, error);
			break;
		}
	}
}
static void
g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb)
{
	uint32_t *version;

	g_topology_assert();

	version = gctl_get_paraml(req, "version", sizeof(*version));
	if (version == NULL) {
		gctl_error(req, "No '%s' argument.", "version");
		return;
	}

	if (*version != G_SCHED_VERSION) {
		gctl_error(req, "Userland and kernel parts are "
		    "out of sync.");
		return;
	}

	if (strcmp(verb, "create") == 0) {
		g_sched_ctl_create(req, mp, 0);
		return;
	} else if (strcmp(verb, "insert") == 0) {
		g_sched_ctl_create(req, mp, 1);
		return;
	} else if (strcmp(verb, "configure") == 0) {
		g_sched_ctl_configure(req, mp);
		return;
	} else if (strcmp(verb, "destroy") == 0) {
		g_sched_ctl_destroy(req, mp);
		return;
	}

	gctl_error(req, "Unknown verb.");
}
static void
g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_sched_softc *sc = gp->softc;
	struct g_gsched *gsp = sc->sc_gsched;

	if (indent == NULL) {	/* plaintext */
		sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--");
	}
	if (gsp != NULL && gsp->gs_dumpconf)
		gsp->gs_dumpconf(sb, indent, gp, cp, pp);
}

DECLARE_GEOM_CLASS(g_sched_class, g_sched);
MODULE_VERSION(geom_sched, 0);