sys/geom/sched/g_sched.c

   1 /*-
   2  * Copyright (c) 2009-2010 Fabio Checconi
   3  * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  25  * SUCH DAMAGE.
  26  */
  27
  28 /*
  29  * $Id$
  30  * $FreeBSD$
  31  *
  32  * Main control module for geom-based disk schedulers ('sched').
  33  *
  34  * USER VIEW
  35  * A 'sched' node is typically inserted transparently between
  36  * an existing provider pp and its original geom gp
  37  *
  38  *      [pp --> gp  ..]
  39  *
  40  * using the command "geom sched insert <provider>" and
  41  * resulting in the following topology
  42  *
  43  *      [pp --> sched_gp --> cp]   [new_pp --> gp ... ]
  44  *
  45  * Deletion "geom sched destroy <provider>.sched." restores the
  46  * original chain. The normal "geom sched create <provider>"
  47  * is also supported.
  48  *
  49  * INTERNALS
  50  * Internally, the 'sched' uses the following data structures
  51  *
  52  *   geom{}         g_sched_softc{}      g_gsched{}
  53  * +----------+    +---------------+   +-------------+
  54  * |  softc *-|--->| sc_gsched   *-|-->|  gs_init    |
  55  * |  ...     |    |               |   |  gs_fini    |
  56  * |          |    | [ hash table] |   |  gs_start   |
  57  * +----------+    |               |   |  ...        |
  58  *                 |               |   +-------------+
  59  *                 |               |
  60  *                 |               |     g_*_softc{}
  61  *                 |               |   +-------------+
  62  *                 | sc_data     *-|-->|             |
  63  *                 +---------------+   |  algorithm- |
  64  *                                     |  specific   |
  65  *                                     +-------------+
  66  *
  67  * A g_sched_softc{} is created with a "geom sched insert" call.
  68  * In turn this instantiates a specific scheduling algorithm,
  69  * which sets sc_gsched to point to the algorithm callbacks,
  70  * and calls gs_init() to create the g_*_softc{} .
  71  * The other callbacks (gs_start, gs_next, ...) are invoked
  72  * as needed
  73  *
  74  * g_sched_softc{} is defined in g_sched.h and mostly used here;
  75  * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h;
  76  * g_*_softc{} is defined/implemented by each algorithm (gs_*.c)
  77  *
  78  * DATA MOVING
  79  * When a bio is received on the provider, it goes to the
  80  * g_sched_start() which calls gs_start() to initially queue it;
  81  * then we call g_sched_dispatch() that loops around gs_next()
  82  * to select zero or more bio's to be sent downstream.
  83  *
  84  * g_sched_dispatch() can also be called as a result of a timeout,
  85  * e.g. when doing anticipation or pacing requests.
  86  *
  87  * When a bio comes back, it goes to g_sched_done() which in turn
  88  * calls gs_done(). The latter does any necessary housekeeping in
  89  * the scheduling algorithm, and may decide to call g_sched_dispatch()
  90  * to send more bio's downstream.
  91  *
  92  * If an algorithm needs per-flow queues, these are created
  93  * calling gs_init_class() and destroyed with gs_fini_class(),
  94  * and they are also inserted in the hash table implemented in
  95  * the g_sched_softc{}
  96  *
  97  * If an algorithm is replaced, or a transparently-inserted node is
  98  * removed with "geom sched destroy", we need to remove all references
  99  * to the g_*_softc{} and g_sched_softc from the bio's still in
 100  * the scheduler. g_sched_forced_dispatch() helps doing this.
 101  * XXX need to explain better.
 102  */
 103
 104 #include <sys/cdefs.h>
 105 #include <sys/param.h>
 106 #include <sys/systm.h>
 107 #include <sys/kernel.h>
 108 #include <sys/module.h>
 109 #include <sys/lock.h>
 110 #include <sys/mutex.h>
 111 #include <sys/bio.h>
 112 #include <sys/limits.h>
 113 #include <sys/hash.h>
 114 #include <sys/sbuf.h>
 115 #include <sys/sysctl.h>
 116 #include <sys/malloc.h>
 117 #include <sys/proc.h>           /* we access curthread */
 118 #include <geom/geom.h>
 119 #include "gs_scheduler.h"
 120 #include "g_sched.h"            /* geom hooks */
 121
 122 /*
 123  * Size of the per-geom hash table storing traffic classes.
 124  * We may decide to change it at a later time, it has no ABI
 125  * implications as it is only used for run-time allocations.
 126  */
 127 #define G_SCHED_HASH_SIZE       32
 128
 129 static int g_sched_destroy(struct g_geom *gp, boolean_t force);
 130 static int g_sched_destroy_geom(struct gctl_req *req,
 131     struct g_class *mp, struct g_geom *gp);
 132 static void g_sched_config(struct gctl_req *req, struct g_class *mp,
 133     const char *verb);
 134 static struct g_geom *g_sched_taste(struct g_class *mp,
 135     struct g_provider *pp, int flags __unused);
 136 static void g_sched_dumpconf(struct sbuf *sb, const char *indent,
 137     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
 138 static void g_sched_init(struct g_class *mp);
 139 static void g_sched_fini(struct g_class *mp);
 140 static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data,
 141     int fflag, struct thread *td);
 142
 143 struct g_class g_sched_class = {
 144         .name = G_SCHED_CLASS_NAME,
 145         .version = G_VERSION,
 146         .ctlreq = g_sched_config,
 147         .taste = g_sched_taste,
 148         .destroy_geom = g_sched_destroy_geom,
 149         .init = g_sched_init,
 150         .ioctl = g_sched_ioctl,
 151         .fini = g_sched_fini
 152 };
 153
 154 MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures");
 155
 156 /*
 157  * Global variables describing the state of the geom_sched module.
 158  * There is only one static instance of this structure.
 159  */
 160 LIST_HEAD(gs_list, g_gsched);   /* type, link field */
 161 struct geom_sched_vars {
 162         struct mtx      gs_mtx;
 163         struct gs_list  gs_scheds;      /* list of algorithms */
 164         u_int           gs_debug;
 165         u_int           gs_sched_count; /* how many algorithms ? */
 166         u_int           gs_patched;     /* g_io_request was patched */
 167
 168         u_int           gs_initialized;
 169         u_int           gs_expire_secs; /* expiration of hash entries */
 170
 171         struct bio_queue_head gs_pending;
 172         u_int           gs_npending;
 173
 174         /* The following are for stats, usually protected by gs_mtx. */
 175         u_long          gs_requests;    /* total requests */
 176         u_long          gs_done;        /* total done */
 177         u_int           gs_in_flight;   /* requests in flight */
 178         u_int           gs_writes_in_flight;
 179         u_int           gs_bytes_in_flight;
 180         u_int           gs_write_bytes_in_flight;
 181
 182         char            gs_names[256];  /* names of schedulers */
 183 };
 184
 185 static struct geom_sched_vars me = {
 186         .gs_expire_secs = 10,
 187 };
 188
 189 SYSCTL_DECL(_kern_geom);
 190 SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
 191     "GEOM_SCHED stuff");
 192
 193 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD,
 194     &me.gs_write_bytes_in_flight, 0, "Write bytes in flight");
 195
 196 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD,
 197     &me.gs_bytes_in_flight, 0, "Bytes in flight");
 198
 199 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD,
 200     &me.gs_writes_in_flight, 0, "Write Requests in flight");
 201
 202 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD,
 203     &me.gs_in_flight, 0, "Requests in flight");
 204
 205 SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD,
 206     &me.gs_done, 0, "Total done");
 207
 208 SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD,
 209     &me.gs_requests, 0, "Total requests");
 210
 211 SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD,
 212     &me.gs_names, 0, "Algorithm names");
 213
 214 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD,
 215     &me.gs_sched_count, 0, "Number of algorithms");
 216
 217 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW,
 218     &me.gs_debug, 0, "Debug level");
 219
 220 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW,
 221     &me.gs_expire_secs, 0, "Expire time in seconds");
 222
 223 /*
 224  * g_sched calls the scheduler algorithms with this lock held.
 225  * The locking functions are exposed so the scheduler algorithms can also
 226  * protect themselves e.g. when running a callout handler.
 227  */
 228 void
 229 g_sched_lock(struct g_geom *gp)
 230 {
 231         struct g_sched_softc *sc = gp->softc;
 232
 233         mtx_lock(&sc->sc_mtx);
 234 }
 235
 236 void
 237 g_sched_unlock(struct g_geom *gp)
 238 {
 239         struct g_sched_softc *sc = gp->softc;
 240
 241         mtx_unlock(&sc->sc_mtx);
 242 }
 243
 244 /*
 245  * Support functions to handle references to the module,
 246  * which are coming from devices using this scheduler.
 247  */
 248 static inline void
 249 g_gsched_ref(struct g_gsched *gsp)
 250 {
 251
 252         atomic_add_int(&gsp->gs_refs, 1);
 253 }
 254
 255 static inline void
 256 g_gsched_unref(struct g_gsched *gsp)
 257 {
 258
 259         atomic_add_int(&gsp->gs_refs, -1);
 260 }
 261
 262 /*
 263  * Update the stats when this request is done.
 264  */
 265 static void
 266 g_sched_update_stats(struct bio *bio)
 267 {
 268
 269         me.gs_done++;
 270         me.gs_in_flight--;
 271         me.gs_bytes_in_flight -= bio->bio_length;
 272         if (bio->bio_cmd & BIO_WRITE) {
 273                 me.gs_writes_in_flight--;
 274                 me.gs_write_bytes_in_flight -= bio->bio_length;
 275         }
 276 }
 277
 278 /*
 279  * Dispatch any pending request.
 280  */
 281 static void
 282 g_sched_forced_dispatch(struct g_geom *gp)
 283 {
 284         struct g_sched_softc *sc = gp->softc;
 285         struct g_gsched *gsp = sc->sc_gsched;
 286         struct bio *bp;
 287
 288         KASSERT(mtx_owned(&sc->sc_mtx),
 289             ("sc_mtx not owned during forced dispatch"));
 290
 291         while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL)
 292                 g_io_request(bp, LIST_FIRST(&gp->consumer));
 293 }
 294
 295 /*
 296  * The main dispatch loop, called either here after the start
 297  * routine, or by scheduling algorithms when they receive a timeout
 298  * or a 'done' notification.  Does not share code with the forced
 299  * dispatch path, since the gs_done() callback can call us.
 300  */
 301 void
 302 g_sched_dispatch(struct g_geom *gp)
 303 {
 304         struct g_sched_softc *sc = gp->softc;
 305         struct g_gsched *gsp = sc->sc_gsched;
 306         struct bio *bp;
 307
 308         KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch"));
 309
 310         if ((sc->sc_flags & G_SCHED_FLUSHING))
 311                 return;
 312
 313         while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL)
 314                 g_io_request(bp, LIST_FIRST(&gp->consumer));
 315 }
 316
 317 /*
 318  * Recent (8.0 and above) versions of FreeBSD have support to
 319  * register classifiers of disk requests. The classifier is
 320  * invoked by g_io_request(), and stores the information into
 321  * bp->bio_classifier1.
 322  *
 323  * Support for older versions, which is left here only for
 324  * documentation purposes, relies on two hacks:
 325  * 1. classification info is written into the bio_caller1
 326  *    field of the topmost node in the bio chain. This field
 327  *    is rarely used, but this module is incompatible with
 328  *    those that use bio_caller1 for other purposes,
 329  *    such as ZFS and gjournal;
 330  * 2. g_io_request() is patched in-memory when the module is
 331  *    loaded, so that the function calls a classifier as its
 332  *    first thing. g_io_request() is restored when the module
 333  *    is unloaded. This functionality is only supported for
 334  *    x86 and amd64, other architectures need source code changes.
 335  */
 336
 337 /*
 338  * Lookup the identity of the issuer of the original request.
 339  * In the current implementation we use the curthread of the
 340  * issuer, but different mechanisms may be implemented later
 341  * so we do not make assumptions on the return value which for
 342  * us is just an opaque identifier.
 343  */
 344
 345 static inline u_long
 346 g_sched_classify(struct bio *bp)
 347 {
 348
 349 #if __FreeBSD_version > 800098
 350         /* we have classifier fields in the struct bio */
 351 #define HAVE_BIO_CLASSIFIER
 352         return ((u_long)bp->bio_classifier1);
 353 #else
 354 #warning old version!!!
 355         while (bp->bio_parent != NULL)
 356                 bp = bp->bio_parent;
 357
 358         return ((u_long)bp->bio_caller1);
 359 #endif
 360 }
 361
 362 /* Return the hash chain for the given key. */
 363 static inline struct g_hash *
 364 g_sched_hash(struct g_sched_softc *sc, u_long key)
 365 {
 366
 367         return (&sc->sc_hash[key & sc->sc_mask]);
 368 }
 369
 370 /*
 371  * Helper function for the children classes, which takes
 372  * a geom and a bio and returns the private descriptor
 373  * associated to the request.  This involves fetching
 374  * the classification field and [al]locating the
 375  * corresponding entry in the hash table.
 376  */
 377 void *
 378 g_sched_get_class(struct g_geom *gp, struct bio *bp)
 379 {
 380         struct g_sched_softc *sc;
 381         struct g_sched_class *gsc;
 382         struct g_gsched *gsp;
 383         struct g_hash *bucket;
 384         u_long key;
 385
 386         sc = gp->softc;
 387         key = g_sched_classify(bp);
 388         bucket = g_sched_hash(sc, key);
 389         LIST_FOREACH(gsc, bucket, gsc_clist) {
 390                 if (key == gsc->gsc_key) {
 391                         gsc->gsc_refs++;
 392                         return (gsc->gsc_priv);
 393                 }
 394         }
 395
 396         gsp = sc->sc_gsched;
 397         gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size,
 398             M_GEOM_SCHED, M_NOWAIT | M_ZERO);
 399         if (!gsc)
 400                 return (NULL);
 401
 402         if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
 403                 free(gsc, M_GEOM_SCHED);
 404                 return (NULL);
 405         }
 406
 407         gsc->gsc_refs = 2;      /* 1 for the hash table, 1 for the caller. */
 408         gsc->gsc_key = key;
 409         LIST_INSERT_HEAD(bucket, gsc, gsc_clist);
 410
 411         gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
 412
 413         return (gsc->gsc_priv);
 414 }
 415
 416 /*
 417  * Release a reference to the per-client descriptor,
 418  */
 419 void
 420 g_sched_put_class(struct g_geom *gp, void *priv)
 421 {
 422         struct g_sched_class *gsc;
 423         struct g_sched_softc *sc;
 424
 425         gsc = g_sched_priv2class(priv);
 426         gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
 427
 428         if (--gsc->gsc_refs > 0)
 429                 return;
 430
 431         sc = gp->softc;
 432         sc->sc_gsched->gs_fini_class(sc->sc_data, priv);
 433
 434         LIST_REMOVE(gsc, gsc_clist);
 435         free(gsc, M_GEOM_SCHED);
 436 }
 437
 438 static void
 439 g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
 440     struct g_gsched *gsp, void *data)
 441 {
 442         struct g_sched_class *cp, *cp2;
 443         int i;
 444
 445         if (!hp)
 446                 return;
 447
 448         if (data && gsp->gs_hash_unref)
 449                 gsp->gs_hash_unref(data);
 450
 451         for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
 452                 LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
 453                         g_sched_put_class(gp, cp->gsc_priv);
 454         }
 455
 456         hashdestroy(hp, M_GEOM_SCHED, mask);
 457 }
 458
 459 static struct g_hash *
 460 g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags)
 461 {
 462         struct g_hash *hash;
 463
 464         if (gsp->gs_priv_size == 0)
 465                 return (NULL);
 466
 467         hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags);
 468
 469         return (hash);
 470 }
 471
 472 static void
 473 g_sched_flush_classes(struct g_geom *gp)
 474 {
 475         struct g_sched_softc *sc;
 476         struct g_sched_class *cp, *cp2;
 477         int i;
 478
 479         sc = gp->softc;
 480
 481         if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0)
 482                 return;
 483
 484         for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
 485                 LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) {
 486                         if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0)
 487                                 g_sched_put_class(gp, cp->gsc_priv);
 488                 }
 489         }
 490
 491         sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz;
 492 }
 493
 494 /*
 495  * Wait for the completion of any outstanding request.  To ensure
 496  * that this does not take forever the caller has to make sure that
 497  * no new request enter the scehduler before calling us.
 498  *
 499  * Must be called with the gp mutex held and topology locked.
 500  */
 501 static int
 502 g_sched_wait_pending(struct g_geom *gp)
 503 {
 504         struct g_sched_softc *sc = gp->softc;
 505         int endticks = ticks + hz;
 506
 507         g_topology_assert();
 508
 509         while (sc->sc_pending && endticks - ticks >= 0)
 510                 msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);
 511
 512         return (sc->sc_pending ? ETIMEDOUT : 0);
 513 }
 514
 515 static int
 516 g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp)
 517 {
 518         struct g_sched_softc *sc = gp->softc;
 519         int error;
 520
 521         /* Set the flushing flag: new bios will not enter the scheduler. */
 522         sc->sc_flags |= G_SCHED_FLUSHING;
 523
 524         g_sched_forced_dispatch(gp);
 525         error = g_sched_wait_pending(gp);
 526         if (error)
 527                 goto failed;
 528
 529         /* No more requests pending or in flight from the old gsp. */
 530
 531         g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data);
 532         sc->sc_hash = NULL;
 533
 534         /*
 535          * Avoid deadlock here by releasing the gp mutex and reacquiring
 536          * it once done.  It should be safe, since no reconfiguration or
 537          * destruction can take place due to the geom topology lock; no
 538          * new request can use the current sc_data since we flagged the
 539          * geom as being flushed.
 540          */
 541         g_sched_unlock(gp);
 542         gsp->gs_fini(sc->sc_data);
 543         g_sched_lock(gp);
 544
 545         sc->sc_gsched = NULL;
 546         sc->sc_data = NULL;
 547         g_gsched_unref(gsp);
 548
 549 failed:
 550         sc->sc_flags &= ~G_SCHED_FLUSHING;
 551
 552         return (error);
 553 }
 554
 555 static int
 556 g_sched_remove(struct g_geom *gp, struct g_gsched *gsp)
 557 {
 558         int error;
 559
 560         g_sched_lock(gp);
 561         error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */
 562         g_sched_unlock(gp);
 563
 564         return (error);
 565 }
 566
 567 /*
 568  * Support function for create/taste -- locate the desired
 569  * algorithm and grab a reference to it.
 570  */
 571 static struct g_gsched *
 572 g_gsched_find(const char *name)
 573 {
 574         struct g_gsched *gsp = NULL;
 575
 576         mtx_lock(&me.gs_mtx);
 577         LIST_FOREACH(gsp, &me.gs_scheds, glist) {
 578                 if (strcmp(name, gsp->gs_name) == 0) {
 579                         g_gsched_ref(gsp);
 580                         break;
 581                 }
 582         }
 583         mtx_unlock(&me.gs_mtx);
 584
 585         return (gsp);
 586 }
 587
 588 /*
 589  * Rebuild the list of scheduler names.
 590  * To be called with me.gs_mtx lock held.
 591  */
 592 static void
 593 g_gsched_build_names(struct g_gsched *gsp)
 594 {
 595         int pos, l;
 596         struct g_gsched *cur;
 597
 598         pos = 0;
 599         LIST_FOREACH(cur, &me.gs_scheds, glist) {
 600                 l = strlen(cur->gs_name);
 601                 if (l + pos + 1 + 1 < sizeof(me.gs_names)) {
 602                         if (pos != 0)
 603                                 me.gs_names[pos++] = ' ';
 604                         strcpy(me.gs_names + pos, cur->gs_name);
 605                         pos += l;
 606                 }
 607         }
 608         me.gs_names[pos] = '\0';
 609 }
 610
 611 /*
 612  * Register or unregister individual scheduling algorithms.
 613  */
 614 static int
 615 g_gsched_register(struct g_gsched *gsp)
 616 {
 617         struct g_gsched *cur;
 618         int error = 0;
 619
 620         mtx_lock(&me.gs_mtx);
 621         LIST_FOREACH(cur, &me.gs_scheds, glist) {
 622                 if (strcmp(gsp->gs_name, cur->gs_name) == 0)
 623                         break;
 624         }
 625         if (cur != NULL) {
 626                 G_SCHED_DEBUG(0, "A scheduler named %s already"
 627                     "exists.", gsp->gs_name);
 628                 error = EEXIST;
 629         } else {
 630                 LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist);
 631                 gsp->gs_refs = 1;
 632                 me.gs_sched_count++;
 633                 g_gsched_build_names(gsp);
 634         }
 635         mtx_unlock(&me.gs_mtx);
 636
 637         return (error);
 638 }
 639
 640 struct g_gsched_unregparm {
 641         struct g_gsched *gup_gsp;
 642         int             gup_error;
 643 };
 644
 645 static void
 646 g_gsched_unregister(void *arg, int flag)
 647 {
 648         struct g_gsched_unregparm *parm = arg;
 649         struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp;
 650         struct g_sched_softc *sc;
 651         struct g_geom *gp, *gp_tmp;
 652         int error;
 653
 654         parm->gup_error = 0;
 655
 656         g_topology_assert();
 657
 658         if (flag == EV_CANCEL)
 659                 return;
 660
 661         mtx_lock(&me.gs_mtx);
 662
 663         LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) {
 664                 if (gp->class != &g_sched_class)
 665                         continue;       /* Should not happen. */
 666
 667                 sc = gp->softc;
 668                 if (sc->sc_gsched == gsp) {
 669                         error = g_sched_remove(gp, gsp);
 670                         if (error)
 671                                 goto failed;
 672                 }
 673         }
 674
 675         LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) {
 676                 if (cur != gsp)
 677                         continue;
 678
 679                 if (gsp->gs_refs != 1) {
 680                         G_SCHED_DEBUG(0, "%s still in use.",
 681                             gsp->gs_name);
 682                         parm->gup_error = EBUSY;
 683                 } else {
 684                         LIST_REMOVE(gsp, glist);
 685                         me.gs_sched_count--;
 686                         g_gsched_build_names(gsp);
 687                 }
 688                 break;
 689         }
 690
 691         if (cur == NULL) {
 692                 G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name);
 693                 parm->gup_error = ENOENT;
 694         }
 695
 696 failed:
 697         mtx_unlock(&me.gs_mtx);
 698 }
 699
 700 static inline void
 701 g_gsched_global_init(void)
 702 {
 703
 704         if (!me.gs_initialized) {
 705                 G_SCHED_DEBUG(0, "Initializing global data.");
 706                 mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF);
 707                 LIST_INIT(&me.gs_scheds);
 708                 gs_bioq_init(&me.gs_pending);
 709                 me.gs_initialized = 1;
 710         }
 711 }
 712
 713 /*
 714  * Module event called when a scheduling algorithm module is loaded or
 715  * unloaded.
 716  */
 717 int
 718 g_gsched_modevent(module_t mod, int cmd, void *arg)
 719 {
 720         struct g_gsched *gsp = arg;
 721         struct g_gsched_unregparm parm;
 722         int error;
 723
 724         G_SCHED_DEBUG(0, "Modevent %d.", cmd);
 725
 726         /*
 727          * If the module is loaded at boot, the geom thread that calls
 728          * g_sched_init() might actually run after g_gsched_modevent(),
 729          * so make sure that the module is properly initialized.
 730          */
 731         g_gsched_global_init();
 732
 733         error = EOPNOTSUPP;
 734         switch (cmd) {
 735         case MOD_LOAD:
 736                 error = g_gsched_register(gsp);
 737                 G_SCHED_DEBUG(0, "Loaded module %s error %d.",
 738                     gsp->gs_name, error);
 739                 if (error == 0)
 740                         g_retaste(&g_sched_class);
 741                 break;
 742
 743         case MOD_UNLOAD:
 744                 parm.gup_gsp = gsp;
 745                 parm.gup_error = 0;
 746
 747                 error = g_waitfor_event(g_gsched_unregister,
 748                     &parm, M_WAITOK, NULL);
 749                 if (error == 0)
 750                         error = parm.gup_error;
 751                 G_SCHED_DEBUG(0, "Unloaded module %s error %d.",
 752                     gsp->gs_name, error);
 753                 break;
 754         };
 755
 756         return (error);
 757 }
 758
 759 #ifdef KTR
 760 #define TRC_BIO_EVENT(e, bp)    g_sched_trace_bio_ ## e (bp)
 761
 762 static inline char
 763 g_sched_type(struct bio *bp)
 764 {
 765
 766         if (0 != (bp->bio_cmd & BIO_READ))
 767                 return ('R');
 768         else if (0 != (bp->bio_cmd & BIO_WRITE))
 769                 return ('W');
 770         return ('U');
 771 }
 772
 773 static inline void
 774 g_sched_trace_bio_START(struct bio *bp)
 775 {
 776
 777         CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp),
 778             g_sched_type(bp), bp->bio_offset / ULONG_MAX,
 779             bp->bio_offset, bp->bio_length);
 780 }
 781
 782 static inline void
 783 g_sched_trace_bio_DONE(struct bio *bp)
 784 {
 785
 786         CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp),
 787             g_sched_type(bp), bp->bio_offset / ULONG_MAX,
 788             bp->bio_offset, bp->bio_length);
 789 }
 790 #else /* !KTR */
 791 #define TRC_BIO_EVENT(e, bp)
 792 #endif /* !KTR */
 793
 794 /*
 795  * g_sched_done() and g_sched_start() dispatch the geom requests to
 796  * the scheduling algorithm in use.
 797  */
 798 static void
 799 g_sched_done(struct bio *bio)
 800 {
 801         struct g_geom *gp = bio->bio_caller2;
 802         struct g_sched_softc *sc = gp->softc;
 803
 804         TRC_BIO_EVENT(DONE, bio);
 805
 806         KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done"));
 807
 808         g_sched_lock(gp);
 809
 810         g_sched_update_stats(bio);
 811         sc->sc_gsched->gs_done(sc->sc_data, bio);
 812         if (!--sc->sc_pending)
 813                 wakeup(gp);
 814
 815         g_sched_flush_classes(gp);
 816         g_sched_unlock(gp);
 817
 818         g_std_done(bio);
 819 }
 820
 821 static void
 822 g_sched_start(struct bio *bp)
 823 {
 824         struct g_geom *gp = bp->bio_to->geom;
 825         struct g_sched_softc *sc = gp->softc;
 826         struct bio *cbp;
 827
 828         TRC_BIO_EVENT(START, bp);
 829         G_SCHED_LOGREQ(bp, "Request received.");
 830
 831         cbp = g_clone_bio(bp);
 832         if (cbp == NULL) {
 833                 g_io_deliver(bp, ENOMEM);
 834                 return;
 835         }
 836         cbp->bio_done = g_sched_done;
 837         cbp->bio_to = LIST_FIRST(&gp->provider);
 838         KASSERT(cbp->bio_to != NULL, ("NULL provider"));
 839
 840         /* We only schedule reads and writes. */
 841         if (0 == (bp->bio_cmd & (BIO_READ | BIO_WRITE)))
 842                 goto bypass;
 843
 844         G_SCHED_LOGREQ(cbp, "Sending request.");
 845
 846         g_sched_lock(gp);
 847         /*
 848          * Call the algorithm's gs_start to queue the request in the
 849          * scheduler. If gs_start fails then pass the request down,
 850          * otherwise call g_sched_dispatch() which tries to push
 851          * one or more requests down.
 852          */
 853         if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) ||
 854             sc->sc_gsched->gs_start(sc->sc_data, cbp)) {
 855                 g_sched_unlock(gp);
 856                 goto bypass;
 857         }
 858         /*
 859          * We use bio_caller1 to mark requests that are scheduled
 860          * so make sure it is not NULL.
 861          */
 862         if (cbp->bio_caller1 == NULL)
 863                 cbp->bio_caller1 = &me; /* anything not NULL */
 864
 865         cbp->bio_caller2 = gp;
 866         sc->sc_pending++;
 867
 868         /* Update general stats. */
 869         me.gs_in_flight++;
 870         me.gs_requests++;
 871         me.gs_bytes_in_flight += bp->bio_length;
 872         if (bp->bio_cmd & BIO_WRITE) {
 873                 me.gs_writes_in_flight++;
 874                 me.gs_write_bytes_in_flight += bp->bio_length;
 875         }
 876         g_sched_dispatch(gp);
 877         g_sched_unlock(gp);
 878         return;
 879
 880 bypass:
 881         cbp->bio_done = g_std_done;
 882         cbp->bio_caller1 = NULL; /* not scheduled */
 883         g_io_request(cbp, LIST_FIRST(&gp->consumer));
 884 }
 885
 886 /*
 887  * The next few functions are the geom glue.
 888  */
 889 static void
 890 g_sched_orphan(struct g_consumer *cp)
 891 {
 892
 893         g_topology_assert();
 894         g_sched_destroy(cp->geom, 1);
 895 }
 896
 897 static int
 898 g_sched_access(struct g_provider *pp, int dr, int dw, int de)
 899 {
 900         struct g_geom *gp;
 901         struct g_consumer *cp;
 902         int error;
 903
 904         gp = pp->geom;
 905         cp = LIST_FIRST(&gp->consumer);
 906         error = g_access(cp, dr, dw, de);
 907
 908         return (error);
 909 }
 910
 911 static void
 912 g_sched_temporary_start(struct bio *bio)
 913 {
 914
 915         mtx_lock(&me.gs_mtx);
 916         me.gs_npending++;
 917         gs_bioq_disksort(&me.gs_pending, bio);
 918         mtx_unlock(&me.gs_mtx);
 919 }
 920
 921 static void
 922 g_sched_flush_pending(g_start_t *start)
 923 {
 924         struct bio *bp;
 925
 926         while ((bp = gs_bioq_takefirst(&me.gs_pending)))
 927                 start(bp);
 928 }
 929
 930 static int
 931 g_insert_proxy(struct g_geom *gp, struct g_provider *newpp,
 932     struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp)
 933 {
 934         struct g_sched_softc *sc = gp->softc;
 935         g_start_t *saved_start, *flush = g_sched_start;
 936         int error = 0, endticks = ticks + hz;
 937
 938         g_cancel_event(newpp);  /* prevent taste() */
 939         /* copy private fields */
 940         newpp->private = pp->private;
 941         newpp->index = pp->index;
 942
 943         /* Queue all the early requests coming for us. */
 944         me.gs_npending = 0;
 945         saved_start = pp->geom->start;
 946         dstgp->start = g_sched_temporary_start;
 947
 948         while (pp->nstart - pp->nend != me.gs_npending &&
 949             endticks - ticks >= 0)
 950                 tsleep(pp, PRIBIO, "-", hz/10);
 951
 952         if (pp->nstart - pp->nend != me.gs_npending) {
 953                 flush = saved_start;
 954                 error = ETIMEDOUT;
 955                 goto fail;
 956         }
 957
 958         /* link pp to this geom */
 959         LIST_REMOVE(pp, provider);
 960         pp->geom = gp;
 961         LIST_INSERT_HEAD(&gp->provider, pp, provider);
 962
 963         /*
 964          * replicate the counts from the parent in the
 965          * new provider and consumer nodes
 966          */
 967         cp->acr = newpp->acr = pp->acr;
 968         cp->acw = newpp->acw = pp->acw;
 969         cp->ace = newpp->ace = pp->ace;
 970         sc->sc_flags |= G_SCHED_PROXYING;
 971
 972 fail:
 973         dstgp->start = saved_start;
 974
 975         g_sched_flush_pending(flush);
 976
 977         return (error);
 978 }
 979
 980 /*
 981  * Create a geom node for the device passed as *pp.
 982  * If successful, add a reference to this gsp.
 983  */
 984 static int
 985 g_sched_create(struct gctl_req *req, struct g_class *mp,
 986     struct g_provider *pp, struct g_gsched *gsp, int proxy)
 987 {
 988         struct g_sched_softc *sc = NULL;
 989         struct g_geom *gp, *dstgp;
 990         struct g_provider *newpp = NULL;
 991         struct g_consumer *cp = NULL;
 992         char name[64];
 993         int error;
 994
 995         g_topology_assert();
 996
 997         snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX);
 998         LIST_FOREACH(gp, &mp->geom, geom) {
 999                 if (strcmp(gp->name, name) == 0) {
1000                         gctl_error(req, "Geom %s already exists.",
1001                             name);
1002                         return (EEXIST);
1003                 }
1004         }
1005
1006         gp = g_new_geomf(mp, "%s", name);
1007         dstgp = proxy ? pp->geom : gp; /* where do we link the provider */
1008
1009         sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
1010         sc->sc_gsched = gsp;
1011         sc->sc_data = gsp->gs_init(gp);
1012         if (sc->sc_data == NULL) {
1013                 error = ENOMEM;
1014                 goto fail;
1015         }
1016
1017         sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK);
1018
1019         /*
1020          * Do not initialize the flush mechanism, will be initialized
1021          * on the first insertion on the hash table.
1022          */
1023
1024         mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF);
1025
1026         gp->softc = sc;
1027         gp->start = g_sched_start;
1028         gp->orphan = g_sched_orphan;
1029         gp->access = g_sched_access;
1030         gp->dumpconf = g_sched_dumpconf;
1031
1032         newpp = g_new_providerf(dstgp, "%s", gp->name);
1033         newpp->mediasize = pp->mediasize;
1034         newpp->sectorsize = pp->sectorsize;
1035
1036         cp = g_new_consumer(gp);
1037         error = g_attach(cp, proxy ? newpp : pp);
1038         if (error != 0) {
1039                 gctl_error(req, "Cannot attach to provider %s.",
1040                     pp->name);
1041                 goto fail;
1042         }
1043
1044         g_error_provider(newpp, 0);
1045         if (proxy) {
1046                 error = g_insert_proxy(gp, newpp, dstgp, pp, cp);
1047                 if (error)
1048                         goto fail;
1049         }
1050         G_SCHED_DEBUG(0, "Device %s created.", gp->name);
1051
1052         g_gsched_ref(gsp);
1053
1054         return (0);
1055
1056 fail:
1057         if (cp != NULL) {
1058                 if (cp->provider != NULL)
1059                         g_detach(cp);
1060                 g_destroy_consumer(cp);
1061         }
1062         if (newpp != NULL)
1063                 g_destroy_provider(newpp);
1064         if (sc->sc_hash)
1065                 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1066                     gsp, sc->sc_data);
1067         if (sc->sc_data)
1068                 gsp->gs_fini(sc->sc_data);
1069         g_free(gp->softc);
1070         g_destroy_geom(gp);
1071
1072         return (error);
1073 }
1074
1075 /*
1076  * Support for dynamic switching of scheduling algorithms.
1077  * First initialize the data structures for the new algorithm,
1078  * then call g_sched_remove_locked() to flush all references
1079  * to the old one, finally link the new algorithm.
1080  */
1081 static int
1082 g_sched_change_algo(struct gctl_req *req, struct g_class *mp,
1083     struct g_provider *pp, struct g_gsched *gsp)
1084 {
1085         struct g_sched_softc *sc;
1086         struct g_geom *gp;
1087         struct g_hash *newh;
1088         void *data;
1089         u_long mask;
1090         int error = 0;
1091
1092         gp = pp->geom;
1093         sc = gp->softc;
1094
1095         data = gsp->gs_init(gp);
1096         if (data == NULL)
1097                 return (ENOMEM);
1098
1099         newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK);
1100         if (gsp->gs_priv_size && !newh) {
1101                 error = ENOMEM;
1102                 goto fail;
1103         }
1104
1105         g_sched_lock(gp);
1106         if (sc->sc_gsched) {    /* can be NULL in some cases */
1107                 error = g_sched_remove_locked(gp, sc->sc_gsched);
1108                 if (error)
1109                         goto fail;
1110         }
1111
1112         g_gsched_ref(gsp);
1113         sc->sc_gsched = gsp;
1114         sc->sc_data = data;
1115         sc->sc_hash = newh;
1116         sc->sc_mask = mask;
1117
1118         g_sched_unlock(gp);
1119
1120         return (0);
1121
1122 fail:
1123         if (newh)
1124                 g_sched_hash_fini(gp, newh, mask, gsp, data);
1125
1126         if (data)
1127                 gsp->gs_fini(data);
1128
1129         g_sched_unlock(gp);
1130
1131         return (error);
1132 }
1133
1134 /*
1135  * Stop the request flow directed to the proxy, redirecting the new
1136  * requests to the me.gs_pending queue.
1137  */
1138 static struct g_provider *
1139 g_detach_proxy(struct g_geom *gp)
1140 {
1141         struct g_consumer *cp;
1142         struct g_provider *pp, *newpp;
1143
1144         do {
1145                 pp = LIST_FIRST(&gp->provider);
1146                 if (pp == NULL)
1147                         break;
1148                 cp = LIST_FIRST(&gp->consumer);
1149                 if (cp == NULL)
1150                         break;
1151                 newpp = cp->provider;
1152                 if (newpp == NULL)
1153                         break;
1154
1155                 me.gs_npending = 0;
1156                 pp->geom->start = g_sched_temporary_start;
1157
1158                 return (pp);
1159         } while (0);
1160         printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name);
1161
1162         return (NULL);
1163 }
1164
1165 static void
1166 g_sched_blackhole(struct bio *bp)
1167 {
1168
1169         g_io_deliver(bp, ENXIO);
1170 }
1171
1172 static inline void
1173 g_reparent_provider(struct g_provider *pp, struct g_geom *gp,
1174     struct g_provider *newpp)
1175 {
1176
1177         LIST_REMOVE(pp, provider);
1178         if (newpp) {
1179                 pp->private = newpp->private;
1180                 pp->index = newpp->index;
1181         }
1182         pp->geom = gp;
1183         LIST_INSERT_HEAD(&gp->provider, pp, provider);
1184 }
1185
1186 static inline void
1187 g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp)
1188 {
1189         struct g_geom *gp = oldpp->geom;
1190
1191         g_reparent_provider(oldpp, newpp->geom, newpp);
1192
1193         /*
1194          * Hackish: let the system destroy the old provider for us, just
1195          * in case someone attached a consumer to it, in which case a
1196          * direct call to g_destroy_provider() would not work.
1197          */
1198         g_reparent_provider(newpp, gp, NULL);
1199 }
1200
1201 /*
1202  * Complete the proxy destruction, linking the old provider to its
1203  * original geom, and destroying the proxy provider.  Also take care
1204  * of issuing the pending requests collected in me.gs_pending (if any).
1205  */
1206 static int
1207 g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp)
1208 {
1209         struct g_consumer *cp;
1210         struct g_provider *newpp;
1211
1212         do {
1213                 cp = LIST_FIRST(&gp->consumer);
1214                 if (cp == NULL)
1215                         break;
1216                 newpp = cp->provider;
1217                 if (newpp == NULL)
1218                         break;
1219
1220                 /* Relink the provider to its original geom. */
1221                 g_unproxy_provider(oldpp, newpp);
1222
1223                 /* Detach consumer from provider, and destroy provider. */
1224                 cp->acr = newpp->acr = 0;
1225                 cp->acw = newpp->acw = 0;
1226                 cp->ace = newpp->ace = 0;
1227                 g_detach(cp);
1228
1229                 /* Send the pending bios through the right start function. */
1230                 g_sched_flush_pending(oldpp->geom->start);
1231
1232                 return (0);
1233         } while (0);
1234         printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name);
1235
1236         /* We cannot send the pending bios anywhere... */
1237         g_sched_flush_pending(g_sched_blackhole);
1238
1239         return (EINVAL);
1240 }
1241
1242 static int
1243 g_sched_destroy(struct g_geom *gp, boolean_t force)
1244 {
1245         struct g_provider *pp, *oldpp = NULL;
1246         struct g_sched_softc *sc;
1247         struct g_gsched *gsp;
1248         int error;
1249
1250         g_topology_assert();
1251         sc = gp->softc;
1252         if (sc == NULL)
1253                 return (ENXIO);
1254         if (!(sc->sc_flags & G_SCHED_PROXYING)) {
1255                 pp = LIST_FIRST(&gp->provider);
1256                 if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
1257                         const char *msg = force ?
1258                                 "but we force removal" : "cannot remove";
1259
1260                         G_SCHED_DEBUG(!force,
1261                             "Device %s is still open (r%dw%de%d), %s.",
1262                             pp->name, pp->acr, pp->acw, pp->ace, msg);
1263                         if (!force)
1264                                 return (EBUSY);
1265                 } else {
1266                         G_SCHED_DEBUG(0, "Device %s removed.", gp->name);
1267                 }
1268         } else
1269                 oldpp = g_detach_proxy(gp);
1270
1271         gsp = sc->sc_gsched;
1272         if (gsp) {
1273                 /*
1274                  * XXX bad hack here: force a dispatch to release
1275                  * any reference to the hash table still held by
1276                  * the scheduler.
1277                  */
1278                 g_sched_lock(gp);
1279                 /*
1280                  * We are dying here, no new requests should enter
1281                  * the scheduler.  This is granted by the topolgy,
1282                  * either in case we were proxying (new bios are
1283                  * being redirected) or not (see the access check
1284                  * above).
1285                  */
1286                 g_sched_forced_dispatch(gp);
1287                 error = g_sched_wait_pending(gp);
1288
1289                 if (error) {
1290                         /*
1291                          * Not all the requests came home: this might happen
1292                          * under heavy load, or if we were waiting for any
1293                          * bio which is served in the event path (see
1294                          * geom_slice.c for an example of how this can
1295                          * happen).  Try to restore a working configuration
1296                          * if we can fail.
1297                          */
1298                         if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1299                                 g_sched_flush_pending(force ?
1300                                     g_sched_blackhole : g_sched_start);
1301                         }
1302
1303                         /*
1304                          * In the forced destroy case there is not so much
1305                          * we can do, we have pending bios that will call
1306                          * g_sched_done() somehow, and we don't want them
1307                          * to crash the system using freed memory.  We tell
1308                          * the user that something went wrong, and leak some
1309                          * memory here.
1310                          * Note: the callers using force = 1 ignore the
1311                          * return value.
1312                          */
1313                         if (force) {
1314                                 G_SCHED_DEBUG(0, "Pending requests while "
1315                                     " destroying geom, some memory leaked.");
1316                         }
1317
1318                         return (error);
1319                 }
1320
1321                 g_sched_unlock(gp);
1322                 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1323                     gsp, sc->sc_data);
1324                 sc->sc_hash = NULL;
1325                 gsp->gs_fini(sc->sc_data);
1326                 g_gsched_unref(gsp);
1327                 sc->sc_gsched = NULL;
1328         }
1329
1330         if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1331                 error = g_destroy_proxy(gp, oldpp);
1332
1333                 if (error) {
1334                         if (force) {
1335                                 G_SCHED_DEBUG(0, "Unrecoverable error while "
1336                                     "destroying a proxy geom, leaking some "
1337                                     " memory.");
1338                         }
1339
1340                         return (error);
1341                 }
1342         }
1343
1344         mtx_destroy(&sc->sc_mtx);
1345
1346         g_free(gp->softc);
1347         gp->softc = NULL;
1348         g_wither_geom(gp, ENXIO);
1349
1350         return (error);
1351 }
1352
/*
 * Class destroy_geom method: non-forced destruction of gp.
 * The req and mp arguments are not used.
 */
static int
g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp)
{
        int error;

        error = g_sched_destroy(gp, 0);
        return (error);
}
1360
1361 /*
1362  * Functions related to the classification of requests.
1363  *
1364  * On recent FreeBSD versions (8.0 and above), we store a reference
1365  * to the issuer of a request in bp->bio_classifier1 as soon
1366  * as the bio is posted to the geom queue (and not later, because
1367  * requests are managed by the g_down thread afterwards).
1368  *
1369  * On older versions of the system (but this code is not used
1370  * in any existing release), we [ab]use the caller1 field in the
1371  * root element of the bio tree to store the classification info.
1372  * The marking is done at the beginning of g_io_request()
1373  * and only if we find that the field is NULL.
1374  *
1375  * To avoid rebuilding the kernel, this module will patch the
1376  * initial part of g_io_request() so it jumps to some hand-coded
1377  * assembly that does the marking and then executes the original
1378  * body of g_io_request().
1379  *
1380  * fake_ioreq[] is architecture-specific machine code
1381  * that implements the above. CODE_SIZE, STORE_SIZE etc.
1382  * are constants used in the patching routine. Look at the
1383  * code in g_ioreq_patch() for the details.
1384  */
1385
1386 #ifndef HAVE_BIO_CLASSIFIER
1387 /*
1388  * Support for old FreeBSD versions
1389  */
1390 #if defined(__i386__)
/*
 * i386 trampoline layout:
 *   CODE_SIZE   bytes of classification code;
 *   STORE_SIZE  bytes that g_ioreq_patch() compares with the first bytes
 *               of g_io_request() and that g_ioreq_restore() copies back
 *               over them;
 *   EPILOGUE    bytes for a jump whose 4-byte displacement is filled in
 *               by g_ioreq_patch().
 */
#define CODE_SIZE       29
#define STORE_SIZE      5
#define EPILOGUE        5
#define SIZE            (CODE_SIZE + STORE_SIZE + EPILOGUE)

static u_char fake_ioreq[SIZE] = {
        0x8b, 0x44, 0x24, 0x04,         /* mov bp, %eax */
        /* 1: walk up the bio_parent chain to the root bio */
        0x89, 0xc2,                     /* mov %eax, %edx # edx = bp */
        0x8b, 0x40, 0x64,               /* mov bp->bio_parent, %eax */
        0x85, 0xc0,                     /* test %eax, %eax */
        0x75, 0xf7,                     /* jne 1b */
        0x8b, 0x42, 0x30,               /* mov bp->bio_caller1, %eax */
        0x85, 0xc0,                     /* test %eax, %eax */
        0x75, 0x09,                     /* jne 2f # already marked */
        0x64, 0xa1, 0x00, 0x00,         /* mov %fs:0, %eax */
        0x00, 0x00,                     /* (presumably curthread) */
        0x89, 0x42, 0x30,               /* mov %eax, bp->bio_caller1 */
        /* 2: expected first STORE_SIZE bytes of g_io_request() */
        0x55, 0x89, 0xe5, 0x57, 0x56,
        0xe9, 0x00, 0x00, 0x00, 0x00,   /* jmp back... */
};
1413 #elif defined(__amd64)
/*
 * amd64 trampoline layout:
 *   CODE_SIZE   bytes of classification code;
 *   STORE_SIZE  bytes that g_ioreq_patch() compares with the first bytes
 *               of g_io_request() and that g_ioreq_restore() copies back
 *               over them;
 *   EPILOGUE    bytes for a jump whose 4-byte displacement is filled in
 *               by g_ioreq_patch().
 */
#define CODE_SIZE       38
#define STORE_SIZE      6
#define EPILOGUE        5
#define SIZE            (CODE_SIZE + STORE_SIZE + EPILOGUE)

static u_char fake_ioreq[SIZE] = {
        0x48, 0x89, 0xf8,               /* mov bp, %rax */
        /* 1: walk up the bio_parent chain to the root bio */
        0x48, 0x89, 0xc2,               /* mov %rax, %rdx # rdx = bp */
        0x48, 0x8b, 0x82, 0xa8,         /* mov bp->bio_parent, %rax */
        0x00, 0x00, 0x00,
        0x48, 0x85, 0xc0,               /* test %rax, %rax */
        0x75, 0xf1,                     /* jne 1b */
        0x48, 0x83, 0x7a, 0x58,         /* cmp $0, bp->bio_caller1 */
        0x00,
        0x75, 0x0d,                     /* jne 2f # already marked */
        0x65, 0x48, 0x8b, 0x04,         /* mov %gs:0, %rax */
        0x25, 0x00, 0x00, 0x00,         /* (presumably curthread) */
        0x00,
        0x48, 0x89, 0x42, 0x58,         /* mov %rax, bp->bio_caller1 */
        /* 2: expected first STORE_SIZE bytes of g_io_request() */
        0x55, 0x48, 0x89, 0xe5, 0x41, 0x56,
        0xe9, 0x00, 0x00, 0x00, 0x00,   /* jmp back... */
};
1438 #else /* neither x86 nor amd64 */
1439 static void
1440 g_new_io_request(struct bio *bp, struct g_consumer *cp)
1441 {
1442         struct bio *top = bp;
1443
1444         /*
1445          * bio classification: if bio_caller1 is available in the
1446          * root of the 'struct bio' tree, store there the thread id
1447          * of the thread that originated the request.
1448          * More sophisticated classification schemes can be used.
1449          */
1450         while (top->bio_parent)
1451                 top = top->bio_parent;
1452
1453         if (top->bio_caller1 == NULL)
1454                 top->bio_caller1 = curthread;
1455 }
1456
1457 #error please add the code above in g_new_io_request() to the beginning of \
1458         /sys/geom/geom_io.c::g_io_request(), and remove this line.
1459 #endif /* end of arch-specific code */
1460
1461 static int
1462 g_ioreq_patch(void)
1463 {
1464         u_char *original;
1465         u_long ofs;
1466         int found;
1467
1468         if (me.gs_patched)
1469                 return (-1);
1470
1471         original = (u_char *)g_io_request;
1472
1473         found = !bcmp(original, fake_ioreq + CODE_SIZE, STORE_SIZE);
1474         if (!found)
1475                 return (-1);
1476
1477         /* Jump back to the original + STORE_SIZE. */
1478         ofs = (original + STORE_SIZE) - (fake_ioreq + SIZE);
1479         bcopy(&ofs, fake_ioreq + CODE_SIZE + STORE_SIZE + 1, 4);
1480
1481         /* Patch the original address with a jump to the trampoline. */
1482         *original = 0xe9;     /* jump opcode */
1483         ofs = fake_ioreq - (original + 5);
1484         bcopy(&ofs, original + 1, 4);
1485
1486         me.gs_patched = 1;
1487
1488         return (0);
1489 }
1490
1491 /*
1492  * Restore the original code, this is easy.
1493  */
1494 static void
1495 g_ioreq_restore(void)
1496 {
1497         u_char *original;
1498
1499         if (me.gs_patched) {
1500                 original = (u_char *)g_io_request;
1501                 bcopy(fake_ioreq + CODE_SIZE, original, STORE_SIZE);
1502                 me.gs_patched = 0;
1503         }
1504 }
1505
/*
 * Enable classification on old systems by patching g_io_request().
 * The result of g_ioreq_patch() is deliberately ignored.
 */
static inline void
g_classifier_ini(void)
{

        (void)g_ioreq_patch();
}
1512
/*
 * Disable classification on old systems by restoring the original
 * code of g_io_request().
 */
static inline void
g_classifier_fini(void)
{

        g_ioreq_restore();
}
1519
1520 /*--- end of support code for older FreeBSD versions */
1521
1522 #else /* HAVE_BIO_CLASSIFIER */
1523
1524 /*
1525  * Classifier support for recent FreeBSD versions: we use
1526  * a very simple classifier, only use curthread to tag a request.
1527  * The classifier is registered at module load, and unregistered
1528  * at module unload.
1529  */
1530 static int
1531 g_sched_tag(void *arg, struct bio *bp)
1532 {
1533
1534         bp->bio_classifier1 = curthread;
1535         return (1);
1536 }
1537
1538 static struct g_classifier_hook g_sched_classifier = {
1539         .func = g_sched_tag,
1540 };
1541
1542 static inline void
1543 g_classifier_ini(void)
1544 {
1545
1546         g_register_classifier(&g_sched_classifier);
1547 }
1548
1549 static inline void
1550 g_classifier_fini(void)
1551 {
1552
1553         g_unregister_classifier(&g_sched_classifier);
1554 }
1555 #endif /* HAVE_BIO_CLASSIFIER */
1556
1557 static void
1558 g_sched_init(struct g_class *mp)
1559 {
1560
1561         g_gsched_global_init();
1562
1563         G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.",
1564             mp, &g_sched_class);
1565
1566         /* Patch g_io_request to store classification info in the bio. */
1567         g_classifier_ini();
1568 }
1569
1570 static void
1571 g_sched_fini(struct g_class *mp)
1572 {
1573
1574         g_classifier_fini();
1575
1576         G_SCHED_DEBUG(0, "Unloading...");
1577
1578         KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers"));
1579         mtx_destroy(&me.gs_mtx);
1580 }
1581
1582 static int
1583 g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag,
1584     struct thread *td)
1585 {
1586         struct g_consumer *cp;
1587         struct g_geom *gp;
1588
1589         cp = LIST_FIRST(&pp->geom->consumer);
1590         if (cp == NULL)
1591                 return (ENOIOCTL);
1592         gp = cp->provider->geom;
1593         if (gp->ioctl == NULL)
1594                 return (ENOIOCTL);
1595         return (gp->ioctl(cp->provider, cmd, data, fflag, td));
1596 }
1597
/*
 * Read the i-th argument ("arg<i>") of a request, skipping the /dev/
 * prefix if present.  If the argument is missing an error is recorded
 * in the request and NULL is returned.
 */
static const char *
g_sched_argi(struct gctl_req *req, int i)
{
        static const char *dev_prefix = "/dev/";
        const char *value;
        char key[16];
        int prefix_len = strlen(dev_prefix);

        snprintf(key, sizeof(key), "arg%d", i);
        value = gctl_get_asciiparam(req, key);
        if (value == NULL) {
                gctl_error(req, "No 'arg%d' argument", i);
                return (NULL);
        }

        if (strncmp(value, dev_prefix, prefix_len) == 0)
                value += prefix_len;

        return (value);
}
1618
1619 /*
1620  * Fetch nargs and do appropriate checks.
1621  */
1622 static int
1623 g_sched_get_nargs(struct gctl_req *req)
1624 {
1625         int *nargs;
1626
1627         nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1628         if (nargs == NULL) {
1629                 gctl_error(req, "No 'nargs' argument");
1630                 return (0);
1631         }
1632         if (*nargs <= 0)
1633                 gctl_error(req, "Missing device(s).");
1634         return (*nargs);
1635 }
1636
1637 /*
1638  * Check whether we should add the class on certain volumes when
1639  * this geom is created. Right now this is under control of a kenv
1640  * variable containing the names of all devices that we care about.
1641  * Probably we should only support transparent insertion as the
1642  * preferred mode of operation.
1643  */
1644 static struct g_geom *
1645 g_sched_taste(struct g_class *mp, struct g_provider *pp,
1646                 int flags __unused)
1647 {
1648         struct g_gsched *gsp = NULL;    /* the . algorithm we want */
1649         const char *s;                  /* generic string pointer */
1650         const char *taste_names;        /* devices we like */
1651         int l;
1652
1653         g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__,
1654             mp->name, pp->name);
1655         g_topology_assert();
1656
1657         G_SCHED_DEBUG(2, "Tasting %s.", pp->name);
1658
1659         do {
1660                 /* do not taste on ourselves */
1661                 if (pp->geom->class == mp)
1662                         break;
1663
1664                 taste_names = getenv("geom.sched.taste");
1665                 if (taste_names == NULL)
1666                         break;
1667
1668                 l = strlen(pp->name);
1669                 for (s = taste_names; *s &&
1670                     (s = strstr(s, pp->name)); s++) {
1671                         /* further checks for an exact match */
1672                         if ( (s == taste_names || s[-1] == ' ') &&
1673                              (s[l] == '\0' || s[l] == ' ') )
1674                                 break;
1675                 }
1676                 if (s == NULL)
1677                         break;
1678                 G_SCHED_DEBUG(0, "Attach device %s match [%s]\n",
1679                     pp->name, s);
1680
1681                 /* look up the provider name in the list */
1682                 s = getenv("geom.sched.algo");
1683                 if (s == NULL)
1684                         s = "rr";
1685
1686                 gsp = g_gsched_find(s); /* also get a reference */
1687                 if (gsp == NULL) {
1688                         G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s);
1689                         break;
1690                 }
1691
1692                 /* XXX create with 1 as last argument ? */
1693                 g_sched_create(NULL, mp, pp, gsp, 0);
1694                 g_gsched_unref(gsp);
1695         } while (0);
1696         return NULL;
1697 }
1698
1699 static void
1700 g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy)
1701 {
1702         struct g_provider *pp;
1703         struct g_gsched *gsp;
1704         const char *name;
1705         int i, nargs;
1706
1707         g_topology_assert();
1708
1709         name = gctl_get_asciiparam(req, "algo");
1710         if (name == NULL) {
1711                 gctl_error(req, "No '%s' argument", "algo");
1712                 return;
1713         }
1714
1715         gsp = g_gsched_find(name);      /* also get a reference */
1716         if (gsp == NULL) {
1717                 gctl_error(req, "Bad algorithm '%s'", name);
1718                 return;
1719         }
1720
1721         nargs = g_sched_get_nargs(req);
1722
1723         /*
1724          * Run on the arguments, and break on any error.
1725          * We look for a device name, but skip the /dev/ prefix if any.
1726          */
1727         for (i = 0; i < nargs; i++) {
1728                 name = g_sched_argi(req, i);
1729                 if (name == NULL)
1730                         break;
1731                 pp = g_provider_by_name(name);
1732                 if (pp == NULL) {
1733                         G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1734                         gctl_error(req, "Provider %s is invalid.", name);
1735                         break;
1736                 }
1737                 if (g_sched_create(req, mp, pp, gsp, proxy) != 0)
1738                         break;
1739         }
1740
1741         g_gsched_unref(gsp);
1742 }
1743
1744 static void
1745 g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp)
1746 {
1747         struct g_provider *pp;
1748         struct g_gsched *gsp;
1749         const char *name;
1750         int i, nargs;
1751
1752         g_topology_assert();
1753
1754         name = gctl_get_asciiparam(req, "algo");
1755         if (name == NULL) {
1756                 gctl_error(req, "No '%s' argument", "algo");
1757                 return;
1758         }
1759
1760         gsp = g_gsched_find(name);      /* also get a reference */
1761         if (gsp == NULL) {
1762                 gctl_error(req, "Bad algorithm '%s'", name);
1763                 return;
1764         }
1765
1766         nargs = g_sched_get_nargs(req);
1767
1768         /*
1769          * Run on the arguments, and break on any error.
1770          * We look for a device name, but skip the /dev/ prefix if any.
1771          */
1772         for (i = 0; i < nargs; i++) {
1773                 name = g_sched_argi(req, i);
1774                 if (name == NULL)
1775                         break;
1776                 pp = g_provider_by_name(name);
1777                 if (pp == NULL || pp->geom->class != mp) {
1778                         G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1779                         gctl_error(req, "Provider %s is invalid.", name);
1780                         break;
1781                 }
1782                 if (g_sched_change_algo(req, mp, pp, gsp) != 0)
1783                         break;
1784         }
1785
1786         g_gsched_unref(gsp);
1787 }
1788
1789 static struct g_geom *
1790 g_sched_find_geom(struct g_class *mp, const char *name)
1791 {
1792         struct g_geom *gp;
1793
1794         LIST_FOREACH(gp, &mp->geom, geom) {
1795                 if (strcmp(gp->name, name) == 0)
1796                         return (gp);
1797         }
1798         return (NULL);
1799 }
1800
1801 static void
1802 g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp)
1803 {
1804         int nargs, *force, error, i;
1805         struct g_geom *gp;
1806         const char *name;
1807
1808         g_topology_assert();
1809
1810         nargs = g_sched_get_nargs(req);
1811
1812         force = gctl_get_paraml(req, "force", sizeof(*force));
1813         if (force == NULL) {
1814                 gctl_error(req, "No 'force' argument");
1815                 return;
1816         }
1817
1818         for (i = 0; i < nargs; i++) {
1819                 name = g_sched_argi(req, i);
1820                 if (name == NULL)
1821                         break;
1822
1823                 gp = g_sched_find_geom(mp, name);
1824                 if (gp == NULL) {
1825                         G_SCHED_DEBUG(1, "Device %s is invalid.", name);
1826                         gctl_error(req, "Device %s is invalid.", name);
1827                         break;
1828                 }
1829
1830                 error = g_sched_destroy(gp, *force);
1831                 if (error != 0) {
1832                         gctl_error(req, "Cannot destroy device %s (error=%d).",
1833                             gp->name, error);
1834                         break;
1835                 }
1836         }
1837 }
1838
1839 static void
1840 g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb)
1841 {
1842         uint32_t *version;
1843
1844         g_topology_assert();
1845
1846         version = gctl_get_paraml(req, "version", sizeof(*version));
1847         if (version == NULL) {
1848                 gctl_error(req, "No '%s' argument.", "version");
1849                 return;
1850         }
1851
1852         if (*version != G_SCHED_VERSION) {
1853                 gctl_error(req, "Userland and kernel parts are "
1854                     "out of sync.");
1855                 return;
1856         }
1857
1858         if (strcmp(verb, "create") == 0) {
1859                 g_sched_ctl_create(req, mp, 0);
1860                 return;
1861         } else if (strcmp(verb, "insert") == 0) {
1862                 g_sched_ctl_create(req, mp, 1);
1863                 return;
1864         } else if (strcmp(verb, "configure") == 0) {
1865                 g_sched_ctl_configure(req, mp);
1866                 return;
1867         } else if (strcmp(verb, "destroy") == 0) {
1868                 g_sched_ctl_destroy(req, mp);
1869                 return;
1870         }
1871
1872         gctl_error(req, "Unknown verb.");
1873 }
1874
1875 static void
1876 g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
1877     struct g_consumer *cp, struct g_provider *pp)
1878 {
1879         struct g_sched_softc *sc = gp->softc;
1880         struct g_gsched *gsp = sc->sc_gsched;
1881         if (indent == NULL) {   /* plaintext */
1882                 sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--");
1883         }
1884         if (gsp != NULL && gsp->gs_dumpconf)
1885                 gsp->gs_dumpconf(sb, indent, gp, cp, pp);
1886 }
1887
/*
 * Declare the GEOM class (g_sched_class, defined outside this part of
 * the file) under the module name g_sched, and the version of the
 * geom_sched module.
 */
DECLARE_GEOM_CLASS(g_sched_class, g_sched);
MODULE_VERSION(geom_sched, 0);