sys/geom/sched/g_sched.c

   1 /*-
   2  * Copyright (c) 2009-2010 Fabio Checconi
   3  * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  25  * SUCH DAMAGE.
  26  */
  27
  28 /*
  29  * $Id$
  30  * $FreeBSD$
  31  *
  32  * Main control module for geom-based disk schedulers ('sched').
  33  *
  34  * USER VIEW
  35  * A 'sched' node is typically inserted transparently between
  36  * an existing provider pp and its original geom gp
  37  *
  38  *      [pp --> gp  ..]
  39  *
  40  * using the command "geom sched insert <provider>" and
  41  * resulting in the following topology
  42  *
  43  *      [pp --> sched_gp --> cp]   [new_pp --> gp ... ]
  44  *
  45  * Deletion "geom sched destroy <provider>.sched." restores the
   46  * original chain. The normal "geom sched create <provider>"
  47  * is also supported.
  48  *
  49  * INTERNALS
  50  * Internally, the 'sched' uses the following data structures
  51  *
  52  *   geom{}         g_sched_softc{}      g_gsched{}
  53  * +----------+    +---------------+   +-------------+
  54  * |  softc *-|--->| sc_gsched   *-|-->|  gs_init    |
  55  * |  ...     |    |               |   |  gs_fini    |
  56  * |          |    | [ hash table] |   |  gs_start   |
  57  * +----------+    |               |   |  ...        |
  58  *                 |               |   +-------------+
  59  *                 |               |
  60  *                 |               |     g_*_softc{}
  61  *                 |               |   +-------------+
  62  *                 | sc_data     *-|-->|             |
  63  *                 +---------------+   |  algorithm- |
  64  *                                     |  specific   |
  65  *                                     +-------------+
  66  *
  67  * A g_sched_softc{} is created with a "geom sched insert" call.
  68  * In turn this instantiates a specific scheduling algorithm,
  69  * which sets sc_gsched to point to the algorithm callbacks,
  70  * and calls gs_init() to create the g_*_softc{} .
  71  * The other callbacks (gs_start, gs_next, ...) are invoked
  72  * as needed
  73  *
  74  * g_sched_softc{} is defined in g_sched.h and mostly used here;
  75  * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h;
  76  * g_*_softc{} is defined/implemented by each algorithm (gs_*.c)
  77  *
  78  * DATA MOVING
  79  * When a bio is received on the provider, it goes to the
  80  * g_sched_start() which calls gs_start() to initially queue it;
  81  * then we call g_sched_dispatch() that loops around gs_next()
  82  * to select zero or more bio's to be sent downstream.
  83  *
  84  * g_sched_dispatch() can also be called as a result of a timeout,
  85  * e.g. when doing anticipation or pacing requests.
  86  *
  87  * When a bio comes back, it goes to g_sched_done() which in turn
  88  * calls gs_done(). The latter does any necessary housekeeping in
  89  * the scheduling algorithm, and may decide to call g_sched_dispatch()
  90  * to send more bio's downstream.
  91  *
  92  * If an algorithm needs per-flow queues, these are created
  93  * calling gs_init_class() and destroyed with gs_fini_class(),
  94  * and they are also inserted in the hash table implemented in
  95  * the g_sched_softc{}
  96  *
  97  * If an algorithm is replaced, or a transparently-inserted node is
  98  * removed with "geom sched destroy", we need to remove all references
  99  * to the g_*_softc{} and g_sched_softc from the bio's still in
 100  * the scheduler. g_sched_forced_dispatch() helps doing this.
 101  * XXX need to explain better.
 102  */
 103
 104 #include <sys/cdefs.h>
 105 #include <sys/param.h>
 106 #include <sys/systm.h>
 107 #include <sys/kernel.h>
 108 #include <sys/module.h>
 109 #include <sys/lock.h>
 110 #include <sys/mutex.h>
 111 #include <sys/bio.h>
 112 #include <sys/limits.h>
 113 #include <sys/hash.h>
 114 #include <sys/sysctl.h>
 115 #include <sys/malloc.h>
 116 #include <sys/proc.h>           /* we access curthread */
 117 #include <geom/geom.h>
 118 #include "gs_scheduler.h"
 119 #include "g_sched.h"            /* geom hooks */
 120
 121 /*
 122  * Size of the per-geom hash table storing traffic classes.
 123  * We may decide to change it at a later time, it has no ABI
 124  * implications as it is only used for run-time allocations.
 125  */
 126 #define G_SCHED_HASH_SIZE       32
 127
 128 static int g_sched_destroy(struct g_geom *gp, boolean_t force);
 129 static int g_sched_destroy_geom(struct gctl_req *req,
 130     struct g_class *mp, struct g_geom *gp);
 131 static void g_sched_config(struct gctl_req *req, struct g_class *mp,
 132     const char *verb);
 133 static struct g_geom *g_sched_taste(struct g_class *mp,
 134     struct g_provider *pp, int flags __unused);
 135 static void g_sched_dumpconf(struct sbuf *sb, const char *indent,
 136     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
 137 static void g_sched_init(struct g_class *mp);
 138 static void g_sched_fini(struct g_class *mp);
 139 static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data,
 140     int fflag, struct thread *td);
 141
 142 struct g_class g_sched_class = {
 143         .name = G_SCHED_CLASS_NAME,
 144         .version = G_VERSION,
 145         .ctlreq = g_sched_config,
 146         .taste = g_sched_taste,
 147         .destroy_geom = g_sched_destroy_geom,
 148         .init = g_sched_init,
 149         .ioctl = g_sched_ioctl,
 150         .fini = g_sched_fini
 151 };
 152
 153 MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures");
 154
/*
 * Global variables describing the state of the geom_sched module.
 * There is only one static instance of this structure ('me', below),
 * whose fields are also exported through the kern.geom.sched sysctls.
 */
LIST_HEAD(gs_list, g_gsched);   /* type, link field */
struct geom_sched_vars {
        struct mtx      gs_mtx;         /* held around gs_scheds / gs_pending updates */
        struct gs_list  gs_scheds;      /* list of algorithms */
        u_int           gs_debug;       /* debug level (sysctl "debug") */
        u_int           gs_sched_count; /* how many algorithms ? */
        u_int           gs_patched;     /* g_io_request was patched */

        u_int           gs_initialized; /* set once by g_gsched_global_init() */
        u_int           gs_expire_secs; /* expiration of hash entries */

        /* Bios parked by g_sched_temporary_start() and their count. */
        struct bio_queue_head gs_pending;
        u_int           gs_npending;

        /*
         * The following are for stats, usually protected by gs_mtx.
         * NOTE(review): g_sched_start() and g_sched_update_stats() modify
         * them while holding the per-geom sc_mtx rather than gs_mtx.
         */
        u_long          gs_requests;    /* total requests */
        u_long          gs_done;        /* total done */
        u_int           gs_in_flight;   /* requests in flight */
        u_int           gs_writes_in_flight;            /* write requests in flight */
        u_int           gs_bytes_in_flight;             /* bytes in flight */
        u_int           gs_write_bytes_in_flight;       /* write bytes in flight */

        char            gs_names[256];  /* names of schedulers */
};

/* The single module-wide instance; hash entries expire after 10s by default. */
static struct geom_sched_vars me = {
        .gs_expire_secs = 10,
};
 187
 188 SYSCTL_DECL(_kern_geom);
 189 SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
 190     "GEOM_SCHED stuff");
 191
 192 SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD,
 193     &me.gs_write_bytes_in_flight, 0, "Write bytes in flight");
 194
 195 SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD,
 196     &me.gs_bytes_in_flight, 0, "Bytes in flight");
 197
 198 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD,
 199     &me.gs_writes_in_flight, 0, "Write Requests in flight");
 200
 201 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD,
 202     &me.gs_in_flight, 0, "Requests in flight");
 203
 204 SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD,
 205     &me.gs_done, 0, "Total done");
 206
 207 SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD,
 208     &me.gs_requests, 0, "Total requests");
 209
 210 SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD,
 211     &me.gs_names, 0, "Algorithm names");
 212
 213 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD,
 214     &me.gs_sched_count, 0, "Number of algorithms");
 215
 216 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW,
 217     &me.gs_debug, 0, "Debug level");
 218
 219 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW,
 220     &me.gs_expire_secs, 0, "Expire time in seconds");
 221
 222 /*
 223  * g_sched calls the scheduler algorithms with this lock held.
 224  * The locking functions are exposed so the scheduler algorithms can also
 225  * protect themselves e.g. when running a callout handler.
 226  */
 227 void
 228 g_sched_lock(struct g_geom *gp)
 229 {
 230         struct g_sched_softc *sc = gp->softc;
 231
 232         mtx_lock(&sc->sc_mtx);
 233 }
 234
 235 void
 236 g_sched_unlock(struct g_geom *gp)
 237 {
 238         struct g_sched_softc *sc = gp->softc;
 239
 240         mtx_unlock(&sc->sc_mtx);
 241 }
 242
 243 /*
 244  * Support functions to handle references to the module,
 245  * which are coming from devices using this scheduler.
 246  */
 247 static inline void
 248 g_gsched_ref(struct g_gsched *gsp)
 249 {
 250
 251         atomic_add_int(&gsp->gs_refs, 1);
 252 }
 253
 254 static inline void
 255 g_gsched_unref(struct g_gsched *gsp)
 256 {
 257
 258         atomic_add_int(&gsp->gs_refs, -1);
 259 }
 260
 261 /*
 262  * Update the stats when this request is done.
 263  */
 264 static void
 265 g_sched_update_stats(struct bio *bio)
 266 {
 267
 268         me.gs_done++;
 269         me.gs_in_flight--;
 270         me.gs_bytes_in_flight -= bio->bio_length;
 271         if (bio->bio_cmd & BIO_WRITE) {
 272                 me.gs_writes_in_flight--;
 273                 me.gs_write_bytes_in_flight -= bio->bio_length;
 274         }
 275 }
 276
 277 /*
 278  * Dispatch any pending request.
 279  */
 280 static void
 281 g_sched_forced_dispatch(struct g_geom *gp)
 282 {
 283         struct g_sched_softc *sc = gp->softc;
 284         struct g_gsched *gsp = sc->sc_gsched;
 285         struct bio *bp;
 286
 287         KASSERT(mtx_owned(&sc->sc_mtx),
 288             ("sc_mtx not owned during forced dispatch"));
 289
 290         while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL)
 291                 g_io_request(bp, LIST_FIRST(&gp->consumer));
 292 }
 293
 294 /*
 295  * The main dispatch loop, called either here after the start
 296  * routine, or by scheduling algorithms when they receive a timeout
 297  * or a 'done' notification.  Does not share code with the forced
 298  * dispatch path, since the gs_done() callback can call us.
 299  */
 300 void
 301 g_sched_dispatch(struct g_geom *gp)
 302 {
 303         struct g_sched_softc *sc = gp->softc;
 304         struct g_gsched *gsp = sc->sc_gsched;
 305         struct bio *bp;
 306
 307         KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch"));
 308
 309         if ((sc->sc_flags & G_SCHED_FLUSHING))
 310                 return;
 311
 312         while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL)
 313                 g_io_request(bp, LIST_FIRST(&gp->consumer));
 314 }
 315
 316 /*
 317  * Recent (8.0 and above) versions of FreeBSD have support to
 318  * register classifiers of disk requests. The classifier is
 319  * invoked by g_io_request(), and stores the information into
 320  * bp->bio_classifier1.
 321  *
 322  * Support for older versions, which is left here only for
 323  * documentation purposes, relies on two hacks:
 324  * 1. classification info is written into the bio_caller1
 325  *    field of the topmost node in the bio chain. This field
 326  *    is rarely used, but this module is incompatible with
 327  *    those that use bio_caller1 for other purposes,
 328  *    such as ZFS and gjournal;
 329  * 2. g_io_request() is patched in-memory when the module is
 330  *    loaded, so that the function calls a classifier as its
 331  *    first thing. g_io_request() is restored when the module
 332  *    is unloaded. This functionality is only supported for
 333  *    x86 and amd64, other architectures need source code changes.
 334  */
 335
 336 /*
 337  * Lookup the identity of the issuer of the original request.
 338  * In the current implementation we use the curthread of the
 339  * issuer, but different mechanisms may be implemented later
 340  * so we do not make assumptions on the return value which for
 341  * us is just an opaque identifier.
 342  */
 343
 344 static inline u_long
 345 g_sched_classify(struct bio *bp)
 346 {
 347
 348 #if __FreeBSD_version > 800098
 349         /* we have classifier fields in the struct bio */
 350 #define HAVE_BIO_CLASSIFIER
 351         return ((u_long)bp->bio_classifier1);
 352 #else
 353 #warning old version!!!
 354         while (bp->bio_parent != NULL)
 355                 bp = bp->bio_parent;
 356
 357         return ((u_long)bp->bio_caller1);
 358 #endif
 359 }
 360
 361 /* Return the hash chain for the given key. */
 362 static inline struct g_hash *
 363 g_sched_hash(struct g_sched_softc *sc, u_long key)
 364 {
 365
 366         return (&sc->sc_hash[key & sc->sc_mask]);
 367 }
 368
 369 /*
 370  * Helper function for the children classes, which takes
 371  * a geom and a bio and returns the private descriptor
 372  * associated to the request.  This involves fetching
 373  * the classification field and [al]locating the
 374  * corresponding entry in the hash table.
 375  */
 376 void *
 377 g_sched_get_class(struct g_geom *gp, struct bio *bp)
 378 {
 379         struct g_sched_softc *sc;
 380         struct g_sched_class *gsc;
 381         struct g_gsched *gsp;
 382         struct g_hash *bucket;
 383         u_long key;
 384
 385         sc = gp->softc;
 386         key = g_sched_classify(bp);
 387         bucket = g_sched_hash(sc, key);
 388         LIST_FOREACH(gsc, bucket, gsc_clist) {
 389                 if (key == gsc->gsc_key) {
 390                         gsc->gsc_refs++;
 391                         return (gsc->gsc_priv);
 392                 }
 393         }
 394
 395         gsp = sc->sc_gsched;
 396         gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size,
 397             M_GEOM_SCHED, M_NOWAIT | M_ZERO);
 398         if (!gsc)
 399                 return (NULL);
 400
 401         if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
 402                 free(gsc, M_GEOM_SCHED);
 403                 return (NULL);
 404         }
 405
 406         gsc->gsc_refs = 2;      /* 1 for the hash table, 1 for the caller. */
 407         gsc->gsc_key = key;
 408         LIST_INSERT_HEAD(bucket, gsc, gsc_clist);
 409
 410         gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
 411
 412         return (gsc->gsc_priv);
 413 }
 414
 415 /*
 416  * Release a reference to the per-client descriptor,
 417  */
 418 void
 419 g_sched_put_class(struct g_geom *gp, void *priv)
 420 {
 421         struct g_sched_class *gsc;
 422         struct g_sched_softc *sc;
 423
 424         gsc = g_sched_priv2class(priv);
 425         gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
 426
 427         if (--gsc->gsc_refs > 0)
 428                 return;
 429
 430         sc = gp->softc;
 431         sc->sc_gsched->gs_fini_class(sc->sc_data, priv);
 432
 433         LIST_REMOVE(gsc, gsc_clist);
 434         free(gsc, M_GEOM_SCHED);
 435 }
 436
 437 static void
 438 g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
 439     struct g_gsched *gsp, void *data)
 440 {
 441         struct g_sched_class *cp, *cp2;
 442         int i;
 443
 444         if (!hp)
 445                 return;
 446
 447         if (data && gsp->gs_hash_unref)
 448                 gsp->gs_hash_unref(data);
 449
 450         for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
 451                 LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
 452                         g_sched_put_class(gp, cp->gsc_priv);
 453         }
 454
 455         hashdestroy(hp, M_GEOM_SCHED, mask);
 456 }
 457
 458 static struct g_hash *
 459 g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags)
 460 {
 461         struct g_hash *hash;
 462
 463         if (gsp->gs_priv_size == 0)
 464                 return (NULL);
 465
 466         hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags);
 467
 468         return (hash);
 469 }
 470
 471 static void
 472 g_sched_flush_classes(struct g_geom *gp)
 473 {
 474         struct g_sched_softc *sc;
 475         struct g_sched_class *cp, *cp2;
 476         int i;
 477
 478         sc = gp->softc;
 479
 480         if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0)
 481                 return;
 482
 483         for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
 484                 LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) {
 485                         if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0)
 486                                 g_sched_put_class(gp, cp->gsc_priv);
 487                 }
 488         }
 489
 490         sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz;
 491 }
 492
 493 /*
 494  * Wait for the completion of any outstanding request.  To ensure
 495  * that this does not take forever the caller has to make sure that
 496  * no new request enter the scehduler before calling us.
 497  *
 498  * Must be called with the gp mutex held and topology locked.
 499  */
 500 static int
 501 g_sched_wait_pending(struct g_geom *gp)
 502 {
 503         struct g_sched_softc *sc = gp->softc;
 504         int endticks = ticks + hz;
 505
 506         g_topology_assert();
 507
 508         while (sc->sc_pending && endticks - ticks >= 0)
 509                 msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);
 510
 511         return (sc->sc_pending ? ETIMEDOUT : 0);
 512 }
 513
 514 static int
 515 g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp)
 516 {
 517         struct g_sched_softc *sc = gp->softc;
 518         int error;
 519
 520         /* Set the flushing flag: new bios will not enter the scheduler. */
 521         sc->sc_flags |= G_SCHED_FLUSHING;
 522
 523         g_sched_forced_dispatch(gp);
 524         error = g_sched_wait_pending(gp);
 525         if (error)
 526                 goto failed;
 527
 528         /* No more requests pending or in flight from the old gsp. */
 529
 530         g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data);
 531         sc->sc_hash = NULL;
 532
 533         /*
 534          * Avoid deadlock here by releasing the gp mutex and reacquiring
 535          * it once done.  It should be safe, since no reconfiguration or
 536          * destruction can take place due to the geom topology lock; no
 537          * new request can use the current sc_data since we flagged the
 538          * geom as being flushed.
 539          */
 540         g_sched_unlock(gp);
 541         gsp->gs_fini(sc->sc_data);
 542         g_sched_lock(gp);
 543
 544         sc->sc_gsched = NULL;
 545         sc->sc_data = NULL;
 546         g_gsched_unref(gsp);
 547
 548 failed:
 549         sc->sc_flags &= ~G_SCHED_FLUSHING;
 550
 551         return (error);
 552 }
 553
 554 static int
 555 g_sched_remove(struct g_geom *gp, struct g_gsched *gsp)
 556 {
 557         int error;
 558
 559         g_sched_lock(gp);
 560         error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */
 561         g_sched_unlock(gp);
 562
 563         return (error);
 564 }
 565
 566 /*
 567  * Support function for create/taste -- locate the desired
 568  * algorithm and grab a reference to it.
 569  */
 570 static struct g_gsched *
 571 g_gsched_find(const char *name)
 572 {
 573         struct g_gsched *gsp = NULL;
 574
 575         mtx_lock(&me.gs_mtx);
 576         LIST_FOREACH(gsp, &me.gs_scheds, glist) {
 577                 if (strcmp(name, gsp->gs_name) == 0) {
 578                         g_gsched_ref(gsp);
 579                         break;
 580                 }
 581         }
 582         mtx_unlock(&me.gs_mtx);
 583
 584         return (gsp);
 585 }
 586
 587 /*
 588  * Rebuild the list of scheduler names.
 589  * To be called with me.gs_mtx lock held.
 590  */
 591 static void
 592 g_gsched_build_names(struct g_gsched *gsp)
 593 {
 594         int pos, l;
 595         struct g_gsched *cur;
 596
 597         pos = 0;
 598         LIST_FOREACH(cur, &me.gs_scheds, glist) {
 599                 l = strlen(cur->gs_name);
 600                 if (l + pos + 1 + 1 < sizeof(me.gs_names)) {
 601                         if (pos != 0)
 602                                 me.gs_names[pos++] = ' ';
 603                         strcpy(me.gs_names + pos, cur->gs_name);
 604                         pos += l;
 605                 }
 606         }
 607         me.gs_names[pos] = '\0';
 608 }
 609
 610 /*
 611  * Register or unregister individual scheduling algorithms.
 612  */
 613 static int
 614 g_gsched_register(struct g_gsched *gsp)
 615 {
 616         struct g_gsched *cur;
 617         int error = 0;
 618
 619         mtx_lock(&me.gs_mtx);
 620         LIST_FOREACH(cur, &me.gs_scheds, glist) {
 621                 if (strcmp(gsp->gs_name, cur->gs_name) == 0)
 622                         break;
 623         }
 624         if (cur != NULL) {
 625                 G_SCHED_DEBUG(0, "A scheduler named %s already"
 626                     "exists.", gsp->gs_name);
 627                 error = EEXIST;
 628         } else {
 629                 LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist);
 630                 gsp->gs_refs = 1;
 631                 me.gs_sched_count++;
 632                 g_gsched_build_names(gsp);
 633         }
 634         mtx_unlock(&me.gs_mtx);
 635
 636         return (error);
 637 }
 638
/*
 * Argument block passed by g_gsched_modevent() to g_gsched_unregister()
 * through g_waitfor_event().
 */
struct g_gsched_unregparm {
        struct g_gsched *gup_gsp;       /* in: algorithm to unregister */
        int             gup_error;      /* out: result of the unregistration */
};
 643
 644 static void
 645 g_gsched_unregister(void *arg, int flag)
 646 {
 647         struct g_gsched_unregparm *parm = arg;
 648         struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp;
 649         struct g_sched_softc *sc;
 650         struct g_geom *gp, *gp_tmp;
 651         int error;
 652
 653         parm->gup_error = 0;
 654
 655         g_topology_assert();
 656
 657         if (flag == EV_CANCEL)
 658                 return;
 659
 660         mtx_lock(&me.gs_mtx);
 661
 662         LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) {
 663                 if (gp->class != &g_sched_class)
 664                         continue;       /* Should not happen. */
 665
 666                 sc = gp->softc;
 667                 if (sc->sc_gsched == gsp) {
 668                         error = g_sched_remove(gp, gsp);
 669                         if (error)
 670                                 goto failed;
 671                 }
 672         }
 673
 674         LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) {
 675                 if (cur != gsp)
 676                         continue;
 677
 678                 if (gsp->gs_refs != 1) {
 679                         G_SCHED_DEBUG(0, "%s still in use.",
 680                             gsp->gs_name);
 681                         parm->gup_error = EBUSY;
 682                 } else {
 683                         LIST_REMOVE(gsp, glist);
 684                         me.gs_sched_count--;
 685                         g_gsched_build_names(gsp);
 686                 }
 687                 break;
 688         }
 689
 690         if (cur == NULL) {
 691                 G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name);
 692                 parm->gup_error = ENOENT;
 693         }
 694
 695 failed:
 696         mtx_unlock(&me.gs_mtx);
 697 }
 698
 699 static inline void
 700 g_gsched_global_init(void)
 701 {
 702
 703         if (!me.gs_initialized) {
 704                 G_SCHED_DEBUG(0, "Initializing global data.");
 705                 mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF);
 706                 LIST_INIT(&me.gs_scheds);
 707                 gs_bioq_init(&me.gs_pending);
 708                 me.gs_initialized = 1;
 709         }
 710 }
 711
 712 /*
 713  * Module event called when a scheduling algorithm module is loaded or
 714  * unloaded.
 715  */
 716 int
 717 g_gsched_modevent(module_t mod, int cmd, void *arg)
 718 {
 719         struct g_gsched *gsp = arg;
 720         struct g_gsched_unregparm parm;
 721         int error;
 722
 723         G_SCHED_DEBUG(0, "Modevent %d.", cmd);
 724
 725         /*
 726          * If the module is loaded at boot, the geom thread that calls
 727          * g_sched_init() might actually run after g_gsched_modevent(),
 728          * so make sure that the module is properly initialized.
 729          */
 730         g_gsched_global_init();
 731
 732         error = EOPNOTSUPP;
 733         switch (cmd) {
 734         case MOD_LOAD:
 735                 error = g_gsched_register(gsp);
 736                 G_SCHED_DEBUG(0, "Loaded module %s error %d.",
 737                     gsp->gs_name, error);
 738                 if (error == 0)
 739                         g_retaste(&g_sched_class);
 740                 break;
 741
 742         case MOD_UNLOAD:
 743                 parm.gup_gsp = gsp;
 744                 parm.gup_error = 0;
 745
 746                 error = g_waitfor_event(g_gsched_unregister,
 747                     &parm, M_WAITOK, NULL);
 748                 if (error == 0)
 749                         error = parm.gup_error;
 750                 G_SCHED_DEBUG(0, "Unloaded module %s error %d.",
 751                     gsp->gs_name, error);
 752                 break;
 753         };
 754
 755         return (error);
 756 }
 757
 758 #ifdef KTR
 759 #define TRC_BIO_EVENT(e, bp)    g_sched_trace_bio_ ## e (bp)
 760
 761 static inline char
 762 g_sched_type(struct bio *bp)
 763 {
 764
 765         if (0 != (bp->bio_cmd & BIO_READ))
 766                 return ('R');
 767         else if (0 != (bp->bio_cmd & BIO_WRITE))
 768                 return ('W');
 769         return ('U');
 770 }
 771
 772 static inline void
 773 g_sched_trace_bio_START(struct bio *bp)
 774 {
 775
 776         CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp),
 777             g_sched_type(bp), bp->bio_offset / ULONG_MAX,
 778             bp->bio_offset, bp->bio_length);
 779 }
 780
 781 static inline void
 782 g_sched_trace_bio_DONE(struct bio *bp)
 783 {
 784
 785         CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp),
 786             g_sched_type(bp), bp->bio_offset / ULONG_MAX,
 787             bp->bio_offset, bp->bio_length);
 788 }
 789 #else /* !KTR */
 790 #define TRC_BIO_EVENT(e, bp)
 791 #endif /* !KTR */
 792
 793 /*
 794  * g_sched_done() and g_sched_start() dispatch the geom requests to
 795  * the scheduling algorithm in use.
 796  */
 797 static void
 798 g_sched_done(struct bio *bio)
 799 {
 800         struct g_geom *gp = bio->bio_caller2;
 801         struct g_sched_softc *sc = gp->softc;
 802
 803         TRC_BIO_EVENT(DONE, bio);
 804
 805         KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done"));
 806
 807         g_sched_lock(gp);
 808
 809         g_sched_update_stats(bio);
 810         sc->sc_gsched->gs_done(sc->sc_data, bio);
 811         if (!--sc->sc_pending)
 812                 wakeup(gp);
 813
 814         g_sched_flush_classes(gp);
 815         g_sched_unlock(gp);
 816
 817         g_std_done(bio);
 818 }
 819
 820 static void
 821 g_sched_start(struct bio *bp)
 822 {
 823         struct g_geom *gp = bp->bio_to->geom;
 824         struct g_sched_softc *sc = gp->softc;
 825         struct bio *cbp;
 826
 827         TRC_BIO_EVENT(START, bp);
 828         G_SCHED_LOGREQ(bp, "Request received.");
 829
 830         cbp = g_clone_bio(bp);
 831         if (cbp == NULL) {
 832                 g_io_deliver(bp, ENOMEM);
 833                 return;
 834         }
 835         cbp->bio_done = g_sched_done;
 836         cbp->bio_to = LIST_FIRST(&gp->provider);
 837         KASSERT(cbp->bio_to != NULL, ("NULL provider"));
 838
 839         /* We only schedule reads and writes. */
 840         if (0 == (bp->bio_cmd & (BIO_READ | BIO_WRITE)))
 841                 goto bypass;
 842
 843         G_SCHED_LOGREQ(cbp, "Sending request.");
 844
 845         g_sched_lock(gp);
 846         /*
 847          * Call the algorithm's gs_start to queue the request in the
 848          * scheduler. If gs_start fails then pass the request down,
 849          * otherwise call g_sched_dispatch() which tries to push
 850          * one or more requests down.
 851          */
 852         if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) ||
 853             sc->sc_gsched->gs_start(sc->sc_data, cbp)) {
 854                 g_sched_unlock(gp);
 855                 goto bypass;
 856         }
 857         /*
 858          * We use bio_caller1 to mark requests that are scheduled
 859          * so make sure it is not NULL.
 860          */
 861         if (cbp->bio_caller1 == NULL)
 862                 cbp->bio_caller1 = &me; /* anything not NULL */
 863
 864         cbp->bio_caller2 = gp;
 865         sc->sc_pending++;
 866
 867         /* Update general stats. */
 868         me.gs_in_flight++;
 869         me.gs_requests++;
 870         me.gs_bytes_in_flight += bp->bio_length;
 871         if (bp->bio_cmd & BIO_WRITE) {
 872                 me.gs_writes_in_flight++;
 873                 me.gs_write_bytes_in_flight += bp->bio_length;
 874         }
 875         g_sched_dispatch(gp);
 876         g_sched_unlock(gp);
 877         return;
 878
 879 bypass:
 880         cbp->bio_done = g_std_done;
 881         cbp->bio_caller1 = NULL; /* not scheduled */
 882         g_io_request(cbp, LIST_FIRST(&gp->consumer));
 883 }
 884
 885 /*
 886  * The next few functions are the geom glue.
 887  */
 888 static void
 889 g_sched_orphan(struct g_consumer *cp)
 890 {
 891
 892         g_topology_assert();
 893         g_sched_destroy(cp->geom, 1);
 894 }
 895
 896 static int
 897 g_sched_access(struct g_provider *pp, int dr, int dw, int de)
 898 {
 899         struct g_geom *gp;
 900         struct g_consumer *cp;
 901         int error;
 902
 903         gp = pp->geom;
 904         cp = LIST_FIRST(&gp->consumer);
 905         error = g_access(cp, dr, dw, de);
 906
 907         return (error);
 908 }
 909
 910 static void
 911 g_sched_temporary_start(struct bio *bio)
 912 {
 913
 914         mtx_lock(&me.gs_mtx);
 915         me.gs_npending++;
 916         gs_bioq_disksort(&me.gs_pending, bio);
 917         mtx_unlock(&me.gs_mtx);
 918 }
 919
 920 static void
 921 g_sched_flush_pending(g_start_t *start)
 922 {
 923         struct bio *bp;
 924
 925         while ((bp = gs_bioq_takefirst(&me.gs_pending)))
 926                 start(bp);
 927 }
 928
/*
 * Splice the sched geom gp in front of the existing provider pp.
 *
 * gp:    the new sched geom (its softc gets G_SCHED_PROXYING on success).
 * newpp: the freshly created provider; it receives pp's private/index
 *        fields and access counts.
 * dstgp: the geom whose start routine is temporarily replaced.
 * pp:    the provider being proxied; on success it is relinked to gp.
 * cp:    gp's consumer; it receives a copy of pp's access counts.
 *
 * While the switch is in progress, requests reaching dstgp are parked
 * by g_sched_temporary_start().  The function waits (in hz/10 sleeps,
 * until ticks passes ticks + hz) for the requests in flight on pp
 * (nstart - nend) to be exactly the ones parked.  Whatever the outcome,
 * the saved start routine is reinstalled on dstgp and the parked bios
 * are flushed.
 *
 * Returns 0 on success, ETIMEDOUT if the in-flight requests did not
 * drain in time.
 */
static int
g_insert_proxy(struct g_geom *gp, struct g_provider *newpp,
    struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp)
{
        struct g_sched_softc *sc = gp->softc;
        /* On success parked bios go to the scheduler's start routine. */
        g_start_t *saved_start, *flush = g_sched_start;
        int error = 0, endticks = ticks + hz;

        g_cancel_event(newpp);  /* prevent taste() */
        /* copy private fields */
        newpp->private = pp->private;
        newpp->index = pp->index;

        /* Queue all the early requests coming for us. */
        me.gs_npending = 0;
        saved_start = pp->geom->start;
        dstgp->start = g_sched_temporary_start;

        /*
         * Wait until every request started on pp and not yet completed
         * is one of those we parked, or until the deadline expires.
         */
        while (pp->nstart - pp->nend != me.gs_npending &&
            endticks - ticks >= 0)
                tsleep(pp, PRIBIO, "-", hz/10);

        if (pp->nstart - pp->nend != me.gs_npending) {
                /* Give up: parked bios go back to the original start. */
                flush = saved_start;
                error = ETIMEDOUT;
                goto fail;
        }

        /* link pp to this geom */
        LIST_REMOVE(pp, provider);
        pp->geom = gp;
        LIST_INSERT_HEAD(&gp->provider, pp, provider);

        /*
         * replicate the counts from the parent in the
         * new provider and consumer nodes
         */
        cp->acr = newpp->acr = pp->acr;
        cp->acw = newpp->acw = pp->acw;
        cp->ace = newpp->ace = pp->ace;
        sc->sc_flags |= G_SCHED_PROXYING;

fail:
        /* Reached on success too: undo the temporary start routine. */
        dstgp->start = saved_start;

        g_sched_flush_pending(flush);

        return (error);
}
 978
 979 /*
 980  * Create a geom node for the device passed as *pp.
 981  * If successful, add a reference to this gsp.
 982  */
 983 static int
 984 g_sched_create(struct gctl_req *req, struct g_class *mp,
 985     struct g_provider *pp, struct g_gsched *gsp, int proxy)
 986 {
 987         struct g_sched_softc *sc = NULL;
 988         struct g_geom *gp, *dstgp;
 989         struct g_provider *newpp = NULL;
 990         struct g_consumer *cp = NULL;
 991         char name[64];
 992         int error;
 993
 994         g_topology_assert();
 995
 996         snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX);
 997         LIST_FOREACH(gp, &mp->geom, geom) {
 998                 if (strcmp(gp->name, name) == 0) {
 999                         gctl_error(req, "Geom %s already exists.",
1000                             name);
1001                         return (EEXIST);
1002                 }
1003         }
1004
1005         gp = g_new_geomf(mp, name);
1006         dstgp = proxy ? pp->geom : gp; /* where do we link the provider */
1007
1008         sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
1009         sc->sc_gsched = gsp;
1010         sc->sc_data = gsp->gs_init(gp);
1011         if (sc->sc_data == NULL) {
1012                 error = ENOMEM;
1013                 goto fail;
1014         }
1015
1016         sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK);
1017
1018         /*
1019          * Do not initialize the flush mechanism, will be initialized
1020          * on the first insertion on the hash table.
1021          */
1022
1023         mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF);
1024
1025         gp->softc = sc;
1026         gp->start = g_sched_start;
1027         gp->orphan = g_sched_orphan;
1028         gp->access = g_sched_access;
1029         gp->dumpconf = g_sched_dumpconf;
1030
1031         newpp = g_new_providerf(dstgp, gp->name);
1032         newpp->mediasize = pp->mediasize;
1033         newpp->sectorsize = pp->sectorsize;
1034
1035         cp = g_new_consumer(gp);
1036         error = g_attach(cp, proxy ? newpp : pp);
1037         if (error != 0) {
1038                 gctl_error(req, "Cannot attach to provider %s.",
1039                     pp->name);
1040                 goto fail;
1041         }
1042
1043         g_error_provider(newpp, 0);
1044         if (proxy) {
1045                 error = g_insert_proxy(gp, newpp, dstgp, pp, cp);
1046                 if (error)
1047                         goto fail;
1048         }
1049         G_SCHED_DEBUG(0, "Device %s created.", gp->name);
1050
1051         g_gsched_ref(gsp);
1052
1053         return (0);
1054
1055 fail:
1056         if (cp != NULL) {
1057                 if (cp->provider != NULL)
1058                         g_detach(cp);
1059                 g_destroy_consumer(cp);
1060         }
1061         if (newpp != NULL)
1062                 g_destroy_provider(newpp);
1063         if (sc->sc_hash)
1064                 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1065                     gsp, sc->sc_data);
1066         if (sc->sc_data)
1067                 gsp->gs_fini(sc->sc_data);
1068         g_free(gp->softc);
1069         g_destroy_geom(gp);
1070
1071         return (error);
1072 }
1073
1074 /*
1075  * Support for dynamic switching of scheduling algorithms.
1076  * First initialize the data structures for the new algorithm,
1077  * then call g_sched_remove_locked() to flush all references
1078  * to the old one, finally link the new algorithm.
1079  */
1080 static int
1081 g_sched_change_algo(struct gctl_req *req, struct g_class *mp,
1082     struct g_provider *pp, struct g_gsched *gsp)
1083 {
1084         struct g_sched_softc *sc;
1085         struct g_geom *gp;
1086         struct g_hash *newh;
1087         void *data;
1088         u_long mask;
1089         int error = 0;
1090
1091         gp = pp->geom;
1092         sc = gp->softc;
1093
1094         data = gsp->gs_init(gp);
1095         if (data == NULL)
1096                 return (ENOMEM);
1097
1098         newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK);
1099         if (gsp->gs_priv_size && !newh) {
1100                 error = ENOMEM;
1101                 goto fail;
1102         }
1103
1104         g_sched_lock(gp);
1105         if (sc->sc_gsched) {    /* can be NULL in some cases */
1106                 error = g_sched_remove_locked(gp, sc->sc_gsched);
1107                 if (error)
1108                         goto fail;
1109         }
1110
1111         g_gsched_ref(gsp);
1112         sc->sc_gsched = gsp;
1113         sc->sc_data = data;
1114         sc->sc_hash = newh;
1115         sc->sc_mask = mask;
1116
1117         g_sched_unlock(gp);
1118
1119         return (0);
1120
1121 fail:
1122         if (newh)
1123                 g_sched_hash_fini(gp, newh, mask, gsp, data);
1124
1125         if (data)
1126                 gsp->gs_fini(data);
1127
1128         g_sched_unlock(gp);
1129
1130         return (error);
1131 }
1132
1133 /*
1134  * Stop the request flow directed to the proxy, redirecting the new
1135  * requests to the me.gs_pending queue.
1136  */
1137 static struct g_provider *
1138 g_detach_proxy(struct g_geom *gp)
1139 {
1140         struct g_consumer *cp;
1141         struct g_provider *pp, *newpp;
1142
1143         do {
1144                 pp = LIST_FIRST(&gp->provider);
1145                 if (pp == NULL)
1146                         break;
1147                 cp = LIST_FIRST(&gp->consumer);
1148                 if (cp == NULL)
1149                         break;
1150                 newpp = cp->provider;
1151                 if (newpp == NULL)
1152                         break;
1153
1154                 me.gs_npending = 0;
1155                 pp->geom->start = g_sched_temporary_start;
1156
1157                 return (pp);
1158         } while (0);
1159         printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name);
1160
1161         return (NULL);
1162 }
1163
1164 static void
1165 g_sched_blackhole(struct bio *bp)
1166 {
1167
1168         g_io_deliver(bp, ENXIO);
1169 }
1170
1171 static inline void
1172 g_reparent_provider(struct g_provider *pp, struct g_geom *gp,
1173     struct g_provider *newpp)
1174 {
1175
1176         LIST_REMOVE(pp, provider);
1177         if (newpp) {
1178                 pp->private = newpp->private;
1179                 pp->index = newpp->index;
1180         }
1181         pp->geom = gp;
1182         LIST_INSERT_HEAD(&gp->provider, pp, provider);
1183 }
1184
1185 static inline void
1186 g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp)
1187 {
1188         struct g_geom *gp = oldpp->geom;
1189
1190         g_reparent_provider(oldpp, newpp->geom, newpp);
1191
1192         /*
1193          * Hackish: let the system destroy the old provider for us, just
1194          * in case someone attached a consumer to it, in which case a
1195          * direct call to g_destroy_provider() would not work.
1196          */
1197         g_reparent_provider(newpp, gp, NULL);
1198 }
1199
1200 /*
1201  * Complete the proxy destruction, linking the old provider to its
1202  * original geom, and destroying the proxy provider.  Also take care
1203  * of issuing the pending requests collected in me.gs_pending (if any).
1204  */
1205 static int
1206 g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp)
1207 {
1208         struct g_consumer *cp;
1209         struct g_provider *newpp;
1210
1211         do {
1212                 cp = LIST_FIRST(&gp->consumer);
1213                 if (cp == NULL)
1214                         break;
1215                 newpp = cp->provider;
1216                 if (newpp == NULL)
1217                         break;
1218
1219                 /* Relink the provider to its original geom. */
1220                 g_unproxy_provider(oldpp, newpp);
1221
1222                 /* Detach consumer from provider, and destroy provider. */
1223                 cp->acr = newpp->acr = 0;
1224                 cp->acw = newpp->acw = 0;
1225                 cp->ace = newpp->ace = 0;
1226                 g_detach(cp);
1227
1228                 /* Send the pending bios through the right start function. */
1229                 g_sched_flush_pending(oldpp->geom->start);
1230
1231                 return (0);
1232         } while (0);
1233         printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name);
1234
1235         /* We cannot send the pending bios anywhere... */
1236         g_sched_flush_pending(g_sched_blackhole);
1237
1238         return (EINVAL);
1239 }
1240
1241 static int
1242 g_sched_destroy(struct g_geom *gp, boolean_t force)
1243 {
1244         struct g_provider *pp, *oldpp = NULL;
1245         struct g_sched_softc *sc;
1246         struct g_gsched *gsp;
1247         int error;
1248
1249         g_topology_assert();
1250         sc = gp->softc;
1251         if (sc == NULL)
1252                 return (ENXIO);
1253         if (!(sc->sc_flags & G_SCHED_PROXYING)) {
1254                 pp = LIST_FIRST(&gp->provider);
1255                 if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
1256                         const char *msg = force ?
1257                                 "but we force removal" : "cannot remove";
1258
1259                         G_SCHED_DEBUG(!force,
1260                             "Device %s is still open (r%dw%de%d), %s.",
1261                             pp->name, pp->acr, pp->acw, pp->ace, msg);
1262                         if (!force)
1263                                 return (EBUSY);
1264                 } else {
1265                         G_SCHED_DEBUG(0, "Device %s removed.", gp->name);
1266                 }
1267         } else
1268                 oldpp = g_detach_proxy(gp);
1269
1270         gsp = sc->sc_gsched;
1271         if (gsp) {
1272                 /*
1273                  * XXX bad hack here: force a dispatch to release
1274                  * any reference to the hash table still held by
1275                  * the scheduler.
1276                  */
1277                 g_sched_lock(gp);
1278                 /*
1279                  * We are dying here, no new requests should enter
1280                  * the scheduler.  This is granted by the topolgy,
1281                  * either in case we were proxying (new bios are
1282                  * being redirected) or not (see the access check
1283                  * above).
1284                  */
1285                 g_sched_forced_dispatch(gp);
1286                 error = g_sched_wait_pending(gp);
1287
1288                 if (error) {
1289                         /*
1290                          * Not all the requests came home: this might happen
1291                          * under heavy load, or if we were waiting for any
1292                          * bio which is served in the event path (see
1293                          * geom_slice.c for an example of how this can
1294                          * happen).  Try to restore a working configuration
1295                          * if we can fail.
1296                          */
1297                         if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1298                                 g_sched_flush_pending(force ?
1299                                     g_sched_blackhole : g_sched_start);
1300                         }
1301
1302                         /*
1303                          * In the forced destroy case there is not so much
1304                          * we can do, we have pending bios that will call
1305                          * g_sched_done() somehow, and we don't want them
1306                          * to crash the system using freed memory.  We tell
1307                          * the user that something went wrong, and leak some
1308                          * memory here.
1309                          * Note: the callers using force = 1 ignore the
1310                          * return value.
1311                          */
1312                         if (force) {
1313                                 G_SCHED_DEBUG(0, "Pending requests while "
1314                                     " destroying geom, some memory leaked.");
1315                         }
1316
1317                         return (error);
1318                 }
1319
1320                 g_sched_unlock(gp);
1321                 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1322                     gsp, sc->sc_data);
1323                 sc->sc_hash = NULL;
1324                 gsp->gs_fini(sc->sc_data);
1325                 g_gsched_unref(gsp);
1326                 sc->sc_gsched = NULL;
1327         }
1328
1329         if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1330                 error = g_destroy_proxy(gp, oldpp);
1331
1332                 if (error) {
1333                         if (force) {
1334                                 G_SCHED_DEBUG(0, "Unrecoverable error while "
1335                                     "destroying a proxy geom, leaking some "
1336                                     " memory.");
1337                         }
1338
1339                         return (error);
1340                 }
1341         }
1342
1343         mtx_destroy(&sc->sc_mtx);
1344
1345         g_free(gp->softc);
1346         gp->softc = NULL;
1347         g_wither_geom(gp, ENXIO);
1348
1349         return (error);
1350 }
1351
/*
 * Class-level destroy entry point: a plain, non-forced
 * g_sched_destroy() of gp.  req and mp are not used.
 */
static int
g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp)
{
        int error;

        error = g_sched_destroy(gp, 0);
        return (error);
}
1359
1360 /*
1361  * Functions related to the classification of requests.
1362  *
1363  * On recent FreeBSD versions (8.0 and above), we store a reference
1364  * to the issuer of a request in bp->bio_classifier1 as soon
1365  * as the bio is posted to the geom queue (and not later, because
1366  * requests are managed by the g_down thread afterwards).
1367  *
1368  * On older versions of the system (but this code is not used
1369  * in any existing release), we [ab]use the caller1 field in the
1370  * root element of the bio tree to store the classification info.
1371  * The marking is done at the beginning of g_io_request()
1372  * and only if we find that the field is NULL.
1373  *
1374  * To avoid rebuilding the kernel, this module will patch the
1375  * initial part of g_io_request() so it jumps to some hand-coded
1376  * assembly that does the marking and then executes the original
1377  * body of g_io_request().
1378  *
1379  * fake_ioreq[] is architecture-specific machine code
1380  * that implements the above. CODE_SIZE, STORE_SIZE etc.
1381  * are constants used in the patching routine. Look at the
1382  * code in g_ioreq_patch() for the details.
1383  */
1384
1385 #ifndef HAVE_BIO_CLASSIFIER
1386 /*
1387  * Support for old FreeBSD versions
1388  */
1389 #if defined(__i386__)
/*
 * i386 trampoline layout, as consumed by g_ioreq_patch():
 *   CODE_SIZE  bytes of marking code (walk to the root bio and store
 *              a tag in bio_caller1 if it is still NULL);
 *   STORE_SIZE bytes that must match the first bytes of g_io_request()
 *              (they are compared before patching and copied back by
 *              g_ioreq_restore());
 *   EPILOGUE   bytes holding a jmp whose displacement is filled in by
 *              g_ioreq_patch().
 */
#define CODE_SIZE       29
#define STORE_SIZE      5
#define EPILOGUE        5
#define SIZE            (CODE_SIZE + STORE_SIZE + EPILOGUE)

static u_char fake_ioreq[SIZE] = {
        0x8b, 0x44, 0x24, 0x04,         /* mov bp, %eax */
        /* 1: */
        0x89, 0xc2,                     /* mov %eax, %edx # edx = bp */
        0x8b, 0x40, 0x64,               /* mov bp->bio_parent, %eax */
        0x85, 0xc0,                     /* test %eax, %eax */
        0x75, 0xf7,                     /* jne 1b */
        0x8b, 0x42, 0x30,               /* mov bp->bio_caller1, %eax */
        0x85, 0xc0,                     /* test %eax, %eax */
        0x75, 0x09,                     /* jne 2f */
        0x64, 0xa1, 0x00, 0x00,         /* mov %fs:0, %eax */
        0x00, 0x00,
        0x89, 0x42, 0x30,               /* mov %eax, bp->bio_caller1 */
        /* 2: */
        /* STORE_SIZE bytes: expected start of g_io_request(). */
        0x55, 0x89, 0xe5, 0x57, 0x56,
        0xe9, 0x00, 0x00, 0x00, 0x00,   /* jmp back... */
};
1412 #elif defined(__amd64)
/*
 * amd64 trampoline layout, as consumed by g_ioreq_patch():
 *   CODE_SIZE  bytes of marking code (walk to the root bio and store
 *              a tag in bio_caller1 if it is still NULL);
 *   STORE_SIZE bytes that must match the first bytes of g_io_request()
 *              (they are compared before patching and copied back by
 *              g_ioreq_restore());
 *   EPILOGUE   bytes holding a jmp whose displacement is filled in by
 *              g_ioreq_patch().
 */
#define CODE_SIZE       38
#define STORE_SIZE      6
#define EPILOGUE        5
#define SIZE            (CODE_SIZE + STORE_SIZE + EPILOGUE)

static u_char fake_ioreq[SIZE] = {
        0x48, 0x89, 0xf8,               /* mov bp, %rax */
        /* 1: */
        0x48, 0x89, 0xc2,               /* mov %rax, %rdx # rdx = bp */
        0x48, 0x8b, 0x82, 0xa8,         /* mov bp->bio_parent, %rax */
        0x00, 0x00, 0x00,
        0x48, 0x85, 0xc0,               /* test %rax, %rax */
        0x75, 0xf1,                     /* jne 1b */
        0x48, 0x83, 0x7a, 0x58,         /* cmp $0, bp->bio_caller1 */
        0x00,
        0x75, 0x0d,                     /* jne 2f */
        0x65, 0x48, 0x8b, 0x04,         /* mov %gs:0, %rax */
        0x25, 0x00, 0x00, 0x00,
        0x00,
        0x48, 0x89, 0x42, 0x58,         /* mov %rax, bp->bio_caller1 */
        /* 2: */
        /* STORE_SIZE bytes: expected start of g_io_request(). */
        0x55, 0x48, 0x89, 0xe5, 0x41, 0x56,
        0xe9, 0x00, 0x00, 0x00, 0x00,   /* jmp back... */
};
1437 #else /* neither x86 nor amd64 */
1438 static void
1439 g_new_io_request(struct bio *bp, struct g_consumer *cp)
1440 {
1441         struct bio *top = bp;
1442
1443         /*
1444          * bio classification: if bio_caller1 is available in the
1445          * root of the 'struct bio' tree, store there the thread id
1446          * of the thread that originated the request.
1447          * More sophisticated classification schemes can be used.
1448          */
1449         while (top->bio_parent)
1450                 top = top->bio_parent;
1451
1452         if (top->bio_caller1 == NULL)
1453                 top->bio_caller1 = curthread;
1454 }
1455
1456 #error please add the code above in g_new_io_request() to the beginning of \
1457         /sys/geom/geom_io.c::g_io_request(), and remove this line.
1458 #endif /* end of arch-specific code */
1459
/*
 * Redirect g_io_request() through the fake_ioreq[] trampoline.
 *
 * Returns 0 once the patch is in place, -1 if the patch was already
 * applied or if the first STORE_SIZE bytes of g_io_request() do not
 * match the copy embedded in fake_ioreq[] (in which case nothing is
 * modified).
 */
static int
g_ioreq_patch(void)
{
        u_char *original;
        u_long ofs;
        int found;

        if (me.gs_patched)
                return (-1);

        original = (u_char *)g_io_request;

        /* Only patch a function start we recognize. */
        found = !bcmp(original, fake_ioreq + CODE_SIZE, STORE_SIZE);
        if (!found)
                return (-1);

        /* Jump back to the original + STORE_SIZE. */
        ofs = (original + STORE_SIZE) - (fake_ioreq + SIZE);
        /*
         * NOTE(review): only the first 4 bytes of ofs are copied; where
         * u_long is wider than that, this presumably relies on a
         * little-endian layout and on the displacement fitting in
         * 32 bits -- confirm.
         */
        bcopy(&ofs, fake_ioreq + CODE_SIZE + STORE_SIZE + 1, 4);

        /* Patch the original address with a jump to the trampoline. */
        *original = 0xe9;     /* jump opcode */
        /* Displacement is relative to the end of the 5-byte jump. */
        ofs = fake_ioreq - (original + 5);
        bcopy(&ofs, original + 1, 4);

        me.gs_patched = 1;

        return (0);
}
1489
1490 /*
1491  * Restore the original code, this is easy.
1492  */
1493 static void
1494 g_ioreq_restore(void)
1495 {
1496         u_char *original;
1497
1498         if (me.gs_patched) {
1499                 original = (u_char *)g_io_request;
1500                 bcopy(fake_ioreq + CODE_SIZE, original, STORE_SIZE);
1501                 me.gs_patched = 0;
1502         }
1503 }
1504
/*
 * Old-kernel variant: enable classification by patching
 * g_io_request().  The outcome of the patch attempt is not checked.
 */
static inline void
g_classifier_ini(void)
{

        (void)g_ioreq_patch();
}
1511
/*
 * Old-kernel variant: disable classification by restoring the
 * original g_io_request() code.
 */
static inline void
g_classifier_fini(void)
{

        g_ioreq_restore();
}
1518
1519 /*--- end of support code for older FreeBSD versions */
1520
1521 #else /* HAVE_BIO_CLASSIFIER */
1522
1523 /*
1524  * Classifier support for recent FreeBSD versions: we use
1525  * a very simple classifier, only use curthread to tag a request.
1526  * The classifier is registered at module load, and unregistered
1527  * at module unload.
1528  */
1529 static int
1530 g_sched_tag(void *arg, struct bio *bp)
1531 {
1532
1533         bp->bio_classifier1 = curthread;
1534         return (1);
1535 }
1536
/*
 * Hook descriptor passed to g_register_classifier() and
 * g_unregister_classifier(); its callback is g_sched_tag().
 */
static struct g_classifier_hook g_sched_classifier = {
        .func = g_sched_tag,
};
1540
1541 static inline void
1542 g_classifier_ini(void)
1543 {
1544
1545         g_register_classifier(&g_sched_classifier);
1546 }
1547
1548 static inline void
1549 g_classifier_fini(void)
1550 {
1551
1552         g_unregister_classifier(&g_sched_classifier);
1553 }
1554 #endif /* HAVE_BIO_CLASSIFIER */
1555
1556 static void
1557 g_sched_init(struct g_class *mp)
1558 {
1559
1560         g_gsched_global_init();
1561
1562         G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.",
1563             mp, &g_sched_class);
1564
1565         /* Patch g_io_request to store classification info in the bio. */
1566         g_classifier_ini();
1567 }
1568
1569 static void
1570 g_sched_fini(struct g_class *mp)
1571 {
1572
1573         g_classifier_fini();
1574
1575         G_SCHED_DEBUG(0, "Unloading...");
1576
1577         KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers"));
1578         mtx_destroy(&me.gs_mtx);
1579 }
1580
1581 static int
1582 g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag,
1583     struct thread *td)
1584 {
1585         struct g_consumer *cp;
1586         struct g_geom *gp;
1587
1588         cp = LIST_FIRST(&pp->geom->consumer);
1589         if (cp == NULL)
1590                 return (ENOIOCTL);
1591         gp = cp->provider->geom;
1592         if (gp->ioctl == NULL)
1593                 return (ENOIOCTL);
1594         return (gp->ioctl(cp->provider, cmd, data, fflag, td));
1595 }
1596
1597 /*
1598  * Read the i-th argument for a request, skipping the /dev/
1599  * prefix if present.
1600  */
1601 static const char *
1602 g_sched_argi(struct gctl_req *req, int i)
1603 {
1604         static const char *dev_prefix = "/dev/";
1605         const char *name;
1606         char param[16];
1607         int l = strlen(dev_prefix);
1608
1609         snprintf(param, sizeof(param), "arg%d", i);
1610         name = gctl_get_asciiparam(req, param);
1611         if (name == NULL)
1612                 gctl_error(req, "No 'arg%d' argument", i);
1613         else if (strncmp(name, dev_prefix, l) == 0)
1614                 name += l;
1615         return (name);
1616 }
1617
1618 /*
1619  * Fetch nargs and do appropriate checks.
1620  */
1621 static int
1622 g_sched_get_nargs(struct gctl_req *req)
1623 {
1624         int *nargs;
1625
1626         nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1627         if (nargs == NULL) {
1628                 gctl_error(req, "No 'nargs' argument");
1629                 return (0);
1630         }
1631         if (*nargs <= 0)
1632                 gctl_error(req, "Missing device(s).");
1633         return (*nargs);
1634 }
1635
1636 /*
1637  * Check whether we should add the class on certain volumes when
1638  * this geom is created. Right now this is under control of a kenv
1639  * variable containing the names of all devices that we care about.
1640  * Probably we should only support transparent insertion as the
1641  * preferred mode of operation.
1642  */
1643 static struct g_geom *
1644 g_sched_taste(struct g_class *mp, struct g_provider *pp,
1645                 int flags __unused)
1646 {
1647         struct g_gsched *gsp = NULL;    /* the . algorithm we want */
1648         const char *s;                  /* generic string pointer */
1649         const char *taste_names;        /* devices we like */
1650         int l;
1651
1652         g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__,
1653             mp->name, pp->name);
1654         g_topology_assert();
1655
1656         G_SCHED_DEBUG(2, "Tasting %s.", pp->name);
1657
1658         do {
1659                 /* do not taste on ourselves */
1660                 if (pp->geom->class == mp)
1661                         break;
1662
1663                 taste_names = getenv("geom.sched.taste");
1664                 if (taste_names == NULL)
1665                         break;
1666
1667                 l = strlen(pp->name);
1668                 for (s = taste_names; *s &&
1669                     (s = strstr(s, pp->name)); s++) {
1670                         /* further checks for an exact match */
1671                         if ( (s == taste_names || s[-1] == ' ') &&
1672                              (s[l] == '\0' || s[l] == ' ') )
1673                                 break;
1674                 }
1675                 if (s == NULL)
1676                         break;
1677                 G_SCHED_DEBUG(0, "Attach device %s match [%s]\n",
1678                     pp->name, s);
1679
1680                 /* look up the provider name in the list */
1681                 s = getenv("geom.sched.algo");
1682                 if (s == NULL)
1683                         s = "rr";
1684
1685                 gsp = g_gsched_find(s); /* also get a reference */
1686                 if (gsp == NULL) {
1687                         G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s);
1688                         break;
1689                 }
1690
1691                 /* XXX create with 1 as last argument ? */
1692                 g_sched_create(NULL, mp, pp, gsp, 0);
1693                 g_gsched_unref(gsp);
1694         } while (0);
1695         return NULL;
1696 }
1697
1698 static void
1699 g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy)
1700 {
1701         struct g_provider *pp;
1702         struct g_gsched *gsp;
1703         const char *name;
1704         int i, nargs;
1705
1706         g_topology_assert();
1707
1708         name = gctl_get_asciiparam(req, "algo");
1709         if (name == NULL) {
1710                 gctl_error(req, "No '%s' argument", "algo");
1711                 return;
1712         }
1713
1714         gsp = g_gsched_find(name);      /* also get a reference */
1715         if (gsp == NULL) {
1716                 gctl_error(req, "Bad algorithm '%s'", name);
1717                 return;
1718         }
1719
1720         nargs = g_sched_get_nargs(req);
1721
1722         /*
1723          * Run on the arguments, and break on any error.
1724          * We look for a device name, but skip the /dev/ prefix if any.
1725          */
1726         for (i = 0; i < nargs; i++) {
1727                 name = g_sched_argi(req, i);
1728                 if (name == NULL)
1729                         break;
1730                 pp = g_provider_by_name(name);
1731                 if (pp == NULL) {
1732                         G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1733                         gctl_error(req, "Provider %s is invalid.", name);
1734                         break;
1735                 }
1736                 if (g_sched_create(req, mp, pp, gsp, proxy) != 0)
1737                         break;
1738         }
1739
1740         g_gsched_unref(gsp);
1741 }
1742
1743 static void
1744 g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp)
1745 {
1746         struct g_provider *pp;
1747         struct g_gsched *gsp;
1748         const char *name;
1749         int i, nargs;
1750
1751         g_topology_assert();
1752
1753         name = gctl_get_asciiparam(req, "algo");
1754         if (name == NULL) {
1755                 gctl_error(req, "No '%s' argument", "algo");
1756                 return;
1757         }
1758
1759         gsp = g_gsched_find(name);      /* also get a reference */
1760         if (gsp == NULL) {
1761                 gctl_error(req, "Bad algorithm '%s'", name);
1762                 return;
1763         }
1764
1765         nargs = g_sched_get_nargs(req);
1766
1767         /*
1768          * Run on the arguments, and break on any error.
1769          * We look for a device name, but skip the /dev/ prefix if any.
1770          */
1771         for (i = 0; i < nargs; i++) {
1772                 name = g_sched_argi(req, i);
1773                 if (name == NULL)
1774                         break;
1775                 pp = g_provider_by_name(name);
1776                 if (pp == NULL || pp->geom->class != mp) {
1777                         G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1778                         gctl_error(req, "Provider %s is invalid.", name);
1779                         break;
1780                 }
1781                 if (g_sched_change_algo(req, mp, pp, gsp) != 0)
1782                         break;
1783         }
1784
1785         g_gsched_unref(gsp);
1786 }
1787
1788 static struct g_geom *
1789 g_sched_find_geom(struct g_class *mp, const char *name)
1790 {
1791         struct g_geom *gp;
1792
1793         LIST_FOREACH(gp, &mp->geom, geom) {
1794                 if (strcmp(gp->name, name) == 0)
1795                         return (gp);
1796         }
1797         return (NULL);
1798 }
1799
1800 static void
1801 g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp)
1802 {
1803         int nargs, *force, error, i;
1804         struct g_geom *gp;
1805         const char *name;
1806
1807         g_topology_assert();
1808
1809         nargs = g_sched_get_nargs(req);
1810
1811         force = gctl_get_paraml(req, "force", sizeof(*force));
1812         if (force == NULL) {
1813                 gctl_error(req, "No 'force' argument");
1814                 return;
1815         }
1816
1817         for (i = 0; i < nargs; i++) {
1818                 name = g_sched_argi(req, i);
1819                 if (name == NULL)
1820                         break;
1821
1822                 gp = g_sched_find_geom(mp, name);
1823                 if (gp == NULL) {
1824                         G_SCHED_DEBUG(1, "Device %s is invalid.", name);
1825                         gctl_error(req, "Device %s is invalid.", name);
1826                         break;
1827                 }
1828
1829                 error = g_sched_destroy(gp, *force);
1830                 if (error != 0) {
1831                         gctl_error(req, "Cannot destroy device %s (error=%d).",
1832                             gp->name, error);
1833                         break;
1834                 }
1835         }
1836 }
1837
1838 static void
1839 g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb)
1840 {
1841         uint32_t *version;
1842
1843         g_topology_assert();
1844
1845         version = gctl_get_paraml(req, "version", sizeof(*version));
1846         if (version == NULL) {
1847                 gctl_error(req, "No '%s' argument.", "version");
1848                 return;
1849         }
1850
1851         if (*version != G_SCHED_VERSION) {
1852                 gctl_error(req, "Userland and kernel parts are "
1853                     "out of sync.");
1854                 return;
1855         }
1856
1857         if (strcmp(verb, "create") == 0) {
1858                 g_sched_ctl_create(req, mp, 0);
1859                 return;
1860         } else if (strcmp(verb, "insert") == 0) {
1861                 g_sched_ctl_create(req, mp, 1);
1862                 return;
1863         } else if (strcmp(verb, "configure") == 0) {
1864                 g_sched_ctl_configure(req, mp);
1865                 return;
1866         } else if (strcmp(verb, "destroy") == 0) {
1867                 g_sched_ctl_destroy(req, mp);
1868                 return;
1869         }
1870
1871         gctl_error(req, "Unknown verb.");
1872 }
1873
1874 static void
1875 g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
1876     struct g_consumer *cp, struct g_provider *pp)
1877 {
1878         struct g_sched_softc *sc = gp->softc;
1879         struct g_gsched *gsp = sc->sc_gsched;
1880         if (indent == NULL) {   /* plaintext */
1881                 sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--");
1882         }
1883         if (gsp != NULL && gsp->gs_dumpconf)
1884                 gsp->gs_dumpconf(sb, indent, gp, cp, pp);
1885 }
1886
/*
 * Module glue: declare the GEOM class described by g_sched_class and
 * publish version 0 of the geom_sched module.
 */
DECLARE_GEOM_CLASS(g_sched_class, g_sched);
MODULE_VERSION(geom_sched, 0);