sys/geom/sched/g_sched.c

   1 /*-
   2  * Copyright (c) 2009-2010 Fabio Checconi
   3  * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  25  * SUCH DAMAGE.
  26  */
  27
  28 /*
  29  * $Id$
  30  * $FreeBSD$
  31  *
  32  * Main control module for geom-based disk schedulers ('sched').
  33  *
  34  * USER VIEW
  35  * A 'sched' node is typically inserted transparently between
  36  * an existing provider pp and its original geom gp
  37  *
  38  *      [pp --> gp  ..]
  39  *
  40  * using the command "geom sched insert <provider>" and
  41  * resulting in the following topology
  42  *
  43  *      [pp --> sched_gp --> cp]   [new_pp --> gp ... ]
  44  *
  45  * Deletion "geom sched destroy <provider>.sched." restores the
  46  * original chain. The normal "geom sched create <provide>"
  47  * is also supported.
  48  *
  49  * INTERNALS
  50  * Internally, the 'sched' uses the following data structures
  51  *
  52  *   geom{}         g_sched_softc{}      g_gsched{}
  53  * +----------+    +---------------+   +-------------+
  54  * |  softc *-|--->| sc_gsched   *-|-->|  gs_init    |
  55  * |  ...     |    |               |   |  gs_fini    |
  56  * |          |    | [ hash table] |   |  gs_start   |
  57  * +----------+    |               |   |  ...        |
  58  *                 |               |   +-------------+
  59  *                 |               |
  60  *                 |               |     g_*_softc{}
  61  *                 |               |   +-------------+
  62  *                 | sc_data     *-|-->|             |
  63  *                 +---------------+   |  algorithm- |
  64  *                                     |  specific   |
  65  *                                     +-------------+
  66  *
  67  * A g_sched_softc{} is created with a "geom sched insert" call.
  68  * In turn this instantiates a specific scheduling algorithm,
  69  * which sets sc_gsched to point to the algorithm callbacks,
  70  * and calls gs_init() to create the g_*_softc{} .
  71  * The other callbacks (gs_start, gs_next, ...) are invoked
  72  * as needed
  73  *
  74  * g_sched_softc{} is defined in g_sched.h and mostly used here;
  75  * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h;
  76  * g_*_softc{} is defined/implemented by each algorithm (gs_*.c)
  77  *
  78  * DATA MOVING
  79  * When a bio is received on the provider, it goes to the
  80  * g_sched_start() which calls gs_start() to initially queue it;
  81  * then we call g_sched_dispatch() that loops around gs_next()
  82  * to select zero or more bio's to be sent downstream.
  83  *
  84  * g_sched_dispatch() can also be called as a result of a timeout,
  85  * e.g. when doing anticipation or pacing requests.
  86  *
  87  * When a bio comes back, it goes to g_sched_done() which in turn
  88  * calls gs_done(). The latter does any necessary housekeeping in
  89  * the scheduling algorithm, and may decide to call g_sched_dispatch()
  90  * to send more bio's downstream.
  91  *
  92  * If an algorithm needs per-flow queues, these are created
  93  * calling gs_init_class() and destroyed with gs_fini_class(),
  94  * and they are also inserted in the hash table implemented in
  95  * the g_sched_softc{}
  96  *
  97  * If an algorithm is replaced, or a transparently-inserted node is
  98  * removed with "geom sched destroy", we need to remove all references
  99  * to the g_*_softc{} and g_sched_softc from the bio's still in
 100  * the scheduler. g_sched_forced_dispatch() helps doing this.
 101  * XXX need to explain better.
 102  */
 103
 104 #include <sys/cdefs.h>
 105 #include <sys/param.h>
 106 #include <sys/systm.h>
 107 #include <sys/kernel.h>
 108 #include <sys/module.h>
 109 #include <sys/lock.h>
 110 #include <sys/mutex.h>
 111 #include <sys/bio.h>
 112 #include <sys/limits.h>
 113 #include <sys/hash.h>
 114 #include <sys/sbuf.h>
 115 #include <sys/sysctl.h>
 116 #include <sys/malloc.h>
 117 #include <sys/proc.h>           /* we access curthread */
 118 #include <geom/geom.h>
 119 #include "gs_scheduler.h"
 120 #include "g_sched.h"            /* geom hooks */
 121
 122 /*
 123  * Size of the per-geom hash table storing traffic classes.
 124  * We may decide to change it at a later time, it has no ABI
 125  * implications as it is only used for run-time allocations.
 126  */
 127 #define G_SCHED_HASH_SIZE       32
 128
 129 static int g_sched_destroy(struct g_geom *gp, boolean_t force);
 130 static int g_sched_destroy_geom(struct gctl_req *req,
 131     struct g_class *mp, struct g_geom *gp);
 132 static void g_sched_config(struct gctl_req *req, struct g_class *mp,
 133     const char *verb);
 134 static struct g_geom *g_sched_taste(struct g_class *mp,
 135     struct g_provider *pp, int flags __unused);
 136 static void g_sched_dumpconf(struct sbuf *sb, const char *indent,
 137     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
 138 static void g_sched_init(struct g_class *mp);
 139 static void g_sched_fini(struct g_class *mp);
 140 static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data,
 141     int fflag, struct thread *td);
 142
 143 struct g_class g_sched_class = {
 144         .name = G_SCHED_CLASS_NAME,
 145         .version = G_VERSION,
 146         .ctlreq = g_sched_config,
 147         .taste = g_sched_taste,
 148         .destroy_geom = g_sched_destroy_geom,
 149         .init = g_sched_init,
 150         .ioctl = g_sched_ioctl,
 151         .fini = g_sched_fini
 152 };
 153
 154 MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures");
 155
 156 /*
 157  * Global variables describing the state of the geom_sched module.
 158  * There is only one static instance of this structure.
 159  */
 160 LIST_HEAD(gs_list, g_gsched);   /* type, link field */
 161 struct geom_sched_vars {
 162         struct mtx      gs_mtx;
 163         struct gs_list  gs_scheds;      /* list of algorithms */
 164         u_int           gs_debug;
 165         u_int           gs_sched_count; /* how many algorithms ? */
 166         u_int           gs_patched;     /* g_io_request was patched */
 167
 168         u_int           gs_initialized;
 169         u_int           gs_expire_secs; /* expiration of hash entries */
 170
 171         struct bio_queue_head gs_pending;
 172         u_int           gs_npending;
 173
 174         /* The following are for stats, usually protected by gs_mtx. */
 175         u_long          gs_requests;    /* total requests */
 176         u_long          gs_done;        /* total done */
 177         u_int           gs_in_flight;   /* requests in flight */
 178         u_int           gs_writes_in_flight;
 179         u_int           gs_bytes_in_flight;
 180         u_int           gs_write_bytes_in_flight;
 181
 182         char            gs_names[256];  /* names of schedulers */
 183 };
 184
 185 static struct geom_sched_vars me = {
 186         .gs_expire_secs = 10,
 187 };
 188
 189 SYSCTL_DECL(_kern_geom);
 190 SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
 191     "GEOM_SCHED stuff");
 192
 193 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD,
 194     &me.gs_write_bytes_in_flight, 0, "Write bytes in flight");
 195
 196 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD,
 197     &me.gs_bytes_in_flight, 0, "Bytes in flight");
 198
 199 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD,
 200     &me.gs_writes_in_flight, 0, "Write Requests in flight");
 201
 202 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD,
 203     &me.gs_in_flight, 0, "Requests in flight");
 204
 205 SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD,
 206     &me.gs_done, 0, "Total done");
 207
 208 SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD,
 209     &me.gs_requests, 0, "Total requests");
 210
 211 SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD,
 212     &me.gs_names, 0, "Algorithm names");
 213
 214 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD,
 215     &me.gs_sched_count, 0, "Number of algorithms");
 216
 217 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW,
 218     &me.gs_debug, 0, "Debug level");
 219
 220 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW,
 221     &me.gs_expire_secs, 0, "Expire time in seconds");
 222
 223 /*
 224  * g_sched calls the scheduler algorithms with this lock held.
 225  * The locking functions are exposed so the scheduler algorithms can also
 226  * protect themselves e.g. when running a callout handler.
 227  */
 228 void
 229 g_sched_lock(struct g_geom *gp)
 230 {
 231         struct g_sched_softc *sc = gp->softc;
 232
 233         mtx_lock(&sc->sc_mtx);
 234 }
 235
 236 void
 237 g_sched_unlock(struct g_geom *gp)
 238 {
 239         struct g_sched_softc *sc = gp->softc;
 240
 241         mtx_unlock(&sc->sc_mtx);
 242 }
 243
 244 /*
 245  * Support functions to handle references to the module,
 246  * which are coming from devices using this scheduler.
 247  */
 248 static inline void
 249 g_gsched_ref(struct g_gsched *gsp)
 250 {
 251
 252         atomic_add_int(&gsp->gs_refs, 1);
 253 }
 254
 255 static inline void
 256 g_gsched_unref(struct g_gsched *gsp)
 257 {
 258
 259         atomic_add_int(&gsp->gs_refs, -1);
 260 }
 261
 262 /*
 263  * Update the stats when this request is done.
 264  */
 265 static void
 266 g_sched_update_stats(struct bio *bio)
 267 {
 268
 269         me.gs_done++;
 270         me.gs_in_flight--;
 271         me.gs_bytes_in_flight -= bio->bio_length;
 272         if (bio->bio_cmd == BIO_WRITE) {
 273                 me.gs_writes_in_flight--;
 274                 me.gs_write_bytes_in_flight -= bio->bio_length;
 275         }
 276 }
 277
 278 /*
 279  * Dispatch any pending request.
 280  */
 281 static void
 282 g_sched_forced_dispatch(struct g_geom *gp)
 283 {
 284         struct g_sched_softc *sc = gp->softc;
 285         struct g_gsched *gsp = sc->sc_gsched;
 286         struct bio *bp;
 287
 288         KASSERT(mtx_owned(&sc->sc_mtx),
 289             ("sc_mtx not owned during forced dispatch"));
 290
 291         while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL)
 292                 g_io_request(bp, LIST_FIRST(&gp->consumer));
 293 }
 294
 295 /*
 296  * The main dispatch loop, called either here after the start
 297  * routine, or by scheduling algorithms when they receive a timeout
 298  * or a 'done' notification.  Does not share code with the forced
 299  * dispatch path, since the gs_done() callback can call us.
 300  */
 301 void
 302 g_sched_dispatch(struct g_geom *gp)
 303 {
 304         struct g_sched_softc *sc = gp->softc;
 305         struct g_gsched *gsp = sc->sc_gsched;
 306         struct bio *bp;
 307
 308         KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch"));
 309
 310         if ((sc->sc_flags & G_SCHED_FLUSHING))
 311                 return;
 312
 313         while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL)
 314                 g_io_request(bp, LIST_FIRST(&gp->consumer));
 315 }
 316
 317 /*
 318  * Recent (8.0 and above) versions of FreeBSD have support to
 319  * register classifiers of disk requests. The classifier is
 320  * invoked by g_io_request(), and stores the information into
 321  * bp->bio_classifier1.
 322  *
 323  * Support for older versions, which is left here only for
 324  * documentation purposes, relies on two hacks:
 325  * 1. classification info is written into the bio_caller1
 326  *    field of the topmost node in the bio chain. This field
 327  *    is rarely used, but this module is incompatible with
 328  *    those that use bio_caller1 for other purposes,
 329  *    such as ZFS and gjournal;
 330  * 2. g_io_request() is patched in-memory when the module is
 331  *    loaded, so that the function calls a classifier as its
 332  *    first thing. g_io_request() is restored when the module
 333  *    is unloaded. This functionality is only supported for
 334  *    x86 and amd64, other architectures need source code changes.
 335  */
 336
 337 /*
 338  * Lookup the identity of the issuer of the original request.
 339  * In the current implementation we use the curthread of the
 340  * issuer, but different mechanisms may be implemented later
 341  * so we do not make assumptions on the return value which for
 342  * us is just an opaque identifier.
 343  */
 344
 345 static inline u_long
 346 g_sched_classify(struct bio *bp)
 347 {
 348
 349         /* we have classifier fields in the struct bio */
 350         return ((u_long)bp->bio_classifier1);
 351 }
 352
 353 /* Return the hash chain for the given key. */
 354 static inline struct g_hash *
 355 g_sched_hash(struct g_sched_softc *sc, u_long key)
 356 {
 357
 358         return (&sc->sc_hash[key & sc->sc_mask]);
 359 }
 360
 361 /*
 362  * Helper function for the children classes, which takes
 363  * a geom and a bio and returns the private descriptor
 364  * associated to the request.  This involves fetching
 365  * the classification field and [al]locating the
 366  * corresponding entry in the hash table.
 367  */
 368 void *
 369 g_sched_get_class(struct g_geom *gp, struct bio *bp)
 370 {
 371         struct g_sched_softc *sc;
 372         struct g_sched_class *gsc;
 373         struct g_gsched *gsp;
 374         struct g_hash *bucket;
 375         u_long key;
 376
 377         sc = gp->softc;
 378         key = g_sched_classify(bp);
 379         bucket = g_sched_hash(sc, key);
 380         LIST_FOREACH(gsc, bucket, gsc_clist) {
 381                 if (key == gsc->gsc_key) {
 382                         gsc->gsc_refs++;
 383                         return (gsc->gsc_priv);
 384                 }
 385         }
 386
 387         gsp = sc->sc_gsched;
 388         gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size,
 389             M_GEOM_SCHED, M_NOWAIT | M_ZERO);
 390         if (!gsc)
 391                 return (NULL);
 392
 393         if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
 394                 free(gsc, M_GEOM_SCHED);
 395                 return (NULL);
 396         }
 397
 398         gsc->gsc_refs = 2;      /* 1 for the hash table, 1 for the caller. */
 399         gsc->gsc_key = key;
 400         LIST_INSERT_HEAD(bucket, gsc, gsc_clist);
 401
 402         gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
 403
 404         return (gsc->gsc_priv);
 405 }
 406
 407 /*
 408  * Release a reference to the per-client descriptor,
 409  */
 410 void
 411 g_sched_put_class(struct g_geom *gp, void *priv)
 412 {
 413         struct g_sched_class *gsc;
 414         struct g_sched_softc *sc;
 415
 416         gsc = g_sched_priv2class(priv);
 417         gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
 418
 419         if (--gsc->gsc_refs > 0)
 420                 return;
 421
 422         sc = gp->softc;
 423         sc->sc_gsched->gs_fini_class(sc->sc_data, priv);
 424
 425         LIST_REMOVE(gsc, gsc_clist);
 426         free(gsc, M_GEOM_SCHED);
 427 }
 428
 429 static void
 430 g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
 431     struct g_gsched *gsp, void *data)
 432 {
 433         struct g_sched_class *cp, *cp2;
 434         int i;
 435
 436         if (!hp)
 437                 return;
 438
 439         if (data && gsp->gs_hash_unref)
 440                 gsp->gs_hash_unref(data);
 441
 442         for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
 443                 LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
 444                         g_sched_put_class(gp, cp->gsc_priv);
 445         }
 446
 447         hashdestroy(hp, M_GEOM_SCHED, mask);
 448 }
 449
 450 static struct g_hash *
 451 g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags)
 452 {
 453         struct g_hash *hash;
 454
 455         if (gsp->gs_priv_size == 0)
 456                 return (NULL);
 457
 458         hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags);
 459
 460         return (hash);
 461 }
 462
 463 static void
 464 g_sched_flush_classes(struct g_geom *gp)
 465 {
 466         struct g_sched_softc *sc;
 467         struct g_sched_class *cp, *cp2;
 468         int i;
 469
 470         sc = gp->softc;
 471
 472         if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0)
 473                 return;
 474
 475         for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
 476                 LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) {
 477                         if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0)
 478                                 g_sched_put_class(gp, cp->gsc_priv);
 479                 }
 480         }
 481
 482         sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz;
 483 }
 484
 485 /*
 486  * Wait for the completion of any outstanding request.  To ensure
 487  * that this does not take forever the caller has to make sure that
 488  * no new request enter the scehduler before calling us.
 489  *
 490  * Must be called with the gp mutex held and topology locked.
 491  */
 492 static int
 493 g_sched_wait_pending(struct g_geom *gp)
 494 {
 495         struct g_sched_softc *sc = gp->softc;
 496         int endticks = ticks + hz;
 497
 498         g_topology_assert();
 499
 500         while (sc->sc_pending && endticks - ticks >= 0)
 501                 msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);
 502
 503         return (sc->sc_pending ? ETIMEDOUT : 0);
 504 }
 505
 506 static int
 507 g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp)
 508 {
 509         struct g_sched_softc *sc = gp->softc;
 510         int error;
 511
 512         /* Set the flushing flag: new bios will not enter the scheduler. */
 513         sc->sc_flags |= G_SCHED_FLUSHING;
 514
 515         g_sched_forced_dispatch(gp);
 516         error = g_sched_wait_pending(gp);
 517         if (error)
 518                 goto failed;
 519
 520         /* No more requests pending or in flight from the old gsp. */
 521
 522         g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data);
 523         sc->sc_hash = NULL;
 524
 525         /*
 526          * Avoid deadlock here by releasing the gp mutex and reacquiring
 527          * it once done.  It should be safe, since no reconfiguration or
 528          * destruction can take place due to the geom topology lock; no
 529          * new request can use the current sc_data since we flagged the
 530          * geom as being flushed.
 531          */
 532         g_sched_unlock(gp);
 533         gsp->gs_fini(sc->sc_data);
 534         g_sched_lock(gp);
 535
 536         sc->sc_gsched = NULL;
 537         sc->sc_data = NULL;
 538         g_gsched_unref(gsp);
 539
 540 failed:
 541         sc->sc_flags &= ~G_SCHED_FLUSHING;
 542
 543         return (error);
 544 }
 545
 546 static int
 547 g_sched_remove(struct g_geom *gp, struct g_gsched *gsp)
 548 {
 549         int error;
 550
 551         g_sched_lock(gp);
 552         error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */
 553         g_sched_unlock(gp);
 554
 555         return (error);
 556 }
 557
 558 /*
 559  * Support function for create/taste -- locate the desired
 560  * algorithm and grab a reference to it.
 561  */
 562 static struct g_gsched *
 563 g_gsched_find(const char *name)
 564 {
 565         struct g_gsched *gsp = NULL;
 566
 567         mtx_lock(&me.gs_mtx);
 568         LIST_FOREACH(gsp, &me.gs_scheds, glist) {
 569                 if (strcmp(name, gsp->gs_name) == 0) {
 570                         g_gsched_ref(gsp);
 571                         break;
 572                 }
 573         }
 574         mtx_unlock(&me.gs_mtx);
 575
 576         return (gsp);
 577 }
 578
 579 /*
 580  * Rebuild the list of scheduler names.
 581  * To be called with me.gs_mtx lock held.
 582  */
 583 static void
 584 g_gsched_build_names(struct g_gsched *gsp)
 585 {
 586         int pos, l;
 587         struct g_gsched *cur;
 588
 589         pos = 0;
 590         LIST_FOREACH(cur, &me.gs_scheds, glist) {
 591                 l = strlen(cur->gs_name);
 592                 if (l + pos + 1 + 1 < sizeof(me.gs_names)) {
 593                         if (pos != 0)
 594                                 me.gs_names[pos++] = ' ';
 595                         strcpy(me.gs_names + pos, cur->gs_name);
 596                         pos += l;
 597                 }
 598         }
 599         me.gs_names[pos] = '\0';
 600 }
 601
 602 /*
 603  * Register or unregister individual scheduling algorithms.
 604  */
 605 static int
 606 g_gsched_register(struct g_gsched *gsp)
 607 {
 608         struct g_gsched *cur;
 609         int error = 0;
 610
 611         mtx_lock(&me.gs_mtx);
 612         LIST_FOREACH(cur, &me.gs_scheds, glist) {
 613                 if (strcmp(gsp->gs_name, cur->gs_name) == 0)
 614                         break;
 615         }
 616         if (cur != NULL) {
 617                 G_SCHED_DEBUG(0, "A scheduler named %s already"
 618                     "exists.", gsp->gs_name);
 619                 error = EEXIST;
 620         } else {
 621                 LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist);
 622                 gsp->gs_refs = 1;
 623                 me.gs_sched_count++;
 624                 g_gsched_build_names(gsp);
 625         }
 626         mtx_unlock(&me.gs_mtx);
 627
 628         return (error);
 629 }
 630
 631 struct g_gsched_unregparm {
 632         struct g_gsched *gup_gsp;
 633         int             gup_error;
 634 };
 635
 636 static void
 637 g_gsched_unregister(void *arg, int flag)
 638 {
 639         struct g_gsched_unregparm *parm = arg;
 640         struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp;
 641         struct g_sched_softc *sc;
 642         struct g_geom *gp, *gp_tmp;
 643         int error;
 644
 645         parm->gup_error = 0;
 646
 647         g_topology_assert();
 648
 649         if (flag == EV_CANCEL)
 650                 return;
 651
 652         mtx_lock(&me.gs_mtx);
 653
 654         LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) {
 655                 if (gp->class != &g_sched_class)
 656                         continue;       /* Should not happen. */
 657
 658                 sc = gp->softc;
 659                 if (sc->sc_gsched == gsp) {
 660                         error = g_sched_remove(gp, gsp);
 661                         if (error)
 662                                 goto failed;
 663                 }
 664         }
 665
 666         LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) {
 667                 if (cur != gsp)
 668                         continue;
 669
 670                 if (gsp->gs_refs != 1) {
 671                         G_SCHED_DEBUG(0, "%s still in use.",
 672                             gsp->gs_name);
 673                         parm->gup_error = EBUSY;
 674                 } else {
 675                         LIST_REMOVE(gsp, glist);
 676                         me.gs_sched_count--;
 677                         g_gsched_build_names(gsp);
 678                 }
 679                 break;
 680         }
 681
 682         if (cur == NULL) {
 683                 G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name);
 684                 parm->gup_error = ENOENT;
 685         }
 686
 687 failed:
 688         mtx_unlock(&me.gs_mtx);
 689 }
 690
 691 static inline void
 692 g_gsched_global_init(void)
 693 {
 694
 695         if (!me.gs_initialized) {
 696                 G_SCHED_DEBUG(0, "Initializing global data.");
 697                 mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF);
 698                 LIST_INIT(&me.gs_scheds);
 699                 bioq_init(&me.gs_pending);
 700                 me.gs_initialized = 1;
 701         }
 702 }
 703
 704 /*
 705  * Module event called when a scheduling algorithm module is loaded or
 706  * unloaded.
 707  */
 708 int
 709 g_gsched_modevent(module_t mod, int cmd, void *arg)
 710 {
 711         struct g_gsched *gsp = arg;
 712         struct g_gsched_unregparm parm;
 713         int error;
 714
 715         G_SCHED_DEBUG(0, "Modevent %d.", cmd);
 716
 717         /*
 718          * If the module is loaded at boot, the geom thread that calls
 719          * g_sched_init() might actually run after g_gsched_modevent(),
 720          * so make sure that the module is properly initialized.
 721          */
 722         g_gsched_global_init();
 723
 724         error = EOPNOTSUPP;
 725         switch (cmd) {
 726         case MOD_LOAD:
 727                 error = g_gsched_register(gsp);
 728                 G_SCHED_DEBUG(0, "Loaded module %s error %d.",
 729                     gsp->gs_name, error);
 730                 if (error == 0)
 731                         g_retaste(&g_sched_class);
 732                 break;
 733
 734         case MOD_UNLOAD:
 735                 parm.gup_gsp = gsp;
 736                 parm.gup_error = 0;
 737
 738                 error = g_waitfor_event(g_gsched_unregister,
 739                     &parm, M_WAITOK, NULL);
 740                 if (error == 0)
 741                         error = parm.gup_error;
 742                 G_SCHED_DEBUG(0, "Unloaded module %s error %d.",
 743                     gsp->gs_name, error);
 744                 break;
 745         }
 746
 747         return (error);
 748 }
 749
 750 #ifdef KTR
 751 #define TRC_BIO_EVENT(e, bp)    g_sched_trace_bio_ ## e (bp)
 752
 753 static inline char
 754 g_sched_type(struct bio *bp)
 755 {
 756
 757         if (bp->bio_cmd == BIO_READ)
 758                 return ('R');
 759         else if (bp->bio_cmd == BIO_WRITE)
 760                 return ('W');
 761         return ('U');
 762 }
 763
 764 static inline void
 765 g_sched_trace_bio_START(struct bio *bp)
 766 {
 767
 768         CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp),
 769             g_sched_type(bp), bp->bio_offset / ULONG_MAX,
 770             bp->bio_offset, bp->bio_length);
 771 }
 772
 773 static inline void
 774 g_sched_trace_bio_DONE(struct bio *bp)
 775 {
 776
 777         CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp),
 778             g_sched_type(bp), bp->bio_offset / ULONG_MAX,
 779             bp->bio_offset, bp->bio_length);
 780 }
 781 #else /* !KTR */
 782 #define TRC_BIO_EVENT(e, bp)
 783 #endif /* !KTR */
 784
 785 /*
 786  * g_sched_done() and g_sched_start() dispatch the geom requests to
 787  * the scheduling algorithm in use.
 788  */
 789 static void
 790 g_sched_done(struct bio *bio)
 791 {
 792         struct g_geom *gp = bio->bio_caller2;
 793         struct g_sched_softc *sc = gp->softc;
 794
 795         TRC_BIO_EVENT(DONE, bio);
 796
 797         KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done"));
 798
 799         g_sched_lock(gp);
 800
 801         g_sched_update_stats(bio);
 802         sc->sc_gsched->gs_done(sc->sc_data, bio);
 803         if (!--sc->sc_pending)
 804                 wakeup(gp);
 805
 806         g_sched_flush_classes(gp);
 807         g_sched_unlock(gp);
 808
 809         g_std_done(bio);
 810 }
 811
 812 static void
 813 g_sched_start(struct bio *bp)
 814 {
 815         struct g_geom *gp = bp->bio_to->geom;
 816         struct g_sched_softc *sc = gp->softc;
 817         struct bio *cbp;
 818
 819         TRC_BIO_EVENT(START, bp);
 820         G_SCHED_LOGREQ(bp, "Request received.");
 821
 822         cbp = g_clone_bio(bp);
 823         if (cbp == NULL) {
 824                 g_io_deliver(bp, ENOMEM);
 825                 return;
 826         }
 827         cbp->bio_done = g_sched_done;
 828         cbp->bio_to = LIST_FIRST(&gp->provider);
 829         KASSERT(cbp->bio_to != NULL, ("NULL provider"));
 830
 831         /* We only schedule reads and writes. */
 832         if (bp->bio_cmd != BIO_READ && bp->bio_cmd != BIO_WRITE)
 833                 goto bypass;
 834
 835         G_SCHED_LOGREQ(cbp, "Sending request.");
 836
 837         g_sched_lock(gp);
 838         /*
 839          * Call the algorithm's gs_start to queue the request in the
 840          * scheduler. If gs_start fails then pass the request down,
 841          * otherwise call g_sched_dispatch() which tries to push
 842          * one or more requests down.
 843          */
 844         if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) ||
 845             sc->sc_gsched->gs_start(sc->sc_data, cbp)) {
 846                 g_sched_unlock(gp);
 847                 goto bypass;
 848         }
 849         /*
 850          * We use bio_caller1 to mark requests that are scheduled
 851          * so make sure it is not NULL.
 852          */
 853         if (cbp->bio_caller1 == NULL)
 854                 cbp->bio_caller1 = &me; /* anything not NULL */
 855
 856         cbp->bio_caller2 = gp;
 857         sc->sc_pending++;
 858
 859         /* Update general stats. */
 860         me.gs_in_flight++;
 861         me.gs_requests++;
 862         me.gs_bytes_in_flight += bp->bio_length;
 863         if (bp->bio_cmd == BIO_WRITE) {
 864                 me.gs_writes_in_flight++;
 865                 me.gs_write_bytes_in_flight += bp->bio_length;
 866         }
 867         g_sched_dispatch(gp);
 868         g_sched_unlock(gp);
 869         return;
 870
 871 bypass:
 872         cbp->bio_done = g_std_done;
 873         cbp->bio_caller1 = NULL; /* not scheduled */
 874         g_io_request(cbp, LIST_FIRST(&gp->consumer));
 875 }
 876
 877 /*
 878  * The next few functions are the geom glue.
 879  */
 880 static void
 881 g_sched_orphan(struct g_consumer *cp)
 882 {
 883
 884         g_topology_assert();
 885         g_sched_destroy(cp->geom, 1);
 886 }
 887
 888 static int
 889 g_sched_access(struct g_provider *pp, int dr, int dw, int de)
 890 {
 891         struct g_geom *gp;
 892         struct g_consumer *cp;
 893         int error;
 894
 895         gp = pp->geom;
 896         cp = LIST_FIRST(&gp->consumer);
 897         error = g_access(cp, dr, dw, de);
 898
 899         return (error);
 900 }
 901
 902 static void
 903 g_sched_temporary_start(struct bio *bio)
 904 {
 905
 906         mtx_lock(&me.gs_mtx);
 907         me.gs_npending++;
 908         bioq_disksort(&me.gs_pending, bio);
 909         mtx_unlock(&me.gs_mtx);
 910 }
 911
 912 static void
 913 g_sched_flush_pending(g_start_t *start)
 914 {
 915         struct bio *bp;
 916
 917         while ((bp = bioq_takefirst(&me.gs_pending)))
 918                 start(bp);
 919 }
 920
 921 static int
 922 g_insert_proxy(struct g_geom *gp, struct g_provider *newpp,
 923     struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp)
 924 {
 925         struct g_sched_softc *sc = gp->softc;
 926         g_start_t *saved_start, *flush = g_sched_start;
 927         int error = 0, endticks = ticks + hz;
 928
 929         g_cancel_event(newpp);  /* prevent taste() */
 930         /* copy private fields */
 931         newpp->private = pp->private;
 932         newpp->index = pp->index;
 933
 934         /* Queue all the early requests coming for us. */
 935         me.gs_npending = 0;
 936         saved_start = pp->geom->start;
 937         dstgp->start = g_sched_temporary_start;
 938
 939         while (pp->nstart - pp->nend != me.gs_npending &&
 940             endticks - ticks >= 0)
 941                 tsleep(pp, PRIBIO, "-", hz/10);
 942
 943         if (pp->nstart - pp->nend != me.gs_npending) {
 944                 flush = saved_start;
 945                 error = ETIMEDOUT;
 946                 goto fail;
 947         }
 948
 949         /* link pp to this geom */
 950         LIST_REMOVE(pp, provider);
 951         pp->geom = gp;
 952         LIST_INSERT_HEAD(&gp->provider, pp, provider);
 953
 954         /*
 955          * replicate the counts from the parent in the
 956          * new provider and consumer nodes
 957          */
 958         cp->acr = newpp->acr = pp->acr;
 959         cp->acw = newpp->acw = pp->acw;
 960         cp->ace = newpp->ace = pp->ace;
 961         sc->sc_flags |= G_SCHED_PROXYING;
 962
 963 fail:
 964         dstgp->start = saved_start;
 965
 966         g_sched_flush_pending(flush);
 967
 968         return (error);
 969 }
 970
 971 /*
 972  * Create a geom node for the device passed as *pp.
 973  * If successful, add a reference to this gsp.
 974  */
 975 static int
 976 g_sched_create(struct gctl_req *req, struct g_class *mp,
 977     struct g_provider *pp, struct g_gsched *gsp, int proxy)
 978 {
 979         struct g_sched_softc *sc = NULL;
 980         struct g_geom *gp, *dstgp;
 981         struct g_provider *newpp = NULL;
 982         struct g_consumer *cp = NULL;
 983         char name[64];
 984         int error;
 985
 986         g_topology_assert();
 987
 988         snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX);
 989         LIST_FOREACH(gp, &mp->geom, geom) {
 990                 if (strcmp(gp->name, name) == 0) {
 991                         gctl_error(req, "Geom %s already exists.",
 992                             name);
 993                         return (EEXIST);
 994                 }
 995         }
 996
 997         gp = g_new_geomf(mp, "%s", name);
 998         dstgp = proxy ? pp->geom : gp; /* where do we link the provider */
 999
1000         sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
1001         sc->sc_gsched = gsp;
1002         sc->sc_data = gsp->gs_init(gp);
1003         if (sc->sc_data == NULL) {
1004                 error = ENOMEM;
1005                 goto fail;
1006         }
1007
1008         sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK);
1009
1010         /*
1011          * Do not initialize the flush mechanism, will be initialized
1012          * on the first insertion on the hash table.
1013          */
1014
1015         mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF);
1016
1017         gp->softc = sc;
1018         gp->start = g_sched_start;
1019         gp->orphan = g_sched_orphan;
1020         gp->access = g_sched_access;
1021         gp->dumpconf = g_sched_dumpconf;
1022
1023         newpp = g_new_providerf(dstgp, "%s", gp->name);
1024         newpp->mediasize = pp->mediasize;
1025         newpp->sectorsize = pp->sectorsize;
1026
1027         cp = g_new_consumer(gp);
1028         error = g_attach(cp, proxy ? newpp : pp);
1029         if (error != 0) {
1030                 gctl_error(req, "Cannot attach to provider %s.",
1031                     pp->name);
1032                 goto fail;
1033         }
1034
1035         g_error_provider(newpp, 0);
1036         if (proxy) {
1037                 error = g_insert_proxy(gp, newpp, dstgp, pp, cp);
1038                 if (error)
1039                         goto fail;
1040         }
1041         G_SCHED_DEBUG(0, "Device %s created.", gp->name);
1042
1043         g_gsched_ref(gsp);
1044
1045         return (0);
1046
1047 fail:
1048         if (cp != NULL) {
1049                 if (cp->provider != NULL)
1050                         g_detach(cp);
1051                 g_destroy_consumer(cp);
1052         }
1053         if (newpp != NULL)
1054                 g_destroy_provider(newpp);
1055         if (sc->sc_hash)
1056                 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1057                     gsp, sc->sc_data);
1058         if (sc->sc_data)
1059                 gsp->gs_fini(sc->sc_data);
1060         g_free(gp->softc);
1061         g_destroy_geom(gp);
1062
1063         return (error);
1064 }
1065
1066 /*
1067  * Support for dynamic switching of scheduling algorithms.
1068  * First initialize the data structures for the new algorithm,
1069  * then call g_sched_remove_locked() to flush all references
1070  * to the old one, finally link the new algorithm.
1071  */
1072 static int
1073 g_sched_change_algo(struct gctl_req *req, struct g_class *mp,
1074     struct g_provider *pp, struct g_gsched *gsp)
1075 {
1076         struct g_sched_softc *sc;
1077         struct g_geom *gp;
1078         struct g_hash *newh;
1079         void *data;
1080         u_long mask;
1081         int error = 0;
1082
1083         gp = pp->geom;
1084         sc = gp->softc;
1085
1086         data = gsp->gs_init(gp);
1087         if (data == NULL)
1088                 return (ENOMEM);
1089
1090         newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK);
1091         if (gsp->gs_priv_size && !newh) {
1092                 error = ENOMEM;
1093                 goto fail;
1094         }
1095
1096         g_sched_lock(gp);
1097         if (sc->sc_gsched) {    /* can be NULL in some cases */
1098                 error = g_sched_remove_locked(gp, sc->sc_gsched);
1099                 if (error)
1100                         goto fail;
1101         }
1102
1103         g_gsched_ref(gsp);
1104         sc->sc_gsched = gsp;
1105         sc->sc_data = data;
1106         sc->sc_hash = newh;
1107         sc->sc_mask = mask;
1108
1109         g_sched_unlock(gp);
1110
1111         return (0);
1112
1113 fail:
1114         if (newh)
1115                 g_sched_hash_fini(gp, newh, mask, gsp, data);
1116
1117         if (data)
1118                 gsp->gs_fini(data);
1119
1120         g_sched_unlock(gp);
1121
1122         return (error);
1123 }
1124
1125 /*
1126  * Stop the request flow directed to the proxy, redirecting the new
1127  * requests to the me.gs_pending queue.
1128  */
1129 static struct g_provider *
1130 g_detach_proxy(struct g_geom *gp)
1131 {
1132         struct g_consumer *cp;
1133         struct g_provider *pp, *newpp;
1134
1135         do {
1136                 pp = LIST_FIRST(&gp->provider);
1137                 if (pp == NULL)
1138                         break;
1139                 cp = LIST_FIRST(&gp->consumer);
1140                 if (cp == NULL)
1141                         break;
1142                 newpp = cp->provider;
1143                 if (newpp == NULL)
1144                         break;
1145
1146                 me.gs_npending = 0;
1147                 pp->geom->start = g_sched_temporary_start;
1148
1149                 return (pp);
1150         } while (0);
1151         printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name);
1152
1153         return (NULL);
1154 }
1155
1156 static void
1157 g_sched_blackhole(struct bio *bp)
1158 {
1159
1160         g_io_deliver(bp, ENXIO);
1161 }
1162
1163 static inline void
1164 g_reparent_provider(struct g_provider *pp, struct g_geom *gp,
1165     struct g_provider *newpp)
1166 {
1167
1168         LIST_REMOVE(pp, provider);
1169         if (newpp) {
1170                 pp->private = newpp->private;
1171                 pp->index = newpp->index;
1172         }
1173         pp->geom = gp;
1174         LIST_INSERT_HEAD(&gp->provider, pp, provider);
1175 }
1176
1177 static inline void
1178 g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp)
1179 {
1180         struct g_geom *gp = oldpp->geom;
1181
1182         g_reparent_provider(oldpp, newpp->geom, newpp);
1183
1184         /*
1185          * Hackish: let the system destroy the old provider for us, just
1186          * in case someone attached a consumer to it, in which case a
1187          * direct call to g_destroy_provider() would not work.
1188          */
1189         g_reparent_provider(newpp, gp, NULL);
1190 }
1191
1192 /*
1193  * Complete the proxy destruction, linking the old provider to its
1194  * original geom, and destroying the proxy provider.  Also take care
1195  * of issuing the pending requests collected in me.gs_pending (if any).
1196  */
1197 static int
1198 g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp)
1199 {
1200         struct g_consumer *cp;
1201         struct g_provider *newpp;
1202
1203         do {
1204                 cp = LIST_FIRST(&gp->consumer);
1205                 if (cp == NULL)
1206                         break;
1207                 newpp = cp->provider;
1208                 if (newpp == NULL)
1209                         break;
1210
1211                 /* Relink the provider to its original geom. */
1212                 g_unproxy_provider(oldpp, newpp);
1213
1214                 /* Detach consumer from provider, and destroy provider. */
1215                 cp->acr = newpp->acr = 0;
1216                 cp->acw = newpp->acw = 0;
1217                 cp->ace = newpp->ace = 0;
1218                 g_detach(cp);
1219
1220                 /* Send the pending bios through the right start function. */
1221                 g_sched_flush_pending(oldpp->geom->start);
1222
1223                 return (0);
1224         } while (0);
1225         printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name);
1226
1227         /* We cannot send the pending bios anywhere... */
1228         g_sched_flush_pending(g_sched_blackhole);
1229
1230         return (EINVAL);
1231 }
1232
1233 static int
1234 g_sched_destroy(struct g_geom *gp, boolean_t force)
1235 {
1236         struct g_provider *pp, *oldpp = NULL;
1237         struct g_sched_softc *sc;
1238         struct g_gsched *gsp;
1239         int error;
1240
1241         g_topology_assert();
1242         sc = gp->softc;
1243         if (sc == NULL)
1244                 return (ENXIO);
1245         if (!(sc->sc_flags & G_SCHED_PROXYING)) {
1246                 pp = LIST_FIRST(&gp->provider);
1247                 if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
1248                         const char *msg = force ?
1249                                 "but we force removal" : "cannot remove";
1250
1251                         G_SCHED_DEBUG(!force,
1252                             "Device %s is still open (r%dw%de%d), %s.",
1253                             pp->name, pp->acr, pp->acw, pp->ace, msg);
1254                         if (!force)
1255                                 return (EBUSY);
1256                 } else {
1257                         G_SCHED_DEBUG(0, "Device %s removed.", gp->name);
1258                 }
1259         } else
1260                 oldpp = g_detach_proxy(gp);
1261
1262         gsp = sc->sc_gsched;
1263         if (gsp) {
1264                 /*
1265                  * XXX bad hack here: force a dispatch to release
1266                  * any reference to the hash table still held by
1267                  * the scheduler.
1268                  */
1269                 g_sched_lock(gp);
1270                 /*
1271                  * We are dying here, no new requests should enter
1272                  * the scheduler.  This is granted by the topolgy,
1273                  * either in case we were proxying (new bios are
1274                  * being redirected) or not (see the access check
1275                  * above).
1276                  */
1277                 g_sched_forced_dispatch(gp);
1278                 error = g_sched_wait_pending(gp);
1279
1280                 if (error) {
1281                         /*
1282                          * Not all the requests came home: this might happen
1283                          * under heavy load, or if we were waiting for any
1284                          * bio which is served in the event path (see
1285                          * geom_slice.c for an example of how this can
1286                          * happen).  Try to restore a working configuration
1287                          * if we can fail.
1288                          */
1289                         if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1290                                 g_sched_flush_pending(force ?
1291                                     g_sched_blackhole : g_sched_start);
1292                         }
1293
1294                         /*
1295                          * In the forced destroy case there is not so much
1296                          * we can do, we have pending bios that will call
1297                          * g_sched_done() somehow, and we don't want them
1298                          * to crash the system using freed memory.  We tell
1299                          * the user that something went wrong, and leak some
1300                          * memory here.
1301                          * Note: the callers using force = 1 ignore the
1302                          * return value.
1303                          */
1304                         if (force) {
1305                                 G_SCHED_DEBUG(0, "Pending requests while "
1306                                     " destroying geom, some memory leaked.");
1307                         }
1308
1309                         return (error);
1310                 }
1311
1312                 g_sched_unlock(gp);
1313                 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1314                     gsp, sc->sc_data);
1315                 sc->sc_hash = NULL;
1316                 gsp->gs_fini(sc->sc_data);
1317                 g_gsched_unref(gsp);
1318                 sc->sc_gsched = NULL;
1319         } else
1320                 error = 0;
1321
1322         if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1323                 error = g_destroy_proxy(gp, oldpp);
1324
1325                 if (error) {
1326                         if (force) {
1327                                 G_SCHED_DEBUG(0, "Unrecoverable error while "
1328                                     "destroying a proxy geom, leaking some "
1329                                     " memory.");
1330                         }
1331
1332                         return (error);
1333                 }
1334         }
1335
1336         mtx_destroy(&sc->sc_mtx);
1337
1338         g_free(gp->softc);
1339         gp->softc = NULL;
1340         g_wither_geom(gp, ENXIO);
1341
1342         return (error);
1343 }
1344
1345 static int
1346 g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp,
1347     struct g_geom *gp)
1348 {
1349
1350         return (g_sched_destroy(gp, 0));
1351 }
1352
1353 /*
1354  * Functions related to the classification of requests.
1355  *
1356  * On recent FreeBSD versions (8.0 and above), we store a reference
1357  * to the issuer of a request in bp->bio_classifier1 as soon
1358  * as the bio is posted to the geom queue (and not later, because
1359  * requests are managed by the g_down thread afterwards).
1360  */
1361
1362 /*
1363  * Classifier support for recent FreeBSD versions: we use
1364  * a very simple classifier, only use curthread to tag a request.
1365  * The classifier is registered at module load, and unregistered
1366  * at module unload.
1367  */
1368 static int
1369 g_sched_tag(void *arg, struct bio *bp)
1370 {
1371
1372         bp->bio_classifier1 = curthread;
1373         return (1);
1374 }
1375
1376 static struct g_classifier_hook g_sched_classifier = {
1377         .func = g_sched_tag,
1378 };
1379
1380 static inline void
1381 g_classifier_ini(void)
1382 {
1383
1384         g_register_classifier(&g_sched_classifier);
1385 }
1386
1387 static inline void
1388 g_classifier_fini(void)
1389 {
1390
1391         g_unregister_classifier(&g_sched_classifier);
1392 }
1393
1394 static void
1395 g_sched_init(struct g_class *mp)
1396 {
1397
1398         g_gsched_global_init();
1399
1400         G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.",
1401             mp, &g_sched_class);
1402
1403         /* Patch g_io_request to store classification info in the bio. */
1404         g_classifier_ini();
1405 }
1406
1407 static void
1408 g_sched_fini(struct g_class *mp)
1409 {
1410
1411         g_classifier_fini();
1412
1413         G_SCHED_DEBUG(0, "Unloading...");
1414
1415         KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers"));
1416         mtx_destroy(&me.gs_mtx);
1417 }
1418
1419 static int
1420 g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag,
1421     struct thread *td)
1422 {
1423         struct g_consumer *cp;
1424         struct g_geom *gp;
1425
1426         cp = LIST_FIRST(&pp->geom->consumer);
1427         if (cp == NULL)
1428                 return (ENOIOCTL);
1429         gp = cp->provider->geom;
1430         if (gp->ioctl == NULL)
1431                 return (ENOIOCTL);
1432         return (gp->ioctl(cp->provider, cmd, data, fflag, td));
1433 }
1434
1435 /*
1436  * Read the i-th argument for a request, skipping the /dev/
1437  * prefix if present.
1438  */
1439 static const char *
1440 g_sched_argi(struct gctl_req *req, int i)
1441 {
1442         static const char *dev_prefix = "/dev/";
1443         const char *name;
1444         char param[16];
1445         int l = strlen(dev_prefix);
1446
1447         snprintf(param, sizeof(param), "arg%d", i);
1448         name = gctl_get_asciiparam(req, param);
1449         if (name == NULL)
1450                 gctl_error(req, "No 'arg%d' argument", i);
1451         else if (strncmp(name, dev_prefix, l) == 0)
1452                 name += l;
1453         return (name);
1454 }
1455
1456 /*
1457  * Fetch nargs and do appropriate checks.
1458  */
1459 static int
1460 g_sched_get_nargs(struct gctl_req *req)
1461 {
1462         int *nargs;
1463
1464         nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1465         if (nargs == NULL) {
1466                 gctl_error(req, "No 'nargs' argument");
1467                 return (0);
1468         }
1469         if (*nargs <= 0)
1470                 gctl_error(req, "Missing device(s).");
1471         return (*nargs);
1472 }
1473
1474 /*
1475  * Check whether we should add the class on certain volumes when
1476  * this geom is created. Right now this is under control of a kenv
1477  * variable containing the names of all devices that we care about.
1478  * Probably we should only support transparent insertion as the
1479  * preferred mode of operation.
1480  */
1481 static struct g_geom *
1482 g_sched_taste(struct g_class *mp, struct g_provider *pp,
1483                 int flags __unused)
1484 {
1485         struct g_gsched *gsp = NULL;    /* the . algorithm we want */
1486         const char *s;                  /* generic string pointer */
1487         const char *taste_names;        /* devices we like */
1488         int l;
1489
1490         g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__,
1491             mp->name, pp->name);
1492         g_topology_assert();
1493
1494         G_SCHED_DEBUG(2, "Tasting %s.", pp->name);
1495
1496         do {
1497                 /* do not taste on ourselves */
1498                 if (pp->geom->class == mp)
1499                         break;
1500
1501                 taste_names = kern_getenv("geom.sched.taste");
1502                 if (taste_names == NULL)
1503                         break;
1504
1505                 l = strlen(pp->name);
1506                 for (s = taste_names; *s &&
1507                     (s = strstr(s, pp->name)); s++) {
1508                         /* further checks for an exact match */
1509                         if ( (s == taste_names || s[-1] == ' ') &&
1510                              (s[l] == '\0' || s[l] == ' ') )
1511                                 break;
1512                 }
1513                 if (s == NULL)
1514                         break;
1515                 G_SCHED_DEBUG(0, "Attach device %s match [%s]\n",
1516                     pp->name, s);
1517
1518                 /* look up the provider name in the list */
1519                 s = kern_getenv("geom.sched.algo");
1520                 if (s == NULL)
1521                         s = "rr";
1522
1523                 gsp = g_gsched_find(s); /* also get a reference */
1524                 if (gsp == NULL) {
1525                         G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s);
1526                         break;
1527                 }
1528
1529                 /* XXX create with 1 as last argument ? */
1530                 g_sched_create(NULL, mp, pp, gsp, 0);
1531                 g_gsched_unref(gsp);
1532         } while (0);
1533         return NULL;
1534 }
1535
1536 static void
1537 g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy)
1538 {
1539         struct g_provider *pp;
1540         struct g_gsched *gsp;
1541         const char *name;
1542         int i, nargs;
1543
1544         g_topology_assert();
1545
1546         name = gctl_get_asciiparam(req, "algo");
1547         if (name == NULL) {
1548                 gctl_error(req, "No '%s' argument", "algo");
1549                 return;
1550         }
1551
1552         gsp = g_gsched_find(name);      /* also get a reference */
1553         if (gsp == NULL) {
1554                 gctl_error(req, "Bad algorithm '%s'", name);
1555                 return;
1556         }
1557
1558         nargs = g_sched_get_nargs(req);
1559
1560         /*
1561          * Run on the arguments, and break on any error.
1562          * We look for a device name, but skip the /dev/ prefix if any.
1563          */
1564         for (i = 0; i < nargs; i++) {
1565                 name = g_sched_argi(req, i);
1566                 if (name == NULL)
1567                         break;
1568                 pp = g_provider_by_name(name);
1569                 if (pp == NULL) {
1570                         G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1571                         gctl_error(req, "Provider %s is invalid.", name);
1572                         break;
1573                 }
1574                 if (g_sched_create(req, mp, pp, gsp, proxy) != 0)
1575                         break;
1576         }
1577
1578         g_gsched_unref(gsp);
1579 }
1580
1581 static void
1582 g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp)
1583 {
1584         struct g_provider *pp;
1585         struct g_gsched *gsp;
1586         const char *name;
1587         int i, nargs;
1588
1589         g_topology_assert();
1590
1591         name = gctl_get_asciiparam(req, "algo");
1592         if (name == NULL) {
1593                 gctl_error(req, "No '%s' argument", "algo");
1594                 return;
1595         }
1596
1597         gsp = g_gsched_find(name);      /* also get a reference */
1598         if (gsp == NULL) {
1599                 gctl_error(req, "Bad algorithm '%s'", name);
1600                 return;
1601         }
1602
1603         nargs = g_sched_get_nargs(req);
1604
1605         /*
1606          * Run on the arguments, and break on any error.
1607          * We look for a device name, but skip the /dev/ prefix if any.
1608          */
1609         for (i = 0; i < nargs; i++) {
1610                 name = g_sched_argi(req, i);
1611                 if (name == NULL)
1612                         break;
1613                 pp = g_provider_by_name(name);
1614                 if (pp == NULL || pp->geom->class != mp) {
1615                         G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1616                         gctl_error(req, "Provider %s is invalid.", name);
1617                         break;
1618                 }
1619                 if (g_sched_change_algo(req, mp, pp, gsp) != 0)
1620                         break;
1621         }
1622
1623         g_gsched_unref(gsp);
1624 }
1625
1626 static struct g_geom *
1627 g_sched_find_geom(struct g_class *mp, const char *name)
1628 {
1629         struct g_geom *gp;
1630
1631         LIST_FOREACH(gp, &mp->geom, geom) {
1632                 if (strcmp(gp->name, name) == 0)
1633                         return (gp);
1634         }
1635         return (NULL);
1636 }
1637
1638 static void
1639 g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp)
1640 {
1641         int nargs, *force, error, i;
1642         struct g_geom *gp;
1643         const char *name;
1644
1645         g_topology_assert();
1646
1647         nargs = g_sched_get_nargs(req);
1648
1649         force = gctl_get_paraml(req, "force", sizeof(*force));
1650         if (force == NULL) {
1651                 gctl_error(req, "No 'force' argument");
1652                 return;
1653         }
1654
1655         for (i = 0; i < nargs; i++) {
1656                 name = g_sched_argi(req, i);
1657                 if (name == NULL)
1658                         break;
1659
1660                 gp = g_sched_find_geom(mp, name);
1661                 if (gp == NULL) {
1662                         G_SCHED_DEBUG(1, "Device %s is invalid.", name);
1663                         gctl_error(req, "Device %s is invalid.", name);
1664                         break;
1665                 }
1666
1667                 error = g_sched_destroy(gp, *force);
1668                 if (error != 0) {
1669                         gctl_error(req, "Cannot destroy device %s (error=%d).",
1670                             gp->name, error);
1671                         break;
1672                 }
1673         }
1674 }
1675
1676 static void
1677 g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb)
1678 {
1679         uint32_t *version;
1680
1681         g_topology_assert();
1682
1683         version = gctl_get_paraml(req, "version", sizeof(*version));
1684         if (version == NULL) {
1685                 gctl_error(req, "No '%s' argument.", "version");
1686                 return;
1687         }
1688
1689         if (*version != G_SCHED_VERSION) {
1690                 gctl_error(req, "Userland and kernel parts are "
1691                     "out of sync.");
1692                 return;
1693         }
1694
1695         if (strcmp(verb, "create") == 0) {
1696                 g_sched_ctl_create(req, mp, 0);
1697                 return;
1698         } else if (strcmp(verb, "insert") == 0) {
1699                 g_sched_ctl_create(req, mp, 1);
1700                 return;
1701         } else if (strcmp(verb, "configure") == 0) {
1702                 g_sched_ctl_configure(req, mp);
1703                 return;
1704         } else if (strcmp(verb, "destroy") == 0) {
1705                 g_sched_ctl_destroy(req, mp);
1706                 return;
1707         }
1708
1709         gctl_error(req, "Unknown verb.");
1710 }
1711
1712 static void
1713 g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
1714     struct g_consumer *cp, struct g_provider *pp)
1715 {
1716         struct g_sched_softc *sc = gp->softc;
1717         struct g_gsched *gsp = sc->sc_gsched;
1718         if (indent == NULL) {   /* plaintext */
1719                 sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--");
1720         }
1721         if (gsp != NULL && gsp->gs_dumpconf)
1722                 gsp->gs_dumpconf(sb, indent, gp, cp, pp);
1723 }
1724
1725 DECLARE_GEOM_CLASS(g_sched_class, g_sched);
1726 MODULE_VERSION(geom_sched, 0);