/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2009-2010 Fabio Checconi
 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Main control module for geom-based disk schedulers ('sched').
 *
 * A 'sched' node is typically inserted transparently between
 * an existing provider pp and its original geom gp,
 * using the command "geom sched insert <provider>" and
 * resulting in the following topology:
 *
 *	[pp --> sched_gp --> cp]   [new_pp --> gp ... ]
 *
 * Deletion with "geom sched destroy <provider>.sched." restores the
 * original chain. The normal "geom sched create <provider>"
 * is also supported.
 *
 * Internally, the 'sched' node uses the following data structures:
 *
 *   geom{}         g_sched_softc{}      g_gsched{}
 * +----------+    +---------------+   +-------------+
 * |  softc *-|--->| sc_gsched   *-|-->|  gs_init    |
 * |  ...     |    |               |   |  gs_fini    |
 * |          |    | [ hash table] |   |  gs_start   |
 * +----------+    |               |   |  ...        |
 *                 |               |   +-------------+
 *                 |               |
 *                 |               |     g_*_softc{}
 *                 |               |   +-------------+
 *                 | sc_data     *-|-->|             |
 *                 +---------------+   | algorithm-  |
 *                                     | specific    |
 *                                     +-------------+
 *
 * A g_sched_softc{} is created with a "geom sched insert" call.
 * In turn this instantiates a specific scheduling algorithm,
 * which sets sc_gsched to point to the algorithm callbacks,
 * and calls gs_init() to create the g_*_softc{}.
 * The other callbacks (gs_start, gs_next, ...) are invoked
 * as requests flow through the node.
 *
 * g_sched_softc{} is defined in g_sched.h and mostly used here;
 * g_gsched{}, and the gs_* callbacks, are documented in gs_scheduler.h;
 * g_*_softc{} is defined/implemented by each algorithm (gs_*.c).
 *
 * DATA MOVING
 * When a bio is received on the provider, it goes to
 * g_sched_start(), which calls gs_start() to initially queue it;
 * then we call g_sched_dispatch(), which loops around gs_next()
 * to select zero or more bio's to be sent downstream.
 *
 * g_sched_dispatch() can also be called as a result of a timeout,
 * e.g. when doing anticipation or pacing requests.
 *
 * When a bio comes back, it goes to g_sched_done(), which in turn
 * calls gs_done(). The latter does any necessary housekeeping in
 * the scheduling algorithm, and may decide to call g_sched_dispatch()
 * to send more bio's downstream.
 *
 * If an algorithm needs per-flow queues, these are created by
 * calling gs_init_class() and destroyed with gs_fini_class();
 * they are also inserted in the hash table implemented in
 * the g_sched_softc{}.
 *
 * If an algorithm is replaced, or a transparently-inserted node is
 * removed with "geom sched destroy", we need to remove all references
 * to the g_*_softc{} and g_sched_softc{} from the bio's still in
 * the scheduler. g_sched_forced_dispatch() helps doing this.
 * XXX need to explain better.
 */
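/*
 * Example (a sketch, not part of the original file): the skeleton of a
 * scheduling algorithm as seen from this module. A trivial FIFO would
 * provide a g_gsched{} descriptor whose callbacks queue bios in
 * gs_start() and hand them back from gs_next(). All "gs_fifo" names
 * are hypothetical; the authoritative callback contract is documented
 * in gs_scheduler.h.
 */
#if 0
struct gs_fifo_softc {
	struct g_geom		*sc_geom;
	struct bio_queue_head	 sc_bioq;	/* the single FIFO queue */
};

static void *
gs_fifo_init(struct g_geom *gp)
{
	struct gs_fifo_softc *sc;

	sc = malloc(sizeof(*sc), M_GEOM_SCHED, M_WAITOK | M_ZERO);
	sc->sc_geom = gp;
	bioq_init(&sc->sc_bioq);
	return (sc);
}

static void
gs_fifo_fini(void *data)
{

	free(data, M_GEOM_SCHED);
}

static int
gs_fifo_start(void *data, struct bio *bp)
{
	struct gs_fifo_softc *sc = data;

	bioq_insert_tail(&sc->sc_bioq, bp);
	return (0);	/* queued; nonzero would mean "pass it down" */
}

static struct bio *
gs_fifo_next(void *data, int force)
{
	struct gs_fifo_softc *sc = data;

	/* 'force' asks us to drain even requests we would like to hold. */
	return (bioq_takefirst(&sc->sc_bioq));
}

static void
gs_fifo_done(void *data, struct bio *bp)
{
	/* Nothing to do for a FIFO. */
}

static struct g_gsched g_gsched_fifo = {
	.gs_name = "fifo",
	.gs_priv_size = 0,	/* no per-flow state, no hash table */
	.gs_init = gs_fifo_init,
	.gs_fini = gs_fifo_fini,
	.gs_start = gs_fifo_start,
	.gs_next = gs_fifo_next,
	.gs_done = gs_fifo_done,
};
#endif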
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/limits.h>
#include <sys/hash.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/proc.h>		/* we access curthread */
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include "gs_scheduler.h"
#include "g_sched.h"		/* geom hooks */
/*
 * Size of the per-geom hash table storing traffic classes.
 * We may decide to change it at a later time; it has no ABI
 * implications as it is only used for run-time allocations.
 */
#define	G_SCHED_HASH_SIZE	32
static int g_sched_destroy(struct g_geom *gp, boolean_t force);
static int g_sched_destroy_geom(struct gctl_req *req,
    struct g_class *mp, struct g_geom *gp);
static void g_sched_config(struct gctl_req *req, struct g_class *mp,
    const char *verb);
static struct g_geom *g_sched_taste(struct g_class *mp,
    struct g_provider *pp, int flags __unused);
static void g_sched_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_sched_init(struct g_class *mp);
static void g_sched_fini(struct g_class *mp);
static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data,
    int fflag, struct thread *td);
struct g_class g_sched_class = {
	.name = G_SCHED_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_sched_config,
	.taste = g_sched_taste,
	.destroy_geom = g_sched_destroy_geom,
	.init = g_sched_init,
	.ioctl = g_sched_ioctl,
	.fini = g_sched_fini,
};

MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures");
/*
 * Global variables describing the state of the geom_sched module.
 * There is only one static instance of this structure.
 */
LIST_HEAD(gs_list, g_gsched);	/* type, link field */
struct geom_sched_vars {
	struct mtx	gs_mtx;
	struct gs_list	gs_scheds;	/* list of algorithms */
	u_int		gs_debug;
	u_int		gs_sched_count;	/* how many algorithms ? */
	u_int		gs_patched;	/* g_io_request was patched */

	u_int		gs_initialized;
	u_int		gs_expire_secs;	/* expiration of hash entries */

	struct bio_queue_head gs_pending;
	u_int		gs_npending;

	/* The following are for stats, usually protected by gs_mtx. */
	u_long		gs_requests;	/* total requests */
	u_long		gs_done;	/* total done */
	u_int		gs_in_flight;	/* requests in flight */
	u_int		gs_writes_in_flight;
	u_int		gs_bytes_in_flight;
	u_int		gs_write_bytes_in_flight;

	char		gs_names[256];	/* names of schedulers */
};

static struct geom_sched_vars me = {
	.gs_expire_secs = 10,
};
SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
    "GEOM_SCHED stuff");

SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD,
    &me.gs_write_bytes_in_flight, 0, "Write bytes in flight");
SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD,
    &me.gs_bytes_in_flight, 0, "Bytes in flight");
SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD,
    &me.gs_writes_in_flight, 0, "Write requests in flight");
SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD,
    &me.gs_in_flight, 0, "Requests in flight");

SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD,
    &me.gs_done, 0, "Total done");
SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD,
    &me.gs_requests, 0, "Total requests");

SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD,
    &me.gs_names, 0, "Algorithm names");
SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD,
    &me.gs_sched_count, 0, "Number of algorithms");
SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW,
    &me.gs_debug, 0, "Debug level");
SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW,
    &me.gs_expire_secs, 0, "Expire time in seconds");
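/*
 * Illustrative note (not in the original file): the OIDs above live
 * under kern.geom.sched, so e.g. "sysctl kern.geom.sched.algorithms"
 * lists the loaded algorithms and "sysctl kern.geom.sched.debug=1"
 * raises the debug level at run time.
 */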
/*
 * g_sched calls the scheduler algorithms with this lock held.
 * The locking functions are exposed so the scheduler algorithms can also
 * protect themselves e.g. when running a callout handler.
 */
void
g_sched_lock(struct g_geom *gp)
{
	struct g_sched_softc *sc = gp->softc;

	mtx_lock(&sc->sc_mtx);
}

void
g_sched_unlock(struct g_geom *gp)
{
	struct g_sched_softc *sc = gp->softc;

	mtx_unlock(&sc->sc_mtx);
}
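/*
 * Example (a sketch, not part of the original file): an algorithm
 * running a callout handler can use the helpers above to serialize
 * with the main dispatch path. "gs_foo_softc" and "gs_foo_timeout"
 * are hypothetical names.
 */
#if 0
struct gs_foo_softc {
	struct g_geom	*sc_geom;
	struct callout	 sc_wait;
};

static void
gs_foo_timeout(void *data)
{
	struct gs_foo_softc *sc = data;

	g_sched_lock(sc->sc_geom);	/* same lock held around gs_* calls */
	g_sched_dispatch(sc->sc_geom);	/* push out whatever is ready */
	g_sched_unlock(sc->sc_geom);
}
#endif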
/*
 * Support functions to handle references to the module,
 * which are coming from devices using this scheduler.
 */
static inline void
g_gsched_ref(struct g_gsched *gsp)
{

	atomic_add_int(&gsp->gs_refs, 1);
}

static inline void
g_gsched_unref(struct g_gsched *gsp)
{

	atomic_add_int(&gsp->gs_refs, -1);
}
/*
 * Update the stats when this request is done.
 */
static void
g_sched_update_stats(struct bio *bio)
{

	me.gs_done++;
	me.gs_in_flight--;
	me.gs_bytes_in_flight -= bio->bio_length;
	if (bio->bio_cmd == BIO_WRITE) {
		me.gs_writes_in_flight--;
		me.gs_write_bytes_in_flight -= bio->bio_length;
	}
}
/*
 * Dispatch any pending request.
 */
static void
g_sched_forced_dispatch(struct g_geom *gp)
{
	struct g_sched_softc *sc = gp->softc;
	struct g_gsched *gsp = sc->sc_gsched;
	struct bio *bp;

	KASSERT(mtx_owned(&sc->sc_mtx),
	    ("sc_mtx not owned during forced dispatch"));

	while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL)
		g_io_request(bp, LIST_FIRST(&gp->consumer));
}
/*
 * The main dispatch loop, called either here after the start
 * routine, or by scheduling algorithms when they receive a timeout
 * or a 'done' notification. Does not share code with the forced
 * dispatch path, since the gs_done() callback can call us.
 */
void
g_sched_dispatch(struct g_geom *gp)
{
	struct g_sched_softc *sc = gp->softc;
	struct g_gsched *gsp = sc->sc_gsched;
	struct bio *bp;

	KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch"));

	if ((sc->sc_flags & G_SCHED_FLUSHING))
		return;

	while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL)
		g_io_request(bp, LIST_FIRST(&gp->consumer));
}
/*
 * Recent (8.0 and above) versions of FreeBSD have support to
 * register classifiers of disk requests. The classifier is
 * invoked by g_io_request(), and stores the information into
 * bp->bio_classifier1.
 *
 * Support for older versions, which is left here only for
 * documentation purposes, relies on two hacks:
 * 1. classification info is written into the bio_caller1
 *    field of the topmost node in the bio chain. This field
 *    is rarely used, but this module is incompatible with
 *    those that use bio_caller1 for other purposes,
 *    such as ZFS and gjournal;
 * 2. g_io_request() is patched in-memory when the module is
 *    loaded, so that the function calls a classifier as its
 *    first thing. g_io_request() is restored when the module
 *    is unloaded. This functionality is only supported for
 *    x86 and amd64; other architectures need source code changes.
 */
/*
 * Lookup the identity of the issuer of the original request.
 * In the current implementation we use the curthread of the
 * issuer, but different mechanisms may be implemented later,
 * so we do not make assumptions on the return value, which for
 * us is just an opaque identifier.
 */
static inline u_long
g_sched_classify(struct bio *bp)
{

	/* we have classifier fields in the struct bio */
	return ((u_long)bp->bio_classifier1);
}
/* Return the hash chain for the given key. */
static inline struct g_hash *
g_sched_hash(struct g_sched_softc *sc, u_long key)
{

	return (&sc->sc_hash[key & sc->sc_mask]);
}
/*
 * Helper function for the child classes, which takes
 * a geom and a bio and returns the private descriptor
 * associated to the request. This involves fetching
 * the classification field and [al]locating the
 * corresponding entry in the hash table.
 */
void *
g_sched_get_class(struct g_geom *gp, struct bio *bp)
{
	struct g_sched_softc *sc;
	struct g_sched_class *gsc;
	struct g_gsched *gsp;
	struct g_hash *bucket;
	u_long key;

	sc = gp->softc;
	key = g_sched_classify(bp);
	bucket = g_sched_hash(sc, key);
	LIST_FOREACH(gsc, bucket, gsc_clist) {
		if (key == gsc->gsc_key) {
			gsc->gsc_refs++;
			return (gsc->gsc_priv);
		}
	}

	/* Not found, create a new entry. */
	gsp = sc->sc_gsched;
	gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size,
	    M_GEOM_SCHED, M_NOWAIT | M_ZERO);
	if (gsc == NULL)
		return (NULL);

	if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
		free(gsc, M_GEOM_SCHED);
		return (NULL);
	}

	gsc->gsc_refs = 2;	/* 1 for the hash table, 1 for the caller. */
	gsc->gsc_key = key;
	LIST_INSERT_HEAD(bucket, gsc, gsc_clist);

	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;

	return (gsc->gsc_priv);
}
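/*
 * Example (a sketch, not part of the original file): an algorithm's
 * gs_start() callback would typically look up the per-flow queue with
 * g_sched_get_class(), and release the reference from gs_done() via
 * g_sched_put_class(). "gs_foo_queue" and the surrounding names are
 * hypothetical.
 */
#if 0
static int
gs_foo_start(void *data, struct bio *bp)
{
	struct gs_foo_softc *sc = data;
	struct gs_foo_queue *q;

	q = g_sched_get_class(sc->sc_geom, bp);	/* ref'd entry, may alloc */
	if (q == NULL)
		return (-1);	/* tell g_sched_start() to bypass us */
	bp->bio_caller1 = q;	/* remember the queue for gs_done() */
	bioq_insert_tail(&q->q_bioq, bp);
	return (0);
}
#endif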
/*
 * Release a reference to the per-client descriptor,
 * and destroy it if this was the last one.
 */
void
g_sched_put_class(struct g_geom *gp, void *priv)
{
	struct g_sched_class *gsc;
	struct g_sched_softc *sc;

	gsc = g_sched_priv2class(priv);
	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;

	if (--gsc->gsc_refs > 0)
		return;

	sc = gp->softc;
	sc->sc_gsched->gs_fini_class(sc->sc_data, priv);

	LIST_REMOVE(gsc, gsc_clist);
	free(gsc, M_GEOM_SCHED);
}
static void
g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
    struct g_gsched *gsp, void *data)
{
	struct g_sched_class *cp, *cp2;
	int i;

	if (!hp)
		return;

	if (data && gsp->gs_hash_unref)
		gsp->gs_hash_unref(data);

	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
		LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
			g_sched_put_class(gp, cp->gsc_priv);
	}

	hashdestroy(hp, M_GEOM_SCHED, mask);
}
static struct g_hash *
g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags)
{
	struct g_hash *hash;

	if (gsp->gs_priv_size == 0)
		return (NULL);

	hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags);

	return (hash);
}
static void
g_sched_flush_classes(struct g_geom *gp)
{
	struct g_sched_softc *sc;
	struct g_sched_class *cp, *cp2;
	int i;

	sc = gp->softc;

	if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0)
		return;

	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
		LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) {
			if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0)
				g_sched_put_class(gp, cp->gsc_priv);
		}
	}

	sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz;
}
/*
 * Wait for the completion of any outstanding request. To ensure
 * that this does not take forever, the caller has to make sure that
 * no new request enters the scheduler before calling us.
 *
 * Must be called with the gp mutex held and topology locked.
 */
static int
g_sched_wait_pending(struct g_geom *gp)
{
	struct g_sched_softc *sc = gp->softc;
	int endticks = ticks + hz;

	g_topology_assert();

	while (sc->sc_pending && endticks - ticks >= 0)
		msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);

	return (sc->sc_pending ? ETIMEDOUT : 0);
}
static int
g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp)
{
	struct g_sched_softc *sc = gp->softc;
	int error;

	/* Set the flushing flag: new bios will not enter the scheduler. */
	sc->sc_flags |= G_SCHED_FLUSHING;

	g_sched_forced_dispatch(gp);
	error = g_sched_wait_pending(gp);
	if (error)
		goto failed;

	/* No more requests pending or in flight from the old gsp. */

	g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data);
	sc->sc_hash = NULL;

	/*
	 * Avoid deadlock here by releasing the gp mutex and reacquiring
	 * it once done. It should be safe, since no reconfiguration or
	 * destruction can take place due to the geom topology lock; no
	 * new request can use the current sc_data since we flagged the
	 * geom as being flushed.
	 */
	g_sched_unlock(gp);
	gsp->gs_fini(sc->sc_data);
	g_sched_lock(gp);

	sc->sc_gsched = NULL;
	sc->sc_data = NULL;
	g_gsched_unref(gsp);

failed:
	sc->sc_flags &= ~G_SCHED_FLUSHING;

	return (error);
}
static int
g_sched_remove(struct g_geom *gp, struct g_gsched *gsp)
{
	int error;

	g_sched_lock(gp);
	error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */
	g_sched_unlock(gp);

	return (error);
}
/*
 * Support function for create/taste -- locate the desired
 * algorithm and grab a reference to it.
 */
static struct g_gsched *
g_gsched_find(const char *name)
{
	struct g_gsched *gsp = NULL;

	mtx_lock(&me.gs_mtx);
	LIST_FOREACH(gsp, &me.gs_scheds, glist) {
		if (strcmp(name, gsp->gs_name) == 0) {
			g_gsched_ref(gsp);
			break;
		}
	}
	mtx_unlock(&me.gs_mtx);

	return (gsp);
}
/*
 * Rebuild the list of scheduler names.
 * To be called with the me.gs_mtx lock held.
 */
static void
g_gsched_build_names(struct g_gsched *gsp)
{
	int pos, l;
	struct g_gsched *cur;

	pos = 0;
	LIST_FOREACH(cur, &me.gs_scheds, glist) {
		l = strlen(cur->gs_name);
		if (l + pos + 1 + 1 < sizeof(me.gs_names)) {
			if (pos != 0)
				me.gs_names[pos++] = ' ';
			strcpy(me.gs_names + pos, cur->gs_name);
			pos += l;
		}
	}
	me.gs_names[pos] = '\0';
}
/*
 * Register or unregister individual scheduling algorithms.
 */
static int
g_gsched_register(struct g_gsched *gsp)
{
	struct g_gsched *cur;
	int error = 0;

	mtx_lock(&me.gs_mtx);
	LIST_FOREACH(cur, &me.gs_scheds, glist) {
		if (strcmp(gsp->gs_name, cur->gs_name) == 0)
			break;
	}
	if (cur != NULL) {
		G_SCHED_DEBUG(0, "A scheduler named %s already "
		    "exists.", gsp->gs_name);
		error = EEXIST;
	} else {
		LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist);
		gsp->gs_refs = 1;
		me.gs_sched_count++;
		g_gsched_build_names(gsp);
	}
	mtx_unlock(&me.gs_mtx);

	return (error);
}
struct g_gsched_unregparm {
	struct g_gsched *gup_gsp;
	int		gup_error;
};
static void
g_gsched_unregister(void *arg, int flag)
{
	struct g_gsched_unregparm *parm = arg;
	struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp;
	struct g_sched_softc *sc;
	struct g_geom *gp, *gp_tmp;
	int error;

	parm->gup_error = 0;

	g_topology_assert();

	if (flag == EV_CANCEL)
		return;

	mtx_lock(&me.gs_mtx);

	LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) {
		if (gp->class != &g_sched_class)
			continue;	/* Should not happen. */

		sc = gp->softc;
		if (sc->sc_gsched == gsp) {
			error = g_sched_remove(gp, gsp);
			if (error)
				goto failed;
		}
	}

	LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) {
		if (cur != gsp)
			continue;

		if (gsp->gs_refs != 1) {
			G_SCHED_DEBUG(0, "%s still in use.",
			    gsp->gs_name);
			parm->gup_error = EBUSY;
		} else {
			LIST_REMOVE(gsp, glist);
			me.gs_sched_count--;
			g_gsched_build_names(gsp);
		}
		break;
	}

	if (cur == NULL) {
		G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name);
		parm->gup_error = ENOENT;
	}

failed:
	mtx_unlock(&me.gs_mtx);
}
static void
g_gsched_global_init(void)
{

	if (!me.gs_initialized) {
		G_SCHED_DEBUG(0, "Initializing global data.");
		mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF);
		LIST_INIT(&me.gs_scheds);
		bioq_init(&me.gs_pending);
		me.gs_initialized = 1;
	}
}
/*
 * Module event called when a scheduling algorithm module is loaded or
 * unloaded.
 */
int
g_gsched_modevent(module_t mod, int cmd, void *arg)
{
	struct g_gsched *gsp = arg;
	struct g_gsched_unregparm parm;
	int error;

	G_SCHED_DEBUG(0, "Modevent %d.", cmd);

	/*
	 * If the module is loaded at boot, the geom thread that calls
	 * g_sched_init() might actually run after g_gsched_modevent(),
	 * so make sure that the module is properly initialized.
	 */
	g_gsched_global_init();

	error = EOPNOTSUPP;
	switch (cmd) {
	case MOD_LOAD:
		error = g_gsched_register(gsp);
		G_SCHED_DEBUG(0, "Loaded module %s error %d.",
		    gsp->gs_name, error);
		if (error == 0)
			g_retaste(&g_sched_class);
		break;

	case MOD_UNLOAD:
		parm.gup_gsp = gsp;
		parm.gup_error = 0;

		error = g_waitfor_event(g_gsched_unregister,
		    &parm, M_WAITOK, NULL);
		if (error == 0)
			error = parm.gup_error;
		G_SCHED_DEBUG(0, "Unloaded module %s error %d.",
		    gsp->gs_name, error);
		break;

	default:
		break;
	}

	return (error);
}
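/*
 * Example (a sketch, not part of the original file): an algorithm
 * module ties itself to g_gsched_modevent() through a standard
 * moduledata_t. "gs_foo" and "g_gsched_foo" are hypothetical names;
 * gs_scheduler.h provides a DECLARE_GSCHED_MODULE() helper that wraps
 * this boilerplate.
 */
#if 0
static moduledata_t gs_foo_mod = {
	"gs_foo",		/* module name */
	g_gsched_modevent,	/* the event handler above */
	&g_gsched_foo,		/* the algorithm descriptor */
};
DECLARE_MODULE(gs_foo, gs_foo_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
MODULE_DEPEND(gs_foo, geom_sched, 0, 0, 0);
#endif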
#ifdef KTR
#define	TRC_BIO_EVENT(e, bp)	g_sched_trace_bio_ ## e (bp)

static inline char
g_sched_type(struct bio *bp)
{

	if (bp->bio_cmd == BIO_READ)
		return ('R');
	else if (bp->bio_cmd == BIO_WRITE)
		return ('W');
	return ('U');
}

static inline void
g_sched_trace_bio_START(struct bio *bp)
{

	CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp),
	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
	    bp->bio_offset, bp->bio_length);
}

static inline void
g_sched_trace_bio_DONE(struct bio *bp)
{

	CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp),
	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
	    bp->bio_offset, bp->bio_length);
}
#else /* !KTR */
#define	TRC_BIO_EVENT(e, bp)
#endif /* !KTR */
/*
 * g_sched_done() and g_sched_start() dispatch the geom requests to
 * the scheduling algorithm in use.
 */
static void
g_sched_done(struct bio *bio)
{
	struct g_geom *gp = bio->bio_caller2;
	struct g_sched_softc *sc = gp->softc;

	TRC_BIO_EVENT(DONE, bio);

	KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done"));

	g_sched_lock(gp);

	g_sched_update_stats(bio);
	sc->sc_gsched->gs_done(sc->sc_data, bio);
	if (!--sc->sc_pending)
		wakeup(gp);

	g_sched_flush_classes(gp);
	g_sched_unlock(gp);

	g_std_done(bio);
}
static void
g_sched_start(struct bio *bp)
{
	struct g_geom *gp = bp->bio_to->geom;
	struct g_sched_softc *sc = gp->softc;
	struct bio *cbp;

	TRC_BIO_EVENT(START, bp);
	G_SCHED_LOGREQ(bp, "Request received.");

	cbp = g_clone_bio(bp);
	if (cbp == NULL) {
		g_io_deliver(bp, ENOMEM);
		return;
	}
	cbp->bio_done = g_sched_done;
	cbp->bio_to = LIST_FIRST(&gp->provider);
	KASSERT(cbp->bio_to != NULL, ("NULL provider"));

	/* We only schedule reads and writes. */
	if (bp->bio_cmd != BIO_READ && bp->bio_cmd != BIO_WRITE)
		goto bypass;

	G_SCHED_LOGREQ(cbp, "Sending request.");

	g_sched_lock(gp);

	/*
	 * Call the algorithm's gs_start to queue the request in the
	 * scheduler. If gs_start fails then pass the request down,
	 * otherwise call g_sched_dispatch() which tries to push
	 * one or more requests down.
	 */
	if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) ||
	    sc->sc_gsched->gs_start(sc->sc_data, cbp)) {
		g_sched_unlock(gp);
		goto bypass;
	}

	/*
	 * We use bio_caller1 to mark requests that are scheduled,
	 * so make sure it is not NULL.
	 */
	if (cbp->bio_caller1 == NULL)
		cbp->bio_caller1 = &me;	/* anything not NULL */

	cbp->bio_caller2 = gp;
	sc->sc_pending++;

	/* Update general stats. */
	me.gs_in_flight++;
	me.gs_requests++;
	me.gs_bytes_in_flight += bp->bio_length;
	if (bp->bio_cmd == BIO_WRITE) {
		me.gs_writes_in_flight++;
		me.gs_write_bytes_in_flight += bp->bio_length;
	}

	g_sched_dispatch(gp);
	g_sched_unlock(gp);
	return;

bypass:
	cbp->bio_done = g_std_done;
	cbp->bio_caller1 = NULL; /* not scheduled */
	g_io_request(cbp, LIST_FIRST(&gp->consumer));
}
/*
 * The next few functions are the geom glue.
 */
static void
g_sched_orphan(struct g_consumer *cp)
{

	g_topology_assert();
	g_sched_destroy(cp->geom, 1);
}

static int
g_sched_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error;

	gp = pp->geom;
	cp = LIST_FIRST(&gp->consumer);
	error = g_access(cp, dr, dw, de);

	return (error);
}
static void
g_sched_temporary_start(struct bio *bio)
{

	mtx_lock(&me.gs_mtx);
	me.gs_npending++;
	bioq_disksort(&me.gs_pending, bio);
	mtx_unlock(&me.gs_mtx);
}

static void
g_sched_flush_pending(g_start_t *start)
{
	struct bio *bp;

	while ((bp = bioq_takefirst(&me.gs_pending)))
		start(bp);
}
static int
g_insert_proxy(struct g_geom *gp, struct g_provider *newpp,
    struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp)
{
	struct g_sched_softc *sc = gp->softc;
	g_start_t *saved_start, *flush = g_sched_start;
	int error = 0, endticks = ticks + hz;

	g_cancel_event(newpp);	/* prevent taste() */

	/* Copy the private fields. */
	newpp->private = pp->private;
	newpp->index = pp->index;

	/* Queue all the early requests coming for us. */
	me.gs_npending = 0;
	saved_start = pp->geom->start;
	dstgp->start = g_sched_temporary_start;

	while (pp->nstart - pp->nend != me.gs_npending &&
	    endticks - ticks >= 0)
		tsleep(pp, PRIBIO, "-", hz/10);

	if (pp->nstart - pp->nend != me.gs_npending) {
		flush = g_sched_temporary_start;
		error = ETIMEDOUT;
		goto fail;
	}

	/* link pp to this geom */
	LIST_REMOVE(pp, provider);
	pp->geom = gp;
	LIST_INSERT_HEAD(&gp->provider, pp, provider);

	/*
	 * replicate the counts from the parent in the
	 * new provider and consumer nodes
	 */
	cp->acr = newpp->acr = pp->acr;
	cp->acw = newpp->acw = pp->acw;
	cp->ace = newpp->ace = pp->ace;
	sc->sc_flags |= G_SCHED_PROXYING;

fail:
	dstgp->start = saved_start;

	g_sched_flush_pending(flush);

	return (error);
}
/*
 * Create a geom node for the device passed as *pp.
 * If successful, add a reference to this gsp.
 */
static int
g_sched_create(struct gctl_req *req, struct g_class *mp,
    struct g_provider *pp, struct g_gsched *gsp, int proxy)
{
	struct g_sched_softc *sc = NULL;
	struct g_geom *gp, *dstgp;
	struct g_provider *newpp = NULL;
	struct g_consumer *cp = NULL;
	char name[64];
	int error;

	g_topology_assert();

	snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX);
	LIST_FOREACH(gp, &mp->geom, geom) {
		if (strcmp(gp->name, name) == 0) {
			gctl_error(req, "Geom %s already exists.",
			    name);
			return (EEXIST);
		}
	}

	gp = g_new_geomf(mp, "%s", name);
	dstgp = proxy ? pp->geom : gp; /* where do we link the provider */

	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
	sc->sc_gsched = gsp;
	sc->sc_data = gsp->gs_init(gp);
	if (sc->sc_data == NULL) {
		error = ENOMEM;
		goto fail;
	}

	sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK);

	/*
	 * Do not initialize the flush mechanism; it will be initialized
	 * on the first insertion into the hash table.
	 */

	mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF);

	gp->softc = sc;
	gp->start = g_sched_start;
	gp->orphan = g_sched_orphan;
	gp->access = g_sched_access;
	gp->dumpconf = g_sched_dumpconf;

	newpp = g_new_providerf(dstgp, "%s", gp->name);
	newpp->mediasize = pp->mediasize;
	newpp->sectorsize = pp->sectorsize;

	cp = g_new_consumer(gp);
	error = g_attach(cp, proxy ? newpp : pp);
	if (error != 0) {
		gctl_error(req, "Cannot attach to provider %s.",
		    pp->name);
		goto fail;
	}

	g_error_provider(newpp, 0);
	if (proxy) {
		error = g_insert_proxy(gp, newpp, dstgp, pp, cp);
		if (error)
			goto fail;
	}
	G_SCHED_DEBUG(0, "Device %s created.", gp->name);

	g_gsched_ref(gsp);

	return (0);

fail:
	if (cp != NULL) {
		if (cp->provider != NULL)
			g_detach(cp);
		g_destroy_consumer(cp);
	}
	if (newpp != NULL)
		g_destroy_provider(newpp);
	if (sc->sc_hash)
		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
		    gsp, sc->sc_data);
	if (sc->sc_data)
		gsp->gs_fini(sc->sc_data);
	g_free(gp->softc);
	g_destroy_geom(gp);

	return (error);
}
/*
 * Support for dynamic switching of scheduling algorithms.
 * First initialize the data structures for the new algorithm,
 * then call g_sched_remove_locked() to flush all references
 * to the old one, and finally link the new algorithm.
 */
static int
g_sched_change_algo(struct gctl_req *req, struct g_class *mp,
    struct g_provider *pp, struct g_gsched *gsp)
{
	struct g_sched_softc *sc;
	struct g_geom *gp;
	struct g_hash *newh;
	void *data;
	u_long mask;
	int error = 0;

	gp = pp->geom;
	sc = gp->softc;

	data = gsp->gs_init(gp);
	if (data == NULL)
		return (ENOMEM);

	newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK);
	if (gsp->gs_priv_size && !newh) {
		error = ENOMEM;
		goto fail;
	}

	g_sched_lock(gp);
	if (sc->sc_gsched) {	/* can be NULL in some cases */
		error = g_sched_remove_locked(gp, sc->sc_gsched);
		if (error) {
			g_sched_unlock(gp);
			goto fail;
		}
	}

	g_gsched_ref(gsp);
	sc->sc_gsched = gsp;
	sc->sc_data = data;
	sc->sc_hash = newh;
	sc->sc_mask = mask;

	g_sched_unlock(gp);

	return (0);

fail:
	if (newh)
		g_sched_hash_fini(gp, newh, mask, gsp, data);

	if (data)
		gsp->gs_fini(data);

	return (error);
}
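/*
 * Illustrative usage (not in the original file): the switch above is
 * reached through the "configure" verb of the userland tool, e.g.
 *
 *	geom sched configure -a rr ada0.sched.
 *
 * assuming an "rr" algorithm module is loaded.
 */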
/*
 * Stop the request flow directed to the proxy, redirecting the new
 * requests to the me.gs_pending queue.
 */
static struct g_provider *
g_detach_proxy(struct g_geom *gp)
{
	struct g_consumer *cp;
	struct g_provider *pp, *newpp;

	do {
		pp = LIST_FIRST(&gp->provider);
		if (pp == NULL)
			break;
		cp = LIST_FIRST(&gp->consumer);
		if (cp == NULL)
			break;
		newpp = cp->provider;
		if (newpp == NULL)
			break;

		me.gs_npending = 0;
		pp->geom->start = g_sched_temporary_start;

		return (pp);
	} while (0);
	printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name);

	return (NULL);
}
static void
g_sched_blackhole(struct bio *bp)
{

	g_io_deliver(bp, ENXIO);
}
static inline void
g_reparent_provider(struct g_provider *pp, struct g_geom *gp,
    struct g_provider *newpp)
{

	LIST_REMOVE(pp, provider);
	if (newpp) {
		pp->private = newpp->private;
		pp->index = newpp->index;
	}
	pp->geom = gp;
	LIST_INSERT_HEAD(&gp->provider, pp, provider);
}
static inline void
g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp)
{
	struct g_geom *gp = oldpp->geom;

	g_reparent_provider(oldpp, newpp->geom, newpp);

	/*
	 * Hackish: let the system destroy the old provider for us, just
	 * in case someone attached a consumer to it, in which case a
	 * direct call to g_destroy_provider() would not work.
	 */
	g_reparent_provider(newpp, gp, NULL);
	g_wither_provider(newpp, ENXIO);
}
/*
 * Complete the proxy destruction, linking the old provider to its
 * original geom, and destroying the proxy provider. Also take care
 * of issuing the pending requests collected in me.gs_pending (if any).
 */
static int
g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp)
{
	struct g_consumer *cp;
	struct g_provider *newpp;

	do {
		cp = LIST_FIRST(&gp->consumer);
		if (cp == NULL)
			break;
		newpp = cp->provider;
		if (newpp == NULL)
			break;

		/* Relink the provider to its original geom. */
		g_unproxy_provider(oldpp, newpp);

		/* Detach consumer from provider, and destroy provider. */
		cp->acr = newpp->acr = 0;
		cp->acw = newpp->acw = 0;
		cp->ace = newpp->ace = 0;
		g_detach(cp);

		/* Send the pending bios through the right start function. */
		g_sched_flush_pending(oldpp->geom->start);

		return (0);
	} while (0);
	printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name);

	/* We cannot send the pending bios anywhere... */
	g_sched_flush_pending(g_sched_blackhole);

	return (EINVAL);
}
static int
g_sched_destroy(struct g_geom *gp, boolean_t force)
{
	struct g_provider *pp, *oldpp = NULL;
	struct g_sched_softc *sc;
	struct g_gsched *gsp;
	int error = 0;

	g_topology_assert();

	sc = gp->softc;
	if (sc == NULL)
		return (ENXIO);

	if (!(sc->sc_flags & G_SCHED_PROXYING)) {
		pp = LIST_FIRST(&gp->provider);
		if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
			const char *msg = force ?
			    "but we force removal" : "cannot remove";

			G_SCHED_DEBUG(!force,
			    "Device %s is still open (r%dw%de%d), %s.",
			    pp->name, pp->acr, pp->acw, pp->ace, msg);
			if (!force)
				return (EBUSY);
		} else {
			G_SCHED_DEBUG(0, "Device %s removed.", gp->name);
		}
	} else
		oldpp = g_detach_proxy(gp);

	gsp = sc->sc_gsched;
	if (gsp) {
		/*
		 * XXX bad hack here: force a dispatch to release
		 * any reference to the hash table still held by
		 * the scheduler.
		 */
		g_sched_lock(gp);
		/*
		 * We are dying here, no new requests should enter
		 * the scheduler. This is guaranteed by the topology,
		 * either in case we were proxying (new bios are
		 * being redirected) or not (see the access check
		 * above).
		 */
		g_sched_forced_dispatch(gp);
		error = g_sched_wait_pending(gp);

		if (error) {
			/*
			 * Not all the requests came home: this might happen
			 * under heavy load, or if we were waiting for any
			 * bio which is served in the event path (see
			 * geom_slice.c for an example of how this can
			 * happen). Try to restore a working configuration
			 * before failing.
			 */
			if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
				g_sched_flush_pending(force ?
				    g_sched_blackhole : g_sched_start);
			}

			/*
			 * In the forced destroy case there is not so much
			 * we can do; we have pending bios that will call
			 * g_sched_done() somehow, and we don't want them
			 * to crash the system using freed memory. We tell
			 * the user that something went wrong, and leak some
			 * memory here.
			 * Note: the callers using force = 1 ignore the
			 * return value.
			 */
			if (force) {
				G_SCHED_DEBUG(0, "Pending requests while "
				    "destroying geom, some memory leaked.");
			}

			g_sched_unlock(gp);
			return (error);
		}

		g_sched_unlock(gp);
		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
		    gsp, sc->sc_data);
		sc->sc_hash = NULL;
		gsp->gs_fini(sc->sc_data);
		g_gsched_unref(gsp);
		sc->sc_gsched = NULL;
		sc->sc_data = NULL;
	}

	if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
		error = g_destroy_proxy(gp, oldpp);

		if (error) {
			if (force) {
				G_SCHED_DEBUG(0, "Unrecoverable error while "
				    "destroying a proxy geom, leaking some "
				    "memory.");
			}

			return (error);
		}
	}

	mtx_destroy(&sc->sc_mtx);

	g_free(gp->softc);
	gp->softc = NULL;

	g_wither_geom(gp, ENXIO);

	return (error);
}
static int
g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp)
{

	return (g_sched_destroy(gp, 0));
}
/*
 * Functions related to the classification of requests.
 *
 * On recent FreeBSD versions (8.0 and above), we store a reference
 * to the issuer of a request in bp->bio_classifier1 as soon
 * as the bio is posted to the geom queue (and not later, because
 * requests are managed by the g_down thread afterwards).
 */

/*
 * Classifier support for recent FreeBSD versions: we use
 * a very simple classifier, only using curthread to tag a request.
 * The classifier is registered at module load, and unregistered
 * at module unload.
 */
static int
g_sched_tag(void *arg, struct bio *bp)
{

	bp->bio_classifier1 = curthread;

	return (1);
}

static struct g_classifier_hook g_sched_classifier = {
	.func =	g_sched_tag,
};

static inline void
g_classifier_ini(void)
{

	g_register_classifier(&g_sched_classifier);
}

static inline void
g_classifier_fini(void)
{

	g_unregister_classifier(&g_sched_classifier);
}
static void
g_sched_init(struct g_class *mp)
{

	g_gsched_global_init();

	G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.",
	    mp, &g_sched_class);

	/* Patch g_io_request to store classification info in the bio. */
	g_classifier_ini();
}

static void
g_sched_fini(struct g_class *mp)
{

	g_classifier_fini();

	G_SCHED_DEBUG(0, "Unloading...");

	KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers"));
	mtx_destroy(&me.gs_mtx);
}
static int
g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag,
    struct thread *td)
{
	struct g_consumer *cp;
	struct g_geom *gp;

	cp = LIST_FIRST(&pp->geom->consumer);
	if (cp == NULL)
		return (ENOIOCTL);
	gp = cp->provider->geom;
	if (gp->ioctl == NULL)
		return (ENOIOCTL);
	return (gp->ioctl(cp->provider, cmd, data, fflag, td));
}
/*
 * Read the i-th argument for a request, skipping the /dev/
 * prefix if present.
 */
static const char *
g_sched_argi(struct gctl_req *req, int i)
{
	static const char *dev_prefix = "/dev/";
	const char *name;
	char param[16];
	int l = strlen(dev_prefix);

	snprintf(param, sizeof(param), "arg%d", i);
	name = gctl_get_asciiparam(req, param);
	if (name == NULL)
		gctl_error(req, "No 'arg%d' argument", i);
	else if (strncmp(name, dev_prefix, l) == 0)
		name += l;

	return (name);
}
/*
 * Fetch nargs and do appropriate checks.
 */
static int
g_sched_get_nargs(struct gctl_req *req)
{
	int *nargs;

	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
	if (nargs == NULL) {
		gctl_error(req, "No 'nargs' argument");
		return (0);
	}
	if (*nargs <= 0)
		gctl_error(req, "Missing device(s).");

	return (*nargs);
}
/*
 * Check whether we should add the class on certain volumes when
 * this geom is created. Right now this is under control of a kenv
 * variable containing the names of all devices that we care about.
 * Probably we should only support transparent insertion as the
 * preferred mode of operation.
 */
static struct g_geom *
g_sched_taste(struct g_class *mp, struct g_provider *pp,
    int flags __unused)
{
	struct g_gsched *gsp = NULL;	/* the algorithm we want */
	const char *s;			/* generic string pointer */
	const char *taste_names;	/* devices we like */
	int l;

	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__,
	    mp->name, pp->name);
	g_topology_assert();

	G_SCHED_DEBUG(2, "Tasting %s.", pp->name);

	do {
		/* do not taste on ourselves */
		if (pp->geom->class == mp)
			break;

		taste_names = kern_getenv("geom.sched.taste");
		if (taste_names == NULL)
			break;

		l = strlen(pp->name);
		for (s = taste_names; *s &&
		    (s = strstr(s, pp->name)); s++) {
			/* further checks for an exact match */
			if ( (s == taste_names || s[-1] == ' ') &&
			     (s[l] == '\0' || s[l] == ' ') )
				break;
		}
		if (s == NULL || *s == '\0')
			break;
		G_SCHED_DEBUG(0, "Attach device %s match [%s]\n",
		    pp->name, s);

		/* look up the provider name in the list */
		s = kern_getenv("geom.sched.algo");
		if (s == NULL)
			s = "rr";	/* default algorithm */

		gsp = g_gsched_find(s);	/* also get a reference */
		if (gsp == NULL) {
			G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s);
			break;
		}

		/* XXX create with 1 as last argument ? */
		g_sched_create(NULL, mp, pp, gsp, 0);
		g_gsched_unref(gsp);
	} while (0);

	return (NULL);
}
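/*
 * Illustrative usage (not in the original file): the two kenv
 * variables read above can be set in /boot/loader.conf, e.g.
 *
 *	geom.sched.taste="ada0 ada1"
 *	geom.sched.algo="rr"
 *
 * so that the listed providers are picked up at taste time.
 */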
static void
g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy)
{
	struct g_provider *pp;
	struct g_gsched *gsp;
	const char *name;
	int i, nargs;

	g_topology_assert();

	name = gctl_get_asciiparam(req, "algo");
	if (name == NULL) {
		gctl_error(req, "No '%s' argument", "algo");
		return;
	}

	gsp = g_gsched_find(name);	/* also get a reference */
	if (gsp == NULL) {
		gctl_error(req, "Bad algorithm '%s'", name);
		return;
	}

	nargs = g_sched_get_nargs(req);

	/*
	 * Run on the arguments, and break on any error.
	 * We look for a device name, but skip the /dev/ prefix if any.
	 */
	for (i = 0; i < nargs; i++) {
		name = g_sched_argi(req, i);
		if (name == NULL)
			break;
		pp = g_provider_by_name(name);
		if (pp == NULL) {
			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
			gctl_error(req, "Provider %s is invalid.", name);
			break;
		}
		if (g_sched_create(req, mp, pp, gsp, proxy) != 0)
			break;
	}

	g_gsched_unref(gsp);
}
static void
g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp)
{
	struct g_provider *pp;
	struct g_gsched *gsp;
	const char *name;
	int i, nargs;

	g_topology_assert();

	name = gctl_get_asciiparam(req, "algo");
	if (name == NULL) {
		gctl_error(req, "No '%s' argument", "algo");
		return;
	}

	gsp = g_gsched_find(name);	/* also get a reference */
	if (gsp == NULL) {
		gctl_error(req, "Bad algorithm '%s'", name);
		return;
	}

	nargs = g_sched_get_nargs(req);

	/*
	 * Run on the arguments, and break on any error.
	 * We look for a device name, but skip the /dev/ prefix if any.
	 */
	for (i = 0; i < nargs; i++) {
		name = g_sched_argi(req, i);
		if (name == NULL)
			break;
		pp = g_provider_by_name(name);
		if (pp == NULL || pp->geom->class != mp) {
			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
			gctl_error(req, "Provider %s is invalid.", name);
			break;
		}
		if (g_sched_change_algo(req, mp, pp, gsp) != 0)
			break;
	}

	g_gsched_unref(gsp);
}
static struct g_geom *
g_sched_find_geom(struct g_class *mp, const char *name)
{
	struct g_geom *gp;

	LIST_FOREACH(gp, &mp->geom, geom) {
		if (strcmp(gp->name, name) == 0)
			return (gp);
	}
	return (NULL);
}
static void
g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp)
{
	int nargs, *force, error, i;
	struct g_geom *gp;
	const char *name;

	g_topology_assert();

	nargs = g_sched_get_nargs(req);

	force = gctl_get_paraml(req, "force", sizeof(*force));
	if (force == NULL) {
		gctl_error(req, "No 'force' argument");
		return;
	}

	for (i = 0; i < nargs; i++) {
		name = g_sched_argi(req, i);
		if (name == NULL)
			break;

		gp = g_sched_find_geom(mp, name);
		if (gp == NULL) {
			G_SCHED_DEBUG(1, "Device %s is invalid.", name);
			gctl_error(req, "Device %s is invalid.", name);
			break;
		}

		error = g_sched_destroy(gp, *force);
		if (error != 0) {
			gctl_error(req, "Cannot destroy device %s (error=%d).",
			    gp->name, error);
			break;
		}
	}
}
static void
g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb)
{
	uint32_t *version;

	g_topology_assert();

	version = gctl_get_paraml(req, "version", sizeof(*version));
	if (version == NULL) {
		gctl_error(req, "No '%s' argument.", "version");
		return;
	}

	if (*version != G_SCHED_VERSION) {
		gctl_error(req, "Userland and kernel parts are "
		    "out of sync.");
		return;
	}

	if (strcmp(verb, "create") == 0) {
		g_sched_ctl_create(req, mp, 0);
		return;
	} else if (strcmp(verb, "insert") == 0) {
		g_sched_ctl_create(req, mp, 1);
		return;
	} else if (strcmp(verb, "configure") == 0) {
		g_sched_ctl_configure(req, mp);
		return;
	} else if (strcmp(verb, "destroy") == 0) {
		g_sched_ctl_destroy(req, mp);
		return;
	}

	gctl_error(req, "Unknown verb.");
}
static void
g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_sched_softc *sc = gp->softc;
	struct g_gsched *gsp = sc->sc_gsched;

	if (indent == NULL) {	/* plaintext */
		sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--");
	}
	if (gsp != NULL && gsp->gs_dumpconf)
		gsp->gs_dumpconf(sb, indent, gp, cp, pp);
}

DECLARE_GEOM_CLASS(g_sched_class, g_sched);
MODULE_VERSION(geom_sched, 0);