sys/geom/sched/g_sched.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2009-2010 Fabio Checconi
   5  * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  */
  29
  30 /*
  31  * $Id$
  32  * $FreeBSD$
  33  *
  34  * Main control module for geom-based disk schedulers ('sched').
  35  *
  36  * USER VIEW
  37  * A 'sched' node is typically inserted transparently between
  38  * an existing provider pp and its original geom gp
  39  *
  40  *      [pp --> gp  ..]
  41  *
  42  * using the command "geom sched insert <provider>" and
  43  * resulting in the following topology
  44  *
  45  *      [pp --> sched_gp --> cp]   [new_pp --> gp ... ]
  46  *
  47  * Deletion "geom sched destroy <provider>.sched." restores the
  48  * original chain. The normal "geom sched create <provide>"
  49  * is also supported.
  50  *
  51  * INTERNALS
  52  * Internally, the 'sched' uses the following data structures
  53  *
  54  *   geom{}         g_sched_softc{}      g_gsched{}
  55  * +----------+    +---------------+   +-------------+
  56  * |  softc *-|--->| sc_gsched   *-|-->|  gs_init    |
  57  * |  ...     |    |               |   |  gs_fini    |
  58  * |          |    | [ hash table] |   |  gs_start   |
  59  * +----------+    |               |   |  ...        |
  60  *                 |               |   +-------------+
  61  *                 |               |
  62  *                 |               |     g_*_softc{}
  63  *                 |               |   +-------------+
  64  *                 | sc_data     *-|-->|             |
  65  *                 +---------------+   |  algorithm- |
  66  *                                     |  specific   |
  67  *                                     +-------------+
  68  *
  69  * A g_sched_softc{} is created with a "geom sched insert" call.
  70  * In turn this instantiates a specific scheduling algorithm,
  71  * which sets sc_gsched to point to the algorithm callbacks,
  72  * and calls gs_init() to create the g_*_softc{} .
  73  * The other callbacks (gs_start, gs_next, ...) are invoked
  74  * as needed
  75  *
  76  * g_sched_softc{} is defined in g_sched.h and mostly used here;
  77  * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h;
  78  * g_*_softc{} is defined/implemented by each algorithm (gs_*.c)
  79  *
  80  * DATA MOVING
  81  * When a bio is received on the provider, it goes to the
  82  * g_sched_start() which calls gs_start() to initially queue it;
  83  * then we call g_sched_dispatch() that loops around gs_next()
  84  * to select zero or more bio's to be sent downstream.
  85  *
  86  * g_sched_dispatch() can also be called as a result of a timeout,
  87  * e.g. when doing anticipation or pacing requests.
  88  *
  89  * When a bio comes back, it goes to g_sched_done() which in turn
  90  * calls gs_done(). The latter does any necessary housekeeping in
  91  * the scheduling algorithm, and may decide to call g_sched_dispatch()
  92  * to send more bio's downstream.
  93  *
  94  * If an algorithm needs per-flow queues, these are created
  95  * calling gs_init_class() and destroyed with gs_fini_class(),
  96  * and they are also inserted in the hash table implemented in
  97  * the g_sched_softc{}
  98  *
  99  * If an algorithm is replaced, or a transparently-inserted node is
 100  * removed with "geom sched destroy", we need to remove all references
 101  * to the g_*_softc{} and g_sched_softc from the bio's still in
 102  * the scheduler. g_sched_forced_dispatch() helps doing this.
 103  * XXX need to explain better.
 104  */
 105
 106 #include <sys/cdefs.h>
 107 #include <sys/param.h>
 108 #include <sys/systm.h>
 109 #include <sys/kernel.h>
 110 #include <sys/module.h>
 111 #include <sys/lock.h>
 112 #include <sys/mutex.h>
 113 #include <sys/bio.h>
 114 #include <sys/limits.h>
 115 #include <sys/hash.h>
 116 #include <sys/sbuf.h>
 117 #include <sys/sysctl.h>
 118 #include <sys/malloc.h>
 119 #include <sys/proc.h>           /* we access curthread */
 120 #include <geom/geom.h>
 121 #include "gs_scheduler.h"
 122 #include "g_sched.h"            /* geom hooks */
 123
 124 /*
 125  * Size of the per-geom hash table storing traffic classes.
 126  * We may decide to change it at a later time, it has no ABI
 127  * implications as it is only used for run-time allocations.
 128  */
 129 #define G_SCHED_HASH_SIZE       32
 130
 131 static int g_sched_destroy(struct g_geom *gp, boolean_t force);
 132 static int g_sched_destroy_geom(struct gctl_req *req,
 133     struct g_class *mp, struct g_geom *gp);
 134 static void g_sched_config(struct gctl_req *req, struct g_class *mp,
 135     const char *verb);
 136 static struct g_geom *g_sched_taste(struct g_class *mp,
 137     struct g_provider *pp, int flags __unused);
 138 static void g_sched_dumpconf(struct sbuf *sb, const char *indent,
 139     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
 140 static void g_sched_init(struct g_class *mp);
 141 static void g_sched_fini(struct g_class *mp);
 142 static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data,
 143     int fflag, struct thread *td);
 144
 145 struct g_class g_sched_class = {
 146         .name = G_SCHED_CLASS_NAME,
 147         .version = G_VERSION,
 148         .ctlreq = g_sched_config,
 149         .taste = g_sched_taste,
 150         .destroy_geom = g_sched_destroy_geom,
 151         .init = g_sched_init,
 152         .ioctl = g_sched_ioctl,
 153         .fini = g_sched_fini
 154 };
 155
 156 MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures");
 157
 158 /*
 159  * Global variables describing the state of the geom_sched module.
 160  * There is only one static instance of this structure.
 161  */
 162 LIST_HEAD(gs_list, g_gsched);   /* type, link field */
 163 struct geom_sched_vars {
 164         struct mtx      gs_mtx;
 165         struct gs_list  gs_scheds;      /* list of algorithms */
 166         u_int           gs_debug;
 167         u_int           gs_sched_count; /* how many algorithms ? */
 168         u_int           gs_patched;     /* g_io_request was patched */
 169
 170         u_int           gs_initialized;
 171         u_int           gs_expire_secs; /* expiration of hash entries */
 172
 173         struct bio_queue_head gs_pending;
 174         u_int           gs_npending;
 175
 176         /* The following are for stats, usually protected by gs_mtx. */
 177         u_long          gs_requests;    /* total requests */
 178         u_long          gs_done;        /* total done */
 179         u_int           gs_in_flight;   /* requests in flight */
 180         u_int           gs_writes_in_flight;
 181         u_int           gs_bytes_in_flight;
 182         u_int           gs_write_bytes_in_flight;
 183
 184         char            gs_names[256];  /* names of schedulers */
 185 };
 186
 187 static struct geom_sched_vars me = {
 188         .gs_expire_secs = 10,
 189 };
 190
 191 SYSCTL_DECL(_kern_geom);
 192 SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
 193     "GEOM_SCHED stuff");
 194
 195 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD,
 196     &me.gs_write_bytes_in_flight, 0, "Write bytes in flight");
 197
 198 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD,
 199     &me.gs_bytes_in_flight, 0, "Bytes in flight");
 200
 201 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD,
 202     &me.gs_writes_in_flight, 0, "Write Requests in flight");
 203
 204 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD,
 205     &me.gs_in_flight, 0, "Requests in flight");
 206
 207 SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD,
 208     &me.gs_done, 0, "Total done");
 209
 210 SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD,
 211     &me.gs_requests, 0, "Total requests");
 212
 213 SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD,
 214     &me.gs_names, 0, "Algorithm names");
 215
 216 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD,
 217     &me.gs_sched_count, 0, "Number of algorithms");
 218
 219 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW,
 220     &me.gs_debug, 0, "Debug level");
 221
 222 SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW,
 223     &me.gs_expire_secs, 0, "Expire time in seconds");
 224
 225 /*
 226  * g_sched calls the scheduler algorithms with this lock held.
 227  * The locking functions are exposed so the scheduler algorithms can also
 228  * protect themselves e.g. when running a callout handler.
 229  */
 230 void
 231 g_sched_lock(struct g_geom *gp)
 232 {
 233         struct g_sched_softc *sc = gp->softc;
 234
 235         mtx_lock(&sc->sc_mtx);
 236 }
 237
 238 void
 239 g_sched_unlock(struct g_geom *gp)
 240 {
 241         struct g_sched_softc *sc = gp->softc;
 242
 243         mtx_unlock(&sc->sc_mtx);
 244 }
 245
 246 /*
 247  * Support functions to handle references to the module,
 248  * which are coming from devices using this scheduler.
 249  */
 250 static inline void
 251 g_gsched_ref(struct g_gsched *gsp)
 252 {
 253
 254         atomic_add_int(&gsp->gs_refs, 1);
 255 }
 256
 257 static inline void
 258 g_gsched_unref(struct g_gsched *gsp)
 259 {
 260
 261         atomic_add_int(&gsp->gs_refs, -1);
 262 }
 263
 264 /*
 265  * Update the stats when this request is done.
 266  */
 267 static void
 268 g_sched_update_stats(struct bio *bio)
 269 {
 270
 271         me.gs_done++;
 272         me.gs_in_flight--;
 273         me.gs_bytes_in_flight -= bio->bio_length;
 274         if (bio->bio_cmd == BIO_WRITE) {
 275                 me.gs_writes_in_flight--;
 276                 me.gs_write_bytes_in_flight -= bio->bio_length;
 277         }
 278 }
 279
 280 /*
 281  * Dispatch any pending request.
 282  */
 283 static void
 284 g_sched_forced_dispatch(struct g_geom *gp)
 285 {
 286         struct g_sched_softc *sc = gp->softc;
 287         struct g_gsched *gsp = sc->sc_gsched;
 288         struct bio *bp;
 289
 290         KASSERT(mtx_owned(&sc->sc_mtx),
 291             ("sc_mtx not owned during forced dispatch"));
 292
 293         while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL)
 294                 g_io_request(bp, LIST_FIRST(&gp->consumer));
 295 }
 296
 297 /*
 298  * The main dispatch loop, called either here after the start
 299  * routine, or by scheduling algorithms when they receive a timeout
 300  * or a 'done' notification.  Does not share code with the forced
 301  * dispatch path, since the gs_done() callback can call us.
 302  */
 303 void
 304 g_sched_dispatch(struct g_geom *gp)
 305 {
 306         struct g_sched_softc *sc = gp->softc;
 307         struct g_gsched *gsp = sc->sc_gsched;
 308         struct bio *bp;
 309
 310         KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch"));
 311
 312         if ((sc->sc_flags & G_SCHED_FLUSHING))
 313                 return;
 314
 315         while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL)
 316                 g_io_request(bp, LIST_FIRST(&gp->consumer));
 317 }
 318
 319 /*
 320  * Recent (8.0 and above) versions of FreeBSD have support to
 321  * register classifiers of disk requests. The classifier is
 322  * invoked by g_io_request(), and stores the information into
 323  * bp->bio_classifier1.
 324  *
 325  * Support for older versions, which is left here only for
 326  * documentation purposes, relies on two hacks:
 327  * 1. classification info is written into the bio_caller1
 328  *    field of the topmost node in the bio chain. This field
 329  *    is rarely used, but this module is incompatible with
 330  *    those that use bio_caller1 for other purposes,
 331  *    such as ZFS and gjournal;
 332  * 2. g_io_request() is patched in-memory when the module is
 333  *    loaded, so that the function calls a classifier as its
 334  *    first thing. g_io_request() is restored when the module
 335  *    is unloaded. This functionality is only supported for
 336  *    x86 and amd64, other architectures need source code changes.
 337  */
 338
 339 /*
 340  * Lookup the identity of the issuer of the original request.
 341  * In the current implementation we use the curthread of the
 342  * issuer, but different mechanisms may be implemented later
 343  * so we do not make assumptions on the return value which for
 344  * us is just an opaque identifier.
 345  */
 346
 347 static inline u_long
 348 g_sched_classify(struct bio *bp)
 349 {
 350
 351         /* we have classifier fields in the struct bio */
 352         return ((u_long)bp->bio_classifier1);
 353 }
 354
 355 /* Return the hash chain for the given key. */
 356 static inline struct g_hash *
 357 g_sched_hash(struct g_sched_softc *sc, u_long key)
 358 {
 359
 360         return (&sc->sc_hash[key & sc->sc_mask]);
 361 }
 362
 363 /*
 364  * Helper function for the children classes, which takes
 365  * a geom and a bio and returns the private descriptor
 366  * associated to the request.  This involves fetching
 367  * the classification field and [al]locating the
 368  * corresponding entry in the hash table.
 369  */
 370 void *
 371 g_sched_get_class(struct g_geom *gp, struct bio *bp)
 372 {
 373         struct g_sched_softc *sc;
 374         struct g_sched_class *gsc;
 375         struct g_gsched *gsp;
 376         struct g_hash *bucket;
 377         u_long key;
 378
 379         sc = gp->softc;
 380         key = g_sched_classify(bp);
 381         bucket = g_sched_hash(sc, key);
 382         LIST_FOREACH(gsc, bucket, gsc_clist) {
 383                 if (key == gsc->gsc_key) {
 384                         gsc->gsc_refs++;
 385                         return (gsc->gsc_priv);
 386                 }
 387         }
 388
 389         gsp = sc->sc_gsched;
 390         gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size,
 391             M_GEOM_SCHED, M_NOWAIT | M_ZERO);
 392         if (!gsc)
 393                 return (NULL);
 394
 395         if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
 396                 free(gsc, M_GEOM_SCHED);
 397                 return (NULL);
 398         }
 399
 400         gsc->gsc_refs = 2;      /* 1 for the hash table, 1 for the caller. */
 401         gsc->gsc_key = key;
 402         LIST_INSERT_HEAD(bucket, gsc, gsc_clist);
 403
 404         gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
 405
 406         return (gsc->gsc_priv);
 407 }
 408
 409 /*
 410  * Release a reference to the per-client descriptor,
 411  */
 412 void
 413 g_sched_put_class(struct g_geom *gp, void *priv)
 414 {
 415         struct g_sched_class *gsc;
 416         struct g_sched_softc *sc;
 417
 418         gsc = g_sched_priv2class(priv);
 419         gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
 420
 421         if (--gsc->gsc_refs > 0)
 422                 return;
 423
 424         sc = gp->softc;
 425         sc->sc_gsched->gs_fini_class(sc->sc_data, priv);
 426
 427         LIST_REMOVE(gsc, gsc_clist);
 428         free(gsc, M_GEOM_SCHED);
 429 }
 430
 431 static void
 432 g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
 433     struct g_gsched *gsp, void *data)
 434 {
 435         struct g_sched_class *cp, *cp2;
 436         int i;
 437
 438         if (!hp)
 439                 return;
 440
 441         if (data && gsp->gs_hash_unref)
 442                 gsp->gs_hash_unref(data);
 443
 444         for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
 445                 LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
 446                         g_sched_put_class(gp, cp->gsc_priv);
 447         }
 448
 449         hashdestroy(hp, M_GEOM_SCHED, mask);
 450 }
 451
 452 static struct g_hash *
 453 g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags)
 454 {
 455         struct g_hash *hash;
 456
 457         if (gsp->gs_priv_size == 0)
 458                 return (NULL);
 459
 460         hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags);
 461
 462         return (hash);
 463 }
 464
 465 static void
 466 g_sched_flush_classes(struct g_geom *gp)
 467 {
 468         struct g_sched_softc *sc;
 469         struct g_sched_class *cp, *cp2;
 470         int i;
 471
 472         sc = gp->softc;
 473
 474         if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0)
 475                 return;
 476
 477         for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
 478                 LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) {
 479                         if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0)
 480                                 g_sched_put_class(gp, cp->gsc_priv);
 481                 }
 482         }
 483
 484         sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz;
 485 }
 486
 487 /*
 488  * Wait for the completion of any outstanding request.  To ensure
 489  * that this does not take forever the caller has to make sure that
 490  * no new request enter the scehduler before calling us.
 491  *
 492  * Must be called with the gp mutex held and topology locked.
 493  */
 494 static int
 495 g_sched_wait_pending(struct g_geom *gp)
 496 {
 497         struct g_sched_softc *sc = gp->softc;
 498         int endticks = ticks + hz;
 499
 500         g_topology_assert();
 501
 502         while (sc->sc_pending && endticks - ticks >= 0)
 503                 msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);
 504
 505         return (sc->sc_pending ? ETIMEDOUT : 0);
 506 }
 507
 508 static int
 509 g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp)
 510 {
 511         struct g_sched_softc *sc = gp->softc;
 512         int error;
 513
 514         /* Set the flushing flag: new bios will not enter the scheduler. */
 515         sc->sc_flags |= G_SCHED_FLUSHING;
 516
 517         g_sched_forced_dispatch(gp);
 518         error = g_sched_wait_pending(gp);
 519         if (error)
 520                 goto failed;
 521
 522         /* No more requests pending or in flight from the old gsp. */
 523
 524         g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data);
 525         sc->sc_hash = NULL;
 526
 527         /*
 528          * Avoid deadlock here by releasing the gp mutex and reacquiring
 529          * it once done.  It should be safe, since no reconfiguration or
 530          * destruction can take place due to the geom topology lock; no
 531          * new request can use the current sc_data since we flagged the
 532          * geom as being flushed.
 533          */
 534         g_sched_unlock(gp);
 535         gsp->gs_fini(sc->sc_data);
 536         g_sched_lock(gp);
 537
 538         sc->sc_gsched = NULL;
 539         sc->sc_data = NULL;
 540         g_gsched_unref(gsp);
 541
 542 failed:
 543         sc->sc_flags &= ~G_SCHED_FLUSHING;
 544
 545         return (error);
 546 }
 547
 548 static int
 549 g_sched_remove(struct g_geom *gp, struct g_gsched *gsp)
 550 {
 551         int error;
 552
 553         g_sched_lock(gp);
 554         error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */
 555         g_sched_unlock(gp);
 556
 557         return (error);
 558 }
 559
 560 /*
 561  * Support function for create/taste -- locate the desired
 562  * algorithm and grab a reference to it.
 563  */
 564 static struct g_gsched *
 565 g_gsched_find(const char *name)
 566 {
 567         struct g_gsched *gsp = NULL;
 568
 569         mtx_lock(&me.gs_mtx);
 570         LIST_FOREACH(gsp, &me.gs_scheds, glist) {
 571                 if (strcmp(name, gsp->gs_name) == 0) {
 572                         g_gsched_ref(gsp);
 573                         break;
 574                 }
 575         }
 576         mtx_unlock(&me.gs_mtx);
 577
 578         return (gsp);
 579 }
 580
 581 /*
 582  * Rebuild the list of scheduler names.
 583  * To be called with me.gs_mtx lock held.
 584  */
 585 static void
 586 g_gsched_build_names(struct g_gsched *gsp)
 587 {
 588         int pos, l;
 589         struct g_gsched *cur;
 590
 591         pos = 0;
 592         LIST_FOREACH(cur, &me.gs_scheds, glist) {
 593                 l = strlen(cur->gs_name);
 594                 if (l + pos + 1 + 1 < sizeof(me.gs_names)) {
 595                         if (pos != 0)
 596                                 me.gs_names[pos++] = ' ';
 597                         strcpy(me.gs_names + pos, cur->gs_name);
 598                         pos += l;
 599                 }
 600         }
 601         me.gs_names[pos] = '\0';
 602 }
 603
 604 /*
 605  * Register or unregister individual scheduling algorithms.
 606  */
 607 static int
 608 g_gsched_register(struct g_gsched *gsp)
 609 {
 610         struct g_gsched *cur;
 611         int error = 0;
 612
 613         mtx_lock(&me.gs_mtx);
 614         LIST_FOREACH(cur, &me.gs_scheds, glist) {
 615                 if (strcmp(gsp->gs_name, cur->gs_name) == 0)
 616                         break;
 617         }
 618         if (cur != NULL) {
 619                 G_SCHED_DEBUG(0, "A scheduler named %s already"
 620                     "exists.", gsp->gs_name);
 621                 error = EEXIST;
 622         } else {
 623                 LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist);
 624                 gsp->gs_refs = 1;
 625                 me.gs_sched_count++;
 626                 g_gsched_build_names(gsp);
 627         }
 628         mtx_unlock(&me.gs_mtx);
 629
 630         return (error);
 631 }
 632
 633 struct g_gsched_unregparm {
 634         struct g_gsched *gup_gsp;
 635         int             gup_error;
 636 };
 637
 638 static void
 639 g_gsched_unregister(void *arg, int flag)
 640 {
 641         struct g_gsched_unregparm *parm = arg;
 642         struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp;
 643         struct g_sched_softc *sc;
 644         struct g_geom *gp, *gp_tmp;
 645         int error;
 646
 647         parm->gup_error = 0;
 648
 649         g_topology_assert();
 650
 651         if (flag == EV_CANCEL)
 652                 return;
 653
 654         mtx_lock(&me.gs_mtx);
 655
 656         LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) {
 657                 if (gp->class != &g_sched_class)
 658                         continue;       /* Should not happen. */
 659
 660                 sc = gp->softc;
 661                 if (sc->sc_gsched == gsp) {
 662                         error = g_sched_remove(gp, gsp);
 663                         if (error)
 664                                 goto failed;
 665                 }
 666         }
 667
 668         LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) {
 669                 if (cur != gsp)
 670                         continue;
 671
 672                 if (gsp->gs_refs != 1) {
 673                         G_SCHED_DEBUG(0, "%s still in use.",
 674                             gsp->gs_name);
 675                         parm->gup_error = EBUSY;
 676                 } else {
 677                         LIST_REMOVE(gsp, glist);
 678                         me.gs_sched_count--;
 679                         g_gsched_build_names(gsp);
 680                 }
 681                 break;
 682         }
 683
 684         if (cur == NULL) {
 685                 G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name);
 686                 parm->gup_error = ENOENT;
 687         }
 688
 689 failed:
 690         mtx_unlock(&me.gs_mtx);
 691 }
 692
 693 static inline void
 694 g_gsched_global_init(void)
 695 {
 696
 697         if (!me.gs_initialized) {
 698                 G_SCHED_DEBUG(0, "Initializing global data.");
 699                 mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF);
 700                 LIST_INIT(&me.gs_scheds);
 701                 bioq_init(&me.gs_pending);
 702                 me.gs_initialized = 1;
 703         }
 704 }
 705
 706 /*
 707  * Module event called when a scheduling algorithm module is loaded or
 708  * unloaded.
 709  */
 710 int
 711 g_gsched_modevent(module_t mod, int cmd, void *arg)
 712 {
 713         struct g_gsched *gsp = arg;
 714         struct g_gsched_unregparm parm;
 715         int error;
 716
 717         G_SCHED_DEBUG(0, "Modevent %d.", cmd);
 718
 719         /*
 720          * If the module is loaded at boot, the geom thread that calls
 721          * g_sched_init() might actually run after g_gsched_modevent(),
 722          * so make sure that the module is properly initialized.
 723          */
 724         g_gsched_global_init();
 725
 726         error = EOPNOTSUPP;
 727         switch (cmd) {
 728         case MOD_LOAD:
 729                 error = g_gsched_register(gsp);
 730                 G_SCHED_DEBUG(0, "Loaded module %s error %d.",
 731                     gsp->gs_name, error);
 732                 if (error == 0)
 733                         g_retaste(&g_sched_class);
 734                 break;
 735
 736         case MOD_UNLOAD:
 737                 parm.gup_gsp = gsp;
 738                 parm.gup_error = 0;
 739
 740                 error = g_waitfor_event(g_gsched_unregister,
 741                     &parm, M_WAITOK, NULL);
 742                 if (error == 0)
 743                         error = parm.gup_error;
 744                 G_SCHED_DEBUG(0, "Unloaded module %s error %d.",
 745                     gsp->gs_name, error);
 746                 break;
 747         }
 748
 749         return (error);
 750 }
 751
 752 #ifdef KTR
 753 #define TRC_BIO_EVENT(e, bp)    g_sched_trace_bio_ ## e (bp)
 754
 755 static inline char
 756 g_sched_type(struct bio *bp)
 757 {
 758
 759         if (bp->bio_cmd == BIO_READ)
 760                 return ('R');
 761         else if (bp->bio_cmd == BIO_WRITE)
 762                 return ('W');
 763         return ('U');
 764 }
 765
 766 static inline void
 767 g_sched_trace_bio_START(struct bio *bp)
 768 {
 769
 770         CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp),
 771             g_sched_type(bp), bp->bio_offset / ULONG_MAX,
 772             bp->bio_offset, bp->bio_length);
 773 }
 774
 775 static inline void
 776 g_sched_trace_bio_DONE(struct bio *bp)
 777 {
 778
 779         CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp),
 780             g_sched_type(bp), bp->bio_offset / ULONG_MAX,
 781             bp->bio_offset, bp->bio_length);
 782 }
 783 #else /* !KTR */
 784 #define TRC_BIO_EVENT(e, bp)
 785 #endif /* !KTR */
 786
 787 /*
 788  * g_sched_done() and g_sched_start() dispatch the geom requests to
 789  * the scheduling algorithm in use.
 790  */
 791 static void
 792 g_sched_done(struct bio *bio)
 793 {
 794         struct g_geom *gp = bio->bio_caller2;
 795         struct g_sched_softc *sc = gp->softc;
 796
 797         TRC_BIO_EVENT(DONE, bio);
 798
 799         KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done"));
 800
 801         g_sched_lock(gp);
 802
 803         g_sched_update_stats(bio);
 804         sc->sc_gsched->gs_done(sc->sc_data, bio);
 805         if (!--sc->sc_pending)
 806                 wakeup(gp);
 807
 808         g_sched_flush_classes(gp);
 809         g_sched_unlock(gp);
 810
 811         g_std_done(bio);
 812 }
 813
 814 static void
 815 g_sched_start(struct bio *bp)
 816 {
 817         struct g_geom *gp = bp->bio_to->geom;
 818         struct g_sched_softc *sc = gp->softc;
 819         struct bio *cbp;
 820
 821         TRC_BIO_EVENT(START, bp);
 822         G_SCHED_LOGREQ(bp, "Request received.");
 823
 824         cbp = g_clone_bio(bp);
 825         if (cbp == NULL) {
 826                 g_io_deliver(bp, ENOMEM);
 827                 return;
 828         }
 829         cbp->bio_done = g_sched_done;
 830         cbp->bio_to = LIST_FIRST(&gp->provider);
 831         KASSERT(cbp->bio_to != NULL, ("NULL provider"));
 832
 833         /* We only schedule reads and writes. */
 834         if (bp->bio_cmd != BIO_READ && bp->bio_cmd != BIO_WRITE)
 835                 goto bypass;
 836
 837         G_SCHED_LOGREQ(cbp, "Sending request.");
 838
 839         g_sched_lock(gp);
 840         /*
 841          * Call the algorithm's gs_start to queue the request in the
 842          * scheduler. If gs_start fails then pass the request down,
 843          * otherwise call g_sched_dispatch() which tries to push
 844          * one or more requests down.
 845          */
 846         if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) ||
 847             sc->sc_gsched->gs_start(sc->sc_data, cbp)) {
 848                 g_sched_unlock(gp);
 849                 goto bypass;
 850         }
 851         /*
 852          * We use bio_caller1 to mark requests that are scheduled
 853          * so make sure it is not NULL.
 854          */
 855         if (cbp->bio_caller1 == NULL)
 856                 cbp->bio_caller1 = &me; /* anything not NULL */
 857
 858         cbp->bio_caller2 = gp;
 859         sc->sc_pending++;
 860
 861         /* Update general stats. */
 862         me.gs_in_flight++;
 863         me.gs_requests++;
 864         me.gs_bytes_in_flight += bp->bio_length;
 865         if (bp->bio_cmd == BIO_WRITE) {
 866                 me.gs_writes_in_flight++;
 867                 me.gs_write_bytes_in_flight += bp->bio_length;
 868         }
 869         g_sched_dispatch(gp);
 870         g_sched_unlock(gp);
 871         return;
 872
 873 bypass:
 874         cbp->bio_done = g_std_done;
 875         cbp->bio_caller1 = NULL; /* not scheduled */
 876         g_io_request(cbp, LIST_FIRST(&gp->consumer));
 877 }
 878
 879 /*
 880  * The next few functions are the geom glue.
 881  */
 882 static void
 883 g_sched_orphan(struct g_consumer *cp)
 884 {
 885
 886         g_topology_assert();
 887         g_sched_destroy(cp->geom, 1);
 888 }
 889
 890 static int
 891 g_sched_access(struct g_provider *pp, int dr, int dw, int de)
 892 {
 893         struct g_geom *gp;
 894         struct g_consumer *cp;
 895         int error;
 896
 897         gp = pp->geom;
 898         cp = LIST_FIRST(&gp->consumer);
 899         error = g_access(cp, dr, dw, de);
 900
 901         return (error);
 902 }
 903
 904 static void
 905 g_sched_temporary_start(struct bio *bio)
 906 {
 907
 908         mtx_lock(&me.gs_mtx);
 909         me.gs_npending++;
 910         bioq_disksort(&me.gs_pending, bio);
 911         mtx_unlock(&me.gs_mtx);
 912 }
 913
 914 static void
 915 g_sched_flush_pending(g_start_t *start)
 916 {
 917         struct bio *bp;
 918
 919         while ((bp = bioq_takefirst(&me.gs_pending)))
 920                 start(bp);
 921 }
 922
 923 static int
 924 g_insert_proxy(struct g_geom *gp, struct g_provider *newpp,
 925     struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp)
 926 {
 927         struct g_sched_softc *sc = gp->softc;
 928         g_start_t *saved_start, *flush = g_sched_start;
 929         int error = 0, endticks = ticks + hz;
 930
 931         g_cancel_event(newpp);  /* prevent taste() */
 932         /* copy private fields */
 933         newpp->private = pp->private;
 934         newpp->index = pp->index;
 935
 936         /* Queue all the early requests coming for us. */
 937         me.gs_npending = 0;
 938         saved_start = pp->geom->start;
 939         dstgp->start = g_sched_temporary_start;
 940
 941         while (pp->nstart - pp->nend != me.gs_npending &&
 942             endticks - ticks >= 0)
 943                 tsleep(pp, PRIBIO, "-", hz/10);
 944
 945         if (pp->nstart - pp->nend != me.gs_npending) {
 946                 flush = saved_start;
 947                 error = ETIMEDOUT;
 948                 goto fail;
 949         }
 950
 951         /* link pp to this geom */
 952         LIST_REMOVE(pp, provider);
 953         pp->geom = gp;
 954         LIST_INSERT_HEAD(&gp->provider, pp, provider);
 955
 956         /*
 957          * replicate the counts from the parent in the
 958          * new provider and consumer nodes
 959          */
 960         cp->acr = newpp->acr = pp->acr;
 961         cp->acw = newpp->acw = pp->acw;
 962         cp->ace = newpp->ace = pp->ace;
 963         sc->sc_flags |= G_SCHED_PROXYING;
 964
 965 fail:
 966         dstgp->start = saved_start;
 967
 968         g_sched_flush_pending(flush);
 969
 970         return (error);
 971 }
 972
 973 /*
 974  * Create a geom node for the device passed as *pp.
 975  * If successful, add a reference to this gsp.
 976  */
 977 static int
 978 g_sched_create(struct gctl_req *req, struct g_class *mp,
 979     struct g_provider *pp, struct g_gsched *gsp, int proxy)
 980 {
 981         struct g_sched_softc *sc = NULL;
 982         struct g_geom *gp, *dstgp;
 983         struct g_provider *newpp = NULL;
 984         struct g_consumer *cp = NULL;
 985         char name[64];
 986         int error;
 987
 988         g_topology_assert();
 989
 990         snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX);
 991         LIST_FOREACH(gp, &mp->geom, geom) {
 992                 if (strcmp(gp->name, name) == 0) {
 993                         gctl_error(req, "Geom %s already exists.",
 994                             name);
 995                         return (EEXIST);
 996                 }
 997         }
 998
 999         gp = g_new_geomf(mp, "%s", name);
1000         dstgp = proxy ? pp->geom : gp; /* where do we link the provider */
1001
1002         sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
1003         sc->sc_gsched = gsp;
1004         sc->sc_data = gsp->gs_init(gp);
1005         if (sc->sc_data == NULL) {
1006                 error = ENOMEM;
1007                 goto fail;
1008         }
1009
1010         sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK);
1011
1012         /*
1013          * Do not initialize the flush mechanism, will be initialized
1014          * on the first insertion on the hash table.
1015          */
1016
1017         mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF);
1018
1019         gp->softc = sc;
1020         gp->start = g_sched_start;
1021         gp->orphan = g_sched_orphan;
1022         gp->access = g_sched_access;
1023         gp->dumpconf = g_sched_dumpconf;
1024
1025         newpp = g_new_providerf(dstgp, "%s", gp->name);
1026         newpp->mediasize = pp->mediasize;
1027         newpp->sectorsize = pp->sectorsize;
1028
1029         cp = g_new_consumer(gp);
1030         error = g_attach(cp, proxy ? newpp : pp);
1031         if (error != 0) {
1032                 gctl_error(req, "Cannot attach to provider %s.",
1033                     pp->name);
1034                 goto fail;
1035         }
1036
1037         g_error_provider(newpp, 0);
1038         if (proxy) {
1039                 error = g_insert_proxy(gp, newpp, dstgp, pp, cp);
1040                 if (error)
1041                         goto fail;
1042         }
1043         G_SCHED_DEBUG(0, "Device %s created.", gp->name);
1044
1045         g_gsched_ref(gsp);
1046
1047         return (0);
1048
1049 fail:
1050         if (cp != NULL) {
1051                 if (cp->provider != NULL)
1052                         g_detach(cp);
1053                 g_destroy_consumer(cp);
1054         }
1055         if (newpp != NULL)
1056                 g_destroy_provider(newpp);
1057         if (sc->sc_hash)
1058                 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1059                     gsp, sc->sc_data);
1060         if (sc->sc_data)
1061                 gsp->gs_fini(sc->sc_data);
1062         g_free(gp->softc);
1063         g_destroy_geom(gp);
1064
1065         return (error);
1066 }
1067
1068 /*
1069  * Support for dynamic switching of scheduling algorithms.
1070  * First initialize the data structures for the new algorithm,
1071  * then call g_sched_remove_locked() to flush all references
1072  * to the old one, finally link the new algorithm.
1073  */
1074 static int
1075 g_sched_change_algo(struct gctl_req *req, struct g_class *mp,
1076     struct g_provider *pp, struct g_gsched *gsp)
1077 {
1078         struct g_sched_softc *sc;
1079         struct g_geom *gp;
1080         struct g_hash *newh;
1081         void *data;
1082         u_long mask;
1083         int error = 0;
1084
1085         gp = pp->geom;
1086         sc = gp->softc;
1087
1088         data = gsp->gs_init(gp);
1089         if (data == NULL)
1090                 return (ENOMEM);
1091
1092         newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK);
1093         if (gsp->gs_priv_size && !newh) {
1094                 error = ENOMEM;
1095                 goto fail;
1096         }
1097
1098         g_sched_lock(gp);
1099         if (sc->sc_gsched) {    /* can be NULL in some cases */
1100                 error = g_sched_remove_locked(gp, sc->sc_gsched);
1101                 if (error)
1102                         goto fail;
1103         }
1104
1105         g_gsched_ref(gsp);
1106         sc->sc_gsched = gsp;
1107         sc->sc_data = data;
1108         sc->sc_hash = newh;
1109         sc->sc_mask = mask;
1110
1111         g_sched_unlock(gp);
1112
1113         return (0);
1114
1115 fail:
1116         if (newh)
1117                 g_sched_hash_fini(gp, newh, mask, gsp, data);
1118
1119         if (data)
1120                 gsp->gs_fini(data);
1121
1122         g_sched_unlock(gp);
1123
1124         return (error);
1125 }
1126
1127 /*
1128  * Stop the request flow directed to the proxy, redirecting the new
1129  * requests to the me.gs_pending queue.
1130  */
1131 static struct g_provider *
1132 g_detach_proxy(struct g_geom *gp)
1133 {
1134         struct g_consumer *cp;
1135         struct g_provider *pp, *newpp;
1136
1137         do {
1138                 pp = LIST_FIRST(&gp->provider);
1139                 if (pp == NULL)
1140                         break;
1141                 cp = LIST_FIRST(&gp->consumer);
1142                 if (cp == NULL)
1143                         break;
1144                 newpp = cp->provider;
1145                 if (newpp == NULL)
1146                         break;
1147
1148                 me.gs_npending = 0;
1149                 pp->geom->start = g_sched_temporary_start;
1150
1151                 return (pp);
1152         } while (0);
1153         printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name);
1154
1155         return (NULL);
1156 }
1157
1158 static void
1159 g_sched_blackhole(struct bio *bp)
1160 {
1161
1162         g_io_deliver(bp, ENXIO);
1163 }
1164
1165 static inline void
1166 g_reparent_provider(struct g_provider *pp, struct g_geom *gp,
1167     struct g_provider *newpp)
1168 {
1169
1170         LIST_REMOVE(pp, provider);
1171         if (newpp) {
1172                 pp->private = newpp->private;
1173                 pp->index = newpp->index;
1174         }
1175         pp->geom = gp;
1176         LIST_INSERT_HEAD(&gp->provider, pp, provider);
1177 }
1178
1179 static inline void
1180 g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp)
1181 {
1182         struct g_geom *gp = oldpp->geom;
1183
1184         g_reparent_provider(oldpp, newpp->geom, newpp);
1185
1186         /*
1187          * Hackish: let the system destroy the old provider for us, just
1188          * in case someone attached a consumer to it, in which case a
1189          * direct call to g_destroy_provider() would not work.
1190          */
1191         g_reparent_provider(newpp, gp, NULL);
1192 }
1193
1194 /*
1195  * Complete the proxy destruction, linking the old provider to its
1196  * original geom, and destroying the proxy provider.  Also take care
1197  * of issuing the pending requests collected in me.gs_pending (if any).
1198  */
1199 static int
1200 g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp)
1201 {
1202         struct g_consumer *cp;
1203         struct g_provider *newpp;
1204
1205         do {
1206                 cp = LIST_FIRST(&gp->consumer);
1207                 if (cp == NULL)
1208                         break;
1209                 newpp = cp->provider;
1210                 if (newpp == NULL)
1211                         break;
1212
1213                 /* Relink the provider to its original geom. */
1214                 g_unproxy_provider(oldpp, newpp);
1215
1216                 /* Detach consumer from provider, and destroy provider. */
1217                 cp->acr = newpp->acr = 0;
1218                 cp->acw = newpp->acw = 0;
1219                 cp->ace = newpp->ace = 0;
1220                 g_detach(cp);
1221
1222                 /* Send the pending bios through the right start function. */
1223                 g_sched_flush_pending(oldpp->geom->start);
1224
1225                 return (0);
1226         } while (0);
1227         printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name);
1228
1229         /* We cannot send the pending bios anywhere... */
1230         g_sched_flush_pending(g_sched_blackhole);
1231
1232         return (EINVAL);
1233 }
1234
1235 static int
1236 g_sched_destroy(struct g_geom *gp, boolean_t force)
1237 {
1238         struct g_provider *pp, *oldpp = NULL;
1239         struct g_sched_softc *sc;
1240         struct g_gsched *gsp;
1241         int error;
1242
1243         g_topology_assert();
1244         sc = gp->softc;
1245         if (sc == NULL)
1246                 return (ENXIO);
1247         if (!(sc->sc_flags & G_SCHED_PROXYING)) {
1248                 pp = LIST_FIRST(&gp->provider);
1249                 if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
1250                         const char *msg = force ?
1251                                 "but we force removal" : "cannot remove";
1252
1253                         G_SCHED_DEBUG(!force,
1254                             "Device %s is still open (r%dw%de%d), %s.",
1255                             pp->name, pp->acr, pp->acw, pp->ace, msg);
1256                         if (!force)
1257                                 return (EBUSY);
1258                 } else {
1259                         G_SCHED_DEBUG(0, "Device %s removed.", gp->name);
1260                 }
1261         } else
1262                 oldpp = g_detach_proxy(gp);
1263
1264         gsp = sc->sc_gsched;
1265         if (gsp) {
1266                 /*
1267                  * XXX bad hack here: force a dispatch to release
1268                  * any reference to the hash table still held by
1269                  * the scheduler.
1270                  */
1271                 g_sched_lock(gp);
1272                 /*
1273                  * We are dying here, no new requests should enter
1274                  * the scheduler.  This is granted by the topolgy,
1275                  * either in case we were proxying (new bios are
1276                  * being redirected) or not (see the access check
1277                  * above).
1278                  */
1279                 g_sched_forced_dispatch(gp);
1280                 error = g_sched_wait_pending(gp);
1281
1282                 if (error) {
1283                         /*
1284                          * Not all the requests came home: this might happen
1285                          * under heavy load, or if we were waiting for any
1286                          * bio which is served in the event path (see
1287                          * geom_slice.c for an example of how this can
1288                          * happen).  Try to restore a working configuration
1289                          * if we can fail.
1290                          */
1291                         if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1292                                 g_sched_flush_pending(force ?
1293                                     g_sched_blackhole : g_sched_start);
1294                         }
1295
1296                         /*
1297                          * In the forced destroy case there is not so much
1298                          * we can do, we have pending bios that will call
1299                          * g_sched_done() somehow, and we don't want them
1300                          * to crash the system using freed memory.  We tell
1301                          * the user that something went wrong, and leak some
1302                          * memory here.
1303                          * Note: the callers using force = 1 ignore the
1304                          * return value.
1305                          */
1306                         if (force) {
1307                                 G_SCHED_DEBUG(0, "Pending requests while "
1308                                     " destroying geom, some memory leaked.");
1309                         }
1310
1311                         return (error);
1312                 }
1313
1314                 g_sched_unlock(gp);
1315                 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1316                     gsp, sc->sc_data);
1317                 sc->sc_hash = NULL;
1318                 gsp->gs_fini(sc->sc_data);
1319                 g_gsched_unref(gsp);
1320                 sc->sc_gsched = NULL;
1321         } else
1322                 error = 0;
1323
1324         if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1325                 error = g_destroy_proxy(gp, oldpp);
1326
1327                 if (error) {
1328                         if (force) {
1329                                 G_SCHED_DEBUG(0, "Unrecoverable error while "
1330                                     "destroying a proxy geom, leaking some "
1331                                     " memory.");
1332                         }
1333
1334                         return (error);
1335                 }
1336         }
1337
1338         mtx_destroy(&sc->sc_mtx);
1339
1340         g_free(gp->softc);
1341         gp->softc = NULL;
1342         g_wither_geom(gp, ENXIO);
1343
1344         return (error);
1345 }
1346
1347 static int
1348 g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp,
1349     struct g_geom *gp)
1350 {
1351
1352         return (g_sched_destroy(gp, 0));
1353 }
1354
1355 /*
1356  * Functions related to the classification of requests.
1357  *
1358  * On recent FreeBSD versions (8.0 and above), we store a reference
1359  * to the issuer of a request in bp->bio_classifier1 as soon
1360  * as the bio is posted to the geom queue (and not later, because
1361  * requests are managed by the g_down thread afterwards).
1362  */
1363
1364 /*
1365  * Classifier support for recent FreeBSD versions: we use
1366  * a very simple classifier, only use curthread to tag a request.
1367  * The classifier is registered at module load, and unregistered
1368  * at module unload.
1369  */
1370 static int
1371 g_sched_tag(void *arg, struct bio *bp)
1372 {
1373
1374         bp->bio_classifier1 = curthread;
1375         return (1);
1376 }
1377
1378 static struct g_classifier_hook g_sched_classifier = {
1379         .func = g_sched_tag,
1380 };
1381
1382 static inline void
1383 g_classifier_ini(void)
1384 {
1385
1386         g_register_classifier(&g_sched_classifier);
1387 }
1388
1389 static inline void
1390 g_classifier_fini(void)
1391 {
1392
1393         g_unregister_classifier(&g_sched_classifier);
1394 }
1395
1396 static void
1397 g_sched_init(struct g_class *mp)
1398 {
1399
1400         g_gsched_global_init();
1401
1402         G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.",
1403             mp, &g_sched_class);
1404
1405         /* Patch g_io_request to store classification info in the bio. */
1406         g_classifier_ini();
1407 }
1408
1409 static void
1410 g_sched_fini(struct g_class *mp)
1411 {
1412
1413         g_classifier_fini();
1414
1415         G_SCHED_DEBUG(0, "Unloading...");
1416
1417         KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers"));
1418         mtx_destroy(&me.gs_mtx);
1419 }
1420
1421 static int
1422 g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag,
1423     struct thread *td)
1424 {
1425         struct g_consumer *cp;
1426         struct g_geom *gp;
1427
1428         cp = LIST_FIRST(&pp->geom->consumer);
1429         if (cp == NULL)
1430                 return (ENOIOCTL);
1431         gp = cp->provider->geom;
1432         if (gp->ioctl == NULL)
1433                 return (ENOIOCTL);
1434         return (gp->ioctl(cp->provider, cmd, data, fflag, td));
1435 }
1436
1437 /*
1438  * Read the i-th argument for a request, skipping the /dev/
1439  * prefix if present.
1440  */
1441 static const char *
1442 g_sched_argi(struct gctl_req *req, int i)
1443 {
1444         static const char *dev_prefix = "/dev/";
1445         const char *name;
1446         char param[16];
1447         int l = strlen(dev_prefix);
1448
1449         snprintf(param, sizeof(param), "arg%d", i);
1450         name = gctl_get_asciiparam(req, param);
1451         if (name == NULL)
1452                 gctl_error(req, "No 'arg%d' argument", i);
1453         else if (strncmp(name, dev_prefix, l) == 0)
1454                 name += l;
1455         return (name);
1456 }
1457
1458 /*
1459  * Fetch nargs and do appropriate checks.
1460  */
1461 static int
1462 g_sched_get_nargs(struct gctl_req *req)
1463 {
1464         int *nargs;
1465
1466         nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1467         if (nargs == NULL) {
1468                 gctl_error(req, "No 'nargs' argument");
1469                 return (0);
1470         }
1471         if (*nargs <= 0)
1472                 gctl_error(req, "Missing device(s).");
1473         return (*nargs);
1474 }
1475
1476 /*
1477  * Check whether we should add the class on certain volumes when
1478  * this geom is created. Right now this is under control of a kenv
1479  * variable containing the names of all devices that we care about.
1480  * Probably we should only support transparent insertion as the
1481  * preferred mode of operation.
1482  */
1483 static struct g_geom *
1484 g_sched_taste(struct g_class *mp, struct g_provider *pp,
1485                 int flags __unused)
1486 {
1487         struct g_gsched *gsp = NULL;    /* the . algorithm we want */
1488         const char *s;                  /* generic string pointer */
1489         const char *taste_names;        /* devices we like */
1490         int l;
1491
1492         g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__,
1493             mp->name, pp->name);
1494         g_topology_assert();
1495
1496         G_SCHED_DEBUG(2, "Tasting %s.", pp->name);
1497
1498         do {
1499                 /* do not taste on ourselves */
1500                 if (pp->geom->class == mp)
1501                         break;
1502
1503                 taste_names = kern_getenv("geom.sched.taste");
1504                 if (taste_names == NULL)
1505                         break;
1506
1507                 l = strlen(pp->name);
1508                 for (s = taste_names; *s &&
1509                     (s = strstr(s, pp->name)); s++) {
1510                         /* further checks for an exact match */
1511                         if ( (s == taste_names || s[-1] == ' ') &&
1512                              (s[l] == '\0' || s[l] == ' ') )
1513                                 break;
1514                 }
1515                 if (s == NULL)
1516                         break;
1517                 G_SCHED_DEBUG(0, "Attach device %s match [%s]\n",
1518                     pp->name, s);
1519
1520                 /* look up the provider name in the list */
1521                 s = kern_getenv("geom.sched.algo");
1522                 if (s == NULL)
1523                         s = "rr";
1524
1525                 gsp = g_gsched_find(s); /* also get a reference */
1526                 if (gsp == NULL) {
1527                         G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s);
1528                         break;
1529                 }
1530
1531                 /* XXX create with 1 as last argument ? */
1532                 g_sched_create(NULL, mp, pp, gsp, 0);
1533                 g_gsched_unref(gsp);
1534         } while (0);
1535         return NULL;
1536 }
1537
1538 static void
1539 g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy)
1540 {
1541         struct g_provider *pp;
1542         struct g_gsched *gsp;
1543         const char *name;
1544         int i, nargs;
1545
1546         g_topology_assert();
1547
1548         name = gctl_get_asciiparam(req, "algo");
1549         if (name == NULL) {
1550                 gctl_error(req, "No '%s' argument", "algo");
1551                 return;
1552         }
1553
1554         gsp = g_gsched_find(name);      /* also get a reference */
1555         if (gsp == NULL) {
1556                 gctl_error(req, "Bad algorithm '%s'", name);
1557                 return;
1558         }
1559
1560         nargs = g_sched_get_nargs(req);
1561
1562         /*
1563          * Run on the arguments, and break on any error.
1564          * We look for a device name, but skip the /dev/ prefix if any.
1565          */
1566         for (i = 0; i < nargs; i++) {
1567                 name = g_sched_argi(req, i);
1568                 if (name == NULL)
1569                         break;
1570                 pp = g_provider_by_name(name);
1571                 if (pp == NULL) {
1572                         G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1573                         gctl_error(req, "Provider %s is invalid.", name);
1574                         break;
1575                 }
1576                 if (g_sched_create(req, mp, pp, gsp, proxy) != 0)
1577                         break;
1578         }
1579
1580         g_gsched_unref(gsp);
1581 }
1582
1583 static void
1584 g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp)
1585 {
1586         struct g_provider *pp;
1587         struct g_gsched *gsp;
1588         const char *name;
1589         int i, nargs;
1590
1591         g_topology_assert();
1592
1593         name = gctl_get_asciiparam(req, "algo");
1594         if (name == NULL) {
1595                 gctl_error(req, "No '%s' argument", "algo");
1596                 return;
1597         }
1598
1599         gsp = g_gsched_find(name);      /* also get a reference */
1600         if (gsp == NULL) {
1601                 gctl_error(req, "Bad algorithm '%s'", name);
1602                 return;
1603         }
1604
1605         nargs = g_sched_get_nargs(req);
1606
1607         /*
1608          * Run on the arguments, and break on any error.
1609          * We look for a device name, but skip the /dev/ prefix if any.
1610          */
1611         for (i = 0; i < nargs; i++) {
1612                 name = g_sched_argi(req, i);
1613                 if (name == NULL)
1614                         break;
1615                 pp = g_provider_by_name(name);
1616                 if (pp == NULL || pp->geom->class != mp) {
1617                         G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1618                         gctl_error(req, "Provider %s is invalid.", name);
1619                         break;
1620                 }
1621                 if (g_sched_change_algo(req, mp, pp, gsp) != 0)
1622                         break;
1623         }
1624
1625         g_gsched_unref(gsp);
1626 }
1627
1628 static struct g_geom *
1629 g_sched_find_geom(struct g_class *mp, const char *name)
1630 {
1631         struct g_geom *gp;
1632
1633         LIST_FOREACH(gp, &mp->geom, geom) {
1634                 if (strcmp(gp->name, name) == 0)
1635                         return (gp);
1636         }
1637         return (NULL);
1638 }
1639
1640 static void
1641 g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp)
1642 {
1643         int nargs, *force, error, i;
1644         struct g_geom *gp;
1645         const char *name;
1646
1647         g_topology_assert();
1648
1649         nargs = g_sched_get_nargs(req);
1650
1651         force = gctl_get_paraml(req, "force", sizeof(*force));
1652         if (force == NULL) {
1653                 gctl_error(req, "No 'force' argument");
1654                 return;
1655         }
1656
1657         for (i = 0; i < nargs; i++) {
1658                 name = g_sched_argi(req, i);
1659                 if (name == NULL)
1660                         break;
1661
1662                 gp = g_sched_find_geom(mp, name);
1663                 if (gp == NULL) {
1664                         G_SCHED_DEBUG(1, "Device %s is invalid.", name);
1665                         gctl_error(req, "Device %s is invalid.", name);
1666                         break;
1667                 }
1668
1669                 error = g_sched_destroy(gp, *force);
1670                 if (error != 0) {
1671                         gctl_error(req, "Cannot destroy device %s (error=%d).",
1672                             gp->name, error);
1673                         break;
1674                 }
1675         }
1676 }
1677
1678 static void
1679 g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb)
1680 {
1681         uint32_t *version;
1682
1683         g_topology_assert();
1684
1685         version = gctl_get_paraml(req, "version", sizeof(*version));
1686         if (version == NULL) {
1687                 gctl_error(req, "No '%s' argument.", "version");
1688                 return;
1689         }
1690
1691         if (*version != G_SCHED_VERSION) {
1692                 gctl_error(req, "Userland and kernel parts are "
1693                     "out of sync.");
1694                 return;
1695         }
1696
1697         if (strcmp(verb, "create") == 0) {
1698                 g_sched_ctl_create(req, mp, 0);
1699                 return;
1700         } else if (strcmp(verb, "insert") == 0) {
1701                 g_sched_ctl_create(req, mp, 1);
1702                 return;
1703         } else if (strcmp(verb, "configure") == 0) {
1704                 g_sched_ctl_configure(req, mp);
1705                 return;
1706         } else if (strcmp(verb, "destroy") == 0) {
1707                 g_sched_ctl_destroy(req, mp);
1708                 return;
1709         }
1710
1711         gctl_error(req, "Unknown verb.");
1712 }
1713
1714 static void
1715 g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
1716     struct g_consumer *cp, struct g_provider *pp)
1717 {
1718         struct g_sched_softc *sc = gp->softc;
1719         struct g_gsched *gsp = sc->sc_gsched;
1720         if (indent == NULL) {   /* plaintext */
1721                 sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--");
1722         }
1723         if (gsp != NULL && gsp->gs_dumpconf)
1724                 gsp->gs_dumpconf(sb, indent, gp, cp, pp);
1725 }
1726
1727 DECLARE_GEOM_CLASS(g_sched_class, g_sched);
1728 MODULE_VERSION(geom_sched, 0);