/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2002 Poul-Henning Kamp
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
 * and NAI Labs, the Security Research Division of Network Associates, Inc.
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This source file contains the state-engine which makes things happen in the
 * right order as far as protecting the data on the disk is concerned.
 *
 * The method it employs is roughly as follows:
 *
 * 1) g_bde_start1()
 *    Break the struct bio into multiple work packets, one per zone.
 * 2) g_bde_start2()
 *    Setup the necessary sector buffers and start those read operations
 *    which we can start at this time and put the item on the work-list.
 * 3) g_bde_worker()
 *    Scan the work-list for items which are ready for crypto processing
 *    and call the matching crypto function in g_bde_crypt.c and schedule
 *    any writes needed.  Read operations finish here by releasing the
 *    sector buffers and delivering the original bio request.
 * 4) g_bde_write_done()
 *    Release sector buffers and deliver the original bio request.
 *
 * Because of the C-scope rules, the functions appear in almost exactly the
 * opposite order in this source file.
 *
 * XXX: A switch to the hardware assisted crypto in src/sys/opencrypto will add
 * XXX: additional states to this state-engine.  Since no hardware available
 * XXX: at this time has AES support, implementing this has been postponed
 * XXX: until such time as it would result in a benefit.
 */
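
/*
 * For orientation, a rough sketch of the states involved (the enums live
 * in g_bde.h):
 *
 *	work packet:	SETUP -> WAIT -> FINISH
 *	sector buffer:	JUNK -> IO -> VALID (back to JUNK on error)
 *
 * g_bde_start2() parks a work packet in WAIT; the worker only processes
 * WAIT packets whose sector buffers have left the IO state.
 */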

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/kthread.h>

#include <crypto/rijndael/rijndael-api-fst.h>
#include <crypto/sha2/sha512.h>
#include <geom/geom.h>
#include <geom/bde/g_bde.h>

/*
 * FIXME: This used to call malloc_last_fail which in practice was almost
 * guaranteed to return time_uptime even in the face of severe memory
 * shortage.  As GBDE is the only consumer, the kludge below was added to
 * facilitate the removal with minimal changes.  The code should be fixed
 * to respond to memory pressure (e.g., by using the lowmem eventhandler)
 * instead.
 */
static int
g_bde_malloc_last_fail(void)
{

	return (time_uptime);
}
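
/*
 * One possible shape for that fix, as an untested sketch: register a
 * vm_lowmem eventhandler and purge the skey cache when it fires.  The
 * handler below and the way it finds the softc are hypothetical; only
 * the EVENTHANDLER_REGISTER(vm_lowmem, ...) interface itself is standard.
 *
 *	static void
 *	g_bde_lowmem(void *arg, int flags __unused)
 *	{
 *		struct g_bde_softc *sc = arg;
 *
 *		mtx_lock(&sc->worklist_mutex);
 *		g_bde_purge_sector(sc, 1);	// drop the whole cache
 *		mtx_unlock(&sc->worklist_mutex);
 *	}
 *
 *	At attach time:
 *		EVENTHANDLER_REGISTER(vm_lowmem, g_bde_lowmem, sc,
 *		    EVENTHANDLER_PRI_ANY);
 */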

static void g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp);
static struct g_bde_sector *g_bde_new_sector(struct g_bde_work *wp, u_int len);
static void g_bde_release_keysector(struct g_bde_work *wp);
static struct g_bde_sector *g_bde_get_keysector(struct g_bde_work *wp);
static int g_bde_start_read(struct g_bde_sector *sp);
static void g_bde_purge_sector(struct g_bde_softc *sc, int fraction);

/*
 * Work item allocation.
 *
 * C++ would call these constructors and destructors.
 */
static u_int g_bde_nwork;
SYSCTL_UINT(_debug, OID_AUTO, gbde_nwork, CTLFLAG_RD, &g_bde_nwork, 0, "");
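
/*
 * This and the gbde_nsect/gbde_ncache counters below are read-only
 * debugging knobs, visible from userland with e.g.:
 *
 *	sysctl debug.gbde_nwork debug.gbde_nsect debug.gbde_ncache
 */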

static MALLOC_DEFINE(M_GBDE, "gbde", "GBDE data structures");

static struct g_bde_work *
g_bde_new_work(struct g_bde_softc *sc)
{
	struct g_bde_work *wp;

	wp = malloc(sizeof *wp, M_GBDE, M_NOWAIT | M_ZERO);
	if (wp == NULL)
		return (wp);
	wp->state = SETUP;
	wp->softc = sc;
	g_bde_nwork++;
	sc->nwork++;
	TAILQ_INSERT_TAIL(&sc->worklist, wp, list);
	return (wp);
}

static void
g_bde_delete_work(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;

	sc = wp->softc;
	g_bde_nwork--;
	sc->nwork--;
	TAILQ_REMOVE(&sc->worklist, wp, list);
	free(wp, M_GBDE);
}

/*
 * Sector buffer allocation.
 *
 * These two functions allocate and free back variable sized sector buffers.
 */

static u_int g_bde_nsect;
SYSCTL_UINT(_debug, OID_AUTO, gbde_nsect, CTLFLAG_RD, &g_bde_nsect, 0, "");

static void
g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
{

	g_bde_nsect--;
	sc->nsect--;
	if (sp->malloc)
		free(sp->data, M_GBDE);
	free(sp, M_GBDE);
}

static struct g_bde_sector *
g_bde_new_sector(struct g_bde_work *wp, u_int len)
{
	struct g_bde_sector *sp;

	sp = malloc(sizeof *sp, M_GBDE, M_NOWAIT | M_ZERO);
	if (sp == NULL)
		return (sp);
	if (len > 0) {
		sp->data = malloc(len, M_GBDE, M_NOWAIT | M_ZERO);
		if (sp->data == NULL) {
			free(sp, M_GBDE);
			return (NULL);
		}
		sp->malloc = 1;
	}
	g_bde_nsect++;
	wp->softc->nsect++;
	sp->size = len;
	sp->softc = wp->softc;
	sp->ref = 1;
	sp->owner = wp;
	sp->offset = wp->so;
	sp->state = JUNK;
	return (sp);
}

/*
 * Skey sector cache.
 *
 * Nothing prevents two separate I/O requests from addressing the same zone
 * and thereby needing the same skey sector.  We therefore need to sequence
 * I/O operations to the skey sectors.  A certain amount of caching is also
 * desirable, although the extent of benefit from this is not at this point
 * fully quantified.
 *
 * XXX: GEOM may be able to grow a generic caching facility at some point
 * XXX: to support such needs.
 */
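
/*
 * To illustrate the sequencing: if two bios touch the same zone, both
 * work packets need that zone's one skey sector.  The first caller of
 * g_bde_get_keysector() allocates the buffer and becomes sp->owner; the
 * second merely bumps sp->ref and must wait until ownership is handed
 * over in g_bde_release_keysector(), so I/O on a given skey sector never
 * overlaps.
 */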

static u_int g_bde_ncache;
SYSCTL_UINT(_debug, OID_AUTO, gbde_ncache, CTLFLAG_RD, &g_bde_ncache, 0, "");

static void
g_bde_purge_one_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
{

	g_trace(G_T_TOPOLOGY, "g_bde_purge_one_sector(%p, %p)", sc, sp);
	if (sp->ref != 0)
		return;
	TAILQ_REMOVE(&sc->freelist, sp, list);
	g_bde_ncache--;
	sc->ncache--;
	bzero(sp->data, sp->size);
	g_bde_delete_sector(sc, sp);
}

static struct g_bde_sector *
g_bde_get_keysector(struct g_bde_work *wp)
{
	struct g_bde_sector *sp;
	struct g_bde_softc *sc;
	off_t offset;

	offset = wp->kso;
	g_trace(G_T_TOPOLOGY, "g_bde_get_keysector(%p, %jd)", wp, (intmax_t)offset);
	sc = wp->softc;

	if (g_bde_malloc_last_fail() < g_bde_ncache)
		g_bde_purge_sector(sc, -1);

	sp = TAILQ_FIRST(&sc->freelist);
	if (sp != NULL && sp->ref == 0 && sp->used + 300 < time_uptime)
		g_bde_purge_one_sector(sc, sp);

	TAILQ_FOREACH(sp, &sc->freelist, list) {
		if (sp->offset == offset)
			break;
	}
	if (sp != NULL) {
		sp->ref++;
		KASSERT(sp->offset == offset, ("wrong offset"));
		KASSERT(sp->softc == wp->softc, ("wrong softc"));
		if (sp->ref == 1)
			sp->owner = wp;
	} else {
		if (g_bde_malloc_last_fail() < g_bde_ncache) {
			TAILQ_FOREACH(sp, &sc->freelist, list)
				if (sp->ref == 0)
					break;
		}
		if (sp == NULL && !TAILQ_EMPTY(&sc->freelist))
			sp = TAILQ_FIRST(&sc->freelist);
		if (sp != NULL && sp->ref > 0)
			sp = NULL;
		if (sp == NULL) {
			sp = g_bde_new_sector(wp, sc->sectorsize);
			if (sp != NULL) {
				g_bde_ncache++;
				sc->ncache++;
				TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
				sp->malloc = 2;
			}
		}
		if (sp != NULL) {
			sp->offset = offset;
			sp->softc = wp->softc;
			sp->ref = 1;
			sp->owner = wp;
			sp->state = JUNK;
			sp->error = 0;
		}
	}
	if (sp != NULL) {
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
		sp->used = time_uptime;
	}
	wp->ksp = sp;
	return (sp);
}

static void
g_bde_release_keysector(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp2;
	struct g_bde_sector *sp;

	sp = wp->ksp;
	g_trace(G_T_TOPOLOGY, "g_bde_release_keysector(%p)", sp);
	KASSERT(sp->malloc == 2, ("Wrong sector released"));
	sc = sp->softc;
	KASSERT(sc != NULL, ("NULL sp->softc"));
	KASSERT(wp == sp->owner, ("Releasing, not owner"));
	sp->owner = NULL;
	wp->ksp = NULL;
	sp->ref--;
	if (sp->ref > 0) {
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
		TAILQ_FOREACH(wp2, &sc->worklist, list) {
			if (wp2->ksp == sp) {
				KASSERT(wp2 != wp, ("Self-reowning"));
				sp->owner = wp2;
				wakeup(sp->softc);
				break;
			}
		}
		KASSERT(wp2 != NULL, ("Failed to pick up owner for %p\n", sp));
	} else if (sp->error != 0) {
		sp->offset = ~0;
		sp->error = 0;
		sp->state = JUNK;
	} else {
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_HEAD(&sc->freelist, sp, list);
	}
}

static void
g_bde_purge_sector(struct g_bde_softc *sc, int fraction)
{
	struct g_bde_sector *sp;
	int n;

	g_trace(G_T_TOPOLOGY, "g_bde_purge_sector(%p)", sc);
	if (fraction > 0)
		n = sc->ncache / fraction + 1;
	else
		n = g_bde_ncache - g_bde_malloc_last_fail();
	if (n < 0)
		return;
	if (n > sc->ncache)
		n = sc->ncache;
	while (n--) {
		TAILQ_FOREACH(sp, &sc->freelist, list) {
			if (sp->ref != 0)
				continue;
			TAILQ_REMOVE(&sc->freelist, sp, list);
			g_bde_ncache--;
			sc->ncache--;
			bzero(sp->data, sp->size);
			g_bde_delete_sector(sc, sp);
			break;
		}
	}
}

static struct g_bde_sector *
g_bde_read_keysector(struct g_bde_softc *sc, struct g_bde_work *wp)
{
	struct g_bde_sector *sp;

	g_trace(G_T_TOPOLOGY, "g_bde_read_keysector(%p)", wp);
	sp = g_bde_get_keysector(wp);
	if (sp == NULL) {
		g_bde_purge_sector(sc, -1);
		sp = g_bde_get_keysector(wp);
	}
	if (sp == NULL)
		return (sp);
	if (sp->owner != wp)
		return (sp);
	if (sp->state == VALID)
		return (sp);
	if (g_bde_start_read(sp) == 0)
		return (sp);
	g_bde_release_keysector(wp);
	return (NULL);
}

/*
 * Contribute to the completion of the original bio request.
 *
 * We have no simple way to tell how many bits the original bio request has
 * been segmented into, so the easiest way to determine when we can deliver
 * it is to keep track of the number of bytes we have completed.  We keep
 * track of any errors underway and latch onto the first one.
 *
 * We always report "nothing done" in case of error, because random bits here
 * and there may be completed and returning a number of completed bytes does
 * not convey any useful information about which bytes they were.  If some
 * piece of broken code somewhere interprets this to mean that nothing has
 * changed on the underlying media, they deserve the lossage headed for them.
 *
 * A single mutex per g_bde instance is used to prevent contention.
 */
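
/*
 * Worked example, with made-up numbers: a 16384 byte bio split into four
 * 4096 byte work packets is delivered once the contributions sum to
 * bio_length.  If the third packet fails with EIO, bio_error latches EIO,
 * the remaining contributions still count up to bio_length, and at
 * delivery time bio_completed is reset to zero so the caller sees
 * "nothing done".
 */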

static void
g_bde_contribute(struct bio *bp, off_t bytes, int error)
{

	g_trace(G_T_TOPOLOGY, "g_bde_contribute bp %p bytes %jd error %d",
	    bp, (intmax_t)bytes, error);
	if (bp->bio_error == 0)
		bp->bio_error = error;
	bp->bio_completed += bytes;
	KASSERT(bp->bio_completed <= bp->bio_length, ("Too large contribution"));
	if (bp->bio_completed == bp->bio_length) {
		if (bp->bio_error != 0)
			bp->bio_completed = 0;
		g_io_deliver(bp, bp->bio_error);
	}
}

/*
 * This is the common case "we're done with this work package" function.
 */

static void
g_bde_work_done(struct g_bde_work *wp, int error)
{

	g_bde_contribute(wp->bp, wp->length, error);
	if (wp->sp != NULL)
		g_bde_delete_sector(wp->softc, wp->sp);
	if (wp->ksp != NULL)
		g_bde_release_keysector(wp);
	g_bde_delete_work(wp);
}

/*
 * A write operation has finished.  When we have all expected cows in the
 * barn, close the door and call it a day.
 */
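
/*
 * Concretely: a BIO_WRITE spawns two lower writes, the encrypted data
 * sector (wp->sp) and the updated key sector (wp->ksp), and the work
 * packet is finished only when wp->sp has been deleted and wp->ksp has
 * reached VALID.  A BIO_DELETE expects just the one write of wp->sp.
 */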

static void
g_bde_write_done(struct bio *bp)
{
	struct g_bde_sector *sp;
	struct g_bde_work *wp;
	struct g_bde_softc *sc;

	sp = bp->bio_caller1;
	sc = bp->bio_caller2;
	mtx_lock(&sc->worklist_mutex);
	KASSERT(sp != NULL, ("NULL sp"));
	KASSERT(sc != NULL, ("NULL sc"));
	KASSERT(sp->owner != NULL, ("NULL sp->owner"));
	g_trace(G_T_TOPOLOGY, "g_bde_write_done(%p)", sp);
	if (bp->bio_error == 0 && bp->bio_completed != sp->size)
		bp->bio_error = EIO;
	sp->error = bp->bio_error;
	g_destroy_bio(bp);
	wp = sp->owner;
	if (wp->error == 0)
		wp->error = sp->error;

	if (wp->bp->bio_cmd == BIO_DELETE) {
		KASSERT(sp == wp->sp, ("trashed delete op"));
		g_bde_work_done(wp, wp->error);
		mtx_unlock(&sc->worklist_mutex);
		return;
	}

	KASSERT(wp->bp->bio_cmd == BIO_WRITE, ("Confused in g_bde_write_done()"));
	KASSERT(sp == wp->sp || sp == wp->ksp, ("trashed write op"));
	if (sp == wp->sp) {
		g_bde_delete_sector(sc, wp->sp);
		wp->sp = NULL;
	} else {
		sp->state = VALID;
	}
	if (wp->sp == NULL && wp->ksp != NULL && wp->ksp->state == VALID)
		g_bde_work_done(wp, wp->error);
	mtx_unlock(&sc->worklist_mutex);
	return;
}

/*
 * Send a write request for the given sector down the pipeline.
 */

static int
g_bde_start_write(struct g_bde_sector *sp)
{
	struct bio *bp;
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_start_write(%p)", sp);
	sc = sp->softc;
	KASSERT(sc != NULL, ("NULL sc in g_bde_start_write"));
	KASSERT(sp->owner != NULL, ("NULL sp->owner in g_bde_start_write"));
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);
	bp->bio_cmd = BIO_WRITE;
	bp->bio_offset = sp->offset;
	bp->bio_data = sp->data;
	bp->bio_length = sp->size;
	bp->bio_done = g_bde_write_done;
	bp->bio_caller1 = sp;
	bp->bio_caller2 = sc;
	sp->state = IO;
	g_io_request(bp, sc->consumer);
	return (0);
}

/*
 * A read operation has finished.  Mark the sector no longer iobusy and
 * wake up the worker thread and let it do its thing.
 */

static void
g_bde_read_done(struct bio *bp)
{
	struct g_bde_sector *sp;
	struct g_bde_softc *sc;

	sp = bp->bio_caller1;
	g_trace(G_T_TOPOLOGY, "g_bde_read_done(%p)", sp);
	sc = bp->bio_caller2;
	mtx_lock(&sc->worklist_mutex);
	if (bp->bio_error == 0 && bp->bio_completed != sp->size)
		bp->bio_error = EIO;
	sp->error = bp->bio_error;
	if (sp->error == 0)
		sp->state = VALID;
	else
		sp->state = JUNK;
	wakeup(sc);
	g_destroy_bio(bp);
	mtx_unlock(&sc->worklist_mutex);
}

/*
 * Send a read request for the given sector down the pipeline.
 */

static int
g_bde_start_read(struct g_bde_sector *sp)
{
	struct bio *bp;
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_start_read(%p)", sp);
	sc = sp->softc;
	KASSERT(sc != NULL, ("Null softc in sp %p", sp));
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);
	bp->bio_cmd = BIO_READ;
	bp->bio_offset = sp->offset;
	bp->bio_data = sp->data;
	bp->bio_length = sp->size;
	bp->bio_done = g_bde_read_done;
	bp->bio_caller1 = sp;
	bp->bio_caller2 = sc;
	sp->state = IO;
	g_io_request(bp, sc->consumer);
	return (0);
}

/*
 * The up/down path of GEOM is not allowed to sleep or do any major work
 * so we use this thread to do the actual crypto operations and to push
 * the state engine onwards.
 *
 * XXX: if we switch to the src/sys/opencrypto hardware assisted encryption,
 * XXX: using a thread here is probably not needed.
 */
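
/*
 * The hand-off protocol, in brief: everything that lets a work packet or
 * sector buffer make progress (g_bde_start2(), g_bde_read_done(),
 * g_bde_release_keysector()) calls wakeup(sc) with the worklist mutex
 * held, and the worker sleeps in
 *
 *	msleep(sc, &sc->worklist_mutex, PRIBIO, "-", hz);
 *
 * so it rescans the list on every wakeup.  The one second timeout doubles
 * as the clock for aging out the skey cache (see the EWOULDBLOCK case
 * below).
 */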

void
g_bde_worker(void *arg)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp, *twp;
	struct g_geom *gp;
	int restart, error;

	gp = arg;
	sc = gp->softc;

	mtx_lock(&sc->worklist_mutex);
	for (;;) {
		restart = 0;
		g_trace(G_T_TOPOLOGY, "g_bde_worker scan");
		TAILQ_FOREACH_SAFE(wp, &sc->worklist, list, twp) {
			KASSERT(wp != NULL, ("NULL wp"));
			KASSERT(wp->softc != NULL, ("NULL wp->softc"));
			if (wp->state != WAIT)
				continue;	/* Not interesting here */

			KASSERT(wp->bp != NULL, ("NULL wp->bp"));
			KASSERT(wp->sp != NULL, ("NULL wp->sp"));

			if (wp->ksp != NULL) {
				if (wp->ksp->owner != wp)
					continue;
				if (wp->ksp->state == IO)
					continue;
				KASSERT(wp->ksp->state == VALID,
				    ("Illegal sector state (%d)",
				    wp->ksp->state));
			}

			if (wp->bp->bio_cmd == BIO_READ && wp->sp->state == IO)
				continue;

			if (wp->ksp != NULL && wp->ksp->error != 0) {
				g_bde_work_done(wp, wp->ksp->error);
				continue;
			}
			switch (wp->bp->bio_cmd) {
			case BIO_READ:
				if (wp->ksp == NULL) {
					KASSERT(wp->error != 0,
					    ("BIO_READ, no ksp and no error"));
					g_bde_work_done(wp, wp->error);
					break;
				}
				if (wp->sp->error != 0) {
					g_bde_work_done(wp, wp->sp->error);
					break;
				}
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_read(wp);
				mtx_lock(&sc->worklist_mutex);
				restart++;
				g_bde_work_done(wp, wp->sp->error);
				break;
			case BIO_WRITE:
				wp->state = FINISH;
				KASSERT(wp->sp->owner == wp,
				    ("Write not owner sp"));
				KASSERT(wp->ksp->owner == wp,
				    ("Write not owner ksp"));
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_write(wp);
				mtx_lock(&sc->worklist_mutex);
				restart++;
				error = g_bde_start_write(wp->sp);
				if (error) {
					g_bde_work_done(wp, error);
					break;
				}
				error = g_bde_start_write(wp->ksp);
				if (wp->error == 0)
					wp->error = error;
				break;
			case BIO_DELETE:
				wp->state = FINISH;
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_delete(wp);
				mtx_lock(&sc->worklist_mutex);
				restart++;
				g_bde_start_write(wp->sp);
				break;
			}
			if (restart)
				break;
		}
		if (!restart) {
			/*
			 * We don't look for our death-warrant until we are
			 * idle.  Shouldn't make a difference in practice.
			 */
			if (sc->dead)
				break;
			g_trace(G_T_TOPOLOGY, "g_bde_worker sleep");
			error = msleep(sc, &sc->worklist_mutex,
			    PRIBIO, "-", hz);
			if (error == EWOULDBLOCK) {
				/*
				 * Lose our skey cache in an orderly fashion.
				 * The exact rate can be tuned to be less
				 * aggressive if this is desirable.  10% per
				 * second means that the cache is gone in a
				 * few minutes.
				 */
				g_bde_purge_sector(sc, 10);
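				/*
				 * (Rough arithmetic on that rate: dropping
				 * ncache / 10 + 1 sectors each second leaves
				 * about 0.9^t of the cache after t seconds,
				 * so under 1% survives past ~44 seconds; the
				 * "+1" drains the last few to zero.)
				 */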
			}
		}
	}
	g_trace(G_T_TOPOLOGY, "g_bde_worker die");
	g_bde_purge_sector(sc, 1);
	KASSERT(sc->nwork == 0, ("Dead but %d work remaining", sc->nwork));
	KASSERT(sc->ncache == 0, ("Dead but %d cache remaining", sc->ncache));
	KASSERT(sc->nsect == 0, ("Dead but %d sect remaining", sc->nsect));
	mtx_unlock(&sc->worklist_mutex);
	sc->dead = 2;
	wakeup(sc);
	kproc_exit(0);
}

/*
 * g_bde_start1() has chopped the incoming request up so all the requests
 * we see here are inside a single zone.  Map the data and key locations,
 * grab the buffers we need and fire off the first volley of read requests.
 */

static void
g_bde_start2(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;

	KASSERT(wp != NULL, ("NULL wp in g_bde_start2"));
	KASSERT(wp->softc != NULL, ("NULL wp->softc"));
	g_trace(G_T_TOPOLOGY, "g_bde_start2(%p)", wp);
	sc = wp->softc;
	switch (wp->bp->bio_cmd) {
	case BIO_READ:
		wp->sp = g_bde_new_sector(wp, 0);
		if (wp->sp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		wp->sp->size = wp->length;
		wp->sp->data = wp->data;
		if (g_bde_start_read(wp->sp) != 0) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		g_bde_read_keysector(sc, wp);
		if (wp->ksp == NULL)
			wp->error = ENOMEM;
		break;
	case BIO_DELETE:
		wp->sp = g_bde_new_sector(wp, wp->length);
		if (wp->sp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		break;
	case BIO_WRITE:
		wp->sp = g_bde_new_sector(wp, wp->length);
		if (wp->sp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		g_bde_read_keysector(sc, wp);
		if (wp->ksp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		break;
	default:
		KASSERT(0 == 1,
		    ("Wrong bio_cmd %d in g_bde_start2", wp->bp->bio_cmd));
	}

	wp->state = WAIT;
	wakeup(sc);
}

/*
 * Create a sequence of work structures, and have g_bde_map_sector() determine
 * how long each can be.  Feed them to g_bde_start2().
 */
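
/*
 * For example, with hypothetical sizes: if a zone holds Z bytes of
 * payload, a bio starting Z/2 bytes before a zone boundary is split into
 * one work packet for the first Z/2 bytes and one (or more) for the
 * rest; g_bde_map_sector() is what clips wp->length to the current zone.
 */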

void
g_bde_start1(struct bio *bp)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp;
	off_t done;

	sc = bp->bio_to->geom->softc;
	bp->bio_driver1 = sc;

	mtx_lock(&sc->worklist_mutex);
	for (done = 0; done < bp->bio_length; ) {
		wp = g_bde_new_work(sc);
		if (wp != NULL) {
			wp->bp = bp;
			wp->offset = bp->bio_offset + done;
			wp->data = bp->bio_data + done;
			wp->length = bp->bio_length - done;
			g_bde_map_sector(wp);
			done += wp->length;
			g_bde_start2(wp);
		}
		if (wp == NULL || bp->bio_error != 0) {
			g_bde_contribute(bp, bp->bio_length - done, ENOMEM);
			break;
		}
	}
	mtx_unlock(&sc->worklist_mutex);
	return;
}