/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2002 Poul-Henning Kamp
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
 * and NAI Labs, the Security Research Division of Network Associates, Inc.
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This source file contains the state-engine which makes things happen in the
 * right order so as to finish GBDE data transactions:
 *
 * 1) g_bde_start1()
 *	Break the struct bio into multiple work packets, one per zone.
 *
 * 2) g_bde_start2()
 *	Setup the necessary sector buffers and start those read operations
 *	which we can start at this time and put the item on the work-list.
 *
 * 3) g_bde_worker()
 *	Scan the work-list for items which are ready for crypto processing
 *	and call the matching crypto function in g_bde_crypt.c and schedule
 *	any writes needed.  Read operations finish here by releasing the
 *	sector buffers and delivering the original bio request.
 *
 * 4) g_bde_write_done()
 *	Release sector buffers and deliver the original bio request.
 *
 * Because of the C-scope rules, the functions are almost perfectly in the
 * opposite order in this source file.
 *
 * XXX: A switch to the hardware assisted crypto in src/sys/opencrypto will add
 * XXX: additional states to this state-engine.  Since no hardware available
 * XXX: at this time has AES support, implementing this has been postponed
 * XXX: until such time as it would result in a benefit.
 */
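/*
 * Life cycle of a work packet, as implemented below: g_bde_start1()
 * creates it, g_bde_start2() launches its first I/O and parks it in
 * state WAIT, g_bde_worker() picks it up once none of its sectors are
 * busy (state IO), and g_bde_work_done() retires it by contributing to
 * the original bio request.
 */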
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/kthread.h>

#include <crypto/rijndael/rijndael-api-fst.h>
#include <crypto/sha2/sha512.h>
#include <geom/geom.h>
#include <geom/bde/g_bde.h>
/*
 * FIXME: This used to call malloc_last_fail which in practice was almost
 * guaranteed to return time_uptime even in the face of severe memory
 * shortage.  As GBDE is the only consumer, the kludge below was added to
 * facilitate the removal with minimal changes.  The code should be fixed
 * to respond to memory pressure (e.g., by using the lowmem eventhandler)
 * instead.
 */
static int
g_bde_malloc_last_fail(void)
{

	return (time_uptime);
}
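/*
 * Note that the callers below compare this value against cache and
 * sector counts, so with the kludge in place the "memory shortage"
 * branches only trigger when a count exceeds the system uptime in
 * seconds, which is to say practically never.
 */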
static void g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp);
static struct g_bde_sector *g_bde_new_sector(struct g_bde_work *wp, u_int len);
static void g_bde_release_keysector(struct g_bde_work *wp);
static struct g_bde_sector *g_bde_get_keysector(struct g_bde_work *wp);
static int g_bde_start_read(struct g_bde_sector *sp);
static void g_bde_purge_sector(struct g_bde_softc *sc, int fraction);
/*
 * Work item allocation.
 *
 * C++ would call these constructors and destructors.
 */
static u_int g_bde_nwork;
SYSCTL_UINT(_debug, OID_AUTO, gbde_nwork, CTLFLAG_RD, &g_bde_nwork, 0, "");

static MALLOC_DEFINE(M_GBDE, "gbde", "GBDE data structures");
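/*
 * All allocations in this file use M_NOWAIT: they happen either with
 * the per-instance worklist mutex held or from the GEOM up/down path,
 * so sleeping for memory is not an option and every caller must check
 * for a NULL return.
 */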
static struct g_bde_work *
g_bde_new_work(struct g_bde_softc *sc)
{
	struct g_bde_work *wp;

	wp = malloc(sizeof *wp, M_GBDE, M_NOWAIT | M_ZERO);
	if (wp == NULL)
		return (wp);
	wp->state = SETUP;
	wp->softc = sc;
	g_bde_nwork++;
	sc->nwork++;
	TAILQ_INSERT_TAIL(&sc->worklist, wp, list);
	return (wp);
}
static void
g_bde_delete_work(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;

	sc = wp->softc;
	g_bde_nwork--;
	sc->nwork--;
	TAILQ_REMOVE(&sc->worklist, wp, list);
	free(wp, M_GBDE);
}
/*
 * Sector buffer allocation.
 *
 * These two functions allocate and free back variable sized sector buffers.
 */

static u_int g_bde_nsect;
SYSCTL_UINT(_debug, OID_AUTO, gbde_nsect, CTLFLAG_RD, &g_bde_nsect, 0, "");
static void
g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
{

	g_trace(G_T_TOPOLOGY, "g_bde_delete_sector(%p)", sp);
	g_bde_nsect--;
	sc->nsect--;
	if (sp->malloc)
		free(sp->data, M_GBDE);
	free(sp, M_GBDE);
}
static struct g_bde_sector *
g_bde_new_sector(struct g_bde_work *wp, u_int len)
{
	struct g_bde_sector *sp;

	sp = malloc(sizeof *sp, M_GBDE, M_NOWAIT | M_ZERO);
	if (sp == NULL)
		return (sp);
	if (len > 0) {
		sp->data = malloc(len, M_GBDE, M_NOWAIT | M_ZERO);
		if (sp->data == NULL) {
			free(sp, M_GBDE);
			return (NULL);
		}
		sp->malloc = 1;
	}
	g_bde_nsect++;
	wp->softc->nsect++;
	sp->size = len;
	sp->softc = wp->softc;
	sp->ref = 1;
	sp->owner = wp;
	sp->state = JUNK;
	return (sp);
}
/*
 * Skey sector cache.
 *
 * Nothing prevents two separate I/O requests from addressing the same zone
 * and thereby needing the same skey sector.  We therefore need to sequence
 * I/O operations to the skey sectors.  A certain amount of caching is also
 * desirable, although the extent of the benefit from this is not at this
 * point known.
 *
 * XXX: GEOM may be able to grow a generic caching facility at some point
 * XXX: to support such needs.
 */

static u_int g_bde_ncache;
SYSCTL_UINT(_debug, OID_AUTO, gbde_ncache, CTLFLAG_RD, &g_bde_ncache, 0, "");
static void
g_bde_purge_one_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
{

	g_trace(G_T_TOPOLOGY, "g_bde_purge_one_sector(%p, %p)", sc, sp);
	if (sp->ref != 0)
		return;
	TAILQ_REMOVE(&sc->freelist, sp, list);
	sc->ncache--;
	g_bde_ncache--;
	bzero(sp->data, sp->size);
	g_bde_delete_sector(sc, sp);
}
static struct g_bde_sector *
g_bde_get_keysector(struct g_bde_work *wp)
{
	struct g_bde_sector *sp;
	struct g_bde_softc *sc;
	off_t offset;

	offset = wp->kso;
	g_trace(G_T_TOPOLOGY, "g_bde_get_keysector(%p, %jd)", wp, (intmax_t)offset);
	sc = wp->softc;

	if (g_bde_malloc_last_fail() < g_bde_ncache)
		g_bde_purge_sector(sc, -1);

	sp = TAILQ_FIRST(&sc->freelist);
	if (sp != NULL && sp->ref == 0 && sp->used + 300 < time_uptime)
		g_bde_purge_one_sector(sc, sp);

	TAILQ_FOREACH(sp, &sc->freelist, list) {
		if (sp->offset == offset)
			break;
	}
	if (sp != NULL) {
		sp->ref++;
		KASSERT(sp->offset == offset, ("wrong offset"));
		KASSERT(sp->softc == wp->softc, ("wrong softc"));
		if (sp->ref == 1)
			sp->owner = wp;
	} else {
		if (g_bde_malloc_last_fail() < g_bde_ncache) {
			TAILQ_FOREACH(sp, &sc->freelist, list)
				if (sp->ref == 0)
					break;
		}
		if (sp == NULL && !TAILQ_EMPTY(&sc->freelist))
			sp = TAILQ_FIRST(&sc->freelist);
		if (sp != NULL && sp->ref > 0)
			sp = NULL;
		if (sp == NULL) {
			sp = g_bde_new_sector(wp, sc->sectorsize);
			if (sp != NULL) {
				g_bde_ncache++;
				sc->ncache++;
				TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
				sp->malloc = 2;
			}
		}
		if (sp != NULL) {
			sp->offset = offset;
			sp->softc = wp->softc;
			sp->ref = 1;
			sp->owner = wp;
			sp->state = JUNK;
			sp->error = 0;
		}
	}
	if (sp != NULL) {
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
		sp->used = time_uptime;
	}
	wp->ksp = sp;
	return (sp);
}
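/*
 * The freelist above doubles as an LRU list: every time a cached key
 * sector is referenced it is moved to the tail and stamped with
 * time_uptime, so the head always holds the coldest entry for the
 * 300 second timeout and the purge functions to reap.
 */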
static void
g_bde_release_keysector(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp2;
	struct g_bde_sector *sp;

	sp = wp->ksp;
	g_trace(G_T_TOPOLOGY, "g_bde_release_keysector(%p)", sp);
	KASSERT(sp->malloc == 2, ("Wrong sector released"));
	sc = sp->softc;
	KASSERT(sc != NULL, ("NULL sp->softc"));
	KASSERT(wp == sp->owner, ("Releasing, not owner"));
	sp->owner = NULL;
	wp->ksp = NULL;
	sp->ref--;
	if (sp->ref > 0) {
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
		TAILQ_FOREACH(wp2, &sc->worklist, list) {
			if (wp2->ksp == sp) {
				KASSERT(wp2 != wp, ("Self-reowning"));
				sp->owner = wp2;
				wakeup(sp->softc);
				break;
			}
		}
		KASSERT(wp2 != NULL, ("Failed to pick up owner for %p\n", sp));
	} else if (sp->error != 0) {
		sp->offset = ~0;
		sp->error = 0;
		sp->state = JUNK;
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_HEAD(&sc->freelist, sp, list);
	}
}
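/*
 * Ownership handoff: several work packets may reference the same cached
 * key sector, but exactly one of them owns it at any time.  When the
 * owner releases a still-referenced sector, the code above promotes the
 * next waiting work packet to owner and wakes the worker so it can make
 * progress.
 */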
static void
g_bde_purge_sector(struct g_bde_softc *sc, int fraction)
{
	struct g_bde_sector *sp;
	int n;

	g_trace(G_T_TOPOLOGY, "g_bde_purge_sector(%p)", sc);
	if (fraction > 0)
		n = sc->ncache / fraction + 1;
	else
		n = g_bde_ncache - g_bde_malloc_last_fail();
	if (n < 0)
		return;
	if (n > sc->ncache)
		n = sc->ncache;
	while (n--) {
		TAILQ_FOREACH(sp, &sc->freelist, list) {
			if (sp->ref != 0)
				continue;
			TAILQ_REMOVE(&sc->freelist, sp, list);
			g_bde_ncache--;
			sc->ncache--;
			bzero(sp->data, sp->size);
			g_bde_delete_sector(sc, sp);
			break;
		}
	}
}
static struct g_bde_sector *
g_bde_read_keysector(struct g_bde_softc *sc, struct g_bde_work *wp)
{
	struct g_bde_sector *sp;

	g_trace(G_T_TOPOLOGY, "g_bde_read_keysector(%p)", wp);
	sp = g_bde_get_keysector(wp);
	if (sp == NULL) {
		g_bde_purge_sector(sc, -1);
		sp = g_bde_get_keysector(wp);
	}
	if (sp == NULL)
		return (sp);
	if (sp->owner != wp)
		return (sp);
	if (sp->state == VALID)
		return (sp);
	if (g_bde_start_read(sp) == 0)
		return (sp);
	g_bde_release_keysector(wp);
	return (NULL);
}
/*
 * Contribute to the completion of the original bio request.
 *
 * We have no simple way to tell how many bits the original bio request has
 * been segmented into, so the easiest way to determine when we can deliver
 * it is to keep track of the number of bytes we have completed.  We keep
 * track of any errors underway and latch onto the first one.
 *
 * We always report "nothing done" in case of error, because random bits here
 * and there may be completed and returning a number of completed bytes does
 * not convey any useful information about which bytes they were.  If some
 * piece of broken code somewhere interprets this to mean that nothing has
 * changed on the underlying media they deserve the lossage headed for them.
 *
 * A single mutex per g_bde instance is used to prevent contention.
 */
static void
g_bde_contribute(struct bio *bp, off_t bytes, int error)
{

	g_trace(G_T_TOPOLOGY, "g_bde_contribute bp %p bytes %jd error %d",
	     bp, (intmax_t)bytes, error);
	if (bp->bio_error == 0)
		bp->bio_error = error;
	bp->bio_completed += bytes;
	KASSERT(bp->bio_completed <= bp->bio_length, ("Too large contribution"));
	if (bp->bio_completed == bp->bio_length) {
		if (bp->bio_error != 0)
			bp->bio_completed = 0;
		g_io_deliver(bp, bp->bio_error);
	}
}
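/*
 * g_bde_contribute() is only ever called with the instance's worklist
 * mutex held, which is what makes the unlocked read-modify-write of
 * bio_completed and bio_error above safe.
 */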
/*
 * This is the common case "we're done with this work package" function.
 * It contributes the work package's bytes to the original bio and frees
 * every resource the work package still holds.
 */
static void
g_bde_work_done(struct g_bde_work *wp, int error)
{

	g_bde_contribute(wp->bp, wp->length, error);
	if (wp->sp != NULL)
		g_bde_delete_sector(wp->softc, wp->sp);
	if (wp->ksp != NULL)
		g_bde_release_keysector(wp);
	g_bde_delete_work(wp);
}
/*
 * A write operation has finished.  When we have all expected cows in the
 * barn close the door and call it a day.
 */

static void
g_bde_write_done(struct bio *bp)
{
	struct g_bde_sector *sp;
	struct g_bde_work *wp;
	struct g_bde_softc *sc;

	sp = bp->bio_caller1;
	sc = bp->bio_caller2;
	mtx_lock(&sc->worklist_mutex);
	KASSERT(sp != NULL, ("NULL sp"));
	KASSERT(sc != NULL, ("NULL sc"));
	KASSERT(sp->owner != NULL, ("NULL sp->owner"));
	g_trace(G_T_TOPOLOGY, "g_bde_write_done(%p)", sp);
	if (bp->bio_error == 0 && bp->bio_completed != sp->size)
		bp->bio_error = EIO;
	sp->error = bp->bio_error;
	g_destroy_bio(bp);
	wp = sp->owner;
	if (wp->error == 0)
		wp->error = sp->error;

	if (wp->bp->bio_cmd == BIO_DELETE) {
		KASSERT(sp == wp->sp, ("trashed delete op"));
		g_bde_work_done(wp, wp->error);
		mtx_unlock(&sc->worklist_mutex);
		return;
	}

	KASSERT(wp->bp->bio_cmd == BIO_WRITE, ("Confused in g_bde_write_done()"));
	KASSERT(sp == wp->sp || sp == wp->ksp, ("trashed write op"));
	if (sp == wp->sp) {
		g_bde_delete_sector(sc, wp->sp);
		wp->sp = NULL;
	} else {
		sp->state = VALID;
	}
	if (wp->sp == NULL && wp->ksp != NULL && wp->ksp->state == VALID)
		g_bde_work_done(wp, wp->error);
	mtx_unlock(&sc->worklist_mutex);
	return;
}
/*
 * Send a write request for the given sector down the pipeline.
 */

static int
g_bde_start_write(struct g_bde_sector *sp)
{
	struct bio *bp;
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_start_write(%p)", sp);
	sc = sp->softc;
	KASSERT(sc != NULL, ("NULL sc in g_bde_start_write"));
	KASSERT(sp->owner != NULL, ("NULL sp->owner in g_bde_start_write"));
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);
	bp->bio_cmd = BIO_WRITE;
	bp->bio_offset = sp->offset;
	bp->bio_data = sp->data;
	bp->bio_length = sp->size;
	bp->bio_done = g_bde_write_done;
	bp->bio_caller1 = sp;
	bp->bio_caller2 = sc;
	sp->state = IO;
	g_io_request(bp, sc->consumer);
	return (0);
}
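/*
 * Both start functions use the same plumbing: the sector and softc are
 * stashed in bio_caller1/bio_caller2 so the done-callback can find its
 * state again, and the sector is marked IO so the worker leaves it
 * alone until the callback clears that state.
 */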
/*
 * A read operation has finished.  Mark the sector no longer iobusy and
 * wake up the worker thread and let it do its thing.
 */

static void
g_bde_read_done(struct bio *bp)
{
	struct g_bde_sector *sp;
	struct g_bde_softc *sc;

	sp = bp->bio_caller1;
	g_trace(G_T_TOPOLOGY, "g_bde_read_done(%p)", sp);
	sc = bp->bio_caller2;
	mtx_lock(&sc->worklist_mutex);
	if (bp->bio_error == 0 && bp->bio_completed != sp->size)
		bp->bio_error = EIO;
	sp->error = bp->bio_error;
	if (sp->error == 0)
		sp->state = VALID;
	else
		sp->state = JUNK;
	wakeup(sc);
	g_destroy_bio(bp);
	mtx_unlock(&sc->worklist_mutex);
}
/*
 * Send a read request for the given sector down the pipeline.
 */

static int
g_bde_start_read(struct g_bde_sector *sp)
{
	struct bio *bp;
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_start_read(%p)", sp);
	sc = sp->softc;
	KASSERT(sc != NULL, ("Null softc in sp %p", sp));
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);
	bp->bio_cmd = BIO_READ;
	bp->bio_offset = sp->offset;
	bp->bio_data = sp->data;
	bp->bio_length = sp->size;
	bp->bio_done = g_bde_read_done;
	bp->bio_caller1 = sp;
	bp->bio_caller2 = sc;
	sp->state = IO;
	g_io_request(bp, sc->consumer);
	return (0);
}
/*
 * The up/down path of GEOM is not allowed to sleep or do any major work
 * so we use this thread to do the actual crypto operations and to push
 * the state engine onwards.
 *
 * XXX: if we switch to the src/sys/opencrypto hardware assisted encryption
 * XXX: using a thread here is probably not needed.
 */
void
g_bde_worker(void *arg)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp, *twp;
	struct g_geom *gp;
	int restart, error;

	gp = arg;
	sc = gp->softc;

	mtx_lock(&sc->worklist_mutex);
	for (;;) {
		restart = 0;
		g_trace(G_T_TOPOLOGY, "g_bde_worker scan");
		TAILQ_FOREACH_SAFE(wp, &sc->worklist, list, twp) {
			KASSERT(wp != NULL, ("NULL wp"));
			KASSERT(wp->softc != NULL, ("NULL wp->softc"));
			if (wp->state != WAIT)
				continue;	/* Not interesting here */

			KASSERT(wp->bp != NULL, ("NULL wp->bp"));
			KASSERT(wp->sp != NULL, ("NULL wp->sp"));

			if (wp->ksp != NULL) {
				if (wp->ksp->owner != wp)
					continue;
				if (wp->ksp->state == IO)
					continue;
				KASSERT(wp->ksp->state == VALID,
				    ("Illegal sector state (%d)",
				    wp->ksp->state));
			}

			if (wp->bp->bio_cmd == BIO_READ && wp->sp->state == IO)
				continue;

			if (wp->ksp != NULL && wp->ksp->error != 0) {
				g_bde_work_done(wp, wp->ksp->error);
				continue;
			}
			switch(wp->bp->bio_cmd) {
			case BIO_READ:
				if (wp->ksp == NULL) {
					KASSERT(wp->error != 0,
					    ("BIO_READ, no ksp and no error"));
					g_bde_work_done(wp, wp->error);
					break;
				}
				if (wp->sp->error != 0) {
					g_bde_work_done(wp, wp->sp->error);
					break;
				}
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_read(wp);
				mtx_lock(&sc->worklist_mutex);
				restart++;
				g_bde_work_done(wp, wp->sp->error);
				break;
			case BIO_WRITE:
				wp->state = FINISH;
				KASSERT(wp->sp->owner == wp,
				    ("Write not owner sp"));
				KASSERT(wp->ksp->owner == wp,
				    ("Write not owner ksp"));
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_write(wp);
				mtx_lock(&sc->worklist_mutex);
				restart++;
				error = g_bde_start_write(wp->sp);
				if (error) {
					g_bde_work_done(wp, error);
					break;
				}
				error = g_bde_start_write(wp->ksp);
				if (error)
					g_bde_work_done(wp, error);
				break;
			case BIO_DELETE:
				wp->state = FINISH;
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_delete(wp);
				mtx_lock(&sc->worklist_mutex);
				restart++;
				g_bde_start_write(wp->sp);
				break;
			}
			if (restart)
				break;
		}
		if (!restart) {
			/*
			 * We don't look for our death-warrant until we are
			 * idle.  Shouldn't make a difference in practice.
			 */
			if (sc->dead)
				break;
			g_trace(G_T_TOPOLOGY, "g_bde_worker sleep");
			error = msleep(sc, &sc->worklist_mutex,
			    PRIBIO, "-", hz);
			if (error == EWOULDBLOCK) {
				/*
				 * Lose our skey cache in an orderly fashion.
				 * The exact rate can be tuned to be less
				 * aggressive if this is desirable.  10% per
				 * second means that the cache is gone in a
				 * few minutes.
				 */
				g_bde_purge_sector(sc, 10);
			}
		}
	}
	g_trace(G_T_TOPOLOGY, "g_bde_worker die");
	g_bde_purge_sector(sc, 1);
	KASSERT(sc->nwork == 0, ("Dead but %d work remaining", sc->nwork));
	KASSERT(sc->ncache == 0, ("Dead but %d cache remaining", sc->ncache));
	KASSERT(sc->nsect == 0, ("Dead but %d sect remaining", sc->nsect));
	mtx_unlock(&sc->worklist_mutex);
	kproc_exit(0);
}
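/*
 * Note the restart discipline above: whenever the worker drops the
 * worklist mutex to run a crypto operation, other threads may have
 * changed the list, so the scan is restarted from the head rather than
 * continued from a possibly stale iterator.
 */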
/*
 * g_bde_start1 has chopped the incoming request up so all the requests
 * we see here are inside a single zone.  Map the data and key locations
 * grab the buffers we need and fire off the first volley of read requests.
 */
static void
g_bde_start2(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;

	KASSERT(wp != NULL, ("NULL wp in g_bde_start2"));
	KASSERT(wp->softc != NULL, ("NULL wp->softc"));
	g_trace(G_T_TOPOLOGY, "g_bde_start2(%p)", wp);
	sc = wp->softc;
	switch (wp->bp->bio_cmd) {
	case BIO_READ:
		wp->sp = g_bde_new_sector(wp, 0);
		if (wp->sp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		wp->sp->size = wp->length;
		wp->sp->data = wp->data;
		if (g_bde_start_read(wp->sp) != 0) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		g_bde_read_keysector(sc, wp);
		if (wp->ksp == NULL)
			wp->error = ENOMEM;
		break;
	case BIO_DELETE:
		wp->sp = g_bde_new_sector(wp, wp->length);
		if (wp->sp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		break;
	case BIO_WRITE:
		wp->sp = g_bde_new_sector(wp, wp->length);
		if (wp->sp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		g_bde_read_keysector(sc, wp);
		if (wp->ksp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		break;
	default:
		KASSERT(0 == 1,
		    ("Wrong bio_cmd %d in g_bde_start2", wp->bp->bio_cmd));
	}

	wp->state = WAIT;
	wakeup(sc);
}
/*
 * Create a sequence of work structures, and have g_bde_map_sector() determine
 * how long they each can be.  Feed them to g_bde_start2().
 */
void
g_bde_start1(struct bio *bp)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp;
	off_t done;

	sc = bp->bio_to->geom->softc;
	bp->bio_driver1 = sc;

	mtx_lock(&sc->worklist_mutex);
	for(done = 0; done < bp->bio_length; ) {
		wp = g_bde_new_work(sc);
		if (wp != NULL) {
			wp->bp = bp;
			wp->offset = bp->bio_offset + done;
			wp->data = bp->bio_data + done;
			wp->length = bp->bio_length - done;
			g_bde_map_sector(wp);
			done += wp->length;
			g_bde_start2(wp);
		}
		if (wp == NULL || bp->bio_error != 0) {
			g_bde_contribute(bp, bp->bio_length - done, ENOMEM);
			break;
		}
	}
	mtx_unlock(&sc->worklist_mutex);
	return;
}
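/*
 * If either allocation fails or an error is already latched on the bio,
 * the loop above contributes all bytes not yet covered by a work packet
 * in one go, so g_bde_contribute()'s byte accounting still adds up to
 * bio_length and the bio is guaranteed to be delivered.
 */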