/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2002 Poul-Henning Kamp
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
 * and NAI Labs, the Security Research Division of Network Associates, Inc.
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This source file contains the state-engine which makes things happen in the
 * right order.
 *
 * Outline:
 *   1) g_bde_start1()
 *      Break the struct bio into multiple work packets, one per zone.
 *   2) g_bde_start2()
 *      Set up the necessary sector buffers, start those read operations
 *      which we can start at this time, and put the item on the work-list.
 *   3) g_bde_worker()
 *      Scan the work-list for items which are ready for crypto processing,
 *      call the matching crypto function in g_bde_crypt.c, and schedule
 *      any writes needed.  Read operations finish here by releasing the
 *      sector buffers and delivering the original bio request.
 *   4) g_bde_write_done()
 *      Release sector buffers and deliver the original bio request.
 *
 * Because of the C-scope rules, the functions are almost perfectly in the
 * opposite order in this source file.
 *
 * XXX: A switch to the hardware assisted crypto in src/sys/opencrypto will add
 * XXX: additional states to this state-engine.  Since no hardware available
 * XXX: at this time has AES support, implementing this has been postponed
 * XXX: until such time as it would result in a benefit.
 */
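
/*
 * Roughly: a work packet moves through the states SETUP -> WAIT -> FINISH,
 * while its sector buffers move through JUNK -> IO -> VALID (and back to
 * JUNK on error).
 */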

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/kthread.h>

#include <crypto/rijndael/rijndael-api-fst.h>
#include <crypto/sha2/sha512.h>
#include <geom/geom.h>
#include <geom/bde/g_bde.h>

/*
 * FIXME: This used to call malloc_last_fail which in practice was almost
 * guaranteed to return time_uptime even in the face of severe memory
 * shortage.  As GBDE is the only consumer, the kludge below was added to
 * facilitate the removal with minimal changes.  The code should be fixed
 * to respond to memory pressure (e.g., by using the lowmem eventhandler)
 * instead.
 */
static int
g_bde_malloc_last_fail(void)
{

        return (time_uptime);
}
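
/*
 * With the kludge above, "g_bde_malloc_last_fail() < g_bde_ncache" below
 * compares seconds of uptime against a sector count and is thus almost
 * never true, and g_bde_purge_sector(sc, -1) computes a negative count
 * and returns early; in effect, cache trimming happens only through the
 * periodic purge in the worker thread.
 */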

static void g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp);
static struct g_bde_sector *g_bde_new_sector(struct g_bde_work *wp, u_int len);
static void g_bde_release_keysector(struct g_bde_work *wp);
static struct g_bde_sector *g_bde_get_keysector(struct g_bde_work *wp);
static int g_bde_start_read(struct g_bde_sector *sp);
static void g_bde_purge_sector(struct g_bde_softc *sc, int fraction);

/*
 * Work item allocation.
 *
 * C++ would call these constructors and destructors.
 */
static u_int g_bde_nwork;
SYSCTL_UINT(_debug, OID_AUTO, gbde_nwork, CTLFLAG_RD, &g_bde_nwork, 0, "");

static MALLOC_DEFINE(M_GBDE, "gbde", "GBDE data structures");

static struct g_bde_work *
g_bde_new_work(struct g_bde_softc *sc)
{
        struct g_bde_work *wp;

        wp = malloc(sizeof *wp, M_GBDE, M_NOWAIT | M_ZERO);
        if (wp == NULL)
                return (wp);
        wp->state = SETUP;
        wp->softc = sc;
        g_bde_nwork++;
        sc->nwork++;
        TAILQ_INSERT_TAIL(&sc->worklist, wp, list);
        return (wp);
}

static void
g_bde_delete_work(struct g_bde_work *wp)
{
        struct g_bde_softc *sc;

        sc = wp->softc;
        g_bde_nwork--;
        sc->nwork--;
        TAILQ_REMOVE(&sc->worklist, wp, list);
        free(wp, M_GBDE);
}

/*
 * Sector buffer allocation
 *
 * These two functions allocate and free variable-sized sector buffers.
 */

static u_int g_bde_nsect;
SYSCTL_UINT(_debug, OID_AUTO, gbde_nsect, CTLFLAG_RD, &g_bde_nsect, 0, "");

static void
g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
{

        g_bde_nsect--;
        sc->nsect--;
        if (sp->malloc)
                free(sp->data, M_GBDE);
        free(sp, M_GBDE);
}

static struct g_bde_sector *
g_bde_new_sector(struct g_bde_work *wp, u_int len)
{
        struct g_bde_sector *sp;

        sp = malloc(sizeof *sp, M_GBDE, M_NOWAIT | M_ZERO);
        if (sp == NULL)
                return (sp);
        if (len > 0) {
                sp->data = malloc(len, M_GBDE, M_NOWAIT | M_ZERO);
                if (sp->data == NULL) {
                        free(sp, M_GBDE);
                        return (NULL);
                }
                sp->malloc = 1;
        }
        g_bde_nsect++;
        wp->softc->nsect++;
        sp->size = len;
        sp->softc = wp->softc;
        sp->ref = 1;
        sp->owner = wp;
        sp->offset = wp->so;
        sp->state = JUNK;
        return (sp);
}

/*
 * Skey sector cache.
 *
 * Nothing prevents two separate I/O requests from addressing the same zone
 * and thereby needing the same skey sector.  We therefore need to sequence
 * I/O operations to the skey sectors.  A certain amount of caching is also
 * desirable, although the extent of the benefit has not yet been determined.
 *
 * XXX: GEOM may be able to grow a generic caching facility at some point
 * XXX: to support such needs.
 */
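
/*
 * The freelist below doubles as the cache: sectors stay on it even while
 * referenced, g_bde_get_keysector() moves a sector to the tail each time
 * it is used, and g_bde_purge_one_sector() only considers the head, so
 * the list stays in rough LRU order.  Cached key sectors are marked with
 * sp->malloc == 2 to tell them apart from plain data sector buffers.
 */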

static u_int g_bde_ncache;
SYSCTL_UINT(_debug, OID_AUTO, gbde_ncache, CTLFLAG_RD, &g_bde_ncache, 0, "");

static void
g_bde_purge_one_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
{

        g_trace(G_T_TOPOLOGY, "g_bde_purge_one_sector(%p, %p)", sc, sp);
        if (sp->ref != 0)
                return;
        TAILQ_REMOVE(&sc->freelist, sp, list);
        g_bde_ncache--;
        sc->ncache--;
        bzero(sp->data, sp->size);
        g_bde_delete_sector(sc, sp);
}

static struct g_bde_sector *
g_bde_get_keysector(struct g_bde_work *wp)
{
        struct g_bde_sector *sp;
        struct g_bde_softc *sc;
        off_t offset;

        offset = wp->kso;
        g_trace(G_T_TOPOLOGY, "g_bde_get_keysector(%p, %jd)", wp, (intmax_t)offset);
        sc = wp->softc;

        if (g_bde_malloc_last_fail() < g_bde_ncache)
                g_bde_purge_sector(sc, -1);

        sp = TAILQ_FIRST(&sc->freelist);
        if (sp != NULL && sp->ref == 0 && sp->used + 300 < time_uptime)
                g_bde_purge_one_sector(sc, sp);

        TAILQ_FOREACH(sp, &sc->freelist, list) {
                if (sp->offset == offset)
                        break;
        }
        if (sp != NULL) {
                sp->ref++;
                KASSERT(sp->offset == offset, ("wrong offset"));
                KASSERT(sp->softc == wp->softc, ("wrong softc"));
                if (sp->ref == 1)
                        sp->owner = wp;
        } else {
                if (g_bde_malloc_last_fail() < g_bde_ncache) {
                        TAILQ_FOREACH(sp, &sc->freelist, list)
                                if (sp->ref == 0)
                                        break;
                }
                if (sp == NULL && !TAILQ_EMPTY(&sc->freelist))
                        sp = TAILQ_FIRST(&sc->freelist);
                if (sp != NULL && sp->ref > 0)
                        sp = NULL;
                if (sp == NULL) {
                        sp = g_bde_new_sector(wp, sc->sectorsize);
                        if (sp != NULL) {
                                g_bde_ncache++;
                                sc->ncache++;
                                TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
                                sp->malloc = 2;
                        }
                }
                if (sp != NULL) {
                        sp->offset = offset;
                        sp->softc = wp->softc;
                        sp->ref = 1;
                        sp->owner = wp;
                        sp->state = JUNK;
                        sp->error = 0;
                }
        }
        if (sp != NULL) {
                TAILQ_REMOVE(&sc->freelist, sp, list);
                TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
                sp->used = time_uptime;
        }
        wp->ksp = sp;
        return (sp);
}
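
/*
 * On a cache miss the code above prefers recycling the unreferenced
 * sector at the head of the freelist (the least recently used one) and
 * only allocates a fresh sector when that one is still referenced; when
 * memory is tight it first scans the whole list for any unreferenced
 * sector to recycle.
 */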

static void
g_bde_release_keysector(struct g_bde_work *wp)
{
        struct g_bde_softc *sc;
        struct g_bde_work *wp2;
        struct g_bde_sector *sp;

        sp = wp->ksp;
        g_trace(G_T_TOPOLOGY, "g_bde_release_keysector(%p)", sp);
        KASSERT(sp->malloc == 2, ("Wrong sector released"));
        sc = sp->softc;
        KASSERT(sc != NULL, ("NULL sp->softc"));
        KASSERT(wp == sp->owner, ("Releasing, not owner"));
        sp->owner = NULL;
        wp->ksp = NULL;
        sp->ref--;
        if (sp->ref > 0) {
                TAILQ_REMOVE(&sc->freelist, sp, list);
                TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
                TAILQ_FOREACH(wp2, &sc->worklist, list) {
                        if (wp2->ksp == sp) {
                                KASSERT(wp2 != wp, ("Self-reowning"));
                                sp->owner = wp2;
                                wakeup(sp->softc);
                                break;
                        }
                }
                KASSERT(wp2 != NULL, ("Failed to pick up owner for %p\n", sp));
        } else if (sp->error != 0) {
                sp->offset = ~0;
                sp->error = 0;
                sp->state = JUNK;
                TAILQ_REMOVE(&sc->freelist, sp, list);
                TAILQ_INSERT_HEAD(&sc->freelist, sp, list);
        }
}
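
/*
 * Note the handoff above: if other work packets still hold references to
 * the key sector, ownership passes to the first matching waiter on the
 * worklist and the worker thread is woken; a sector released with an
 * error is invalidated and put at the head of the freelist, making it
 * the first candidate for recycling.
 */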

static void
g_bde_purge_sector(struct g_bde_softc *sc, int fraction)
{
        struct g_bde_sector *sp;
        int n;

        g_trace(G_T_TOPOLOGY, "g_bde_purge_sector(%p)", sc);
        if (fraction > 0)
                n = sc->ncache / fraction + 1;
        else
                n = g_bde_ncache - g_bde_malloc_last_fail();
        if (n < 0)
                return;
        if (n > sc->ncache)
                n = sc->ncache;
        while (n--) {
                TAILQ_FOREACH(sp, &sc->freelist, list) {
                        if (sp->ref != 0)
                                continue;
                        TAILQ_REMOVE(&sc->freelist, sp, list);
                        g_bde_ncache--;
                        sc->ncache--;
                        bzero(sp->data, sp->size);
                        g_bde_delete_sector(sc, sp);
                        break;
                }
        }
}

static struct g_bde_sector *
g_bde_read_keysector(struct g_bde_softc *sc, struct g_bde_work *wp)
{
        struct g_bde_sector *sp;

        g_trace(G_T_TOPOLOGY, "g_bde_read_keysector(%p)", wp);
        sp = g_bde_get_keysector(wp);
        if (sp == NULL) {
                g_bde_purge_sector(sc, -1);
                sp = g_bde_get_keysector(wp);
        }
        if (sp == NULL)
                return (sp);
        if (sp->owner != wp)
                return (sp);
        if (sp->state == VALID)
                return (sp);
        if (g_bde_start_read(sp) == 0)
                return (sp);
        g_bde_release_keysector(wp);
        return (NULL);
}

/*
 * Contribute to the completion of the original bio request.
 *
 * We have no simple way to tell how many pieces the original bio request
 * has been segmented into, so the easiest way to determine when we can
 * deliver it is to keep track of the number of bytes we have completed.
 * We keep track of any errors underway and latch onto the first one.
 *
 * We always report "nothing done" in case of error, because random pieces
 * here and there may be completed and returning a number of completed bytes
 * does not convey any useful information about which bytes they were.  If
 * some piece of broken code somewhere interprets this to mean that nothing
 * has changed on the underlying media they deserve the lossage headed for
 * them.
 *
 * A single mutex per g_bde instance is used to prevent contention.
 */
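
/*
 * For example, a 16k bio chopped into four 4k work packets is delivered
 * on the fourth call here, when bio_completed reaches bio_length; if any
 * of the four contributed an error, bio_completed is zeroed again before
 * delivery.
 */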

static void
g_bde_contribute(struct bio *bp, off_t bytes, int error)
{

        g_trace(G_T_TOPOLOGY, "g_bde_contribute bp %p bytes %jd error %d",
            bp, (intmax_t)bytes, error);
        if (bp->bio_error == 0)
                bp->bio_error = error;
        bp->bio_completed += bytes;
        KASSERT(bp->bio_completed <= bp->bio_length, ("Too large contribution"));
        if (bp->bio_completed == bp->bio_length) {
                if (bp->bio_error != 0)
                        bp->bio_completed = 0;
                g_io_deliver(bp, bp->bio_error);
        }
}

/*
 * This is the common-case "we're done with this work packet" function.
 */

static void
g_bde_work_done(struct g_bde_work *wp, int error)
{

        g_bde_contribute(wp->bp, wp->length, error);
        if (wp->sp != NULL)
                g_bde_delete_sector(wp->softc, wp->sp);
        if (wp->ksp != NULL)
                g_bde_release_keysector(wp);
        g_bde_delete_work(wp);
}

/*
 * A write operation has finished.  When we have all the expected cows in
 * the barn, we close the door and call it a day.
 */

static void
g_bde_write_done(struct bio *bp)
{
        struct g_bde_sector *sp;
        struct g_bde_work *wp;
        struct g_bde_softc *sc;

        sp = bp->bio_caller1;
        sc = bp->bio_caller2;
        mtx_lock(&sc->worklist_mutex);
        KASSERT(sp != NULL, ("NULL sp"));
        KASSERT(sc != NULL, ("NULL sc"));
        KASSERT(sp->owner != NULL, ("NULL sp->owner"));
        g_trace(G_T_TOPOLOGY, "g_bde_write_done(%p)", sp);
        if (bp->bio_error == 0 && bp->bio_completed != sp->size)
                bp->bio_error = EIO;
        sp->error = bp->bio_error;
        g_destroy_bio(bp);
        wp = sp->owner;
        if (wp->error == 0)
                wp->error = sp->error;

        if (wp->bp->bio_cmd == BIO_DELETE) {
                KASSERT(sp == wp->sp, ("trashed delete op"));
                g_bde_work_done(wp, wp->error);
                mtx_unlock(&sc->worklist_mutex);
                return;
        }

        KASSERT(wp->bp->bio_cmd == BIO_WRITE, ("Confused in g_bde_write_done()"));
        KASSERT(sp == wp->sp || sp == wp->ksp, ("trashed write op"));
        if (wp->sp == sp) {
                g_bde_delete_sector(sc, wp->sp);
                wp->sp = NULL;
        } else {
                sp->state = VALID;
        }
        if (wp->sp == NULL && wp->ksp != NULL && wp->ksp->state == VALID)
                g_bde_work_done(wp, wp->error);
        mtx_unlock(&sc->worklist_mutex);
        return;
}
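
/*
 * Note that a BIO_WRITE work packet is only finished once both its data
 * sector and its key sector have been written; whichever of the two bios
 * completes last satisfies the test above and calls g_bde_work_done().
 */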

/*
 * Send a write request for the given sector down the pipeline.
 */

static int
g_bde_start_write(struct g_bde_sector *sp)
{
        struct bio *bp;
        struct g_bde_softc *sc;

        g_trace(G_T_TOPOLOGY, "g_bde_start_write(%p)", sp);
        sc = sp->softc;
        KASSERT(sc != NULL, ("NULL sc in g_bde_start_write"));
        KASSERT(sp->owner != NULL, ("NULL sp->owner in g_bde_start_write"));
        bp = g_new_bio();
        if (bp == NULL)
                return (ENOMEM);
        bp->bio_cmd = BIO_WRITE;
        bp->bio_offset = sp->offset;
        bp->bio_data = sp->data;
        bp->bio_length = sp->size;
        bp->bio_done = g_bde_write_done;
        bp->bio_caller1 = sp;
        bp->bio_caller2 = sc;
        sp->state = IO;
        g_io_request(bp, sc->consumer);
        return (0);
}

/*
 * A read operation has finished.  Mark the sector as no longer busy with
 * I/O, and wake up the worker thread to let it do its thing.
 */

static void
g_bde_read_done(struct bio *bp)
{
        struct g_bde_sector *sp;
        struct g_bde_softc *sc;

        sp = bp->bio_caller1;
        g_trace(G_T_TOPOLOGY, "g_bde_read_done(%p)", sp);
        sc = bp->bio_caller2;
        mtx_lock(&sc->worklist_mutex);
        if (bp->bio_error == 0 && bp->bio_completed != sp->size)
                bp->bio_error = EIO;
        sp->error = bp->bio_error;
        if (sp->error == 0)
                sp->state = VALID;
        else
                sp->state = JUNK;
        wakeup(sc);
        g_destroy_bio(bp);
        mtx_unlock(&sc->worklist_mutex);
}

/*
 * Send a read request for the given sector down the pipeline.
 */

static int
g_bde_start_read(struct g_bde_sector *sp)
{
        struct bio *bp;
        struct g_bde_softc *sc;

        g_trace(G_T_TOPOLOGY, "g_bde_start_read(%p)", sp);
        sc = sp->softc;
        KASSERT(sc != NULL, ("Null softc in sp %p", sp));
        bp = g_new_bio();
        if (bp == NULL)
                return (ENOMEM);
        bp->bio_cmd = BIO_READ;
        bp->bio_offset = sp->offset;
        bp->bio_data = sp->data;
        bp->bio_length = sp->size;
        bp->bio_done = g_bde_read_done;
        bp->bio_caller1 = sp;
        bp->bio_caller2 = sc;
        sp->state = IO;
        g_io_request(bp, sc->consumer);
        return (0);
}

/*
 * The worker thread.
 *
 * The up/down path of GEOM is not allowed to sleep or do any major work,
 * so we use this thread to do the actual crypto operations and to push
 * the state engine onwards.
 *
 * XXX: if we switch to the src/sys/opencrypto hardware assisted encryption,
 * XXX: using a thread here is probably not needed.
 */
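
/*
 * Note that the scan below is restarted from the head of the worklist
 * after every crypto operation: the mutex is dropped around the
 * g_bde_crypt_*() calls, so the list may have changed underneath us and
 * the iterator cannot safely be continued.
 */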

void
g_bde_worker(void *arg)
{
        struct g_bde_softc *sc;
        struct g_bde_work *wp, *twp;
        struct g_geom *gp;
        int restart, error;

        gp = arg;
        sc = gp->softc;

        mtx_lock(&sc->worklist_mutex);
        for (;;) {
                restart = 0;
                g_trace(G_T_TOPOLOGY, "g_bde_worker scan");
                TAILQ_FOREACH_SAFE(wp, &sc->worklist, list, twp) {
                        KASSERT(wp != NULL, ("NULL wp"));
                        KASSERT(wp->softc != NULL, ("NULL wp->softc"));
                        if (wp->state != WAIT)
                                continue;       /* Not interesting here */

                        KASSERT(wp->bp != NULL, ("NULL wp->bp"));
                        KASSERT(wp->sp != NULL, ("NULL wp->sp"));

                        if (wp->ksp != NULL) {
                                if (wp->ksp->owner != wp)
                                        continue;
                                if (wp->ksp->state == IO)
                                        continue;
                                KASSERT(wp->ksp->state == VALID,
                                    ("Illegal sector state (%d)",
                                    wp->ksp->state));
                        }

                        if (wp->bp->bio_cmd == BIO_READ && wp->sp->state == IO)
                                continue;

                        if (wp->ksp != NULL && wp->ksp->error != 0) {
                                g_bde_work_done(wp, wp->ksp->error);
                                continue;
                        }
                        switch (wp->bp->bio_cmd) {
                        case BIO_READ:
                                if (wp->ksp == NULL) {
                                        KASSERT(wp->error != 0,
                                            ("BIO_READ, no ksp and no error"));
                                        g_bde_work_done(wp, wp->error);
                                        break;
                                }
                                if (wp->sp->error != 0) {
                                        g_bde_work_done(wp, wp->sp->error);
                                        break;
                                }
                                mtx_unlock(&sc->worklist_mutex);
                                g_bde_crypt_read(wp);
                                mtx_lock(&sc->worklist_mutex);
                                restart++;
                                g_bde_work_done(wp, wp->sp->error);
                                break;
                        case BIO_WRITE:
                                wp->state = FINISH;
                                KASSERT(wp->sp->owner == wp,
                                    ("Write not owner sp"));
                                KASSERT(wp->ksp->owner == wp,
                                    ("Write not owner ksp"));
                                mtx_unlock(&sc->worklist_mutex);
                                g_bde_crypt_write(wp);
                                mtx_lock(&sc->worklist_mutex);
                                restart++;
                                error = g_bde_start_write(wp->sp);
                                if (error) {
                                        g_bde_work_done(wp, error);
                                        break;
                                }
                                error = g_bde_start_write(wp->ksp);
                                if (wp->error == 0)
                                        wp->error = error;
                                break;
                        case BIO_DELETE:
                                wp->state = FINISH;
                                mtx_unlock(&sc->worklist_mutex);
                                g_bde_crypt_delete(wp);
                                mtx_lock(&sc->worklist_mutex);
                                restart++;
                                g_bde_start_write(wp->sp);
                                break;
                        }
                        if (restart)
                                break;
                }
                if (!restart) {
                        /*
                         * We don't look for our death-warrant until we are
                         * idle.  Shouldn't make a difference in practice.
                         */
                        if (sc->dead)
                                break;
                        g_trace(G_T_TOPOLOGY, "g_bde_worker sleep");
                        error = msleep(sc, &sc->worklist_mutex,
                            PRIBIO, "-", hz);
                        if (error == EWOULDBLOCK) {
                                /*
                                 * Lose our skey cache in an orderly fashion.
                                 * The exact rate can be tuned to be less
                                 * aggressive if this is desirable.  10% per
                                 * second means that the cache is gone in a
                                 * few minutes.
                                 */
                                g_bde_purge_sector(sc, 10);
                        }
                }
        }
        g_trace(G_T_TOPOLOGY, "g_bde_worker die");
        g_bde_purge_sector(sc, 1);
        KASSERT(sc->nwork == 0, ("Dead but %d work remaining", sc->nwork));
        KASSERT(sc->ncache == 0, ("Dead but %d cache remaining", sc->ncache));
        KASSERT(sc->nsect == 0, ("Dead but %d sect remaining", sc->nsect));
        mtx_unlock(&sc->worklist_mutex);
        sc->dead = 2;
        wakeup(sc);
        kproc_exit(0);
}

/*
 * g_bde_start1() has chopped the incoming request up so all the requests
 * we see here are inside a single zone.  Map the data and key locations,
 * grab the buffers we need, and fire off the first volley of read requests.
 */

static void
g_bde_start2(struct g_bde_work *wp)
{
        struct g_bde_softc *sc;

        KASSERT(wp != NULL, ("NULL wp in g_bde_start2"));
        KASSERT(wp->softc != NULL, ("NULL wp->softc"));
        g_trace(G_T_TOPOLOGY, "g_bde_start2(%p)", wp);
        sc = wp->softc;
        switch (wp->bp->bio_cmd) {
        case BIO_READ:
                wp->sp = g_bde_new_sector(wp, 0);
                if (wp->sp == NULL) {
                        g_bde_work_done(wp, ENOMEM);
                        return;
                }
                wp->sp->size = wp->length;
                wp->sp->data = wp->data;
                if (g_bde_start_read(wp->sp) != 0) {
                        g_bde_work_done(wp, ENOMEM);
                        return;
                }
                g_bde_read_keysector(sc, wp);
                if (wp->ksp == NULL)
                        wp->error = ENOMEM;
                break;
        case BIO_DELETE:
                wp->sp = g_bde_new_sector(wp, wp->length);
                if (wp->sp == NULL) {
                        g_bde_work_done(wp, ENOMEM);
                        return;
                }
                break;
        case BIO_WRITE:
                wp->sp = g_bde_new_sector(wp, wp->length);
                if (wp->sp == NULL) {
                        g_bde_work_done(wp, ENOMEM);
                        return;
                }
                g_bde_read_keysector(sc, wp);
                if (wp->ksp == NULL) {
                        g_bde_work_done(wp, ENOMEM);
                        return;
                }
                break;
        default:
                KASSERT(0 == 1,
                    ("Wrong bio_cmd %d in g_bde_start2", wp->bp->bio_cmd));
        }

        wp->state = WAIT;
        wakeup(sc);
}

/*
 * Create a sequence of work structures, and have g_bde_map_sector() determine
 * how long they each can be.  Feed them to g_bde_start2().
 */
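
/*
 * For example, a bio straddling a zone boundary leaves the loop below as
 * two work packets: g_bde_map_sector() trims wp->length to what fits in
 * the current zone, and "done" advances by that amount until the whole
 * request is covered.
 */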

void
g_bde_start1(struct bio *bp)
{
        struct g_bde_softc *sc;
        struct g_bde_work *wp;
        off_t done;

        sc = bp->bio_to->geom->softc;
        bp->bio_driver1 = sc;

        mtx_lock(&sc->worklist_mutex);
        for (done = 0; done < bp->bio_length; ) {
                wp = g_bde_new_work(sc);
                if (wp != NULL) {
                        wp->bp = bp;
                        wp->offset = bp->bio_offset + done;
                        wp->data = bp->bio_data + done;
                        wp->length = bp->bio_length - done;
                        g_bde_map_sector(wp);
                        done += wp->length;
                        g_bde_start2(wp);
                }
                if (wp == NULL || bp->bio_error != 0) {
                        g_bde_contribute(bp, bp->bio_length - done, ENOMEM);
                        break;
                }
        }
        mtx_unlock(&sc->worklist_mutex);
        return;
}