/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2002 Poul-Henning Kamp
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
 * and NAI Labs, the Security Research Division of Network Associates, Inc.
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This source file contains the state-engine which makes things happen in the
 * right order so as to finish GBDE data transactions:
 *
 * 1) g_bde_start1()
 *	Break the struct bio into multiple work packets, one per zone.
 *
 * 2) g_bde_start2()
 *	Setup the necessary sector buffers and start those read operations
 *	which we can start at this time and put the item on the work-list.
 *
 * 3) g_bde_worker()
 *	Scan the work-list for items which are ready for crypto processing
 *	and call the matching crypto function in g_bde_crypt.c and schedule
 *	any writes needed.  Read operations finish here by releasing the
 *	sector buffers and delivering the original bio request.
 *
 * 4) g_bde_write_done()
 *	Release sector buffers and deliver the original bio request.
 *
 * Because of the C-scope rules, the functions are almost perfectly in the
 * opposite order in this source file.
 *
 * XXX: A switch to the hardware assisted crypto in src/sys/opencrypto will add
 * XXX: additional states to this state-engine.  Since no hardware available
 * XXX: at this time has AES support, implementing this has been postponed
 * XXX: until such time as it would result in a benefit.
 */
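/*
 * Life cycle of a work packet, as implemented below: g_bde_start1()
 * creates it, g_bde_start2() launches its first I/O and parks it in
 * state WAIT, g_bde_worker() picks it up once none of its sectors are
 * busy (state IO), and g_bde_work_done() retires it by contributing to
 * the original bio request.
 */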
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/kthread.h>

#include <crypto/rijndael/rijndael-api-fst.h>
#include <crypto/sha2/sha512.h>
#include <geom/geom.h>
#include <geom/bde/g_bde.h>
/*
 * FIXME: This used to call malloc_last_fail which in practice was almost
 * guaranteed to return time_uptime even in the face of severe memory
 * shortage.  As GBDE is the only consumer, the kludge below was added to
 * facilitate the removal with minimal changes.  The code should be fixed
 * to respond to memory pressure (e.g., by using the lowmem eventhandler)
 * instead.
 */
static int
g_bde_malloc_last_fail(void)
{

	return (time_uptime);
}
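/*
 * Note that the callers below compare this value against cache and
 * sector counts, so with the kludge in place the "memory shortage"
 * branches only trigger when a count exceeds the system uptime in
 * seconds, which is to say practically never.
 */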
static void g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp);
static struct g_bde_sector *g_bde_new_sector(struct g_bde_work *wp, u_int len);
static void g_bde_release_keysector(struct g_bde_work *wp);
static struct g_bde_sector *g_bde_get_keysector(struct g_bde_work *wp);
static int g_bde_start_read(struct g_bde_sector *sp);
static void g_bde_purge_sector(struct g_bde_softc *sc, int fraction);
/*
 * Work item allocation.
 *
 * C++ would call these constructors and destructors.
 */
static u_int g_bde_nwork;
SYSCTL_UINT(_debug, OID_AUTO, gbde_nwork, CTLFLAG_RD, &g_bde_nwork, 0, "");

static MALLOC_DEFINE(M_GBDE, "gbde", "GBDE data structures");
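/*
 * All allocations in this file use M_NOWAIT: they happen either with
 * the per-instance worklist mutex held or from the GEOM up/down path,
 * so sleeping for memory is not an option and every caller must check
 * for a NULL return.
 */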
static struct g_bde_work *
g_bde_new_work(struct g_bde_softc *sc)
{
	struct g_bde_work *wp;

	wp = malloc(sizeof *wp, M_GBDE, M_NOWAIT | M_ZERO);
	if (wp == NULL)
		return (wp);
	wp->state = SETUP;
	wp->softc = sc;
	g_bde_nwork++;
	sc->nwork++;
	TAILQ_INSERT_TAIL(&sc->worklist, wp, list);
	return (wp);
}
static void
g_bde_delete_work(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;

	sc = wp->softc;
	g_bde_nwork--;
	sc->nwork--;
	TAILQ_REMOVE(&sc->worklist, wp, list);
	free(wp, M_GBDE);
}
/*
 * Sector buffer allocation.
 *
 * These two functions allocate and free back variable sized sector buffers.
 */

static u_int g_bde_nsect;
SYSCTL_UINT(_debug, OID_AUTO, gbde_nsect, CTLFLAG_RD, &g_bde_nsect, 0, "");
static void
g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
{

	g_trace(G_T_TOPOLOGY, "g_bde_delete_sector(%p)", sp);
	g_bde_nsect--;
	sc->nsect--;
	if (sp->malloc)
		free(sp->data, M_GBDE);
	free(sp, M_GBDE);
}
static struct g_bde_sector *
g_bde_new_sector(struct g_bde_work *wp, u_int len)
{
	struct g_bde_sector *sp;

	sp = malloc(sizeof *sp, M_GBDE, M_NOWAIT | M_ZERO);
	if (sp == NULL)
		return (sp);
	if (len > 0) {
		sp->data = malloc(len, M_GBDE, M_NOWAIT | M_ZERO);
		if (sp->data == NULL) {
			free(sp, M_GBDE);
			return (NULL);
		}
		sp->malloc = 1;
	}
	g_bde_nsect++;
	wp->softc->nsect++;
	sp->size = len;
	sp->softc = wp->softc;
	sp->ref = 1;
	sp->owner = wp;
	sp->state = JUNK;
	return (sp);
}
/*
 * Skey sector cache.
 *
 * Nothing prevents two separate I/O requests from addressing the same zone
 * and thereby needing the same skey sector.  We therefore need to sequence
 * I/O operations to the skey sectors.  A certain amount of caching is also
 * desirable, although the extent of the benefit from this is not at this
 * point known.
 *
 * XXX: GEOM may be able to grow a generic caching facility at some point
 * XXX: to support such needs.
 */

static u_int g_bde_ncache;
SYSCTL_UINT(_debug, OID_AUTO, gbde_ncache, CTLFLAG_RD, &g_bde_ncache, 0, "");
static void
g_bde_purge_one_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
{

	g_trace(G_T_TOPOLOGY, "g_bde_purge_one_sector(%p, %p)", sc, sp);
	if (sp->ref != 0)
		return;
	TAILQ_REMOVE(&sc->freelist, sp, list);
	sc->ncache--;
	g_bde_ncache--;
	bzero(sp->data, sp->size);
	g_bde_delete_sector(sc, sp);
}
static struct g_bde_sector *
g_bde_get_keysector(struct g_bde_work *wp)
{
	struct g_bde_sector *sp;
	struct g_bde_softc *sc;
	off_t offset;

	offset = wp->kso;
	g_trace(G_T_TOPOLOGY, "g_bde_get_keysector(%p, %jd)", wp, (intmax_t)offset);
	sc = wp->softc;

	if (g_bde_malloc_last_fail() < g_bde_ncache)
		g_bde_purge_sector(sc, -1);

	sp = TAILQ_FIRST(&sc->freelist);
	if (sp != NULL && sp->ref == 0 && sp->used + 300 < time_uptime)
		g_bde_purge_one_sector(sc, sp);

	TAILQ_FOREACH(sp, &sc->freelist, list) {
		if (sp->offset == offset)
			break;
	}
	if (sp != NULL) {
		sp->ref++;
		KASSERT(sp->offset == offset, ("wrong offset"));
		KASSERT(sp->softc == wp->softc, ("wrong softc"));
		if (sp->ref == 1)
			sp->owner = wp;
	} else {
		if (g_bde_malloc_last_fail() < g_bde_ncache) {
			TAILQ_FOREACH(sp, &sc->freelist, list)
				if (sp->ref == 0)
					break;
		}
		if (sp == NULL && !TAILQ_EMPTY(&sc->freelist))
			sp = TAILQ_FIRST(&sc->freelist);
		if (sp != NULL && sp->ref > 0)
			sp = NULL;
		if (sp == NULL) {
			sp = g_bde_new_sector(wp, sc->sectorsize);
			if (sp != NULL) {
				g_bde_ncache++;
				sc->ncache++;
				TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
				sp->malloc = 2;
			}
		}
		if (sp != NULL) {
			sp->offset = offset;
			sp->softc = wp->softc;
			sp->ref = 1;
			sp->owner = wp;
			sp->state = JUNK;
			sp->error = 0;
		}
	}
	if (sp != NULL) {
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
		sp->used = time_uptime;
	}
	wp->ksp = sp;
	return (sp);
}
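/*
 * The freelist above doubles as an LRU list: every time a cached key
 * sector is referenced it is moved to the tail and stamped with
 * time_uptime, so the head always holds the coldest entry for the
 * 300 second timeout and the purge functions to reap.
 */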
static void
g_bde_release_keysector(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp2;
	struct g_bde_sector *sp;

	sp = wp->ksp;
	g_trace(G_T_TOPOLOGY, "g_bde_release_keysector(%p)", sp);
	KASSERT(sp->malloc == 2, ("Wrong sector released"));
	sc = sp->softc;
	KASSERT(sc != NULL, ("NULL sp->softc"));
	KASSERT(wp == sp->owner, ("Releasing, not owner"));
	sp->owner = NULL;
	wp->ksp = NULL;
	sp->ref--;
	if (sp->ref > 0) {
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
		TAILQ_FOREACH(wp2, &sc->worklist, list) {
			if (wp2->ksp == sp) {
				KASSERT(wp2 != wp, ("Self-reowning"));
				sp->owner = wp2;
				wakeup(sp->softc);
				break;
			}
		}
		KASSERT(wp2 != NULL, ("Failed to pick up owner for %p\n", sp));
	} else if (sp->error != 0) {
		sp->offset = ~0;
		sp->error = 0;
		sp->state = JUNK;
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_HEAD(&sc->freelist, sp, list);
	}
}
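/*
 * Ownership handoff: several work packets may reference the same cached
 * key sector, but exactly one of them owns it at any time.  When the
 * owner releases a still-referenced sector, the code above promotes the
 * next waiting work packet to owner and wakes the worker so it can make
 * progress.
 */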
static void
g_bde_purge_sector(struct g_bde_softc *sc, int fraction)
{
	struct g_bde_sector *sp;
	int n;

	g_trace(G_T_TOPOLOGY, "g_bde_purge_sector(%p)", sc);
	if (fraction > 0)
		n = sc->ncache / fraction + 1;
	else
		n = g_bde_ncache - g_bde_malloc_last_fail();
	if (n < 0)
		return;
	if (n > sc->ncache)
		n = sc->ncache;
	while (n--) {
		TAILQ_FOREACH(sp, &sc->freelist, list) {
			if (sp->ref != 0)
				continue;
			TAILQ_REMOVE(&sc->freelist, sp, list);
			g_bde_ncache--;
			sc->ncache--;
			bzero(sp->data, sp->size);
			g_bde_delete_sector(sc, sp);
			break;
		}
	}
}
static struct g_bde_sector *
g_bde_read_keysector(struct g_bde_softc *sc, struct g_bde_work *wp)
{
	struct g_bde_sector *sp;

	g_trace(G_T_TOPOLOGY, "g_bde_read_keysector(%p)", wp);
	sp = g_bde_get_keysector(wp);
	if (sp == NULL) {
		g_bde_purge_sector(sc, -1);
		sp = g_bde_get_keysector(wp);
	}
	if (sp == NULL)
		return (sp);
	if (sp->owner != wp)
		return (sp);
	if (sp->state == VALID)
		return (sp);
	if (g_bde_start_read(sp) == 0)
		return (sp);
	g_bde_release_keysector(wp);
	return (NULL);
}
/*
 * Contribute to the completion of the original bio request.
 *
 * We have no simple way to tell how many bits the original bio request has
 * been segmented into, so the easiest way to determine when we can deliver
 * it is to keep track of the number of bytes we have completed.  We keep
 * track of any errors underway and latch onto the first one.
 *
 * We always report "nothing done" in case of error, because random bits here
 * and there may be completed and returning a number of completed bytes does
 * not convey any useful information about which bytes they were.  If some
 * piece of broken code somewhere interprets this to mean that nothing has
 * changed on the underlying media they deserve the lossage headed for them.
 *
 * A single mutex per g_bde instance is used to prevent contention.
 */
static void
g_bde_contribute(struct bio *bp, off_t bytes, int error)
{

	g_trace(G_T_TOPOLOGY, "g_bde_contribute bp %p bytes %jd error %d",
	     bp, (intmax_t)bytes, error);
	if (bp->bio_error == 0)
		bp->bio_error = error;
	bp->bio_completed += bytes;
	KASSERT(bp->bio_completed <= bp->bio_length, ("Too large contribution"));
	if (bp->bio_completed == bp->bio_length) {
		if (bp->bio_error != 0)
			bp->bio_completed = 0;
		g_io_deliver(bp, bp->bio_error);
	}
}
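/*
 * g_bde_contribute() is only ever called with the instance's worklist
 * mutex held, which is what makes the unlocked read-modify-write of
 * bio_completed and bio_error above safe.
 */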
/*
 * This is the common case "we're done with this work package" function.
 * It contributes the work package's bytes to the original bio and frees
 * every resource the work package still holds.
 */
static void
g_bde_work_done(struct g_bde_work *wp, int error)
{

	g_bde_contribute(wp->bp, wp->length, error);
	if (wp->sp != NULL)
		g_bde_delete_sector(wp->softc, wp->sp);
	if (wp->ksp != NULL)
		g_bde_release_keysector(wp);
	g_bde_delete_work(wp);
}
/*
 * A write operation has finished.  When we have all expected cows in the
 * barn close the door and call it a day.
 */

static void
g_bde_write_done(struct bio *bp)
{
	struct g_bde_sector *sp;
	struct g_bde_work *wp;
	struct g_bde_softc *sc;

	sp = bp->bio_caller1;
	sc = bp->bio_caller2;
	mtx_lock(&sc->worklist_mutex);
	KASSERT(sp != NULL, ("NULL sp"));
	KASSERT(sc != NULL, ("NULL sc"));
	KASSERT(sp->owner != NULL, ("NULL sp->owner"));
	g_trace(G_T_TOPOLOGY, "g_bde_write_done(%p)", sp);
	if (bp->bio_error == 0 && bp->bio_completed != sp->size)
		bp->bio_error = EIO;
	sp->error = bp->bio_error;
	g_destroy_bio(bp);
	wp = sp->owner;
	if (wp->error == 0)
		wp->error = sp->error;

	if (wp->bp->bio_cmd == BIO_DELETE) {
		KASSERT(sp == wp->sp, ("trashed delete op"));
		g_bde_work_done(wp, wp->error);
		mtx_unlock(&sc->worklist_mutex);
		return;
	}

	KASSERT(wp->bp->bio_cmd == BIO_WRITE, ("Confused in g_bde_write_done()"));
	KASSERT(sp == wp->sp || sp == wp->ksp, ("trashed write op"));
	if (sp == wp->sp) {
		g_bde_delete_sector(sc, wp->sp);
		wp->sp = NULL;
	} else {
		sp->state = VALID;
	}
	if (wp->sp == NULL && wp->ksp != NULL && wp->ksp->state == VALID)
		g_bde_work_done(wp, wp->error);
	mtx_unlock(&sc->worklist_mutex);
	return;
}
/*
 * Send a write request for the given sector down the pipeline.
 */

static int
g_bde_start_write(struct g_bde_sector *sp)
{
	struct bio *bp;
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_start_write(%p)", sp);
	sc = sp->softc;
	KASSERT(sc != NULL, ("NULL sc in g_bde_start_write"));
	KASSERT(sp->owner != NULL, ("NULL sp->owner in g_bde_start_write"));
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);
	bp->bio_cmd = BIO_WRITE;
	bp->bio_offset = sp->offset;
	bp->bio_data = sp->data;
	bp->bio_length = sp->size;
	bp->bio_done = g_bde_write_done;
	bp->bio_caller1 = sp;
	bp->bio_caller2 = sc;
	sp->state = IO;
	g_io_request(bp, sc->consumer);
	return (0);
}
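/*
 * Both start functions use the same plumbing: the sector and softc are
 * stashed in bio_caller1/bio_caller2 so the done-callback can find its
 * state again, and the sector is marked IO so the worker leaves it
 * alone until the callback clears that state.
 */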
/*
 * A read operation has finished.  Mark the sector no longer iobusy and
 * wake up the worker thread and let it do its thing.
 */

static void
g_bde_read_done(struct bio *bp)
{
	struct g_bde_sector *sp;
	struct g_bde_softc *sc;

	sp = bp->bio_caller1;
	g_trace(G_T_TOPOLOGY, "g_bde_read_done(%p)", sp);
	sc = bp->bio_caller2;
	mtx_lock(&sc->worklist_mutex);
	if (bp->bio_error == 0 && bp->bio_completed != sp->size)
		bp->bio_error = EIO;
	sp->error = bp->bio_error;
	if (sp->error == 0)
		sp->state = VALID;
	else
		sp->state = JUNK;
	wakeup(sc);
	g_destroy_bio(bp);
	mtx_unlock(&sc->worklist_mutex);
}
/*
 * Send a read request for the given sector down the pipeline.
 */

static int
g_bde_start_read(struct g_bde_sector *sp)
{
	struct bio *bp;
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_start_read(%p)", sp);
	sc = sp->softc;
	KASSERT(sc != NULL, ("Null softc in sp %p", sp));
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);
	bp->bio_cmd = BIO_READ;
	bp->bio_offset = sp->offset;
	bp->bio_data = sp->data;
	bp->bio_length = sp->size;
	bp->bio_done = g_bde_read_done;
	bp->bio_caller1 = sp;
	bp->bio_caller2 = sc;
	sp->state = IO;
	g_io_request(bp, sc->consumer);
	return (0);
}
/*
 * The up/down path of GEOM is not allowed to sleep or do any major work
 * so we use this thread to do the actual crypto operations and to push
 * the state engine onwards.
 *
 * XXX: if we switch to the src/sys/opencrypto hardware assisted encryption
 * XXX: using a thread here is probably not needed.
 */
void
g_bde_worker(void *arg)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp, *twp;
	struct g_geom *gp;
	int restart, error;

	gp = arg;
	sc = gp->softc;

	mtx_lock(&sc->worklist_mutex);
	for (;;) {
		restart = 0;
		g_trace(G_T_TOPOLOGY, "g_bde_worker scan");
		TAILQ_FOREACH_SAFE(wp, &sc->worklist, list, twp) {
			KASSERT(wp != NULL, ("NULL wp"));
			KASSERT(wp->softc != NULL, ("NULL wp->softc"));
			if (wp->state != WAIT)
				continue;	/* Not interesting here */

			KASSERT(wp->bp != NULL, ("NULL wp->bp"));
			KASSERT(wp->sp != NULL, ("NULL wp->sp"));

			if (wp->ksp != NULL) {
				if (wp->ksp->owner != wp)
					continue;
				if (wp->ksp->state == IO)
					continue;
				KASSERT(wp->ksp->state == VALID,
				    ("Illegal sector state (%d)",
				    wp->ksp->state));
			}

			if (wp->bp->bio_cmd == BIO_READ && wp->sp->state == IO)
				continue;

			if (wp->ksp != NULL && wp->ksp->error != 0) {
				g_bde_work_done(wp, wp->ksp->error);
				continue;
			}
			switch(wp->bp->bio_cmd) {
			case BIO_READ:
				if (wp->ksp == NULL) {
					KASSERT(wp->error != 0,
					    ("BIO_READ, no ksp and no error"));
					g_bde_work_done(wp, wp->error);
					break;
				}
				if (wp->sp->error != 0) {
					g_bde_work_done(wp, wp->sp->error);
					break;
				}
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_read(wp);
				mtx_lock(&sc->worklist_mutex);
				restart++;
				g_bde_work_done(wp, wp->sp->error);
				break;
			case BIO_WRITE:
				wp->state = FINISH;
				KASSERT(wp->sp->owner == wp,
				    ("Write not owner sp"));
				KASSERT(wp->ksp->owner == wp,
				    ("Write not owner ksp"));
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_write(wp);
				mtx_lock(&sc->worklist_mutex);
				restart++;
				error = g_bde_start_write(wp->sp);
				if (error) {
					g_bde_work_done(wp, error);
					break;
				}
				error = g_bde_start_write(wp->ksp);
				if (error)
					g_bde_work_done(wp, error);
				break;
			case BIO_DELETE:
				wp->state = FINISH;
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_delete(wp);
				mtx_lock(&sc->worklist_mutex);
				restart++;
				g_bde_start_write(wp->sp);
				break;
			}
			if (restart)
				break;
		}
		if (!restart) {
			/*
			 * We don't look for our death-warrant until we are
			 * idle.  Shouldn't make a difference in practice.
			 */
			if (sc->dead)
				break;
			g_trace(G_T_TOPOLOGY, "g_bde_worker sleep");
			error = msleep(sc, &sc->worklist_mutex,
			    PRIBIO, "-", hz);
			if (error == EWOULDBLOCK) {
				/*
				 * Lose our skey cache in an orderly fashion.
				 * The exact rate can be tuned to be less
				 * aggressive if this is desirable.  10% per
				 * second means that the cache is gone in a
				 * few minutes.
				 */
				g_bde_purge_sector(sc, 10);
			}
		}
	}
	g_trace(G_T_TOPOLOGY, "g_bde_worker die");
	g_bde_purge_sector(sc, 1);
	KASSERT(sc->nwork == 0, ("Dead but %d work remaining", sc->nwork));
	KASSERT(sc->ncache == 0, ("Dead but %d cache remaining", sc->ncache));
	KASSERT(sc->nsect == 0, ("Dead but %d sect remaining", sc->nsect));
	mtx_unlock(&sc->worklist_mutex);
	kproc_exit(0);
}
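/*
 * Note the restart discipline above: whenever the worker drops the
 * worklist mutex to run a crypto operation, other threads may have
 * changed the list, so the scan is restarted from the head rather than
 * continued from a possibly stale iterator.
 */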
/*
 * g_bde_start1 has chopped the incoming request up so all the requests
 * we see here are inside a single zone.  Map the data and key locations
 * grab the buffers we need and fire off the first volley of read requests.
 */
static void
g_bde_start2(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;

	KASSERT(wp != NULL, ("NULL wp in g_bde_start2"));
	KASSERT(wp->softc != NULL, ("NULL wp->softc"));
	g_trace(G_T_TOPOLOGY, "g_bde_start2(%p)", wp);
	sc = wp->softc;
	switch (wp->bp->bio_cmd) {
	case BIO_READ:
		wp->sp = g_bde_new_sector(wp, 0);
		if (wp->sp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		wp->sp->size = wp->length;
		wp->sp->data = wp->data;
		if (g_bde_start_read(wp->sp) != 0) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		g_bde_read_keysector(sc, wp);
		if (wp->ksp == NULL)
			wp->error = ENOMEM;
		break;
	case BIO_DELETE:
		wp->sp = g_bde_new_sector(wp, wp->length);
		if (wp->sp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		break;
	case BIO_WRITE:
		wp->sp = g_bde_new_sector(wp, wp->length);
		if (wp->sp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		g_bde_read_keysector(sc, wp);
		if (wp->ksp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		break;
	default:
		KASSERT(0 == 1,
		    ("Wrong bio_cmd %d in g_bde_start2", wp->bp->bio_cmd));
	}

	wp->state = WAIT;
	wakeup(sc);
}
/*
 * Create a sequence of work structures, and have g_bde_map_sector() determine
 * how long they each can be.  Feed them to g_bde_start2().
 */
void
g_bde_start1(struct bio *bp)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp;
	off_t done;

	sc = bp->bio_to->geom->softc;
	bp->bio_driver1 = sc;

	mtx_lock(&sc->worklist_mutex);
	for(done = 0; done < bp->bio_length; ) {
		wp = g_bde_new_work(sc);
		if (wp != NULL) {
			wp->bp = bp;
			wp->offset = bp->bio_offset + done;
			wp->data = bp->bio_data + done;
			wp->length = bp->bio_length - done;
			g_bde_map_sector(wp);
			done += wp->length;
			g_bde_start2(wp);
		}
		if (wp == NULL || bp->bio_error != 0) {
			g_bde_contribute(bp, bp->bio_length - done, ENOMEM);
			break;
		}
	}
	mtx_unlock(&sc->worklist_mutex);
	return;
}
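/*
 * If either allocation fails or an error is already latched on the bio,
 * the loop above contributes all bytes not yet covered by a work packet
 * in one go, so g_bde_contribute()'s byte accounting still adds up to
 * bio_length and the bio is guaranteed to be delivered.
 */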