2 * Copyright (c) 2004 Lukas Ertl
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
30 #include <sys/param.h>
32 #include <sys/kernel.h>
33 #include <sys/kthread.h>
34 #include <sys/libkern.h>
36 #include <sys/malloc.h>
37 #include <sys/module.h>
38 #include <sys/mutex.h>
39 #include <sys/systm.h>
41 #include <geom/geom.h>
42 #include <geom/vinum/geom_vinum_var.h>
43 #include <geom/vinum/geom_vinum_raid5.h>
44 #include <geom/vinum/geom_vinum.h>
/*
 * Forward declarations for the plex request machinery defined below.
 * gv_check_parity/gv_normal_parity return non-zero when the RAID5
 * packet they belong to has finished (see their call sites in
 * gv_plex_completed_request).
 */
46 static void gv_plex_completed_request(struct gv_plex *, struct bio *);
47 static void gv_plex_normal_request(struct gv_plex *, struct bio *);
48 static void gv_plex_worker(void *);
49 static int gv_check_parity(struct gv_plex *, struct bio *,
50 struct gv_raid5_packet *);
51 static int gv_normal_parity(struct gv_plex *, struct bio *,
52 struct gv_raid5_packet *);
54 /* XXX: is this the place to catch dying subdisks? */
/*
 * GEOM orphan method for a plex: invoked when an underlying provider
 * (a subdisk's consumer) disappears.  Releases our access counts on the
 * consumer, records the provider's error, destroys the consumer, and —
 * once the geom has no consumers left — kills the plex worker thread
 * and withers the geom with that error.
 * NOTE(review): this excerpt omits several lines of the original body
 * (declarations, early returns); do not infer control flow beyond what
 * is visible here.
 */
56 gv_plex_orphan(struct g_consumer *cp)
64 g_trace(G_T_TOPOLOGY, "gv_plex_orphan(%s)", gp->name);
/* Drop whatever read/write/exclusive access we still hold. */
66 if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0)
67 g_access(cp, -cp->acr, -cp->acw, -cp->ace);
/* Remember the provider's error; it is passed to g_wither_geom() below. */
68 error = cp->provider->error;
72 g_destroy_consumer(cp);
/* More consumers remain: the plex itself is not torn down yet. */
73 if (!LIST_EMPTY(&gp->consumer))
/* Last consumer gone: stop the worker thread, then wither the geom. */
78 gv_kill_plex_thread(p);
84 g_wither_geom(gp, error);
/*
 * bio_done callback used for plex-internal (synchronous/rebuild)
 * sub-requests: mark the bio as completed and hand it back to the plex
 * worker thread by queueing it on the plex's bio queue under the queue
 * mutex.  The worker recognizes it via the GV_BIO_DONE flag.
 * NOTE(review): the excerpt omits lines between the unlock and the end
 * of the function (presumably a wakeup of the worker — confirm against
 * the full source).
 */
88 gv_plex_done(struct bio *bp)
92 p = bp->bio_from->geom->softc;
93 bp->bio_cflags |= GV_BIO_DONE;
94 mtx_lock(&p->bqueue_mtx);
95 bioq_insert_tail(p->bqueue, bp);
97 mtx_unlock(&p->bqueue_mtx);
100 /* Find the correct subdisk to send the bio to and build a bio to send. */
/*
 * Maps a request span (boff/bcount within the plex's address space) onto
 * the correct subdisk for concatenated and striped plexes, clones the
 * parent bio with translated offset/length, and links the clone into the
 * bio_driver1/bio_caller1 chain hung off the parent bio.  RAID5 plexes
 * are handled elsewhere (see comment below).
 * NOTE(review): switch cases for the concatenated organization and
 * several error returns are omitted from this excerpt.
 */
102 gv_plexbuffer(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
106 struct bio *cbp, *pbp;
108 off_t len_left, real_len, real_off;
109 off_t stripeend, stripeno, stripestart;
/* Nothing to map to without a plex or subdisks. */
111 if (p == NULL || LIST_EMPTY(&p->subdisks))
115 gp = bp->bio_to->geom;
118 * We only handle concatenated and striped plexes here. RAID5 plexes
119 * are handled in build_raid5_request().
124 * Find the subdisk where this request starts. The subdisks in
125 * this list must be ordered by plex_offset.
127 LIST_FOREACH(s, &p->subdisks, in_plex) {
128 if (s->plex_offset <= boff &&
129 s->plex_offset + s->size > boff)
132 /* Subdisk not found. */
136 /* Calculate corresponding offsets on disk. */
137 real_off = boff - s->plex_offset;
138 len_left = s->size - real_off;
/* Clip the request to what fits on this subdisk. */
139 real_len = (bcount > len_left) ? len_left : bcount;
142 case GV_PLEX_STRIPED:
143 /* The number of the stripe where the request starts. */
144 stripeno = boff / p->stripesize;
146 /* The number of the subdisk where the stripe resides. */
147 sdno = stripeno % p->sdcount;
149 /* Find the right subdisk. */
151 LIST_FOREACH(s, &p->subdisks, in_plex) {
157 /* Subdisk not found. */
161 /* The offset of the stripe from the start of the subdisk. */
162 stripestart = (stripeno / p->sdcount) *
165 /* The offset at the end of the stripe. */
166 stripeend = stripestart + p->stripesize;
168 /* The offset of the request on this subdisk. */
169 real_off = boff - (stripeno * p->stripesize) +
172 /* The length left in this stripe. */
173 len_left = stripeend - real_off;
/* Never cross a stripe boundary with a single clone. */
175 real_len = (bcount <= len_left) ? bcount : len_left;
182 /* Now check if we can handle the request on this subdisk. */
185 /* If the subdisk is up, just continue. */
/* A sync (initialization) write may force a stale subdisk into
 * the initializing state so the write can proceed. */
189 if (!(bp->bio_cflags & GV_BIO_SYNCREQ))
192 printf("GEOM_VINUM: sd %s is initializing\n", s->name);
193 gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE);
196 case GV_SD_INITIALIZING:
/* Reads from an initializing subdisk are not serviceable. */
197 if (bp->bio_cmd == BIO_READ)
202 /* All other subdisk states mean it's not accessible. */
206 /* Clone the bio and adjust the offsets and sizes. */
207 cbp = g_clone_bio(bp);
210 cbp->bio_offset = real_off;
211 cbp->bio_length = real_len;
212 cbp->bio_data = addr;
213 cbp->bio_done = g_std_done;
/* bio_caller2 carries the consumer the clone must be issued to. */
214 cbp->bio_caller2 = s->consumer;
/* Sync requests are routed back through the plex worker on completion. */
215 if ((bp->bio_cflags & GV_BIO_SYNCREQ)) {
216 cbp->bio_cflags |= GV_BIO_SYNCREQ;
217 cbp->bio_done = gv_plex_done;
/* Append the clone to the parent's sub-request list:
 * bio_driver1 is the head, bio_caller1 links the rest. */
220 if (bp->bio_driver1 == NULL) {
221 bp->bio_driver1 = cbp;
223 pbp = bp->bio_driver1;
224 while (pbp->bio_caller1 != NULL)
225 pbp = pbp->bio_caller1;
226 pbp->bio_caller1 = cbp;
/*
 * GEOM start method for the plex: validates the command, rejects I/O when
 * the plex is too degraded (unless it is an internal sync request), and
 * otherwise enqueues the bio for the plex worker thread.
 * NOTE(review): the switch cases for supported commands are omitted from
 * this excerpt; only the EOPNOTSUPP default path is visible.
 */
233 gv_plex_start(struct bio *bp)
237 switch(bp->bio_cmd) {
238 g_io_deliver(bp, EOPNOTSUPP);
249 * We cannot handle this request if too many of our subdisks are
252 p = bp->bio_to->geom->softc;
/* Below GV_PLEX_DEGRADED the plex cannot serve normal I/O;
 * sync requests (GV_BIO_SYNCREQ) are still let through. */
253 if ((p->state < GV_PLEX_DEGRADED) &&
254 !(bp->bio_cflags & GV_BIO_SYNCREQ)) {
255 g_io_deliver(bp, ENXIO);
/* Hand the bio to the worker thread via the sorted bio queue. */
259 mtx_lock(&p->bqueue_mtx);
260 bioq_disksort(p->bqueue, bp);
262 mtx_unlock(&p->bqueue_mtx);
/*
 * Per-plex worker thread.  Loops taking bios off the plex bio queue and
 * dispatches them by flag:
 *   GV_BIO_DONE   - a completed sub-request (sync/rebuild accounting,
 *                   then RAID5 completion handling);
 *   GV_BIO_ONHOLD - a sub-request previously parked because it collided
 *                   with an active stripe, retried here;
 *   otherwise     - a fresh request, mapped into sub-requests.
 * Exits when GV_PLEX_THREAD_DIE is set, marking GV_PLEX_THREAD_DEAD.
 * NOTE(review): the surrounding loop construct and several braces are
 * omitted from this excerpt; statement order (lock, take, sleep, unlock)
 * is exactly as the original and must not be rearranged.
 */
266 gv_plex_worker(void *arg)
273 KASSERT(p != NULL, ("NULL p"));
275 mtx_lock(&p->bqueue_mtx);
277 /* We were signaled to exit. */
278 if (p->flags & GV_PLEX_THREAD_DIE)
281 /* Take the first BIO from our queue. */
282 bp = bioq_takefirst(p->bqueue);
/* Queue empty: sleep on the plex, bounded to hz/10 ticks, with the
 * queue mutex dropped by msleep() while waiting. */
284 msleep(p, &p->bqueue_mtx, PRIBIO, "-", hz/10);
287 mtx_unlock(&p->bqueue_mtx);
289 /* A completed request. */
290 if (bp->bio_cflags & GV_BIO_DONE) {
/* Sync/rebuild completions advance the subdisk's initialization
 * progress counter; when it covers the whole subdisk the subdisk
 * is promoted to the up state. */
291 if (bp->bio_cflags & GV_BIO_SYNCREQ ||
292 bp->bio_cflags & GV_BIO_REBUILD) {
293 s = bp->bio_to->private;
294 if (bp->bio_error == 0)
295 s->initialized += bp->bio_length;
296 if (s->initialized >= s->size) {
298 gv_set_sd_state(s, GV_SD_UP,
305 if (bp->bio_cflags & GV_BIO_SYNCREQ)
308 gv_plex_completed_request(p, bp);
310 * A sub-request that was hold back because it interfered with
311 * another sub-request.
313 } else if (bp->bio_cflags & GV_BIO_ONHOLD) {
314 /* Is it still locked out? */
315 if (gv_stripe_active(p, bp)) {
316 /* Park the bio on the waiting queue. */
317 mtx_lock(&p->bqueue_mtx);
318 bioq_disksort(p->wqueue, bp);
319 mtx_unlock(&p->bqueue_mtx);
/* Stripe is free now: clear the hold flag and issue the I/O. */
321 bp->bio_cflags &= ~GV_BIO_ONHOLD;
322 g_io_request(bp, bp->bio_caller2);
325 /* A normal request to this plex. */
327 gv_plex_normal_request(p, bp);
329 mtx_lock(&p->bqueue_mtx);
331 mtx_unlock(&p->bqueue_mtx);
/* Loop exited: advertise that this worker is gone. */
332 p->flags |= GV_PLEX_THREAD_DEAD;
/*
 * Normal-write parity handling for a RAID5 packet: XOR the completed
 * data into the parity buffer and issue the held-back write (wp->waiting),
 * or issue the final parity write (wp->parity) when it is pending.
 * Returns non-zero when the packet is finished (per the prototype's use
 * in gv_plex_completed_request) — confirm against the full source, as
 * the return statements are omitted from this excerpt.
 */
339 gv_normal_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp)
341 struct bio *cbp, *pbp;
346 if (wp->waiting != NULL) {
/* Fold the data into the parity buffer byte by byte. */
350 for (i = 0; i < wp->length; i++)
351 cbp->bio_data[i] ^= pbp->bio_data[i];
352 g_io_request(pbp, pbp->bio_caller2);
355 } else if (wp->parity != NULL) {
358 g_io_request(cbp, cbp->bio_caller2);
/*
 * Parity-check handling for a RAID5 packet (GV_BIO_CHECK requests):
 * first issues the pending read/write stages, then compares the computed
 * parity against the on-disk parity byte by byte.  A mismatch flags the
 * parent bio with EAGAIN; if GV_BIO_PARITY is also set the corrected
 * parity is written back instead of only being reported.
 * Returns whether the packet has finished (see the 'finished' local);
 * the actual return statements are omitted from this excerpt.
 */
366 gv_check_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp)
369 int err, finished, i;
/* Stage 1: a held-back request still needs to be issued. */
374 if (wp->waiting != NULL) {
377 g_io_request(pbp, pbp->bio_caller2);
380 } else if (wp->parity != NULL) {
384 /* Check if the parity is correct. */
385 for (i = 0; i < wp->length; i++) {
386 if (bp->bio_data[i] != pbp->bio_data[i]) {
392 /* The parity is not correct... */
/* Report the mismatch to the caller via the parent bio. */
394 bp->bio_parent->bio_error = EAGAIN;
396 /* ... but we rebuild it. */
397 if (bp->bio_parent->bio_cflags & GV_BIO_PARITY) {
398 g_io_request(pbp, pbp->bio_caller2);
404 * Clean up the BIO we would have used for rebuilding the
408 bp->bio_parent->bio_inbed++;
/*
 * Handles a completed RAID5 sub-request.  For reads, XORs the returned
 * data into the packet's reconstruction buffer; for writes, drives the
 * parity stage (check or normal).  When a packet's bit list empties, the
 * packet is retired from p->packets and any bios parked on the waiting
 * queue are re-queued for the worker.  Finally propagates errors to the
 * parent bio and delivers it once all children are in.
 * NOTE(review): braces and some case labels are omitted from this
 * excerpt; the read vs. write paths are distinguished by the visible
 * switch on bp->bio_parent->bio_cmd.
 */
418 gv_plex_completed_request(struct gv_plex *p, struct bio *bp)
420 struct bio *cbp, *pbp;
421 struct gv_bioq *bq, *bq2;
422 struct gv_raid5_packet *wp;
/* The RAID5 packet this sub-request belongs to. */
425 wp = bp->bio_driver1;
427 switch (bp->bio_parent->bio_cmd) {
/* Read path: remove our entry from the packet's bit list ... */
432 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
434 TAILQ_REMOVE(&wp->bits, bq, queue);
/* ... and XOR the data in, reconstructing the missing block. */
436 for (i = 0; i < wp->length; i++)
437 wp->data[i] ^= bp->bio_data[i];
/* Packet complete: account the length and retire it. */
441 if (TAILQ_EMPTY(&wp->bits)) {
442 bp->bio_parent->bio_completed += wp->length;
443 if (wp->lockbase != -1) {
444 TAILQ_REMOVE(&p->packets, wp, list);
445 /* Bring the waiting bios back into the game. */
446 mtx_lock(&p->bqueue_mtx);
447 pbp = bioq_takefirst(p->wqueue);
448 while (pbp != NULL) {
449 bioq_disksort(p->bqueue, pbp);
450 pbp = bioq_takefirst(p->wqueue);
452 mtx_unlock(&p->bqueue_mtx);
463 /* Check if we need to handle parity data. */
464 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
466 TAILQ_REMOVE(&wp->bits, bq, queue);
470 for (i = 0; i < wp->length; i++)
478 /* Handle parity data. */
479 if (TAILQ_EMPTY(&wp->bits)) {
/* GV_BIO_CHECK selects verification over the normal write path. */
480 if (bp->bio_parent->bio_cflags & GV_BIO_CHECK)
481 i = gv_check_parity(p, bp, wp);
483 i = gv_normal_parity(p, bp, wp);
485 /* All of our sub-requests have finished. */
487 bp->bio_parent->bio_completed += wp->length;
488 TAILQ_REMOVE(&p->packets, wp, list);
489 /* Bring the waiting bios back into the game. */
490 mtx_lock(&p->bqueue_mtx);
491 pbp = bioq_takefirst(p->wqueue);
492 while (pbp != NULL) {
493 bioq_disksort(p->bqueue, pbp);
494 pbp = bioq_takefirst(p->wqueue);
496 mtx_unlock(&p->bqueue_mtx);
/* Propagate the first error seen to the parent request. */
504 pbp = bp->bio_parent;
505 if (pbp->bio_error == 0)
506 pbp->bio_error = bp->bio_error;
508 /* When the original request is finished, we deliver it. */
510 if (pbp->bio_inbed == pbp->bio_children)
511 g_io_deliver(pbp, pbp->bio_error);
513 /* Clean up what we allocated. */
514 if (bp->bio_cflags & GV_BIO_MALLOC)
515 g_free(bp->bio_data);
/*
 * Turns a fresh request to the plex into sub-requests.  RAID5 plexes get
 * a gv_raid5_packet per chunk (built by gv_rebuild_raid5, gv_check_raid5
 * or gv_build_raid5_req depending on the bio's flags); concatenated and
 * striped plexes go through gv_plexbuffer().  On build failure all
 * partially-created state (packets, bit-list entries, cloned bios and
 * their malloc'ed buffers) is torn down and the bio delivered with the
 * error.  Finally every sub-request is fired, except those colliding
 * with an active stripe, which are parked on the waiting queue.
 * NOTE(review): the enclosing while-loop over bcount and many braces
 * are omitted from this excerpt.
 */
520 gv_plex_normal_request(struct gv_plex *p, struct bio *bp)
522 struct bio *cbp, *pbp;
523 struct gv_bioq *bq, *bq2;
524 struct gv_raid5_packet *wp, *wp2;
529 bcount = bp->bio_length;
531 boff = bp->bio_offset;
533 /* Walk over the whole length of the request, we might split it up. */
538 * RAID5 plexes need special treatment, as a single write
539 * request involves several read/write sub-requests.
541 if (p->org == GV_PLEX_RAID5) {
542 wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
544 TAILQ_INIT(&wp->bits);
/* Pick the packet builder matching the request's purpose. */
546 if (bp->bio_cflags & GV_BIO_REBUILD)
547 err = gv_rebuild_raid5(p, wp, bp, addr,
549 else if (bp->bio_cflags & GV_BIO_CHECK)
550 err = gv_check_raid5(p, wp, bp, addr,
553 err = gv_build_raid5_req(p, wp, bp, addr,
557 * Building the sub-request failed, we probably need to
561 printf("GEOM_VINUM: plex request failed for ");
/* Drain this packet's bit list. */
564 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
565 TAILQ_REMOVE(&wp->bits, bq, queue);
/* Free the held-back and parity bios, including any
 * buffers the builder malloc'ed for them. */
568 if (wp->waiting != NULL) {
569 if (wp->waiting->bio_cflags &
571 g_free(wp->waiting->bio_data);
572 g_destroy_bio(wp->waiting);
574 if (wp->parity != NULL) {
575 if (wp->parity->bio_cflags &
577 g_free(wp->parity->bio_data);
578 g_destroy_bio(wp->parity);
/* Also unwind packets created by earlier loop iterations. */
582 TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
584 TAILQ_REMOVE(&p->packets, wp,
586 TAILQ_FOREACH_SAFE(bq,
587 &wp->bits, queue, bq2) {
588 TAILQ_REMOVE(&wp->bits,
/* Destroy every sub-request cloned so far. */
596 cbp = bp->bio_driver1;
597 while (cbp != NULL) {
598 pbp = cbp->bio_caller1;
599 if (cbp->bio_cflags & GV_BIO_MALLOC)
600 g_free(cbp->bio_data);
605 g_io_deliver(bp, err);
/* Empty packet: nothing to track; locked packets are kept on
 * p->packets so later requests can detect stripe collisions. */
609 if (TAILQ_EMPTY(&wp->bits))
611 else if (wp->lockbase != -1)
612 TAILQ_INSERT_TAIL(&p->packets, wp, list);
615 * Requests to concatenated and striped plexes go straight
619 err = gv_plexbuffer(p, bp, addr, boff, bcount);
621 /* Building the sub-request failed. */
623 printf("GEOM_VINUM: plex request failed for ");
626 cbp = bp->bio_driver1;
627 while (cbp != NULL) {
628 pbp = cbp->bio_caller1;
632 g_io_deliver(bp, err);
637 /* Abuse bio_caller1 as linked list. */
/* Advance past the newest sub-request (the tail of the chain). */
638 pbp = bp->bio_driver1;
639 while (pbp->bio_caller1 != NULL)
640 pbp = pbp->bio_caller1;
641 bcount -= pbp->bio_length;
642 addr += pbp->bio_length;
643 boff += pbp->bio_length;
646 /* Fire off all sub-requests. */
647 pbp = bp->bio_driver1;
648 while (pbp != NULL) {
650 * RAID5 sub-requests need to come in correct order, otherwise
651 * we trip over the parity, as it might be overwritten by
652 * another sub-request.
654 if (pbp->bio_driver1 != NULL &&
655 gv_stripe_active(p, pbp)) {
656 /* Park the bio on the waiting queue. */
657 pbp->bio_cflags |= GV_BIO_ONHOLD;
658 mtx_lock(&p->bqueue_mtx);
659 bioq_disksort(p->wqueue, pbp);
660 mtx_unlock(&p->bqueue_mtx);
662 g_io_request(pbp, pbp->bio_caller2);
663 pbp = pbp->bio_caller1;
/*
 * GEOM access method for the plex: forwards the access-count deltas
 * (dr/dw/de) to every consumer of the geom.  For RAID5 plexes a
 * write-only open/close apparently adjusts the deltas first (the
 * adjustment statements are omitted from this excerpt — confirm against
 * the full source).  If any consumer rejects the change, the deltas
 * already applied to the preceding consumers are rolled back.
 */
668 gv_plex_access(struct g_provider *pp, int dr, int dw, int de)
672 struct g_consumer *cp, *cp2;
677 KASSERT(p != NULL, ("NULL p"));
679 if (p->org == GV_PLEX_RAID5) {
680 if (dw > 0 && dr == 0)
682 else if (dw < 0 && dr == 0)
/* Apply the deltas to each consumer, rolling back on failure. */
686 LIST_FOREACH(cp, &gp->consumer, consumer) {
687 error = g_access(cp, dr, dw, de);
689 LIST_FOREACH(cp2, &gp->consumer, consumer) {
692 g_access(cp2, -dr, -dw, -de);
/*
 * GEOM taste method: offered a provider, attach it to the right plex.
 * Only VINUMDRIVE (subdisk) providers are of interest.  The subdisk is
 * looked up, added to its plex, and either joined to the plex's existing
 * geom (mirroring the access counts of an existing consumer) or a new
 * geom with start/orphan/access methods, bio queues, queue mutex, worker
 * thread and an outside-world provider is created.
 * NOTE(review): numerous lines (returns, error paths, braces) are
 * omitted from this excerpt.
 */
700 static struct g_geom *
701 gv_plex_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
704 struct g_consumer *cp, *cp2;
705 struct g_provider *pp2;
711 g_trace(G_T_TOPOLOGY, "gv_plex_taste(%s, %s)", mp->name, pp->name);
714 /* We only want to attach to subdisks. */
715 if (strcmp(pp->geom->class->name, "VINUMDRIVE"))
718 /* Find the VINUM class and its associated geom. */
719 gp = find_vinum_geom();
723 KASSERT(sc != NULL, ("gv_plex_taste: NULL sc"));
725 /* Find out which subdisk the offered provider corresponds to. */
727 KASSERT(s != NULL, ("gv_plex_taste: NULL s"));
729 /* Now find the correct plex where this subdisk belongs to. */
730 p = gv_find_plex(sc, s->plex);
732 printf("gv_plex_taste: NULL p for '%s'\n", s->name);
737 * Add this subdisk to this plex. Since we trust the on-disk
738 * configuration, we don't check the given value (should we?).
739 * XXX: shouldn't be done here
741 gv_sd_to_plex(p, s, 0);
743 /* Now check if there's already a geom for this plex. */
746 /* Yes, there is already a geom, so we just add the consumer. */
748 cp2 = LIST_FIRST(&gp->consumer);
749 /* Need to attach a new consumer to this subdisk. */
750 cp = g_new_consumer(gp);
751 error = g_attach(cp, pp);
753 printf("geom_vinum: couldn't attach consumer to %s\n",
755 g_destroy_consumer(cp);
758 /* Adjust the access counts of the new consumer. */
/* Mirror the counts of an already-open sibling consumer so all
 * consumers of the geom stay in step. */
759 if ((cp2 != NULL) && (cp2->acr || cp2->acw || cp2->ace)) {
760 error = g_access(cp, cp2->acr, cp2->acw, cp2->ace);
762 printf("geom_vinum: couldn't set access counts"
763 " for consumer on %s\n", pp->name);
765 g_destroy_consumer(cp);
771 /* Adjust the size of the providers this plex has. */
772 LIST_FOREACH(pp2, &gp->provider, provider)
773 pp2->mediasize = p->size;
775 /* Update the size of the volume this plex is attached to. */
776 if (p->vol_sc != NULL)
777 gv_update_vol_size(p->vol_sc, p->size);
780 * If necessary, create bio queues, queue mutex and a worker
/* Lazily (re)create worker infrastructure on the existing geom. */
783 if (p->bqueue == NULL) {
784 p->bqueue = g_malloc(sizeof(struct bio_queue_head),
786 bioq_init(p->bqueue);
788 if (p->wqueue == NULL) {
789 p->wqueue = g_malloc(sizeof(struct bio_queue_head),
791 bioq_init(p->wqueue);
793 if (mtx_initialized(&p->bqueue_mtx) == 0)
794 mtx_init(&p->bqueue_mtx, "gv_plex", NULL, MTX_DEF);
795 if (!(p->flags & GV_PLEX_THREAD_ACTIVE)) {
796 kthread_create(gv_plex_worker, p, NULL, 0, 0, "gv_p %s",
798 p->flags |= GV_PLEX_THREAD_ACTIVE;
803 /* We need to create a new geom. */
805 gp = g_new_geomf(mp, "%s", p->name);
806 gp->start = gv_plex_start;
807 gp->orphan = gv_plex_orphan;
808 gp->access = gv_plex_access;
/* Fresh plex: initialize packet list, queues, mutex and worker. */
812 TAILQ_INIT(&p->packets);
813 p->bqueue = g_malloc(sizeof(struct bio_queue_head),
815 bioq_init(p->bqueue);
816 p->wqueue = g_malloc(sizeof(struct bio_queue_head),
818 bioq_init(p->wqueue);
819 mtx_init(&p->bqueue_mtx, "gv_plex", NULL, MTX_DEF);
820 kthread_create(gv_plex_worker, p, NULL, 0, 0, "gv_p %s",
822 p->flags |= GV_PLEX_THREAD_ACTIVE;
824 /* Attach a consumer to this provider. */
825 cp = g_new_consumer(gp);
829 /* Create a provider for the outside world. */
830 pp2 = g_new_providerf(gp, "gvinum/plex/%s", p->name);
831 pp2->mediasize = p->size;
832 pp2->sectorsize = pp->sectorsize;
834 g_error_provider(pp2, 0);
/*
 * GEOM destroy_geom method: signal the plex worker thread to self
 * destruct, then wither the geom with ENXIO so pending I/O is failed.
 * NOTE(review): the parameter list continues past the visible line and
 * cleanup of queues/mutex is omitted from this excerpt.
 */
840 gv_plex_destroy_geom(struct gctl_req *req, struct g_class *mp,
845 g_trace(G_T_TOPOLOGY, "gv_plex_destroy_geom: %s", gp->name);
850 KASSERT(p != NULL, ("gv_plex_destroy_geom: null p of '%s'", gp->name));
853 * If this is a RAID5 plex, check if its worker thread is still active
854 * and signal it to self destruct.
856 gv_kill_plex_thread(p);
858 g_wither_geom(gp, ENXIO);
862 #define VINUMPLEX_CLASS_NAME "VINUMPLEX"
/*
 * GEOM class registration: taste attaches subdisk providers to plexes,
 * destroy_geom tears a plex geom down.  DECLARE_GEOM_CLASS hooks the
 * class into the GEOM framework at module load.
 */
864 static struct g_class g_vinum_plex_class = {
865 .name = VINUMPLEX_CLASS_NAME,
866 .version = G_VERSION,
867 .taste = gv_plex_taste,
868 .destroy_geom = gv_plex_destroy_geom,
871 DECLARE_GEOM_CLASS(g_vinum_plex_class, g_vinum_plex);