2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2004, 2007 Lukas Ertl
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
32 #include <sys/param.h>
35 #include <sys/malloc.h>
36 #include <sys/systm.h>
38 #include <geom/geom.h>
39 #include <geom/vinum/geom_vinum_var.h>
40 #include <geom/vinum/geom_vinum_raid5.h>
41 #include <geom/vinum/geom_vinum.h>
/*
 * Forward declarations for the static RAID5 helpers implemented below.
 */
43 static int gv_raid5_offset(struct gv_plex *, off_t, off_t,
44 off_t *, off_t *, int *, int *, int);
45 static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *,
46 struct gv_raid5_packet *, caddr_t, int);
47 static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *,
48 struct bio *, caddr_t, off_t, off_t, int *);
49 static int gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *,
50 struct bio *, caddr_t, off_t, off_t);
51 static int gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *,
52 struct bio *, caddr_t, off_t, off_t);
/*
 * Start a RAID5 request on plex 'p' for BIO 'bp': allocate a work packet
 * and dispatch to the rebuild/check/normal sub-request builders based on
 * the BIO's private flags.  If building the sub-requests fails, tear down
 * all partially-built state and deliver the error to 'bp'.
 * NOTE(review): this is an extracted view -- statements between the lines
 * shown here are not visible; comments describe only the visible code.
 */
54 struct gv_raid5_packet *
55 gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff,
59 	struct gv_raid5_packet *wp, *wp2;
60 	struct gv_bioq *bq, *bq2;
/* Work packet is zero-filled; M_WAITOK means this cannot fail. */
64 	wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
68 	TAILQ_INIT(&wp->bits);
/* Select the builder from the originating BIO's private flags. */
70 	if (bp->bio_pflags & GV_BIO_REBUILD)
71 		err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount);
72 	else if (bp->bio_pflags & GV_BIO_CHECK)
73 		err = gv_raid5_check(p, wp, bp, addr, boff, bcount);
75 		err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay);
77 	/* Means we have a delayed request. */
84 	 * Building the sub-request failed, we probably need to clean up a lot.
87 		G_VINUM_LOGREQ(0, bp, "raid5 plex request failed.");
/* Free the bit-tracking entries attached to this work packet. */
88 		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
89 			TAILQ_REMOVE(&wp->bits, bq, queue);
/*
 * Release the held write BIO and the parity BIO, freeing any data
 * buffers that were malloc'ed for them (GV_BIO_MALLOC marks those).
 */
92 		if (wp->waiting != NULL) {
93 			if (wp->waiting->bio_cflags & GV_BIO_MALLOC)
94 				g_free(wp->waiting->bio_data);
95 			g_destroy_bio(wp->waiting);
97 		if (wp->parity != NULL) {
98 			if (wp->parity->bio_cflags & GV_BIO_MALLOC)
99 				g_free(wp->parity->bio_data);
100 			g_destroy_bio(wp->parity);
/* Drop every other queued work packet for this plex as well. */
104 		TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
108 			TAILQ_REMOVE(&p->packets, wp, list);
109 			TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
110 				TAILQ_REMOVE(&wp->bits, bq, queue);
/* Drain the plex bio queue, freeing malloc'ed data buffers. */
116 		cbp = bioq_takefirst(p->bqueue);
117 		while (cbp != NULL) {
118 			if (cbp->bio_cflags & GV_BIO_MALLOC)
119 				g_free(cbp->bio_data);
121 			cbp = bioq_takefirst(p->bqueue);
124 		/* If internal, stop and reset state. */
125 		if (bp->bio_pflags & GV_BIO_INTERNAL) {
126 			if (bp->bio_pflags & GV_BIO_MALLOC)
127 				g_free(bp->bio_data);
/* Abort any in-progress sync/rebuild/grow on this plex. */
130 			p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
/* Complete the original request with the error. */
134 		g_io_deliver(bp, err);
142  * Check if the stripe that the work packet wants is already being used by
143  * some other work packet.
146 gv_stripe_active(struct gv_plex *p, struct bio *bp)
148 	struct gv_raid5_packet *wp, *owp;
/* The work packet was stashed in the BIO's caller2 field. */
151 	wp = bp->bio_caller2;
/* lockbase == -1 presumably means no stripe lock is needed -- confirm. */
152 	if (wp->lockbase == -1)
/*
 * Scan all active work packets for a lock-range overlap with ours.
 * The two tests cover both orderings of the interval endpoints.
 */
156 	TAILQ_FOREACH(owp, &p->packets, list) {
/* Our lock start falls inside the other packet's locked range. */
159 		if ((wp->lockbase >= owp->lockbase) &&
160 		    (wp->lockbase <= owp->lockbase + owp->length)) {
/* The other packet's lock start falls inside our locked range. */
164 		if ((wp->lockbase <= owp->lockbase) &&
165 		    (wp->lockbase + wp->length >= owp->lockbase)) {
/*
 * Build the sub-requests for a parity-check pass: read every data subdisk
 * and the parity subdisk for the affected stripe, and set up an extra
 * parity BIO that doubles as the XOR scratch buffer (and as the write-out
 * BIO if the parity is to be rebuilt).
 * NOTE(review): extracted view -- interior lines are missing here.
 */
175 gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
176     caddr_t addr, off_t boff, off_t bcount)
178 	struct gv_sd *parity, *s;
182 	off_t real_len, real_off;
/* Nothing to check on a missing or empty plex. */
184 	if (p == NULL || LIST_EMPTY(&p->subdisks))
/* Only the parity subdisk number (psdno) is needed here. */
187 	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1);
189 	/* Find the right subdisk. */
192 	LIST_FOREACH(s, &p->subdisks, in_plex) {
200 	/* Parity stripe not found. */
/* The check only makes sense if the parity subdisk is up. */
204 	if (parity->state != GV_SD_UP)
207 	wp->length = real_len;
209 	wp->lockbase = real_off;
211 	/* Read all subdisks. */
212 	LIST_FOREACH(s, &p->subdisks, in_plex) {
213 		/* Skip the parity subdisk. */
216 		/* Skip growing subdisks. */
217 		if (s->flags & GV_SD_GROW)
/* Clone with a fresh malloc'ed buffer (addr == NULL, use_wp == 1). */
220 		cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
223 		cbp->bio_cmd = BIO_READ;
225 		bioq_insert_tail(p->bqueue, cbp);
/* Track this outstanding sub-request on the work packet's bit list. */
227 		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
229 		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
232 	/* Read the parity data. */
233 	cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
236 	cbp->bio_cmd = BIO_READ;
240 	 * In case we want to rebuild the parity, create an extra BIO to write
241 	 * it out. It also acts as buffer for the XOR operations.
/* This clone reuses the caller's buffer 'addr' instead of malloc'ing. */
243 	cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1);
251 /* Rebuild a degraded RAID5 plex. */
/*
 * Build the sub-requests to rebuild one stripe range: read all healthy
 * subdisks, then write the XOR of their contents out to the broken one.
 * NOTE(review): extracted view -- interior lines are missing here.
 */
253 gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
254     caddr_t addr, off_t boff, off_t bcount)
256 	struct gv_sd *broken, *s;
259 	off_t real_len, real_off;
261 	if (p == NULL || LIST_EMPTY(&p->subdisks))
/* Neither data nor parity subdisk number is needed for a rebuild. */
264 	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1);
266 	/* Find the right subdisk. */
/* The first subdisk that is not UP is the one being rebuilt. */
268 	LIST_FOREACH(s, &p->subdisks, in_plex) {
269 		if (s->state != GV_SD_UP)
273 	/* Broken stripe not found. */
277 	switch (broken->state) {
/* Only a rebuild request may transition the subdisk to REVIVING. */
282 		if (!(bp->bio_pflags & GV_BIO_REBUILD))
285 		G_VINUM_DEBUG(1, "sd %s is reviving", broken->name);
286 		gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
287 		/* Set this bit now, but should be set at end. */
288 		broken->flags |= GV_SD_CANGOUP;
295 	/* All other subdisk states mean it's not accessible. */
299 	wp->length = real_len;
301 	wp->lockbase = real_off;
303 	KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));
305 	/* Read all subdisks. */
306 	LIST_FOREACH(s, &p->subdisks, in_plex) {
307 		/* Skip the broken subdisk. */
311 		/* Skip growing subdisks. */
312 		if (s->flags & GV_SD_GROW)
/* Each healthy subdisk gets a malloc-backed read clone. */
315 		cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
318 		cbp->bio_cmd = BIO_READ;
320 		bioq_insert_tail(p->bqueue, cbp);
/* Track this outstanding read on the work packet's bit list. */
322 		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
324 		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
327 	/* Write the parity data. */
328 	cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1);
335 	/* Post notification that we're finished. */
339 /* Build a request group to perform (part of) a RAID5 request. */
/*
 * Classify the request (NORMAL / DEGRADED / NOPARITY) based on subdisk
 * health, then build the cloned read/write BIOs for the data and parity
 * subdisks.  Requests that fall into a region still being resized or
 * rebuilt are deferred onto p->rqueue instead of being built now.
 * NOTE(review): extracted view -- interior lines are missing here.
 */
341 gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp,
342     struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay)
345 	struct gv_sd *broken, *original, *parity, *s;
348 	int i, psdno, sdno, type, grow;
349 	off_t real_len, real_off;
351 	gp = bp->bio_to->geom;
353 	if (p == NULL || LIST_EMPTY(&p->subdisks))
356 	/* We are optimistic and assume that this request will be OK. */
/* Request classifications; NORMAL is refined below as failures are found. */
357 #define	REQ_TYPE_NORMAL		0
358 #define	REQ_TYPE_DEGRADED	1
359 #define	REQ_TYPE_NOPARITY	2
361 	type = REQ_TYPE_NORMAL;
362 	original = parity = broken = NULL;
364 	/* XXX: The resize won't crash with rebuild or sync, but we should still
365 	 * be aware of it. Also this should perhaps be done on rebuild/check as
368 	/* If we're over, we must use the old. */
369 	if (boff >= p->synced) {
371 	/* Or if over the resized offset, we use all drives. */
372 	} else if (boff + bcount <= p->synced) {
374 	/* Else, we're in the middle, and must wait a bit. */
/* Defer: requeue the BIO in disk-sorted order and retry later. */
376 		bioq_disksort(p->rqueue, bp);
380 	gv_raid5_offset(p, boff, bcount, &real_off, &real_len,
381 	    &sdno, &psdno, grow);
383 	/* Find the right subdisks. */
385 	LIST_FOREACH(s, &p->subdisks, in_plex) {
390 		if (s->state != GV_SD_UP)
/* Both the data and the parity subdisk must have been located. */
395 	if ((original == NULL) || (parity == NULL))
398 	/* Our data stripe is missing. */
399 	if (original->state != GV_SD_UP)
400 		type = REQ_TYPE_DEGRADED;
402 	/* If synchronizing request, just write it if disks are stale. */
403 	if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE &&
404 	    bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) {
405 		type = REQ_TYPE_NORMAL;
406 	/* Our parity stripe is missing. */
407 	} else if (parity->state != GV_SD_UP) {
408 		/* We cannot take another failure if we're already degraded. */
409 		if (type != REQ_TYPE_NORMAL)
412 		type = REQ_TYPE_NOPARITY;
415 	wp->length = real_len;
417 	wp->lockbase = real_off;
419 	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
/* During a rebuild, regions already synced are treated as healthy ... */
421 	if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced))
422 		type = REQ_TYPE_NORMAL;
/* ... while regions at or past the sync point are deferred. */
424 	if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) {
425 		bioq_disksort(p->rqueue, bp);
430 	switch (bp->bio_cmd) {
433 		 * For a degraded read we need to read in all stripes except
434 		 * the broken one plus the parity stripe and then recalculate
437 		if (type == REQ_TYPE_DEGRADED) {
/* The XOR accumulator starts zeroed. */
438 			bzero(wp->data, wp->length);
439 			LIST_FOREACH(s, &p->subdisks, in_plex) {
440 				/* Skip the broken subdisk. */
443 				/* Skip growing if within offset. */
444 				if (grow && s->flags & GV_SD_GROW)
446 				cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
450 				bioq_insert_tail(p->bqueue, cbp);
452 				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
454 				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
457 		/* A normal read can be fulfilled with the original subdisk. */
/* Read straight into the caller's buffer; no work-packet tracking. */
459 			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0);
463 			bioq_insert_tail(p->bqueue, cbp);
471 		 * A degraded write means we cannot write to the original data
472 		 * subdisk. Thus we need to read in all valid stripes,
473 		 * recalculate the parity from the original data, and then
474 		 * write the parity stripe back out.
476 		if (type == REQ_TYPE_DEGRADED) {
477 			/* Read all subdisks. */
478 			LIST_FOREACH(s, &p->subdisks, in_plex) {
479 				/* Skip the broken and the parity subdisk. */
480 				if ((s == broken) || (s == parity))
482 				/* Skip growing if within offset. */
483 				if (grow && s->flags & GV_SD_GROW)
486 				cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
489 				cbp->bio_cmd = BIO_READ;
491 				bioq_insert_tail(p->bqueue, cbp);
493 				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
495 				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
498 			/* Write the parity data. */
499 			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
/* Seed the parity buffer with the new data for the XOR pass. */
502 			bcopy(addr, cbp->bio_data, wp->length);
506 		 * When the parity stripe is missing we just write out the data.
508 		} else if (type == REQ_TYPE_NOPARITY) {
509 			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
513 			bioq_insert_tail(p->bqueue, cbp);
515 			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
517 			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
520 		 * A normal write request goes to the original subdisk, then we
521 		 * read in all other stripes, recalculate the parity and write
522 		 * out the parity again.
525 			/* Read old parity. */
526 			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
529 			cbp->bio_cmd = BIO_READ;
531 			bioq_insert_tail(p->bqueue, cbp);
533 			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
535 			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
/* Read the old data for the read-modify-write parity update. */
538 			cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1);
541 			cbp->bio_cmd = BIO_READ;
543 			bioq_insert_tail(p->bqueue, cbp);
545 			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
547 			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
549 			/* Write new data. */
550 			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
555 			 * We must not write the new data until the old data
556 			 * was read, so hold this BIO back until we're ready
561 			/* The final bio for the parity. */
562 			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
566 			/* Remember that this is the BIO for the parity data. */
579  * Calculate the offsets in the various subdisks for a RAID5 request. Also take
580  * care of new subdisks in an expanded RAID5 array.
581  * XXX: This assumes that the new subdisks are inserted after the others (which
582  * is okay as long as plex_offset is larger). If subdisks are inserted into the
583  * plexlist before, we get problems.
/*
 * Outputs: *real_off / *real_len give the per-subdisk byte offset and the
 * request length clipped to the current stripe; *sdno / *psdno (each may
 * be NULL -- see callers that pass NULL) receive the data and parity
 * subdisk numbers.  'growing' selects whether growing subdisks count
 * toward the stripe-width calculation.
 */
586 gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
587     off_t *real_len, int *sdno, int *psdno, int growing)
590 	int sd, psd, sdcount;
591 	off_t len_left, stripeend, stripeoff, stripestart;
593 	sdcount = p->sdcount;
595 	LIST_FOREACH(s, &p->subdisks, in_plex) {
596 		if (s->flags & GV_SD_GROW)
601 	/* The number of the subdisk containing the parity stripe. */
602 	psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) %
/*
 * NOTE(review): this KASSERT tests the POINTER parameter 'psdno' (which
 * callers legitimately pass as NULL) instead of the computed value 'psd'.
 * It almost certainly should read "psd >= 0" -- confirm and fix upstream.
 */
604 	KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0"));
606 	/* Offset of the start address from the start of the stripe. */
607 	stripeoff = boff % (p->stripesize * (sdcount - 1));
608 	KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));
610 	/* The number of the subdisk where the stripe resides. */
611 	sd = stripeoff / p->stripesize;
/* NOTE(review): same pointer-vs-value issue -- likely meant "sd >= 0". */
612 	KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0"));
614 	/* At or past parity subdisk. */
618 	/* The offset of the stripe on this subdisk. */
619 	stripestart = (boff - stripeoff) / (sdcount - 1);
620 	KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));
/* Reduce to the offset within this single stripe. */
622 	stripeoff %= p->stripesize;
624 	/* The offset of the request on this subdisk. */
625 	*real_off = stripestart + stripeoff;
627 	stripeend = stripestart + p->stripesize;
628 	len_left = stripeend - *real_off;
629 	KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0"));
/* Clip the request so it never crosses a stripe boundary. */
631 	*real_len = (bcount <= len_left) ? bcount : len_left;
642 gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp,
643 caddr_t addr, int use_wp)
647 cbp = g_clone_bio(bp);
651 cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO);
652 cbp->bio_cflags |= GV_BIO_MALLOC;
654 cbp->bio_data = addr;
655 cbp->bio_offset = wp->lockbase + s->drive_offset;
656 cbp->bio_length = wp->length;
657 cbp->bio_done = gv_done;
658 cbp->bio_caller1 = s;
660 cbp->bio_caller2 = wp;