2 * Copyright (c) 2004 Lukas Ertl
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
30 #include <sys/param.h>
33 #include <sys/errno.h>
34 #include <sys/kernel.h>
35 #include <sys/kthread.h>
36 #include <sys/libkern.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/systm.h>
42 #include <geom/geom.h>
43 #include <geom/vinum/geom_vinum_var.h>
44 #include <geom/vinum/geom_vinum_raid5.h>
45 #include <geom/vinum/geom_vinum.h>
47 int gv_raid5_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *,
/*
 * NOTE(review): this listing of gv_stripe_active() is an incomplete extract.
 * The embedded original line numbers skip (57 -> 61, 61 -> 65, 65 -> 68,
 * 69 -> 73), so the assignment of wp, the bodies of the if statements and
 * every return statement are not visible.  The comments added here describe
 * only the lines that are shown.
 */
51 * Check if the stripe that the work packet wants is already being used by
52 * some other work packet.
55 gv_stripe_active(struct gv_plex *p, struct bio *bp)
57 struct gv_raid5_packet *wp, *owp;
/* A lockbase of -1 is tested for first; the action taken is not shown. */
61 if (wp->lockbase == -1)
/* Compare wp against each packet on the plex's packets queue. */
65 TAILQ_FOREACH(owp, &p->packets, list) {
/* Case 1: wp starts at or after owp's start and not beyond owp's end. */
68 if ((wp->lockbase >= owp->lockbase) &&
69 (wp->lockbase <= owp->lockbase + owp->length)) {
/*
 * Case 2: wp starts at or before owp's start and reaches at least that far.
 * NOTE(review): both cases use inclusive bounds (<= and >=), so a range that
 * merely touches the start or end of owp also matches -- confirm that this
 * is intended rather than an off-by-one.
 */
73 if ((wp->lockbase <= owp->lockbase) &&
74 (wp->lockbase + wp->length >= owp->lockbase)) {
/*
 * gv_check_raid5: build the group of BIOs for one parity-check pass over a
 * piece of plex p.  From the visible lines, clones of bp are created for:
 *   - one BIO_READ per subdisk other than the parity subdisk,
 *   - one BIO_READ of the parity subdisk,
 *   - one further BIO aimed at the parity subdisk whose data pointer is addr.
 *
 * p      - plex whose subdisks list is walked
 * wp     - packet; its length and lockbase are set here and entries are
 *          appended to its bits queue
 * bp     - parent bio that every request is cloned from
 * addr   - buffer used as bio_data of the final BIO
 * boff   - offset handed to gv_raid5_offset()
 * bcount - byte count handed to gv_raid5_offset()
 *
 * NOTE(review): this listing is an incomplete extract (the embedded original
 * line numbers skip).  Return statements, the checks on g_clone_bio(), the
 * loop conditions that pick or skip the parity subdisk and the statements
 * that record the last two BIOs in wp are not visible, so they are not
 * described.
 */
84 gv_check_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
85 caddr_t addr, off_t boff, off_t bcount)
87 struct gv_sd *parity, *s;
89 struct bio *cbp, *pbp;
91 off_t real_len, real_off;
/* Guard against a missing plex or a plex without subdisks. */
93 if (p == NULL || LIST_EMPTY(&p->subdisks))
/* Only the parity subdisk number is requested; sdno is passed as NULL. */
96 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno);
98 /* Find the right subdisk. */
101 LIST_FOREACH(s, &p->subdisks, in_plex) {
109 /* Parity stripe not found. */
/* The parity subdisk has to be in state GV_SD_UP. */
113 if (parity->state != GV_SD_UP)
/* Record the range computed by gv_raid5_offset() in the packet. */
116 wp->length = real_len;
118 wp->lockbase = real_off;
120 /* Read all subdisks. */
121 LIST_FOREACH(s, &p->subdisks, in_plex) {
122 /* Skip the parity subdisk. */
126 cbp = g_clone_bio(bp);
129 cbp->bio_cmd = BIO_READ;
/*
 * Each read gets its own real_len-byte buffer and is flagged GV_BIO_MALLOC
 * (presumably so the completion path knows to free it -- confirm).
 */
130 cbp->bio_data = g_malloc(real_len, M_WAITOK);
131 cbp->bio_cflags |= GV_BIO_MALLOC;
132 cbp->bio_offset = real_off;
133 cbp->bio_length = real_len;
/*
 * gv_plex_done is the completion handler; the target consumer is carried in
 * bio_caller2 and the owning packet in bio_driver1.
 */
134 cbp->bio_done = gv_plex_done;
135 cbp->bio_caller2 = s->consumer;
136 cbp->bio_driver1 = wp;
138 GV_ENQUEUE(bp, cbp, pbp);
/* A zeroed bq entry is appended to wp->bits for this read. */
140 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
142 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
145 /* Read the parity data. */
146 cbp = g_clone_bio(bp);
149 cbp->bio_cmd = BIO_READ;
/* Unlike the data reads above, this buffer is allocated with M_ZERO. */
150 cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
151 cbp->bio_cflags |= GV_BIO_MALLOC;
152 cbp->bio_offset = real_off;
153 cbp->bio_length = real_len;
154 cbp->bio_done = gv_plex_done;
155 cbp->bio_caller2 = parity->consumer;
156 cbp->bio_driver1 = wp;
160 * In case we want to rebuild the parity, create an extra BIO to write
161 * it out. It also acts as buffer for the XOR operations.
163 cbp = g_clone_bio(bp);
/* This BIO uses the caller's addr buffer directly; nothing is allocated. */
166 cbp->bio_data = addr;
167 cbp->bio_offset = real_off;
168 cbp->bio_length = real_len;
169 cbp->bio_done = gv_plex_done;
170 cbp->bio_caller2 = parity->consumer;
171 cbp->bio_driver1 = wp;
/*
 * gv_rebuild_raid5: build the group of BIOs for one rebuild step of plex p.
 * From the visible lines, clones of bp are created for one BIO_READ per
 * subdisk other than the broken one, plus one BIO aimed at the broken
 * subdisk that carries a zeroed real_len-byte buffer and the
 * GV_BIO_REBUILD flag.
 *
 * p      - plex whose subdisks list is walked
 * wp     - packet; its length and lockbase are set here and entries are
 *          appended to its bits queue
 * bp     - parent bio that every request is cloned from; its bio_cflags are
 *          tested for GV_BIO_REBUILD
 * addr   - not referenced in the visible lines
 * boff   - offset handed to gv_raid5_offset()
 * bcount - byte count handed to gv_raid5_offset()
 *
 * NOTE(review): this listing is an incomplete extract (the embedded original
 * line numbers skip).  Return statements, case labels of the switch, the
 * checks on g_clone_bio() and the statement that records the broken subdisk
 * are not visible, so they are not described.
 */
177 /* Rebuild a degraded RAID5 plex. */
179 gv_rebuild_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
180 caddr_t addr, off_t boff, off_t bcount)
182 struct gv_sd *broken, *s;
184 struct bio *cbp, *pbp;
185 off_t real_len, real_off;
/* Guard against a missing plex or a plex without subdisks. */
187 if (p == NULL || LIST_EMPTY(&p->subdisks))
/* Only offset and length are needed; sdno and psdno are passed as NULL. */
190 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL);
192 /* Find the right subdisk. */
/* The search keys on subdisks whose state is not GV_SD_UP. */
194 LIST_FOREACH(s, &p->subdisks, in_plex) {
195 if (s->state != GV_SD_UP)
199 /* Broken stripe not found. */
/* What happens next depends on the state of the broken subdisk. */
203 switch (broken->state) {
/* The parent bio is tested for the GV_BIO_REBUILD flag. */
208 if (!(bp->bio_cflags & GV_BIO_REBUILD))
/* Announce the revive and force the subdisk into GV_SD_REVIVING. */
211 printf("GEOM_VINUM: sd %s is reviving\n", broken->name);
212 gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
219 /* All other subdisk states mean it's not accessible. */
/* Record the range computed by gv_raid5_offset() in the packet. */
223 wp->length = real_len;
225 wp->lockbase = real_off;
227 KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));
229 /* Read all subdisks. */
230 LIST_FOREACH(s, &p->subdisks, in_plex) {
231 /* Skip the broken subdisk. */
235 cbp = g_clone_bio(bp);
238 cbp->bio_cmd = BIO_READ;
/*
 * Each read gets its own real_len-byte buffer and is flagged GV_BIO_MALLOC
 * (presumably so the completion path knows to free it -- confirm).
 */
239 cbp->bio_data = g_malloc(real_len, M_WAITOK);
240 cbp->bio_cflags |= GV_BIO_MALLOC;
241 cbp->bio_offset = real_off;
242 cbp->bio_length = real_len;
243 cbp->bio_done = gv_plex_done;
244 cbp->bio_caller2 = s->consumer;
245 cbp->bio_driver1 = wp;
247 GV_ENQUEUE(bp, cbp, pbp);
/* A zeroed bq entry is appended to wp->bits for this read. */
249 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
251 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
/*
 * NOTE(review): the comment below says "parity data", but the BIO that
 * follows is aimed at broken->consumer, not at a parity subdisk.  It
 * presumably carries the reconstructed contents of the broken subdisk --
 * confirm and reword the original comment if so.
 */
254 /* Write the parity data. */
255 cbp = g_clone_bio(bp);
/* Zero-filled buffer of real_len bytes, flagged as allocated here. */
258 cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
259 cbp->bio_cflags |= GV_BIO_MALLOC;
260 cbp->bio_offset = real_off;
261 cbp->bio_length = real_len;
262 cbp->bio_done = gv_plex_done;
263 cbp->bio_caller2 = broken->consumer;
264 cbp->bio_driver1 = wp;
/* Mark this clone as belonging to a rebuild. */
265 cbp->bio_cflags |= GV_BIO_REBUILD;
/*
 * gv_build_raid5_req: translate (part of) the parent bio bp into a group of
 * cloned BIOs for plex p.  The request is classified as one of three local
 * types and the set of clones depends on that type and on bp->bio_cmd:
 *   REQ_TYPE_NORMAL   - neither the data nor the parity subdisk is down
 *   REQ_TYPE_DEGRADED - the data ("original") subdisk is not GV_SD_UP
 *   REQ_TYPE_NOPARITY - the parity subdisk is not GV_SD_UP
 *
 * p      - plex whose subdisks list is walked
 * wp     - packet; its length and lockbase are set here and entries are
 *          appended to its bits queue
 * bp     - parent bio that every request is cloned from
 * addr   - caller data buffer, used directly or copied depending on the path
 * boff   - offset handed to gv_raid5_offset()
 * bcount - byte count handed to gv_raid5_offset()
 *
 * NOTE(review): this listing is an incomplete extract (the embedded original
 * line numbers skip).  Return statements, the case labels of the switch,
 * several loop conditions, the checks on g_clone_bio() and the statements
 * that store the held-back and parity BIOs in wp are not visible, so they
 * are not described.
 */
273 /* Build a request group to perform (part of) a RAID5 request. */
275 gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp,
276 struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
279 struct gv_sd *broken, *original, *parity, *s;
281 struct bio *cbp, *pbp;
282 int i, psdno, sdno, type;
283 off_t real_len, real_off;
285 gp = bp->bio_to->geom;
/* Guard against a missing plex or a plex without subdisks. */
287 if (p == NULL || LIST_EMPTY(&p->subdisks))
290 /* We are optimistic and assume that this request will be OK. */
291 #define REQ_TYPE_NORMAL 0
292 #define REQ_TYPE_DEGRADED 1
293 #define REQ_TYPE_NOPARITY 2
295 type = REQ_TYPE_NORMAL;
296 original = parity = broken = NULL;
/* Both the data subdisk number and the parity subdisk number are wanted. */
298 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno);
300 /* Find the right subdisks. */
302 LIST_FOREACH(s, &p->subdisks, in_plex) {
/* Subdisks that are not GV_SD_UP get special treatment in this loop. */
307 if (s->state != GV_SD_UP)
/* Both the data and the parity subdisk must have been located. */
312 if ((original == NULL) || (parity == NULL))
315 /* Our data stripe is missing. */
316 if (original->state != GV_SD_UP)
317 type = REQ_TYPE_DEGRADED;
318 /* Our parity stripe is missing. */
319 if (parity->state != GV_SD_UP) {
320 /* We cannot take another failure if we're already degraded. */
321 if (type != REQ_TYPE_NORMAL)
324 type = REQ_TYPE_NOPARITY;
/* Record the range computed by gv_raid5_offset() in the packet. */
327 wp->length = real_len;
329 wp->lockbase = real_off;
331 KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
/*
 * While the plex is flagged GV_PLEX_SYNCING, a request that ends below
 * p->synced is handled as a normal request regardless of the
 * classification made above.
 */
333 if ((p->flags & GV_PLEX_SYNCING) && (boff + real_len < p->synced))
334 type = REQ_TYPE_NORMAL;
/* The BIO group that is built depends on the parent's command. */
336 switch (bp->bio_cmd) {
339 * For a degraded read we need to read in all stripes except
340 * the broken one plus the parity stripe and then recalculate
343 if (type == REQ_TYPE_DEGRADED) {
/* Start from a zeroed packet buffer before the reads are queued. */
344 bzero(wp->data, wp->length);
345 LIST_FOREACH(s, &p->subdisks, in_plex) {
346 /* Skip the broken subdisk. */
349 cbp = g_clone_bio(bp);
/* Private real_len-byte buffer per read, flagged GV_BIO_MALLOC. */
352 cbp->bio_data = g_malloc(real_len, M_WAITOK);
353 cbp->bio_cflags |= GV_BIO_MALLOC;
354 cbp->bio_offset = real_off;
355 cbp->bio_length = real_len;
356 cbp->bio_done = gv_plex_done;
357 cbp->bio_caller2 = s->consumer;
358 cbp->bio_driver1 = wp;
360 GV_ENQUEUE(bp, cbp, pbp);
/* A zeroed bq entry is appended to wp->bits for this read. */
362 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
364 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
367 /* A normal read can be fulfilled with the original subdisk. */
369 cbp = g_clone_bio(bp);
/*
 * This clone reads straight into the caller's buffer and completes through
 * g_std_done rather than gv_plex_done; no bio_driver1 assignment is visible
 * on this path.
 */
372 cbp->bio_offset = real_off;
373 cbp->bio_length = real_len;
374 cbp->bio_data = addr;
375 cbp->bio_done = g_std_done;
376 cbp->bio_caller2 = original->consumer;
378 GV_ENQUEUE(bp, cbp, pbp);
386 * A degraded write means we cannot write to the original data
387 * subdisk. Thus we need to read in all valid stripes,
388 * recalculate the parity from the original data, and then
389 * write the parity stripe back out.
391 if (type == REQ_TYPE_DEGRADED) {
392 /* Read all subdisks. */
393 LIST_FOREACH(s, &p->subdisks, in_plex) {
394 /* Skip the broken and the parity subdisk. */
395 if ((s == broken) || (s == parity))
398 cbp = g_clone_bio(bp);
401 cbp->bio_cmd = BIO_READ;
402 cbp->bio_data = g_malloc(real_len, M_WAITOK);
403 cbp->bio_cflags |= GV_BIO_MALLOC;
404 cbp->bio_offset = real_off;
405 cbp->bio_length = real_len;
406 cbp->bio_done = gv_plex_done;
407 cbp->bio_caller2 = s->consumer;
408 cbp->bio_driver1 = wp;
410 GV_ENQUEUE(bp, cbp, pbp);
412 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
414 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
417 /* Write the parity data. */
418 cbp = g_clone_bio(bp);
/*
 * The parity BIO starts out as a private copy of the caller's data: a fresh
 * real_len-byte buffer is allocated and addr is copied into it.
 */
421 cbp->bio_data = g_malloc(real_len, M_WAITOK);
422 cbp->bio_cflags |= GV_BIO_MALLOC;
423 bcopy(addr, cbp->bio_data, real_len);
424 cbp->bio_offset = real_off;
425 cbp->bio_length = real_len;
426 cbp->bio_done = gv_plex_done;
427 cbp->bio_caller2 = parity->consumer;
428 cbp->bio_driver1 = wp;
432 * When the parity stripe is missing we just write out the data.
434 } else if (type == REQ_TYPE_NOPARITY) {
435 cbp = g_clone_bio(bp);
/* The caller's buffer is used directly and sent to the data subdisk. */
438 cbp->bio_offset = real_off;
439 cbp->bio_length = real_len;
440 cbp->bio_data = addr;
441 cbp->bio_done = gv_plex_done;
442 cbp->bio_caller2 = original->consumer;
443 cbp->bio_driver1 = wp;
445 GV_ENQUEUE(bp, cbp, pbp);
447 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
449 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
452 * A normal write request goes to the original subdisk, then we
453 * read in all other stripes, recalculate the parity and write
454 * out the parity again.
457 /* Read old parity. */
458 cbp = g_clone_bio(bp);
461 cbp->bio_cmd = BIO_READ;
462 cbp->bio_data = g_malloc(real_len, M_WAITOK);
463 cbp->bio_cflags |= GV_BIO_MALLOC;
464 cbp->bio_offset = real_off;
465 cbp->bio_length = real_len;
466 cbp->bio_done = gv_plex_done;
467 cbp->bio_caller2 = parity->consumer;
468 cbp->bio_driver1 = wp;
470 GV_ENQUEUE(bp, cbp, pbp);
472 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
474 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
/*
 * Second read of the normal-write path: a BIO_READ aimed at the data
 * subdisk itself (presumably the old data that is about to be overwritten
 * -- confirm).
 */
477 cbp = g_clone_bio(bp);
480 cbp->bio_cmd = BIO_READ;
481 cbp->bio_data = g_malloc(real_len, M_WAITOK);
482 cbp->bio_cflags |= GV_BIO_MALLOC;
483 cbp->bio_offset = real_off;
484 cbp->bio_length = real_len;
485 cbp->bio_done = gv_plex_done;
486 cbp->bio_caller2 = original->consumer;
487 cbp->bio_driver1 = wp;
489 GV_ENQUEUE(bp, cbp, pbp);
491 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
493 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
495 /* Write new data. */
496 cbp = g_clone_bio(bp);
/* The new data is written from the caller's buffer, without a copy. */
499 cbp->bio_data = addr;
500 cbp->bio_offset = real_off;
501 cbp->bio_length = real_len;
502 cbp->bio_done = gv_plex_done;
503 cbp->bio_caller2 = original->consumer;
505 cbp->bio_driver1 = wp;
508 * We must not write the new data until the old data
509 * was read, so hold this BIO back until we're ready
514 /* The final bio for the parity. */
515 cbp = g_clone_bio(bp);
/* Zero-filled buffer of real_len bytes, flagged as allocated here. */
518 cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
519 cbp->bio_cflags |= GV_BIO_MALLOC;
520 cbp->bio_offset = real_off;
521 cbp->bio_length = real_len;
522 cbp->bio_done = gv_plex_done;
523 cbp->bio_caller2 = parity->consumer;
524 cbp->bio_driver1 = wp;
526 /* Remember that this is the BIO for the parity data. */
/*
 * gv_raid5_offset: map a plex-relative request (boff, bcount) onto a single
 * subdisk of RAID5 plex p.
 *
 * p        - plex; stripesize and sdcount are read from it
 * boff     - request offset
 * bcount   - request length in bytes
 * real_off - out: stripestart + stripeoff, the request's offset on the subdisk
 * real_len - out: bcount, limited to what is left of the stripe after
 *            real_off
 * sdno     - pointer parameter; callers in this file pass either NULL or the
 *            address of an int
 * psdno    - pointer parameter; callers in this file pass either NULL or the
 *            address of an int
 *
 * NOTE(review): this listing is an incomplete extract (the embedded original
 * line numbers skip).  The declarations of sd and psd, the continuation of
 * the psd expression, the adjustment announced by the "At or past parity
 * subdisk" comment, the stores through sdno/psdno and the return statement
 * are not visible, so they are not described.
 *
 * NOTE(review): no guard against p->stripesize == 0 or p->sdcount == 1 is
 * visible; either would make the divisor stripesize * (sdcount - 1) zero.
 * Confirm that callers rule this out.
 */
538 /* Calculate the offsets in the various subdisks for a RAID5 request. */
540 gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
541 off_t *real_len, int *sdno, int *psdno)
544 off_t len_left, stripeend, stripeoff, stripestart;
/*
 * NOTE(review): the KASSERT following the psd computation tests psdno, which
 * is the int * parameter, not the value psd that was just computed.  The
 * comparison of a pointer against 0 does not check the computed number, and
 * callers may pass NULL here.  The message text suggests psd was meant --
 * confirm and fix.
 */
546 /* The number of the subdisk containing the parity stripe. */
547 psd = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) %
549 KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0"));
551 /* Offset of the start address from the start of the stripe. */
552 stripeoff = boff % (p->stripesize * (p->sdcount - 1));
553 KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));
/*
 * NOTE(review): same pattern as above -- the KASSERT after the sd
 * computation tests the int * parameter sdno rather than the computed sd.
 */
555 /* The number of the subdisk where the stripe resides. */
556 sd = stripeoff / p->stripesize;
557 KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0"));
559 /* At or past parity subdisk. */
563 /* The offset of the stripe on this subdisk. */
564 stripestart = (boff - stripeoff) / (p->sdcount - 1);
565 KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));
/* Reduce stripeoff to the offset inside a single stripesize-sized piece. */
567 stripeoff %= p->stripesize;
569 /* The offset of the request on this subdisk. */
570 *real_off = stripestart + stripeoff;
/* Bytes remaining between real_off and the end of this stripe piece. */
572 stripeend = stripestart + p->stripesize;
573 len_left = stripeend - *real_off;
574 KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0"));
/* The request is clipped so that it never crosses the stripe boundary. */
576 *real_len = (bcount <= len_left) ? bcount : len_left;