]> CyberLeo.Net >> Repos - FreeBSD/releng/7.2.git/blob - sys/geom/vinum/geom_vinum_raid5.c
Create releng/7.2 from stable/7 in preparation for 7.2-RELEASE.
[FreeBSD/releng/7.2.git] / sys / geom / vinum / geom_vinum_raid5.c
1 /*-
2  * Copyright (c) 2004 Lukas Ertl
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 #include <sys/param.h>
31 #include <sys/bio.h>
32 #include <sys/conf.h>
33 #include <sys/errno.h>
34 #include <sys/kernel.h>
35 #include <sys/kthread.h>
36 #include <sys/libkern.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/systm.h>
41
42 #include <geom/geom.h>
43 #include <geom/vinum/geom_vinum_var.h>
44 #include <geom/vinum/geom_vinum_raid5.h>
45 #include <geom/vinum/geom_vinum.h>
46
/* Defined at the end of this file; declared here for the callers that precede it. */
int	gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount,
	    off_t *real_off, off_t *real_len, int *sdno, int *psdno);
49
50 /*
51  * Check if the stripe that the work packet wants is already being used by
52  * some other work packet.
53  */
54 int
55 gv_stripe_active(struct gv_plex *p, struct bio *bp)
56 {
57         struct gv_raid5_packet *wp, *owp;
58         int overlap;
59
60         wp = bp->bio_driver1;
61         if (wp->lockbase == -1)
62                 return (0);
63
64         overlap = 0;
65         TAILQ_FOREACH(owp, &p->packets, list) {
66                 if (owp == wp)
67                         break;
68                 if ((wp->lockbase >= owp->lockbase) &&
69                     (wp->lockbase <= owp->lockbase + owp->length)) {
70                         overlap++;
71                         break;
72                 }
73                 if ((wp->lockbase <= owp->lockbase) &&
74                     (wp->lockbase + wp->length >= owp->lockbase)) {
75                         overlap++;
76                         break;
77                 }
78         }
79
80         return (overlap);
81 }
82
83 int
84 gv_check_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
85     caddr_t addr, off_t boff, off_t bcount)
86 {
87         struct gv_sd *parity, *s;
88         struct gv_bioq *bq;
89         struct bio *cbp, *pbp;
90         int i, psdno;
91         off_t real_len, real_off;
92
93         if (p == NULL || LIST_EMPTY(&p->subdisks))
94                 return (ENXIO);
95
96         gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno);
97
98         /* Find the right subdisk. */
99         parity = NULL;
100         i = 0;
101         LIST_FOREACH(s, &p->subdisks, in_plex) {
102                 if (i == psdno) {
103                         parity = s;
104                         break;
105                 }
106                 i++;
107         }
108
109         /* Parity stripe not found. */
110         if (parity == NULL)
111                 return (ENXIO);
112
113         if (parity->state != GV_SD_UP)
114                 return (ENXIO);
115
116         wp->length = real_len;
117         wp->data = addr;
118         wp->lockbase = real_off;
119
120         /* Read all subdisks. */
121         LIST_FOREACH(s, &p->subdisks, in_plex) {
122                 /* Skip the parity subdisk. */
123                 if (s == parity)
124                         continue;
125
126                 cbp = g_clone_bio(bp);
127                 if (cbp == NULL)
128                         return (ENOMEM);
129                 cbp->bio_cmd = BIO_READ;
130                 cbp->bio_data = g_malloc(real_len, M_WAITOK);
131                 cbp->bio_cflags |= GV_BIO_MALLOC;
132                 cbp->bio_offset = real_off;
133                 cbp->bio_length = real_len;
134                 cbp->bio_done = gv_plex_done;
135                 cbp->bio_caller2 = s->consumer;
136                 cbp->bio_driver1 = wp;
137
138                 GV_ENQUEUE(bp, cbp, pbp);
139
140                 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
141                 bq->bp = cbp;
142                 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
143         }
144
145         /* Read the parity data. */
146         cbp = g_clone_bio(bp);
147         if (cbp == NULL)
148                 return (ENOMEM);
149         cbp->bio_cmd = BIO_READ;
150         cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
151         cbp->bio_cflags |= GV_BIO_MALLOC;
152         cbp->bio_offset = real_off;
153         cbp->bio_length = real_len;
154         cbp->bio_done = gv_plex_done;
155         cbp->bio_caller2 = parity->consumer;
156         cbp->bio_driver1 = wp;
157         wp->waiting = cbp;
158
159         /*
160          * In case we want to rebuild the parity, create an extra BIO to write
161          * it out.  It also acts as buffer for the XOR operations.
162          */
163         cbp = g_clone_bio(bp);
164         if (cbp == NULL)
165                 return (ENOMEM);
166         cbp->bio_data = addr;
167         cbp->bio_offset = real_off;
168         cbp->bio_length = real_len;
169         cbp->bio_done = gv_plex_done;
170         cbp->bio_caller2 = parity->consumer;
171         cbp->bio_driver1 = wp;
172         wp->parity = cbp;
173
174         return (0);
175 }
176
177 /* Rebuild a degraded RAID5 plex. */
178 int
179 gv_rebuild_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
180     caddr_t addr, off_t boff, off_t bcount)
181 {
182         struct gv_sd *broken, *s;
183         struct gv_bioq *bq;
184         struct bio *cbp, *pbp;
185         off_t real_len, real_off;
186
187         if (p == NULL || LIST_EMPTY(&p->subdisks))
188                 return (ENXIO);
189
190         gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL);
191
192         /* Find the right subdisk. */
193         broken = NULL;
194         LIST_FOREACH(s, &p->subdisks, in_plex) {
195                 if (s->state != GV_SD_UP)
196                         broken = s;
197         }
198
199         /* Broken stripe not found. */
200         if (broken == NULL)
201                 return (ENXIO);
202
203         switch (broken->state) {
204         case GV_SD_UP:
205                 return (EINVAL);
206
207         case GV_SD_STALE:
208                 if (!(bp->bio_cflags & GV_BIO_REBUILD))
209                         return (ENXIO);
210
211                 printf("GEOM_VINUM: sd %s is reviving\n", broken->name);
212                 gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
213                 break;
214
215         case GV_SD_REVIVING:
216                 break;
217
218         default:
219                 /* All other subdisk states mean it's not accessible. */
220                 return (ENXIO);
221         }
222
223         wp->length = real_len;
224         wp->data = addr;
225         wp->lockbase = real_off;
226
227         KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));
228
229         /* Read all subdisks. */
230         LIST_FOREACH(s, &p->subdisks, in_plex) {
231                 /* Skip the broken subdisk. */
232                 if (s == broken)
233                         continue;
234
235                 cbp = g_clone_bio(bp);
236                 if (cbp == NULL)
237                         return (ENOMEM);
238                 cbp->bio_cmd = BIO_READ;
239                 cbp->bio_data = g_malloc(real_len, M_WAITOK);
240                 cbp->bio_cflags |= GV_BIO_MALLOC;
241                 cbp->bio_offset = real_off;
242                 cbp->bio_length = real_len;
243                 cbp->bio_done = gv_plex_done;
244                 cbp->bio_caller2 = s->consumer;
245                 cbp->bio_driver1 = wp;
246
247                 GV_ENQUEUE(bp, cbp, pbp);
248
249                 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
250                 bq->bp = cbp;
251                 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
252         }
253
254         /* Write the parity data. */
255         cbp = g_clone_bio(bp);
256         if (cbp == NULL)
257                 return (ENOMEM);
258         cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
259         cbp->bio_cflags |= GV_BIO_MALLOC;
260         cbp->bio_offset = real_off;
261         cbp->bio_length = real_len;
262         cbp->bio_done = gv_plex_done;
263         cbp->bio_caller2 = broken->consumer;
264         cbp->bio_driver1 = wp;
265         cbp->bio_cflags |= GV_BIO_REBUILD;
266         wp->parity = cbp;
267
268         p->synced = boff;
269
270         return (0);
271 }
272
273 /* Build a request group to perform (part of) a RAID5 request. */
274 int
275 gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp,
276     struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
277 {
278         struct g_geom *gp;
279         struct gv_sd *broken, *original, *parity, *s;
280         struct gv_bioq *bq;
281         struct bio *cbp, *pbp;
282         int i, psdno, sdno, type;
283         off_t real_len, real_off;
284
285         gp = bp->bio_to->geom;
286
287         if (p == NULL || LIST_EMPTY(&p->subdisks))
288                 return (ENXIO);
289
290         /* We are optimistic and assume that this request will be OK. */
291 #define REQ_TYPE_NORMAL         0
292 #define REQ_TYPE_DEGRADED       1
293 #define REQ_TYPE_NOPARITY       2
294
295         type = REQ_TYPE_NORMAL;
296         original = parity = broken = NULL;
297
298         gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno);
299
300         /* Find the right subdisks. */
301         i = 0;
302         LIST_FOREACH(s, &p->subdisks, in_plex) {
303                 if (i == sdno)
304                         original = s;
305                 if (i == psdno)
306                         parity = s;
307                 if (s->state != GV_SD_UP)
308                         broken = s;
309                 i++;
310         }
311
312         if ((original == NULL) || (parity == NULL))
313                 return (ENXIO);
314
315         /* Our data stripe is missing. */
316         if (original->state != GV_SD_UP)
317                 type = REQ_TYPE_DEGRADED;
318         /* Our parity stripe is missing. */
319         if (parity->state != GV_SD_UP) {
320                 /* We cannot take another failure if we're already degraded. */
321                 if (type != REQ_TYPE_NORMAL)
322                         return (ENXIO);
323                 else
324                         type = REQ_TYPE_NOPARITY;
325         }
326
327         wp->length = real_len;
328         wp->data = addr;
329         wp->lockbase = real_off;
330
331         KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
332
333         if ((p->flags & GV_PLEX_SYNCING) && (boff + real_len < p->synced))
334                 type = REQ_TYPE_NORMAL;
335
336         switch (bp->bio_cmd) {
337         case BIO_READ:
338                 /*
339                  * For a degraded read we need to read in all stripes except
340                  * the broken one plus the parity stripe and then recalculate
341                  * the desired data.
342                  */
343                 if (type == REQ_TYPE_DEGRADED) {
344                         bzero(wp->data, wp->length);
345                         LIST_FOREACH(s, &p->subdisks, in_plex) {
346                                 /* Skip the broken subdisk. */
347                                 if (s == broken)
348                                         continue;
349                                 cbp = g_clone_bio(bp);
350                                 if (cbp == NULL)
351                                         return (ENOMEM);
352                                 cbp->bio_data = g_malloc(real_len, M_WAITOK);
353                                 cbp->bio_cflags |= GV_BIO_MALLOC;
354                                 cbp->bio_offset = real_off;
355                                 cbp->bio_length = real_len;
356                                 cbp->bio_done = gv_plex_done;
357                                 cbp->bio_caller2 = s->consumer;
358                                 cbp->bio_driver1 = wp;
359
360                                 GV_ENQUEUE(bp, cbp, pbp);
361
362                                 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
363                                 bq->bp = cbp;
364                                 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
365                         }
366
367                 /* A normal read can be fulfilled with the original subdisk. */
368                 } else {
369                         cbp = g_clone_bio(bp);
370                         if (cbp == NULL)
371                                 return (ENOMEM);
372                         cbp->bio_offset = real_off;
373                         cbp->bio_length = real_len;
374                         cbp->bio_data = addr;
375                         cbp->bio_done = g_std_done;
376                         cbp->bio_caller2 = original->consumer;
377
378                         GV_ENQUEUE(bp, cbp, pbp);
379                 }
380                 wp->lockbase = -1;
381
382                 break;
383
384         case BIO_WRITE:
385                 /*
386                  * A degraded write means we cannot write to the original data
387                  * subdisk.  Thus we need to read in all valid stripes,
388                  * recalculate the parity from the original data, and then
389                  * write the parity stripe back out.
390                  */
391                 if (type == REQ_TYPE_DEGRADED) {
392                         /* Read all subdisks. */
393                         LIST_FOREACH(s, &p->subdisks, in_plex) {
394                                 /* Skip the broken and the parity subdisk. */
395                                 if ((s == broken) || (s == parity))
396                                         continue;
397
398                                 cbp = g_clone_bio(bp);
399                                 if (cbp == NULL)
400                                         return (ENOMEM);
401                                 cbp->bio_cmd = BIO_READ;
402                                 cbp->bio_data = g_malloc(real_len, M_WAITOK);
403                                 cbp->bio_cflags |= GV_BIO_MALLOC;
404                                 cbp->bio_offset = real_off;
405                                 cbp->bio_length = real_len;
406                                 cbp->bio_done = gv_plex_done;
407                                 cbp->bio_caller2 = s->consumer;
408                                 cbp->bio_driver1 = wp;
409
410                                 GV_ENQUEUE(bp, cbp, pbp);
411
412                                 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
413                                 bq->bp = cbp;
414                                 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
415                         }
416
417                         /* Write the parity data. */
418                         cbp = g_clone_bio(bp);
419                         if (cbp == NULL)
420                                 return (ENOMEM);
421                         cbp->bio_data = g_malloc(real_len, M_WAITOK);
422                         cbp->bio_cflags |= GV_BIO_MALLOC;
423                         bcopy(addr, cbp->bio_data, real_len);
424                         cbp->bio_offset = real_off;
425                         cbp->bio_length = real_len;
426                         cbp->bio_done = gv_plex_done;
427                         cbp->bio_caller2 = parity->consumer;
428                         cbp->bio_driver1 = wp;
429                         wp->parity = cbp;
430
431                 /*
432                  * When the parity stripe is missing we just write out the data.
433                  */
434                 } else if (type == REQ_TYPE_NOPARITY) {
435                         cbp = g_clone_bio(bp);
436                         if (cbp == NULL)
437                                 return (ENOMEM);
438                         cbp->bio_offset = real_off;
439                         cbp->bio_length = real_len;
440                         cbp->bio_data = addr;
441                         cbp->bio_done = gv_plex_done;
442                         cbp->bio_caller2 = original->consumer;
443                         cbp->bio_driver1 = wp;
444
445                         GV_ENQUEUE(bp, cbp, pbp);
446
447                         bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
448                         bq->bp = cbp;
449                         TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
450
451                 /*
452                  * A normal write request goes to the original subdisk, then we
453                  * read in all other stripes, recalculate the parity and write
454                  * out the parity again.
455                  */
456                 } else {
457                         /* Read old parity. */
458                         cbp = g_clone_bio(bp);
459                         if (cbp == NULL)
460                                 return (ENOMEM);
461                         cbp->bio_cmd = BIO_READ;
462                         cbp->bio_data = g_malloc(real_len, M_WAITOK);
463                         cbp->bio_cflags |= GV_BIO_MALLOC;
464                         cbp->bio_offset = real_off;
465                         cbp->bio_length = real_len;
466                         cbp->bio_done = gv_plex_done;
467                         cbp->bio_caller2 = parity->consumer;
468                         cbp->bio_driver1 = wp;
469
470                         GV_ENQUEUE(bp, cbp, pbp);
471
472                         bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
473                         bq->bp = cbp;
474                         TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
475
476                         /* Read old data. */
477                         cbp = g_clone_bio(bp);
478                         if (cbp == NULL)
479                                 return (ENOMEM);
480                         cbp->bio_cmd = BIO_READ;
481                         cbp->bio_data = g_malloc(real_len, M_WAITOK);
482                         cbp->bio_cflags |= GV_BIO_MALLOC;
483                         cbp->bio_offset = real_off;
484                         cbp->bio_length = real_len;
485                         cbp->bio_done = gv_plex_done;
486                         cbp->bio_caller2 = original->consumer;
487                         cbp->bio_driver1 = wp;
488
489                         GV_ENQUEUE(bp, cbp, pbp);
490
491                         bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
492                         bq->bp = cbp;
493                         TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
494
495                         /* Write new data. */
496                         cbp = g_clone_bio(bp);
497                         if (cbp == NULL)
498                                 return (ENOMEM);
499                         cbp->bio_data = addr;
500                         cbp->bio_offset = real_off;
501                         cbp->bio_length = real_len;
502                         cbp->bio_done = gv_plex_done;
503                         cbp->bio_caller2 = original->consumer;
504
505                         cbp->bio_driver1 = wp;
506
507                         /*
508                          * We must not write the new data until the old data
509                          * was read, so hold this BIO back until we're ready
510                          * for it.
511                          */
512                         wp->waiting = cbp;
513
514                         /* The final bio for the parity. */
515                         cbp = g_clone_bio(bp);
516                         if (cbp == NULL)
517                                 return (ENOMEM);
518                         cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
519                         cbp->bio_cflags |= GV_BIO_MALLOC;
520                         cbp->bio_offset = real_off;
521                         cbp->bio_length = real_len;
522                         cbp->bio_done = gv_plex_done;
523                         cbp->bio_caller2 = parity->consumer;
524                         cbp->bio_driver1 = wp;
525
526                         /* Remember that this is the BIO for the parity data. */
527                         wp->parity = cbp;
528                 }
529                 break;
530
531         default:
532                 return (EINVAL);
533         }
534
535         return (0);
536 }
537
538 /* Calculate the offsets in the various subdisks for a RAID5 request. */
539 int
540 gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
541     off_t *real_len, int *sdno, int *psdno)
542 {
543         int sd, psd;
544         off_t len_left, stripeend, stripeoff, stripestart;
545
546         /* The number of the subdisk containing the parity stripe. */
547         psd = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) %
548             p->sdcount;
549         KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0"));
550
551         /* Offset of the start address from the start of the stripe. */
552         stripeoff = boff % (p->stripesize * (p->sdcount - 1));
553         KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));
554
555         /* The number of the subdisk where the stripe resides. */
556         sd = stripeoff / p->stripesize;
557         KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0"));
558
559         /* At or past parity subdisk. */
560         if (sd >= psd)
561                 sd++;
562
563         /* The offset of the stripe on this subdisk. */
564         stripestart = (boff - stripeoff) / (p->sdcount - 1);
565         KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));
566
567         stripeoff %= p->stripesize;
568
569         /* The offset of the request on this subdisk. */
570         *real_off = stripestart + stripeoff;
571
572         stripeend = stripestart + p->stripesize;
573         len_left = stripeend - *real_off;
574         KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0"));
575
576         *real_len = (bcount <= len_left) ? bcount : len_left;
577
578         if (sdno != NULL)
579                 *sdno = sd;
580         if (psdno != NULL)
581                 *psdno = psd;
582
583         return (0);
584 }