2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
32 #include <sys/param.h>
34 #include <sys/endian.h>
35 #include <sys/kernel.h>
37 #include <sys/limits.h>
39 #include <sys/malloc.h>
40 #include <sys/mutex.h>
41 #include <sys/sysctl.h>
42 #include <sys/systm.h>
43 #include <geom/geom.h>
44 #include <geom/geom_dbg.h>
45 #include "geom/raid/g_raid.h"
46 #include "g_raid_tr_if.h"
/* Tunables controlling rebuild pacing; all are loader- and run-time
 * settable under kern.geom.raid.raid1e (CTLFLAG_RWTUN). */
50 SYSCTL_DECL(_kern_geom_raid_raid1e);
/* Bytes transferred per rebuild read/write cycle (1 MiB). */
52 #define RAID1E_REBUILD_SLAB (1 << 20) /* One transaction in a rebuild */
53 static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
54 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
55 &g_raid1e_rebuild_slab, 0,
56 "Amount of the disk to rebuild each read/write cycle of the rebuild.");
/* Under load, inject one rebuild I/O per this many normal I/Os. */
58 #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
59 static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
60 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
61 &g_raid1e_rebuild_fair_io, 0,
62 "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
/* Number of slabs processed per idle-triggered rebuild burst. */
64 #define RAID1E_REBUILD_CLUSTER_IDLE 100
65 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
66 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
67 &g_raid1e_rebuild_cluster_idle, 0,
68 "Number of slabs to do each time we trigger a rebuild cycle");
/* Slabs between metadata checkpoints (1024 x 1 MiB ~= 1 GiB). */
70 #define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
71 static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
72 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
73 &g_raid1e_rebuild_meta_update, 0,
74 "When to update the meta data.");
76 static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
/* trso_type values: what recovery activity (if any) is in progress. */
78 #define TR_RAID1E_NONE 0
79 #define TR_RAID1E_REBUILD 1
80 #define TR_RAID1E_RESYNC 2
/* trso_flags bits: rebuild I/O in flight / range locked / abort requested. */
82 #define TR_RAID1E_F_DOING_SOME 0x1
83 #define TR_RAID1E_F_LOCKED 0x2
84 #define TR_RAID1E_F_ABORT 0x4
/* Per-volume soft state of the RAID1E transformation.
 * NOTE(review): this extraction is incomplete -- members referenced later
 * in the file (trso_starting, trso_stopping, trso_type, trso_fair_io,
 * trso_meta_update) and the closing brace are missing from view; verify
 * against the full source before editing. */
86 struct g_raid_tr_raid1e_object {
87 	struct g_raid_tr_object trso_base;
91 	int trso_recover_slabs; /* slabs before rest */
95 	struct g_raid_subdisk *trso_failed_sd; /* like per volume */
96 	void *trso_buffer; /* Buffer space */
97 	off_t trso_lock_pos; /* Locked range start. */
98 	off_t trso_lock_len; /* Locked range length. */
102 static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
103 static g_raid_tr_event_t g_raid_tr_event_raid1e;
104 static g_raid_tr_start_t g_raid_tr_start_raid1e;
105 static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
106 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
107 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
108 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
109 static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
110 static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
111 static g_raid_tr_free_t g_raid_tr_free_raid1e;
/* kobj(9) dispatch table binding the generic g_raid transformation
 * interface (g_raid_tr_if.h) to the RAID1E implementations below.
 * NOTE(review): the terminating KOBJMETHOD_END and closing "};" are
 * missing from this extraction. */
113 static kobj_method_t g_raid_tr_raid1e_methods[] = {
114 KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1e),
115 KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1e),
116 KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1e),
117 KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1e),
118 KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1e),
119 KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1e),
120 KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
121 KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1e),
122 KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1e),
123 KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1e),
/* Transformation-class descriptor registered via G_RAID_TR_DECLARE at the
 * bottom of the file.  trc_accept_unmapped advertises support for
 * unmapped (page-list) BIOs.  NOTE(review): the name/priority initializer
 * lines and closing "};" are missing from this extraction. */
127 static struct g_raid_tr_class g_raid_tr_raid1e_class = {
129 g_raid_tr_raid1e_methods,
130 sizeof(struct g_raid_tr_raid1e_object),
133 .trc_accept_unmapped = 1
136 static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
137 static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
138 struct g_raid_subdisk *sd);
139 static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
140 int no, off_t off, off_t len, u_int mask);
143 V2P(struct g_raid_volume *vol, off_t virt,
144 int *disk, off_t *offset, off_t *start)
149 strip_size = vol->v_strip_size;
151 nstrip = virt / strip_size;
152 /* Start position in strip. */
153 *start = virt % strip_size;
155 *disk = (nstrip * N) % vol->v_disks_count;
156 /* Strip start position in disk. */
157 *offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
161 P2V(struct g_raid_volume *vol, int disk, off_t offset,
162 off_t *virt, int *copy)
167 strip_size = vol->v_strip_size;
168 /* Start position in strip. */
169 start = offset % strip_size;
170 /* Physical strip number. */
171 nstrip = (offset / strip_size) * vol->v_disks_count + disk;
172 /* Number of physical strip (copy) inside virtual strip. */
174 /* Offset in virtual space. */
175 *virt = (nstrip / N) * strip_size + start;
179 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
181 struct g_raid_tr_raid1e_object *trs;
183 trs = (struct g_raid_tr_raid1e_object *)tr;
184 if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
185 tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
186 return (G_RAID_TR_TASTE_FAIL);
187 trs->trso_starting = 1;
188 return (G_RAID_TR_TASTE_SUCCEED);
/* Compute the volume state when the disk count divides evenly by N:
 * disks form independent N-disk mirror groups; the volume is only as
 * healthy as its worst group.  NOTE(review): this extraction is missing
 * the return-type line, the "bestsd = sd" / "worstsd = sd" assignments
 * under the comparison branches, the per-group state merge, the final
 * return and all closing braces -- comments describe visible code only. */
192 g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
194 struct g_raid_softc *sc;
195 struct g_raid_subdisk *sd, *bestsd, *worstsd;
196 int i, j, state, sstate;
199 state = G_RAID_VOLUME_S_OPTIMAL;
/* Walk each group of N consecutive subdisks. */
200 for (i = 0; i < vol->v_disks_count / N; i++) {
201 bestsd = &vol->v_subdisks[i * N];
/* Find the freshest subdisk in the group; ties between
 * REBUILD/RESYNC states break toward the larger rebuild position. */
202 for (j = 1; j < N; j++) {
203 sd = &vol->v_subdisks[i * N + j];
204 if (sd->sd_state > bestsd->sd_state)
206 else if (sd->sd_state == bestsd->sd_state &&
207 (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
208 sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
209 sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
/* Promote a usable-but-not-ACTIVE best disk so the group has an
 * authoritative copy, and persist that in metadata. */
212 if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
213 bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
214 /* We found reasonable candidate. */
216 "Promote subdisk %s:%d from %s to ACTIVE.",
217 vol->v_name, bestsd->sd_pos,
218 g_raid_subdisk_state2str(bestsd->sd_state));
219 g_raid_change_subdisk_state(bestsd,
220 G_RAID_SUBDISK_S_ACTIVE);
221 g_raid_write_metadata(sc,
222 vol, bestsd, bestsd->sd_disk);
/* Second pass: find the weakest subdisk to grade the group. */
224 worstsd = &vol->v_subdisks[i * N];
225 for (j = 1; j < N; j++) {
226 sd = &vol->v_subdisks[i * N + j];
227 if (sd->sd_state < worstsd->sd_state)
/* Map the group's best/worst pair onto a volume-level state. */
230 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
231 sstate = G_RAID_VOLUME_S_OPTIMAL;
232 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
233 sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
234 else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
235 sstate = G_RAID_VOLUME_S_DEGRADED;
237 sstate = G_RAID_VOLUME_S_BROKEN;
/* Compute the volume state when the disk count does not divide by N:
 * copies wrap around the disk ring, so every window of N consecutive
 * disks (modulo disk count) is examined.  NOTE(review): extraction is
 * missing the return-type line, the best/worst assignment statements,
 * the state merge, the return and closing braces. */
245 g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
247 struct g_raid_softc *sc;
248 struct g_raid_subdisk *sd, *bestsd, *worstsd;
249 int i, j, state, sstate;
/* Fast path: all subdisks ACTIVE => OPTIMAL (the comparison's RHS,
 * presumably vol->v_disks_count, fell on a missing line). */
252 if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
254 return (G_RAID_VOLUME_S_OPTIMAL);
/* Promote UNINITIALIZED subdisks to STALE and persist metadata. */
255 for (i = 0; i < vol->v_disks_count; i++) {
256 sd = &vol->v_subdisks[i];
257 if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
258 /* We found reasonable candidate. */
260 "Promote subdisk %s:%d from %s to STALE.",
261 vol->v_name, sd->sd_pos,
262 g_raid_subdisk_state2str(sd->sd_state));
263 g_raid_change_subdisk_state(sd,
264 G_RAID_SUBDISK_S_STALE);
265 g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
268 state = G_RAID_VOLUME_S_OPTIMAL;
/* Grade every N-wide window of consecutive disks (ring-wrapped). */
269 for (i = 0; i < vol->v_disks_count; i++) {
270 bestsd = &vol->v_subdisks[i];
271 worstsd = &vol->v_subdisks[i];
272 for (j = 1; j < N; j++) {
273 sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
274 if (sd->sd_state > bestsd->sd_state)
276 else if (sd->sd_state == bestsd->sd_state &&
277 (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
278 sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
279 sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
281 if (sd->sd_state < worstsd->sd_state)
/* Window state from its worst and best members. */
284 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
285 sstate = G_RAID_VOLUME_S_OPTIMAL;
286 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
287 sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
288 else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
289 sstate = G_RAID_VOLUME_S_DEGRADED;
291 sstate = G_RAID_VOLUME_S_BROKEN;
/* Recompute and publish the volume state, emit UP/DOWN events on a
 * change, checkpoint metadata, and kick the rebuild logic.  'sd' is the
 * subdisk whose state change triggered the update (may be NULL).
 * NOTE(review): extraction is missing the return-type line, the 'sc' and
 * 's' initializations, an 'else' before the even/odd dispatch, the final
 * return and closing braces. */
299 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
300 struct g_raid_subdisk *sd)
302 struct g_raid_tr_raid1e_object *trs;
303 struct g_raid_softc *sc;
307 trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
/* Lifecycle states take precedence over computed health. */
308 if (trs->trso_stopping &&
309 (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
310 s = G_RAID_VOLUME_S_STOPPED;
311 else if (trs->trso_starting)
312 s = G_RAID_VOLUME_S_STARTING;
/* Health computation differs for even vs. odd disk counts. */
314 if ((vol->v_disks_count % N) == 0)
315 s = g_raid_tr_update_state_raid1e_even(vol);
317 s = g_raid_tr_update_state_raid1e_odd(vol);
319 if (s != vol->v_state) {
320 g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
321 G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
322 G_RAID_EVENT_VOLUME);
323 g_raid_change_volume_state(vol, s);
/* Don't thrash metadata during start/stop transitions. */
324 if (!trs->trso_starting && !trs->trso_stopping)
325 g_raid_write_metadata(sc, vol, NULL, NULL);
327 if (!trs->trso_starting && !trs->trso_stopping)
328 g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
/* Fail a disk unless it is the last one holding usable data: if every
 * subdisk is at least UNINITIALIZED (nothing already failed), keep it,
 * since stale data beats no data.  NOTE(review): extraction is missing
 * the return-type line, 'vol' initialization and braces. */
333 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
334 struct g_raid_disk *disk)
336 struct g_raid_volume *vol;
340 * We don't fail the last disk in the pack, since it still has decent
341 * data on it and that's better than failing the disk if it is the root
344 * XXX should this be controlled via a tunable? It makes sense for
345 * the volume that has / on it. I can't think of a case where we'd
346 * want the volume to go away on this kind of event.
/* Only fail when some other subdisk is already dead (counts of live-ish
 * states fall short of v_disks_count) -- i.e. this is not the last
 * data-bearing disk -- and this subdisk itself still has data. */
348 if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
349 g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
350 g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
351 g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
352 vol->v_disks_count) &&
353 (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
355 g_raid_fail_disk(sc, sd, disk);
359 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
361 struct g_raid_volume *vol;
362 struct g_raid_subdisk *sd;
364 vol = trs->trso_base.tro_volume;
365 sd = trs->trso_failed_sd;
366 g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
367 free(trs->trso_buffer, M_TR_RAID1E);
368 trs->trso_buffer = NULL;
369 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
370 trs->trso_type = TR_RAID1E_NONE;
371 trs->trso_recover_slabs = 0;
372 trs->trso_failed_sd = NULL;
373 g_raid_tr_update_state_raid1e(vol, NULL);
377 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
379 struct g_raid_tr_raid1e_object *trs;
380 struct g_raid_subdisk *sd;
382 trs = (struct g_raid_tr_raid1e_object *)tr;
383 sd = trs->trso_failed_sd;
384 G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
385 "Subdisk %s:%d-%s rebuild completed.",
386 sd->sd_volume->v_name, sd->sd_pos,
387 sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
388 g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
389 sd->sd_rebuild_pos = 0;
390 g_raid_tr_raid1e_rebuild_done(trs);
394 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
396 struct g_raid_tr_raid1e_object *trs;
397 struct g_raid_subdisk *sd;
398 struct g_raid_volume *vol;
400 vol = tr->tro_volume;
401 trs = (struct g_raid_tr_raid1e_object *)tr;
402 sd = trs->trso_failed_sd;
403 if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
404 G_RAID_DEBUG1(1, vol->v_softc,
405 "Subdisk %s:%d-%s rebuild is aborting.",
406 sd->sd_volume->v_name, sd->sd_pos,
407 sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
408 trs->trso_flags |= TR_RAID1E_F_ABORT;
410 G_RAID_DEBUG1(0, vol->v_softc,
411 "Subdisk %s:%d-%s rebuild aborted.",
412 sd->sd_volume->v_name, sd->sd_pos,
413 sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
414 trs->trso_flags &= ~TR_RAID1E_F_ABORT;
415 if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
416 trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
417 g_raid_unlock_range(tr->tro_volume,
418 trs->trso_lock_pos, trs->trso_lock_len);
420 g_raid_tr_raid1e_rebuild_done(trs);
/* Perform one slab of rebuild work: pick the most accurate source copy,
 * issue a SYNC read into the bounce buffer (the matching write is issued
 * from iodone), protected by a range lock on the affected virtual span.
 * NOTE(review): extraction is missing the return-type line, 'sc'/'bp'
 * declarations and several returns/braces; the "©);" tokens are
 * HTML-entity mangling of the original "&copy);" argument. */
425 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
427 struct g_raid_tr_raid1e_object *trs;
428 struct g_raid_softc *sc;
429 struct g_raid_volume *vol;
430 struct g_raid_subdisk *sd;
432 off_t len, virtual, vend, offset, start;
433 int disk, copy, best;
435 trs = (struct g_raid_tr_raid1e_object *)tr;
/* Only one rebuild transaction at a time. */
436 if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
438 vol = tr->tro_volume;
440 sd = trs->trso_failed_sd;
/* Whole disk rebuilt -- finish up. */
443 if (sd->sd_rebuild_pos >= sd->sd_size) {
444 g_raid_tr_raid1e_rebuild_finish(tr);
447 /* Get virtual offset from physical rebuild position. */
448 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, ©);
449 /* Get physical offset back to get first stripe position. */
450 V2P(vol, virtual, &disk, &offset, &start);
451 /* Calculate contignous data length. */
452 len = MIN(g_raid1e_rebuild_slab,
453 sd->sd_size - sd->sd_rebuild_pos);
/* Odd layouts wrap copies mid-slab; clamp to one strip. */
454 if ((vol->v_disks_count % N) != 0)
455 len = MIN(len, vol->v_strip_size - start);
456 /* Find disk with most accurate data. */
457 best = g_raid_tr_raid1e_select_read_disk(vol, disk,
458 offset + start, len, 0);
460 /* There is no any valid disk. */
461 g_raid_tr_raid1e_rebuild_abort(tr);
463 } else if (best != copy) {
464 /* Some other disk has better data. */
467 /* We have the most accurate data. Skip the range. */
468 G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
469 sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
470 sd->sd_rebuild_pos += len;
/* Build the synchronous rebuild read targeting the best copy. */
474 memset(bp, 0, sizeof(*bp));
475 bp->bio_offset = offset + start +
476 ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
477 bp->bio_length = len;
478 bp->bio_data = trs->trso_buffer;
479 bp->bio_cmd = BIO_READ;
480 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
481 bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
482 G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
484 * If we are crossing stripe boundary, correct affected virtual
485 * range we should lock.
487 if (start + len > vol->v_strip_size) {
488 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, ©);
489 len = vend - virtual;
491 trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
492 trs->trso_flags |= TR_RAID1E_F_LOCKED;
493 trs->trso_lock_pos = virtual;
494 trs->trso_lock_len = len;
495 /* Lock callback starts I/O */
496 g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
/* Start a rebuild/resync: pick a candidate subdisk in priority order
 * (RESYNC, REBUILD, STALE->resync, UNINITIALIZED/NEW->rebuild), allocate
 * the slab bounce buffer and issue the first iteration.
 * NOTE(review): extraction is missing the return-type line, several
 * returns, else/brace structure and the tail of the "no candidate"
 * branch. */
500 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
502 struct g_raid_volume *vol;
503 struct g_raid_tr_raid1e_object *trs;
504 struct g_raid_subdisk *sd;
506 vol = tr->tro_volume;
507 trs = (struct g_raid_tr_raid1e_object *)tr;
/* A rebuild is already running -- nothing to do. */
508 if (trs->trso_failed_sd) {
509 G_RAID_DEBUG1(1, vol->v_softc,
510 "Already rebuild in start rebuild. pos %jd\n",
511 (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
/* Candidate selection, most- to least-advanced recovery state. */
514 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
516 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
518 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
/* STALE disk: restart a resync from the beginning. */
520 sd->sd_rebuild_pos = 0;
521 g_raid_change_subdisk_state(sd,
522 G_RAID_SUBDISK_S_RESYNC);
523 g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
525 sd = g_raid_get_subdisk(vol,
526 G_RAID_SUBDISK_S_UNINITIALIZED);
528 sd = g_raid_get_subdisk(vol,
529 G_RAID_SUBDISK_S_NEW);
/* Fresh disk: full rebuild from position zero. */
531 sd->sd_rebuild_pos = 0;
532 g_raid_change_subdisk_state(sd,
533 G_RAID_SUBDISK_S_REBUILD);
534 g_raid_write_metadata(vol->v_softc,
540 G_RAID_DEBUG1(1, vol->v_softc,
541 "No failed disk to rebuild. night night.");
544 trs->trso_failed_sd = sd;
545 G_RAID_DEBUG1(0, vol->v_softc,
546 "Subdisk %s:%d-%s rebuild start at %jd.",
547 sd->sd_volume->v_name, sd->sd_pos,
548 sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
549 trs->trso_failed_sd->sd_rebuild_pos);
550 trs->trso_type = TR_RAID1E_REBUILD;
/* M_WAITOK: safe to sleep here; buffer freed in rebuild_done(). */
551 trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
552 trs->trso_meta_update = g_raid1e_rebuild_meta_update;
553 g_raid_tr_raid1e_rebuild_some(tr);
/* Decide, after a state change, whether to start or abort a rebuild:
 * start one if idle with rebuildable disks present; abort the running
 * one if the volume broke, targets vanished, or the rebuilt disk itself
 * changed state.  NOTE(review): extraction is missing the return-type
 * line, 'nr' declaration, case labels, returns/breaks and braces. */
557 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
558 struct g_raid_subdisk *sd)
560 struct g_raid_volume *vol;
561 struct g_raid_tr_raid1e_object *trs;
564 vol = tr->tro_volume;
565 trs = (struct g_raid_tr_raid1e_object *)tr;
/* No new activity while the volume is stopping. */
566 if (trs->trso_stopping)
568 nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
569 g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
570 switch(trs->trso_type) {
/* (presumably case TR_RAID1E_NONE: -- label on a missing line) */
572 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
575 nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
576 g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
577 g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
581 g_raid_tr_raid1e_rebuild_start(tr);
583 case TR_RAID1E_REBUILD:
/* Abort if volume unhealthy, no targets left, or the subdisk
 * being rebuilt is the one that changed. */
584 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
585 trs->trso_failed_sd == sd)
586 g_raid_tr_raid1e_rebuild_abort(tr);
588 case TR_RAID1E_RESYNC:
594 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
595 struct g_raid_subdisk *sd, u_int event)
598 g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
603 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
605 struct g_raid_tr_raid1e_object *trs;
606 struct g_raid_volume *vol;
608 trs = (struct g_raid_tr_raid1e_object *)tr;
609 vol = tr->tro_volume;
610 trs->trso_starting = 0;
611 g_raid_tr_update_state_raid1e(vol, NULL);
616 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
618 struct g_raid_tr_raid1e_object *trs;
619 struct g_raid_volume *vol;
621 trs = (struct g_raid_tr_raid1e_object *)tr;
622 vol = tr->tro_volume;
623 trs->trso_starting = 0;
624 trs->trso_stopping = 1;
625 g_raid_tr_update_state_raid1e(vol, NULL);
630 * Select the disk to read from. Take into account: subdisk state, running
631 * error recovery, average disk load, head position and possible cache hits.
633 #define ABS(x) (((x) >= 0) ? (x) : (-(x)))
/* Returns the copy index (0..N-1, relative to disk 'no') with the lowest
 * cost, or a negative value if no copy can serve [off, off+len).  'mask'
 * excludes subdisks (bit per sd_pos) already tried.  NOTE(review):
 * extraction is missing the return-type line, 'offset' declaration and
 * per-case penalty/continue lines; comments describe visible code only. */
635 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
636 int no, off_t off, off_t len, u_int mask)
638 struct g_raid_subdisk *sd;
640 int i, best, prio, bestprio;
644 for (i = 0; i < N; i++) {
645 sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
/* Copies past the last disk wrap to the next strip row. */
647 if (no + i >= vol->v_disks_count)
648 offset += vol->v_strip_size;
/* Base cost: current queue load on the subdisk. */
650 prio = G_RAID_SUBDISK_LOAD(sd);
651 if ((mask & (1 << sd->sd_pos)) != 0)
/* Eligibility by subdisk state: RESYNC/REBUILD copies are valid
 * only below their rebuild position. */
653 switch (sd->sd_state) {
654 case G_RAID_SUBDISK_S_ACTIVE:
656 case G_RAID_SUBDISK_S_RESYNC:
657 if (offset + off < sd->sd_rebuild_pos)
660 case G_RAID_SUBDISK_S_STALE:
663 case G_RAID_SUBDISK_S_REBUILD:
664 if (offset + off < sd->sd_rebuild_pos)
/* Penalize disks with error recovery in progress. */
670 prio += min(sd->sd_recovery, 255) << 16;
671 /* If disk head is precisely in position - highly prefer it. */
672 if (G_RAID_SUBDISK_POS(sd) == offset)
673 prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
675 /* If disk head is close to position - prefer it. */
676 if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
677 G_RAID_SUBDISK_TRACK_SIZE)
678 prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
679 if (prio < bestprio) {
/* Split a volume READ into per-strip clones, each routed to the best
 * available copy, queue them locally, then dispatch -- two-phase so a
 * clone failure can cancel the whole request with ENOMEM.  NOTE(review):
 * extraction is missing the return-type line, 'cbp'/'addr'/'best' decls,
 * the bioq_init call, loop brackets and the failure-path g_destroy_bio;
 * comments describe visible code only. */
688 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
690 struct g_raid_volume *vol;
691 struct g_raid_subdisk *sd;
692 struct bio_queue_head queue;
695 off_t offset, start, length, remain;
696 u_int no, strip_size;
699 vol = tr->tro_volume;
/* Unmapped BIOs carry page lists; 'addr' then tracks an offset. */
700 if ((bp->bio_flags & BIO_UNMAPPED) != 0)
704 strip_size = vol->v_strip_size;
705 V2P(vol, bp->bio_offset, &no, &offset, &start);
706 remain = bp->bio_length;
/* Per-strip loop: clamp to strip end, pick best copy. */
709 length = MIN(strip_size - start, remain);
710 best = g_raid_tr_raid1e_select_read_disk(vol,
711 no, offset, length, 0);
712 KASSERT(best >= 0, ("No readable disk in volume %s!",
/* Advance to the chosen copy, wrapping to the next strip row. */
715 if (no >= vol->v_disks_count) {
716 no -= vol->v_disks_count;
717 offset += strip_size;
719 cbp = g_clone_bio(bp);
722 cbp->bio_offset = offset + start;
723 cbp->bio_length = length;
724 if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
725 cbp->bio_ma_offset += (uintptr_t)addr;
726 cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
727 cbp->bio_ma_offset %= PAGE_SIZE;
728 cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
729 cbp->bio_length) / PAGE_SIZE;
731 cbp->bio_data = addr;
732 cbp->bio_caller1 = &vol->v_subdisks[no];
733 bioq_insert_tail(&queue, cbp);
735 if (no >= vol->v_disks_count) {
736 no -= vol->v_disks_count;
737 offset += strip_size;
/* Dispatch phase: all clones built successfully. */
743 while ((cbp = bioq_takefirst(&queue)) != NULL) {
744 sd = cbp->bio_caller1;
745 cbp->bio_caller1 = NULL;
746 g_raid_subdisk_iostart(sd, cbp);
/* Failure path: drain queued clones and fail the parent. */
750 while ((cbp = bioq_takefirst(&queue)) != NULL)
752 if (bp->bio_error == 0)
753 bp->bio_error = ENOMEM;
754 g_raid_iodone(bp, bp->bio_error);
/* Split a volume WRITE (or DELETE) into clones for every writable copy
 * of each strip: ACTIVE/STALE/RESYNC disks always, REBUILD disks only
 * below their rebuild position.  Same two-phase queue/dispatch structure
 * as the read path.  NOTE(review): extraction is missing the return-type
 * line, 'cbp'/'addr'/'i' decls, bioq_init, and several break/brace
 * lines; comments describe visible code only. */
758 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
760 struct g_raid_volume *vol;
761 struct g_raid_subdisk *sd;
762 struct bio_queue_head queue;
765 off_t offset, start, length, remain;
766 u_int no, strip_size;
769 vol = tr->tro_volume;
770 if ((bp->bio_flags & BIO_UNMAPPED) != 0)
774 strip_size = vol->v_strip_size;
775 V2P(vol, bp->bio_offset, &no, &offset, &start);
776 remain = bp->bio_length;
779 length = MIN(strip_size - start, remain);
/* One clone per copy of this strip. */
780 for (i = 0; i < N; i++) {
781 sd = &vol->v_subdisks[no];
782 switch (sd->sd_state) {
783 case G_RAID_SUBDISK_S_ACTIVE:
784 case G_RAID_SUBDISK_S_STALE:
785 case G_RAID_SUBDISK_S_RESYNC:
787 case G_RAID_SUBDISK_S_REBUILD:
/* Beyond the rebuild frontier: skip, rebuild will copy it. */
788 if (offset + start >= sd->sd_rebuild_pos)
794 cbp = g_clone_bio(bp);
797 cbp->bio_offset = offset + start;
798 cbp->bio_length = length;
/* DELETE carries no data pages, so no ma fixup for it. */
799 if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
800 bp->bio_cmd != BIO_DELETE) {
801 cbp->bio_ma_offset += (uintptr_t)addr;
802 cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
803 cbp->bio_ma_offset %= PAGE_SIZE;
804 cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
805 cbp->bio_length) / PAGE_SIZE;
807 cbp->bio_data = addr;
808 cbp->bio_caller1 = sd;
809 bioq_insert_tail(&queue, cbp);
811 if (++no >= vol->v_disks_count) {
813 offset += strip_size;
817 if (bp->bio_cmd != BIO_DELETE)
/* Dispatch phase. */
821 while ((cbp = bioq_takefirst(&queue)) != NULL) {
822 sd = cbp->bio_caller1;
823 cbp->bio_caller1 = NULL;
824 g_raid_subdisk_iostart(sd, cbp);
/* Failure path: drain queued clones, fail parent with ENOMEM. */
828 while ((cbp = bioq_takefirst(&queue)) != NULL)
830 if (bp->bio_error == 0)
831 bp->bio_error = ENOMEM;
832 g_raid_iodone(bp, bp->bio_error);
/* Top-level I/O entry: reject I/O unless the volume is usable, give the
 * rebuild a fairness slice under load, then dispatch by command.
 * NOTE(review): extraction is missing the return-type line, case labels
 * for READ/WRITE/DELETE/FLUSH, breaks and braces. */
836 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
838 struct g_raid_volume *vol;
839 struct g_raid_tr_raid1e_object *trs;
841 vol = tr->tro_volume;
842 trs = (struct g_raid_tr_raid1e_object *)tr;
843 if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
844 vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
845 vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
846 g_raid_iodone(bp, EIO);
850 * If we're rebuilding, squeeze in rebuild activity every so often,
851 * even when the disk is busy. Be sure to only count real I/O
852 * to the disk. All 'SPECIAL' I/O is traffic generated to the disk
855 if (trs->trso_failed_sd != NULL &&
856 !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
857 /* Make this new or running now round short. */
858 trs->trso_recover_slabs = 0;
/* Every g_raid1e_rebuild_fair_io normal I/Os, inject one
 * rebuild transaction. */
859 if (--trs->trso_fair_io <= 0) {
860 trs->trso_fair_io = g_raid1e_rebuild_fair_io;
861 g_raid_tr_raid1e_rebuild_some(tr);
864 switch (bp->bio_cmd) {
866 g_raid_tr_iostart_raid1e_read(tr, bp);
870 g_raid_tr_iostart_raid1e_write(tr, bp);
874 g_raid_tr_flush_common(tr, bp);
877 KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
878 bp->bio_cmd, vol->v_name));
/* I/O completion handler.  Three concerns interleave here:
 *  1. SYNC (rebuild) completions: read done -> reissue same bio as a
 *     write to the failed subdisk; write done -> advance rebuild_pos,
 *     checkpoint metadata periodically, schedule the next slab.
 *  2. Failed normal reads: retry on another copy (mask in bio_caller2
 *     tracks tried disks; bit 31 marks a recovery in progress), then
 *     remap-write the recovered data back to the failing disk.
 *  3. Parent bio accounting: propagate errors, complete when all
 *     children are in.
 * NOTE(review): extraction is missing the return-type line, 'pbp'/'cbp'/
 * 'mask' declarations, many returns, else branches and braces; "©);"
 * is HTML-entity mangling of the original "&copy);" argument.  Comments
 * describe only what is visible. */
884 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
885 struct g_raid_subdisk *sd, struct bio *bp)
888 struct g_raid_subdisk *nsd;
889 struct g_raid_volume *vol;
891 struct g_raid_tr_raid1e_object *trs;
892 off_t virtual, offset, start;
894 int error, do_write, copy, disk, best;
896 trs = (struct g_raid_tr_raid1e_object *)tr;
897 vol = tr->tro_volume;
/* --- Rebuild (SYNC) completions --- */
898 if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
899 if (trs->trso_type == TR_RAID1E_REBUILD) {
900 nsd = trs->trso_failed_sd;
901 if (bp->bio_cmd == BIO_READ) {
902 /* Immediately abort rebuild, if requested. */
903 if (trs->trso_flags & TR_RAID1E_F_ABORT) {
904 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
905 g_raid_tr_raid1e_rebuild_abort(tr);
909 /* On read error, skip and cross fingers. */
910 if (bp->bio_error != 0) {
912 "Read error during rebuild (%d), "
913 "possible data loss!",
915 goto rebuild_round_done;
919 * The read operation finished, queue the
922 G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
/* Reuse the same bio as the write leg, targeted at the
 * rebuilt subdisk's current position. */
924 bp->bio_cmd = BIO_WRITE;
925 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
926 bp->bio_offset = nsd->sd_rebuild_pos;
927 G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
928 g_raid_subdisk_iostart(nsd, bp);
931 * The write operation just finished. Do
932 * another. We keep cloning the master bio
933 * since it has the right buffers allocated to
936 G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
/* Write error (not a requested abort): fail the disk. */
938 if (bp->bio_error != 0 ||
939 trs->trso_flags & TR_RAID1E_F_ABORT) {
940 if ((trs->trso_flags &
941 TR_RAID1E_F_ABORT) == 0) {
942 g_raid_tr_raid1e_fail_disk(sd->sd_softc,
945 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
946 g_raid_tr_raid1e_rebuild_abort(tr);
/* Slab copied: release the range lock, advance position. */
950 trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
951 g_raid_unlock_range(tr->tro_volume,
952 trs->trso_lock_pos, trs->trso_lock_len);
953 nsd->sd_rebuild_pos += bp->bio_length;
954 if (nsd->sd_rebuild_pos >= nsd->sd_size) {
955 g_raid_tr_raid1e_rebuild_finish(tr);
959 /* Abort rebuild if we are stopping */
960 if (trs->trso_stopping) {
961 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
962 g_raid_tr_raid1e_rebuild_abort(tr);
/* Periodic metadata checkpoint of the rebuild position. */
966 if (--trs->trso_meta_update <= 0) {
967 g_raid_write_metadata(vol->v_softc,
968 vol, nsd, nsd->sd_disk);
969 trs->trso_meta_update =
970 g_raid1e_rebuild_meta_update;
971 /* Compensate short rebuild I/Os. */
972 if ((vol->v_disks_count % N) != 0 &&
974 g_raid1e_rebuild_slab) {
975 trs->trso_meta_update *=
976 g_raid1e_rebuild_slab;
977 trs->trso_meta_update /=
981 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
982 if (--trs->trso_recover_slabs <= 0)
984 /* Run next rebuild iteration. */
985 g_raid_tr_raid1e_rebuild_some(tr);
987 } else if (trs->trso_type == TR_RAID1E_RESYNC) {
989 * read good sd, read bad sd in parallel. when both
990 * done, compare the buffers. write good to the bad
991 * if different. do the next bit of work.
993 panic("Somehow, we think we're doing a resync")
997 pbp = bp->bio_parent;
999 mask = (intptr_t)bp->bio_caller2;
/* --- Failed normal read: retry on another copy --- */
1000 if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
1002 * Read failed on first drive. Retry the read error on
1003 * another disk drive, if available, before erroring out the
1006 sd->sd_disk->d_read_errs++;
1007 G_RAID_LOGREQ(0, bp,
1008 "Read error (%d), %d read errors total",
1009 bp->bio_error, sd->sd_disk->d_read_errs);
1012 * If there are too many read errors, we move to degraded.
1013 * XXX Do we want to FAIL the drive (eg, make the user redo
1014 * everything to get it back in sync), or just degrade the
1015 * drive, which kicks off a resync?
1018 if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
1019 g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1023 /* Restore what we were doing. */
1024 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©);
1025 V2P(vol, virtual, &disk, &offset, &start);
1027 /* Find the other disk, and try to do the I/O to it. */
1029 best = g_raid_tr_raid1e_select_read_disk(vol,
1030 disk, offset, start, mask);
1031 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1033 if (disk >= vol->v_disks_count) {
1034 disk -= vol->v_disks_count;
1035 offset += vol->v_strip_size;
1037 cbp->bio_offset = offset + start;
1038 cbp->bio_length = bp->bio_length;
1039 cbp->bio_data = bp->bio_data;
1040 cbp->bio_ma = bp->bio_ma;
1041 cbp->bio_ma_offset = bp->bio_ma_offset;
1042 cbp->bio_ma_n = bp->bio_ma_n;
1044 nsd = &vol->v_subdisks[disk];
1045 G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
/* Bit 31 of mask flags an active recovery; carry the tried-disk
 * mask to the retry via bio_caller2. */
1049 if ((mask & (1U << 31)) != 0)
1051 cbp->bio_caller2 = (void *)mask;
1053 cbp->bio_caller1 = nsd;
1054 /* Lock callback starts I/O */
1055 g_raid_lock_range(sd->sd_volume,
1056 virtual, cbp->bio_length, pbp, cbp);
1058 g_raid_subdisk_iostart(nsd, cbp);
1063 * We can't retry. Return the original error by falling
1064 * through. This will happen when there's only one good disk.
1065 * We don't need to fail the raid, since its actual state is
1066 * based on the state of the subdisks.
1068 G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
/* --- Successful recovery read: remap-write back to bad copy --- */
1070 if (bp->bio_cmd == BIO_READ &&
1071 bp->bio_error == 0 &&
1072 (mask & (1U << 31)) != 0) {
1073 G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
1075 /* Restore what we were doing. */
1076 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©);
1077 V2P(vol, virtual, &disk, &offset, &start);
1079 /* Find best disk to write. */
1080 best = g_raid_tr_raid1e_select_read_disk(vol,
1081 disk, offset, start, ~mask);
1082 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1084 if (disk >= vol->v_disks_count) {
1085 disk -= vol->v_disks_count;
1086 offset += vol->v_strip_size;
1088 cbp->bio_offset = offset + start;
1089 cbp->bio_cmd = BIO_WRITE;
1090 cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
1091 cbp->bio_caller2 = (void *)mask;
1093 G_RAID_LOGREQ(2, cbp,
1094 "Attempting bad sector remap on failing drive.");
1095 g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
/* --- Recovery epilogue: drop per-subdisk recovery counts, unlock --- */
1099 if ((mask & (1U << 31)) != 0) {
1101 * We're done with a recovery, mark the range as unlocked.
1102 * For any write errors, we aggressively fail the disk since
1103 * there was both a READ and a WRITE error at this location.
1104 * Both types of errors generally indicates the drive is on
1105 * the verge of total failure anyway. Better to stop trusting
1106 * it now. However, we need to reset error to 0 in that case
1107 * because we're not failing the original I/O which succeeded.
1110 /* Restore what we were doing. */
1111 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©);
1112 V2P(vol, virtual, &disk, &offset, &start);
1114 for (copy = 0; copy < N; copy++) {
1115 if ((mask & (1 << copy) ) != 0)
1116 vol->v_subdisks[(disk + copy) %
1117 vol->v_disks_count].sd_recovery--;
1120 if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
1121 G_RAID_LOGREQ(0, bp, "Remap write failed: "
1122 "failing subdisk.");
1123 g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1126 G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
1127 g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
/* --- Parent bio error propagation and completion --- */
1129 if (pbp->bio_cmd != BIO_READ) {
1130 if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
1131 pbp->bio_error = bp->bio_error;
1132 if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
1133 G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
1134 g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1136 error = pbp->bio_error;
1138 error = bp->bio_error;
1140 if (pbp->bio_children == pbp->bio_inbed) {
1141 pbp->bio_completed = pbp->bio_length;
1142 g_raid_iodone(pbp, error);
/* Kernel crash-dump write path: synchronously write each strip-sized
 * piece to every writable copy (same eligibility rules as the normal
 * write path), with no bio allocation.  NOTE(review): extraction is
 * missing the return-type line, 'addr'/'error'/'i' declarations, the
 * 'remain' initialization, error checks and the final return. */
1147 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
1148 void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
1150 struct g_raid_volume *vol;
1151 struct g_raid_subdisk *sd;
1152 struct bio_queue_head queue;
1154 off_t offset, start, length, remain;
1155 u_int no, strip_size;
1158 vol = tr->tro_volume;
1160 strip_size = vol->v_strip_size;
1161 V2P(vol, boffset, &no, &offset, &start);
1164 while (remain > 0) {
1165 length = MIN(strip_size - start, remain);
/* Write this piece to each of the N copies. */
1166 for (i = 0; i < N; i++) {
1167 sd = &vol->v_subdisks[no];
1168 switch (sd->sd_state) {
1169 case G_RAID_SUBDISK_S_ACTIVE:
1170 case G_RAID_SUBDISK_S_STALE:
1171 case G_RAID_SUBDISK_S_RESYNC:
1173 case G_RAID_SUBDISK_S_REBUILD:
1174 if (offset + start >= sd->sd_rebuild_pos)
1180 error = g_raid_subdisk_kerneldump(sd,
1181 addr, 0, offset + start, length);
1185 if (++no >= vol->v_disks_count) {
1187 offset += strip_size;
1198 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
1201 struct g_raid_subdisk *sd;
1203 bp = (struct bio *)argp;
1204 sd = (struct g_raid_subdisk *)bp->bio_caller1;
1205 g_raid_subdisk_iostart(sd, bp);
1211 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
1213 struct g_raid_tr_raid1e_object *trs;
1214 struct g_raid_volume *vol;
1216 vol = tr->tro_volume;
1217 trs = (struct g_raid_tr_raid1e_object *)tr;
1218 trs->trso_fair_io = g_raid1e_rebuild_fair_io;
1219 trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
1220 /* Compensate short rebuild I/Os. */
1221 if ((vol->v_disks_count % N) != 0 &&
1222 vol->v_strip_size < g_raid1e_rebuild_slab) {
1223 trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
1224 trs->trso_recover_slabs /= vol->v_strip_size;
1226 if (trs->trso_type == TR_RAID1E_REBUILD)
1227 g_raid_tr_raid1e_rebuild_some(tr);
1232 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
1234 struct g_raid_tr_raid1e_object *trs;
1236 trs = (struct g_raid_tr_raid1e_object *)tr;
1238 if (trs->trso_buffer != NULL) {
1239 free(trs->trso_buffer, M_TR_RAID1E);
1240 trs->trso_buffer = NULL;
1245 G_RAID_TR_DECLARE(raid1e, "RAID1E");