/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include "geom/raid/g_raid.h"
#include "g_raid_tr_if.h"

#define N	2

SYSCTL_DECL(_kern_geom_raid_raid1e);

#define RAID1E_REBUILD_SLAB	(1 << 20) /* One transaction in a rebuild */
static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_slab, 0,
    "Amount of the disk to rebuild each read/write cycle of the rebuild.");

#define RAID1E_REBUILD_FAIR_IO	20 /* use 1/x of the available I/O */
static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_fair_io, 0,
    "Fraction of the I/O bandwidth to use when the disk is busy for rebuild.");

#define RAID1E_REBUILD_CLUSTER_IDLE	100
static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_cluster_idle, 0,
    "Number of slabs to do each time we trigger a rebuild cycle.");

#define RAID1E_REBUILD_META_UPDATE	1024 /* update metadata every 1GB or so */
static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_meta_update, 0,
    "When to update the metadata.");

static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");

#define TR_RAID1E_NONE		0
#define TR_RAID1E_REBUILD	1
#define TR_RAID1E_RESYNC	2

#define TR_RAID1E_F_DOING_SOME	0x1
#define TR_RAID1E_F_LOCKED	0x2
#define TR_RAID1E_F_ABORT	0x4

struct g_raid_tr_raid1e_object {
	struct g_raid_tr_object	 trso_base;
	int			 trso_starting;
	int			 trso_stopping;
	int			 trso_type;
	int			 trso_recover_slabs; /* slabs before rest */
	int			 trso_fair_io;
	int			 trso_meta_update;
	int			 trso_flags;
	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
	void			*trso_buffer;	 /* Buffer space */
	off_t			 trso_lock_pos;	 /* Locked range start. */
	off_t			 trso_lock_len;	 /* Locked range length. */
	struct bio		 trso_bio;
};

static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
static g_raid_tr_event_t g_raid_tr_event_raid1e;
static g_raid_tr_start_t g_raid_tr_start_raid1e;
static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
static g_raid_tr_free_t g_raid_tr_free_raid1e;

static kobj_method_t g_raid_tr_raid1e_methods[] = {
	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
	KOBJMETHOD_END
};

static struct g_raid_tr_class g_raid_tr_raid1e_class = {
	"RAID1E",
	g_raid_tr_raid1e_methods,
	sizeof(struct g_raid_tr_raid1e_object),
	.trc_enable = 1,
	.trc_priority = 200,
	.trc_accept_unmapped = 1
};

static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd);
static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask);

static void
V2P(struct g_raid_volume *vol, off_t virt,
    int *disk, off_t *offset, off_t *start)
{
	off_t nstrip;
	u_int strip_size;

	strip_size = vol->v_strip_size;
	/* Strip number. */
	nstrip = virt / strip_size;
	/* Start position in strip. */
	*start = virt % strip_size;
	/* Disk number. */
	*disk = (nstrip * N) % vol->v_disks_count;
	/* Strip start position in disk. */
	*offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
}

static void
P2V(struct g_raid_volume *vol, int disk, off_t offset,
    off_t *virt, int *copy)
{
	off_t nstrip, start;
	u_int strip_size;

	strip_size = vol->v_strip_size;
	/* Start position in strip. */
	start = offset % strip_size;
	/* Physical strip number. */
	nstrip = (offset / strip_size) * vol->v_disks_count + disk;
	/* Number of physical strip (copy) inside virtual strip. */
	*copy = nstrip % N;
	/* Offset in virtual space. */
	*virt = (nstrip / N) * strip_size + start;
}

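/*
 * Illustrative example (not in the original source) of the RAID1E-A mapping
 * implemented by V2P()/P2V() above: with v_disks_count = 3, N = 2 copies
 * and a 65536-byte strip, virtual offset 200000 lies in virtual strip 3 at
 * in-strip offset 3392.  V2P() gives physical strip 3 * 2 = 6, i.e. disk
 * 6 % 3 = 0 at strip offset (6 / 3) * 65536 = 131072; the mirror copy is
 * physical strip 7 on disk 1 at the same offset.  P2V(disk 0, 131072 + 3392)
 * recovers virtual offset 200000, copy 0.
 */
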
static int
g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
{
	struct g_raid_tr_raid1e_object *trs;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
		return (G_RAID_TR_TASTE_FAIL);
	trs->trso_starting = 1;
	return (G_RAID_TR_TASTE_SUCCEED);
}

static int
g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count / N; i++) {
		bestsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
		}
		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
			/* We found a reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to ACTIVE.",
			    vol->v_name, bestsd->sd_pos,
			    g_raid_subdisk_state2str(bestsd->sd_state));
			g_raid_change_subdisk_state(bestsd,
			    G_RAID_SUBDISK_S_ACTIVE);
			g_raid_write_metadata(sc,
			    vol, bestsd, bestsd->sd_disk);
		}
		worstsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}

static int
g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
	    vol->v_disks_count)
		return (G_RAID_VOLUME_S_OPTIMAL);
	for (i = 0; i < vol->v_disks_count; i++) {
		sd = &vol->v_subdisks[i];
		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
			/* We found a reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to STALE.",
			    vol->v_name, sd->sd_pos,
			    g_raid_subdisk_state2str(sd->sd_state));
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_STALE);
			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
		}
	}
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count; i++) {
		bestsd = &vol->v_subdisks[i];
		worstsd = &vol->v_subdisks[i];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}

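/*
 * Clarifying note (added): with an odd number of disks the copies of
 * consecutive virtual strips rotate around the disk set, so no fixed disk
 * pairing exists.  The loop above therefore evaluates every window of N
 * consecutive subdisks (i .. i + N - 1, modulo v_disks_count), since each
 * such window holds both copies of some strips, and the volume state is the
 * worst state found over all windows.
 */
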
static int
g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
    struct g_raid_subdisk *sd)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	u_int s;

	sc = vol->v_softc;
	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
	if (trs->trso_stopping &&
	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
		s = G_RAID_VOLUME_S_STOPPED;
	else if (trs->trso_starting)
		s = G_RAID_VOLUME_S_STARTING;
	else {
		if ((vol->v_disks_count % N) == 0)
			s = g_raid_tr_update_state_raid1e_even(vol);
		else
			s = g_raid_tr_update_state_raid1e_odd(vol);
	}
	if (s != vol->v_state) {
		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
		    G_RAID_EVENT_VOLUME);
		g_raid_change_volume_state(vol, s);
		if (!trs->trso_starting && !trs->trso_stopping)
			g_raid_write_metadata(sc, vol, NULL, NULL);
	}
	if (!trs->trso_starting && !trs->trso_stopping)
		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
	return (0);
}

static void
g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
    struct g_raid_disk *disk)
{
	struct g_raid_volume *vol;

	vol = sd->sd_volume;
	/*
	 * We don't fail the last disk in the pack, since it still has decent
	 * data on it and that's better than failing the disk if it is the
	 * root file system.
	 *
	 * XXX should this be controlled via a tunable?  It makes sense for
	 * the volume that has / on it.  I can't think of a case where we'd
	 * want the volume to go away on this kind of event.
	 */
	if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
	     vol->v_disks_count) &&
	    (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
		return;
	g_raid_fail_disk(sc, sd, disk);
}

static void
g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;

	vol = trs->trso_base.tro_volume;
	sd = trs->trso_failed_sd;
	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
	free(trs->trso_buffer, M_TR_RAID1E);
	trs->trso_buffer = NULL;
	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
	trs->trso_type = TR_RAID1E_NONE;
	trs->trso_recover_slabs = 0;
	trs->trso_failed_sd = NULL;
	g_raid_tr_update_state_raid1e(vol, NULL);
}

static void
g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
	    "Subdisk %s:%d-%s rebuild completed.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
	sd->sd_rebuild_pos = 0;
	g_raid_tr_raid1e_rebuild_done(trs);
}

static void
g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild is aborting.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags |= TR_RAID1E_F_ABORT;
	} else {
		G_RAID_DEBUG1(0, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild aborted.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
			g_raid_unlock_range(tr->tro_volume,
			    trs->trso_lock_pos, trs->trso_lock_len);
		}
		g_raid_tr_raid1e_rebuild_done(trs);
	}
}

static void
g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio *bp;
	off_t len, virtual, vend, offset, start;
	int disk, copy, best;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
		return;
	vol = tr->tro_volume;
	sc = vol->v_softc;
	sd = trs->trso_failed_sd;

	while (1) {
		if (sd->sd_rebuild_pos >= sd->sd_size) {
			g_raid_tr_raid1e_rebuild_finish(tr);
			return;
		}
		/* Get virtual offset from physical rebuild position. */
		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
		/* Get physical offset back to get first stripe position. */
		V2P(vol, virtual, &disk, &offset, &start);
		/* Calculate contiguous data length. */
		len = MIN(g_raid1e_rebuild_slab,
		    sd->sd_size - sd->sd_rebuild_pos);
		if ((vol->v_disks_count % N) != 0)
			len = MIN(len, vol->v_strip_size - start);
		/* Find disk with most accurate data. */
		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
		    offset + start, len, 0);
		if (best < 0) {
			/* There is no valid disk. */
			g_raid_tr_raid1e_rebuild_abort(tr);
			return;
		} else if (best != copy) {
			/* Some other disk has better data. */
			break;
		}
		/* We have the most accurate data. Skip the range. */
		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
		    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
		sd->sd_rebuild_pos += len;
	}

	bp = &trs->trso_bio;
	memset(bp, 0, sizeof(*bp));
	bp->bio_offset = offset + start +
	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
	bp->bio_length = len;
	bp->bio_data = trs->trso_buffer;
	bp->bio_cmd = BIO_READ;
	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
	/*
	 * If we are crossing stripe boundary, correct affected virtual
	 * range we should lock.
	 */
	if (start + len > vol->v_strip_size) {
		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
		len = vend - virtual;
	}
	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
	trs->trso_flags |= TR_RAID1E_F_LOCKED;
	trs->trso_lock_pos = virtual;
	trs->trso_lock_len = len;
	/* Lock callback starts I/O */
	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
}

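/*
 * Summary of the rebuild cycle (added for clarity): each pass copies up to
 * g_raid1e_rebuild_slab bytes.  P2V()/V2P() locate the virtual range backing
 * the failed subdisk's rebuild position; if another copy is more accurate,
 * that copy is read into trso_buffer under a range lock, and
 * g_raid_tr_iodone_raid1e() later rewrites it to the failed subdisk and
 * advances sd_rebuild_pos.  If the failed subdisk already holds the best
 * copy, the range is simply skipped.
 */
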
static void
g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_failed_sd) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Already rebuild in start rebuild. pos %jd\n",
		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
		return;
	}
	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
	if (sd == NULL)
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
	if (sd == NULL) {
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
		if (sd != NULL) {
			sd->sd_rebuild_pos = 0;
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_RESYNC);
			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
		} else {
			sd = g_raid_get_subdisk(vol,
			    G_RAID_SUBDISK_S_UNINITIALIZED);
			if (sd == NULL)
				sd = g_raid_get_subdisk(vol,
				    G_RAID_SUBDISK_S_NEW);
			if (sd != NULL) {
				sd->sd_rebuild_pos = 0;
				g_raid_change_subdisk_state(sd,
				    G_RAID_SUBDISK_S_REBUILD);
				g_raid_write_metadata(vol->v_softc,
				    vol, sd, NULL);
			}
		}
	}
	if (sd == NULL) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "No failed disk to rebuild.  night night.");
		return;
	}
	trs->trso_failed_sd = sd;
	G_RAID_DEBUG1(0, vol->v_softc,
	    "Subdisk %s:%d-%s rebuild start at %jd.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
	    trs->trso_failed_sd->sd_rebuild_pos);
	trs->trso_type = TR_RAID1E_REBUILD;
	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
	g_raid_tr_raid1e_rebuild_some(tr);
}

static void
g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	int nr;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_stopping)
		return;
	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
	switch(trs->trso_type) {
	case TR_RAID1E_NONE:
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
			return;
		if (nr == 0) {
			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
			if (nr == 0)
				return;
		}
		g_raid_tr_raid1e_rebuild_start(tr);
		break;
	case TR_RAID1E_REBUILD:
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
		    trs->trso_failed_sd == sd)
			g_raid_tr_raid1e_rebuild_abort(tr);
		break;
	case TR_RAID1E_RESYNC:
		break;
	}
}

static int
g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{

	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
	return (0);
}

static int
g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	g_raid_tr_update_state_raid1e(vol, NULL);
	return (0);
}

static int
g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	trs->trso_stopping = 1;
	g_raid_tr_update_state_raid1e(vol, NULL);
	return (0);
}

/*
 * Select the disk to read from.  Take into account: subdisk state, running
 * error recovery, average disk load, head position and possible cache hits.
 */
#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
static int
g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask)
{
	struct g_raid_subdisk *sd;
	off_t offset;
	int i, best, prio, bestprio;

	best = -1;
	bestprio = INT_MAX;
	for (i = 0; i < N; i++) {
		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
		offset = off;
		if (no + i >= vol->v_disks_count)
			offset += vol->v_strip_size;

		prio = G_RAID_SUBDISK_LOAD(sd);
		if ((mask & (1 << sd->sd_pos)) != 0)
			continue;
		switch (sd->sd_state) {
		case G_RAID_SUBDISK_S_ACTIVE:
			break;
		case G_RAID_SUBDISK_S_RESYNC:
			if (offset + len < sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		case G_RAID_SUBDISK_S_STALE:
			prio += i << 24;
			break;
		case G_RAID_SUBDISK_S_REBUILD:
			if (offset + len < sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		default:
			continue;
		}
		prio += min(sd->sd_recovery, 255) << 16;
		/* If disk head is precisely in position - highly prefer it. */
		if (G_RAID_SUBDISK_POS(sd) == offset)
			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
		else
		/* If disk head is close to position - prefer it. */
		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
		    G_RAID_SUBDISK_TRACK_SIZE)
			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
		if (prio < bestprio) {
			best = i;
			bestprio = prio;
		}
	}
	return (best);
}

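/*
 * Illustrative note on the priority computation above (added): lower prio
 * wins.  An idle ACTIVE copy whose head sits exactly at the target offset
 * gets a 2 * G_RAID_SUBDISK_LOAD_SCALE bonus, so it beats an equally idle
 * copy whose head is elsewhere.  Copies busy with error recovery are
 * penalized by up to 255 << 16, and STALE copies (or resyncing ones past
 * their rebuild position) by i << 24, so they are chosen only when nothing
 * better qualifies.
 */
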
static void
g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int best;

	vol = tr->tro_volume;
	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
		addr = NULL;
	else
		addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    no, offset, length, 0);
		KASSERT(best >= 0, ("No readable disk in volume %s!",
		    vol->v_name));
		no += best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			goto failure;
		cbp->bio_offset = offset + start;
		cbp->bio_length = length;
		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
			cbp->bio_ma_offset += (uintptr_t)addr;
			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
			cbp->bio_ma_offset %= PAGE_SIZE;
			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
			    cbp->bio_length) / PAGE_SIZE;
		} else
			cbp->bio_data = addr;
		cbp->bio_caller1 = &vol->v_subdisks[no];
		bioq_insert_tail(&queue, cbp);
		no += N - best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	while ((cbp = bioq_takefirst(&queue)) != NULL)
		g_destroy_bio(cbp);
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

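/*
 * Worked example for the read splitting above (added; same 3-disk, N = 2,
 * 64 KB-strip layout as the V2P() example): a 128 KB read at virtual offset
 * 0 becomes two 64 KB clones.  Virtual strip 0 is read from disk 0 or its
 * mirror on disk 1; virtual strip 1 from disk 2 or its mirror back on disk
 * 0 at offset 65536, which is why "no" wraps and "offset" advances by one
 * strip per turn around the disk set.
 */
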
static void
g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int i;

	vol = tr->tro_volume;
	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
		addr = NULL;
	else
		addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				goto failure;
			cbp->bio_offset = offset + start;
			cbp->bio_length = length;
			if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
			    bp->bio_cmd != BIO_DELETE) {
				cbp->bio_ma_offset += (uintptr_t)addr;
				cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
				cbp->bio_ma_offset %= PAGE_SIZE;
				cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
				    cbp->bio_length) / PAGE_SIZE;
			} else
				cbp->bio_data = addr;
			cbp->bio_caller1 = sd;
			bioq_insert_tail(&queue, cbp);
nextdisk:
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		if (bp->bio_cmd != BIO_DELETE)
			addr += length;
		start = 0;
	}
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	while ((cbp = bioq_takefirst(&queue)) != NULL)
		g_destroy_bio(cbp);
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

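/*
 * Added note: writes are cloned to all N copies of each strip, but a
 * REBUILD subdisk only receives writes below sd_rebuild_pos; anything
 * beyond it will be copied by the rebuild pass anyway, so those clones are
 * skipped.  BIO_DELETE requests carry no payload, hence addr is not
 * advanced for them.
 */
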
static void
g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
		g_raid_iodone(bp, EIO);
		return;
	}
	/*
	 * If we're rebuilding, squeeze in rebuild activity every so often,
	 * even when the disk is busy.  Be sure to only count real I/O
	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
	 * as part of a rebuild.
	 */
	if (trs->trso_failed_sd != NULL &&
	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
		/* Cut the current rebuild round short. */
		trs->trso_recover_slabs = 0;
		if (--trs->trso_fair_io <= 0) {
			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
			g_raid_tr_raid1e_rebuild_some(tr);
		}
	}
	switch (bp->bio_cmd) {
	case BIO_READ:
		g_raid_tr_iostart_raid1e_read(tr, bp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		g_raid_tr_iostart_raid1e_write(tr, bp);
		break;
	case BIO_FLUSH:
		g_raid_tr_flush_common(tr, bp);
		break;
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
		    bp->bio_cmd, vol->v_name));
		break;
	}
}

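/*
 * Added note on the pacing arithmetic: with the default
 * g_raid1e_rebuild_fair_io of 20, every 20th regular (non-SPECIAL) request
 * lets one rebuild transaction through, i.e. roughly 1/20th of a busy
 * volume's bandwidth goes to the rebuild; idle periods instead run
 * g_raid1e_rebuild_cluster_idle slabs per idle callback.
 */
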
static void
g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, struct bio *bp)
{
	struct g_raid_subdisk *nsd;
	struct g_raid_volume *vol;
	struct bio *cbp, *pbp;
	struct g_raid_tr_raid1e_object *trs;
	off_t virtual, offset, start;
	uintptr_t mask;
	int error, do_write, copy, disk, best;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
		if (trs->trso_type == TR_RAID1E_REBUILD) {
			nsd = trs->trso_failed_sd;
			if (bp->bio_cmd == BIO_READ) {

				/* Immediately abort rebuild, if requested. */
				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				/* On read error, skip and cross fingers. */
				if (bp->bio_error != 0) {
					G_RAID_LOGREQ(0, bp,
					    "Read error during rebuild (%d), "
					    "possible data loss!",
					    bp->bio_error);
					goto rebuild_round_done;
				}

				/*
				 * The read operation finished, queue the
				 * write and get out.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
				    bp->bio_error);
				bp->bio_cmd = BIO_WRITE;
				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
				bp->bio_offset = nsd->sd_rebuild_pos;
				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
				g_raid_subdisk_iostart(nsd, bp);
			} else {
				/*
				 * The write operation just finished.  Do
				 * another.  We keep cloning the master bio
				 * since it has the right buffers allocated to
				 * it.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
				    bp->bio_error);
				if (bp->bio_error != 0 ||
				    trs->trso_flags & TR_RAID1E_F_ABORT) {
					if ((trs->trso_flags &
					    TR_RAID1E_F_ABORT) == 0) {
						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
						    nsd, nsd->sd_disk);
					}
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}
rebuild_round_done:
				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
				g_raid_unlock_range(tr->tro_volume,
				    trs->trso_lock_pos, trs->trso_lock_len);
				nsd->sd_rebuild_pos += bp->bio_length;
				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
					g_raid_tr_raid1e_rebuild_finish(tr);
					return;
				}

				/* Abort rebuild if we are stopping */
				if (trs->trso_stopping) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				if (--trs->trso_meta_update <= 0) {
					g_raid_write_metadata(vol->v_softc,
					    vol, nsd, nsd->sd_disk);
					trs->trso_meta_update =
					    g_raid1e_rebuild_meta_update;
					/* Compensate short rebuild I/Os. */
					if ((vol->v_disks_count % N) != 0 &&
					    vol->v_strip_size <
					     g_raid1e_rebuild_slab) {
						trs->trso_meta_update *=
						    g_raid1e_rebuild_slab;
						trs->trso_meta_update /=
						    vol->v_strip_size;
					}
				}
				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
				if (--trs->trso_recover_slabs <= 0)
					return;
				/* Run next rebuild iteration. */
				g_raid_tr_raid1e_rebuild_some(tr);
			}
		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
			/*
			 * read good sd, read bad sd in parallel.  when both
			 * done, compare the buffers.  write good to the bad
			 * if different.  do the next bit of work.
			 */
			panic("Somehow, we think we're doing a resync");
		}
		return;
	}
	pbp = bp->bio_parent;
	pbp->bio_inbed++;
	mask = (intptr_t)bp->bio_caller2;
	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
		/*
		 * Read failed on first drive.  Retry the read error on
		 * another disk drive, if available, before erroring out the
		 * read.
		 */
		sd->sd_disk->d_read_errs++;
		G_RAID_LOGREQ(0, bp,
		    "Read error (%d), %d read errors total",
		    bp->bio_error, sd->sd_disk->d_read_errs);

		/*
		 * If there are too many read errors, we move to degraded.
		 * XXX Do we want to FAIL the drive (e.g., make the user redo
		 * everything to get it back in sync), or just degrade the
		 * drive, which kicks off a resync?
		 */
		do_write = 0;
		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		else if (mask == 0)
			do_write = 1;

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find the other disk, and try to do the I/O to it. */
		mask |= 1 << copy;
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_length = bp->bio_length;
			cbp->bio_data = bp->bio_data;
			cbp->bio_ma = bp->bio_ma;
			cbp->bio_ma_offset = bp->bio_ma_offset;
			cbp->bio_ma_n = bp->bio_ma_n;
			g_destroy_bio(bp);
			nsd = &vol->v_subdisks[disk];
			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
			    nsd->sd_pos);
			if (do_write)
				mask |= 1U << 31;
			if ((mask & (1U << 31)) != 0)
				sd->sd_recovery++;
			cbp->bio_caller2 = (void *)mask;
			if (do_write) {
				cbp->bio_caller1 = nsd;
				/* Lock callback starts I/O */
				g_raid_lock_range(sd->sd_volume,
				    virtual, cbp->bio_length, pbp, cbp);
			} else
				g_raid_subdisk_iostart(nsd, cbp);
			return;
		}
		/*
		 * We can't retry.  Return the original error by falling
		 * through.  This will happen when there's only one good disk.
		 * We don't need to fail the raid, since its actual state is
		 * based on the state of the subdisks.
		 */
		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
	}
	if (bp->bio_cmd == BIO_READ &&
	    bp->bio_error == 0 &&
	    (mask & (1U << 31)) != 0) {
		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find best disk to write. */
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, ~mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_cmd = BIO_WRITE;
			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
			cbp->bio_caller2 = (void *)mask;
			g_destroy_bio(bp);
			G_RAID_LOGREQ(2, cbp,
			    "Attempting bad sector remap on failing drive.");
			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
			return;
		}
	}
	if ((mask & (1U << 31)) != 0) {
		/*
		 * We're done with a recovery, mark the range as unlocked.
		 * For any write errors, we aggressively fail the disk since
		 * there was both a READ and a WRITE error at this location.
		 * Both types of errors generally indicate the drive is on
		 * the verge of total failure anyway.  Better to stop trusting
		 * it now.  However, we need to reset error to 0 in that case
		 * because we're not failing the original I/O which succeeded.
		 */

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		for (copy = 0; copy < N; copy++) {
			if ((mask & (1 << copy)) != 0)
				vol->v_subdisks[(disk + copy) %
				    vol->v_disks_count].sd_recovery--;
		}

		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
			G_RAID_LOGREQ(0, bp, "Remap write failed: "
			    "failing subdisk.");
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
			bp->bio_error = 0;
		}
		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
	}
	if (pbp->bio_cmd != BIO_READ) {
		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
			pbp->bio_error = bp->bio_error;
		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		}
		error = pbp->bio_error;
	} else
		error = bp->bio_error;
	if (pbp->bio_children == pbp->bio_inbed) {
		pbp->bio_completed = pbp->bio_length;
		g_raid_iodone(pbp, error);
	}
}

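/*
 * Added note: bio_caller2 carries the recovery state between retries as a
 * bit mask.  The low bits record which copies have already been tried
 * (mask |= 1 << copy), and bit 31 marks an in-flight recovery that must
 * eventually write the recovered data back and unlock the range, which is
 * why the code above tests (mask & (1U << 31)) before dropping sd_recovery
 * counts and unlocking.
 */
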
static int
g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int i, error;

	vol = tr->tro_volume;
	addr = virtual;
	strip_size = vol->v_strip_size;
	V2P(vol, boffset, &no, &offset, &start);
	remain = blength;
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			error = g_raid_subdisk_kerneldump(sd,
			    addr, 0, offset + start, length);
			if (error != 0)
				return (error);
nextdisk:
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	return (0);
}

static int
g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
{
	struct bio *bp;
	struct g_raid_subdisk *sd;

	bp = (struct bio *)argp;
	sd = (struct g_raid_subdisk *)bp->bio_caller1;
	g_raid_subdisk_iostart(sd, bp);

	return (0);
}

static int
g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
	/* Compensate short rebuild I/Os. */
	if ((vol->v_disks_count % N) != 0 &&
	    vol->v_strip_size < g_raid1e_rebuild_slab) {
		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
		trs->trso_recover_slabs /= vol->v_strip_size;
	}
	if (trs->trso_type == TR_RAID1E_REBUILD)
		g_raid_tr_raid1e_rebuild_some(tr);
	return (0);
}

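/*
 * Added note: with an odd disk count each rebuild transaction is capped at
 * one strip, so the scaling above converts "slabs per idle burst" into an
 * equivalent number of strip-sized transactions, e.g. a 1 MB slab with a
 * 64 KB strip multiplies trso_recover_slabs by 16.
 */
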
static int
g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;

	trs = (struct g_raid_tr_raid1e_object *)tr;

	if (trs->trso_buffer != NULL) {
		free(trs->trso_buffer, M_TR_RAID1E);
		trs->trso_buffer = NULL;
	}
	return (0);
}

G_RAID_TR_DECLARE(raid1e, "RAID1E");