/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include "g_raid.h"
#include "g_raid_tr_if.h"

#define N       2

SYSCTL_DECL(_kern_geom_raid_raid1e);

#define RAID1E_REBUILD_SLAB     (1 << 20) /* One transaction in a rebuild */
static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_slab, 0,
    "Amount of the disk to rebuild each read/write cycle of the rebuild.");

#define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_fair_io, 0,
    "Fraction of the I/O bandwidth to use when disk busy for rebuild.");

#define RAID1E_REBUILD_CLUSTER_IDLE 100
static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_cluster_idle, 0,
    "Number of slabs to do each time we trigger a rebuild cycle");

#define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_meta_update, 0,
    "When to update the meta data.");

static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");

#define TR_RAID1E_NONE 0
#define TR_RAID1E_REBUILD 1
#define TR_RAID1E_RESYNC 2

#define TR_RAID1E_F_DOING_SOME  0x1
#define TR_RAID1E_F_LOCKED      0x2
#define TR_RAID1E_F_ABORT       0x4

struct g_raid_tr_raid1e_object {
        struct g_raid_tr_object  trso_base;
        int                      trso_starting;
        int                      trso_stopping;
        int                      trso_type;
        int                      trso_recover_slabs; /* slabs before rest */
        int                      trso_fair_io;
        int                      trso_meta_update;
        int                      trso_flags;
        struct g_raid_subdisk   *trso_failed_sd; /* like per volume */
        void                    *trso_buffer;    /* Buffer space */
        off_t                    trso_lock_pos; /* Locked range start. */
        off_t                    trso_lock_len; /* Locked range length. */
        struct bio               trso_bio;
};

static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
static g_raid_tr_event_t g_raid_tr_event_raid1e;
static g_raid_tr_start_t g_raid_tr_start_raid1e;
static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
static g_raid_tr_free_t g_raid_tr_free_raid1e;

static kobj_method_t g_raid_tr_raid1e_methods[] = {
        KOBJMETHOD(g_raid_tr_taste,     g_raid_tr_taste_raid1e),
        KOBJMETHOD(g_raid_tr_event,     g_raid_tr_event_raid1e),
        KOBJMETHOD(g_raid_tr_start,     g_raid_tr_start_raid1e),
        KOBJMETHOD(g_raid_tr_stop,      g_raid_tr_stop_raid1e),
        KOBJMETHOD(g_raid_tr_iostart,   g_raid_tr_iostart_raid1e),
        KOBJMETHOD(g_raid_tr_iodone,    g_raid_tr_iodone_raid1e),
        KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
        KOBJMETHOD(g_raid_tr_locked,    g_raid_tr_locked_raid1e),
        KOBJMETHOD(g_raid_tr_idle,      g_raid_tr_idle_raid1e),
        KOBJMETHOD(g_raid_tr_free,      g_raid_tr_free_raid1e),
        { 0, 0 }
};

static struct g_raid_tr_class g_raid_tr_raid1e_class = {
        "RAID1E",
        g_raid_tr_raid1e_methods,
        sizeof(struct g_raid_tr_raid1e_object),
        .trc_enable = 1,
        .trc_priority = 200,
        .trc_accept_unmapped = 1
};

static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd);
static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask);

static inline void
V2P(struct g_raid_volume *vol, off_t virt,
    int *disk, off_t *offset, off_t *start)
{
        off_t nstrip;
        u_int strip_size;

        strip_size = vol->v_strip_size;
        /* Strip number. */
        nstrip = virt / strip_size;
        /* Start position in strip. */
        *start = virt % strip_size;
        /* Disk number. */
        *disk = (nstrip * N) % vol->v_disks_count;
        /* Strip start position in disk. */
        *offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
}

static inline void
P2V(struct g_raid_volume *vol, int disk, off_t offset,
    off_t *virt, int *copy)
{
        off_t nstrip, start;
        u_int strip_size;

        strip_size = vol->v_strip_size;
        /* Start position in strip. */
        start = offset % strip_size;
        /* Physical strip number. */
        nstrip = (offset / strip_size) * vol->v_disks_count + disk;
        /* Number of physical strip (copy) inside virtual strip. */
        *copy = nstrip % N;
        /* Offset in virtual space. */
        *virt = (nstrip / N) * strip_size + start;
}
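
/*
 * Illustrative example of the address math above (editor's note, values are
 * hypothetical): with N = 2 copies, three disks and a 64KB strip, virtual
 * offset 200704 (virtual strip 3, byte 4096) maps through V2P() to disk 0,
 * physical strip offset 131072, start 4096; the second copy of that strip
 * sits on disk 1 at the same physical offset.  P2V(disk 0, offset 135168)
 * reverses this to virtual offset 200704 with copy 0.
 */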

static int
g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
{
        struct g_raid_tr_raid1e_object *trs;

        trs = (struct g_raid_tr_raid1e_object *)tr;
        if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
            tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
                return (G_RAID_TR_TASTE_FAIL);
        trs->trso_starting = 1;
        return (G_RAID_TR_TASTE_SUCCEED);
}

static int
g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
{
        struct g_raid_softc *sc;
        struct g_raid_subdisk *sd, *bestsd, *worstsd;
        int i, j, state, sstate;

        sc = vol->v_softc;
        state = G_RAID_VOLUME_S_OPTIMAL;
        for (i = 0; i < vol->v_disks_count / N; i++) {
                bestsd = &vol->v_subdisks[i * N];
                for (j = 1; j < N; j++) {
                        sd = &vol->v_subdisks[i * N + j];
                        if (sd->sd_state > bestsd->sd_state)
                                bestsd = sd;
                        else if (sd->sd_state == bestsd->sd_state &&
                            (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
                             sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
                            sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
                                bestsd = sd;
                }
                if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
                    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
                        /* We found a reasonable candidate. */
                        G_RAID_DEBUG1(1, sc,
                            "Promote subdisk %s:%d from %s to ACTIVE.",
                            vol->v_name, bestsd->sd_pos,
                            g_raid_subdisk_state2str(bestsd->sd_state));
                        g_raid_change_subdisk_state(bestsd,
                            G_RAID_SUBDISK_S_ACTIVE);
                        g_raid_write_metadata(sc,
                            vol, bestsd, bestsd->sd_disk);
                }
                worstsd = &vol->v_subdisks[i * N];
                for (j = 1; j < N; j++) {
                        sd = &vol->v_subdisks[i * N + j];
                        if (sd->sd_state < worstsd->sd_state)
                                worstsd = sd;
                }
                if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
                        sstate = G_RAID_VOLUME_S_OPTIMAL;
                else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
                        sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
                else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
                        sstate = G_RAID_VOLUME_S_DEGRADED;
                else
                        sstate = G_RAID_VOLUME_S_BROKEN;
                if (sstate < state)
                        state = sstate;
        }
        return (state);
}

static int
g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
{
        struct g_raid_softc *sc;
        struct g_raid_subdisk *sd, *bestsd, *worstsd;
        int i, j, state, sstate;

        sc = vol->v_softc;
        if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
            vol->v_disks_count)
                return (G_RAID_VOLUME_S_OPTIMAL);
        for (i = 0; i < vol->v_disks_count; i++) {
                sd = &vol->v_subdisks[i];
                if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
                        /* We found a reasonable candidate. */
                        G_RAID_DEBUG1(1, sc,
                            "Promote subdisk %s:%d from %s to STALE.",
                            vol->v_name, sd->sd_pos,
                            g_raid_subdisk_state2str(sd->sd_state));
                        g_raid_change_subdisk_state(sd,
                            G_RAID_SUBDISK_S_STALE);
                        g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
                }
        }
        state = G_RAID_VOLUME_S_OPTIMAL;
        for (i = 0; i < vol->v_disks_count; i++) {
                bestsd = &vol->v_subdisks[i];
                worstsd = &vol->v_subdisks[i];
                for (j = 1; j < N; j++) {
                        sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
                        if (sd->sd_state > bestsd->sd_state)
                                bestsd = sd;
                        else if (sd->sd_state == bestsd->sd_state &&
                            (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
                             sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
                            sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
                                bestsd = sd;
                        if (sd->sd_state < worstsd->sd_state)
                                worstsd = sd;
                }
                if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
                        sstate = G_RAID_VOLUME_S_OPTIMAL;
                else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
                        sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
                else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
                        sstate = G_RAID_VOLUME_S_DEGRADED;
                else
                        sstate = G_RAID_VOLUME_S_BROKEN;
                if (sstate < state)
                        state = sstate;
        }
        return (state);
}

static int
g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
    struct g_raid_subdisk *sd)
{
        struct g_raid_tr_raid1e_object *trs;
        struct g_raid_softc *sc;
        u_int s;

        sc = vol->v_softc;
        trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
        if (trs->trso_stopping &&
            (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
                s = G_RAID_VOLUME_S_STOPPED;
        else if (trs->trso_starting)
                s = G_RAID_VOLUME_S_STARTING;
        else {
                if ((vol->v_disks_count % N) == 0)
                        s = g_raid_tr_update_state_raid1e_even(vol);
                else
                        s = g_raid_tr_update_state_raid1e_odd(vol);
        }
        if (s != vol->v_state) {
                g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
                    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
                    G_RAID_EVENT_VOLUME);
                g_raid_change_volume_state(vol, s);
                if (!trs->trso_starting && !trs->trso_stopping)
                        g_raid_write_metadata(sc, vol, NULL, NULL);
        }
        if (!trs->trso_starting && !trs->trso_stopping)
                g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
        return (0);
}

static void
g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
    struct g_raid_disk *disk)
{
        struct g_raid_volume *vol;

        vol = sd->sd_volume;
        /*
         * We don't fail the last disk in the pack, since it still has decent
         * data on it and that's better than failing the disk if it is the root
         * file system.
         *
         * XXX should this be controlled via a tunable?  It makes sense for
         * the volume that has / on it.  I can't think of a case where we'd
         * want the volume to go away on this kind of event.
         */
        if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
             vol->v_disks_count) &&
            (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
                return;
        g_raid_fail_disk(sc, sd, disk);
}

static void
g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
{
        struct g_raid_volume *vol;
        struct g_raid_subdisk *sd;

        vol = trs->trso_base.tro_volume;
        sd = trs->trso_failed_sd;
        g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
        free(trs->trso_buffer, M_TR_RAID1E);
        trs->trso_buffer = NULL;
        trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
        trs->trso_type = TR_RAID1E_NONE;
        trs->trso_recover_slabs = 0;
        trs->trso_failed_sd = NULL;
        g_raid_tr_update_state_raid1e(vol, NULL);
}

static void
g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
{
        struct g_raid_tr_raid1e_object *trs;
        struct g_raid_subdisk *sd;

        trs = (struct g_raid_tr_raid1e_object *)tr;
        sd = trs->trso_failed_sd;
        G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
            "Subdisk %s:%d-%s rebuild completed.",
            sd->sd_volume->v_name, sd->sd_pos,
            sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
        g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
        sd->sd_rebuild_pos = 0;
        g_raid_tr_raid1e_rebuild_done(trs);
}

static void
g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
{
        struct g_raid_tr_raid1e_object *trs;
        struct g_raid_subdisk *sd;
        struct g_raid_volume *vol;

        vol = tr->tro_volume;
        trs = (struct g_raid_tr_raid1e_object *)tr;
        sd = trs->trso_failed_sd;
        if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
                G_RAID_DEBUG1(1, vol->v_softc,
                    "Subdisk %s:%d-%s rebuild is aborting.",
                    sd->sd_volume->v_name, sd->sd_pos,
                    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
                trs->trso_flags |= TR_RAID1E_F_ABORT;
        } else {
                G_RAID_DEBUG1(0, vol->v_softc,
                    "Subdisk %s:%d-%s rebuild aborted.",
                    sd->sd_volume->v_name, sd->sd_pos,
                    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
                trs->trso_flags &= ~TR_RAID1E_F_ABORT;
                if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
                        trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
                        g_raid_unlock_range(tr->tro_volume,
                            trs->trso_lock_pos, trs->trso_lock_len);
                }
                g_raid_tr_raid1e_rebuild_done(trs);
        }
}

static void
g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
{
        struct g_raid_tr_raid1e_object *trs;
        struct g_raid_softc *sc;
        struct g_raid_volume *vol;
        struct g_raid_subdisk *sd;
        struct bio *bp;
        off_t len, virtual, vend, offset, start;
        int disk, copy, best;

        trs = (struct g_raid_tr_raid1e_object *)tr;
        if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
                return;
        vol = tr->tro_volume;
        sc = vol->v_softc;
        sd = trs->trso_failed_sd;

        while (1) {
                if (sd->sd_rebuild_pos >= sd->sd_size) {
                        g_raid_tr_raid1e_rebuild_finish(tr);
                        return;
                }
                /* Get virtual offset from physical rebuild position. */
                P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
                /* Get physical offset back to get first stripe position. */
                V2P(vol, virtual, &disk, &offset, &start);
                /* Calculate contiguous data length. */
                len = MIN(g_raid1e_rebuild_slab,
                    sd->sd_size - sd->sd_rebuild_pos);
                if ((vol->v_disks_count % N) != 0)
                        len = MIN(len, vol->v_strip_size - start);
                /* Find disk with most accurate data. */
                best = g_raid_tr_raid1e_select_read_disk(vol, disk,
                    offset + start, len, 0);
                if (best < 0) {
                        /* There is no valid disk. */
                        g_raid_tr_raid1e_rebuild_abort(tr);
                        return;
                } else if (best != copy) {
                        /* Some other disk has better data. */
                        break;
                }
                /* We have the most accurate data. Skip the range. */
                G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
                    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
                sd->sd_rebuild_pos += len;
        }

        bp = &trs->trso_bio;
        memset(bp, 0, sizeof(*bp));
        bp->bio_offset = offset + start +
            ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
        bp->bio_length = len;
        bp->bio_data = trs->trso_buffer;
        bp->bio_cmd = BIO_READ;
        bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
        bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
        G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
        /*
         * If we are crossing a stripe boundary, correct the affected virtual
         * range we should lock.
         */
        if (start + len > vol->v_strip_size) {
                P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
                len = vend - virtual;
        }
        trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
        trs->trso_flags |= TR_RAID1E_F_LOCKED;
        trs->trso_lock_pos = virtual;
        trs->trso_lock_len = len;
        /* Lock callback starts I/O */
        g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
}

static void
g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
{
        struct g_raid_volume *vol;
        struct g_raid_tr_raid1e_object *trs;
        struct g_raid_subdisk *sd;

        vol = tr->tro_volume;
        trs = (struct g_raid_tr_raid1e_object *)tr;
        if (trs->trso_failed_sd) {
                G_RAID_DEBUG1(1, vol->v_softc,
                    "Already rebuild in start rebuild. pos %jd\n",
                    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
                return;
        }
        sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
        if (sd == NULL)
                sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
        if (sd == NULL) {
                sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
                if (sd != NULL) {
                        sd->sd_rebuild_pos = 0;
                        g_raid_change_subdisk_state(sd,
                            G_RAID_SUBDISK_S_RESYNC);
                        g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
                } else {
                        sd = g_raid_get_subdisk(vol,
                            G_RAID_SUBDISK_S_UNINITIALIZED);
                        if (sd == NULL)
                                sd = g_raid_get_subdisk(vol,
                                    G_RAID_SUBDISK_S_NEW);
                        if (sd != NULL) {
                                sd->sd_rebuild_pos = 0;
                                g_raid_change_subdisk_state(sd,
                                    G_RAID_SUBDISK_S_REBUILD);
                                g_raid_write_metadata(vol->v_softc,
                                    vol, sd, NULL);
                        }
                }
        }
        if (sd == NULL) {
                G_RAID_DEBUG1(1, vol->v_softc,
                    "No failed disk to rebuild.  night night.");
                return;
        }
        trs->trso_failed_sd = sd;
        G_RAID_DEBUG1(0, vol->v_softc,
            "Subdisk %s:%d-%s rebuild start at %jd.",
            sd->sd_volume->v_name, sd->sd_pos,
            sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
            trs->trso_failed_sd->sd_rebuild_pos);
        trs->trso_type = TR_RAID1E_REBUILD;
        trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
        trs->trso_meta_update = g_raid1e_rebuild_meta_update;
        g_raid_tr_raid1e_rebuild_some(tr);
}

static void
g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd)
{
        struct g_raid_volume *vol;
        struct g_raid_tr_raid1e_object *trs;
        int nr;

        vol = tr->tro_volume;
        trs = (struct g_raid_tr_raid1e_object *)tr;
        if (trs->trso_stopping)
                return;
        nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
            g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
        switch(trs->trso_type) {
        case TR_RAID1E_NONE:
                if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
                        return;
                if (nr == 0) {
                        nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
                            g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
                            g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
                        if (nr == 0)
                                return;
                }
                g_raid_tr_raid1e_rebuild_start(tr);
                break;
        case TR_RAID1E_REBUILD:
                if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
                    trs->trso_failed_sd == sd)
                        g_raid_tr_raid1e_rebuild_abort(tr);
                break;
        case TR_RAID1E_RESYNC:
                break;
        }
}

static int
g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{

        g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
        return (0);
}

static int
g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
{
        struct g_raid_tr_raid1e_object *trs;
        struct g_raid_volume *vol;

        trs = (struct g_raid_tr_raid1e_object *)tr;
        vol = tr->tro_volume;
        trs->trso_starting = 0;
        g_raid_tr_update_state_raid1e(vol, NULL);
        return (0);
}

static int
g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
{
        struct g_raid_tr_raid1e_object *trs;
        struct g_raid_volume *vol;

        trs = (struct g_raid_tr_raid1e_object *)tr;
        vol = tr->tro_volume;
        trs->trso_starting = 0;
        trs->trso_stopping = 1;
        g_raid_tr_update_state_raid1e(vol, NULL);
        return (0);
}
/*
 * Select the disk to read from.  Take into account: subdisk state, running
 * error recovery, average disk load, head position and possible cache hits.
 */
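/*
 * Note (editor's summary of the code below): each candidate copy is given a
 * priority built from its current load, plus (copy index << 24) for copies
 * that are stale or have not yet resynced the requested range, plus
 * (recovery count << 16) for copies busy with error recovery; a head parked
 * exactly at, or within one track of, the requested offset earns a bonus.
 * Copies masked out, still rebuilding past the range, or in other unusable
 * states are skipped.  The lowest priority wins.
 */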
#define ABS(x)          (((x) >= 0) ? (x) : (-(x)))
static int
g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask)
{
        struct g_raid_subdisk *sd;
        off_t offset;
        int i, best, prio, bestprio;

        best = -1;
        bestprio = INT_MAX;
        for (i = 0; i < N; i++) {
                sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
                offset = off;
                if (no + i >= vol->v_disks_count)
                        offset += vol->v_strip_size;

                prio = G_RAID_SUBDISK_LOAD(sd);
                if ((mask & (1 << sd->sd_pos)) != 0)
                        continue;
                switch (sd->sd_state) {
                case G_RAID_SUBDISK_S_ACTIVE:
                        break;
                case G_RAID_SUBDISK_S_RESYNC:
                        if (offset + len < sd->sd_rebuild_pos)
                                break;
                        /* FALLTHROUGH */
                case G_RAID_SUBDISK_S_STALE:
                        prio += i << 24;
                        break;
                case G_RAID_SUBDISK_S_REBUILD:
                        if (offset + len < sd->sd_rebuild_pos)
                                break;
                        /* FALLTHROUGH */
                default:
                        continue;
                }
                prio += min(sd->sd_recovery, 255) << 16;
                /* If disk head is precisely in position - highly prefer it. */
                if (G_RAID_SUBDISK_POS(sd) == offset)
                        prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
                else
                /* If disk head is close to position - prefer it. */
                if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
                    G_RAID_SUBDISK_TRACK_SIZE)
                        prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
                if (prio < bestprio) {
                        bestprio = prio;
                        best = i;
                }
        }
        return (best);
}

static void
g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
{
        struct g_raid_volume *vol;
        struct g_raid_subdisk *sd;
        struct bio_queue_head queue;
        struct bio *cbp;
        char *addr;
        off_t offset, start, length, remain;
        u_int no, strip_size;
        int best;

        vol = tr->tro_volume;
        if ((bp->bio_flags & BIO_UNMAPPED) != 0)
                addr = NULL;
        else
                addr = bp->bio_data;
        strip_size = vol->v_strip_size;
        V2P(vol, bp->bio_offset, &no, &offset, &start);
        remain = bp->bio_length;
        bioq_init(&queue);
        while (remain > 0) {
                length = MIN(strip_size - start, remain);
                best = g_raid_tr_raid1e_select_read_disk(vol,
                    no, offset, length, 0);
                KASSERT(best >= 0, ("No readable disk in volume %s!",
                    vol->v_name));
                no += best;
                if (no >= vol->v_disks_count) {
                        no -= vol->v_disks_count;
                        offset += strip_size;
                }
                cbp = g_clone_bio(bp);
                if (cbp == NULL)
                        goto failure;
                cbp->bio_offset = offset + start;
                cbp->bio_length = length;
                if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
                        cbp->bio_ma_offset += (uintptr_t)addr;
                        cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
                        cbp->bio_ma_offset %= PAGE_SIZE;
                        cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
                            cbp->bio_length) / PAGE_SIZE;
                } else
                        cbp->bio_data = addr;
                cbp->bio_caller1 = &vol->v_subdisks[no];
                bioq_insert_tail(&queue, cbp);
                no += N - best;
                if (no >= vol->v_disks_count) {
                        no -= vol->v_disks_count;
                        offset += strip_size;
                }
                remain -= length;
                addr += length;
                start = 0;
        }
        while ((cbp = bioq_takefirst(&queue)) != NULL) {
                sd = cbp->bio_caller1;
                cbp->bio_caller1 = NULL;
                g_raid_subdisk_iostart(sd, cbp);
        }
        return;
failure:
        while ((cbp = bioq_takefirst(&queue)) != NULL)
                g_destroy_bio(cbp);
        if (bp->bio_error == 0)
                bp->bio_error = ENOMEM;
        g_raid_iodone(bp, bp->bio_error);
}

static void
g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
{
        struct g_raid_volume *vol;
        struct g_raid_subdisk *sd;
        struct bio_queue_head queue;
        struct bio *cbp;
        char *addr;
        off_t offset, start, length, remain;
        u_int no, strip_size;
        int i;

        vol = tr->tro_volume;
        if ((bp->bio_flags & BIO_UNMAPPED) != 0)
                addr = NULL;
        else
                addr = bp->bio_data;
        strip_size = vol->v_strip_size;
        V2P(vol, bp->bio_offset, &no, &offset, &start);
        remain = bp->bio_length;
        bioq_init(&queue);
        while (remain > 0) {
                length = MIN(strip_size - start, remain);
                for (i = 0; i < N; i++) {
                        sd = &vol->v_subdisks[no];
                        switch (sd->sd_state) {
                        case G_RAID_SUBDISK_S_ACTIVE:
                        case G_RAID_SUBDISK_S_STALE:
                        case G_RAID_SUBDISK_S_RESYNC:
                                break;
                        case G_RAID_SUBDISK_S_REBUILD:
                                if (offset + start >= sd->sd_rebuild_pos)
                                        goto nextdisk;
                                break;
                        default:
                                goto nextdisk;
                        }
                        cbp = g_clone_bio(bp);
                        if (cbp == NULL)
                                goto failure;
                        cbp->bio_offset = offset + start;
                        cbp->bio_length = length;
                        if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
                            bp->bio_cmd != BIO_DELETE) {
                                cbp->bio_ma_offset += (uintptr_t)addr;
                                cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
                                cbp->bio_ma_offset %= PAGE_SIZE;
                                cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
                                    cbp->bio_length) / PAGE_SIZE;
                        } else
                                cbp->bio_data = addr;
                        cbp->bio_caller1 = sd;
                        bioq_insert_tail(&queue, cbp);
nextdisk:
                        if (++no >= vol->v_disks_count) {
                                no = 0;
                                offset += strip_size;
                        }
                }
                remain -= length;
                if (bp->bio_cmd != BIO_DELETE)
                        addr += length;
                start = 0;
        }
        while ((cbp = bioq_takefirst(&queue)) != NULL) {
                sd = cbp->bio_caller1;
                cbp->bio_caller1 = NULL;
                g_raid_subdisk_iostart(sd, cbp);
        }
        return;
failure:
        while ((cbp = bioq_takefirst(&queue)) != NULL)
                g_destroy_bio(cbp);
        if (bp->bio_error == 0)
                bp->bio_error = ENOMEM;
        g_raid_iodone(bp, bp->bio_error);
}

static void
g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
{
        struct g_raid_volume *vol;
        struct g_raid_tr_raid1e_object *trs;

        vol = tr->tro_volume;
        trs = (struct g_raid_tr_raid1e_object *)tr;
        if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
            vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
            vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
                g_raid_iodone(bp, EIO);
                return;
        }
        /*
         * If we're rebuilding, squeeze in rebuild activity every so often,
         * even when the disk is busy.  Be sure to only count real I/O
         * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
         * by this module.
         */
        if (trs->trso_failed_sd != NULL &&
            !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
                /* Make this new or running now round short. */
                trs->trso_recover_slabs = 0;
                if (--trs->trso_fair_io <= 0) {
                        trs->trso_fair_io = g_raid1e_rebuild_fair_io;
                        g_raid_tr_raid1e_rebuild_some(tr);
                }
        }
        switch (bp->bio_cmd) {
        case BIO_READ:
                g_raid_tr_iostart_raid1e_read(tr, bp);
                break;
        case BIO_WRITE:
        case BIO_DELETE:
                g_raid_tr_iostart_raid1e_write(tr, bp);
                break;
        case BIO_SPEEDUP:
        case BIO_FLUSH:
                g_raid_tr_flush_common(tr, bp);
                break;
        default:
                KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
                    bp->bio_cmd, vol->v_name));
                break;
        }
}

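/*
 * Note (editor's summary of the recovery bookkeeping below): for regular
 * reads, bio_caller2 carries a bitmask in which bits 0..N-1 mark copies
 * already tried after a read error and bit 31 marks that a recovery write
 * (bad sector remap) is pending once a good copy has been read.
 */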
static void
g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, struct bio *bp)
{
        struct bio *cbp;
        struct g_raid_subdisk *nsd;
        struct g_raid_volume *vol;
        struct bio *pbp;
        struct g_raid_tr_raid1e_object *trs;
        off_t virtual, offset, start;
        uintptr_t mask;
        int error, do_write, copy, disk, best;

        trs = (struct g_raid_tr_raid1e_object *)tr;
        vol = tr->tro_volume;
        if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
                if (trs->trso_type == TR_RAID1E_REBUILD) {
                        nsd = trs->trso_failed_sd;
                        if (bp->bio_cmd == BIO_READ) {
                                /* Immediately abort rebuild, if requested. */
                                if (trs->trso_flags & TR_RAID1E_F_ABORT) {
                                        trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
                                        g_raid_tr_raid1e_rebuild_abort(tr);
                                        return;
                                }

                                /* On read error, skip and cross fingers. */
                                if (bp->bio_error != 0) {
                                        G_RAID_LOGREQ(0, bp,
                                            "Read error during rebuild (%d), "
                                            "possible data loss!",
                                            bp->bio_error);
                                        goto rebuild_round_done;
                                }

                                /*
                                 * The read operation finished, queue the
                                 * write and get out.
                                 */
                                G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
                                    bp->bio_error);
                                bp->bio_cmd = BIO_WRITE;
                                bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
                                bp->bio_offset = nsd->sd_rebuild_pos;
                                G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
                                g_raid_subdisk_iostart(nsd, bp);
                        } else {
                                /*
                                 * The write operation just finished.  Do
                                 * another.  We keep cloning the master bio
                                 * since it has the right buffers allocated to
                                 * it.
                                 */
                                G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
                                    bp->bio_error);
                                if (bp->bio_error != 0 ||
                                    trs->trso_flags & TR_RAID1E_F_ABORT) {
                                        if ((trs->trso_flags &
                                            TR_RAID1E_F_ABORT) == 0) {
                                                g_raid_tr_raid1e_fail_disk(sd->sd_softc,
                                                    nsd, nsd->sd_disk);
                                        }
                                        trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
                                        g_raid_tr_raid1e_rebuild_abort(tr);
                                        return;
                                }
rebuild_round_done:
                                trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
                                g_raid_unlock_range(tr->tro_volume,
                                    trs->trso_lock_pos, trs->trso_lock_len);
                                nsd->sd_rebuild_pos += bp->bio_length;
                                if (nsd->sd_rebuild_pos >= nsd->sd_size) {
                                        g_raid_tr_raid1e_rebuild_finish(tr);
                                        return;
                                }

                                /* Abort rebuild if we are stopping */
                                if (trs->trso_stopping) {
                                        trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
                                        g_raid_tr_raid1e_rebuild_abort(tr);
                                        return;
                                }

                                if (--trs->trso_meta_update <= 0) {
                                        g_raid_write_metadata(vol->v_softc,
                                            vol, nsd, nsd->sd_disk);
                                        trs->trso_meta_update =
                                            g_raid1e_rebuild_meta_update;
                                        /* Compensate short rebuild I/Os. */
                                        if ((vol->v_disks_count % N) != 0 &&
                                            vol->v_strip_size <
                                             g_raid1e_rebuild_slab) {
                                                trs->trso_meta_update *=
                                                    g_raid1e_rebuild_slab;
                                                trs->trso_meta_update /=
                                                    vol->v_strip_size;
                                        }
                                }
                                trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
                                if (--trs->trso_recover_slabs <= 0)
                                        return;
                                /* Run next rebuild iteration. */
                                g_raid_tr_raid1e_rebuild_some(tr);
                        }
                } else if (trs->trso_type == TR_RAID1E_RESYNC) {
                        /*
                         * read good sd, read bad sd in parallel.  when both
                         * done, compare the buffers.  write good to the bad
                         * if different.  do the next bit of work.
                         */
                        panic("Somehow, we think we're doing a resync");
                }
                return;
        }
        pbp = bp->bio_parent;
        pbp->bio_inbed++;
        mask = (intptr_t)bp->bio_caller2;
        if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
                /*
                 * Read failed on first drive.  Retry the read error on
                 * another disk drive, if available, before erroring out the
                 * read.
                 */
                sd->sd_disk->d_read_errs++;
                G_RAID_LOGREQ(0, bp,
                    "Read error (%d), %d read errors total",
                    bp->bio_error, sd->sd_disk->d_read_errs);

                /*
                 * If there are too many read errors, we move to degraded.
                 * XXX Do we want to FAIL the drive (eg, make the user redo
                 * everything to get it back in sync), or just degrade the
                 * drive, which kicks off a resync?
                 */
                do_write = 0;
                if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
                        g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
                else if (mask == 0)
                        do_write = 1;

                /* Restore what we were doing. */
                P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
                V2P(vol, virtual, &disk, &offset, &start);

                /* Find the other disk, and try to do the I/O to it. */
                mask |= 1 << copy;
                best = g_raid_tr_raid1e_select_read_disk(vol,
                    disk, offset, start, mask);
                if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
                        disk += best;
                        if (disk >= vol->v_disks_count) {
                                disk -= vol->v_disks_count;
                                offset += vol->v_strip_size;
                        }
                        cbp->bio_offset = offset + start;
                        cbp->bio_length = bp->bio_length;
                        cbp->bio_data = bp->bio_data;
                        cbp->bio_ma = bp->bio_ma;
                        cbp->bio_ma_offset = bp->bio_ma_offset;
                        cbp->bio_ma_n = bp->bio_ma_n;
                        g_destroy_bio(bp);
                        nsd = &vol->v_subdisks[disk];
                        G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
                            nsd->sd_pos);
                        if (do_write)
                                mask |= 1 << 31;
                        if ((mask & (1U << 31)) != 0)
                                sd->sd_recovery++;
                        cbp->bio_caller2 = (void *)mask;
                        if (do_write) {
                                cbp->bio_caller1 = nsd;
                                /* Lock callback starts I/O */
                                g_raid_lock_range(sd->sd_volume,
                                    virtual, cbp->bio_length, pbp, cbp);
                        } else {
                                g_raid_subdisk_iostart(nsd, cbp);
                        }
                        return;
                }
                /*
                 * We can't retry.  Return the original error by falling
                 * through.  This will happen when there's only one good disk.
                 * We don't need to fail the raid, since its actual state is
                 * based on the state of the subdisks.
                 */
                G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
        }
        if (bp->bio_cmd == BIO_READ &&
            bp->bio_error == 0 &&
            (mask & (1U << 31)) != 0) {
                G_RAID_LOGREQ(3, bp, "Recovered data from other drive");

                /* Restore what we were doing. */
                P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
                V2P(vol, virtual, &disk, &offset, &start);

                /* Find best disk to write. */
                best = g_raid_tr_raid1e_select_read_disk(vol,
                    disk, offset, start, ~mask);
                if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
                        disk += best;
                        if (disk >= vol->v_disks_count) {
                                disk -= vol->v_disks_count;
                                offset += vol->v_strip_size;
                        }
                        cbp->bio_offset = offset + start;
                        cbp->bio_cmd = BIO_WRITE;
                        cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
                        cbp->bio_caller2 = (void *)mask;
                        g_destroy_bio(bp);
                        G_RAID_LOGREQ(2, cbp,
                            "Attempting bad sector remap on failing drive.");
                        g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
                        return;
                }
        }
        if ((mask & (1U << 31)) != 0) {
                /*
                 * We're done with a recovery, mark the range as unlocked.
                 * For any write errors, we aggressively fail the disk since
                 * there was both a READ and a WRITE error at this location.
                 * Both types of errors generally indicate the drive is on
                 * the verge of total failure anyway.  Better to stop trusting
                 * it now.  However, we need to reset error to 0 in that case
                 * because we're not failing the original I/O which succeeded.
                 */

                /* Restore what we were doing. */
                P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
                V2P(vol, virtual, &disk, &offset, &start);

                for (copy = 0; copy < N; copy++) {
                        if ((mask & (1 << copy)) != 0)
                                vol->v_subdisks[(disk + copy) %
                                    vol->v_disks_count].sd_recovery--;
                }

                if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
                        G_RAID_LOGREQ(0, bp, "Remap write failed: "
                            "failing subdisk.");
                        g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
                        bp->bio_error = 0;
                }
                G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
                g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
        }
        if (pbp->bio_cmd != BIO_READ) {
                if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
                        pbp->bio_error = bp->bio_error;
                if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
                        G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
                        g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
                }
                error = pbp->bio_error;
        } else
                error = bp->bio_error;
        g_destroy_bio(bp);
        if (pbp->bio_children == pbp->bio_inbed) {
                pbp->bio_completed = pbp->bio_length;
                g_raid_iodone(pbp, error);
        }
}

static int
g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
{
        struct g_raid_volume *vol;
        struct g_raid_subdisk *sd;
        struct bio_queue_head queue;
        char *addr;
        off_t offset, start, length, remain;
        u_int no, strip_size;
        int i, error;

        vol = tr->tro_volume;
        addr = virtual;
        strip_size = vol->v_strip_size;
        V2P(vol, boffset, &no, &offset, &start);
        remain = blength;
        bioq_init(&queue);
        while (remain > 0) {
                length = MIN(strip_size - start, remain);
                for (i = 0; i < N; i++) {
                        sd = &vol->v_subdisks[no];
                        switch (sd->sd_state) {
                        case G_RAID_SUBDISK_S_ACTIVE:
                        case G_RAID_SUBDISK_S_STALE:
                        case G_RAID_SUBDISK_S_RESYNC:
                                break;
                        case G_RAID_SUBDISK_S_REBUILD:
                                if (offset + start >= sd->sd_rebuild_pos)
                                        goto nextdisk;
                                break;
                        default:
                                goto nextdisk;
                        }
                        error = g_raid_subdisk_kerneldump(sd,
                            addr, 0, offset + start, length);
                        if (error != 0)
                                return (error);
nextdisk:
                        if (++no >= vol->v_disks_count) {
                                no = 0;
                                offset += strip_size;
                        }
                }
                remain -= length;
                addr += length;
                start = 0;
        }
        return (0);
}

static int
g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
{
        struct bio *bp;
        struct g_raid_subdisk *sd;

        bp = (struct bio *)argp;
        sd = (struct g_raid_subdisk *)bp->bio_caller1;
        g_raid_subdisk_iostart(sd, bp);

        return (0);
}

static int
g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
{
        struct g_raid_tr_raid1e_object *trs;
        struct g_raid_volume *vol;

        vol = tr->tro_volume;
        trs = (struct g_raid_tr_raid1e_object *)tr;
        trs->trso_fair_io = g_raid1e_rebuild_fair_io;
        trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
        /* Compensate short rebuild I/Os. */
        if ((vol->v_disks_count % N) != 0 &&
            vol->v_strip_size < g_raid1e_rebuild_slab) {
                trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
                trs->trso_recover_slabs /= vol->v_strip_size;
        }
        if (trs->trso_type == TR_RAID1E_REBUILD)
                g_raid_tr_raid1e_rebuild_some(tr);
        return (0);
}

static int
g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
{
        struct g_raid_tr_raid1e_object *trs;

        trs = (struct g_raid_tr_raid1e_object *)tr;

        if (trs->trso_buffer != NULL) {
                free(trs->trso_buffer, M_TR_RAID1E);
                trs->trso_buffer = NULL;
        }
        return (0);
}

G_RAID_TR_DECLARE(raid1e, "RAID1E");