1 /*-
2  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 #include <sys/param.h>
31 #include <sys/bio.h>
32 #include <sys/endian.h>
33 #include <sys/kernel.h>
34 #include <sys/kobj.h>
35 #include <sys/limits.h>
36 #include <sys/lock.h>
37 #include <sys/malloc.h>
38 #include <sys/mutex.h>
39 #include <sys/sysctl.h>
40 #include <sys/systm.h>
41 #include <geom/geom.h>
42 #include "geom/raid/g_raid.h"
43 #include "g_raid_tr_if.h"
44
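/* Number of copies kept of each strip. */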
45 #define N       2
46
47 SYSCTL_DECL(_kern_geom_raid_raid1e);
48
49 #define RAID1E_REBUILD_SLAB     (1 << 20) /* One transaction in a rebuild */
50 static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
51 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_slab_size",
52     &g_raid1e_rebuild_slab);
53 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RW,
54     &g_raid1e_rebuild_slab, 0,
55     "Amount of the disk to rebuild each read/write cycle of the rebuild.");
56
57 #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
58 static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
59 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_fair_io",
60     &g_raid1e_rebuild_fair_io);
61 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RW,
62     &g_raid1e_rebuild_fair_io, 0,
63     "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
64
65 #define RAID1E_REBUILD_CLUSTER_IDLE 100
66 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
67 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_cluster_idle",
68     &g_raid1e_rebuild_cluster_idle);
69 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW,
70     &g_raid1e_rebuild_cluster_idle, 0,
71     "Number of slabs to do each time we trigger a rebuild cycle");
72
73 #define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
74 static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
75 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_meta_update",
76     &g_raid1e_rebuild_meta_update);
77 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RW,
78     &g_raid1e_rebuild_meta_update, 0,
79     "When to update the meta data.");
80
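/*
 * Illustrative tuning only (example values): the tunables above may be set
 * from loader.conf, e.g. kern.geom.raid.raid1e.rebuild_slab_size="2097152",
 * or adjusted at run time with sysctl(8), e.g.
 * "sysctl kern.geom.raid.raid1e.rebuild_fair_io=10".
 */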
81 static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
82
83 #define TR_RAID1E_NONE 0
84 #define TR_RAID1E_REBUILD 1
85 #define TR_RAID1E_RESYNC 2
86
87 #define TR_RAID1E_F_DOING_SOME  0x1     /* Rebuild I/O is in flight. */
88 #define TR_RAID1E_F_LOCKED      0x2     /* Rebuild holds a range lock. */
89 #define TR_RAID1E_F_ABORT       0x4     /* Abort rebuild at the next chance. */
90
91 struct g_raid_tr_raid1e_object {
92         struct g_raid_tr_object  trso_base;
93         int                      trso_starting;
94         int                      trso_stopping;
95         int                      trso_type;
96         int                      trso_recover_slabs; /* slabs before rest */
97         int                      trso_fair_io;
98         int                      trso_meta_update;
99         int                      trso_flags;
100         struct g_raid_subdisk   *trso_failed_sd; /* like per volume */
101         void                    *trso_buffer;    /* Buffer space */
102         off_t                    trso_lock_pos; /* Locked range start. */
103         off_t                    trso_lock_len; /* Locked range length. */
104         struct bio               trso_bio;
105 };
106
107 static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
108 static g_raid_tr_event_t g_raid_tr_event_raid1e;
109 static g_raid_tr_start_t g_raid_tr_start_raid1e;
110 static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
111 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
112 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
113 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
114 static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
115 static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
116 static g_raid_tr_free_t g_raid_tr_free_raid1e;
117
118 static kobj_method_t g_raid_tr_raid1e_methods[] = {
119         KOBJMETHOD(g_raid_tr_taste,     g_raid_tr_taste_raid1e),
120         KOBJMETHOD(g_raid_tr_event,     g_raid_tr_event_raid1e),
121         KOBJMETHOD(g_raid_tr_start,     g_raid_tr_start_raid1e),
122         KOBJMETHOD(g_raid_tr_stop,      g_raid_tr_stop_raid1e),
123         KOBJMETHOD(g_raid_tr_iostart,   g_raid_tr_iostart_raid1e),
124         KOBJMETHOD(g_raid_tr_iodone,    g_raid_tr_iodone_raid1e),
125         KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
126         KOBJMETHOD(g_raid_tr_locked,    g_raid_tr_locked_raid1e),
127         KOBJMETHOD(g_raid_tr_idle,      g_raid_tr_idle_raid1e),
128         KOBJMETHOD(g_raid_tr_free,      g_raid_tr_free_raid1e),
129         { 0, 0 }
130 };
131
132 static struct g_raid_tr_class g_raid_tr_raid1e_class = {
133         "RAID1E",
134         g_raid_tr_raid1e_methods,
135         sizeof(struct g_raid_tr_raid1e_object),
136         .trc_enable = 1,
137         .trc_priority = 200,
138         .trc_accept_unmapped = 1
139 };
140
141 static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
142 static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
143     struct g_raid_subdisk *sd);
144 static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
145     int no, off_t off, off_t len, u_int mask);
146
147 static inline void
148 V2P(struct g_raid_volume *vol, off_t virt,
149     int *disk, off_t *offset, off_t *start)
150 {
151         off_t nstrip;
152         u_int strip_size;
153
154         strip_size = vol->v_strip_size;
155         /* Strip number. */
156         nstrip = virt / strip_size;
157         /* Start position in strip. */
158         *start = virt % strip_size;
159         /* Disk number. */
160         *disk = (nstrip * N) % vol->v_disks_count;
161         /* Strip start position in disk. */
162         *offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
163 }
164
165 static inline void
166 P2V(struct g_raid_volume *vol, int disk, off_t offset,
167     off_t *virt, int *copy)
168 {
169         off_t nstrip, start;
170         u_int strip_size;
171
172         strip_size = vol->v_strip_size;
173         /* Start position in strip. */
174         start = offset % strip_size;
175         /* Physical strip number. */
176         nstrip = (offset / strip_size) * vol->v_disks_count + disk;
177         /* Number of physical strip (copy) inside virtual strip. */
178         *copy = nstrip % N;
179         /* Offset in virtual space. */
180         *virt = (nstrip / N) * strip_size + start;
181 }
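/*
 * Worked example (illustrative numbers): with 3 disks, a 64 kB strip and
 * N = 2, virtual strip 1 (virtual offset 64 kB) becomes physical strips
 * 2 and 3: the first copy lands on disk 2 at disk offset 0 and the second
 * wraps around to disk 0 at disk offset 64 kB.  P2V inverts the mapping:
 * disk 0 at offset 64 kB is copy 1 of virtual offset 64 kB.
 */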
182
183 static int
184 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
185 {
186         struct g_raid_tr_raid1e_object *trs;
187
188         trs = (struct g_raid_tr_raid1e_object *)tr;
189         if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
190             tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
191                 return (G_RAID_TR_TASTE_FAIL);
192         trs->trso_starting = 1;
193         return (G_RAID_TR_TASTE_SUCCEED);
194 }
195
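/*
 * Volume state when the disk count divides evenly by N: subdisks form fixed
 * groups of N consecutive members holding copies of the same data.  The best
 * member of each group is promoted to ACTIVE if it is at least UNINITIALIZED;
 * the worst member sets the group state, and the volume reports the worst
 * group.
 */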
196 static int
197 g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
198 {
199         struct g_raid_softc *sc;
200         struct g_raid_subdisk *sd, *bestsd, *worstsd;
201         int i, j, state, sstate;
202
203         sc = vol->v_softc;
204         state = G_RAID_VOLUME_S_OPTIMAL;
205         for (i = 0; i < vol->v_disks_count / N; i++) {
206                 bestsd = &vol->v_subdisks[i * N];
207                 for (j = 1; j < N; j++) {
208                         sd = &vol->v_subdisks[i * N + j];
209                         if (sd->sd_state > bestsd->sd_state)
210                                 bestsd = sd;
211                         else if (sd->sd_state == bestsd->sd_state &&
212                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
213                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
214                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
215                                 bestsd = sd;
216                 }
217                 if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
218                     bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
219                         /* We found a reasonable candidate. */
220                         G_RAID_DEBUG1(1, sc,
221                             "Promote subdisk %s:%d from %s to ACTIVE.",
222                             vol->v_name, bestsd->sd_pos,
223                             g_raid_subdisk_state2str(bestsd->sd_state));
224                         g_raid_change_subdisk_state(bestsd,
225                             G_RAID_SUBDISK_S_ACTIVE);
226                         g_raid_write_metadata(sc,
227                             vol, bestsd, bestsd->sd_disk);
228                 }
229                 worstsd = &vol->v_subdisks[i * N];
230                 for (j = 1; j < N; j++) {
231                         sd = &vol->v_subdisks[i * N + j];
232                         if (sd->sd_state < worstsd->sd_state)
233                                 worstsd = sd;
234                 }
235                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
236                         sstate = G_RAID_VOLUME_S_OPTIMAL;
237                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
238                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
239                 else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
240                         sstate = G_RAID_VOLUME_S_DEGRADED;
241                 else
242                         sstate = G_RAID_VOLUME_S_BROKEN;
243                 if (sstate < state)
244                         state = sstate;
245         }
246         return (state);
247 }
248
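/*
 * Volume state when the disk count does not divide evenly by N: copies are
 * interleaved across all disks, so UNINITIALIZED subdisks are first promoted
 * to STALE, then every window of N adjacent subdisks is evaluated like a
 * mirror group and the volume reports the worst window.
 */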
249 static int
250 g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
251 {
252         struct g_raid_softc *sc;
253         struct g_raid_subdisk *sd, *bestsd, *worstsd;
254         int i, j, state, sstate;
255
256         sc = vol->v_softc;
257         if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
258             vol->v_disks_count)
259                 return (G_RAID_VOLUME_S_OPTIMAL);
260         for (i = 0; i < vol->v_disks_count; i++) {
261                 sd = &vol->v_subdisks[i];
262                 if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
263                         /* We found a reasonable candidate. */
264                         G_RAID_DEBUG1(1, sc,
265                             "Promote subdisk %s:%d from %s to STALE.",
266                             vol->v_name, sd->sd_pos,
267                             g_raid_subdisk_state2str(sd->sd_state));
268                         g_raid_change_subdisk_state(sd,
269                             G_RAID_SUBDISK_S_STALE);
270                         g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
271                 }
272         }
273         state = G_RAID_VOLUME_S_OPTIMAL;
274         for (i = 0; i < vol->v_disks_count; i++) {
275                 bestsd = &vol->v_subdisks[i];
276                 worstsd = &vol->v_subdisks[i];
277                 for (j = 1; j < N; j++) {
278                         sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
279                         if (sd->sd_state > bestsd->sd_state)
280                                 bestsd = sd;
281                         else if (sd->sd_state == bestsd->sd_state &&
282                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
283                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
284                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
285                                 bestsd = sd;
286                         if (sd->sd_state < worstsd->sd_state)
287                                 worstsd = sd;
288                 }
289                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
290                         sstate = G_RAID_VOLUME_S_OPTIMAL;
291                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
292                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
293                 else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
294                         sstate = G_RAID_VOLUME_S_DEGRADED;
295                 else
296                         sstate = G_RAID_VOLUME_S_BROKEN;
297                 if (sstate < state)
298                         state = sstate;
299         }
300         return (state);
301 }
302
303 static int
304 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
305     struct g_raid_subdisk *sd)
306 {
307         struct g_raid_tr_raid1e_object *trs;
308         struct g_raid_softc *sc;
309         u_int s;
310
311         sc = vol->v_softc;
312         trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
313         if (trs->trso_stopping &&
314             (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
315                 s = G_RAID_VOLUME_S_STOPPED;
316         else if (trs->trso_starting)
317                 s = G_RAID_VOLUME_S_STARTING;
318         else {
319                 if ((vol->v_disks_count % N) == 0)
320                         s = g_raid_tr_update_state_raid1e_even(vol);
321                 else
322                         s = g_raid_tr_update_state_raid1e_odd(vol);
323         }
324         if (s != vol->v_state) {
325                 g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
326                     G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
327                     G_RAID_EVENT_VOLUME);
328                 g_raid_change_volume_state(vol, s);
329                 if (!trs->trso_starting && !trs->trso_stopping)
330                         g_raid_write_metadata(sc, vol, NULL, NULL);
331         }
332         if (!trs->trso_starting && !trs->trso_stopping)
333                 g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
334         return (0);
335 }
336
337 static void
338 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
339     struct g_raid_disk *disk)
340 {
341         struct g_raid_volume *vol;
342
343         vol = sd->sd_volume;
344         /*
345          * We don't fail the last disk in the pack, since it still has decent
346          * data on it and that's better than failing the disk if it is the root
347          * file system.
348          *
349          * XXX should this be controlled via a tunable?  It makes sense for
350          * the volume that has / on it.  I can't think of a case where we'd
351          * want the volume to go away on this kind of event.
352          */
353         if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
354              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
355              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
356              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
357              vol->v_disks_count) &&
358             (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
359                 return;
360         g_raid_fail_disk(sc, sd, disk);
361 }
362
363 static void
364 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
365 {
366         struct g_raid_volume *vol;
367         struct g_raid_subdisk *sd;
368
369         vol = trs->trso_base.tro_volume;
370         sd = trs->trso_failed_sd;
371         g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
372         free(trs->trso_buffer, M_TR_RAID1E);
373         trs->trso_buffer = NULL;
374         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
375         trs->trso_type = TR_RAID1E_NONE;
376         trs->trso_recover_slabs = 0;
377         trs->trso_failed_sd = NULL;
378         g_raid_tr_update_state_raid1e(vol, NULL);
379 }
380
381 static void
382 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
383 {
384         struct g_raid_tr_raid1e_object *trs;
385         struct g_raid_subdisk *sd;
386
387         trs = (struct g_raid_tr_raid1e_object *)tr;
388         sd = trs->trso_failed_sd;
389         G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
390             "Subdisk %s:%d-%s rebuild completed.",
391             sd->sd_volume->v_name, sd->sd_pos,
392             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
393         g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
394         sd->sd_rebuild_pos = 0;
395         g_raid_tr_raid1e_rebuild_done(trs);
396 }
397
398 static void
399 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
400 {
401         struct g_raid_tr_raid1e_object *trs;
402         struct g_raid_subdisk *sd;
403         struct g_raid_volume *vol;
404
405         vol = tr->tro_volume;
406         trs = (struct g_raid_tr_raid1e_object *)tr;
407         sd = trs->trso_failed_sd;
408         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
409                 G_RAID_DEBUG1(1, vol->v_softc,
410                     "Subdisk %s:%d-%s rebuild is aborting.",
411                     sd->sd_volume->v_name, sd->sd_pos,
412                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
413                 trs->trso_flags |= TR_RAID1E_F_ABORT;
414         } else {
415                 G_RAID_DEBUG1(0, vol->v_softc,
416                     "Subdisk %s:%d-%s rebuild aborted.",
417                     sd->sd_volume->v_name, sd->sd_pos,
418                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
419                 trs->trso_flags &= ~TR_RAID1E_F_ABORT;
420                 if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
421                         trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
422                         g_raid_unlock_range(tr->tro_volume,
423                             trs->trso_lock_pos, trs->trso_lock_len);
424                 }
425                 g_raid_tr_raid1e_rebuild_done(trs);
426         }
427 }
428
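/*
 * Rebuild one slab: translate the failed subdisk's rebuild position into a
 * virtual range, skip ranges where the failed copy is already the most
 * up-to-date one, otherwise lock the range and read it from the best other
 * copy.  The lock callback issues the read; the write back to the failed
 * subdisk is queued from the iodone handler.
 */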
429 static void
430 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
431 {
432         struct g_raid_tr_raid1e_object *trs;
433         struct g_raid_softc *sc;
434         struct g_raid_volume *vol;
435         struct g_raid_subdisk *sd;
436         struct bio *bp;
437         off_t len, virtual, vend, offset, start;
438         int disk, copy, best;
439
440         trs = (struct g_raid_tr_raid1e_object *)tr;
441         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
442                 return;
443         vol = tr->tro_volume;
444         sc = vol->v_softc;
445         sd = trs->trso_failed_sd;
446
447         while (1) {
448                 if (sd->sd_rebuild_pos >= sd->sd_size) {
449                         g_raid_tr_raid1e_rebuild_finish(tr);
450                         return;
451                 }
452                 /* Get virtual offset from physical rebuild position. */
453                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
454                 /* Get physical offset back to get first stripe position. */
455                 V2P(vol, virtual, &disk, &offset, &start);
456                 /* Calculate contiguous data length. */
457                 len = MIN(g_raid1e_rebuild_slab,
458                     sd->sd_size - sd->sd_rebuild_pos);
459                 if ((vol->v_disks_count % N) != 0)
460                         len = MIN(len, vol->v_strip_size - start);
461                 /* Find disk with most accurate data. */
462                 best = g_raid_tr_raid1e_select_read_disk(vol, disk,
463                     offset + start, len, 0);
464                 if (best < 0) {
465                         /* There is no valid disk. */
466                         g_raid_tr_raid1e_rebuild_abort(tr);
467                         return;
468                 } else if (best != copy) {
469                         /* Some other disk has better data. */
470                         break;
471                 }
472                 /* We have the most accurate data. Skip the range. */
473                 G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
474                     sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
475                 sd->sd_rebuild_pos += len;
476         }
477
478         bp = &trs->trso_bio;
479         memset(bp, 0, sizeof(*bp));
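        /* A copy that wraps past the last disk sits one strip further along. */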
480         bp->bio_offset = offset + start +
481             ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
482         bp->bio_length = len;
483         bp->bio_data = trs->trso_buffer;
484         bp->bio_cmd = BIO_READ;
485         bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
486         bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
487         G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
488         /*
489          * If we are crossing stripe boundary, correct affected virtual
490          * range we should lock.
491          */
492         if (start + len > vol->v_strip_size) {
493                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
494                 len = vend - virtual;
495         }
496         trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
497         trs->trso_flags |= TR_RAID1E_F_LOCKED;
498         trs->trso_lock_pos = virtual;
499         trs->trso_lock_len = len;
500         /* Lock callback starts I/O */
501         g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
502 }
503
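/*
 * Pick a subdisk to recover: prefer one already marked RESYNC or REBUILD
 * (resume where we left off), otherwise promote a STALE subdisk to RESYNC or
 * an UNINITIALIZED/NEW one to REBUILD, then allocate the copy buffer and
 * start the first slab.
 */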
504 static void
505 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
506 {
507         struct g_raid_volume *vol;
508         struct g_raid_tr_raid1e_object *trs;
509         struct g_raid_subdisk *sd;
510
511         vol = tr->tro_volume;
512         trs = (struct g_raid_tr_raid1e_object *)tr;
513         if (trs->trso_failed_sd) {
514                 G_RAID_DEBUG1(1, vol->v_softc,
515                     "Already rebuilding in start rebuild. pos %jd\n",
516                     (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
517                 return;
518         }
519         sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
520         if (sd == NULL)
521                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
522         if (sd == NULL) {
523                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
524                 if (sd != NULL) {
525                         sd->sd_rebuild_pos = 0;
526                         g_raid_change_subdisk_state(sd,
527                             G_RAID_SUBDISK_S_RESYNC);
528                         g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
529                 } else {
530                         sd = g_raid_get_subdisk(vol,
531                             G_RAID_SUBDISK_S_UNINITIALIZED);
532                         if (sd == NULL)
533                                 sd = g_raid_get_subdisk(vol,
534                                     G_RAID_SUBDISK_S_NEW);
535                         if (sd != NULL) {
536                                 sd->sd_rebuild_pos = 0;
537                                 g_raid_change_subdisk_state(sd,
538                                     G_RAID_SUBDISK_S_REBUILD);
539                                 g_raid_write_metadata(vol->v_softc,
540                                     vol, sd, NULL);
541                         }
542                 }
543         }
544         if (sd == NULL) {
545                 G_RAID_DEBUG1(1, vol->v_softc,
546                     "No failed disk to rebuild.  night night.");
547                 return;
548         }
549         trs->trso_failed_sd = sd;
550         G_RAID_DEBUG1(0, vol->v_softc,
551             "Subdisk %s:%d-%s rebuild start at %jd.",
552             sd->sd_volume->v_name, sd->sd_pos,
553             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
554             trs->trso_failed_sd->sd_rebuild_pos);
555         trs->trso_type = TR_RAID1E_REBUILD;
556         trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
557         trs->trso_meta_update = g_raid1e_rebuild_meta_update;
558         g_raid_tr_raid1e_rebuild_some(tr);
559 }
560
561 static void
562 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
563     struct g_raid_subdisk *sd)
564 {
565         struct g_raid_volume *vol;
566         struct g_raid_tr_raid1e_object *trs;
567         int nr;
568         
569         vol = tr->tro_volume;
570         trs = (struct g_raid_tr_raid1e_object *)tr;
571         if (trs->trso_stopping)
572                 return;
573         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
574             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
575         switch(trs->trso_type) {
576         case TR_RAID1E_NONE:
577                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
578                         return;
579                 if (nr == 0) {
580                         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
581                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
582                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
583                         if (nr == 0)
584                                 return;
585                 }
586                 g_raid_tr_raid1e_rebuild_start(tr);
587                 break;
588         case TR_RAID1E_REBUILD:
589                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
590                     trs->trso_failed_sd == sd)
591                         g_raid_tr_raid1e_rebuild_abort(tr);
592                 break;
593         case TR_RAID1E_RESYNC:
594                 break;
595         }
596 }
597
598 static int
599 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
600     struct g_raid_subdisk *sd, u_int event)
601 {
602
603         g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
604         return (0);
605 }
606
607 static int
608 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
609 {
610         struct g_raid_tr_raid1e_object *trs;
611         struct g_raid_volume *vol;
612
613         trs = (struct g_raid_tr_raid1e_object *)tr;
614         vol = tr->tro_volume;
615         trs->trso_starting = 0;
616         g_raid_tr_update_state_raid1e(vol, NULL);
617         return (0);
618 }
619
620 static int
621 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
622 {
623         struct g_raid_tr_raid1e_object *trs;
624         struct g_raid_volume *vol;
625
626         trs = (struct g_raid_tr_raid1e_object *)tr;
627         vol = tr->tro_volume;
628         trs->trso_starting = 0;
629         trs->trso_stopping = 1;
630         g_raid_tr_update_state_raid1e(vol, NULL);
631         return (0);
632 }
633
634 /*
635  * Select the disk to read from.  Take into account: subdisk state, running
636  * error recovery, average disk load, head position and possible cache hits.
637  */
638 #define ABS(x)          (((x) >= 0) ? (x) : (-(x)))
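/*
 * Priority is "lower is better": the base is the subdisk's current load,
 * each outstanding recovery adds 1 << 16, STALE copies (and RESYNC copies
 * that have not reached the requested range yet) are pushed back by i << 24,
 * and an exact or near head-position match earns a bonus.
 */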
639 static int
640 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
641     int no, off_t off, off_t len, u_int mask)
642 {
643         struct g_raid_subdisk *sd;
644         off_t offset;
645         int i, best, prio, bestprio;
646
647         best = -1;
648         bestprio = INT_MAX;
649         for (i = 0; i < N; i++) {
650                 sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
651                 offset = off;
652                 if (no + i >= vol->v_disks_count)
653                         offset += vol->v_strip_size;
654
655                 prio = G_RAID_SUBDISK_LOAD(sd);
656                 if ((mask & (1 << sd->sd_pos)) != 0)
657                         continue;
658                 switch (sd->sd_state) {
659                 case G_RAID_SUBDISK_S_ACTIVE:
660                         break;
661                 case G_RAID_SUBDISK_S_RESYNC:
662                         if (offset + off < sd->sd_rebuild_pos)
663                                 break;
664                         /* FALLTHROUGH */
665                 case G_RAID_SUBDISK_S_STALE:
666                         prio += i << 24;
667                         break;
668                 case G_RAID_SUBDISK_S_REBUILD:
669                         if (offset + off < sd->sd_rebuild_pos)
670                                 break;
671                         /* FALLTHROUGH */
672                 default:
673                         continue;
674                 }
675                 prio += min(sd->sd_recovery, 255) << 16;
676                 /* If disk head is precisely in position - highly prefer it. */
677                 if (G_RAID_SUBDISK_POS(sd) == offset)
678                         prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
679                 else
680                 /* If disk head is close to position - prefer it. */
681                 if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
682                     G_RAID_SUBDISK_TRACK_SIZE)
683                         prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
684                 if (prio < bestprio) {
685                         bestprio = prio;
686                         best = i;
687                 }
688         }
689         return (best);
690 }
691
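/*
 * Split the read at strip boundaries, pick the best copy for each piece via
 * g_raid_tr_raid1e_select_read_disk() and clone the parent bio per piece.
 * Unmapped (BIO_UNMAPPED) requests are carried through the bio_ma page array
 * instead of a mapped kernel address.
 */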
692 static void
693 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
694 {
695         struct g_raid_volume *vol;
696         struct g_raid_subdisk *sd;
697         struct bio_queue_head queue;
698         struct bio *cbp;
699         char *addr;
700         off_t offset, start, length, remain;
701         u_int no, strip_size;
702         int best;
703
704         vol = tr->tro_volume;
705         if ((bp->bio_flags & BIO_UNMAPPED) != 0)
706                 addr = NULL;
707         else
708                 addr = bp->bio_data;
709         strip_size = vol->v_strip_size;
710         V2P(vol, bp->bio_offset, &no, &offset, &start);
711         remain = bp->bio_length;
712         bioq_init(&queue);
713         while (remain > 0) {
714                 length = MIN(strip_size - start, remain);
715                 best = g_raid_tr_raid1e_select_read_disk(vol,
716                     no, offset, length, 0);
717                 KASSERT(best >= 0, ("No readable disk in volume %s!",
718                     vol->v_name));
719                 no += best;
720                 if (no >= vol->v_disks_count) {
721                         no -= vol->v_disks_count;
722                         offset += strip_size;
723                 }
724                 cbp = g_clone_bio(bp);
725                 if (cbp == NULL)
726                         goto failure;
727                 cbp->bio_offset = offset + start;
728                 cbp->bio_length = length;
729                 if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
730                         cbp->bio_ma_offset += (uintptr_t)addr;
731                         cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
732                         cbp->bio_ma_offset %= PAGE_SIZE;
733                         cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
734                             cbp->bio_length) / PAGE_SIZE;
735                 } else
736                         cbp->bio_data = addr;
737                 cbp->bio_caller1 = &vol->v_subdisks[no];
738                 bioq_insert_tail(&queue, cbp);
739                 no += N - best;
740                 if (no >= vol->v_disks_count) {
741                         no -= vol->v_disks_count;
742                         offset += strip_size;
743                 }
744                 remain -= length;
745                 addr += length;
746                 start = 0;
747         }
748         while ((cbp = bioq_takefirst(&queue)) != NULL) {
749                 sd = cbp->bio_caller1;
750                 cbp->bio_caller1 = NULL;
751                 g_raid_subdisk_iostart(sd, cbp);
752         }
753         return;
754 failure:
755         while ((cbp = bioq_takefirst(&queue)) != NULL)
756                 g_destroy_bio(cbp);
757         if (bp->bio_error == 0)
758                 bp->bio_error = ENOMEM;
759         g_raid_iodone(bp, bp->bio_error);
760 }
761
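/*
 * Writes and BIO_DELETEs are duplicated to every usable copy of each strip.
 * A copy still being rebuilt only receives writes below its rebuild position,
 * and the data pointer is not advanced for BIO_DELETE since it carries no
 * payload.
 */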
762 static void
763 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
764 {
765         struct g_raid_volume *vol;
766         struct g_raid_subdisk *sd;
767         struct bio_queue_head queue;
768         struct bio *cbp;
769         char *addr;
770         off_t offset, start, length, remain;
771         u_int no, strip_size;
772         int i;
773
774         vol = tr->tro_volume;
775         if ((bp->bio_flags & BIO_UNMAPPED) != 0)
776                 addr = NULL;
777         else
778                 addr = bp->bio_data;
779         strip_size = vol->v_strip_size;
780         V2P(vol, bp->bio_offset, &no, &offset, &start);
781         remain = bp->bio_length;
782         bioq_init(&queue);
783         while (remain > 0) {
784                 length = MIN(strip_size - start, remain);
785                 for (i = 0; i < N; i++) {
786                         sd = &vol->v_subdisks[no];
787                         switch (sd->sd_state) {
788                         case G_RAID_SUBDISK_S_ACTIVE:
789                         case G_RAID_SUBDISK_S_STALE:
790                         case G_RAID_SUBDISK_S_RESYNC:
791                                 break;
792                         case G_RAID_SUBDISK_S_REBUILD:
793                                 if (offset + start >= sd->sd_rebuild_pos)
794                                         goto nextdisk;
795                                 break;
796                         default:
797                                 goto nextdisk;
798                         }
799                         cbp = g_clone_bio(bp);
800                         if (cbp == NULL)
801                                 goto failure;
802                         cbp->bio_offset = offset + start;
803                         cbp->bio_length = length;
804                         if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
805                             bp->bio_cmd != BIO_DELETE) {
806                                 cbp->bio_ma_offset += (uintptr_t)addr;
807                                 cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
808                                 cbp->bio_ma_offset %= PAGE_SIZE;
809                                 cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
810                                     cbp->bio_length) / PAGE_SIZE;
811                         } else
812                                 cbp->bio_data = addr;
813                         cbp->bio_caller1 = sd;
814                         bioq_insert_tail(&queue, cbp);
815 nextdisk:
816                         if (++no >= vol->v_disks_count) {
817                                 no = 0;
818                                 offset += strip_size;
819                         }
820                 }
821                 remain -= length;
822                 if (bp->bio_cmd != BIO_DELETE)
823                         addr += length;
824                 start = 0;
825         }
826         while ((cbp = bioq_takefirst(&queue)) != NULL) {
827                 sd = cbp->bio_caller1;
828                 cbp->bio_caller1 = NULL;
829                 g_raid_subdisk_iostart(sd, cbp);
830         }
831         return;
832 failure:
833         while ((cbp = bioq_takefirst(&queue)) != NULL)
834                 g_destroy_bio(cbp);
835         if (bp->bio_error == 0)
836                 bp->bio_error = ENOMEM;
837         g_raid_iodone(bp, bp->bio_error);
838 }
839
840 static void
841 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
842 {
843         struct g_raid_volume *vol;
844         struct g_raid_tr_raid1e_object *trs;
845
846         vol = tr->tro_volume;
847         trs = (struct g_raid_tr_raid1e_object *)tr;
848         if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
849             vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
850             vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
851                 g_raid_iodone(bp, EIO);
852                 return;
853         }
854         /*
855          * If we're rebuilding, squeeze in rebuild activity every so often,
856          * even when the disk is busy.  Be sure to only count real I/O
857          * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
858          * by this module.
859          */
860         if (trs->trso_failed_sd != NULL &&
861             !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
862                 /* Make this new or currently running round short. */
863                 trs->trso_recover_slabs = 0;
864                 if (--trs->trso_fair_io <= 0) {
865                         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
866                         g_raid_tr_raid1e_rebuild_some(tr);
867                 }
868         }
869         switch (bp->bio_cmd) {
870         case BIO_READ:
871                 g_raid_tr_iostart_raid1e_read(tr, bp);
872                 break;
873         case BIO_WRITE:
874         case BIO_DELETE:
875                 g_raid_tr_iostart_raid1e_write(tr, bp);
876                 break;
877         case BIO_FLUSH:
878                 g_raid_tr_flush_common(tr, bp);
879                 break;
880         default:
881                 KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
882                     bp->bio_cmd, vol->v_name));
883                 break;
884         }
885 }
886
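/*
 * Completion handler.  SYNC-flagged bios belong to the rebuild: a finished
 * read is turned into a write to the failed subdisk, and a finished write
 * advances the rebuild position, periodically updates metadata and schedules
 * the next slab.  For regular reads that fail, the request is retried on
 * another copy and, if the recovery read succeeds, the data may be written
 * back to the failing copy to remap the bad sector.
 */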
887 static void
888 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
889     struct g_raid_subdisk *sd, struct bio *bp)
890 {
891         struct bio *cbp;
892         struct g_raid_subdisk *nsd;
893         struct g_raid_volume *vol;
894         struct bio *pbp;
895         struct g_raid_tr_raid1e_object *trs;
896         off_t virtual, offset, start;
897         uintptr_t mask;
898         int error, do_write, copy, disk, best;
899
900         trs = (struct g_raid_tr_raid1e_object *)tr;
901         vol = tr->tro_volume;
902         if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
903                 if (trs->trso_type == TR_RAID1E_REBUILD) {
904                         nsd = trs->trso_failed_sd;
905                         if (bp->bio_cmd == BIO_READ) {
906
907                                 /* Immediately abort rebuild, if requested. */
908                                 if (trs->trso_flags & TR_RAID1E_F_ABORT) {
909                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
910                                         g_raid_tr_raid1e_rebuild_abort(tr);
911                                         return;
912                                 }
913
914                                 /* On read error, skip and cross fingers. */
915                                 if (bp->bio_error != 0) {
916                                         G_RAID_LOGREQ(0, bp,
917                                             "Read error during rebuild (%d), "
918                                             "possible data loss!",
919                                             bp->bio_error);
920                                         goto rebuild_round_done;
921                                 }
922
923                                 /*
924                                  * The read operation finished, queue the
925                                  * write and get out.
926                                  */
927                                 G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
928                                     bp->bio_error);
929                                 bp->bio_cmd = BIO_WRITE;
930                                 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
931                                 bp->bio_offset = nsd->sd_rebuild_pos;
932                                 G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
933                                 g_raid_subdisk_iostart(nsd, bp);
934                         } else {
935                                 /*
936                                  * The write operation just finished.  Do
937                                  * another.  We keep cloning the master bio
938                                  * since it has the right buffers allocated to
939                                  * it.
940                                  */
941                                 G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
942                                     bp->bio_error);
943                                 if (bp->bio_error != 0 ||
944                                     trs->trso_flags & TR_RAID1E_F_ABORT) {
945                                         if ((trs->trso_flags &
946                                             TR_RAID1E_F_ABORT) == 0) {
947                                                 g_raid_tr_raid1e_fail_disk(sd->sd_softc,
948                                                     nsd, nsd->sd_disk);
949                                         }
950                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
951                                         g_raid_tr_raid1e_rebuild_abort(tr);
952                                         return;
953                                 }
954 rebuild_round_done:
955                                 trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
956                                 g_raid_unlock_range(tr->tro_volume,
957                                     trs->trso_lock_pos, trs->trso_lock_len);
958                                 nsd->sd_rebuild_pos += bp->bio_length;
959                                 if (nsd->sd_rebuild_pos >= nsd->sd_size) {
960                                         g_raid_tr_raid1e_rebuild_finish(tr);
961                                         return;
962                                 }
963
964                                 /* Abort rebuild if we are stopping */
965                                 if (trs->trso_stopping) {
966                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
967                                         g_raid_tr_raid1e_rebuild_abort(tr);
968                                         return;
969                                 }
970
971                                 if (--trs->trso_meta_update <= 0) {
972                                         g_raid_write_metadata(vol->v_softc,
973                                             vol, nsd, nsd->sd_disk);
974                                         trs->trso_meta_update =
975                                             g_raid1e_rebuild_meta_update;
976                                         /* Compensate short rebuild I/Os. */
977                                         if ((vol->v_disks_count % N) != 0 &&
978                                             vol->v_strip_size <
979                                              g_raid1e_rebuild_slab) {
980                                                 trs->trso_meta_update *=
981                                                     g_raid1e_rebuild_slab;
982                                                 trs->trso_meta_update /=
983                                                     vol->v_strip_size;
984                                         }
985                                 }
986                                 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
987                                 if (--trs->trso_recover_slabs <= 0)
988                                         return;
989                                 /* Run next rebuild iteration. */
990                                 g_raid_tr_raid1e_rebuild_some(tr);
991                         }
992                 } else if (trs->trso_type == TR_RAID1E_RESYNC) {
993                         /*
994                          * read good sd, read bad sd in parallel.  when both
995                          * done, compare the buffers.  write good to the bad
996                          * if different.  do the next bit of work.
997                          */
998                         panic("Somehow, we think we're doing a resync");
999                 }
1000                 return;
1001         }
1002         pbp = bp->bio_parent;
1003         pbp->bio_inbed++;
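        /*
         * bio_caller2 carries recovery state across retries: the low bits
         * record copies already tried, bit 31 marks a pending write-back to
         * the failing copy.
         */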
1004         mask = (intptr_t)bp->bio_caller2;
1005         if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
1006                 /*
1007                  * Read failed on first drive.  Retry the read error on
1008                  * another disk drive, if available, before erroring out the
1009                  * read.
1010                  */
1011                 sd->sd_disk->d_read_errs++;
1012                 G_RAID_LOGREQ(0, bp,
1013                     "Read error (%d), %d read errors total",
1014                     bp->bio_error, sd->sd_disk->d_read_errs);
1015
1016                 /*
1017                  * If there are too many read errors, we move to degraded.
1018                  * XXX Do we want to FAIL the drive (eg, make the user redo
1019                  * everything to get it back in sync), or just degrade the
1020                  * drive, which kicks off a resync?
1021                  */
1022                 do_write = 0;
1023                 if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
1024                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1025                 else if (mask == 0)
1026                         do_write = 1;
1027
1028                 /* Restore what we were doing. */
1029                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1030                 V2P(vol, virtual, &disk, &offset, &start);
1031
1032                 /* Find the other disk, and try to do the I/O to it. */
1033                 mask |= 1 << copy;
1034                 best = g_raid_tr_raid1e_select_read_disk(vol,
1035                     disk, offset, start, mask);
1036                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1037                         disk += best;
1038                         if (disk >= vol->v_disks_count) {
1039                                 disk -= vol->v_disks_count;
1040                                 offset += vol->v_strip_size;
1041                         }
1042                         cbp->bio_offset = offset + start;
1043                         cbp->bio_length = bp->bio_length;
1044                         cbp->bio_data = bp->bio_data;
1045                         cbp->bio_ma = bp->bio_ma;
1046                         cbp->bio_ma_offset = bp->bio_ma_offset;
1047                         cbp->bio_ma_n = bp->bio_ma_n;
1048                         g_destroy_bio(bp);
1049                         nsd = &vol->v_subdisks[disk];
1050                         G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
1051                             nsd->sd_pos);
1052                         if (do_write)
1053                                 mask |= 1 << 31;
1054                         if ((mask & (1U << 31)) != 0)
1055                                 sd->sd_recovery++;
1056                         cbp->bio_caller2 = (void *)mask;
1057                         if (do_write) {
1058                                 cbp->bio_caller1 = nsd;
1059                                 /* Lock callback starts I/O */
1060                                 g_raid_lock_range(sd->sd_volume,
1061                                     virtual, cbp->bio_length, pbp, cbp);
1062                         } else {
1063                                 g_raid_subdisk_iostart(nsd, cbp);
1064                         }
1065                         return;
1066                 }
1067                 /*
1068                  * We can't retry.  Return the original error by falling
1069                  * through.  This will happen when there's only one good disk.
1070                  * We don't need to fail the raid, since its actual state is
1071                  * based on the state of the subdisks.
1072                  */
1073                 G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
1074         }
1075         if (bp->bio_cmd == BIO_READ &&
1076             bp->bio_error == 0 &&
1077             (mask & (1U << 31)) != 0) {
1078                 G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
1079
1080                 /* Restore what we were doing. */
1081                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1082                 V2P(vol, virtual, &disk, &offset, &start);
1083
1084                 /* Find best disk to write. */
1085                 best = g_raid_tr_raid1e_select_read_disk(vol,
1086                     disk, offset, start, ~mask);
1087                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1088                         disk += best;
1089                         if (disk >= vol->v_disks_count) {
1090                                 disk -= vol->v_disks_count;
1091                                 offset += vol->v_strip_size;
1092                         }
1093                         cbp->bio_offset = offset + start;
1094                         cbp->bio_cmd = BIO_WRITE;
1095                         cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
1096                         cbp->bio_caller2 = (void *)mask;
1097                         g_destroy_bio(bp);
1098                         G_RAID_LOGREQ(2, cbp,
1099                             "Attempting bad sector remap on failing drive.");
1100                         g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
1101                         return;
1102                 }
1103         }
1104         if ((mask & (1U << 31)) != 0) {
1105                 /*
1106                  * We're done with a recovery, mark the range as unlocked.
1107                  * For any write errors, we aggressively fail the disk since
1108                  * there was both a READ and a WRITE error at this location.
1109                  * Both types of errors generally indicate the drive is on
1110                  * the verge of total failure anyway.  Better to stop trusting
1111                  * it now.  However, we need to reset error to 0 in that case
1112                  * because we're not failing the original I/O which succeeded.
1113                  */
1114
1115                 /* Restore what we were doing. */
1116                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1117                 V2P(vol, virtual, &disk, &offset, &start);
1118
1119                 for (copy = 0; copy < N; copy++) {
1120                         if ((mask & (1 << copy)) != 0)
1121                                 vol->v_subdisks[(disk + copy) %
1122                                     vol->v_disks_count].sd_recovery--;
1123                 }
1124
1125                 if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
1126                         G_RAID_LOGREQ(0, bp, "Remap write failed: "
1127                             "failing subdisk.");
1128                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1129                         bp->bio_error = 0;
1130                 }
1131                 G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
1132                 g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
1133         }
1134         if (pbp->bio_cmd != BIO_READ) {
1135                 if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
1136                         pbp->bio_error = bp->bio_error;
1137                 if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
1138                         G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
1139                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1140                 }
1141                 error = pbp->bio_error;
1142         } else
1143                 error = bp->bio_error;
1144         g_destroy_bio(bp);
1145         if (pbp->bio_children == pbp->bio_inbed) {
1146                 pbp->bio_completed = pbp->bio_length;
1147                 g_raid_iodone(pbp, error);
1148         }
1149 }
1150
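/*
 * Kernel crash-dump path: write the dump chunk to every usable copy, using
 * the same strip walk and subdisk eligibility rules as the write path.
 */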
1151 static int
1152 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
1153     void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
1154 {
1155         struct g_raid_volume *vol;
1156         struct g_raid_subdisk *sd;
1157         struct bio_queue_head queue;
1158         char *addr;
1159         off_t offset, start, length, remain;
1160         u_int no, strip_size;
1161         int i, error;
1162
1163         vol = tr->tro_volume;
1164         addr = virtual;
1165         strip_size = vol->v_strip_size;
1166         V2P(vol, boffset, &no, &offset, &start);
1167         remain = blength;
1168         bioq_init(&queue);
1169         while (remain > 0) {
1170                 length = MIN(strip_size - start, remain);
1171                 for (i = 0; i < N; i++) {
1172                         sd = &vol->v_subdisks[no];
1173                         switch (sd->sd_state) {
1174                         case G_RAID_SUBDISK_S_ACTIVE:
1175                         case G_RAID_SUBDISK_S_STALE:
1176                         case G_RAID_SUBDISK_S_RESYNC:
1177                                 break;
1178                         case G_RAID_SUBDISK_S_REBUILD:
1179                                 if (offset + start >= sd->sd_rebuild_pos)
1180                                         goto nextdisk;
1181                                 break;
1182                         default:
1183                                 goto nextdisk;
1184                         }
1185                         error = g_raid_subdisk_kerneldump(sd,
1186                             addr, 0, offset + start, length);
1187                         if (error != 0)
1188                                 return (error);
1189 nextdisk:
1190                         if (++no >= vol->v_disks_count) {
1191                                 no = 0;
1192                                 offset += strip_size;
1193                         }
1194                 }
1195                 remain -= length;
1196                 addr += length;
1197                 start = 0;
1198         }
1199         return (0);
1200 }
1201
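/*
 * Range-lock grant callback: start the deferred bio (a rebuild read or a
 * recovery write) on the subdisk stashed in bio_caller1.
 */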
1202 static int
1203 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
1204 {
1205         struct bio *bp;
1206         struct g_raid_subdisk *sd;
1207
1208         bp = (struct bio *)argp;
1209         sd = (struct g_raid_subdisk *)bp->bio_caller1;
1210         g_raid_subdisk_iostart(sd, bp);
1211
1212         return (0);
1213 }
1214
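/*
 * Idle callback: with no client I/O pending, refresh the fairness counter,
 * allow a full cluster of rebuild slabs before the next rest, and continue
 * the rebuild if one is in progress.
 */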
1215 static int
1216 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
1217 {
1218         struct g_raid_tr_raid1e_object *trs;
1219         struct g_raid_volume *vol;
1220
1221         vol = tr->tro_volume;
1222         trs = (struct g_raid_tr_raid1e_object *)tr;
1223         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
1224         trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
1225         /* Compensate short rebuild I/Os. */
1226         if ((vol->v_disks_count % N) != 0 &&
1227             vol->v_strip_size < g_raid1e_rebuild_slab) {
1228                 trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
1229                 trs->trso_recover_slabs /= vol->v_strip_size;
1230         }
1231         if (trs->trso_type == TR_RAID1E_REBUILD)
1232                 g_raid_tr_raid1e_rebuild_some(tr);
1233         return (0);
1234 }
1235
1236 static int
1237 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
1238 {
1239         struct g_raid_tr_raid1e_object *trs;
1240
1241         trs = (struct g_raid_tr_raid1e_object *)tr;
1242
1243         if (trs->trso_buffer != NULL) {
1244                 free(trs->trso_buffer, M_TR_RAID1E);
1245                 trs->trso_buffer = NULL;
1246         }
1247         return (0);
1248 }
1249
1250 G_RAID_TR_DECLARE(raid1e, "RAID1E");