1 /*-
2  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 #include <sys/param.h>
31 #include <sys/bio.h>
32 #include <sys/endian.h>
33 #include <sys/kernel.h>
34 #include <sys/kobj.h>
35 #include <sys/limits.h>
36 #include <sys/lock.h>
37 #include <sys/malloc.h>
38 #include <sys/mutex.h>
39 #include <sys/sysctl.h>
40 #include <sys/systm.h>
41 #include <geom/geom.h>
42 #include "geom/raid/g_raid.h"
43 #include "g_raid_tr_if.h"
44
45 #define N       2
46
47 SYSCTL_DECL(_kern_geom_raid_raid1e);
48
49 #define RAID1E_REBUILD_SLAB     (1 << 20) /* One transaction in a rebuild */
50 static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
51 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
52     &g_raid1e_rebuild_slab, 0,
53     "Amount of the disk to rebuild each read/write cycle of the rebuild.");
54
55 #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
56 static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
57 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
58     &g_raid1e_rebuild_fair_io, 0,
59     "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
60
61 #define RAID1E_REBUILD_CLUSTER_IDLE 100
62 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
63 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
64     &g_raid1e_rebuild_cluster_idle, 0,
65     "Number of slabs to do each time we trigger a rebuild cycle");
66
67 #define RAID1E_REBUILD_META_UPDATE 1024 /* update metadata every 1GB or so */
68 static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
69 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
70     &g_raid1e_rebuild_meta_update, 0,
71     "When to update the meta data.");
72
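/*
 * All of the above are exported with CTLFLAG_RWTUN, so they can be preset as
 * loader tunables or adjusted at runtime with sysctl(8), e.g. (illustrative
 * values only, not defaults from this file):
 *
 *	# sysctl kern.geom.raid.raid1e.rebuild_fair_io=10
 *	# sysctl kern.geom.raid.raid1e.rebuild_slab_size=2097152
 */
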
73 static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
74
75 #define TR_RAID1E_NONE 0
76 #define TR_RAID1E_REBUILD 1
77 #define TR_RAID1E_RESYNC 2
78
79 #define TR_RAID1E_F_DOING_SOME  0x1
80 #define TR_RAID1E_F_LOCKED      0x2
81 #define TR_RAID1E_F_ABORT       0x4
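/*
 * Rebuild/resync bookkeeping flags, as used below: DOING_SOME means a rebuild
 * I/O issued by this module is currently in flight, LOCKED means the rebuild
 * holds a range lock described by trso_lock_pos/trso_lock_len, and ABORT
 * requests that the current rebuild round stop once the in-flight I/O
 * completes.
 */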
82
83 struct g_raid_tr_raid1e_object {
84         struct g_raid_tr_object  trso_base;
85         int                      trso_starting;
86         int                      trso_stopping;
87         int                      trso_type;
88         int                      trso_recover_slabs; /* slabs before rest */
89         int                      trso_fair_io;
90         int                      trso_meta_update;
91         int                      trso_flags;
92         struct g_raid_subdisk   *trso_failed_sd; /* like per volume */
93         void                    *trso_buffer;    /* Buffer space */
94         off_t                    trso_lock_pos; /* Locked range start. */
95         off_t                    trso_lock_len; /* Locked range length. */
96         struct bio               trso_bio;
97 };
98
99 static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
100 static g_raid_tr_event_t g_raid_tr_event_raid1e;
101 static g_raid_tr_start_t g_raid_tr_start_raid1e;
102 static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
103 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
104 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
105 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
106 static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
107 static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
108 static g_raid_tr_free_t g_raid_tr_free_raid1e;
109
110 static kobj_method_t g_raid_tr_raid1e_methods[] = {
111         KOBJMETHOD(g_raid_tr_taste,     g_raid_tr_taste_raid1e),
112         KOBJMETHOD(g_raid_tr_event,     g_raid_tr_event_raid1e),
113         KOBJMETHOD(g_raid_tr_start,     g_raid_tr_start_raid1e),
114         KOBJMETHOD(g_raid_tr_stop,      g_raid_tr_stop_raid1e),
115         KOBJMETHOD(g_raid_tr_iostart,   g_raid_tr_iostart_raid1e),
116         KOBJMETHOD(g_raid_tr_iodone,    g_raid_tr_iodone_raid1e),
117         KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
118         KOBJMETHOD(g_raid_tr_locked,    g_raid_tr_locked_raid1e),
119         KOBJMETHOD(g_raid_tr_idle,      g_raid_tr_idle_raid1e),
120         KOBJMETHOD(g_raid_tr_free,      g_raid_tr_free_raid1e),
121         { 0, 0 }
122 };
123
124 static struct g_raid_tr_class g_raid_tr_raid1e_class = {
125         "RAID1E",
126         g_raid_tr_raid1e_methods,
127         sizeof(struct g_raid_tr_raid1e_object),
128         .trc_enable = 1,
129         .trc_priority = 200,
130         .trc_accept_unmapped = 1
131 };
132
133 static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
134 static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
135     struct g_raid_subdisk *sd);
136 static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
137     int no, off_t off, off_t len, u_int mask);
138
139 static inline void
140 V2P(struct g_raid_volume *vol, off_t virt,
141     int *disk, off_t *offset, off_t *start)
142 {
143         off_t nstrip;
144         u_int strip_size;
145
146         strip_size = vol->v_strip_size;
147         /* Strip number. */
148         nstrip = virt / strip_size;
149         /* Start position in strip. */
150         *start = virt % strip_size;
151         /* Disk number. */
152         *disk = (nstrip * N) % vol->v_disks_count;
153         /* Strip start position in disk. */
154         *offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
155 }
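/*
 * For illustration only (the 3-disk volume and 64 KiB strip below are example
 * values, not anything defined in this file): with v_disks_count = 3 and
 * v_strip_size = 65536, the virtual offset 200704 = 3 * 65536 + 4096 gives
 * nstrip = 3 and start = 4096, so disk = (3 * 2) % 3 = 0 and
 * offset = ((3 * 2) / 3) * 65536 = 131072.  The first copy of that byte
 * therefore lives on disk 0 at 131072 + 4096; the second copy is the next
 * physical strip, on disk 1 at the same offset.
 */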
156
157 static inline void
158 P2V(struct g_raid_volume *vol, int disk, off_t offset,
159     off_t *virt, int *copy)
160 {
161         off_t nstrip, start;
162         u_int strip_size;
163
164         strip_size = vol->v_strip_size;
165         /* Start position in strip. */
166         start = offset % strip_size;
167         /* Physical strip number. */
168         nstrip = (offset / strip_size) * vol->v_disks_count + disk;
169         /* Number of physical strip (copy) inside virtual strip. */
170         *copy = nstrip % N;
171         /* Offset in virtual space. */
172         *virt = (nstrip / N) * strip_size + start;
173 }
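/*
 * Continuing the illustrative example above, P2V is the inverse mapping: for
 * disk 1 at physical offset 135168 = 2 * 65536 + 4096, start = 4096 and
 * nstrip = 2 * 3 + 1 = 7, so copy = 7 % 2 = 1 and
 * virt = (7 / 2) * 65536 + 4096 = 200704, matching the V2P example.
 */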
174
175 static int
176 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
177 {
178         struct g_raid_tr_raid1e_object *trs;
179
180         trs = (struct g_raid_tr_raid1e_object *)tr;
181         if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
182             tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
183                 return (G_RAID_TR_TASTE_FAIL);
184         trs->trso_starting = 1;
185         return (G_RAID_TR_TASTE_SUCCEED);
186 }
187
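/*
 * With an even number of disks every virtual strip and its copies always fall
 * on the same fixed group of N consecutive subdisks, so the volume state is
 * computed per group: promote the best usable subdisk of each group to ACTIVE
 * if needed, derive a state from the group's best/worst members, and reduce
 * the volume to the worst group state.
 */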
188 static int
189 g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
190 {
191         struct g_raid_softc *sc;
192         struct g_raid_subdisk *sd, *bestsd, *worstsd;
193         int i, j, state, sstate;
194
195         sc = vol->v_softc;
196         state = G_RAID_VOLUME_S_OPTIMAL;
197         for (i = 0; i < vol->v_disks_count / N; i++) {
198                 bestsd = &vol->v_subdisks[i * N];
199                 for (j = 1; j < N; j++) {
200                         sd = &vol->v_subdisks[i * N + j];
201                         if (sd->sd_state > bestsd->sd_state)
202                                 bestsd = sd;
203                         else if (sd->sd_state == bestsd->sd_state &&
204                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
205                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
206                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
207                                 bestsd = sd;
208                 }
209                 if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
210                     bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
211                         /* We found a reasonable candidate. */
212                         G_RAID_DEBUG1(1, sc,
213                             "Promote subdisk %s:%d from %s to ACTIVE.",
214                             vol->v_name, bestsd->sd_pos,
215                             g_raid_subdisk_state2str(bestsd->sd_state));
216                         g_raid_change_subdisk_state(bestsd,
217                             G_RAID_SUBDISK_S_ACTIVE);
218                         g_raid_write_metadata(sc,
219                             vol, bestsd, bestsd->sd_disk);
220                 }
221                 worstsd = &vol->v_subdisks[i * N];
222                 for (j = 1; j < N; j++) {
223                         sd = &vol->v_subdisks[i * N + j];
224                         if (sd->sd_state < worstsd->sd_state)
225                                 worstsd = sd;
226                 }
227                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
228                         sstate = G_RAID_VOLUME_S_OPTIMAL;
229                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
230                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
231                 else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
232                         sstate = G_RAID_VOLUME_S_DEGRADED;
233                 else
234                         sstate = G_RAID_VOLUME_S_BROKEN;
235                 if (sstate < state)
236                         state = sstate;
237         }
238         return (state);
239 }
240
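/*
 * With an odd number of disks the copies rotate across all subdisks, so there
 * are no fixed mirror groups.  UNINITIALIZED subdisks are first promoted to
 * STALE, and the state is then derived from the best/worst subdisk in every
 * window of N consecutive subdisks (wrapping around), again reducing the
 * volume to the worst case.
 */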
241 static int
242 g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
243 {
244         struct g_raid_softc *sc;
245         struct g_raid_subdisk *sd, *bestsd, *worstsd;
246         int i, j, state, sstate;
247
248         sc = vol->v_softc;
249         if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
250             vol->v_disks_count)
251                 return (G_RAID_VOLUME_S_OPTIMAL);
252         for (i = 0; i < vol->v_disks_count; i++) {
253                 sd = &vol->v_subdisks[i];
254                 if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
255                         /* We found a reasonable candidate. */
256                         G_RAID_DEBUG1(1, sc,
257                             "Promote subdisk %s:%d from %s to STALE.",
258                             vol->v_name, sd->sd_pos,
259                             g_raid_subdisk_state2str(sd->sd_state));
260                         g_raid_change_subdisk_state(sd,
261                             G_RAID_SUBDISK_S_STALE);
262                         g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
263                 }
264         }
265         state = G_RAID_VOLUME_S_OPTIMAL;
266         for (i = 0; i < vol->v_disks_count; i++) {
267                 bestsd = &vol->v_subdisks[i];
268                 worstsd = &vol->v_subdisks[i];
269                 for (j = 1; j < N; j++) {
270                         sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
271                         if (sd->sd_state > bestsd->sd_state)
272                                 bestsd = sd;
273                         else if (sd->sd_state == bestsd->sd_state &&
274                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
275                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
276                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
277                                 bestsd = sd;
278                         if (sd->sd_state < worstsd->sd_state)
279                                 worstsd = sd;
280                 }
281                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
282                         sstate = G_RAID_VOLUME_S_OPTIMAL;
283                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
284                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
285                 else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
286                         sstate = G_RAID_VOLUME_S_DEGRADED;
287                 else
288                         sstate = G_RAID_VOLUME_S_BROKEN;
289                 if (sstate < state)
290                         state = sstate;
291         }
292         return (state);
293 }
294
295 static int
296 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
297     struct g_raid_subdisk *sd)
298 {
299         struct g_raid_tr_raid1e_object *trs;
300         struct g_raid_softc *sc;
301         u_int s;
302
303         sc = vol->v_softc;
304         trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
305         if (trs->trso_stopping &&
306             (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
307                 s = G_RAID_VOLUME_S_STOPPED;
308         else if (trs->trso_starting)
309                 s = G_RAID_VOLUME_S_STARTING;
310         else {
311                 if ((vol->v_disks_count % N) == 0)
312                         s = g_raid_tr_update_state_raid1e_even(vol);
313                 else
314                         s = g_raid_tr_update_state_raid1e_odd(vol);
315         }
316         if (s != vol->v_state) {
317                 g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
318                     G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
319                     G_RAID_EVENT_VOLUME);
320                 g_raid_change_volume_state(vol, s);
321                 if (!trs->trso_starting && !trs->trso_stopping)
322                         g_raid_write_metadata(sc, vol, NULL, NULL);
323         }
324         if (!trs->trso_starting && !trs->trso_stopping)
325                 g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
326         return (0);
327 }
328
329 static void
330 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
331     struct g_raid_disk *disk)
332 {
333         struct g_raid_volume *vol;
334
335         vol = sd->sd_volume;
336         /*
337          * We don't fail the last disk in the pack, since it still has decent
338          * data on it and that's better than failing the disk if it is the root
339          * file system.
340          *
341          * XXX should this be controlled via a tunable?  It makes sense for
342          * the volume that has / on it.  I can't think of a case where we'd
343          * want the volume to go away on this kind of event.
344          */
345         if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
346              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
347              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
348              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
349              vol->v_disks_count) &&
350             (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
351                 return;
352         g_raid_fail_disk(sc, sd, disk);
353 }
354
355 static void
356 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
357 {
358         struct g_raid_volume *vol;
359         struct g_raid_subdisk *sd;
360
361         vol = trs->trso_base.tro_volume;
362         sd = trs->trso_failed_sd;
363         g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
364         free(trs->trso_buffer, M_TR_RAID1E);
365         trs->trso_buffer = NULL;
366         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
367         trs->trso_type = TR_RAID1E_NONE;
368         trs->trso_recover_slabs = 0;
369         trs->trso_failed_sd = NULL;
370         g_raid_tr_update_state_raid1e(vol, NULL);
371 }
372
373 static void
374 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
375 {
376         struct g_raid_tr_raid1e_object *trs;
377         struct g_raid_subdisk *sd;
378
379         trs = (struct g_raid_tr_raid1e_object *)tr;
380         sd = trs->trso_failed_sd;
381         G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
382             "Subdisk %s:%d-%s rebuild completed.",
383             sd->sd_volume->v_name, sd->sd_pos,
384             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
385         g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
386         sd->sd_rebuild_pos = 0;
387         g_raid_tr_raid1e_rebuild_done(trs);
388 }
389
390 static void
391 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
392 {
393         struct g_raid_tr_raid1e_object *trs;
394         struct g_raid_subdisk *sd;
395         struct g_raid_volume *vol;
396
397         vol = tr->tro_volume;
398         trs = (struct g_raid_tr_raid1e_object *)tr;
399         sd = trs->trso_failed_sd;
400         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
401                 G_RAID_DEBUG1(1, vol->v_softc,
402                     "Subdisk %s:%d-%s rebuild is aborting.",
403                     sd->sd_volume->v_name, sd->sd_pos,
404                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
405                 trs->trso_flags |= TR_RAID1E_F_ABORT;
406         } else {
407                 G_RAID_DEBUG1(0, vol->v_softc,
408                     "Subdisk %s:%d-%s rebuild aborted.",
409                     sd->sd_volume->v_name, sd->sd_pos,
410                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
411                 trs->trso_flags &= ~TR_RAID1E_F_ABORT;
412                 if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
413                         trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
414                         g_raid_unlock_range(tr->tro_volume,
415                             trs->trso_lock_pos, trs->trso_lock_len);
416                 }
417                 g_raid_tr_raid1e_rebuild_done(trs);
418         }
419 }
420
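/*
 * Issue the next chunk of rebuild I/O.  Starting at the failed subdisk's
 * rebuild position, ranges where that subdisk already holds the best copy are
 * skipped; otherwise the affected virtual range is locked and up to one slab
 * is read from the best remaining copy.  The matching write to the failed
 * subdisk is issued from the iodone path once the read completes.
 */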
421 static void
422 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
423 {
424         struct g_raid_tr_raid1e_object *trs;
425         struct g_raid_softc *sc;
426         struct g_raid_volume *vol;
427         struct g_raid_subdisk *sd;
428         struct bio *bp;
429         off_t len, virtual, vend, offset, start;
430         int disk, copy, best;
431
432         trs = (struct g_raid_tr_raid1e_object *)tr;
433         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
434                 return;
435         vol = tr->tro_volume;
436         sc = vol->v_softc;
437         sd = trs->trso_failed_sd;
438
439         while (1) {
440                 if (sd->sd_rebuild_pos >= sd->sd_size) {
441                         g_raid_tr_raid1e_rebuild_finish(tr);
442                         return;
443                 }
444                 /* Get virtual offset from physical rebuild position. */
445                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
446                 /* Get physical offset back to get first stripe position. */
447                 V2P(vol, virtual, &disk, &offset, &start);
448                 /* Calculate contiguous data length. */
449                 len = MIN(g_raid1e_rebuild_slab,
450                     sd->sd_size - sd->sd_rebuild_pos);
451                 if ((vol->v_disks_count % N) != 0)
452                         len = MIN(len, vol->v_strip_size - start);
453                 /* Find disk with most accurate data. */
454                 best = g_raid_tr_raid1e_select_read_disk(vol, disk,
455                     offset + start, len, 0);
456                 if (best < 0) {
457                         /* There is no valid disk. */
458                         g_raid_tr_raid1e_rebuild_abort(tr);
459                         return;
460                 } else if (best != copy) {
461                         /* Some other disk has better data. */
462                         break;
463                 }
464                 /* We have the most accurate data. Skip the range. */
465                 G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
466                     sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
467                 sd->sd_rebuild_pos += len;
468         }
469
470         bp = &trs->trso_bio;
471         memset(bp, 0, sizeof(*bp));
472         bp->bio_offset = offset + start +
473             ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
474         bp->bio_length = len;
475         bp->bio_data = trs->trso_buffer;
476         bp->bio_cmd = BIO_READ;
477         bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
478         bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
479         G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
480         /*
481          * If we are crossing a stripe boundary, correct the affected
482          * virtual range we should lock.
483          */
484         if (start + len > vol->v_strip_size) {
485                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
486                 len = vend - virtual;
487         }
488         trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
489         trs->trso_flags |= TR_RAID1E_F_LOCKED;
490         trs->trso_lock_pos = virtual;
491         trs->trso_lock_len = len;
492         /* Lock callback starts I/O */
493         g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
494 }
495
496 static void
497 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
498 {
499         struct g_raid_volume *vol;
500         struct g_raid_tr_raid1e_object *trs;
501         struct g_raid_subdisk *sd;
502
503         vol = tr->tro_volume;
504         trs = (struct g_raid_tr_raid1e_object *)tr;
505         if (trs->trso_failed_sd) {
506                 G_RAID_DEBUG1(1, vol->v_softc,
507                     "Already rebuilding in start rebuild. pos %jd\n",
508                     (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
509                 return;
510         }
511         sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
512         if (sd == NULL)
513                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
514         if (sd == NULL) {
515                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
516                 if (sd != NULL) {
517                         sd->sd_rebuild_pos = 0;
518                         g_raid_change_subdisk_state(sd,
519                             G_RAID_SUBDISK_S_RESYNC);
520                         g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
521                 } else {
522                         sd = g_raid_get_subdisk(vol,
523                             G_RAID_SUBDISK_S_UNINITIALIZED);
524                         if (sd == NULL)
525                                 sd = g_raid_get_subdisk(vol,
526                                     G_RAID_SUBDISK_S_NEW);
527                         if (sd != NULL) {
528                                 sd->sd_rebuild_pos = 0;
529                                 g_raid_change_subdisk_state(sd,
530                                     G_RAID_SUBDISK_S_REBUILD);
531                                 g_raid_write_metadata(vol->v_softc,
532                                     vol, sd, NULL);
533                         }
534                 }
535         }
536         if (sd == NULL) {
537                 G_RAID_DEBUG1(1, vol->v_softc,
538                     "No failed disk to rebuild.  night night.");
539                 return;
540         }
541         trs->trso_failed_sd = sd;
542         G_RAID_DEBUG1(0, vol->v_softc,
543             "Subdisk %s:%d-%s rebuild start at %jd.",
544             sd->sd_volume->v_name, sd->sd_pos,
545             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
546             trs->trso_failed_sd->sd_rebuild_pos);
547         trs->trso_type = TR_RAID1E_REBUILD;
548         trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
549         trs->trso_meta_update = g_raid1e_rebuild_meta_update;
550         g_raid_tr_raid1e_rebuild_some(tr);
551 }
552
553 static void
554 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
555     struct g_raid_subdisk *sd)
556 {
557         struct g_raid_volume *vol;
558         struct g_raid_tr_raid1e_object *trs;
559         int nr;
560         
561         vol = tr->tro_volume;
562         trs = (struct g_raid_tr_raid1e_object *)tr;
563         if (trs->trso_stopping)
564                 return;
565         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
566             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
567         switch(trs->trso_type) {
568         case TR_RAID1E_NONE:
569                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
570                         return;
571                 if (nr == 0) {
572                         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
573                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
574                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
575                         if (nr == 0)
576                                 return;
577                 }
578                 g_raid_tr_raid1e_rebuild_start(tr);
579                 break;
580         case TR_RAID1E_REBUILD:
581                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
582                     trs->trso_failed_sd == sd)
583                         g_raid_tr_raid1e_rebuild_abort(tr);
584                 break;
585         case TR_RAID1E_RESYNC:
586                 break;
587         }
588 }
589
590 static int
591 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
592     struct g_raid_subdisk *sd, u_int event)
593 {
594
595         g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
596         return (0);
597 }
598
599 static int
600 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
601 {
602         struct g_raid_tr_raid1e_object *trs;
603         struct g_raid_volume *vol;
604
605         trs = (struct g_raid_tr_raid1e_object *)tr;
606         vol = tr->tro_volume;
607         trs->trso_starting = 0;
608         g_raid_tr_update_state_raid1e(vol, NULL);
609         return (0);
610 }
611
612 static int
613 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
614 {
615         struct g_raid_tr_raid1e_object *trs;
616         struct g_raid_volume *vol;
617
618         trs = (struct g_raid_tr_raid1e_object *)tr;
619         vol = tr->tro_volume;
620         trs->trso_starting = 0;
621         trs->trso_stopping = 1;
622         g_raid_tr_update_state_raid1e(vol, NULL);
623         return (0);
624 }
625
626 /*
627  * Select the disk to read from.  Take into account: subdisk state, running
628  * error recovery, average disk load, head position and possible cache hits.
629  */
630 #define ABS(x)          (((x) >= 0) ? (x) : (-(x)))
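/*
 * Lower priority wins.  The base priority is the subdisk's current load;
 * copies that are stale (or not yet resynced at this offset) get a large
 * penalty, subdisks already busy with error recovery get a smaller one, and a
 * subdisk whose head is at (or within a track of) the target offset gets a
 * bonus.
 */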
631 static int
632 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
633     int no, off_t off, off_t len, u_int mask)
634 {
635         struct g_raid_subdisk *sd;
636         off_t offset;
637         int i, best, prio, bestprio;
638
639         best = -1;
640         bestprio = INT_MAX;
641         for (i = 0; i < N; i++) {
642                 sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
643                 offset = off;
644                 if (no + i >= vol->v_disks_count)
645                         offset += vol->v_strip_size;
646
647                 prio = G_RAID_SUBDISK_LOAD(sd);
648                 if ((mask & (1 << sd->sd_pos)) != 0)
649                         continue;
650                 switch (sd->sd_state) {
651                 case G_RAID_SUBDISK_S_ACTIVE:
652                         break;
653                 case G_RAID_SUBDISK_S_RESYNC:
654                         if (offset + off < sd->sd_rebuild_pos)
655                                 break;
656                         /* FALLTHROUGH */
657                 case G_RAID_SUBDISK_S_STALE:
658                         prio += i << 24;
659                         break;
660                 case G_RAID_SUBDISK_S_REBUILD:
661                         if (offset + off < sd->sd_rebuild_pos)
662                                 break;
663                         /* FALLTHROUGH */
664                 default:
665                         continue;
666                 }
667                 prio += min(sd->sd_recovery, 255) << 16;
668                 /* If disk head is precisely in position - highly prefer it. */
669                 if (G_RAID_SUBDISK_POS(sd) == offset)
670                         prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
671                 else
672                 /* If disk head is close to position - prefer it. */
673                 if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
674                     G_RAID_SUBDISK_TRACK_SIZE)
675                         prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
676                 if (prio < bestprio) {
677                         bestprio = prio;
678                         best = i;
679                 }
680         }
681         return (best);
682 }
683
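/*
 * Split a volume-level read at strip boundaries, picking the best copy for
 * each chunk with g_raid_tr_raid1e_select_read_disk().  The clones are queued
 * first and dispatched only after all of them have been allocated, so a
 * failed allocation can unwind cleanly with ENOMEM.
 */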
684 static void
685 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
686 {
687         struct g_raid_volume *vol;
688         struct g_raid_subdisk *sd;
689         struct bio_queue_head queue;
690         struct bio *cbp;
691         char *addr;
692         off_t offset, start, length, remain;
693         u_int no, strip_size;
694         int best;
695
696         vol = tr->tro_volume;
697         if ((bp->bio_flags & BIO_UNMAPPED) != 0)
698                 addr = NULL;
699         else
700                 addr = bp->bio_data;
701         strip_size = vol->v_strip_size;
702         V2P(vol, bp->bio_offset, &no, &offset, &start);
703         remain = bp->bio_length;
704         bioq_init(&queue);
705         while (remain > 0) {
706                 length = MIN(strip_size - start, remain);
707                 best = g_raid_tr_raid1e_select_read_disk(vol,
708                     no, offset, length, 0);
709                 KASSERT(best >= 0, ("No readable disk in volume %s!",
710                     vol->v_name));
711                 no += best;
712                 if (no >= vol->v_disks_count) {
713                         no -= vol->v_disks_count;
714                         offset += strip_size;
715                 }
716                 cbp = g_clone_bio(bp);
717                 if (cbp == NULL)
718                         goto failure;
719                 cbp->bio_offset = offset + start;
720                 cbp->bio_length = length;
721                 if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
722                         cbp->bio_ma_offset += (uintptr_t)addr;
723                         cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
724                         cbp->bio_ma_offset %= PAGE_SIZE;
725                         cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
726                             cbp->bio_length) / PAGE_SIZE;
727                 } else
728                         cbp->bio_data = addr;
729                 cbp->bio_caller1 = &vol->v_subdisks[no];
730                 bioq_insert_tail(&queue, cbp);
731                 no += N - best;
732                 if (no >= vol->v_disks_count) {
733                         no -= vol->v_disks_count;
734                         offset += strip_size;
735                 }
736                 remain -= length;
737                 addr += length;
738                 start = 0;
739         }
740         while ((cbp = bioq_takefirst(&queue)) != NULL) {
741                 sd = cbp->bio_caller1;
742                 cbp->bio_caller1 = NULL;
743                 g_raid_subdisk_iostart(sd, cbp);
744         }
745         return;
746 failure:
747         while ((cbp = bioq_takefirst(&queue)) != NULL)
748                 g_destroy_bio(cbp);
749         if (bp->bio_error == 0)
750                 bp->bio_error = ENOMEM;
751         g_raid_iodone(bp, bp->bio_error);
752 }
753
754 static void
755 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
756 {
757         struct g_raid_volume *vol;
758         struct g_raid_subdisk *sd;
759         struct bio_queue_head queue;
760         struct bio *cbp;
761         char *addr;
762         off_t offset, start, length, remain;
763         u_int no, strip_size;
764         int i;
765
766         vol = tr->tro_volume;
767         if ((bp->bio_flags & BIO_UNMAPPED) != 0)
768                 addr = NULL;
769         else
770                 addr = bp->bio_data;
771         strip_size = vol->v_strip_size;
772         V2P(vol, bp->bio_offset, &no, &offset, &start);
773         remain = bp->bio_length;
774         bioq_init(&queue);
775         while (remain > 0) {
776                 length = MIN(strip_size - start, remain);
777                 for (i = 0; i < N; i++) {
778                         sd = &vol->v_subdisks[no];
779                         switch (sd->sd_state) {
780                         case G_RAID_SUBDISK_S_ACTIVE:
781                         case G_RAID_SUBDISK_S_STALE:
782                         case G_RAID_SUBDISK_S_RESYNC:
783                                 break;
784                         case G_RAID_SUBDISK_S_REBUILD:
785                                 if (offset + start >= sd->sd_rebuild_pos)
786                                         goto nextdisk;
787                                 break;
788                         default:
789                                 goto nextdisk;
790                         }
791                         cbp = g_clone_bio(bp);
792                         if (cbp == NULL)
793                                 goto failure;
794                         cbp->bio_offset = offset + start;
795                         cbp->bio_length = length;
796                         if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
797                             bp->bio_cmd != BIO_DELETE) {
798                                 cbp->bio_ma_offset += (uintptr_t)addr;
799                                 cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
800                                 cbp->bio_ma_offset %= PAGE_SIZE;
801                                 cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
802                                     cbp->bio_length) / PAGE_SIZE;
803                         } else
804                                 cbp->bio_data = addr;
805                         cbp->bio_caller1 = sd;
806                         bioq_insert_tail(&queue, cbp);
807 nextdisk:
808                         if (++no >= vol->v_disks_count) {
809                                 no = 0;
810                                 offset += strip_size;
811                         }
812                 }
813                 remain -= length;
814                 if (bp->bio_cmd != BIO_DELETE)
815                         addr += length;
816                 start = 0;
817         }
818         while ((cbp = bioq_takefirst(&queue)) != NULL) {
819                 sd = cbp->bio_caller1;
820                 cbp->bio_caller1 = NULL;
821                 g_raid_subdisk_iostart(sd, cbp);
822         }
823         return;
824 failure:
825         while ((cbp = bioq_takefirst(&queue)) != NULL)
826                 g_destroy_bio(cbp);
827         if (bp->bio_error == 0)
828                 bp->bio_error = ENOMEM;
829         g_raid_iodone(bp, bp->bio_error);
830 }
831
832 static void
833 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
834 {
835         struct g_raid_volume *vol;
836         struct g_raid_tr_raid1e_object *trs;
837
838         vol = tr->tro_volume;
839         trs = (struct g_raid_tr_raid1e_object *)tr;
840         if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
841             vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
842             vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
843                 g_raid_iodone(bp, EIO);
844                 return;
845         }
846         /*
847          * If we're rebuilding, squeeze in rebuild activity every so often,
848          * even when the disk is busy.  Be sure to only count real I/O
849          * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
850          * by this module.
851          */
852         if (trs->trso_failed_sd != NULL &&
853             !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
854                 /* Make this new or currently running round short. */
855                 trs->trso_recover_slabs = 0;
856                 if (--trs->trso_fair_io <= 0) {
857                         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
858                         g_raid_tr_raid1e_rebuild_some(tr);
859                 }
860         }
861         switch (bp->bio_cmd) {
862         case BIO_READ:
863                 g_raid_tr_iostart_raid1e_read(tr, bp);
864                 break;
865         case BIO_WRITE:
866         case BIO_DELETE:
867                 g_raid_tr_iostart_raid1e_write(tr, bp);
868                 break;
869         case BIO_FLUSH:
870                 g_raid_tr_flush_common(tr, bp);
871                 break;
872         default:
873                 KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
874                     bp->bio_cmd, vol->v_name));
875                 break;
876         }
877 }
878
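/*
 * I/O completion.  Rebuild traffic is recognized by G_RAID_BIO_FLAG_SYNC: a
 * finished rebuild read is turned into the corresponding write to the failed
 * subdisk, and a finished write advances the rebuild position and
 * periodically triggers a metadata update.  For regular reads that failed,
 * the request is retried on another copy and the recovered data may be
 * written back to remap a bad sector; write errors fail the subdisk.
 */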
879 static void
880 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
881     struct g_raid_subdisk *sd, struct bio *bp)
882 {
883         struct bio *cbp;
884         struct g_raid_subdisk *nsd;
885         struct g_raid_volume *vol;
886         struct bio *pbp;
887         struct g_raid_tr_raid1e_object *trs;
888         off_t virtual, offset, start;
889         uintptr_t mask;
890         int error, do_write, copy, disk, best;
891
892         trs = (struct g_raid_tr_raid1e_object *)tr;
893         vol = tr->tro_volume;
894         if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
895                 if (trs->trso_type == TR_RAID1E_REBUILD) {
896                         nsd = trs->trso_failed_sd;
897                         if (bp->bio_cmd == BIO_READ) {
898
899                                 /* Immediately abort rebuild, if requested. */
900                                 if (trs->trso_flags & TR_RAID1E_F_ABORT) {
901                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
902                                         g_raid_tr_raid1e_rebuild_abort(tr);
903                                         return;
904                                 }
905
906                                 /* On read error, skip and cross fingers. */
907                                 if (bp->bio_error != 0) {
908                                         G_RAID_LOGREQ(0, bp,
909                                             "Read error during rebuild (%d), "
910                                             "possible data loss!",
911                                             bp->bio_error);
912                                         goto rebuild_round_done;
913                                 }
914
915                                 /*
916                                  * The read operation finished, queue the
917                                  * write and get out.
918                                  */
919                                 G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
920                                     bp->bio_error);
921                                 bp->bio_cmd = BIO_WRITE;
922                                 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
923                                 bp->bio_offset = nsd->sd_rebuild_pos;
924                                 G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
925                                 g_raid_subdisk_iostart(nsd, bp);
926                         } else {
927                                 /*
928                                  * The write operation just finished.  Do
929                                  * another.  We keep cloning the master bio
930                                  * since it has the right buffers allocated to
931                                  * it.
932                                  */
933                                 G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
934                                     bp->bio_error);
935                                 if (bp->bio_error != 0 ||
936                                     trs->trso_flags & TR_RAID1E_F_ABORT) {
937                                         if ((trs->trso_flags &
938                                             TR_RAID1E_F_ABORT) == 0) {
939                                                 g_raid_tr_raid1e_fail_disk(sd->sd_softc,
940                                                     nsd, nsd->sd_disk);
941                                         }
942                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
943                                         g_raid_tr_raid1e_rebuild_abort(tr);
944                                         return;
945                                 }
946 rebuild_round_done:
947                                 trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
948                                 g_raid_unlock_range(tr->tro_volume,
949                                     trs->trso_lock_pos, trs->trso_lock_len);
950                                 nsd->sd_rebuild_pos += bp->bio_length;
951                                 if (nsd->sd_rebuild_pos >= nsd->sd_size) {
952                                         g_raid_tr_raid1e_rebuild_finish(tr);
953                                         return;
954                                 }
955
956                                 /* Abort rebuild if we are stopping */
957                                 if (trs->trso_stopping) {
958                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
959                                         g_raid_tr_raid1e_rebuild_abort(tr);
960                                         return;
961                                 }
962
963                                 if (--trs->trso_meta_update <= 0) {
964                                         g_raid_write_metadata(vol->v_softc,
965                                             vol, nsd, nsd->sd_disk);
966                                         trs->trso_meta_update =
967                                             g_raid1e_rebuild_meta_update;
968                                         /* Compensate for short rebuild I/Os. */
969                                         if ((vol->v_disks_count % N) != 0 &&
970                                             vol->v_strip_size <
971                                              g_raid1e_rebuild_slab) {
972                                                 trs->trso_meta_update *=
973                                                     g_raid1e_rebuild_slab;
974                                                 trs->trso_meta_update /=
975                                                     vol->v_strip_size;
976                                         }
977                                 }
978                                 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
979                                 if (--trs->trso_recover_slabs <= 0)
980                                         return;
981                                 /* Run next rebuild iteration. */
982                                 g_raid_tr_raid1e_rebuild_some(tr);
983                         }
984                 } else if (trs->trso_type == TR_RAID1E_RESYNC) {
985                         /*
986                          * read good sd, read bad sd in parallel.  when both
987                          * done, compare the buffers.  write good to the bad
988                          * if different.  do the next bit of work.
989                          */
990                         panic("Somehow, we think we're doing a resync");
991                 }
992                 return;
993         }
994         pbp = bp->bio_parent;
995         pbp->bio_inbed++;
996         mask = (intptr_t)bp->bio_caller2;
997         if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
998                 /*
999                  * Read failed on the first drive.  Retry the read on
1000                  * another disk, if available, before erroring out the
1001                  * read.
1002                  */
1003                 sd->sd_disk->d_read_errs++;
1004                 G_RAID_LOGREQ(0, bp,
1005                     "Read error (%d), %d read errors total",
1006                     bp->bio_error, sd->sd_disk->d_read_errs);
1007
1008                 /*
1009                  * If there are too many read errors, we move to degraded.
1010                  * XXX Do we want to FAIL the drive (eg, make the user redo
1011                  * everything to get it back in sync), or just degrade the
1012                  * drive, which kicks off a resync?
1013                  */
1014                 do_write = 0;
1015                 if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
1016                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1017                 else if (mask == 0)
1018                         do_write = 1;
1019
1020                 /* Restore what we were doing. */
1021                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1022                 V2P(vol, virtual, &disk, &offset, &start);
1023
1024                 /* Find the other disk, and try to do the I/O to it. */
1025                 mask |= 1 << copy;
1026                 best = g_raid_tr_raid1e_select_read_disk(vol,
1027                     disk, offset, start, mask);
1028                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1029                         disk += best;
1030                         if (disk >= vol->v_disks_count) {
1031                                 disk -= vol->v_disks_count;
1032                                 offset += vol->v_strip_size;
1033                         }
1034                         cbp->bio_offset = offset + start;
1035                         cbp->bio_length = bp->bio_length;
1036                         cbp->bio_data = bp->bio_data;
1037                         cbp->bio_ma = bp->bio_ma;
1038                         cbp->bio_ma_offset = bp->bio_ma_offset;
1039                         cbp->bio_ma_n = bp->bio_ma_n;
1040                         g_destroy_bio(bp);
1041                         nsd = &vol->v_subdisks[disk];
1042                         G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
1043                             nsd->sd_pos);
1044                         if (do_write)
1045                                 mask |= 1 << 31;
1046                         if ((mask & (1U << 31)) != 0)
1047                                 sd->sd_recovery++;
1048                         cbp->bio_caller2 = (void *)mask;
1049                         if (do_write) {
1050                                 cbp->bio_caller1 = nsd;
1051                                 /* Lock callback starts I/O */
1052                                 g_raid_lock_range(sd->sd_volume,
1053                                     virtual, cbp->bio_length, pbp, cbp);
1054                         } else {
1055                                 g_raid_subdisk_iostart(nsd, cbp);
1056                         }
1057                         return;
1058                 }
1059                 /*
1060                  * We can't retry.  Return the original error by falling
1061                  * through.  This will happen when there's only one good disk.
1062                  * We don't need to fail the raid, since its actual state is
1063                  * based on the state of the subdisks.
1064                  */
1065                 G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
1066         }
1067         if (bp->bio_cmd == BIO_READ &&
1068             bp->bio_error == 0 &&
1069             (mask & (1U << 31)) != 0) {
1070                 G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
1071
1072                 /* Restore what we were doing. */
1073                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1074                 V2P(vol, virtual, &disk, &offset, &start);
1075
1076                 /* Find best disk to write. */
1077                 best = g_raid_tr_raid1e_select_read_disk(vol,
1078                     disk, offset, start, ~mask);
1079                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1080                         disk += best;
1081                         if (disk >= vol->v_disks_count) {
1082                                 disk -= vol->v_disks_count;
1083                                 offset += vol->v_strip_size;
1084                         }
1085                         cbp->bio_offset = offset + start;
1086                         cbp->bio_cmd = BIO_WRITE;
1087                         cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
1088                         cbp->bio_caller2 = (void *)mask;
1089                         g_destroy_bio(bp);
1090                         G_RAID_LOGREQ(2, cbp,
1091                             "Attempting bad sector remap on failing drive.");
1092                         g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
1093                         return;
1094                 }
1095         }
1096         if ((mask & (1U << 31)) != 0) {
1097                 /*
1098                  * We're done with a recovery, mark the range as unlocked.
1099                  * For any write errors, we aggressively fail the disk since
1100                  * there was both a READ and a WRITE error at this location.
1101                  * Both types of errors generally indicate the drive is on
1102                  * the verge of total failure anyway.  Better to stop trusting
1103                  * it now.  However, we need to reset error to 0 in that case
1104                  * because we're not failing the original I/O which succeeded.
1105                  */
1106
1107                 /* Restore what we were doing. */
1108                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1109                 V2P(vol, virtual, &disk, &offset, &start);
1110
1111                 for (copy = 0; copy < N; copy++) {
1112                         if ((mask & (1 << copy) ) != 0)
1113                                 vol->v_subdisks[(disk + copy) %
1114                                     vol->v_disks_count].sd_recovery--;
1115                 }
1116
1117                 if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
1118                         G_RAID_LOGREQ(0, bp, "Remap write failed: "
1119                             "failing subdisk.");
1120                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1121                         bp->bio_error = 0;
1122                 }
1123                 G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
1124                 g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
1125         }
1126         if (pbp->bio_cmd != BIO_READ) {
1127                 if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
1128                         pbp->bio_error = bp->bio_error;
1129                 if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
1130                         G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
1131                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1132                 }
1133                 error = pbp->bio_error;
1134         } else
1135                 error = bp->bio_error;
1136         g_destroy_bio(bp);
1137         if (pbp->bio_children == pbp->bio_inbed) {
1138                 pbp->bio_completed = pbp->bio_length;
1139                 g_raid_iodone(pbp, error);
1140         }
1141 }
1142
1143 static int
1144 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
1145     void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
1146 {
1147         struct g_raid_volume *vol;
1148         struct g_raid_subdisk *sd;
1149         struct bio_queue_head queue;
1150         char *addr;
1151         off_t offset, start, length, remain;
1152         u_int no, strip_size;
1153         int i, error;
1154
1155         vol = tr->tro_volume;
1156         addr = virtual;
1157         strip_size = vol->v_strip_size;
1158         V2P(vol, boffset, &no, &offset, &start);
1159         remain = blength;
1160         bioq_init(&queue);
1161         while (remain > 0) {
1162                 length = MIN(strip_size - start, remain);
1163                 for (i = 0; i < N; i++) {
1164                         sd = &vol->v_subdisks[no];
1165                         switch (sd->sd_state) {
1166                         case G_RAID_SUBDISK_S_ACTIVE:
1167                         case G_RAID_SUBDISK_S_STALE:
1168                         case G_RAID_SUBDISK_S_RESYNC:
1169                                 break;
1170                         case G_RAID_SUBDISK_S_REBUILD:
1171                                 if (offset + start >= sd->sd_rebuild_pos)
1172                                         goto nextdisk;
1173                                 break;
1174                         default:
1175                                 goto nextdisk;
1176                         }
1177                         error = g_raid_subdisk_kerneldump(sd,
1178                             addr, 0, offset + start, length);
1179                         if (error != 0)
1180                                 return (error);
1181 nextdisk:
1182                         if (++no >= vol->v_disks_count) {
1183                                 no = 0;
1184                                 offset += strip_size;
1185                         }
1186                 }
1187                 remain -= length;
1188                 addr += length;
1189                 start = 0;
1190         }
1191         return (0);
1192 }
1193
1194 static int
1195 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
1196 {
1197         struct bio *bp;
1198         struct g_raid_subdisk *sd;
1199
1200         bp = (struct bio *)argp;
1201         sd = (struct g_raid_subdisk *)bp->bio_caller1;
1202         g_raid_subdisk_iostart(sd, bp);
1203
1204         return (0);
1205 }
1206
1207 static int
1208 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
1209 {
1210         struct g_raid_tr_raid1e_object *trs;
1211         struct g_raid_volume *vol;
1212
1213         vol = tr->tro_volume;
1214         trs = (struct g_raid_tr_raid1e_object *)tr;
1215         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
1216         trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
1217         /* Compensate for short rebuild I/Os. */
1218         if ((vol->v_disks_count % N) != 0 &&
1219             vol->v_strip_size < g_raid1e_rebuild_slab) {
1220                 trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
1221                 trs->trso_recover_slabs /= vol->v_strip_size;
1222         }
1223         if (trs->trso_type == TR_RAID1E_REBUILD)
1224                 g_raid_tr_raid1e_rebuild_some(tr);
1225         return (0);
1226 }
1227
1228 static int
1229 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
1230 {
1231         struct g_raid_tr_raid1e_object *trs;
1232
1233         trs = (struct g_raid_tr_raid1e_object *)tr;
1234
1235         if (trs->trso_buffer != NULL) {
1236                 free(trs->trso_buffer, M_TR_RAID1E);
1237                 trs->trso_buffer = NULL;
1238         }
1239         return (0);
1240 }
1241
1242 G_RAID_TR_DECLARE(raid1e, "RAID1E");