/*-
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include "geom/raid/g_raid.h"
#include "g_raid_tr_if.h"

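/* Number of copies RAID1E keeps of each data strip. */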
#define N       2

SYSCTL_DECL(_kern_geom_raid_raid1e);

#define RAID1E_REBUILD_SLAB     (1 << 20) /* One transaction in a rebuild */
static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_slab_size",
    &g_raid1e_rebuild_slab);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RW,
    &g_raid1e_rebuild_slab, 0,
    "Amount of the disk to rebuild each read/write cycle of the rebuild.");

#define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_fair_io",
    &g_raid1e_rebuild_fair_io);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RW,
    &g_raid1e_rebuild_fair_io, 0,
    "Fraction of the I/O bandwidth to use when the disk is busy for rebuild.");

#define RAID1E_REBUILD_CLUSTER_IDLE 100
static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_cluster_idle",
    &g_raid1e_rebuild_cluster_idle);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW,
    &g_raid1e_rebuild_cluster_idle, 0,
    "Number of slabs to do each time we trigger a rebuild cycle.");

#define RAID1E_REBUILD_META_UPDATE 1024 /* update metadata every 1GB or so */
static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_meta_update",
    &g_raid1e_rebuild_meta_update);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RW,
    &g_raid1e_rebuild_meta_update, 0,
    "When to update the metadata.");
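
/*
 * All of the knobs above are loader tunables and read-write sysctls, so
 * they may be preset in loader.conf or adjusted at run time, for example:
 *
 *   sysctl kern.geom.raid.raid1e.rebuild_slab_size=2097152
 */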

static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");

#define TR_RAID1E_NONE 0
#define TR_RAID1E_REBUILD 1
#define TR_RAID1E_RESYNC 2

#define TR_RAID1E_F_DOING_SOME  0x1
#define TR_RAID1E_F_LOCKED      0x2
#define TR_RAID1E_F_ABORT       0x4

struct g_raid_tr_raid1e_object {
        struct g_raid_tr_object  trso_base;
        int                      trso_starting;
        int                      trso_stopping;
        int                      trso_type;
        int                      trso_recover_slabs; /* slabs before rest */
        int                      trso_fair_io;
        int                      trso_meta_update;
        int                      trso_flags;
        struct g_raid_subdisk   *trso_failed_sd; /* like per volume */
        void                    *trso_buffer;    /* Buffer space */
        off_t                    trso_lock_pos; /* Locked range start. */
        off_t                    trso_lock_len; /* Locked range length. */
        struct bio               trso_bio;
};

static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
static g_raid_tr_event_t g_raid_tr_event_raid1e;
static g_raid_tr_start_t g_raid_tr_start_raid1e;
static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
static g_raid_tr_free_t g_raid_tr_free_raid1e;

static kobj_method_t g_raid_tr_raid1e_methods[] = {
        KOBJMETHOD(g_raid_tr_taste,     g_raid_tr_taste_raid1e),
        KOBJMETHOD(g_raid_tr_event,     g_raid_tr_event_raid1e),
        KOBJMETHOD(g_raid_tr_start,     g_raid_tr_start_raid1e),
        KOBJMETHOD(g_raid_tr_stop,      g_raid_tr_stop_raid1e),
        KOBJMETHOD(g_raid_tr_iostart,   g_raid_tr_iostart_raid1e),
        KOBJMETHOD(g_raid_tr_iodone,    g_raid_tr_iodone_raid1e),
        KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
        KOBJMETHOD(g_raid_tr_locked,    g_raid_tr_locked_raid1e),
        KOBJMETHOD(g_raid_tr_idle,      g_raid_tr_idle_raid1e),
        KOBJMETHOD(g_raid_tr_free,      g_raid_tr_free_raid1e),
        { 0, 0 }
};

static struct g_raid_tr_class g_raid_tr_raid1e_class = {
        "RAID1E",
        g_raid_tr_raid1e_methods,
        sizeof(struct g_raid_tr_raid1e_object),
        .trc_enable = 1,
        .trc_priority = 200
};

static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd);
static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask);

static inline void
V2P(struct g_raid_volume *vol, off_t virt,
    int *disk, off_t *offset, off_t *start)
{
        off_t nstrip;
        u_int strip_size;

        strip_size = vol->v_strip_size;
        /* Strip number. */
        nstrip = virt / strip_size;
        /* Start position in strip. */
        *start = virt % strip_size;
        /* Disk number. */
        *disk = (nstrip * N) % vol->v_disks_count;
        /* Strip start position in disk. */
        *offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
}

static inline void
P2V(struct g_raid_volume *vol, int disk, off_t offset,
    off_t *virt, int *copy)
{
        off_t nstrip, start;
        u_int strip_size;

        strip_size = vol->v_strip_size;
        /* Start position in strip. */
        start = offset % strip_size;
        /* Physical strip number. */
        nstrip = (offset / strip_size) * vol->v_disks_count + disk;
        /* Number of physical strip (copy) inside virtual strip. */
        *copy = nstrip % N;
        /* Offset in virtual space. */
        *virt = (nstrip / N) * strip_size + start;
}
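
/*
 * Worked example (illustrative only, not from the original source): with
 * three disks and a 64KB strip, virtual offset 200KB falls in virtual
 * strip 3 at start 8KB.  Its N = 2 copies are physical strips 6 and 7,
 * so V2P() returns disk (3 * 2) % 3 = 0 at offset (6 / 3) * 64KB = 128KB,
 * with the second copy on disk 1 at the same offset.  P2V() inverts this:
 * disk 0 at offset 136KB maps back to virtual 200KB as copy 0.
 */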

static int
g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
{
        struct g_raid_tr_raid1e_object *trs;

        trs = (struct g_raid_tr_raid1e_object *)tr;
        if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
            tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
                return (G_RAID_TR_TASTE_FAIL);
        trs->trso_starting = 1;
        return (G_RAID_TR_TASTE_SUCCEED);
}

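/*
 * With an even number of disks, the N copies of each virtual strip always
 * land on the same fixed group of N disks, so the volume state can be
 * judged per group.  With an odd number of disks the copies rotate across
 * all disks, which the separate "odd" handler below has to account for.
 */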
static int
g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
{
        struct g_raid_softc *sc;
        struct g_raid_subdisk *sd, *bestsd, *worstsd;
        int i, j, state, sstate;

        sc = vol->v_softc;
        state = G_RAID_VOLUME_S_OPTIMAL;
        for (i = 0; i < vol->v_disks_count / N; i++) {
                bestsd = &vol->v_subdisks[i * N];
                for (j = 1; j < N; j++) {
                        sd = &vol->v_subdisks[i * N + j];
                        if (sd->sd_state > bestsd->sd_state)
                                bestsd = sd;
                        else if (sd->sd_state == bestsd->sd_state &&
                            (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
                             sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
                            sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
                                bestsd = sd;
                }
                if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
                    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
                        /* We found a reasonable candidate. */
                        G_RAID_DEBUG1(1, sc,
                            "Promote subdisk %s:%d from %s to ACTIVE.",
                            vol->v_name, bestsd->sd_pos,
                            g_raid_subdisk_state2str(bestsd->sd_state));
                        g_raid_change_subdisk_state(bestsd,
                            G_RAID_SUBDISK_S_ACTIVE);
                        g_raid_write_metadata(sc,
                            vol, bestsd, bestsd->sd_disk);
                }
                worstsd = &vol->v_subdisks[i * N];
                for (j = 1; j < N; j++) {
                        sd = &vol->v_subdisks[i * N + j];
                        if (sd->sd_state < worstsd->sd_state)
                                worstsd = sd;
                }
                if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
                        sstate = G_RAID_VOLUME_S_OPTIMAL;
                else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
                        sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
                else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
                        sstate = G_RAID_VOLUME_S_DEGRADED;
                else
                        sstate = G_RAID_VOLUME_S_BROKEN;
                if (sstate < state)
                        state = sstate;
        }
        return (state);
}

static int
g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
{
        struct g_raid_softc *sc;
        struct g_raid_subdisk *sd, *bestsd, *worstsd;
        int i, j, state, sstate;

        sc = vol->v_softc;
        if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
            vol->v_disks_count)
                return (G_RAID_VOLUME_S_OPTIMAL);
        for (i = 0; i < vol->v_disks_count; i++) {
                sd = &vol->v_subdisks[i];
                if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
                        /* We found a reasonable candidate. */
                        G_RAID_DEBUG1(1, sc,
                            "Promote subdisk %s:%d from %s to STALE.",
                            vol->v_name, sd->sd_pos,
                            g_raid_subdisk_state2str(sd->sd_state));
                        g_raid_change_subdisk_state(sd,
                            G_RAID_SUBDISK_S_STALE);
                        g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
                }
        }
        state = G_RAID_VOLUME_S_OPTIMAL;
        for (i = 0; i < vol->v_disks_count; i++) {
                bestsd = &vol->v_subdisks[i];
                worstsd = &vol->v_subdisks[i];
                for (j = 1; j < N; j++) {
                        sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
                        if (sd->sd_state > bestsd->sd_state)
                                bestsd = sd;
                        else if (sd->sd_state == bestsd->sd_state &&
                            (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
                             sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
                            sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
                                bestsd = sd;
                        if (sd->sd_state < worstsd->sd_state)
                                worstsd = sd;
                }
                if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
                        sstate = G_RAID_VOLUME_S_OPTIMAL;
                else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
                        sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
                else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
                        sstate = G_RAID_VOLUME_S_DEGRADED;
                else
                        sstate = G_RAID_VOLUME_S_BROKEN;
                if (sstate < state)
                        state = sstate;
        }
        return (state);
}

static int
g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
    struct g_raid_subdisk *sd)
{
        struct g_raid_tr_raid1e_object *trs;
        struct g_raid_softc *sc;
        u_int s;

        sc = vol->v_softc;
        trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
        if (trs->trso_stopping &&
            (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
                s = G_RAID_VOLUME_S_STOPPED;
        else if (trs->trso_starting)
                s = G_RAID_VOLUME_S_STARTING;
        else {
                if ((vol->v_disks_count % N) == 0)
                        s = g_raid_tr_update_state_raid1e_even(vol);
                else
                        s = g_raid_tr_update_state_raid1e_odd(vol);
        }
        if (s != vol->v_state) {
                g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
                    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
                    G_RAID_EVENT_VOLUME);
                g_raid_change_volume_state(vol, s);
                if (!trs->trso_starting && !trs->trso_stopping)
                        g_raid_write_metadata(sc, vol, NULL, NULL);
        }
        if (!trs->trso_starting && !trs->trso_stopping)
                g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
        return (0);
}

static void
g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
    struct g_raid_disk *disk)
{
        struct g_raid_volume *vol;

        vol = sd->sd_volume;
        /*
         * We don't fail the last disk in the pack, since it still has decent
         * data on it and that's better than failing the disk if it is the root
         * file system.
         *
         * XXX should this be controlled via a tunable?  It makes sense for
         * the volume that has / on it.  I can't think of a case where we'd
         * want the volume to go away on this kind of event.
         */
        if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
             vol->v_disks_count) &&
            (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
                return;
        g_raid_fail_disk(sc, sd, disk);
}

static void
g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
{
        struct g_raid_volume *vol;
        struct g_raid_subdisk *sd;

        vol = trs->trso_base.tro_volume;
        sd = trs->trso_failed_sd;
        g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
        free(trs->trso_buffer, M_TR_RAID1E);
        trs->trso_buffer = NULL;
        trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
        trs->trso_type = TR_RAID1E_NONE;
        trs->trso_recover_slabs = 0;
        trs->trso_failed_sd = NULL;
        g_raid_tr_update_state_raid1e(vol, NULL);
}

static void
g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
{
        struct g_raid_tr_raid1e_object *trs;
        struct g_raid_subdisk *sd;

        trs = (struct g_raid_tr_raid1e_object *)tr;
        sd = trs->trso_failed_sd;
        G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
            "Subdisk %s:%d-%s rebuild completed.",
            sd->sd_volume->v_name, sd->sd_pos,
            sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
        g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
        sd->sd_rebuild_pos = 0;
        g_raid_tr_raid1e_rebuild_done(trs);
}

static void
g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
{
        struct g_raid_tr_raid1e_object *trs;
        struct g_raid_subdisk *sd;
        struct g_raid_volume *vol;

        vol = tr->tro_volume;
        trs = (struct g_raid_tr_raid1e_object *)tr;
        sd = trs->trso_failed_sd;
        if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
                G_RAID_DEBUG1(1, vol->v_softc,
                    "Subdisk %s:%d-%s rebuild is aborting.",
                    sd->sd_volume->v_name, sd->sd_pos,
                    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
                trs->trso_flags |= TR_RAID1E_F_ABORT;
        } else {
                G_RAID_DEBUG1(0, vol->v_softc,
                    "Subdisk %s:%d-%s rebuild aborted.",
                    sd->sd_volume->v_name, sd->sd_pos,
                    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
                trs->trso_flags &= ~TR_RAID1E_F_ABORT;
                if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
                        trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
                        g_raid_unlock_range(tr->tro_volume,
                            trs->trso_lock_pos, trs->trso_lock_len);
                }
                g_raid_tr_raid1e_rebuild_done(trs);
        }
}

static void
g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
{
        struct g_raid_tr_raid1e_object *trs;
        struct g_raid_softc *sc;
        struct g_raid_volume *vol;
        struct g_raid_subdisk *sd;
        struct bio *bp;
        off_t len, virtual, vend, offset, start;
        int disk, copy, best;

        trs = (struct g_raid_tr_raid1e_object *)tr;
        if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
                return;
        vol = tr->tro_volume;
        sc = vol->v_softc;
        sd = trs->trso_failed_sd;

        while (1) {
                if (sd->sd_rebuild_pos >= sd->sd_size) {
                        g_raid_tr_raid1e_rebuild_finish(tr);
                        return;
                }
                /* Get virtual offset from physical rebuild position. */
                P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
                /* Get physical offset back to get first stripe position. */
                V2P(vol, virtual, &disk, &offset, &start);
                /* Calculate contiguous data length. */
                len = MIN(g_raid1e_rebuild_slab,
                    sd->sd_size - sd->sd_rebuild_pos);
                if ((vol->v_disks_count % N) != 0)
                        len = MIN(len, vol->v_strip_size - start);
                /* Find disk with most accurate data. */
                best = g_raid_tr_raid1e_select_read_disk(vol, disk,
                    offset + start, len, 0);
                if (best < 0) {
                        /* There is no valid disk. */
                        g_raid_tr_raid1e_rebuild_abort(tr);
                        return;
                } else if (best != copy) {
                        /* Some other disk has better data. */
                        break;
                }
                /* We have the most accurate data. Skip the range. */
                G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
                    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
                sd->sd_rebuild_pos += len;
        }

        bp = &trs->trso_bio;
        memset(bp, 0, sizeof(*bp));
        bp->bio_offset = offset + start +
            ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
        bp->bio_length = len;
        bp->bio_data = trs->trso_buffer;
        bp->bio_cmd = BIO_READ;
        bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
        bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
        G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
        /*
         * If we are crossing a stripe boundary, correct the affected virtual
         * range we should lock.
         */
        if (start + len > vol->v_strip_size) {
                P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
                len = vend - virtual;
        }
        trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
        trs->trso_flags |= TR_RAID1E_F_LOCKED;
        trs->trso_lock_pos = virtual;
        trs->trso_lock_len = len;
        /* Lock callback starts I/O */
        g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
}

static void
g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
{
        struct g_raid_volume *vol;
        struct g_raid_tr_raid1e_object *trs;
        struct g_raid_subdisk *sd;

        vol = tr->tro_volume;
        trs = (struct g_raid_tr_raid1e_object *)tr;
        if (trs->trso_failed_sd) {
                G_RAID_DEBUG1(1, vol->v_softc,
                    "Already rebuilding in start rebuild. pos %jd\n",
                    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
                return;
        }
        sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
        if (sd == NULL)
                sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
        if (sd == NULL) {
                sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
                if (sd != NULL) {
                        sd->sd_rebuild_pos = 0;
                        g_raid_change_subdisk_state(sd,
                            G_RAID_SUBDISK_S_RESYNC);
                        g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
                } else {
                        sd = g_raid_get_subdisk(vol,
                            G_RAID_SUBDISK_S_UNINITIALIZED);
                        if (sd == NULL)
                                sd = g_raid_get_subdisk(vol,
                                    G_RAID_SUBDISK_S_NEW);
                        if (sd != NULL) {
                                sd->sd_rebuild_pos = 0;
                                g_raid_change_subdisk_state(sd,
                                    G_RAID_SUBDISK_S_REBUILD);
                                g_raid_write_metadata(vol->v_softc,
                                    vol, sd, NULL);
                        }
                }
        }
        if (sd == NULL) {
                G_RAID_DEBUG1(1, vol->v_softc,
                    "No failed disk to rebuild.  night night.");
                return;
        }
        trs->trso_failed_sd = sd;
        G_RAID_DEBUG1(0, vol->v_softc,
            "Subdisk %s:%d-%s rebuild start at %jd.",
            sd->sd_volume->v_name, sd->sd_pos,
            sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
            trs->trso_failed_sd->sd_rebuild_pos);
        trs->trso_type = TR_RAID1E_REBUILD;
        trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
        trs->trso_meta_update = g_raid1e_rebuild_meta_update;
        g_raid_tr_raid1e_rebuild_some(tr);
}

static void
g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd)
{
        struct g_raid_volume *vol;
        struct g_raid_tr_raid1e_object *trs;
        int nr;

        vol = tr->tro_volume;
        trs = (struct g_raid_tr_raid1e_object *)tr;
        if (trs->trso_stopping)
                return;
        nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
            g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
        switch (trs->trso_type) {
        case TR_RAID1E_NONE:
                if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
                        return;
                if (nr == 0) {
                        nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
                            g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
                            g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
                        if (nr == 0)
                                return;
                }
                g_raid_tr_raid1e_rebuild_start(tr);
                break;
        case TR_RAID1E_REBUILD:
                if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
                    trs->trso_failed_sd == sd)
                        g_raid_tr_raid1e_rebuild_abort(tr);
                break;
        case TR_RAID1E_RESYNC:
                break;
        }
}

static int
g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{

        g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
        return (0);
}

static int
g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
{
        struct g_raid_tr_raid1e_object *trs;
        struct g_raid_volume *vol;

        trs = (struct g_raid_tr_raid1e_object *)tr;
        vol = tr->tro_volume;
        trs->trso_starting = 0;
        g_raid_tr_update_state_raid1e(vol, NULL);
        return (0);
}

static int
g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
{
        struct g_raid_tr_raid1e_object *trs;
        struct g_raid_volume *vol;

        trs = (struct g_raid_tr_raid1e_object *)tr;
        vol = tr->tro_volume;
        trs->trso_starting = 0;
        trs->trso_stopping = 1;
        g_raid_tr_update_state_raid1e(vol, NULL);
        return (0);
}

/*
 * Select the disk to read from.  Take into account: subdisk state, running
 * error recovery, average disk load, head position and possible cache hits.
 */
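/*
 * Lower priority value wins.  The value packs, from most to least
 * significant: copy staleness (i << 24 for stale or not-yet-rebuilt
 * copies), the number of recovery operations in flight (<< 16), and the
 * current subdisk load, reduced when the disk head already sits at or
 * near the requested offset.
 */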
#define ABS(x)          (((x) >= 0) ? (x) : (-(x)))
static int
g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask)
{
        struct g_raid_subdisk *sd;
        off_t offset;
        int i, best, prio, bestprio;

        best = -1;
        bestprio = INT_MAX;
        for (i = 0; i < N; i++) {
                sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
                offset = off;
                if (no + i >= vol->v_disks_count)
                        offset += vol->v_strip_size;

                prio = G_RAID_SUBDISK_LOAD(sd);
                if ((mask & (1 << sd->sd_pos)) != 0)
                        continue;
                switch (sd->sd_state) {
                case G_RAID_SUBDISK_S_ACTIVE:
                        break;
                case G_RAID_SUBDISK_S_RESYNC:
                        if (offset + off < sd->sd_rebuild_pos)
                                break;
                        /* FALLTHROUGH */
                case G_RAID_SUBDISK_S_STALE:
                        prio += i << 24;
                        break;
                case G_RAID_SUBDISK_S_REBUILD:
                        if (offset + off < sd->sd_rebuild_pos)
                                break;
                        /* FALLTHROUGH */
                default:
                        continue;
                }
                prio += min(sd->sd_recovery, 255) << 16;
                /* If disk head is precisely in position - highly prefer it. */
                if (G_RAID_SUBDISK_POS(sd) == offset)
                        prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
                else
                /* If disk head is close to position - prefer it. */
                if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
                    G_RAID_SUBDISK_TRACK_SIZE)
                        prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
                if (prio < bestprio) {
                        bestprio = prio;
                        best = i;
                }
        }
        return (best);
}

static void
g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
{
        struct g_raid_volume *vol;
        struct g_raid_subdisk *sd;
        struct bio_queue_head queue;
        struct bio *cbp;
        char *addr;
        off_t offset, start, length, remain;
        u_int no, strip_size;
        int best;

        vol = tr->tro_volume;
        addr = bp->bio_data;
        strip_size = vol->v_strip_size;
        V2P(vol, bp->bio_offset, &no, &offset, &start);
        remain = bp->bio_length;
        bioq_init(&queue);
        while (remain > 0) {
                length = MIN(strip_size - start, remain);
                best = g_raid_tr_raid1e_select_read_disk(vol,
                    no, offset, length, 0);
                KASSERT(best >= 0, ("No readable disk in volume %s!",
                    vol->v_name));
                no += best;
                if (no >= vol->v_disks_count) {
                        no -= vol->v_disks_count;
                        offset += strip_size;
                }
                cbp = g_clone_bio(bp);
                if (cbp == NULL)
                        goto failure;
                cbp->bio_offset = offset + start;
                cbp->bio_data = addr;
                cbp->bio_length = length;
                cbp->bio_caller1 = &vol->v_subdisks[no];
                bioq_insert_tail(&queue, cbp);
                no += N - best;
                if (no >= vol->v_disks_count) {
                        no -= vol->v_disks_count;
                        offset += strip_size;
                }
                remain -= length;
                addr += length;
                start = 0;
        }
        for (cbp = bioq_first(&queue); cbp != NULL;
            cbp = bioq_first(&queue)) {
                bioq_remove(&queue, cbp);
                sd = cbp->bio_caller1;
                cbp->bio_caller1 = NULL;
                g_raid_subdisk_iostart(sd, cbp);
        }
        return;
failure:
        for (cbp = bioq_first(&queue); cbp != NULL;
            cbp = bioq_first(&queue)) {
                bioq_remove(&queue, cbp);
                g_destroy_bio(cbp);
        }
        if (bp->bio_error == 0)
                bp->bio_error = ENOMEM;
        g_raid_iodone(bp, bp->bio_error);
}

static void
g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
{
        struct g_raid_volume *vol;
        struct g_raid_subdisk *sd;
        struct bio_queue_head queue;
        struct bio *cbp;
        char *addr;
        off_t offset, start, length, remain;
        u_int no, strip_size;
        int i;

        vol = tr->tro_volume;
        addr = bp->bio_data;
        strip_size = vol->v_strip_size;
        V2P(vol, bp->bio_offset, &no, &offset, &start);
        remain = bp->bio_length;
        bioq_init(&queue);
        while (remain > 0) {
                length = MIN(strip_size - start, remain);
                for (i = 0; i < N; i++) {
                        sd = &vol->v_subdisks[no];
                        switch (sd->sd_state) {
                        case G_RAID_SUBDISK_S_ACTIVE:
                        case G_RAID_SUBDISK_S_STALE:
                        case G_RAID_SUBDISK_S_RESYNC:
                                break;
                        case G_RAID_SUBDISK_S_REBUILD:
                                if (offset + start >= sd->sd_rebuild_pos)
                                        goto nextdisk;
                                break;
                        default:
                                goto nextdisk;
                        }
                        cbp = g_clone_bio(bp);
                        if (cbp == NULL)
                                goto failure;
                        cbp->bio_offset = offset + start;
                        cbp->bio_data = addr;
                        cbp->bio_length = length;
                        cbp->bio_caller1 = sd;
                        bioq_insert_tail(&queue, cbp);
nextdisk:
                        if (++no >= vol->v_disks_count) {
                                no = 0;
                                offset += strip_size;
                        }
                }
                remain -= length;
                addr += length;
                start = 0;
        }
        for (cbp = bioq_first(&queue); cbp != NULL;
            cbp = bioq_first(&queue)) {
                bioq_remove(&queue, cbp);
                sd = cbp->bio_caller1;
                cbp->bio_caller1 = NULL;
                g_raid_subdisk_iostart(sd, cbp);
        }
        return;
failure:
        for (cbp = bioq_first(&queue); cbp != NULL;
            cbp = bioq_first(&queue)) {
                bioq_remove(&queue, cbp);
                g_destroy_bio(cbp);
        }
        if (bp->bio_error == 0)
                bp->bio_error = ENOMEM;
        g_raid_iodone(bp, bp->bio_error);
}

static void
g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
{
        struct g_raid_volume *vol;
        struct g_raid_tr_raid1e_object *trs;

        vol = tr->tro_volume;
        trs = (struct g_raid_tr_raid1e_object *)tr;
        if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
            vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
            vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
                g_raid_iodone(bp, EIO);
                return;
        }
        /*
         * If we're rebuilding, squeeze in rebuild activity every so often,
         * even when the disk is busy.  Be sure to only count real I/O
         * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
         * by this module.
         */
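        /*
         * With the default rebuild_fair_io of 20, this lets roughly one
         * rebuild slab through for every 20 regular I/Os while the volume
         * stays busy.
         */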
        if (trs->trso_failed_sd != NULL &&
            !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
                /* Make this new or already running round short. */
                trs->trso_recover_slabs = 0;
                if (--trs->trso_fair_io <= 0) {
                        trs->trso_fair_io = g_raid1e_rebuild_fair_io;
                        g_raid_tr_raid1e_rebuild_some(tr);
                }
        }
        switch (bp->bio_cmd) {
        case BIO_READ:
                g_raid_tr_iostart_raid1e_read(tr, bp);
                break;
        case BIO_WRITE:
                g_raid_tr_iostart_raid1e_write(tr, bp);
                break;
        case BIO_DELETE:
                g_raid_iodone(bp, EIO);
                break;
        case BIO_FLUSH:
                g_raid_tr_flush_common(tr, bp);
                break;
        default:
                KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
                    bp->bio_cmd, vol->v_name));
                break;
        }
}

static void
g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, struct bio *bp)
{
        struct bio *cbp;
        struct g_raid_subdisk *nsd;
        struct g_raid_volume *vol;
        struct bio *pbp;
        struct g_raid_tr_raid1e_object *trs;
        off_t virtual, offset, start;
        uintptr_t mask;
        int error, do_write, copy, disk, best;

        trs = (struct g_raid_tr_raid1e_object *)tr;
        vol = tr->tro_volume;
        if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
                if (trs->trso_type == TR_RAID1E_REBUILD) {
                        nsd = trs->trso_failed_sd;
                        if (bp->bio_cmd == BIO_READ) {

                                /* Immediately abort rebuild, if requested. */
                                if (trs->trso_flags & TR_RAID1E_F_ABORT) {
                                        trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
                                        g_raid_tr_raid1e_rebuild_abort(tr);
                                        return;
                                }

                                /* On read error, skip and cross fingers. */
                                if (bp->bio_error != 0) {
                                        G_RAID_LOGREQ(0, bp,
                                            "Read error during rebuild (%d), "
                                            "possible data loss!",
                                            bp->bio_error);
                                        goto rebuild_round_done;
                                }

                                /*
                                 * The read operation finished, queue the
                                 * write and get out.
                                 */
                                G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
                                    bp->bio_error);
                                bp->bio_cmd = BIO_WRITE;
                                bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
                                bp->bio_offset = nsd->sd_rebuild_pos;
                                G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
                                g_raid_subdisk_iostart(nsd, bp);
                        } else {
                                /*
                                 * The write operation just finished.  Do
                                 * another.  We keep cloning the master bio
                                 * since it has the right buffers allocated to
                                 * it.
                                 */
                                G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
                                    bp->bio_error);
                                if (bp->bio_error != 0 ||
                                    trs->trso_flags & TR_RAID1E_F_ABORT) {
                                        if ((trs->trso_flags &
                                            TR_RAID1E_F_ABORT) == 0) {
                                                g_raid_tr_raid1e_fail_disk(sd->sd_softc,
                                                    nsd, nsd->sd_disk);
                                        }
                                        trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
                                        g_raid_tr_raid1e_rebuild_abort(tr);
                                        return;
                                }
rebuild_round_done:
                                trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
                                g_raid_unlock_range(tr->tro_volume,
                                    trs->trso_lock_pos, trs->trso_lock_len);
                                nsd->sd_rebuild_pos += bp->bio_length;
                                if (nsd->sd_rebuild_pos >= nsd->sd_size) {
                                        g_raid_tr_raid1e_rebuild_finish(tr);
                                        return;
                                }

                                /* Abort rebuild if we are stopping. */
                                if (trs->trso_stopping) {
                                        trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
                                        g_raid_tr_raid1e_rebuild_abort(tr);
                                        return;
                                }

                                if (--trs->trso_meta_update <= 0) {
                                        g_raid_write_metadata(vol->v_softc,
                                            vol, nsd, nsd->sd_disk);
                                        trs->trso_meta_update =
                                            g_raid1e_rebuild_meta_update;
                                        /* Compensate for short rebuild I/Os. */
                                        if ((vol->v_disks_count % N) != 0 &&
                                            vol->v_strip_size <
                                             g_raid1e_rebuild_slab) {
                                                trs->trso_meta_update *=
                                                    g_raid1e_rebuild_slab;
                                                trs->trso_meta_update /=
                                                    vol->v_strip_size;
                                        }
                                }
                                trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
                                if (--trs->trso_recover_slabs <= 0)
                                        return;
                                /* Run next rebuild iteration. */
                                g_raid_tr_raid1e_rebuild_some(tr);
                        }
                } else if (trs->trso_type == TR_RAID1E_RESYNC) {
                        /*
                         * Read good sd, read bad sd in parallel.  When both
                         * are done, compare the buffers.  Write the good data
                         * to the bad if different.  Do the next bit of work.
                         */
                        panic("Somehow, we think we're doing a resync");
                }
                return;
        }
        pbp = bp->bio_parent;
        pbp->bio_inbed++;
        mask = (intptr_t)bp->bio_caller2;
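        /*
         * bio_caller2 carries a bitmask through the retry path: its low
         * bits mark copies that have already been tried, while bit 31
         * flags that a recovery write is still pending for this range.
         */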
        if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
                /*
                 * Read failed on first drive.  Retry the read error on
                 * another disk drive, if available, before erroring out the
                 * read.
                 */
                sd->sd_disk->d_read_errs++;
                G_RAID_LOGREQ(0, bp,
                    "Read error (%d), %d read errors total",
                    bp->bio_error, sd->sd_disk->d_read_errs);

                /*
                 * If there are too many read errors, we move to degraded.
                 * XXX Do we want to FAIL the drive (e.g., make the user redo
                 * everything to get it back in sync), or just degrade the
                 * drive, which kicks off a resync?
                 */
                do_write = 0;
                if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
                        g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
                else if (mask == 0)
                        do_write = 1;

                /* Restore what we were doing. */
                P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
                V2P(vol, virtual, &disk, &offset, &start);

                /* Find the other disk, and try to do the I/O to it. */
                mask |= 1 << copy;
                best = g_raid_tr_raid1e_select_read_disk(vol,
                    disk, offset, start, mask);
                if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
                        disk += best;
                        if (disk >= vol->v_disks_count) {
                                disk -= vol->v_disks_count;
                                offset += vol->v_strip_size;
                        }
                        cbp->bio_offset = offset + start;
                        cbp->bio_length = bp->bio_length;
                        cbp->bio_data = bp->bio_data;
                        g_destroy_bio(bp);
                        nsd = &vol->v_subdisks[disk];
                        G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
                            nsd->sd_pos);
                        if (do_write)
                                mask |= 1 << 31;
                        if ((mask & (1 << 31)) != 0)
                                sd->sd_recovery++;
                        cbp->bio_caller2 = (void *)mask;
                        if (do_write) {
                                cbp->bio_caller1 = nsd;
                                /* Lock callback starts I/O */
                                g_raid_lock_range(sd->sd_volume,
                                    virtual, cbp->bio_length, pbp, cbp);
                        } else {
                                g_raid_subdisk_iostart(nsd, cbp);
                        }
                        return;
                }
                /*
                 * We can't retry.  Return the original error by falling
                 * through.  This will happen when there's only one good disk.
                 * We don't need to fail the raid, since its actual state is
                 * based on the state of the subdisks.
                 */
                G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
        }
        if (bp->bio_cmd == BIO_READ &&
            bp->bio_error == 0 &&
            (mask & (1 << 31)) != 0) {
                G_RAID_LOGREQ(3, bp, "Recovered data from other drive");

                /* Restore what we were doing. */
                P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
                V2P(vol, virtual, &disk, &offset, &start);

                /* Find best disk to write. */
                best = g_raid_tr_raid1e_select_read_disk(vol,
                    disk, offset, start, ~mask);
                if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
                        disk += best;
                        if (disk >= vol->v_disks_count) {
                                disk -= vol->v_disks_count;
                                offset += vol->v_strip_size;
                        }
                        cbp->bio_offset = offset + start;
                        cbp->bio_length = bp->bio_length;
                        cbp->bio_data = bp->bio_data;
                        cbp->bio_cmd = BIO_WRITE;
                        cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
                        cbp->bio_caller2 = (void *)mask;
                        g_destroy_bio(bp);
                        G_RAID_LOGREQ(2, cbp,
                            "Attempting bad sector remap on failing drive.");
                        g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
                        return;
                }
        }
        if ((mask & (1 << 31)) != 0) {
                /*
                 * We're done with a recovery, mark the range as unlocked.
                 * For any write errors, we aggressively fail the disk since
                 * there was both a READ and a WRITE error at this location.
                 * Both types of errors generally indicate the drive is on
                 * the verge of total failure anyway.  Better to stop trusting
                 * it now.  However, we need to reset error to 0 in that case
                 * because we're not failing the original I/O which succeeded.
                 */

                /* Restore what we were doing. */
                P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
                V2P(vol, virtual, &disk, &offset, &start);

                for (copy = 0; copy < N; copy++) {
                        if ((mask & (1 << copy)) != 0)
                                vol->v_subdisks[(disk + copy) %
                                    vol->v_disks_count].sd_recovery--;
                }

                if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
                        G_RAID_LOGREQ(0, bp, "Remap write failed: "
                            "failing subdisk.");
                        g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
                        bp->bio_error = 0;
                }
                G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
                g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
        }
        if (pbp->bio_cmd != BIO_READ) {
                if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
                        pbp->bio_error = bp->bio_error;
                if (bp->bio_error != 0) {
                        G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
                        g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
                }
                error = pbp->bio_error;
        } else
                error = bp->bio_error;
        g_destroy_bio(bp);
        if (pbp->bio_children == pbp->bio_inbed) {
                pbp->bio_completed = pbp->bio_length;
                g_raid_iodone(pbp, error);
        }
}

static int
g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
{
        struct g_raid_volume *vol;
        struct g_raid_subdisk *sd;
        struct bio_queue_head queue;
        char *addr;
        off_t offset, start, length, remain;
        u_int no, strip_size;
        int i, error;

        vol = tr->tro_volume;
        addr = virtual;
        strip_size = vol->v_strip_size;
        V2P(vol, boffset, &no, &offset, &start);
        remain = blength;
        bioq_init(&queue);
        while (remain > 0) {
                length = MIN(strip_size - start, remain);
                for (i = 0; i < N; i++) {
                        sd = &vol->v_subdisks[no];
                        switch (sd->sd_state) {
                        case G_RAID_SUBDISK_S_ACTIVE:
                        case G_RAID_SUBDISK_S_STALE:
                        case G_RAID_SUBDISK_S_RESYNC:
                                break;
                        case G_RAID_SUBDISK_S_REBUILD:
                                if (offset + start >= sd->sd_rebuild_pos)
                                        goto nextdisk;
                                break;
                        default:
                                goto nextdisk;
                        }
                        error = g_raid_subdisk_kerneldump(sd,
                            addr, 0, offset + start, length);
                        if (error != 0)
                                return (error);
nextdisk:
                        if (++no >= vol->v_disks_count) {
                                no = 0;
                                offset += strip_size;
                        }
                }
                remain -= length;
                addr += length;
                start = 0;
        }
        return (0);
}

static int
g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
{
        struct bio *bp;
        struct g_raid_subdisk *sd;

        bp = (struct bio *)argp;
        sd = (struct g_raid_subdisk *)bp->bio_caller1;
        g_raid_subdisk_iostart(sd, bp);

        return (0);
}

static int
g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
{
        struct g_raid_tr_raid1e_object *trs;
        struct g_raid_volume *vol;

        vol = tr->tro_volume;
        trs = (struct g_raid_tr_raid1e_object *)tr;
        trs->trso_fair_io = g_raid1e_rebuild_fair_io;
        trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
        /* Compensate for short rebuild I/Os. */
        if ((vol->v_disks_count % N) != 0 &&
            vol->v_strip_size < g_raid1e_rebuild_slab) {
                trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
                trs->trso_recover_slabs /= vol->v_strip_size;
        }
        if (trs->trso_type == TR_RAID1E_REBUILD)
                g_raid_tr_raid1e_rebuild_some(tr);
        return (0);
}

static int
g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
{
        struct g_raid_tr_raid1e_object *trs;

        trs = (struct g_raid_tr_raid1e_object *)tr;

        if (trs->trso_buffer != NULL) {
                free(trs->trso_buffer, M_TR_RAID1E);
                trs->trso_buffer = NULL;
        }
        return (0);
}

G_RAID_TR_DECLARE(raid1e, "RAID1E");