sys/geom/raid/tr_raid1e.c

   1 /*-
   2  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  */
  26
  27 #include <sys/cdefs.h>
  28 __FBSDID("$FreeBSD$");
  29
  30 #include <sys/param.h>
  31 #include <sys/bio.h>
  32 #include <sys/endian.h>
  33 #include <sys/kernel.h>
  34 #include <sys/kobj.h>
  35 #include <sys/limits.h>
  36 #include <sys/lock.h>
  37 #include <sys/malloc.h>
  38 #include <sys/mutex.h>
  39 #include <sys/sysctl.h>
  40 #include <sys/systm.h>
  41 #include <geom/geom.h>
  42 #include "geom/raid/g_raid.h"
  43 #include "g_raid_tr_if.h"
  44
  45 #define N       2       /* Copies of each virtual strip (P2V: copy = nstrip % N). */
  46
  47 SYSCTL_DECL(_kern_geom_raid);
  48 SYSCTL_NODE(_kern_geom_raid, OID_AUTO, raid1e, CTLFLAG_RW, 0,
  49     "RAID1E parameters");
  50
     /*
      * Rebuild slab size in bytes.  It sizes the rebuild buffer and caps the
      * length of a single rebuild read.
      */
  51 #define RAID1E_REBUILD_SLAB     (1 << 20) /* One transaction in a rebuild */
  52 static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
  53 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_slab_size",
  54     &g_raid1e_rebuild_slab);
  55 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RW,
  56     &g_raid1e_rebuild_slab, 0,
  57     "Amount of the disk to rebuild each read/write cycle of the rebuild.");
  58
  59 #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
  60 static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
  61 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_fair_io",
  62     &g_raid1e_rebuild_fair_io);
  63 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RW,
  64     &g_raid1e_rebuild_fair_io, 0,
  65     "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
  66
     /* Default number of slabs handled per triggered rebuild cycle. */
  67 #define RAID1E_REBUILD_CLUSTER_IDLE 100
  68 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
  69 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_cluster_idle",
  70     &g_raid1e_rebuild_cluster_idle);
  71 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW,
  72     &g_raid1e_rebuild_cluster_idle, 0,
  73     "Number of slabs to do each time we trigger a rebuild cycle");
  74
  75 #define RAID1E_REBUILD_META_UPDATE 1024 /* update metadata every 1GB or so */
  76 static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
  77 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_meta_update",
  78     &g_raid1e_rebuild_meta_update);
  79 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RW,
  80     &g_raid1e_rebuild_meta_update, 0,
  81     "When to update the meta data.");
  82
     /* Malloc type used for the rebuild buffer. */
  83 static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
  84
     /* Values stored in trso_type. */
  85 #define TR_RAID1E_NONE 0
  86 #define TR_RAID1E_REBUILD 1
  87 #define TR_RAID1E_RESYNC 2
  88
     /* Bits stored in trso_flags. */
  89 #define TR_RAID1E_F_DOING_SOME  0x1     /* A rebuild request is in flight. */
  90 #define TR_RAID1E_F_LOCKED      0x2     /* A rebuild range lock is held/requested. */
  91 #define TR_RAID1E_F_ABORT       0x4     /* Abort requested while a request was in flight. */
  92
  /*
   * Per-volume state of the RAID1E transformation.  trso_base comes first
   * because the code casts between struct g_raid_tr_object * and this type.
   */
  93 struct g_raid_tr_raid1e_object {
  94         struct g_raid_tr_object  trso_base;
  95         int                      trso_starting; /* Set by taste, cleared by start/stop. */
  96         int                      trso_stopping; /* Set by stop. */
  97         int                      trso_type;     /* One of TR_RAID1E_{NONE,REBUILD,RESYNC}. */
  98         int                      trso_recover_slabs; /* slabs before rest */
  99         int                      trso_fair_io;
 100         int                      trso_meta_update; /* Loaded from g_raid1e_rebuild_meta_update at rebuild start. */
 101         int                      trso_flags;    /* TR_RAID1E_F_* bits. */
 102         struct g_raid_subdisk   *trso_failed_sd; /* Subdisk being rebuilt, or NULL. */
 103         void                    *trso_buffer;    /* Rebuild data buffer. */
 104         off_t                    trso_lock_pos; /* Locked range start. */
 105         off_t                    trso_lock_len; /* Locked range length. */
 106         struct bio               trso_bio;      /* Request reused for rebuild I/O. */
 107 };
 108
 /* Transformation methods implemented by this module. */
 109 static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
 110 static g_raid_tr_event_t g_raid_tr_event_raid1e;
 111 static g_raid_tr_start_t g_raid_tr_start_raid1e;
 112 static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
 113 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
 114 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
 115 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
 116 static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
 117 static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
 118 static g_raid_tr_free_t g_raid_tr_free_raid1e;
 119
     /* kobj method table binding the interface methods to the functions above. */
 120 static kobj_method_t g_raid_tr_raid1e_methods[] = {
 121         KOBJMETHOD(g_raid_tr_taste,     g_raid_tr_taste_raid1e),
 122         KOBJMETHOD(g_raid_tr_event,     g_raid_tr_event_raid1e),
 123         KOBJMETHOD(g_raid_tr_start,     g_raid_tr_start_raid1e),
 124         KOBJMETHOD(g_raid_tr_stop,      g_raid_tr_stop_raid1e),
 125         KOBJMETHOD(g_raid_tr_iostart,   g_raid_tr_iostart_raid1e),
 126         KOBJMETHOD(g_raid_tr_iodone,    g_raid_tr_iodone_raid1e),
 127         KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
 128         KOBJMETHOD(g_raid_tr_locked,    g_raid_tr_locked_raid1e),
 129         KOBJMETHOD(g_raid_tr_idle,      g_raid_tr_idle_raid1e),
 130         KOBJMETHOD(g_raid_tr_free,      g_raid_tr_free_raid1e),
 131         { 0, 0 }        /* Table terminator. */
 132 };
 133
     /* Class descriptor: name, method table, per-object size and priority. */
 134 static struct g_raid_tr_class g_raid_tr_raid1e_class = {
 135         "RAID1E",
 136         g_raid_tr_raid1e_methods,
 137         sizeof(struct g_raid_tr_raid1e_object),
 138         .trc_priority = 200
 139 };
 140
     /* Forward declarations for helpers used before their definitions. */
 141 static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
 142 static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
 143     struct g_raid_subdisk *sd);
 144 static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
 145     int no, off_t off, off_t len, u_int mask);
 146
 147 static inline void
 148 V2P(struct g_raid_volume *vol, off_t virt,
 149     int *disk, off_t *offset, off_t *start)
 150 {
 151         off_t nstrip;
 152         u_int strip_size;
 153
 154         strip_size = vol->v_strip_size;
 155         /* Strip number. */
 156         nstrip = virt / strip_size;
 157         /* Start position in strip. */
 158         *start = virt % strip_size;
 159         /* Disk number. */
 160         *disk = (nstrip * N) % vol->v_disks_count;
 161         /* Strip start position in disk. */
 162         *offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
 163 }
 164
 165 static inline void
 166 P2V(struct g_raid_volume *vol, int disk, off_t offset,
 167     off_t *virt, int *copy)
 168 {
 169         off_t nstrip, start;
 170         u_int strip_size;
 171
 172         strip_size = vol->v_strip_size;
 173         /* Start position in strip. */
 174         start = offset % strip_size;
 175         /* Physical strip number. */
 176         nstrip = (offset / strip_size) * vol->v_disks_count + disk;
 177         /* Number of physical strip (copy) inside virtual strip. */
 178         *copy = nstrip % N;
 179         /* Offset in virtual space. */
 180         *virt = (nstrip / N) * strip_size + start;
 181 }
 182
 183 static int
 184 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
 185 {
 186         struct g_raid_tr_raid1e_object *trs;
 187
 188         trs = (struct g_raid_tr_raid1e_object *)tr;
 189         if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
 190             tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_NONE)
 191                 return (G_RAID_TR_TASTE_FAIL);
 192         trs->trso_starting = 1;
 193         return (G_RAID_TR_TASTE_SUCCEED);
 194 }
 195
 196 static int
 197 g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
 198 {
 199         struct g_raid_softc *sc;
 200         struct g_raid_subdisk *sd, *bestsd, *worstsd;
 201         int i, j, state, sstate;
 202
 203         sc = vol->v_softc;
 204         state = G_RAID_VOLUME_S_OPTIMAL;
 205         for (i = 0; i < vol->v_disks_count / N; i++) {
 206                 bestsd = &vol->v_subdisks[i * N];
 207                 for (j = 1; j < N; j++) {
 208                         sd = &vol->v_subdisks[i * N + j];
 209                         if (sd->sd_state > bestsd->sd_state)
 210                                 bestsd = sd;
 211                         else if (sd->sd_state == bestsd->sd_state &&
 212                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
 213                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
 214                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
 215                                 bestsd = sd;
 216                 }
 217                 if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
 218                     bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
 219                         /* We found reasonable candidate. */
 220                         G_RAID_DEBUG1(1, sc,
 221                             "Promote subdisk %s:%d from %s to ACTIVE.",
 222                             vol->v_name, bestsd->sd_pos,
 223                             g_raid_subdisk_state2str(bestsd->sd_state));
 224                         g_raid_change_subdisk_state(bestsd,
 225                             G_RAID_SUBDISK_S_ACTIVE);
 226                         g_raid_write_metadata(sc,
 227                             vol, bestsd, bestsd->sd_disk);
 228                 }
 229                 worstsd = &vol->v_subdisks[i * N];
 230                 for (j = 1; j < N; j++) {
 231                         sd = &vol->v_subdisks[i * N + j];
 232                         if (sd->sd_state < worstsd->sd_state)
 233                                 worstsd = sd;
 234                 }
 235                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
 236                         sstate = G_RAID_VOLUME_S_OPTIMAL;
 237                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
 238                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
 239                 else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
 240                         sstate = G_RAID_VOLUME_S_DEGRADED;
 241                 else
 242                         sstate = G_RAID_VOLUME_S_BROKEN;
 243                 if (sstate < state)
 244                         state = sstate;
 245         }
 246         return (state);
 247 }
 248
 249 static int
 250 g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
 251 {
 252         struct g_raid_softc *sc;
 253         struct g_raid_subdisk *sd, *bestsd, *worstsd;
 254         int i, j, state, sstate;
 255
 256         sc = vol->v_softc;
 257         if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
 258             vol->v_disks_count)
 259                 return (G_RAID_VOLUME_S_OPTIMAL);
 260         for (i = 0; i < vol->v_disks_count; i++) {
 261                 sd = &vol->v_subdisks[i];
 262                 if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
 263                         /* We found reasonable candidate. */
 264                         G_RAID_DEBUG1(1, sc,
 265                             "Promote subdisk %s:%d from %s to STALE.",
 266                             vol->v_name, sd->sd_pos,
 267                             g_raid_subdisk_state2str(sd->sd_state));
 268                         g_raid_change_subdisk_state(sd,
 269                             G_RAID_SUBDISK_S_STALE);
 270                         g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
 271                 }
 272         }
 273         state = G_RAID_VOLUME_S_OPTIMAL;
 274         for (i = 0; i < vol->v_disks_count; i++) {
 275                 bestsd = &vol->v_subdisks[i];
 276                 worstsd = &vol->v_subdisks[i];
 277                 for (j = 1; j < N; j++) {
 278                         sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
 279                         if (sd->sd_state > bestsd->sd_state)
 280                                 bestsd = sd;
 281                         else if (sd->sd_state == bestsd->sd_state &&
 282                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
 283                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
 284                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
 285                                 bestsd = sd;
 286                         if (sd->sd_state < worstsd->sd_state)
 287                                 worstsd = sd;
 288                 }
 289                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
 290                         sstate = G_RAID_VOLUME_S_OPTIMAL;
 291                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
 292                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
 293                 else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
 294                         sstate = G_RAID_VOLUME_S_DEGRADED;
 295                 else
 296                         sstate = G_RAID_VOLUME_S_BROKEN;
 297                 if (sstate < state)
 298                         state = sstate;
 299         }
 300         return (state);
 301 }
 302
 303 static int
 304 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
 305     struct g_raid_subdisk *sd)
 306 {
 307         struct g_raid_tr_raid1e_object *trs;
 308         struct g_raid_softc *sc;
 309         u_int s;
 310
 311         sc = vol->v_softc;
 312         trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
 313         if (trs->trso_stopping &&
 314             (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
 315                 s = G_RAID_VOLUME_S_STOPPED;
 316         else if (trs->trso_starting)
 317                 s = G_RAID_VOLUME_S_STARTING;
 318         else {
 319                 if ((vol->v_disks_count % N) == 0)
 320                         s = g_raid_tr_update_state_raid1e_even(vol);
 321                 else
 322                         s = g_raid_tr_update_state_raid1e_odd(vol);
 323         }
 324         if (s != vol->v_state) {
 325                 g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
 326                     G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
 327                     G_RAID_EVENT_VOLUME);
 328                 g_raid_change_volume_state(vol, s);
 329                 if (!trs->trso_starting && !trs->trso_stopping)
 330                         g_raid_write_metadata(sc, vol, NULL, NULL);
 331         }
 332         if (!trs->trso_starting && !trs->trso_stopping)
 333                 g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
 334         return (0);
 335 }
 336
 337 static void
 338 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
 339     struct g_raid_disk *disk)
 340 {
 341         /*
 342          * We don't fail the last disk in the pack, since it still has decent
 343          * data on it and that's better than failing the disk if it is the root
 344          * file system.
 345          *
 346          * XXX should this be controlled via a tunable?  It makes sense for
 347          * the volume that has / on it.  I can't think of a case where we'd
 348          * want the volume to go away on this kind of event.
 349          */
 350         if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 &&
 351             g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd)
 352                 return;
 353         g_raid_fail_disk(sc, sd, disk);
 354 }
 355
 356 static void
 357 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
 358 {
 359         struct g_raid_volume *vol;
 360         struct g_raid_subdisk *sd;
 361
 362         vol = trs->trso_base.tro_volume;
 363         sd = trs->trso_failed_sd;
 364         g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
 365         free(trs->trso_buffer, M_TR_RAID1E);
 366         trs->trso_buffer = NULL;
 367         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 368         trs->trso_type = TR_RAID1E_NONE;
 369         trs->trso_recover_slabs = 0;
 370         trs->trso_failed_sd = NULL;
 371         g_raid_tr_update_state_raid1e(vol, NULL);
 372 }
 373
 374 static void
 375 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
 376 {
 377         struct g_raid_tr_raid1e_object *trs;
 378         struct g_raid_subdisk *sd;
 379
 380         trs = (struct g_raid_tr_raid1e_object *)tr;
 381         sd = trs->trso_failed_sd;
 382         G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
 383             "Subdisk %s:%d-%s rebuild completed.",
 384             sd->sd_volume->v_name, sd->sd_pos,
 385             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
 386         g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
 387         sd->sd_rebuild_pos = 0;
 388         g_raid_tr_raid1e_rebuild_done(trs);
 389 }
 390
 391 static void
 392 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
 393 {
 394         struct g_raid_tr_raid1e_object *trs;
 395         struct g_raid_subdisk *sd;
 396         struct g_raid_volume *vol;
 397
 398         vol = tr->tro_volume;
 399         trs = (struct g_raid_tr_raid1e_object *)tr;
 400         sd = trs->trso_failed_sd;
 401         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
 402                 G_RAID_DEBUG1(1, vol->v_softc,
 403                     "Subdisk %s:%d-%s rebuild is aborting.",
 404                     sd->sd_volume->v_name, sd->sd_pos,
 405                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
 406                 trs->trso_flags |= TR_RAID1E_F_ABORT;
 407         } else {
 408                 G_RAID_DEBUG1(0, vol->v_softc,
 409                     "Subdisk %s:%d-%s rebuild aborted.",
 410                     sd->sd_volume->v_name, sd->sd_pos,
 411                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
 412                 trs->trso_flags &= ~TR_RAID1E_F_ABORT;
 413                 if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
 414                         trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
 415                         g_raid_unlock_range(tr->tro_volume,
 416                             trs->trso_lock_pos, trs->trso_lock_len);
 417                 }
 418                 g_raid_tr_raid1e_rebuild_done(trs);
 419         }
 420 }
 421
 422 static void
 423 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
 424 {
 425         struct g_raid_tr_raid1e_object *trs;
 426         struct g_raid_softc *sc;
 427         struct g_raid_volume *vol;
 428         struct g_raid_subdisk *sd;
 429         struct bio *bp;
 430         off_t len, virtual, vend, offset, start;
 431         int disk, copy, best;
 432
 433         trs = (struct g_raid_tr_raid1e_object *)tr;
 434         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
 435                 return;
 436         vol = tr->tro_volume;
 437         sc = vol->v_softc;
 438         sd = trs->trso_failed_sd;
 439
 440         while (1) {
 441                 if (sd->sd_rebuild_pos >= sd->sd_size) {
 442                         g_raid_tr_raid1e_rebuild_finish(tr);
 443                         return;
 444                 }
 445                 /* Get virtual offset from physical rebuild position. */
 446                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
 447                 /* Get physical offset back to get first stripe position. */
 448                 V2P(vol, virtual, &disk, &offset, &start);
 449                 /* Calculate contignous data length. */
 450                 len = MIN(g_raid1e_rebuild_slab,
 451                     sd->sd_size - sd->sd_rebuild_pos);
 452                 if ((vol->v_disks_count % N) != 0)
 453                         len = MIN(len, vol->v_strip_size - start);
 454                 /* Find disk with most accurate data. */
 455                 best = g_raid_tr_raid1e_select_read_disk(vol, disk,
 456                     offset + start, len, 0);
 457                 if (best < 0) {
 458                         /* There is no any valid disk. */
 459                         g_raid_tr_raid1e_rebuild_abort(tr);
 460                         return;
 461                 } else if (best != copy) {
 462                         /* Some other disk has better data. */
 463                         break;
 464                 }
 465                 /* We have the most accurate data. Skip the range. */
 466                 G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
 467                     sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
 468                 sd->sd_rebuild_pos += len;
 469         }
 470
 471         bp = &trs->trso_bio;
 472         memset(bp, 0, sizeof(*bp));
 473         bp->bio_offset = offset + start +
 474             ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
 475         bp->bio_length = len;
 476         bp->bio_data = trs->trso_buffer;
 477         bp->bio_cmd = BIO_READ;
 478         bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
 479         bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
 480         G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
 481         /*
 482          * If we are crossing stripe boundary, correct affected virtual
 483          * range we should lock.
 484          */
 485         if (start + len > vol->v_strip_size) {
 486                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
 487                 len = vend - virtual;
 488         }
 489         trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
 490         trs->trso_flags |= TR_RAID1E_F_LOCKED;
 491         trs->trso_lock_pos = virtual;
 492         trs->trso_lock_len = len;
 493         /* Lock callback starts I/O */
 494         g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
 495 }
 496
 497 static void
 498 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
 499 {
 500         struct g_raid_volume *vol;
 501         struct g_raid_tr_raid1e_object *trs;
 502         struct g_raid_subdisk *sd;
 503
 504         vol = tr->tro_volume;
 505         trs = (struct g_raid_tr_raid1e_object *)tr;
 506         if (trs->trso_failed_sd) {
 507                 G_RAID_DEBUG1(1, vol->v_softc,
 508                     "Already rebuild in start rebuild. pos %jd\n",
 509                     (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
 510                 return;
 511         }
 512         sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
 513         if (sd == NULL)
 514                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
 515         if (sd == NULL) {
 516                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
 517                 if (sd != NULL) {
 518                         sd->sd_rebuild_pos = 0;
 519                         g_raid_change_subdisk_state(sd,
 520                             G_RAID_SUBDISK_S_RESYNC);
 521                         g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
 522                 } else {
 523                         sd = g_raid_get_subdisk(vol,
 524                             G_RAID_SUBDISK_S_UNINITIALIZED);
 525                         if (sd == NULL)
 526                                 sd = g_raid_get_subdisk(vol,
 527                                     G_RAID_SUBDISK_S_NEW);
 528                         if (sd != NULL) {
 529                                 sd->sd_rebuild_pos = 0;
 530                                 g_raid_change_subdisk_state(sd,
 531                                     G_RAID_SUBDISK_S_REBUILD);
 532                                 g_raid_write_metadata(vol->v_softc,
 533                                     vol, sd, NULL);
 534                         }
 535                 }
 536         }
 537         if (sd == NULL) {
 538                 G_RAID_DEBUG1(1, vol->v_softc,
 539                     "No failed disk to rebuild.  night night.");
 540                 return;
 541         }
 542         trs->trso_failed_sd = sd;
 543         G_RAID_DEBUG1(0, vol->v_softc,
 544             "Subdisk %s:%d-%s rebuild start at %jd.",
 545             sd->sd_volume->v_name, sd->sd_pos,
 546             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
 547             trs->trso_failed_sd->sd_rebuild_pos);
 548         trs->trso_type = TR_RAID1E_REBUILD;
 549         trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
 550         trs->trso_meta_update = g_raid1e_rebuild_meta_update;
 551         g_raid_tr_raid1e_rebuild_some(tr);
 552 }
 553
 554 static void
 555 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
 556     struct g_raid_subdisk *sd)
 557 {
 558         struct g_raid_volume *vol;
 559         struct g_raid_tr_raid1e_object *trs;
 560         int nr;
 561
 562         vol = tr->tro_volume;
 563         trs = (struct g_raid_tr_raid1e_object *)tr;
 564         if (trs->trso_stopping)
 565                 return;
 566         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
 567             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
 568         switch(trs->trso_type) {
 569         case TR_RAID1E_NONE:
 570                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
 571                         return;
 572                 if (nr == 0) {
 573                         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
 574                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
 575                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
 576                         if (nr == 0)
 577                                 return;
 578                 }
 579                 g_raid_tr_raid1e_rebuild_start(tr);
 580                 break;
 581         case TR_RAID1E_REBUILD:
 582                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
 583                     trs->trso_failed_sd == sd)
 584                         g_raid_tr_raid1e_rebuild_abort(tr);
 585                 break;
 586         case TR_RAID1E_RESYNC:
 587                 break;
 588         }
 589 }
 590
 591 static int
 592 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
 593     struct g_raid_subdisk *sd, u_int event)
 594 {
 595
 596         g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
 597         return (0);
 598 }
 599
 600 static int
 601 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
 602 {
 603         struct g_raid_tr_raid1e_object *trs;
 604         struct g_raid_volume *vol;
 605
 606         trs = (struct g_raid_tr_raid1e_object *)tr;
 607         vol = tr->tro_volume;
 608         trs->trso_starting = 0;
 609         g_raid_tr_update_state_raid1e(vol, NULL);
 610         return (0);
 611 }
 612
 613 static int
 614 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
 615 {
 616         struct g_raid_tr_raid1e_object *trs;
 617         struct g_raid_volume *vol;
 618
 619         trs = (struct g_raid_tr_raid1e_object *)tr;
 620         vol = tr->tro_volume;
 621         trs->trso_starting = 0;
 622         trs->trso_stopping = 1;
 623         g_raid_tr_update_state_raid1e(vol, NULL);
 624         return (0);
 625 }
 626
 627 /*
 628  * Select the disk to read from.  Take into account: subdisk state, running
 629  * error recovery, average disk load, head position and possible cache hits.
 630  */
 631 #define ABS(x)          (((x) >= 0) ? (x) : (-(x)))
 632 static int
 633 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
 634     int no, off_t off, off_t len, u_int mask)
 635 {
 636         struct g_raid_subdisk *sd;
 637         off_t offset;
 638         int i, best, prio, bestprio;
 639
 640         best = -1;
 641         bestprio = INT_MAX;
 642         for (i = 0; i < N; i++) {
 643                 sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
 644                 offset = off;
 645                 if (no + i >= vol->v_disks_count)
 646                         offset += vol->v_strip_size;
 647
 648                 prio = G_RAID_SUBDISK_LOAD(sd);
 649                 if ((mask & (1 << sd->sd_pos)) != 0)
 650                         continue;
 651                 switch (sd->sd_state) {
 652                 case G_RAID_SUBDISK_S_ACTIVE:
 653                         break;
 654                 case G_RAID_SUBDISK_S_RESYNC:
 655                         if (offset + off < sd->sd_rebuild_pos)
 656                                 break;
 657                         /* FALLTHROUGH */
 658                 case G_RAID_SUBDISK_S_STALE:
 659                         prio += i << 24;
 660                         break;
 661                 case G_RAID_SUBDISK_S_REBUILD:
 662                         if (offset + off < sd->sd_rebuild_pos)
 663                                 break;
 664                         /* FALLTHROUGH */
 665                 default:
 666                         continue;
 667                 }
 668                 prio += min(sd->sd_recovery, 255) << 16;
 669                 /* If disk head is precisely in position - highly prefer it. */
 670                 if (G_RAID_SUBDISK_POS(sd) == offset)
 671                         prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
 672                 else
 673                 /* If disk head is close to position - prefer it. */
 674                 if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
 675                     G_RAID_SUBDISK_TRACK_SIZE)
 676                         prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
 677                 if (prio < bestprio) {
 678                         bestprio = prio;
 679                         best = i;
 680                 }
 681         }
 682         return (best);
 683 }
 684
 685 static void
 686 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
 687 {
 688         struct g_raid_volume *vol;
 689         struct g_raid_subdisk *sd;
 690         struct bio_queue_head queue;
 691         struct bio *cbp;
 692         char *addr;
 693         off_t offset, start, length, remain;
 694         u_int no, strip_size;
 695         int best;
 696
 697         vol = tr->tro_volume;
 698         addr = bp->bio_data;
 699         strip_size = vol->v_strip_size;
 700         V2P(vol, bp->bio_offset, &no, &offset, &start);
 701         remain = bp->bio_length;
 702         bioq_init(&queue);
 703         while (remain > 0) {
 704                 length = MIN(strip_size - start, remain);
 705                 best = g_raid_tr_raid1e_select_read_disk(vol,
 706                     no, offset, length, 0);
 707                 KASSERT(best >= 0, ("No readable disk in volume %s!",
 708                     vol->v_name));
 709                 no += best;
 710                 if (no >= vol->v_disks_count) {
 711                         no -= vol->v_disks_count;
 712                         offset += strip_size;
 713                 }
 714                 cbp = g_clone_bio(bp);
 715                 if (cbp == NULL)
 716                         goto failure;
 717                 cbp->bio_offset = offset + start;
 718                 cbp->bio_data = addr;
 719                 cbp->bio_length = length;
 720                 cbp->bio_caller1 = &vol->v_subdisks[no];
 721                 bioq_insert_tail(&queue, cbp);
 722                 no += N - best;
 723                 if (no >= vol->v_disks_count) {
 724                         no -= vol->v_disks_count;
 725                         offset += strip_size;
 726                 }
 727                 remain -= length;
 728                 addr += length;
 729                 start = 0;
 730         }
 731         for (cbp = bioq_first(&queue); cbp != NULL;
 732             cbp = bioq_first(&queue)) {
 733                 bioq_remove(&queue, cbp);
 734                 sd = cbp->bio_caller1;
 735                 cbp->bio_caller1 = NULL;
 736                 g_raid_subdisk_iostart(sd, cbp);
 737         }
 738         return;
 739 failure:
 740         for (cbp = bioq_first(&queue); cbp != NULL;
 741             cbp = bioq_first(&queue)) {
 742                 bioq_remove(&queue, cbp);
 743                 g_destroy_bio(cbp);
 744         }
 745         if (bp->bio_error == 0)
 746                 bp->bio_error = ENOMEM;
 747         g_raid_iodone(bp, bp->bio_error);
 748 }
 749
 750 static void
 751 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
 752 {
 753         struct g_raid_volume *vol;
 754         struct g_raid_subdisk *sd;
 755         struct bio_queue_head queue;
 756         struct bio *cbp;
 757         char *addr;
 758         off_t offset, start, length, remain;
 759         u_int no, strip_size;
 760         int i;
 761
 762         vol = tr->tro_volume;
 763         addr = bp->bio_data;
 764         strip_size = vol->v_strip_size;
 765         V2P(vol, bp->bio_offset, &no, &offset, &start);
 766         remain = bp->bio_length;
 767         bioq_init(&queue);
 768         while (remain > 0) {
 769                 length = MIN(strip_size - start, remain);
 770                 for (i = 0; i < N; i++) {
 771                         sd = &vol->v_subdisks[no];
 772                         switch (sd->sd_state) {
 773                         case G_RAID_SUBDISK_S_ACTIVE:
 774                         case G_RAID_SUBDISK_S_STALE:
 775                         case G_RAID_SUBDISK_S_RESYNC:
 776                                 break;
 777                         case G_RAID_SUBDISK_S_REBUILD:
 778                                 if (offset + start >= sd->sd_rebuild_pos)
 779                                         goto nextdisk;
 780                                 break;
 781                         default:
 782                                 goto nextdisk;
 783                         }
 784                         cbp = g_clone_bio(bp);
 785                         if (cbp == NULL)
 786                                 goto failure;
 787                         cbp->bio_offset = offset + start;
 788                         cbp->bio_data = addr;
 789                         cbp->bio_length = length;
 790                         cbp->bio_caller1 = sd;
 791                         bioq_insert_tail(&queue, cbp);
 792 nextdisk:
 793                         if (++no >= vol->v_disks_count) {
 794                                 no = 0;
 795                                 offset += strip_size;
 796                         }
 797                 }
 798                 remain -= length;
 799                 addr += length;
 800                 start = 0;
 801         }
 802         for (cbp = bioq_first(&queue); cbp != NULL;
 803             cbp = bioq_first(&queue)) {
 804                 bioq_remove(&queue, cbp);
 805                 sd = cbp->bio_caller1;
 806                 cbp->bio_caller1 = NULL;
 807                 g_raid_subdisk_iostart(sd, cbp);
 808         }
 809         return;
 810 failure:
 811         for (cbp = bioq_first(&queue); cbp != NULL;
 812             cbp = bioq_first(&queue)) {
 813                 bioq_remove(&queue, cbp);
 814                 g_destroy_bio(cbp);
 815         }
 816         if (bp->bio_error == 0)
 817                 bp->bio_error = ENOMEM;
 818         g_raid_iodone(bp, bp->bio_error);
 819 }
 820
 821 static void
 822 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
 823 {
 824         struct g_raid_volume *vol;
 825         struct g_raid_tr_raid1e_object *trs;
 826
 827         vol = tr->tro_volume;
 828         trs = (struct g_raid_tr_raid1e_object *)tr;
 829         if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
 830             vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
 831             vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
 832                 g_raid_iodone(bp, EIO);
 833                 return;
 834         }
 835         /*
 836          * If we're rebuilding, squeeze in rebuild activity every so often,
 837          * even when the disk is busy.  Be sure to only count real I/O
 838          * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
 839          * by this module.
 840          */
 841         if (trs->trso_failed_sd != NULL &&
 842             !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
 843                 /* Make this new or running now round short. */
 844                 trs->trso_recover_slabs = 0;
 845                 if (--trs->trso_fair_io <= 0) {
 846                         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
 847                         g_raid_tr_raid1e_rebuild_some(tr);
 848                 }
 849         }
 850         switch (bp->bio_cmd) {
 851         case BIO_READ:
 852                 g_raid_tr_iostart_raid1e_read(tr, bp);
 853                 break;
 854         case BIO_WRITE:
 855                 g_raid_tr_iostart_raid1e_write(tr, bp);
 856                 break;
 857         case BIO_DELETE:
 858                 g_raid_iodone(bp, EIO);
 859                 break;
 860         case BIO_FLUSH:
 861                 g_raid_tr_flush_common(tr, bp);
 862                 break;
 863         default:
 864                 KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
 865                     bp->bio_cmd, vol->v_name));
 866                 break;
 867         }
 868 }
 869
 870 static void
 871 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
 872     struct g_raid_subdisk *sd, struct bio *bp)
 873 {
 874         struct bio *cbp;
 875         struct g_raid_subdisk *nsd;
 876         struct g_raid_volume *vol;
 877         struct bio *pbp;
 878         struct g_raid_tr_raid1e_object *trs;
 879         off_t virtual, offset, start;
 880         uintptr_t mask;
 881         int error, do_write, copy, disk, best;
 882
 883         trs = (struct g_raid_tr_raid1e_object *)tr;
 884         vol = tr->tro_volume;
 885         if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
 886                 if (trs->trso_type == TR_RAID1E_REBUILD) {
 887                         nsd = trs->trso_failed_sd;
 888                         if (bp->bio_cmd == BIO_READ) {
 889
 890                                 /* Immediately abort rebuild, if requested. */
 891                                 if (trs->trso_flags & TR_RAID1E_F_ABORT) {
 892                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 893                                         g_raid_tr_raid1e_rebuild_abort(tr);
 894                                         return;
 895                                 }
 896
 897                                 /* On read error, skip and cross fingers. */
 898                                 if (bp->bio_error != 0) {
 899                                         G_RAID_LOGREQ(0, bp,
 900                                             "Read error during rebuild (%d), "
 901                                             "possible data loss!",
 902                                             bp->bio_error);
 903                                         goto rebuild_round_done;
 904                                 }
 905
 906                                 /*
 907                                  * The read operation finished, queue the
 908                                  * write and get out.
 909                                  */
 910                                 G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
 911                                     bp->bio_error);
 912                                 bp->bio_cmd = BIO_WRITE;
 913                                 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
 914                                 bp->bio_offset = nsd->sd_rebuild_pos;
 915                                 G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
 916                                 g_raid_subdisk_iostart(nsd, bp);
 917                         } else {
 918                                 /*
 919                                  * The write operation just finished.  Do
 920                                  * another.  We keep cloning the master bio
 921                                  * since it has the right buffers allocated to
 922                                  * it.
 923                                  */
 924                                 G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
 925                                     bp->bio_error);
 926                                 if (bp->bio_error != 0 ||
 927                                     trs->trso_flags & TR_RAID1E_F_ABORT) {
 928                                         if ((trs->trso_flags &
 929                                             TR_RAID1E_F_ABORT) == 0) {
 930                                                 g_raid_tr_raid1e_fail_disk(sd->sd_softc,
 931                                                     nsd, nsd->sd_disk);
 932                                         }
 933                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 934                                         g_raid_tr_raid1e_rebuild_abort(tr);
 935                                         return;
 936                                 }
 937 rebuild_round_done:
 938                                 trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
 939                                 g_raid_unlock_range(tr->tro_volume,
 940                                     trs->trso_lock_pos, trs->trso_lock_len);
 941                                 nsd->sd_rebuild_pos += bp->bio_length;
 942                                 if (nsd->sd_rebuild_pos >= nsd->sd_size) {
 943                                         g_raid_tr_raid1e_rebuild_finish(tr);
 944                                         return;
 945                                 }
 946
 947                                 /* Abort rebuild if we are stopping */
 948                                 if (trs->trso_stopping) {
 949                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 950                                         g_raid_tr_raid1e_rebuild_abort(tr);
 951                                         return;
 952                                 }
 953
 954                                 if (--trs->trso_meta_update <= 0) {
 955                                         g_raid_write_metadata(vol->v_softc,
 956                                             vol, nsd, nsd->sd_disk);
 957                                         trs->trso_meta_update =
 958                                             g_raid1e_rebuild_meta_update;
 959                                         /* Compensate short rebuild I/Os. */
 960                                         if ((vol->v_disks_count % N) != 0 &&
 961                                             vol->v_strip_size <
 962                                              g_raid1e_rebuild_slab) {
 963                                                 trs->trso_meta_update *=
 964                                                     g_raid1e_rebuild_slab;
 965                                                 trs->trso_meta_update /=
 966                                                     vol->v_strip_size;
 967                                         }
 968                                 }
 969                                 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 970                                 if (--trs->trso_recover_slabs <= 0)
 971                                         return;
 972                                 /* Run next rebuild iteration. */
 973                                 g_raid_tr_raid1e_rebuild_some(tr);
 974                         }
 975                 } else if (trs->trso_type == TR_RAID1E_RESYNC) {
 976                         /*
 977                          * read good sd, read bad sd in parallel.  when both
 978                          * done, compare the buffers.  write good to the bad
 979                          * if different.  do the next bit of work.
 980                          */
 981                         panic("Somehow, we think we're doing a resync");
 982                 }
 983                 return;
 984         }
 985         pbp = bp->bio_parent;
 986         pbp->bio_inbed++;
 987         mask = (intptr_t)bp->bio_caller2;
 988         if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
 989                 /*
 990                  * Read failed on first drive.  Retry the read error on
 991                  * another disk drive, if available, before erroring out the
 992                  * read.
 993                  */
 994                 sd->sd_disk->d_read_errs++;
 995                 G_RAID_LOGREQ(0, bp,
 996                     "Read error (%d), %d read errors total",
 997                     bp->bio_error, sd->sd_disk->d_read_errs);
 998
 999                 /*
1000                  * If there are too many read errors, we move to degraded.
1001                  * XXX Do we want to FAIL the drive (eg, make the user redo
1002                  * everything to get it back in sync), or just degrade the
1003                  * drive, which kicks off a resync?
1004                  */
1005                 do_write = 0;
1006                 if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
1007                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1008                 else if (mask == 0)
1009                         do_write = 1;
1010
1011                 /* Restore what we were doing. */
1012                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1013                 V2P(vol, virtual, &disk, &offset, &start);
1014
1015                 /* Find the other disk, and try to do the I/O to it. */
1016                 mask |= 1 << copy;
1017                 best = g_raid_tr_raid1e_select_read_disk(vol,
1018                     disk, offset, start, mask);
1019                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1020                         disk += best;
1021                         if (disk >= vol->v_disks_count) {
1022                                 disk -= vol->v_disks_count;
1023                                 offset += vol->v_strip_size;
1024                         }
1025                         cbp->bio_offset = offset + start;
1026                         cbp->bio_length = bp->bio_length;
1027                         cbp->bio_data = bp->bio_data;
1028                         g_destroy_bio(bp);
1029                         nsd = &vol->v_subdisks[disk];
1030                         G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
1031                             nsd->sd_pos);
1032                         if (do_write)
1033                                 mask |= 1 << 31;
1034                         if ((mask & (1 << 31)) != 0)
1035                                 sd->sd_recovery++;
1036                         cbp->bio_caller2 = (void *)mask;
1037                         if (do_write) {
1038                                 cbp->bio_caller1 = nsd;
1039                                 /* Lock callback starts I/O */
1040                                 g_raid_lock_range(sd->sd_volume,
1041                                     virtual, cbp->bio_length, pbp, cbp);
1042                         } else {
1043                                 g_raid_subdisk_iostart(nsd, cbp);
1044                         }
1045                         return;
1046                 }
1047                 /*
1048                  * We can't retry.  Return the original error by falling
1049                  * through.  This will happen when there's only one good disk.
1050                  * We don't need to fail the raid, since its actual state is
1051                  * based on the state of the subdisks.
1052                  */
1053                 G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
1054         }
1055         if (bp->bio_cmd == BIO_READ &&
1056             bp->bio_error == 0 &&
1057             (mask & (1 << 31)) != 0) {
1058                 G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
1059
1060                 /* Restore what we were doing. */
1061                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1062                 V2P(vol, virtual, &disk, &offset, &start);
1063
1064                 /* Find best disk to write. */
1065                 best = g_raid_tr_raid1e_select_read_disk(vol,
1066                     disk, offset, start, ~mask);
1067                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1068                         disk += best;
1069                         if (disk >= vol->v_disks_count) {
1070                                 disk -= vol->v_disks_count;
1071                                 offset += vol->v_strip_size;
1072                         }
1073                         cbp->bio_offset = offset + start;
1074                         cbp->bio_length = bp->bio_length;
1075                         cbp->bio_data = bp->bio_data;
1076                         cbp->bio_cmd = BIO_WRITE;
1077                         cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
1078                         cbp->bio_caller2 = (void *)mask;
1079                         g_destroy_bio(bp);
1080                         G_RAID_LOGREQ(2, cbp,
1081                             "Attempting bad sector remap on failing drive.");
1082                         g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
1083                         return;
1084                 }
1085         }
1086         if ((mask & (1 << 31)) != 0) {
1087                 /*
1088                  * We're done with a recovery, mark the range as unlocked.
1089                  * For any write errors, we agressively fail the disk since
1090                  * there was both a READ and a WRITE error at this location.
1091                  * Both types of errors generally indicates the drive is on
1092                  * the verge of total failure anyway.  Better to stop trusting
1093                  * it now.  However, we need to reset error to 0 in that case
1094                  * because we're not failing the original I/O which succeeded.
1095                  */
1096
1097                 /* Restore what we were doing. */
1098                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1099                 V2P(vol, virtual, &disk, &offset, &start);
1100
1101                 for (copy = 0; copy < N; copy++) {
1102                         if ((mask & (1 << copy) ) != 0)
1103                                 vol->v_subdisks[(disk + copy) %
1104                                     vol->v_disks_count].sd_recovery--;
1105                 }
1106
1107                 if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
1108                         G_RAID_LOGREQ(0, bp, "Remap write failed: "
1109                             "failing subdisk.");
1110                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1111                         bp->bio_error = 0;
1112                 }
1113                 G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
1114                 g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
1115         }
1116         error = bp->bio_error;
1117         g_destroy_bio(bp);
1118         if (pbp->bio_children == pbp->bio_inbed) {
1119                 pbp->bio_completed = pbp->bio_length;
1120                 g_raid_iodone(pbp, error);
1121         }
1122 }
1123
1124 static int
1125 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
1126     void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
1127 {
1128         struct g_raid_volume *vol;
1129         struct g_raid_subdisk *sd;
1130         struct bio_queue_head queue;
1131         char *addr;
1132         off_t offset, start, length, remain;
1133         u_int no, strip_size;
1134         int i, error;
1135
1136         vol = tr->tro_volume;
1137         addr = virtual;
1138         strip_size = vol->v_strip_size;
1139         V2P(vol, boffset, &no, &offset, &start);
1140         remain = blength;
1141         bioq_init(&queue);
1142         while (remain > 0) {
1143                 length = MIN(strip_size - start, remain);
1144                 for (i = 0; i < N; i++) {
1145                         sd = &vol->v_subdisks[no];
1146                         switch (sd->sd_state) {
1147                         case G_RAID_SUBDISK_S_ACTIVE:
1148                         case G_RAID_SUBDISK_S_STALE:
1149                         case G_RAID_SUBDISK_S_RESYNC:
1150                                 break;
1151                         case G_RAID_SUBDISK_S_REBUILD:
1152                                 if (offset + start >= sd->sd_rebuild_pos)
1153                                         goto nextdisk;
1154                                 break;
1155                         default:
1156                                 goto nextdisk;
1157                         }
1158                         error = g_raid_subdisk_kerneldump(sd,
1159                             addr, 0, offset + start, length);
1160                         if (error != 0)
1161                                 return (error);
1162 nextdisk:
1163                         if (++no >= vol->v_disks_count) {
1164                                 no = 0;
1165                                 offset += strip_size;
1166                         }
1167                 }
1168                 remain -= length;
1169                 addr += length;
1170                 start = 0;
1171         }
1172         return (0);
1173 }
1174
1175 static int
1176 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
1177 {
1178         struct bio *bp;
1179         struct g_raid_subdisk *sd;
1180
1181         bp = (struct bio *)argp;
1182         sd = (struct g_raid_subdisk *)bp->bio_caller1;
1183         g_raid_subdisk_iostart(sd, bp);
1184
1185         return (0);
1186 }
1187
1188 static int
1189 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
1190 {
1191         struct g_raid_tr_raid1e_object *trs;
1192         struct g_raid_volume *vol;
1193
1194         vol = tr->tro_volume;
1195         trs = (struct g_raid_tr_raid1e_object *)tr;
1196         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
1197         trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
1198         /* Compensate short rebuild I/Os. */
1199         if ((vol->v_disks_count % N) != 0 &&
1200             vol->v_strip_size < g_raid1e_rebuild_slab) {
1201                 trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
1202                 trs->trso_recover_slabs /= vol->v_strip_size;
1203         }
1204         if (trs->trso_type == TR_RAID1E_REBUILD)
1205                 g_raid_tr_raid1e_rebuild_some(tr);
1206         return (0);
1207 }
1208
1209 static int
1210 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
1211 {
1212         struct g_raid_tr_raid1e_object *trs;
1213
1214         trs = (struct g_raid_tr_raid1e_object *)tr;
1215
1216         if (trs->trso_buffer != NULL) {
1217                 free(trs->trso_buffer, M_TR_RAID1E);
1218                 trs->trso_buffer = NULL;
1219         }
1220         return (0);
1221 }
1222
     /*
      * NOTE(review): G_RAID_TR_DECLARE is defined outside this file;
      * presumably it registers the g_raid_tr_raid1e transformation class
      * with the GEOM RAID framework -- confirm against geom/raid/g_raid.h.
      */
1223 G_RAID_TR_DECLARE(g_raid_tr_raid1e);