sys/geom/raid/tr_raid1e.c

   1 /*-
   2  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  */
  26
  27 #include <sys/cdefs.h>
  28 __FBSDID("$FreeBSD$");
  29
  30 #include <sys/param.h>
  31 #include <sys/bio.h>
  32 #include <sys/endian.h>
  33 #include <sys/kernel.h>
  34 #include <sys/kobj.h>
  35 #include <sys/limits.h>
  36 #include <sys/lock.h>
  37 #include <sys/malloc.h>
  38 #include <sys/mutex.h>
  39 #include <sys/sysctl.h>
  40 #include <sys/systm.h>
  41 #include <geom/geom.h>
  42 #include "geom/raid/g_raid.h"
  43 #include "g_raid_tr_if.h"
  44
  45 #define N       2
  46
  47 SYSCTL_DECL(_kern_geom_raid_raid1e);
  48
  49 #define RAID1E_REBUILD_SLAB     (1 << 20) /* One transation in a rebuild */
  50 static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
  51 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_slab_size",
  52     &g_raid1e_rebuild_slab);
  53 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RW,
  54     &g_raid1e_rebuild_slab, 0,
  55     "Amount of the disk to rebuild each read/write cycle of the rebuild.");
  56
  57 #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
  58 static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
  59 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_fair_io",
  60     &g_raid1e_rebuild_fair_io);
  61 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RW,
  62     &g_raid1e_rebuild_fair_io, 0,
  63     "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
  64
  65 #define RAID1E_REBUILD_CLUSTER_IDLE 100
  66 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
  67 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_cluster_idle",
  68     &g_raid1e_rebuild_cluster_idle);
  69 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW,
  70     &g_raid1e_rebuild_cluster_idle, 0,
  71     "Number of slabs to do each time we trigger a rebuild cycle");
  72
  73 #define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
  74 static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
  75 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_meta_update",
  76     &g_raid1e_rebuild_meta_update);
  77 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RW,
  78     &g_raid1e_rebuild_meta_update, 0,
  79     "When to update the meta data.");
  80
  81 static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
  82
  83 #define TR_RAID1E_NONE 0
  84 #define TR_RAID1E_REBUILD 1
  85 #define TR_RAID1E_RESYNC 2
  86
  87 #define TR_RAID1E_F_DOING_SOME  0x1
  88 #define TR_RAID1E_F_LOCKED      0x2
  89 #define TR_RAID1E_F_ABORT       0x4
  90
  91 struct g_raid_tr_raid1e_object {
  92         struct g_raid_tr_object  trso_base;
  93         int                      trso_starting;
  94         int                      trso_stopping;
  95         int                      trso_type;
  96         int                      trso_recover_slabs; /* slabs before rest */
  97         int                      trso_fair_io;
  98         int                      trso_meta_update;
  99         int                      trso_flags;
 100         struct g_raid_subdisk   *trso_failed_sd; /* like per volume */
 101         void                    *trso_buffer;    /* Buffer space */
 102         off_t                    trso_lock_pos; /* Locked range start. */
 103         off_t                    trso_lock_len; /* Locked range length. */
 104         struct bio               trso_bio;
 105 };
 106
 107 static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
 108 static g_raid_tr_event_t g_raid_tr_event_raid1e;
 109 static g_raid_tr_start_t g_raid_tr_start_raid1e;
 110 static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
 111 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
 112 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
 113 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
 114 static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
 115 static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
 116 static g_raid_tr_free_t g_raid_tr_free_raid1e;
 117
 118 static kobj_method_t g_raid_tr_raid1e_methods[] = {
 119         KOBJMETHOD(g_raid_tr_taste,     g_raid_tr_taste_raid1e),
 120         KOBJMETHOD(g_raid_tr_event,     g_raid_tr_event_raid1e),
 121         KOBJMETHOD(g_raid_tr_start,     g_raid_tr_start_raid1e),
 122         KOBJMETHOD(g_raid_tr_stop,      g_raid_tr_stop_raid1e),
 123         KOBJMETHOD(g_raid_tr_iostart,   g_raid_tr_iostart_raid1e),
 124         KOBJMETHOD(g_raid_tr_iodone,    g_raid_tr_iodone_raid1e),
 125         KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
 126         KOBJMETHOD(g_raid_tr_locked,    g_raid_tr_locked_raid1e),
 127         KOBJMETHOD(g_raid_tr_idle,      g_raid_tr_idle_raid1e),
 128         KOBJMETHOD(g_raid_tr_free,      g_raid_tr_free_raid1e),
 129         { 0, 0 }
 130 };
 131
 132 static struct g_raid_tr_class g_raid_tr_raid1e_class = {
 133         "RAID1E",
 134         g_raid_tr_raid1e_methods,
 135         sizeof(struct g_raid_tr_raid1e_object),
 136         .trc_enable = 1,
 137         .trc_priority = 200
 138 };
 139
 140 static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
 141 static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
 142     struct g_raid_subdisk *sd);
 143 static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
 144     int no, off_t off, off_t len, u_int mask);
 145
 146 static inline void
 147 V2P(struct g_raid_volume *vol, off_t virt,
 148     int *disk, off_t *offset, off_t *start)
 149 {
 150         off_t nstrip;
 151         u_int strip_size;
 152
 153         strip_size = vol->v_strip_size;
 154         /* Strip number. */
 155         nstrip = virt / strip_size;
 156         /* Start position in strip. */
 157         *start = virt % strip_size;
 158         /* Disk number. */
 159         *disk = (nstrip * N) % vol->v_disks_count;
 160         /* Strip start position in disk. */
 161         *offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
 162 }
 163
 164 static inline void
 165 P2V(struct g_raid_volume *vol, int disk, off_t offset,
 166     off_t *virt, int *copy)
 167 {
 168         off_t nstrip, start;
 169         u_int strip_size;
 170
 171         strip_size = vol->v_strip_size;
 172         /* Start position in strip. */
 173         start = offset % strip_size;
 174         /* Physical strip number. */
 175         nstrip = (offset / strip_size) * vol->v_disks_count + disk;
 176         /* Number of physical strip (copy) inside virtual strip. */
 177         *copy = nstrip % N;
 178         /* Offset in virtual space. */
 179         *virt = (nstrip / N) * strip_size + start;
 180 }
 181
 182 static int
 183 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
 184 {
 185         struct g_raid_tr_raid1e_object *trs;
 186
 187         trs = (struct g_raid_tr_raid1e_object *)tr;
 188         if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
 189             tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
 190                 return (G_RAID_TR_TASTE_FAIL);
 191         trs->trso_starting = 1;
 192         return (G_RAID_TR_TASTE_SUCCEED);
 193 }
 194
 195 static int
 196 g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
 197 {
 198         struct g_raid_softc *sc;
 199         struct g_raid_subdisk *sd, *bestsd, *worstsd;
 200         int i, j, state, sstate;
 201
 202         sc = vol->v_softc;
 203         state = G_RAID_VOLUME_S_OPTIMAL;
 204         for (i = 0; i < vol->v_disks_count / N; i++) {
 205                 bestsd = &vol->v_subdisks[i * N];
 206                 for (j = 1; j < N; j++) {
 207                         sd = &vol->v_subdisks[i * N + j];
 208                         if (sd->sd_state > bestsd->sd_state)
 209                                 bestsd = sd;
 210                         else if (sd->sd_state == bestsd->sd_state &&
 211                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
 212                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
 213                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
 214                                 bestsd = sd;
 215                 }
 216                 if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
 217                     bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
 218                         /* We found reasonable candidate. */
 219                         G_RAID_DEBUG1(1, sc,
 220                             "Promote subdisk %s:%d from %s to ACTIVE.",
 221                             vol->v_name, bestsd->sd_pos,
 222                             g_raid_subdisk_state2str(bestsd->sd_state));
 223                         g_raid_change_subdisk_state(bestsd,
 224                             G_RAID_SUBDISK_S_ACTIVE);
 225                         g_raid_write_metadata(sc,
 226                             vol, bestsd, bestsd->sd_disk);
 227                 }
 228                 worstsd = &vol->v_subdisks[i * N];
 229                 for (j = 1; j < N; j++) {
 230                         sd = &vol->v_subdisks[i * N + j];
 231                         if (sd->sd_state < worstsd->sd_state)
 232                                 worstsd = sd;
 233                 }
 234                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
 235                         sstate = G_RAID_VOLUME_S_OPTIMAL;
 236                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
 237                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
 238                 else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
 239                         sstate = G_RAID_VOLUME_S_DEGRADED;
 240                 else
 241                         sstate = G_RAID_VOLUME_S_BROKEN;
 242                 if (sstate < state)
 243                         state = sstate;
 244         }
 245         return (state);
 246 }
 247
 248 static int
 249 g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
 250 {
 251         struct g_raid_softc *sc;
 252         struct g_raid_subdisk *sd, *bestsd, *worstsd;
 253         int i, j, state, sstate;
 254
 255         sc = vol->v_softc;
 256         if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
 257             vol->v_disks_count)
 258                 return (G_RAID_VOLUME_S_OPTIMAL);
 259         for (i = 0; i < vol->v_disks_count; i++) {
 260                 sd = &vol->v_subdisks[i];
 261                 if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
 262                         /* We found reasonable candidate. */
 263                         G_RAID_DEBUG1(1, sc,
 264                             "Promote subdisk %s:%d from %s to STALE.",
 265                             vol->v_name, sd->sd_pos,
 266                             g_raid_subdisk_state2str(sd->sd_state));
 267                         g_raid_change_subdisk_state(sd,
 268                             G_RAID_SUBDISK_S_STALE);
 269                         g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
 270                 }
 271         }
 272         state = G_RAID_VOLUME_S_OPTIMAL;
 273         for (i = 0; i < vol->v_disks_count; i++) {
 274                 bestsd = &vol->v_subdisks[i];
 275                 worstsd = &vol->v_subdisks[i];
 276                 for (j = 1; j < N; j++) {
 277                         sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
 278                         if (sd->sd_state > bestsd->sd_state)
 279                                 bestsd = sd;
 280                         else if (sd->sd_state == bestsd->sd_state &&
 281                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
 282                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
 283                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
 284                                 bestsd = sd;
 285                         if (sd->sd_state < worstsd->sd_state)
 286                                 worstsd = sd;
 287                 }
 288                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
 289                         sstate = G_RAID_VOLUME_S_OPTIMAL;
 290                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
 291                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
 292                 else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
 293                         sstate = G_RAID_VOLUME_S_DEGRADED;
 294                 else
 295                         sstate = G_RAID_VOLUME_S_BROKEN;
 296                 if (sstate < state)
 297                         state = sstate;
 298         }
 299         return (state);
 300 }
 301
 302 static int
 303 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
 304     struct g_raid_subdisk *sd)
 305 {
 306         struct g_raid_tr_raid1e_object *trs;
 307         struct g_raid_softc *sc;
 308         u_int s;
 309
 310         sc = vol->v_softc;
 311         trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
 312         if (trs->trso_stopping &&
 313             (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
 314                 s = G_RAID_VOLUME_S_STOPPED;
 315         else if (trs->trso_starting)
 316                 s = G_RAID_VOLUME_S_STARTING;
 317         else {
 318                 if ((vol->v_disks_count % N) == 0)
 319                         s = g_raid_tr_update_state_raid1e_even(vol);
 320                 else
 321                         s = g_raid_tr_update_state_raid1e_odd(vol);
 322         }
 323         if (s != vol->v_state) {
 324                 g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
 325                     G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
 326                     G_RAID_EVENT_VOLUME);
 327                 g_raid_change_volume_state(vol, s);
 328                 if (!trs->trso_starting && !trs->trso_stopping)
 329                         g_raid_write_metadata(sc, vol, NULL, NULL);
 330         }
 331         if (!trs->trso_starting && !trs->trso_stopping)
 332                 g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
 333         return (0);
 334 }
 335
 336 static void
 337 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
 338     struct g_raid_disk *disk)
 339 {
 340         struct g_raid_volume *vol;
 341
 342         vol = sd->sd_volume;
 343         /*
 344          * We don't fail the last disk in the pack, since it still has decent
 345          * data on it and that's better than failing the disk if it is the root
 346          * file system.
 347          *
 348          * XXX should this be controlled via a tunable?  It makes sense for
 349          * the volume that has / on it.  I can't think of a case where we'd
 350          * want the volume to go away on this kind of event.
 351          */
 352         if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
 353              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
 354              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
 355              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
 356              vol->v_disks_count) &&
 357             (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
 358                 return;
 359         g_raid_fail_disk(sc, sd, disk);
 360 }
 361
 362 static void
 363 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
 364 {
 365         struct g_raid_volume *vol;
 366         struct g_raid_subdisk *sd;
 367
 368         vol = trs->trso_base.tro_volume;
 369         sd = trs->trso_failed_sd;
 370         g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
 371         free(trs->trso_buffer, M_TR_RAID1E);
 372         trs->trso_buffer = NULL;
 373         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 374         trs->trso_type = TR_RAID1E_NONE;
 375         trs->trso_recover_slabs = 0;
 376         trs->trso_failed_sd = NULL;
 377         g_raid_tr_update_state_raid1e(vol, NULL);
 378 }
 379
 380 static void
 381 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
 382 {
 383         struct g_raid_tr_raid1e_object *trs;
 384         struct g_raid_subdisk *sd;
 385
 386         trs = (struct g_raid_tr_raid1e_object *)tr;
 387         sd = trs->trso_failed_sd;
 388         G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
 389             "Subdisk %s:%d-%s rebuild completed.",
 390             sd->sd_volume->v_name, sd->sd_pos,
 391             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
 392         g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
 393         sd->sd_rebuild_pos = 0;
 394         g_raid_tr_raid1e_rebuild_done(trs);
 395 }
 396
 397 static void
 398 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
 399 {
 400         struct g_raid_tr_raid1e_object *trs;
 401         struct g_raid_subdisk *sd;
 402         struct g_raid_volume *vol;
 403
 404         vol = tr->tro_volume;
 405         trs = (struct g_raid_tr_raid1e_object *)tr;
 406         sd = trs->trso_failed_sd;
 407         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
 408                 G_RAID_DEBUG1(1, vol->v_softc,
 409                     "Subdisk %s:%d-%s rebuild is aborting.",
 410                     sd->sd_volume->v_name, sd->sd_pos,
 411                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
 412                 trs->trso_flags |= TR_RAID1E_F_ABORT;
 413         } else {
 414                 G_RAID_DEBUG1(0, vol->v_softc,
 415                     "Subdisk %s:%d-%s rebuild aborted.",
 416                     sd->sd_volume->v_name, sd->sd_pos,
 417                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
 418                 trs->trso_flags &= ~TR_RAID1E_F_ABORT;
 419                 if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
 420                         trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
 421                         g_raid_unlock_range(tr->tro_volume,
 422                             trs->trso_lock_pos, trs->trso_lock_len);
 423                 }
 424                 g_raid_tr_raid1e_rebuild_done(trs);
 425         }
 426 }
 427
 428 static void
 429 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
 430 {
 431         struct g_raid_tr_raid1e_object *trs;
 432         struct g_raid_softc *sc;
 433         struct g_raid_volume *vol;
 434         struct g_raid_subdisk *sd;
 435         struct bio *bp;
 436         off_t len, virtual, vend, offset, start;
 437         int disk, copy, best;
 438
 439         trs = (struct g_raid_tr_raid1e_object *)tr;
 440         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
 441                 return;
 442         vol = tr->tro_volume;
 443         sc = vol->v_softc;
 444         sd = trs->trso_failed_sd;
 445
 446         while (1) {
 447                 if (sd->sd_rebuild_pos >= sd->sd_size) {
 448                         g_raid_tr_raid1e_rebuild_finish(tr);
 449                         return;
 450                 }
 451                 /* Get virtual offset from physical rebuild position. */
 452                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
 453                 /* Get physical offset back to get first stripe position. */
 454                 V2P(vol, virtual, &disk, &offset, &start);
 455                 /* Calculate contignous data length. */
 456                 len = MIN(g_raid1e_rebuild_slab,
 457                     sd->sd_size - sd->sd_rebuild_pos);
 458                 if ((vol->v_disks_count % N) != 0)
 459                         len = MIN(len, vol->v_strip_size - start);
 460                 /* Find disk with most accurate data. */
 461                 best = g_raid_tr_raid1e_select_read_disk(vol, disk,
 462                     offset + start, len, 0);
 463                 if (best < 0) {
 464                         /* There is no any valid disk. */
 465                         g_raid_tr_raid1e_rebuild_abort(tr);
 466                         return;
 467                 } else if (best != copy) {
 468                         /* Some other disk has better data. */
 469                         break;
 470                 }
 471                 /* We have the most accurate data. Skip the range. */
 472                 G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
 473                     sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
 474                 sd->sd_rebuild_pos += len;
 475         }
 476
 477         bp = &trs->trso_bio;
 478         memset(bp, 0, sizeof(*bp));
 479         bp->bio_offset = offset + start +
 480             ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
 481         bp->bio_length = len;
 482         bp->bio_data = trs->trso_buffer;
 483         bp->bio_cmd = BIO_READ;
 484         bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
 485         bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
 486         G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
 487         /*
 488          * If we are crossing stripe boundary, correct affected virtual
 489          * range we should lock.
 490          */
 491         if (start + len > vol->v_strip_size) {
 492                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
 493                 len = vend - virtual;
 494         }
 495         trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
 496         trs->trso_flags |= TR_RAID1E_F_LOCKED;
 497         trs->trso_lock_pos = virtual;
 498         trs->trso_lock_len = len;
 499         /* Lock callback starts I/O */
 500         g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
 501 }
 502
 503 static void
 504 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
 505 {
 506         struct g_raid_volume *vol;
 507         struct g_raid_tr_raid1e_object *trs;
 508         struct g_raid_subdisk *sd;
 509
 510         vol = tr->tro_volume;
 511         trs = (struct g_raid_tr_raid1e_object *)tr;
 512         if (trs->trso_failed_sd) {
 513                 G_RAID_DEBUG1(1, vol->v_softc,
 514                     "Already rebuild in start rebuild. pos %jd\n",
 515                     (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
 516                 return;
 517         }
 518         sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
 519         if (sd == NULL)
 520                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
 521         if (sd == NULL) {
 522                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
 523                 if (sd != NULL) {
 524                         sd->sd_rebuild_pos = 0;
 525                         g_raid_change_subdisk_state(sd,
 526                             G_RAID_SUBDISK_S_RESYNC);
 527                         g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
 528                 } else {
 529                         sd = g_raid_get_subdisk(vol,
 530                             G_RAID_SUBDISK_S_UNINITIALIZED);
 531                         if (sd == NULL)
 532                                 sd = g_raid_get_subdisk(vol,
 533                                     G_RAID_SUBDISK_S_NEW);
 534                         if (sd != NULL) {
 535                                 sd->sd_rebuild_pos = 0;
 536                                 g_raid_change_subdisk_state(sd,
 537                                     G_RAID_SUBDISK_S_REBUILD);
 538                                 g_raid_write_metadata(vol->v_softc,
 539                                     vol, sd, NULL);
 540                         }
 541                 }
 542         }
 543         if (sd == NULL) {
 544                 G_RAID_DEBUG1(1, vol->v_softc,
 545                     "No failed disk to rebuild.  night night.");
 546                 return;
 547         }
 548         trs->trso_failed_sd = sd;
 549         G_RAID_DEBUG1(0, vol->v_softc,
 550             "Subdisk %s:%d-%s rebuild start at %jd.",
 551             sd->sd_volume->v_name, sd->sd_pos,
 552             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
 553             trs->trso_failed_sd->sd_rebuild_pos);
 554         trs->trso_type = TR_RAID1E_REBUILD;
 555         trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
 556         trs->trso_meta_update = g_raid1e_rebuild_meta_update;
 557         g_raid_tr_raid1e_rebuild_some(tr);
 558 }
 559
 560 static void
 561 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
 562     struct g_raid_subdisk *sd)
 563 {
 564         struct g_raid_volume *vol;
 565         struct g_raid_tr_raid1e_object *trs;
 566         int nr;
 567
 568         vol = tr->tro_volume;
 569         trs = (struct g_raid_tr_raid1e_object *)tr;
 570         if (trs->trso_stopping)
 571                 return;
 572         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
 573             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
 574         switch(trs->trso_type) {
 575         case TR_RAID1E_NONE:
 576                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
 577                         return;
 578                 if (nr == 0) {
 579                         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
 580                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
 581                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
 582                         if (nr == 0)
 583                                 return;
 584                 }
 585                 g_raid_tr_raid1e_rebuild_start(tr);
 586                 break;
 587         case TR_RAID1E_REBUILD:
 588                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
 589                     trs->trso_failed_sd == sd)
 590                         g_raid_tr_raid1e_rebuild_abort(tr);
 591                 break;
 592         case TR_RAID1E_RESYNC:
 593                 break;
 594         }
 595 }
 596
 597 static int
 598 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
 599     struct g_raid_subdisk *sd, u_int event)
 600 {
 601
 602         g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
 603         return (0);
 604 }
 605
 606 static int
 607 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
 608 {
 609         struct g_raid_tr_raid1e_object *trs;
 610         struct g_raid_volume *vol;
 611
 612         trs = (struct g_raid_tr_raid1e_object *)tr;
 613         vol = tr->tro_volume;
 614         trs->trso_starting = 0;
 615         g_raid_tr_update_state_raid1e(vol, NULL);
 616         return (0);
 617 }
 618
 619 static int
 620 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
 621 {
 622         struct g_raid_tr_raid1e_object *trs;
 623         struct g_raid_volume *vol;
 624
 625         trs = (struct g_raid_tr_raid1e_object *)tr;
 626         vol = tr->tro_volume;
 627         trs->trso_starting = 0;
 628         trs->trso_stopping = 1;
 629         g_raid_tr_update_state_raid1e(vol, NULL);
 630         return (0);
 631 }
 632
 633 /*
 634  * Select the disk to read from.  Take into account: subdisk state, running
 635  * error recovery, average disk load, head position and possible cache hits.
 636  */
 637 #define ABS(x)          (((x) >= 0) ? (x) : (-(x)))
 638 static int
 639 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
 640     int no, off_t off, off_t len, u_int mask)
 641 {
 642         struct g_raid_subdisk *sd;
 643         off_t offset;
 644         int i, best, prio, bestprio;
 645
 646         best = -1;
 647         bestprio = INT_MAX;
 648         for (i = 0; i < N; i++) {
 649                 sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
 650                 offset = off;
 651                 if (no + i >= vol->v_disks_count)
 652                         offset += vol->v_strip_size;
 653
 654                 prio = G_RAID_SUBDISK_LOAD(sd);
 655                 if ((mask & (1 << sd->sd_pos)) != 0)
 656                         continue;
 657                 switch (sd->sd_state) {
 658                 case G_RAID_SUBDISK_S_ACTIVE:
 659                         break;
 660                 case G_RAID_SUBDISK_S_RESYNC:
 661                         if (offset + off < sd->sd_rebuild_pos)
 662                                 break;
 663                         /* FALLTHROUGH */
 664                 case G_RAID_SUBDISK_S_STALE:
 665                         prio += i << 24;
 666                         break;
 667                 case G_RAID_SUBDISK_S_REBUILD:
 668                         if (offset + off < sd->sd_rebuild_pos)
 669                                 break;
 670                         /* FALLTHROUGH */
 671                 default:
 672                         continue;
 673                 }
 674                 prio += min(sd->sd_recovery, 255) << 16;
 675                 /* If disk head is precisely in position - highly prefer it. */
 676                 if (G_RAID_SUBDISK_POS(sd) == offset)
 677                         prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
 678                 else
 679                 /* If disk head is close to position - prefer it. */
 680                 if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
 681                     G_RAID_SUBDISK_TRACK_SIZE)
 682                         prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
 683                 if (prio < bestprio) {
 684                         bestprio = prio;
 685                         best = i;
 686                 }
 687         }
 688         return (best);
 689 }
 690
 691 static void
 692 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
 693 {
 694         struct g_raid_volume *vol;
 695         struct g_raid_subdisk *sd;
 696         struct bio_queue_head queue;
 697         struct bio *cbp;
 698         char *addr;
 699         off_t offset, start, length, remain;
 700         u_int no, strip_size;
 701         int best;
 702
 703         vol = tr->tro_volume;
 704         addr = bp->bio_data;
 705         strip_size = vol->v_strip_size;
 706         V2P(vol, bp->bio_offset, &no, &offset, &start);
 707         remain = bp->bio_length;
 708         bioq_init(&queue);
 709         while (remain > 0) {
 710                 length = MIN(strip_size - start, remain);
 711                 best = g_raid_tr_raid1e_select_read_disk(vol,
 712                     no, offset, length, 0);
 713                 KASSERT(best >= 0, ("No readable disk in volume %s!",
 714                     vol->v_name));
 715                 no += best;
 716                 if (no >= vol->v_disks_count) {
 717                         no -= vol->v_disks_count;
 718                         offset += strip_size;
 719                 }
 720                 cbp = g_clone_bio(bp);
 721                 if (cbp == NULL)
 722                         goto failure;
 723                 cbp->bio_offset = offset + start;
 724                 cbp->bio_data = addr;
 725                 cbp->bio_length = length;
 726                 cbp->bio_caller1 = &vol->v_subdisks[no];
 727                 bioq_insert_tail(&queue, cbp);
 728                 no += N - best;
 729                 if (no >= vol->v_disks_count) {
 730                         no -= vol->v_disks_count;
 731                         offset += strip_size;
 732                 }
 733                 remain -= length;
 734                 addr += length;
 735                 start = 0;
 736         }
 737         for (cbp = bioq_first(&queue); cbp != NULL;
 738             cbp = bioq_first(&queue)) {
 739                 bioq_remove(&queue, cbp);
 740                 sd = cbp->bio_caller1;
 741                 cbp->bio_caller1 = NULL;
 742                 g_raid_subdisk_iostart(sd, cbp);
 743         }
 744         return;
 745 failure:
 746         for (cbp = bioq_first(&queue); cbp != NULL;
 747             cbp = bioq_first(&queue)) {
 748                 bioq_remove(&queue, cbp);
 749                 g_destroy_bio(cbp);
 750         }
 751         if (bp->bio_error == 0)
 752                 bp->bio_error = ENOMEM;
 753         g_raid_iodone(bp, bp->bio_error);
 754 }
 755
 756 static void
 757 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
 758 {
 759         struct g_raid_volume *vol;
 760         struct g_raid_subdisk *sd;
 761         struct bio_queue_head queue;
 762         struct bio *cbp;
 763         char *addr;
 764         off_t offset, start, length, remain;
 765         u_int no, strip_size;
 766         int i;
 767
 768         vol = tr->tro_volume;
 769         addr = bp->bio_data;
 770         strip_size = vol->v_strip_size;
 771         V2P(vol, bp->bio_offset, &no, &offset, &start);
 772         remain = bp->bio_length;
 773         bioq_init(&queue);
 774         while (remain > 0) {
 775                 length = MIN(strip_size - start, remain);
 776                 for (i = 0; i < N; i++) {
 777                         sd = &vol->v_subdisks[no];
 778                         switch (sd->sd_state) {
 779                         case G_RAID_SUBDISK_S_ACTIVE:
 780                         case G_RAID_SUBDISK_S_STALE:
 781                         case G_RAID_SUBDISK_S_RESYNC:
 782                                 break;
 783                         case G_RAID_SUBDISK_S_REBUILD:
 784                                 if (offset + start >= sd->sd_rebuild_pos)
 785                                         goto nextdisk;
 786                                 break;
 787                         default:
 788                                 goto nextdisk;
 789                         }
 790                         cbp = g_clone_bio(bp);
 791                         if (cbp == NULL)
 792                                 goto failure;
 793                         cbp->bio_offset = offset + start;
 794                         cbp->bio_data = addr;
 795                         cbp->bio_length = length;
 796                         cbp->bio_caller1 = sd;
 797                         bioq_insert_tail(&queue, cbp);
 798 nextdisk:
 799                         if (++no >= vol->v_disks_count) {
 800                                 no = 0;
 801                                 offset += strip_size;
 802                         }
 803                 }
 804                 remain -= length;
 805                 if (bp->bio_cmd != BIO_DELETE)
 806                         addr += length;
 807                 start = 0;
 808         }
 809         for (cbp = bioq_first(&queue); cbp != NULL;
 810             cbp = bioq_first(&queue)) {
 811                 bioq_remove(&queue, cbp);
 812                 sd = cbp->bio_caller1;
 813                 cbp->bio_caller1 = NULL;
 814                 g_raid_subdisk_iostart(sd, cbp);
 815         }
 816         return;
 817 failure:
 818         for (cbp = bioq_first(&queue); cbp != NULL;
 819             cbp = bioq_first(&queue)) {
 820                 bioq_remove(&queue, cbp);
 821                 g_destroy_bio(cbp);
 822         }
 823         if (bp->bio_error == 0)
 824                 bp->bio_error = ENOMEM;
 825         g_raid_iodone(bp, bp->bio_error);
 826 }
 827
 828 static void
 829 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
 830 {
 831         struct g_raid_volume *vol;
 832         struct g_raid_tr_raid1e_object *trs;
 833
 834         vol = tr->tro_volume;
 835         trs = (struct g_raid_tr_raid1e_object *)tr;
 836         if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
 837             vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
 838             vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
 839                 g_raid_iodone(bp, EIO);
 840                 return;
 841         }
 842         /*
 843          * If we're rebuilding, squeeze in rebuild activity every so often,
 844          * even when the disk is busy.  Be sure to only count real I/O
 845          * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
 846          * by this module.
 847          */
 848         if (trs->trso_failed_sd != NULL &&
 849             !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
 850                 /* Make this new or running now round short. */
 851                 trs->trso_recover_slabs = 0;
 852                 if (--trs->trso_fair_io <= 0) {
 853                         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
 854                         g_raid_tr_raid1e_rebuild_some(tr);
 855                 }
 856         }
 857         switch (bp->bio_cmd) {
 858         case BIO_READ:
 859                 g_raid_tr_iostart_raid1e_read(tr, bp);
 860                 break;
 861         case BIO_WRITE:
 862         case BIO_DELETE:
 863                 g_raid_tr_iostart_raid1e_write(tr, bp);
 864                 break;
 865         case BIO_FLUSH:
 866                 g_raid_tr_flush_common(tr, bp);
 867                 break;
 868         default:
 869                 KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
 870                     bp->bio_cmd, vol->v_name));
 871                 break;
 872         }
 873 }
 874
 875 static void
 876 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
 877     struct g_raid_subdisk *sd, struct bio *bp)
 878 {
 879         struct bio *cbp;
 880         struct g_raid_subdisk *nsd;
 881         struct g_raid_volume *vol;
 882         struct bio *pbp;
 883         struct g_raid_tr_raid1e_object *trs;
 884         off_t virtual, offset, start;
 885         uintptr_t mask;
 886         int error, do_write, copy, disk, best;
 887
 888         trs = (struct g_raid_tr_raid1e_object *)tr;
 889         vol = tr->tro_volume;
 890         if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
 891                 if (trs->trso_type == TR_RAID1E_REBUILD) {
 892                         nsd = trs->trso_failed_sd;
 893                         if (bp->bio_cmd == BIO_READ) {
 894
 895                                 /* Immediately abort rebuild, if requested. */
 896                                 if (trs->trso_flags & TR_RAID1E_F_ABORT) {
 897                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 898                                         g_raid_tr_raid1e_rebuild_abort(tr);
 899                                         return;
 900                                 }
 901
 902                                 /* On read error, skip and cross fingers. */
 903                                 if (bp->bio_error != 0) {
 904                                         G_RAID_LOGREQ(0, bp,
 905                                             "Read error during rebuild (%d), "
 906                                             "possible data loss!",
 907                                             bp->bio_error);
 908                                         goto rebuild_round_done;
 909                                 }
 910
 911                                 /*
 912                                  * The read operation finished, queue the
 913                                  * write and get out.
 914                                  */
 915                                 G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
 916                                     bp->bio_error);
 917                                 bp->bio_cmd = BIO_WRITE;
 918                                 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
 919                                 bp->bio_offset = nsd->sd_rebuild_pos;
 920                                 G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
 921                                 g_raid_subdisk_iostart(nsd, bp);
 922                         } else {
 923                                 /*
 924                                  * The write operation just finished.  Do
 925                                  * another.  We keep cloning the master bio
 926                                  * since it has the right buffers allocated to
 927                                  * it.
 928                                  */
 929                                 G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
 930                                     bp->bio_error);
 931                                 if (bp->bio_error != 0 ||
 932                                     trs->trso_flags & TR_RAID1E_F_ABORT) {
 933                                         if ((trs->trso_flags &
 934                                             TR_RAID1E_F_ABORT) == 0) {
 935                                                 g_raid_tr_raid1e_fail_disk(sd->sd_softc,
 936                                                     nsd, nsd->sd_disk);
 937                                         }
 938                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 939                                         g_raid_tr_raid1e_rebuild_abort(tr);
 940                                         return;
 941                                 }
 942 rebuild_round_done:
 943                                 trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
 944                                 g_raid_unlock_range(tr->tro_volume,
 945                                     trs->trso_lock_pos, trs->trso_lock_len);
 946                                 nsd->sd_rebuild_pos += bp->bio_length;
 947                                 if (nsd->sd_rebuild_pos >= nsd->sd_size) {
 948                                         g_raid_tr_raid1e_rebuild_finish(tr);
 949                                         return;
 950                                 }
 951
 952                                 /* Abort rebuild if we are stopping */
 953                                 if (trs->trso_stopping) {
 954                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 955                                         g_raid_tr_raid1e_rebuild_abort(tr);
 956                                         return;
 957                                 }
 958
 959                                 if (--trs->trso_meta_update <= 0) {
 960                                         g_raid_write_metadata(vol->v_softc,
 961                                             vol, nsd, nsd->sd_disk);
 962                                         trs->trso_meta_update =
 963                                             g_raid1e_rebuild_meta_update;
 964                                         /* Compensate short rebuild I/Os. */
 965                                         if ((vol->v_disks_count % N) != 0 &&
 966                                             vol->v_strip_size <
 967                                              g_raid1e_rebuild_slab) {
 968                                                 trs->trso_meta_update *=
 969                                                     g_raid1e_rebuild_slab;
 970                                                 trs->trso_meta_update /=
 971                                                     vol->v_strip_size;
 972                                         }
 973                                 }
 974                                 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 975                                 if (--trs->trso_recover_slabs <= 0)
 976                                         return;
 977                                 /* Run next rebuild iteration. */
 978                                 g_raid_tr_raid1e_rebuild_some(tr);
 979                         }
 980                 } else if (trs->trso_type == TR_RAID1E_RESYNC) {
 981                         /*
 982                          * read good sd, read bad sd in parallel.  when both
 983                          * done, compare the buffers.  write good to the bad
 984                          * if different.  do the next bit of work.
 985                          */
 986                         panic("Somehow, we think we're doing a resync");
 987                 }
 988                 return;
 989         }
 990         pbp = bp->bio_parent;
 991         pbp->bio_inbed++;
 992         mask = (intptr_t)bp->bio_caller2;
 993         if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
 994                 /*
 995                  * Read failed on first drive.  Retry the read error on
 996                  * another disk drive, if available, before erroring out the
 997                  * read.
 998                  */
 999                 sd->sd_disk->d_read_errs++;
1000                 G_RAID_LOGREQ(0, bp,
1001                     "Read error (%d), %d read errors total",
1002                     bp->bio_error, sd->sd_disk->d_read_errs);
1003
1004                 /*
1005                  * If there are too many read errors, we move to degraded.
1006                  * XXX Do we want to FAIL the drive (eg, make the user redo
1007                  * everything to get it back in sync), or just degrade the
1008                  * drive, which kicks off a resync?
1009                  */
1010                 do_write = 0;
1011                 if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
1012                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1013                 else if (mask == 0)
1014                         do_write = 1;
1015
1016                 /* Restore what we were doing. */
1017                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1018                 V2P(vol, virtual, &disk, &offset, &start);
1019
1020                 /* Find the other disk, and try to do the I/O to it. */
1021                 mask |= 1 << copy;
1022                 best = g_raid_tr_raid1e_select_read_disk(vol,
1023                     disk, offset, start, mask);
1024                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1025                         disk += best;
1026                         if (disk >= vol->v_disks_count) {
1027                                 disk -= vol->v_disks_count;
1028                                 offset += vol->v_strip_size;
1029                         }
1030                         cbp->bio_offset = offset + start;
1031                         cbp->bio_length = bp->bio_length;
1032                         cbp->bio_data = bp->bio_data;
1033                         g_destroy_bio(bp);
1034                         nsd = &vol->v_subdisks[disk];
1035                         G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
1036                             nsd->sd_pos);
1037                         if (do_write)
1038                                 mask |= 1 << 31;
1039                         if ((mask & (1 << 31)) != 0)
1040                                 sd->sd_recovery++;
1041                         cbp->bio_caller2 = (void *)mask;
1042                         if (do_write) {
1043                                 cbp->bio_caller1 = nsd;
1044                                 /* Lock callback starts I/O */
1045                                 g_raid_lock_range(sd->sd_volume,
1046                                     virtual, cbp->bio_length, pbp, cbp);
1047                         } else {
1048                                 g_raid_subdisk_iostart(nsd, cbp);
1049                         }
1050                         return;
1051                 }
1052                 /*
1053                  * We can't retry.  Return the original error by falling
1054                  * through.  This will happen when there's only one good disk.
1055                  * We don't need to fail the raid, since its actual state is
1056                  * based on the state of the subdisks.
1057                  */
1058                 G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
1059         }
1060         if (bp->bio_cmd == BIO_READ &&
1061             bp->bio_error == 0 &&
1062             (mask & (1 << 31)) != 0) {
1063                 G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
1064
1065                 /* Restore what we were doing. */
1066                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1067                 V2P(vol, virtual, &disk, &offset, &start);
1068
1069                 /* Find best disk to write. */
1070                 best = g_raid_tr_raid1e_select_read_disk(vol,
1071                     disk, offset, start, ~mask);
1072                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1073                         disk += best;
1074                         if (disk >= vol->v_disks_count) {
1075                                 disk -= vol->v_disks_count;
1076                                 offset += vol->v_strip_size;
1077                         }
1078                         cbp->bio_offset = offset + start;
1079                         cbp->bio_cmd = BIO_WRITE;
1080                         cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
1081                         cbp->bio_caller2 = (void *)mask;
1082                         g_destroy_bio(bp);
1083                         G_RAID_LOGREQ(2, cbp,
1084                             "Attempting bad sector remap on failing drive.");
1085                         g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
1086                         return;
1087                 }
1088         }
1089         if ((mask & (1 << 31)) != 0) {
1090                 /*
1091                  * We're done with a recovery, mark the range as unlocked.
1092                  * For any write errors, we agressively fail the disk since
1093                  * there was both a READ and a WRITE error at this location.
1094                  * Both types of errors generally indicates the drive is on
1095                  * the verge of total failure anyway.  Better to stop trusting
1096                  * it now.  However, we need to reset error to 0 in that case
1097                  * because we're not failing the original I/O which succeeded.
1098                  */
1099
1100                 /* Restore what we were doing. */
1101                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1102                 V2P(vol, virtual, &disk, &offset, &start);
1103
1104                 for (copy = 0; copy < N; copy++) {
1105                         if ((mask & (1 << copy) ) != 0)
1106                                 vol->v_subdisks[(disk + copy) %
1107                                     vol->v_disks_count].sd_recovery--;
1108                 }
1109
1110                 if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
1111                         G_RAID_LOGREQ(0, bp, "Remap write failed: "
1112                             "failing subdisk.");
1113                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1114                         bp->bio_error = 0;
1115                 }
1116                 G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
1117                 g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
1118         }
1119         if (pbp->bio_cmd != BIO_READ) {
1120                 if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
1121                         pbp->bio_error = bp->bio_error;
1122                 if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
1123                         G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
1124                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1125                 }
1126                 error = pbp->bio_error;
1127         } else
1128                 error = bp->bio_error;
1129         g_destroy_bio(bp);
1130         if (pbp->bio_children == pbp->bio_inbed) {
1131                 pbp->bio_completed = pbp->bio_length;
1132                 g_raid_iodone(pbp, error);
1133         }
1134 }
1135
1136 static int
1137 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
1138     void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
1139 {
1140         struct g_raid_volume *vol;
1141         struct g_raid_subdisk *sd;
1142         struct bio_queue_head queue;
1143         char *addr;
1144         off_t offset, start, length, remain;
1145         u_int no, strip_size;
1146         int i, error;
1147
1148         vol = tr->tro_volume;
1149         addr = virtual;
1150         strip_size = vol->v_strip_size;
1151         V2P(vol, boffset, &no, &offset, &start);
1152         remain = blength;
1153         bioq_init(&queue);
1154         while (remain > 0) {
1155                 length = MIN(strip_size - start, remain);
1156                 for (i = 0; i < N; i++) {
1157                         sd = &vol->v_subdisks[no];
1158                         switch (sd->sd_state) {
1159                         case G_RAID_SUBDISK_S_ACTIVE:
1160                         case G_RAID_SUBDISK_S_STALE:
1161                         case G_RAID_SUBDISK_S_RESYNC:
1162                                 break;
1163                         case G_RAID_SUBDISK_S_REBUILD:
1164                                 if (offset + start >= sd->sd_rebuild_pos)
1165                                         goto nextdisk;
1166                                 break;
1167                         default:
1168                                 goto nextdisk;
1169                         }
1170                         error = g_raid_subdisk_kerneldump(sd,
1171                             addr, 0, offset + start, length);
1172                         if (error != 0)
1173                                 return (error);
1174 nextdisk:
1175                         if (++no >= vol->v_disks_count) {
1176                                 no = 0;
1177                                 offset += strip_size;
1178                         }
1179                 }
1180                 remain -= length;
1181                 addr += length;
1182                 start = 0;
1183         }
1184         return (0);
1185 }
1186
1187 static int
1188 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
1189 {
1190         struct bio *bp;
1191         struct g_raid_subdisk *sd;
1192
1193         bp = (struct bio *)argp;
1194         sd = (struct g_raid_subdisk *)bp->bio_caller1;
1195         g_raid_subdisk_iostart(sd, bp);
1196
1197         return (0);
1198 }
1199
1200 static int
1201 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
1202 {
1203         struct g_raid_tr_raid1e_object *trs;
1204         struct g_raid_volume *vol;
1205
1206         vol = tr->tro_volume;
1207         trs = (struct g_raid_tr_raid1e_object *)tr;
1208         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
1209         trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
1210         /* Compensate short rebuild I/Os. */
1211         if ((vol->v_disks_count % N) != 0 &&
1212             vol->v_strip_size < g_raid1e_rebuild_slab) {
1213                 trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
1214                 trs->trso_recover_slabs /= vol->v_strip_size;
1215         }
1216         if (trs->trso_type == TR_RAID1E_REBUILD)
1217                 g_raid_tr_raid1e_rebuild_some(tr);
1218         return (0);
1219 }
1220
1221 static int
1222 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
1223 {
1224         struct g_raid_tr_raid1e_object *trs;
1225
1226         trs = (struct g_raid_tr_raid1e_object *)tr;
1227
1228         if (trs->trso_buffer != NULL) {
1229                 free(trs->trso_buffer, M_TR_RAID1E);
1230                 trs->trso_buffer = NULL;
1231         }
1232         return (0);
1233 }
1234
1235 G_RAID_TR_DECLARE(raid1e, "RAID1E");