sys/geom/raid/tr_raid1e.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26  * SUCH DAMAGE.
  27  */
  28
  29 #include <sys/cdefs.h>
  30 __FBSDID("$FreeBSD$");
  31
  32 #include <sys/param.h>
  33 #include <sys/bio.h>
  34 #include <sys/endian.h>
  35 #include <sys/kernel.h>
  36 #include <sys/kobj.h>
  37 #include <sys/limits.h>
  38 #include <sys/lock.h>
  39 #include <sys/malloc.h>
  40 #include <sys/mutex.h>
  41 #include <sys/sysctl.h>
  42 #include <sys/systm.h>
  43 #include <geom/geom.h>
  44 #include <geom/geom_dbg.h>
  45 #include "geom/raid/g_raid.h"
  46 #include "g_raid_tr_if.h"
  47
  48 #define N       2
  49
  50 SYSCTL_DECL(_kern_geom_raid_raid1e);
  51
  52 #define RAID1E_REBUILD_SLAB     (1 << 20) /* One transation in a rebuild */
  53 static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
  54 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
  55     &g_raid1e_rebuild_slab, 0,
  56     "Amount of the disk to rebuild each read/write cycle of the rebuild.");
  57
  58 #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
  59 static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
  60 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
  61     &g_raid1e_rebuild_fair_io, 0,
  62     "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
  63
  64 #define RAID1E_REBUILD_CLUSTER_IDLE 100
  65 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
  66 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
  67     &g_raid1e_rebuild_cluster_idle, 0,
  68     "Number of slabs to do each time we trigger a rebuild cycle");
  69
  70 #define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
  71 static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
  72 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
  73     &g_raid1e_rebuild_meta_update, 0,
  74     "When to update the meta data.");
  75
  76 static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
  77
  78 #define TR_RAID1E_NONE 0
  79 #define TR_RAID1E_REBUILD 1
  80 #define TR_RAID1E_RESYNC 2
  81
  82 #define TR_RAID1E_F_DOING_SOME  0x1
  83 #define TR_RAID1E_F_LOCKED      0x2
  84 #define TR_RAID1E_F_ABORT       0x4
  85
  86 struct g_raid_tr_raid1e_object {
  87         struct g_raid_tr_object  trso_base;
  88         int                      trso_starting;
  89         int                      trso_stopping;
  90         int                      trso_type;
  91         int                      trso_recover_slabs; /* slabs before rest */
  92         int                      trso_fair_io;
  93         int                      trso_meta_update;
  94         int                      trso_flags;
  95         struct g_raid_subdisk   *trso_failed_sd; /* like per volume */
  96         void                    *trso_buffer;    /* Buffer space */
  97         off_t                    trso_lock_pos; /* Locked range start. */
  98         off_t                    trso_lock_len; /* Locked range length. */
  99         struct bio               trso_bio;
 100 };
 101
 102 static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
 103 static g_raid_tr_event_t g_raid_tr_event_raid1e;
 104 static g_raid_tr_start_t g_raid_tr_start_raid1e;
 105 static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
 106 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
 107 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
 108 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
 109 static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
 110 static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
 111 static g_raid_tr_free_t g_raid_tr_free_raid1e;
 112
 113 static kobj_method_t g_raid_tr_raid1e_methods[] = {
 114         KOBJMETHOD(g_raid_tr_taste,     g_raid_tr_taste_raid1e),
 115         KOBJMETHOD(g_raid_tr_event,     g_raid_tr_event_raid1e),
 116         KOBJMETHOD(g_raid_tr_start,     g_raid_tr_start_raid1e),
 117         KOBJMETHOD(g_raid_tr_stop,      g_raid_tr_stop_raid1e),
 118         KOBJMETHOD(g_raid_tr_iostart,   g_raid_tr_iostart_raid1e),
 119         KOBJMETHOD(g_raid_tr_iodone,    g_raid_tr_iodone_raid1e),
 120         KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
 121         KOBJMETHOD(g_raid_tr_locked,    g_raid_tr_locked_raid1e),
 122         KOBJMETHOD(g_raid_tr_idle,      g_raid_tr_idle_raid1e),
 123         KOBJMETHOD(g_raid_tr_free,      g_raid_tr_free_raid1e),
 124         { 0, 0 }
 125 };
 126
 127 static struct g_raid_tr_class g_raid_tr_raid1e_class = {
 128         "RAID1E",
 129         g_raid_tr_raid1e_methods,
 130         sizeof(struct g_raid_tr_raid1e_object),
 131         .trc_enable = 1,
 132         .trc_priority = 200,
 133         .trc_accept_unmapped = 1
 134 };
 135
 136 static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
 137 static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
 138     struct g_raid_subdisk *sd);
 139 static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
 140     int no, off_t off, off_t len, u_int mask);
 141
 142 static inline void
 143 V2P(struct g_raid_volume *vol, off_t virt,
 144     int *disk, off_t *offset, off_t *start)
 145 {
 146         off_t nstrip;
 147         u_int strip_size;
 148
 149         strip_size = vol->v_strip_size;
 150         /* Strip number. */
 151         nstrip = virt / strip_size;
 152         /* Start position in strip. */
 153         *start = virt % strip_size;
 154         /* Disk number. */
 155         *disk = (nstrip * N) % vol->v_disks_count;
 156         /* Strip start position in disk. */
 157         *offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
 158 }
 159
 160 static inline void
 161 P2V(struct g_raid_volume *vol, int disk, off_t offset,
 162     off_t *virt, int *copy)
 163 {
 164         off_t nstrip, start;
 165         u_int strip_size;
 166
 167         strip_size = vol->v_strip_size;
 168         /* Start position in strip. */
 169         start = offset % strip_size;
 170         /* Physical strip number. */
 171         nstrip = (offset / strip_size) * vol->v_disks_count + disk;
 172         /* Number of physical strip (copy) inside virtual strip. */
 173         *copy = nstrip % N;
 174         /* Offset in virtual space. */
 175         *virt = (nstrip / N) * strip_size + start;
 176 }
 177
 178 static int
 179 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
 180 {
 181         struct g_raid_tr_raid1e_object *trs;
 182
 183         trs = (struct g_raid_tr_raid1e_object *)tr;
 184         if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
 185             tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
 186                 return (G_RAID_TR_TASTE_FAIL);
 187         trs->trso_starting = 1;
 188         return (G_RAID_TR_TASTE_SUCCEED);
 189 }
 190
 191 static int
 192 g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
 193 {
 194         struct g_raid_softc *sc;
 195         struct g_raid_subdisk *sd, *bestsd, *worstsd;
 196         int i, j, state, sstate;
 197
 198         sc = vol->v_softc;
 199         state = G_RAID_VOLUME_S_OPTIMAL;
 200         for (i = 0; i < vol->v_disks_count / N; i++) {
 201                 bestsd = &vol->v_subdisks[i * N];
 202                 for (j = 1; j < N; j++) {
 203                         sd = &vol->v_subdisks[i * N + j];
 204                         if (sd->sd_state > bestsd->sd_state)
 205                                 bestsd = sd;
 206                         else if (sd->sd_state == bestsd->sd_state &&
 207                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
 208                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
 209                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
 210                                 bestsd = sd;
 211                 }
 212                 if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
 213                     bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
 214                         /* We found reasonable candidate. */
 215                         G_RAID_DEBUG1(1, sc,
 216                             "Promote subdisk %s:%d from %s to ACTIVE.",
 217                             vol->v_name, bestsd->sd_pos,
 218                             g_raid_subdisk_state2str(bestsd->sd_state));
 219                         g_raid_change_subdisk_state(bestsd,
 220                             G_RAID_SUBDISK_S_ACTIVE);
 221                         g_raid_write_metadata(sc,
 222                             vol, bestsd, bestsd->sd_disk);
 223                 }
 224                 worstsd = &vol->v_subdisks[i * N];
 225                 for (j = 1; j < N; j++) {
 226                         sd = &vol->v_subdisks[i * N + j];
 227                         if (sd->sd_state < worstsd->sd_state)
 228                                 worstsd = sd;
 229                 }
 230                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
 231                         sstate = G_RAID_VOLUME_S_OPTIMAL;
 232                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
 233                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
 234                 else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
 235                         sstate = G_RAID_VOLUME_S_DEGRADED;
 236                 else
 237                         sstate = G_RAID_VOLUME_S_BROKEN;
 238                 if (sstate < state)
 239                         state = sstate;
 240         }
 241         return (state);
 242 }
 243
 244 static int
 245 g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
 246 {
 247         struct g_raid_softc *sc;
 248         struct g_raid_subdisk *sd, *bestsd, *worstsd;
 249         int i, j, state, sstate;
 250
 251         sc = vol->v_softc;
 252         if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
 253             vol->v_disks_count)
 254                 return (G_RAID_VOLUME_S_OPTIMAL);
 255         for (i = 0; i < vol->v_disks_count; i++) {
 256                 sd = &vol->v_subdisks[i];
 257                 if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
 258                         /* We found reasonable candidate. */
 259                         G_RAID_DEBUG1(1, sc,
 260                             "Promote subdisk %s:%d from %s to STALE.",
 261                             vol->v_name, sd->sd_pos,
 262                             g_raid_subdisk_state2str(sd->sd_state));
 263                         g_raid_change_subdisk_state(sd,
 264                             G_RAID_SUBDISK_S_STALE);
 265                         g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
 266                 }
 267         }
 268         state = G_RAID_VOLUME_S_OPTIMAL;
 269         for (i = 0; i < vol->v_disks_count; i++) {
 270                 bestsd = &vol->v_subdisks[i];
 271                 worstsd = &vol->v_subdisks[i];
 272                 for (j = 1; j < N; j++) {
 273                         sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
 274                         if (sd->sd_state > bestsd->sd_state)
 275                                 bestsd = sd;
 276                         else if (sd->sd_state == bestsd->sd_state &&
 277                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
 278                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
 279                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
 280                                 bestsd = sd;
 281                         if (sd->sd_state < worstsd->sd_state)
 282                                 worstsd = sd;
 283                 }
 284                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
 285                         sstate = G_RAID_VOLUME_S_OPTIMAL;
 286                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
 287                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
 288                 else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
 289                         sstate = G_RAID_VOLUME_S_DEGRADED;
 290                 else
 291                         sstate = G_RAID_VOLUME_S_BROKEN;
 292                 if (sstate < state)
 293                         state = sstate;
 294         }
 295         return (state);
 296 }
 297
 298 static int
 299 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
 300     struct g_raid_subdisk *sd)
 301 {
 302         struct g_raid_tr_raid1e_object *trs;
 303         struct g_raid_softc *sc;
 304         u_int s;
 305
 306         sc = vol->v_softc;
 307         trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
 308         if (trs->trso_stopping &&
 309             (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
 310                 s = G_RAID_VOLUME_S_STOPPED;
 311         else if (trs->trso_starting)
 312                 s = G_RAID_VOLUME_S_STARTING;
 313         else {
 314                 if ((vol->v_disks_count % N) == 0)
 315                         s = g_raid_tr_update_state_raid1e_even(vol);
 316                 else
 317                         s = g_raid_tr_update_state_raid1e_odd(vol);
 318         }
 319         if (s != vol->v_state) {
 320                 g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
 321                     G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
 322                     G_RAID_EVENT_VOLUME);
 323                 g_raid_change_volume_state(vol, s);
 324                 if (!trs->trso_starting && !trs->trso_stopping)
 325                         g_raid_write_metadata(sc, vol, NULL, NULL);
 326         }
 327         if (!trs->trso_starting && !trs->trso_stopping)
 328                 g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
 329         return (0);
 330 }
 331
 332 static void
 333 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
 334     struct g_raid_disk *disk)
 335 {
 336         struct g_raid_volume *vol;
 337
 338         vol = sd->sd_volume;
 339         /*
 340          * We don't fail the last disk in the pack, since it still has decent
 341          * data on it and that's better than failing the disk if it is the root
 342          * file system.
 343          *
 344          * XXX should this be controlled via a tunable?  It makes sense for
 345          * the volume that has / on it.  I can't think of a case where we'd
 346          * want the volume to go away on this kind of event.
 347          */
 348         if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
 349              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
 350              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
 351              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
 352              vol->v_disks_count) &&
 353             (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
 354                 return;
 355         g_raid_fail_disk(sc, sd, disk);
 356 }
 357
 358 static void
 359 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
 360 {
 361         struct g_raid_volume *vol;
 362         struct g_raid_subdisk *sd;
 363
 364         vol = trs->trso_base.tro_volume;
 365         sd = trs->trso_failed_sd;
 366         g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
 367         free(trs->trso_buffer, M_TR_RAID1E);
 368         trs->trso_buffer = NULL;
 369         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 370         trs->trso_type = TR_RAID1E_NONE;
 371         trs->trso_recover_slabs = 0;
 372         trs->trso_failed_sd = NULL;
 373         g_raid_tr_update_state_raid1e(vol, NULL);
 374 }
 375
 376 static void
 377 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
 378 {
 379         struct g_raid_tr_raid1e_object *trs;
 380         struct g_raid_subdisk *sd;
 381
 382         trs = (struct g_raid_tr_raid1e_object *)tr;
 383         sd = trs->trso_failed_sd;
 384         G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
 385             "Subdisk %s:%d-%s rebuild completed.",
 386             sd->sd_volume->v_name, sd->sd_pos,
 387             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
 388         g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
 389         sd->sd_rebuild_pos = 0;
 390         g_raid_tr_raid1e_rebuild_done(trs);
 391 }
 392
 393 static void
 394 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
 395 {
 396         struct g_raid_tr_raid1e_object *trs;
 397         struct g_raid_subdisk *sd;
 398         struct g_raid_volume *vol;
 399
 400         vol = tr->tro_volume;
 401         trs = (struct g_raid_tr_raid1e_object *)tr;
 402         sd = trs->trso_failed_sd;
 403         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
 404                 G_RAID_DEBUG1(1, vol->v_softc,
 405                     "Subdisk %s:%d-%s rebuild is aborting.",
 406                     sd->sd_volume->v_name, sd->sd_pos,
 407                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
 408                 trs->trso_flags |= TR_RAID1E_F_ABORT;
 409         } else {
 410                 G_RAID_DEBUG1(0, vol->v_softc,
 411                     "Subdisk %s:%d-%s rebuild aborted.",
 412                     sd->sd_volume->v_name, sd->sd_pos,
 413                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
 414                 trs->trso_flags &= ~TR_RAID1E_F_ABORT;
 415                 if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
 416                         trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
 417                         g_raid_unlock_range(tr->tro_volume,
 418                             trs->trso_lock_pos, trs->trso_lock_len);
 419                 }
 420                 g_raid_tr_raid1e_rebuild_done(trs);
 421         }
 422 }
 423
 424 static void
 425 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
 426 {
 427         struct g_raid_tr_raid1e_object *trs;
 428         struct g_raid_softc *sc;
 429         struct g_raid_volume *vol;
 430         struct g_raid_subdisk *sd;
 431         struct bio *bp;
 432         off_t len, virtual, vend, offset, start;
 433         int disk, copy, best;
 434
 435         trs = (struct g_raid_tr_raid1e_object *)tr;
 436         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
 437                 return;
 438         vol = tr->tro_volume;
 439         sc = vol->v_softc;
 440         sd = trs->trso_failed_sd;
 441
 442         while (1) {
 443                 if (sd->sd_rebuild_pos >= sd->sd_size) {
 444                         g_raid_tr_raid1e_rebuild_finish(tr);
 445                         return;
 446                 }
 447                 /* Get virtual offset from physical rebuild position. */
 448                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
 449                 /* Get physical offset back to get first stripe position. */
 450                 V2P(vol, virtual, &disk, &offset, &start);
 451                 /* Calculate contignous data length. */
 452                 len = MIN(g_raid1e_rebuild_slab,
 453                     sd->sd_size - sd->sd_rebuild_pos);
 454                 if ((vol->v_disks_count % N) != 0)
 455                         len = MIN(len, vol->v_strip_size - start);
 456                 /* Find disk with most accurate data. */
 457                 best = g_raid_tr_raid1e_select_read_disk(vol, disk,
 458                     offset + start, len, 0);
 459                 if (best < 0) {
 460                         /* There is no any valid disk. */
 461                         g_raid_tr_raid1e_rebuild_abort(tr);
 462                         return;
 463                 } else if (best != copy) {
 464                         /* Some other disk has better data. */
 465                         break;
 466                 }
 467                 /* We have the most accurate data. Skip the range. */
 468                 G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
 469                     sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
 470                 sd->sd_rebuild_pos += len;
 471         }
 472
 473         bp = &trs->trso_bio;
 474         memset(bp, 0, sizeof(*bp));
 475         bp->bio_offset = offset + start +
 476             ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
 477         bp->bio_length = len;
 478         bp->bio_data = trs->trso_buffer;
 479         bp->bio_cmd = BIO_READ;
 480         bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
 481         bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
 482         G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
 483         /*
 484          * If we are crossing stripe boundary, correct affected virtual
 485          * range we should lock.
 486          */
 487         if (start + len > vol->v_strip_size) {
 488                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
 489                 len = vend - virtual;
 490         }
 491         trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
 492         trs->trso_flags |= TR_RAID1E_F_LOCKED;
 493         trs->trso_lock_pos = virtual;
 494         trs->trso_lock_len = len;
 495         /* Lock callback starts I/O */
 496         g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
 497 }
 498
 499 static void
 500 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
 501 {
 502         struct g_raid_volume *vol;
 503         struct g_raid_tr_raid1e_object *trs;
 504         struct g_raid_subdisk *sd;
 505
 506         vol = tr->tro_volume;
 507         trs = (struct g_raid_tr_raid1e_object *)tr;
 508         if (trs->trso_failed_sd) {
 509                 G_RAID_DEBUG1(1, vol->v_softc,
 510                     "Already rebuild in start rebuild. pos %jd\n",
 511                     (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
 512                 return;
 513         }
 514         sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
 515         if (sd == NULL)
 516                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
 517         if (sd == NULL) {
 518                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
 519                 if (sd != NULL) {
 520                         sd->sd_rebuild_pos = 0;
 521                         g_raid_change_subdisk_state(sd,
 522                             G_RAID_SUBDISK_S_RESYNC);
 523                         g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
 524                 } else {
 525                         sd = g_raid_get_subdisk(vol,
 526                             G_RAID_SUBDISK_S_UNINITIALIZED);
 527                         if (sd == NULL)
 528                                 sd = g_raid_get_subdisk(vol,
 529                                     G_RAID_SUBDISK_S_NEW);
 530                         if (sd != NULL) {
 531                                 sd->sd_rebuild_pos = 0;
 532                                 g_raid_change_subdisk_state(sd,
 533                                     G_RAID_SUBDISK_S_REBUILD);
 534                                 g_raid_write_metadata(vol->v_softc,
 535                                     vol, sd, NULL);
 536                         }
 537                 }
 538         }
 539         if (sd == NULL) {
 540                 G_RAID_DEBUG1(1, vol->v_softc,
 541                     "No failed disk to rebuild.  night night.");
 542                 return;
 543         }
 544         trs->trso_failed_sd = sd;
 545         G_RAID_DEBUG1(0, vol->v_softc,
 546             "Subdisk %s:%d-%s rebuild start at %jd.",
 547             sd->sd_volume->v_name, sd->sd_pos,
 548             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
 549             trs->trso_failed_sd->sd_rebuild_pos);
 550         trs->trso_type = TR_RAID1E_REBUILD;
 551         trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
 552         trs->trso_meta_update = g_raid1e_rebuild_meta_update;
 553         g_raid_tr_raid1e_rebuild_some(tr);
 554 }
 555
 556 static void
 557 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
 558     struct g_raid_subdisk *sd)
 559 {
 560         struct g_raid_volume *vol;
 561         struct g_raid_tr_raid1e_object *trs;
 562         int nr;
 563
 564         vol = tr->tro_volume;
 565         trs = (struct g_raid_tr_raid1e_object *)tr;
 566         if (trs->trso_stopping)
 567                 return;
 568         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
 569             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
 570         switch(trs->trso_type) {
 571         case TR_RAID1E_NONE:
 572                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
 573                         return;
 574                 if (nr == 0) {
 575                         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
 576                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
 577                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
 578                         if (nr == 0)
 579                                 return;
 580                 }
 581                 g_raid_tr_raid1e_rebuild_start(tr);
 582                 break;
 583         case TR_RAID1E_REBUILD:
 584                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
 585                     trs->trso_failed_sd == sd)
 586                         g_raid_tr_raid1e_rebuild_abort(tr);
 587                 break;
 588         case TR_RAID1E_RESYNC:
 589                 break;
 590         }
 591 }
 592
 593 static int
 594 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
 595     struct g_raid_subdisk *sd, u_int event)
 596 {
 597
 598         g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
 599         return (0);
 600 }
 601
 602 static int
 603 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
 604 {
 605         struct g_raid_tr_raid1e_object *trs;
 606         struct g_raid_volume *vol;
 607
 608         trs = (struct g_raid_tr_raid1e_object *)tr;
 609         vol = tr->tro_volume;
 610         trs->trso_starting = 0;
 611         g_raid_tr_update_state_raid1e(vol, NULL);
 612         return (0);
 613 }
 614
 615 static int
 616 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
 617 {
 618         struct g_raid_tr_raid1e_object *trs;
 619         struct g_raid_volume *vol;
 620
 621         trs = (struct g_raid_tr_raid1e_object *)tr;
 622         vol = tr->tro_volume;
 623         trs->trso_starting = 0;
 624         trs->trso_stopping = 1;
 625         g_raid_tr_update_state_raid1e(vol, NULL);
 626         return (0);
 627 }
 628
 629 /*
 630  * Select the disk to read from.  Take into account: subdisk state, running
 631  * error recovery, average disk load, head position and possible cache hits.
 632  */
 633 #define ABS(x)          (((x) >= 0) ? (x) : (-(x)))
 634 static int
 635 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
 636     int no, off_t off, off_t len, u_int mask)
 637 {
 638         struct g_raid_subdisk *sd;
 639         off_t offset;
 640         int i, best, prio, bestprio;
 641
 642         best = -1;
 643         bestprio = INT_MAX;
 644         for (i = 0; i < N; i++) {
 645                 sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
 646                 offset = off;
 647                 if (no + i >= vol->v_disks_count)
 648                         offset += vol->v_strip_size;
 649
 650                 prio = G_RAID_SUBDISK_LOAD(sd);
 651                 if ((mask & (1 << sd->sd_pos)) != 0)
 652                         continue;
 653                 switch (sd->sd_state) {
 654                 case G_RAID_SUBDISK_S_ACTIVE:
 655                         break;
 656                 case G_RAID_SUBDISK_S_RESYNC:
 657                         if (offset + off < sd->sd_rebuild_pos)
 658                                 break;
 659                         /* FALLTHROUGH */
 660                 case G_RAID_SUBDISK_S_STALE:
 661                         prio += i << 24;
 662                         break;
 663                 case G_RAID_SUBDISK_S_REBUILD:
 664                         if (offset + off < sd->sd_rebuild_pos)
 665                                 break;
 666                         /* FALLTHROUGH */
 667                 default:
 668                         continue;
 669                 }
 670                 prio += min(sd->sd_recovery, 255) << 16;
 671                 /* If disk head is precisely in position - highly prefer it. */
 672                 if (G_RAID_SUBDISK_POS(sd) == offset)
 673                         prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
 674                 else
 675                 /* If disk head is close to position - prefer it. */
 676                 if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
 677                     G_RAID_SUBDISK_TRACK_SIZE)
 678                         prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
 679                 if (prio < bestprio) {
 680                         bestprio = prio;
 681                         best = i;
 682                 }
 683         }
 684         return (best);
 685 }
 686
 687 static void
 688 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
 689 {
 690         struct g_raid_volume *vol;
 691         struct g_raid_subdisk *sd;
 692         struct bio_queue_head queue;
 693         struct bio *cbp;
 694         char *addr;
 695         off_t offset, start, length, remain;
 696         u_int no, strip_size;
 697         int best;
 698
 699         vol = tr->tro_volume;
 700         if ((bp->bio_flags & BIO_UNMAPPED) != 0)
 701                 addr = NULL;
 702         else
 703                 addr = bp->bio_data;
 704         strip_size = vol->v_strip_size;
 705         V2P(vol, bp->bio_offset, &no, &offset, &start);
 706         remain = bp->bio_length;
 707         bioq_init(&queue);
 708         while (remain > 0) {
 709                 length = MIN(strip_size - start, remain);
 710                 best = g_raid_tr_raid1e_select_read_disk(vol,
 711                     no, offset, length, 0);
 712                 KASSERT(best >= 0, ("No readable disk in volume %s!",
 713                     vol->v_name));
 714                 no += best;
 715                 if (no >= vol->v_disks_count) {
 716                         no -= vol->v_disks_count;
 717                         offset += strip_size;
 718                 }
 719                 cbp = g_clone_bio(bp);
 720                 if (cbp == NULL)
 721                         goto failure;
 722                 cbp->bio_offset = offset + start;
 723                 cbp->bio_length = length;
 724                 if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 725                         cbp->bio_ma_offset += (uintptr_t)addr;
 726                         cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
 727                         cbp->bio_ma_offset %= PAGE_SIZE;
 728                         cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
 729                             cbp->bio_length) / PAGE_SIZE;
 730                 } else
 731                         cbp->bio_data = addr;
 732                 cbp->bio_caller1 = &vol->v_subdisks[no];
 733                 bioq_insert_tail(&queue, cbp);
 734                 no += N - best;
 735                 if (no >= vol->v_disks_count) {
 736                         no -= vol->v_disks_count;
 737                         offset += strip_size;
 738                 }
 739                 remain -= length;
 740                 addr += length;
 741                 start = 0;
 742         }
 743         while ((cbp = bioq_takefirst(&queue)) != NULL) {
 744                 sd = cbp->bio_caller1;
 745                 cbp->bio_caller1 = NULL;
 746                 g_raid_subdisk_iostart(sd, cbp);
 747         }
 748         return;
 749 failure:
 750         while ((cbp = bioq_takefirst(&queue)) != NULL)
 751                 g_destroy_bio(cbp);
 752         if (bp->bio_error == 0)
 753                 bp->bio_error = ENOMEM;
 754         g_raid_iodone(bp, bp->bio_error);
 755 }
 756
 757 static void
 758 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
 759 {
 760         struct g_raid_volume *vol;
 761         struct g_raid_subdisk *sd;
 762         struct bio_queue_head queue;
 763         struct bio *cbp;
 764         char *addr;
 765         off_t offset, start, length, remain;
 766         u_int no, strip_size;
 767         int i;
 768
 769         vol = tr->tro_volume;
 770         if ((bp->bio_flags & BIO_UNMAPPED) != 0)
 771                 addr = NULL;
 772         else
 773                 addr = bp->bio_data;
 774         strip_size = vol->v_strip_size;
 775         V2P(vol, bp->bio_offset, &no, &offset, &start);
 776         remain = bp->bio_length;
 777         bioq_init(&queue);
 778         while (remain > 0) {
 779                 length = MIN(strip_size - start, remain);
 780                 for (i = 0; i < N; i++) {
 781                         sd = &vol->v_subdisks[no];
 782                         switch (sd->sd_state) {
 783                         case G_RAID_SUBDISK_S_ACTIVE:
 784                         case G_RAID_SUBDISK_S_STALE:
 785                         case G_RAID_SUBDISK_S_RESYNC:
 786                                 break;
 787                         case G_RAID_SUBDISK_S_REBUILD:
 788                                 if (offset + start >= sd->sd_rebuild_pos)
 789                                         goto nextdisk;
 790                                 break;
 791                         default:
 792                                 goto nextdisk;
 793                         }
 794                         cbp = g_clone_bio(bp);
 795                         if (cbp == NULL)
 796                                 goto failure;
 797                         cbp->bio_offset = offset + start;
 798                         cbp->bio_length = length;
 799                         if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
 800                             bp->bio_cmd != BIO_DELETE) {
 801                                 cbp->bio_ma_offset += (uintptr_t)addr;
 802                                 cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
 803                                 cbp->bio_ma_offset %= PAGE_SIZE;
 804                                 cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
 805                                     cbp->bio_length) / PAGE_SIZE;
 806                         } else
 807                                 cbp->bio_data = addr;
 808                         cbp->bio_caller1 = sd;
 809                         bioq_insert_tail(&queue, cbp);
 810 nextdisk:
 811                         if (++no >= vol->v_disks_count) {
 812                                 no = 0;
 813                                 offset += strip_size;
 814                         }
 815                 }
 816                 remain -= length;
 817                 if (bp->bio_cmd != BIO_DELETE)
 818                         addr += length;
 819                 start = 0;
 820         }
 821         while ((cbp = bioq_takefirst(&queue)) != NULL) {
 822                 sd = cbp->bio_caller1;
 823                 cbp->bio_caller1 = NULL;
 824                 g_raid_subdisk_iostart(sd, cbp);
 825         }
 826         return;
 827 failure:
 828         while ((cbp = bioq_takefirst(&queue)) != NULL)
 829                 g_destroy_bio(cbp);
 830         if (bp->bio_error == 0)
 831                 bp->bio_error = ENOMEM;
 832         g_raid_iodone(bp, bp->bio_error);
 833 }
 834
 835 static void
 836 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
 837 {
 838         struct g_raid_volume *vol;
 839         struct g_raid_tr_raid1e_object *trs;
 840
 841         vol = tr->tro_volume;
 842         trs = (struct g_raid_tr_raid1e_object *)tr;
 843         if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
 844             vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
 845             vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
 846                 g_raid_iodone(bp, EIO);
 847                 return;
 848         }
 849         /*
 850          * If we're rebuilding, squeeze in rebuild activity every so often,
 851          * even when the disk is busy.  Be sure to only count real I/O
 852          * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
 853          * by this module.
 854          */
 855         if (trs->trso_failed_sd != NULL &&
 856             !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
 857                 /* Make this new or running now round short. */
 858                 trs->trso_recover_slabs = 0;
 859                 if (--trs->trso_fair_io <= 0) {
 860                         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
 861                         g_raid_tr_raid1e_rebuild_some(tr);
 862                 }
 863         }
 864         switch (bp->bio_cmd) {
 865         case BIO_READ:
 866                 g_raid_tr_iostart_raid1e_read(tr, bp);
 867                 break;
 868         case BIO_WRITE:
 869         case BIO_DELETE:
 870                 g_raid_tr_iostart_raid1e_write(tr, bp);
 871                 break;
 872         case BIO_SPEEDUP:
 873         case BIO_FLUSH:
 874                 g_raid_tr_flush_common(tr, bp);
 875                 break;
 876         default:
 877                 KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
 878                     bp->bio_cmd, vol->v_name));
 879                 break;
 880         }
 881 }
 882
 883 static void
 884 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
 885     struct g_raid_subdisk *sd, struct bio *bp)
 886 {
 887         struct bio *cbp;
 888         struct g_raid_subdisk *nsd;
 889         struct g_raid_volume *vol;
 890         struct bio *pbp;
 891         struct g_raid_tr_raid1e_object *trs;
 892         off_t virtual, offset, start;
 893         uintptr_t mask;
 894         int error, do_write, copy, disk, best;
 895
 896         trs = (struct g_raid_tr_raid1e_object *)tr;
 897         vol = tr->tro_volume;
 898         if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
 899                 if (trs->trso_type == TR_RAID1E_REBUILD) {
 900                         nsd = trs->trso_failed_sd;
 901                         if (bp->bio_cmd == BIO_READ) {
 902                                 /* Immediately abort rebuild, if requested. */
 903                                 if (trs->trso_flags & TR_RAID1E_F_ABORT) {
 904                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 905                                         g_raid_tr_raid1e_rebuild_abort(tr);
 906                                         return;
 907                                 }
 908
 909                                 /* On read error, skip and cross fingers. */
 910                                 if (bp->bio_error != 0) {
 911                                         G_RAID_LOGREQ(0, bp,
 912                                             "Read error during rebuild (%d), "
 913                                             "possible data loss!",
 914                                             bp->bio_error);
 915                                         goto rebuild_round_done;
 916                                 }
 917
 918                                 /*
 919                                  * The read operation finished, queue the
 920                                  * write and get out.
 921                                  */
 922                                 G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
 923                                     bp->bio_error);
 924                                 bp->bio_cmd = BIO_WRITE;
 925                                 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
 926                                 bp->bio_offset = nsd->sd_rebuild_pos;
 927                                 G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
 928                                 g_raid_subdisk_iostart(nsd, bp);
 929                         } else {
 930                                 /*
 931                                  * The write operation just finished.  Do
 932                                  * another.  We keep cloning the master bio
 933                                  * since it has the right buffers allocated to
 934                                  * it.
 935                                  */
 936                                 G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
 937                                     bp->bio_error);
 938                                 if (bp->bio_error != 0 ||
 939                                     trs->trso_flags & TR_RAID1E_F_ABORT) {
 940                                         if ((trs->trso_flags &
 941                                             TR_RAID1E_F_ABORT) == 0) {
 942                                                 g_raid_tr_raid1e_fail_disk(sd->sd_softc,
 943                                                     nsd, nsd->sd_disk);
 944                                         }
 945                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 946                                         g_raid_tr_raid1e_rebuild_abort(tr);
 947                                         return;
 948                                 }
 949 rebuild_round_done:
 950                                 trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
 951                                 g_raid_unlock_range(tr->tro_volume,
 952                                     trs->trso_lock_pos, trs->trso_lock_len);
 953                                 nsd->sd_rebuild_pos += bp->bio_length;
 954                                 if (nsd->sd_rebuild_pos >= nsd->sd_size) {
 955                                         g_raid_tr_raid1e_rebuild_finish(tr);
 956                                         return;
 957                                 }
 958
 959                                 /* Abort rebuild if we are stopping */
 960                                 if (trs->trso_stopping) {
 961                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 962                                         g_raid_tr_raid1e_rebuild_abort(tr);
 963                                         return;
 964                                 }
 965
 966                                 if (--trs->trso_meta_update <= 0) {
 967                                         g_raid_write_metadata(vol->v_softc,
 968                                             vol, nsd, nsd->sd_disk);
 969                                         trs->trso_meta_update =
 970                                             g_raid1e_rebuild_meta_update;
 971                                         /* Compensate short rebuild I/Os. */
 972                                         if ((vol->v_disks_count % N) != 0 &&
 973                                             vol->v_strip_size <
 974                                              g_raid1e_rebuild_slab) {
 975                                                 trs->trso_meta_update *=
 976                                                     g_raid1e_rebuild_slab;
 977                                                 trs->trso_meta_update /=
 978                                                     vol->v_strip_size;
 979                                         }
 980                                 }
 981                                 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 982                                 if (--trs->trso_recover_slabs <= 0)
 983                                         return;
 984                                 /* Run next rebuild iteration. */
 985                                 g_raid_tr_raid1e_rebuild_some(tr);
 986                         }
 987                 } else if (trs->trso_type == TR_RAID1E_RESYNC) {
 988                         /*
 989                          * read good sd, read bad sd in parallel.  when both
 990                          * done, compare the buffers.  write good to the bad
 991                          * if different.  do the next bit of work.
 992                          */
 993                         panic("Somehow, we think we're doing a resync");
 994                 }
 995                 return;
 996         }
 997         pbp = bp->bio_parent;
 998         pbp->bio_inbed++;
 999         mask = (intptr_t)bp->bio_caller2;
1000         if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
1001                 /*
1002                  * Read failed on first drive.  Retry the read error on
1003                  * another disk drive, if available, before erroring out the
1004                  * read.
1005                  */
1006                 sd->sd_disk->d_read_errs++;
1007                 G_RAID_LOGREQ(0, bp,
1008                     "Read error (%d), %d read errors total",
1009                     bp->bio_error, sd->sd_disk->d_read_errs);
1010
1011                 /*
1012                  * If there are too many read errors, we move to degraded.
1013                  * XXX Do we want to FAIL the drive (eg, make the user redo
1014                  * everything to get it back in sync), or just degrade the
1015                  * drive, which kicks off a resync?
1016                  */
1017                 do_write = 0;
1018                 if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
1019                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1020                 else if (mask == 0)
1021                         do_write = 1;
1022
1023                 /* Restore what we were doing. */
1024                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1025                 V2P(vol, virtual, &disk, &offset, &start);
1026
1027                 /* Find the other disk, and try to do the I/O to it. */
1028                 mask |= 1 << copy;
1029                 best = g_raid_tr_raid1e_select_read_disk(vol,
1030                     disk, offset, start, mask);
1031                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1032                         disk += best;
1033                         if (disk >= vol->v_disks_count) {
1034                                 disk -= vol->v_disks_count;
1035                                 offset += vol->v_strip_size;
1036                         }
1037                         cbp->bio_offset = offset + start;
1038                         cbp->bio_length = bp->bio_length;
1039                         cbp->bio_data = bp->bio_data;
1040                         cbp->bio_ma = bp->bio_ma;
1041                         cbp->bio_ma_offset = bp->bio_ma_offset;
1042                         cbp->bio_ma_n = bp->bio_ma_n;
1043                         g_destroy_bio(bp);
1044                         nsd = &vol->v_subdisks[disk];
1045                         G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
1046                             nsd->sd_pos);
1047                         if (do_write)
1048                                 mask |= 1 << 31;
1049                         if ((mask & (1U << 31)) != 0)
1050                                 sd->sd_recovery++;
1051                         cbp->bio_caller2 = (void *)mask;
1052                         if (do_write) {
1053                                 cbp->bio_caller1 = nsd;
1054                                 /* Lock callback starts I/O */
1055                                 g_raid_lock_range(sd->sd_volume,
1056                                     virtual, cbp->bio_length, pbp, cbp);
1057                         } else {
1058                                 g_raid_subdisk_iostart(nsd, cbp);
1059                         }
1060                         return;
1061                 }
1062                 /*
1063                  * We can't retry.  Return the original error by falling
1064                  * through.  This will happen when there's only one good disk.
1065                  * We don't need to fail the raid, since its actual state is
1066                  * based on the state of the subdisks.
1067                  */
1068                 G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
1069         }
1070         if (bp->bio_cmd == BIO_READ &&
1071             bp->bio_error == 0 &&
1072             (mask & (1U << 31)) != 0) {
1073                 G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
1074
1075                 /* Restore what we were doing. */
1076                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1077                 V2P(vol, virtual, &disk, &offset, &start);
1078
1079                 /* Find best disk to write. */
1080                 best = g_raid_tr_raid1e_select_read_disk(vol,
1081                     disk, offset, start, ~mask);
1082                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1083                         disk += best;
1084                         if (disk >= vol->v_disks_count) {
1085                                 disk -= vol->v_disks_count;
1086                                 offset += vol->v_strip_size;
1087                         }
1088                         cbp->bio_offset = offset + start;
1089                         cbp->bio_cmd = BIO_WRITE;
1090                         cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
1091                         cbp->bio_caller2 = (void *)mask;
1092                         g_destroy_bio(bp);
1093                         G_RAID_LOGREQ(2, cbp,
1094                             "Attempting bad sector remap on failing drive.");
1095                         g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
1096                         return;
1097                 }
1098         }
1099         if ((mask & (1U << 31)) != 0) {
1100                 /*
1101                  * We're done with a recovery, mark the range as unlocked.
1102                  * For any write errors, we aggressively fail the disk since
1103                  * there was both a READ and a WRITE error at this location.
1104                  * Both types of errors generally indicates the drive is on
1105                  * the verge of total failure anyway.  Better to stop trusting
1106                  * it now.  However, we need to reset error to 0 in that case
1107                  * because we're not failing the original I/O which succeeded.
1108                  */
1109
1110                 /* Restore what we were doing. */
1111                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1112                 V2P(vol, virtual, &disk, &offset, &start);
1113
1114                 for (copy = 0; copy < N; copy++) {
1115                         if ((mask & (1 << copy) ) != 0)
1116                                 vol->v_subdisks[(disk + copy) %
1117                                     vol->v_disks_count].sd_recovery--;
1118                 }
1119
1120                 if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
1121                         G_RAID_LOGREQ(0, bp, "Remap write failed: "
1122                             "failing subdisk.");
1123                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1124                         bp->bio_error = 0;
1125                 }
1126                 G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
1127                 g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
1128         }
1129         if (pbp->bio_cmd != BIO_READ) {
1130                 if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
1131                         pbp->bio_error = bp->bio_error;
1132                 if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
1133                         G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
1134                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1135                 }
1136                 error = pbp->bio_error;
1137         } else
1138                 error = bp->bio_error;
1139         g_destroy_bio(bp);
1140         if (pbp->bio_children == pbp->bio_inbed) {
1141                 pbp->bio_completed = pbp->bio_length;
1142                 g_raid_iodone(pbp, error);
1143         }
1144 }
1145
1146 static int
1147 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
1148     void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
1149 {
1150         struct g_raid_volume *vol;
1151         struct g_raid_subdisk *sd;
1152         struct bio_queue_head queue;
1153         char *addr;
1154         off_t offset, start, length, remain;
1155         u_int no, strip_size;
1156         int i, error;
1157
1158         vol = tr->tro_volume;
1159         addr = virtual;
1160         strip_size = vol->v_strip_size;
1161         V2P(vol, boffset, &no, &offset, &start);
1162         remain = blength;
1163         bioq_init(&queue);
1164         while (remain > 0) {
1165                 length = MIN(strip_size - start, remain);
1166                 for (i = 0; i < N; i++) {
1167                         sd = &vol->v_subdisks[no];
1168                         switch (sd->sd_state) {
1169                         case G_RAID_SUBDISK_S_ACTIVE:
1170                         case G_RAID_SUBDISK_S_STALE:
1171                         case G_RAID_SUBDISK_S_RESYNC:
1172                                 break;
1173                         case G_RAID_SUBDISK_S_REBUILD:
1174                                 if (offset + start >= sd->sd_rebuild_pos)
1175                                         goto nextdisk;
1176                                 break;
1177                         default:
1178                                 goto nextdisk;
1179                         }
1180                         error = g_raid_subdisk_kerneldump(sd,
1181                             addr, 0, offset + start, length);
1182                         if (error != 0)
1183                                 return (error);
1184 nextdisk:
1185                         if (++no >= vol->v_disks_count) {
1186                                 no = 0;
1187                                 offset += strip_size;
1188                         }
1189                 }
1190                 remain -= length;
1191                 addr += length;
1192                 start = 0;
1193         }
1194         return (0);
1195 }
1196
1197 static int
1198 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
1199 {
1200         struct bio *bp;
1201         struct g_raid_subdisk *sd;
1202
1203         bp = (struct bio *)argp;
1204         sd = (struct g_raid_subdisk *)bp->bio_caller1;
1205         g_raid_subdisk_iostart(sd, bp);
1206
1207         return (0);
1208 }
1209
1210 static int
1211 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
1212 {
1213         struct g_raid_tr_raid1e_object *trs;
1214         struct g_raid_volume *vol;
1215
1216         vol = tr->tro_volume;
1217         trs = (struct g_raid_tr_raid1e_object *)tr;
1218         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
1219         trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
1220         /* Compensate short rebuild I/Os. */
1221         if ((vol->v_disks_count % N) != 0 &&
1222             vol->v_strip_size < g_raid1e_rebuild_slab) {
1223                 trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
1224                 trs->trso_recover_slabs /= vol->v_strip_size;
1225         }
1226         if (trs->trso_type == TR_RAID1E_REBUILD)
1227                 g_raid_tr_raid1e_rebuild_some(tr);
1228         return (0);
1229 }
1230
1231 static int
1232 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
1233 {
1234         struct g_raid_tr_raid1e_object *trs;
1235
1236         trs = (struct g_raid_tr_raid1e_object *)tr;
1237
1238         if (trs->trso_buffer != NULL) {
1239                 free(trs->trso_buffer, M_TR_RAID1E);
1240                 trs->trso_buffer = NULL;
1241         }
1242         return (0);
1243 }
1244
1245 G_RAID_TR_DECLARE(raid1e, "RAID1E");