1 /*-
2  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 #include <sys/param.h>
31 #include <sys/bio.h>
32 #include <sys/endian.h>
33 #include <sys/kernel.h>
34 #include <sys/kobj.h>
35 #include <sys/limits.h>
36 #include <sys/lock.h>
37 #include <sys/malloc.h>
38 #include <sys/mutex.h>
39 #include <sys/sysctl.h>
40 #include <sys/systm.h>
41 #include <geom/geom.h>
42 #include "geom/raid/g_raid.h"
43 #include "g_raid_tr_if.h"
44
45 #define N       2
46
47 SYSCTL_DECL(_kern_geom_raid_raid1e);
48
49 #define RAID1E_REBUILD_SLAB     (1 << 20) /* One transaction in a rebuild */
50 static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
51 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
52     &g_raid1e_rebuild_slab, 0,
53     "Amount of the disk to rebuild each read/write cycle of the rebuild.");
54
55 #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
56 static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
57 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
58     &g_raid1e_rebuild_fair_io, 0,
59     "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
60
61 #define RAID1E_REBUILD_CLUSTER_IDLE 100
62 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
63 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
64     &g_raid1e_rebuild_cluster_idle, 0,
65     "Number of slabs to do each time we trigger a rebuild cycle");
66
67 #define RAID1E_REBUILD_META_UPDATE 1024 /* update metadata every 1GB or so */
68 static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
69 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
70     &g_raid1e_rebuild_meta_update, 0,
71     "When to update the meta data.");
72
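/*
 * All of the above are exported with CTLFLAG_RWTUN, so they can be preset as
 * loader tunables or adjusted at runtime with sysctl(8), e.g. (illustrative
 * values only, not defaults from this file):
 *
 *	# sysctl kern.geom.raid.raid1e.rebuild_fair_io=10
 *	# sysctl kern.geom.raid.raid1e.rebuild_slab_size=2097152
 */
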
73 static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
74
75 #define TR_RAID1E_NONE 0
76 #define TR_RAID1E_REBUILD 1
77 #define TR_RAID1E_RESYNC 2
78
79 #define TR_RAID1E_F_DOING_SOME  0x1
80 #define TR_RAID1E_F_LOCKED      0x2
81 #define TR_RAID1E_F_ABORT       0x4
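/*
 * Rebuild/resync bookkeeping flags, as used below: DOING_SOME means a rebuild
 * I/O issued by this module is currently in flight, LOCKED means the rebuild
 * holds a range lock described by trso_lock_pos/trso_lock_len, and ABORT
 * requests that the current rebuild round stop once the in-flight I/O
 * completes.
 */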
82
83 struct g_raid_tr_raid1e_object {
84         struct g_raid_tr_object  trso_base;
85         int                      trso_starting;
86         int                      trso_stopping;
87         int                      trso_type;
88         int                      trso_recover_slabs; /* slabs before rest */
89         int                      trso_fair_io;
90         int                      trso_meta_update;
91         int                      trso_flags;
92         struct g_raid_subdisk   *trso_failed_sd; /* like per volume */
93         void                    *trso_buffer;    /* Buffer space */
94         off_t                    trso_lock_pos; /* Locked range start. */
95         off_t                    trso_lock_len; /* Locked range length. */
96         struct bio               trso_bio;
97 };
98
99 static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
100 static g_raid_tr_event_t g_raid_tr_event_raid1e;
101 static g_raid_tr_start_t g_raid_tr_start_raid1e;
102 static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
103 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
104 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
105 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
106 static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
107 static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
108 static g_raid_tr_free_t g_raid_tr_free_raid1e;
109
110 static kobj_method_t g_raid_tr_raid1e_methods[] = {
111         KOBJMETHOD(g_raid_tr_taste,     g_raid_tr_taste_raid1e),
112         KOBJMETHOD(g_raid_tr_event,     g_raid_tr_event_raid1e),
113         KOBJMETHOD(g_raid_tr_start,     g_raid_tr_start_raid1e),
114         KOBJMETHOD(g_raid_tr_stop,      g_raid_tr_stop_raid1e),
115         KOBJMETHOD(g_raid_tr_iostart,   g_raid_tr_iostart_raid1e),
116         KOBJMETHOD(g_raid_tr_iodone,    g_raid_tr_iodone_raid1e),
117         KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
118         KOBJMETHOD(g_raid_tr_locked,    g_raid_tr_locked_raid1e),
119         KOBJMETHOD(g_raid_tr_idle,      g_raid_tr_idle_raid1e),
120         KOBJMETHOD(g_raid_tr_free,      g_raid_tr_free_raid1e),
121         { 0, 0 }
122 };
123
124 static struct g_raid_tr_class g_raid_tr_raid1e_class = {
125         "RAID1E",
126         g_raid_tr_raid1e_methods,
127         sizeof(struct g_raid_tr_raid1e_object),
128         .trc_enable = 1,
129         .trc_priority = 200,
130         .trc_accept_unmapped = 1
131 };
132
133 static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
134 static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
135     struct g_raid_subdisk *sd);
136 static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
137     int no, off_t off, off_t len, u_int mask);
138
139 static inline void
140 V2P(struct g_raid_volume *vol, off_t virt,
141     int *disk, off_t *offset, off_t *start)
142 {
143         off_t nstrip;
144         u_int strip_size;
145
146         strip_size = vol->v_strip_size;
147         /* Strip number. */
148         nstrip = virt / strip_size;
149         /* Start position in strip. */
150         *start = virt % strip_size;
151         /* Disk number. */
152         *disk = (nstrip * N) % vol->v_disks_count;
153         /* Strip start position in disk. */
154         *offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
155 }
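/*
 * For illustration only (the 3-disk volume and 64 KiB strip below are example
 * values, not anything defined in this file): with v_disks_count = 3 and
 * v_strip_size = 65536, the virtual offset 200704 = 3 * 65536 + 4096 gives
 * nstrip = 3 and start = 4096, so disk = (3 * 2) % 3 = 0 and
 * offset = ((3 * 2) / 3) * 65536 = 131072.  The first copy of that byte
 * therefore lives on disk 0 at 131072 + 4096; the second copy is the next
 * physical strip, on disk 1 at the same offset.
 */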
156
157 static inline void
158 P2V(struct g_raid_volume *vol, int disk, off_t offset,
159     off_t *virt, int *copy)
160 {
161         off_t nstrip, start;
162         u_int strip_size;
163
164         strip_size = vol->v_strip_size;
165         /* Start position in strip. */
166         start = offset % strip_size;
167         /* Physical strip number. */
168         nstrip = (offset / strip_size) * vol->v_disks_count + disk;
169         /* Number of physical strip (copy) inside virtual strip. */
170         *copy = nstrip % N;
171         /* Offset in virtual space. */
172         *virt = (nstrip / N) * strip_size + start;
173 }
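/*
 * Continuing the illustrative example above, P2V is the inverse mapping: for
 * disk 1 at physical offset 135168 = 2 * 65536 + 4096, start = 4096 and
 * nstrip = 2 * 3 + 1 = 7, so copy = 7 % 2 = 1 and
 * virt = (7 / 2) * 65536 + 4096 = 200704, matching the V2P example.
 */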
174
175 static int
176 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
177 {
178         struct g_raid_tr_raid1e_object *trs;
179
180         trs = (struct g_raid_tr_raid1e_object *)tr;
181         if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
182             tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
183                 return (G_RAID_TR_TASTE_FAIL);
184         trs->trso_starting = 1;
185         return (G_RAID_TR_TASTE_SUCCEED);
186 }
187
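/*
 * With an even number of disks every virtual strip and its copies always fall
 * on the same fixed group of N consecutive subdisks, so the volume state is
 * computed per group: promote the best usable subdisk of each group to ACTIVE
 * if needed, derive a state from the group's best/worst members, and reduce
 * the volume to the worst group state.
 */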
188 static int
189 g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
190 {
191         struct g_raid_softc *sc;
192         struct g_raid_subdisk *sd, *bestsd, *worstsd;
193         int i, j, state, sstate;
194
195         sc = vol->v_softc;
196         state = G_RAID_VOLUME_S_OPTIMAL;
197         for (i = 0; i < vol->v_disks_count / N; i++) {
198                 bestsd = &vol->v_subdisks[i * N];
199                 for (j = 1; j < N; j++) {
200                         sd = &vol->v_subdisks[i * N + j];
201                         if (sd->sd_state > bestsd->sd_state)
202                                 bestsd = sd;
203                         else if (sd->sd_state == bestsd->sd_state &&
204                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
205                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
206                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
207                                 bestsd = sd;
208                 }
209                 if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
210                     bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
211                         /* We found a reasonable candidate. */
212                         G_RAID_DEBUG1(1, sc,
213                             "Promote subdisk %s:%d from %s to ACTIVE.",
214                             vol->v_name, bestsd->sd_pos,
215                             g_raid_subdisk_state2str(bestsd->sd_state));
216                         g_raid_change_subdisk_state(bestsd,
217                             G_RAID_SUBDISK_S_ACTIVE);
218                         g_raid_write_metadata(sc,
219                             vol, bestsd, bestsd->sd_disk);
220                 }
221                 worstsd = &vol->v_subdisks[i * N];
222                 for (j = 1; j < N; j++) {
223                         sd = &vol->v_subdisks[i * N + j];
224                         if (sd->sd_state < worstsd->sd_state)
225                                 worstsd = sd;
226                 }
227                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
228                         sstate = G_RAID_VOLUME_S_OPTIMAL;
229                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
230                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
231                 else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
232                         sstate = G_RAID_VOLUME_S_DEGRADED;
233                 else
234                         sstate = G_RAID_VOLUME_S_BROKEN;
235                 if (sstate < state)
236                         state = sstate;
237         }
238         return (state);
239 }
240
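/*
 * With an odd number of disks the copies rotate across all subdisks, so there
 * are no fixed mirror groups.  UNINITIALIZED subdisks are first promoted to
 * STALE, and the state is then derived from the best/worst subdisk in every
 * window of N consecutive subdisks (wrapping around), again reducing the
 * volume to the worst case.
 */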
241 static int
242 g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
243 {
244         struct g_raid_softc *sc;
245         struct g_raid_subdisk *sd, *bestsd, *worstsd;
246         int i, j, state, sstate;
247
248         sc = vol->v_softc;
249         if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
250             vol->v_disks_count)
251                 return (G_RAID_VOLUME_S_OPTIMAL);
252         for (i = 0; i < vol->v_disks_count; i++) {
253                 sd = &vol->v_subdisks[i];
254                 if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
255                         /* We found a reasonable candidate. */
256                         G_RAID_DEBUG1(1, sc,
257                             "Promote subdisk %s:%d from %s to STALE.",
258                             vol->v_name, sd->sd_pos,
259                             g_raid_subdisk_state2str(sd->sd_state));
260                         g_raid_change_subdisk_state(sd,
261                             G_RAID_SUBDISK_S_STALE);
262                         g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
263                 }
264         }
265         state = G_RAID_VOLUME_S_OPTIMAL;
266         for (i = 0; i < vol->v_disks_count; i++) {
267                 bestsd = &vol->v_subdisks[i];
268                 worstsd = &vol->v_subdisks[i];
269                 for (j = 1; j < N; j++) {
270                         sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
271                         if (sd->sd_state > bestsd->sd_state)
272                                 bestsd = sd;
273                         else if (sd->sd_state == bestsd->sd_state &&
274                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
275                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
276                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
277                                 bestsd = sd;
278                         if (sd->sd_state < worstsd->sd_state)
279                                 worstsd = sd;
280                 }
281                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
282                         sstate = G_RAID_VOLUME_S_OPTIMAL;
283                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
284                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
285                 else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
286                         sstate = G_RAID_VOLUME_S_DEGRADED;
287                 else
288                         sstate = G_RAID_VOLUME_S_BROKEN;
289                 if (sstate < state)
290                         state = sstate;
291         }
292         return (state);
293 }
294
295 static int
296 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
297     struct g_raid_subdisk *sd)
298 {
299         struct g_raid_tr_raid1e_object *trs;
300         struct g_raid_softc *sc;
301         u_int s;
302
303         sc = vol->v_softc;
304         trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
305         if (trs->trso_stopping &&
306             (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
307                 s = G_RAID_VOLUME_S_STOPPED;
308         else if (trs->trso_starting)
309                 s = G_RAID_VOLUME_S_STARTING;
310         else {
311                 if ((vol->v_disks_count % N) == 0)
312                         s = g_raid_tr_update_state_raid1e_even(vol);
313                 else
314                         s = g_raid_tr_update_state_raid1e_odd(vol);
315         }
316         if (s != vol->v_state) {
317                 g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
318                     G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
319                     G_RAID_EVENT_VOLUME);
320                 g_raid_change_volume_state(vol, s);
321                 if (!trs->trso_starting && !trs->trso_stopping)
322                         g_raid_write_metadata(sc, vol, NULL, NULL);
323         }
324         if (!trs->trso_starting && !trs->trso_stopping)
325                 g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
326         return (0);
327 }
328
329 static void
330 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
331     struct g_raid_disk *disk)
332 {
333         struct g_raid_volume *vol;
334
335         vol = sd->sd_volume;
336         /*
337          * We don't fail the last disk in the pack, since it still has decent
338          * data on it and that's better than failing the disk if it is the root
339          * file system.
340          *
341          * XXX should this be controlled via a tunable?  It makes sense for
342          * the volume that has / on it.  I can't think of a case where we'd
343          * want the volume to go away on this kind of event.
344          */
345         if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
346              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
347              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
348              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
349              vol->v_disks_count) &&
350             (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
351                 return;
352         g_raid_fail_disk(sc, sd, disk);
353 }
354
355 static void
356 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
357 {
358         struct g_raid_volume *vol;
359         struct g_raid_subdisk *sd;
360
361         vol = trs->trso_base.tro_volume;
362         sd = trs->trso_failed_sd;
363         g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
364         free(trs->trso_buffer, M_TR_RAID1E);
365         trs->trso_buffer = NULL;
366         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
367         trs->trso_type = TR_RAID1E_NONE;
368         trs->trso_recover_slabs = 0;
369         trs->trso_failed_sd = NULL;
370         g_raid_tr_update_state_raid1e(vol, NULL);
371 }
372
373 static void
374 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
375 {
376         struct g_raid_tr_raid1e_object *trs;
377         struct g_raid_subdisk *sd;
378
379         trs = (struct g_raid_tr_raid1e_object *)tr;
380         sd = trs->trso_failed_sd;
381         G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
382             "Subdisk %s:%d-%s rebuild completed.",
383             sd->sd_volume->v_name, sd->sd_pos,
384             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
385         g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
386         sd->sd_rebuild_pos = 0;
387         g_raid_tr_raid1e_rebuild_done(trs);
388 }
389
390 static void
391 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
392 {
393         struct g_raid_tr_raid1e_object *trs;
394         struct g_raid_subdisk *sd;
395         struct g_raid_volume *vol;
396
397         vol = tr->tro_volume;
398         trs = (struct g_raid_tr_raid1e_object *)tr;
399         sd = trs->trso_failed_sd;
400         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
401                 G_RAID_DEBUG1(1, vol->v_softc,
402                     "Subdisk %s:%d-%s rebuild is aborting.",
403                     sd->sd_volume->v_name, sd->sd_pos,
404                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
405                 trs->trso_flags |= TR_RAID1E_F_ABORT;
406         } else {
407                 G_RAID_DEBUG1(0, vol->v_softc,
408                     "Subdisk %s:%d-%s rebuild aborted.",
409                     sd->sd_volume->v_name, sd->sd_pos,
410                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
411                 trs->trso_flags &= ~TR_RAID1E_F_ABORT;
412                 if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
413                         trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
414                         g_raid_unlock_range(tr->tro_volume,
415                             trs->trso_lock_pos, trs->trso_lock_len);
416                 }
417                 g_raid_tr_raid1e_rebuild_done(trs);
418         }
419 }
420
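/*
 * Issue the next chunk of rebuild I/O.  Starting at the failed subdisk's
 * rebuild position, ranges where that subdisk already holds the best copy are
 * skipped; otherwise the affected virtual range is locked and up to one slab
 * is read from the best remaining copy.  The matching write to the failed
 * subdisk is issued from the iodone path once the read completes.
 */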
421 static void
422 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
423 {
424         struct g_raid_tr_raid1e_object *trs;
425         struct g_raid_softc *sc;
426         struct g_raid_volume *vol;
427         struct g_raid_subdisk *sd;
428         struct bio *bp;
429         off_t len, virtual, vend, offset, start;
430         int disk, copy, best;
431
432         trs = (struct g_raid_tr_raid1e_object *)tr;
433         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
434                 return;
435         vol = tr->tro_volume;
436         sc = vol->v_softc;
437         sd = trs->trso_failed_sd;
438
439         while (1) {
440                 if (sd->sd_rebuild_pos >= sd->sd_size) {
441                         g_raid_tr_raid1e_rebuild_finish(tr);
442                         return;
443                 }
444                 /* Get virtual offset from physical rebuild position. */
445                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
446                 /* Get physical offset back to get first stripe position. */
447                 V2P(vol, virtual, &disk, &offset, &start);
448                 /* Calculate contiguous data length. */
449                 len = MIN(g_raid1e_rebuild_slab,
450                     sd->sd_size - sd->sd_rebuild_pos);
451                 if ((vol->v_disks_count % N) != 0)
452                         len = MIN(len, vol->v_strip_size - start);
453                 /* Find disk with most accurate data. */
454                 best = g_raid_tr_raid1e_select_read_disk(vol, disk,
455                     offset + start, len, 0);
456                 if (best < 0) {
457                         /* There is no valid disk. */
458                         g_raid_tr_raid1e_rebuild_abort(tr);
459                         return;
460                 } else if (best != copy) {
461                         /* Some other disk has better data. */
462                         break;
463                 }
464                 /* We have the most accurate data. Skip the range. */
465                 G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
466                     sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
467                 sd->sd_rebuild_pos += len;
468         }
469
470         bp = &trs->trso_bio;
471         memset(bp, 0, sizeof(*bp));
472         bp->bio_offset = offset + start +
473             ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
474         bp->bio_length = len;
475         bp->bio_data = trs->trso_buffer;
476         bp->bio_cmd = BIO_READ;
477         bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
478         bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
479         G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
480         /*
481          * If we are crossing a stripe boundary, correct the affected
482          * virtual range we should lock.
483          */
484         if (start + len > vol->v_strip_size) {
485                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
486                 len = vend - virtual;
487         }
488         trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
489         trs->trso_flags |= TR_RAID1E_F_LOCKED;
490         trs->trso_lock_pos = virtual;
491         trs->trso_lock_len = len;
492         /* Lock callback starts I/O */
493         g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
494 }
495
496 static void
497 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
498 {
499         struct g_raid_volume *vol;
500         struct g_raid_tr_raid1e_object *trs;
501         struct g_raid_subdisk *sd;
502
503         vol = tr->tro_volume;
504         trs = (struct g_raid_tr_raid1e_object *)tr;
505         if (trs->trso_failed_sd) {
506                 G_RAID_DEBUG1(1, vol->v_softc,
507                     "Already rebuilding in start rebuild. pos %jd\n",
508                     (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
509                 return;
510         }
511         sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
512         if (sd == NULL)
513                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
514         if (sd == NULL) {
515                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
516                 if (sd != NULL) {
517                         sd->sd_rebuild_pos = 0;
518                         g_raid_change_subdisk_state(sd,
519                             G_RAID_SUBDISK_S_RESYNC);
520                         g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
521                 } else {
522                         sd = g_raid_get_subdisk(vol,
523                             G_RAID_SUBDISK_S_UNINITIALIZED);
524                         if (sd == NULL)
525                                 sd = g_raid_get_subdisk(vol,
526                                     G_RAID_SUBDISK_S_NEW);
527                         if (sd != NULL) {
528                                 sd->sd_rebuild_pos = 0;
529                                 g_raid_change_subdisk_state(sd,
530                                     G_RAID_SUBDISK_S_REBUILD);
531                                 g_raid_write_metadata(vol->v_softc,
532                                     vol, sd, NULL);
533                         }
534                 }
535         }
536         if (sd == NULL) {
537                 G_RAID_DEBUG1(1, vol->v_softc,
538                     "No failed disk to rebuild.  night night.");
539                 return;
540         }
541         trs->trso_failed_sd = sd;
542         G_RAID_DEBUG1(0, vol->v_softc,
543             "Subdisk %s:%d-%s rebuild start at %jd.",
544             sd->sd_volume->v_name, sd->sd_pos,
545             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
546             trs->trso_failed_sd->sd_rebuild_pos);
547         trs->trso_type = TR_RAID1E_REBUILD;
548         trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
549         trs->trso_meta_update = g_raid1e_rebuild_meta_update;
550         g_raid_tr_raid1e_rebuild_some(tr);
551 }
552
553 static void
554 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
555     struct g_raid_subdisk *sd)
556 {
557         struct g_raid_volume *vol;
558         struct g_raid_tr_raid1e_object *trs;
559         int nr;
560         
561         vol = tr->tro_volume;
562         trs = (struct g_raid_tr_raid1e_object *)tr;
563         if (trs->trso_stopping)
564                 return;
565         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
566             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
567         switch(trs->trso_type) {
568         case TR_RAID1E_NONE:
569                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
570                         return;
571                 if (nr == 0) {
572                         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
573                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
574                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
575                         if (nr == 0)
576                                 return;
577                 }
578                 g_raid_tr_raid1e_rebuild_start(tr);
579                 break;
580         case TR_RAID1E_REBUILD:
581                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
582                     trs->trso_failed_sd == sd)
583                         g_raid_tr_raid1e_rebuild_abort(tr);
584                 break;
585         case TR_RAID1E_RESYNC:
586                 break;
587         }
588 }
589
590 static int
591 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
592     struct g_raid_subdisk *sd, u_int event)
593 {
594
595         g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
596         return (0);
597 }
598
599 static int
600 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
601 {
602         struct g_raid_tr_raid1e_object *trs;
603         struct g_raid_volume *vol;
604
605         trs = (struct g_raid_tr_raid1e_object *)tr;
606         vol = tr->tro_volume;
607         trs->trso_starting = 0;
608         g_raid_tr_update_state_raid1e(vol, NULL);
609         return (0);
610 }
611
612 static int
613 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
614 {
615         struct g_raid_tr_raid1e_object *trs;
616         struct g_raid_volume *vol;
617
618         trs = (struct g_raid_tr_raid1e_object *)tr;
619         vol = tr->tro_volume;
620         trs->trso_starting = 0;
621         trs->trso_stopping = 1;
622         g_raid_tr_update_state_raid1e(vol, NULL);
623         return (0);
624 }
625
626 /*
627  * Select the disk to read from.  Take into account: subdisk state, running
628  * error recovery, average disk load, head position and possible cache hits.
629  */
630 #define ABS(x)          (((x) >= 0) ? (x) : (-(x)))
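/*
 * Lower priority wins.  The base priority is the subdisk's current load;
 * copies that are stale (or not yet resynced at this offset) get a large
 * penalty, subdisks already busy with error recovery get a smaller one, and a
 * subdisk whose head is at (or within a track of) the target offset gets a
 * bonus.
 */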
631 static int
632 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
633     int no, off_t off, off_t len, u_int mask)
634 {
635         struct g_raid_subdisk *sd;
636         off_t offset;
637         int i, best, prio, bestprio;
638
639         best = -1;
640         bestprio = INT_MAX;
641         for (i = 0; i < N; i++) {
642                 sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
643                 offset = off;
644                 if (no + i >= vol->v_disks_count)
645                         offset += vol->v_strip_size;
646
647                 prio = G_RAID_SUBDISK_LOAD(sd);
648                 if ((mask & (1 << sd->sd_pos)) != 0)
649                         continue;
650                 switch (sd->sd_state) {
651                 case G_RAID_SUBDISK_S_ACTIVE:
652                         break;
653                 case G_RAID_SUBDISK_S_RESYNC:
654                         if (offset + off < sd->sd_rebuild_pos)
655                                 break;
656                         /* FALLTHROUGH */
657                 case G_RAID_SUBDISK_S_STALE:
658                         prio += i << 24;
659                         break;
660                 case G_RAID_SUBDISK_S_REBUILD:
661                         if (offset + off < sd->sd_rebuild_pos)
662                                 break;
663                         /* FALLTHROUGH */
664                 default:
665                         continue;
666                 }
667                 prio += min(sd->sd_recovery, 255) << 16;
668                 /* If disk head is precisely in position - highly prefer it. */
669                 if (G_RAID_SUBDISK_POS(sd) == offset)
670                         prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
671                 else
672                 /* If disk head is close to position - prefer it. */
673                 if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
674                     G_RAID_SUBDISK_TRACK_SIZE)
675                         prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
676                 if (prio < bestprio) {
677                         bestprio = prio;
678                         best = i;
679                 }
680         }
681         return (best);
682 }
683
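/*
 * Split a volume-level read at strip boundaries, picking the best copy for
 * each chunk with g_raid_tr_raid1e_select_read_disk().  The clones are queued
 * first and dispatched only after all of them have been allocated, so a
 * failed allocation can unwind cleanly with ENOMEM.
 */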
684 static void
685 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
686 {
687         struct g_raid_volume *vol;
688         struct g_raid_subdisk *sd;
689         struct bio_queue_head queue;
690         struct bio *cbp;
691         char *addr;
692         off_t offset, start, length, remain;
693         u_int no, strip_size;
694         int best;
695
696         vol = tr->tro_volume;
697         if ((bp->bio_flags & BIO_UNMAPPED) != 0)
698                 addr = NULL;
699         else
700                 addr = bp->bio_data;
701         strip_size = vol->v_strip_size;
702         V2P(vol, bp->bio_offset, &no, &offset, &start);
703         remain = bp->bio_length;
704         bioq_init(&queue);
705         while (remain > 0) {
706                 length = MIN(strip_size - start, remain);
707                 best = g_raid_tr_raid1e_select_read_disk(vol,
708                     no, offset, length, 0);
709                 KASSERT(best >= 0, ("No readable disk in volume %s!",
710                     vol->v_name));
711                 no += best;
712                 if (no >= vol->v_disks_count) {
713                         no -= vol->v_disks_count;
714                         offset += strip_size;
715                 }
716                 cbp = g_clone_bio(bp);
717                 if (cbp == NULL)
718                         goto failure;
719                 cbp->bio_offset = offset + start;
720                 cbp->bio_length = length;
721                 if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
722                         cbp->bio_ma_offset += (uintptr_t)addr;
723                         cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
724                         cbp->bio_ma_offset %= PAGE_SIZE;
725                         cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
726                             cbp->bio_length) / PAGE_SIZE;
727                 } else
728                         cbp->bio_data = addr;
729                 cbp->bio_caller1 = &vol->v_subdisks[no];
730                 bioq_insert_tail(&queue, cbp);
731                 no += N - best;
732                 if (no >= vol->v_disks_count) {
733                         no -= vol->v_disks_count;
734                         offset += strip_size;
735                 }
736                 remain -= length;
737                 addr += length;
738                 start = 0;
739         }
740         while ((cbp = bioq_takefirst(&queue)) != NULL) {
741                 sd = cbp->bio_caller1;
742                 cbp->bio_caller1 = NULL;
743                 g_raid_subdisk_iostart(sd, cbp);
744         }
745         return;
746 failure:
747         while ((cbp = bioq_takefirst(&queue)) != NULL)
748                 g_destroy_bio(cbp);
749         if (bp->bio_error == 0)
750                 bp->bio_error = ENOMEM;
751         g_raid_iodone(bp, bp->bio_error);
752 }
753
754 static void
755 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
756 {
757         struct g_raid_volume *vol;
758         struct g_raid_subdisk *sd;
759         struct bio_queue_head queue;
760         struct bio *cbp;
761         char *addr;
762         off_t offset, start, length, remain;
763         u_int no, strip_size;
764         int i;
765
766         vol = tr->tro_volume;
767         if ((bp->bio_flags & BIO_UNMAPPED) != 0)
768                 addr = NULL;
769         else
770                 addr = bp->bio_data;
771         strip_size = vol->v_strip_size;
772         V2P(vol, bp->bio_offset, &no, &offset, &start);
773         remain = bp->bio_length;
774         bioq_init(&queue);
775         while (remain > 0) {
776                 length = MIN(strip_size - start, remain);
777                 for (i = 0; i < N; i++) {
778                         sd = &vol->v_subdisks[no];
779                         switch (sd->sd_state) {
780                         case G_RAID_SUBDISK_S_ACTIVE:
781                         case G_RAID_SUBDISK_S_STALE:
782                         case G_RAID_SUBDISK_S_RESYNC:
783                                 break;
784                         case G_RAID_SUBDISK_S_REBUILD:
785                                 if (offset + start >= sd->sd_rebuild_pos)
786                                         goto nextdisk;
787                                 break;
788                         default:
789                                 goto nextdisk;
790                         }
791                         cbp = g_clone_bio(bp);
792                         if (cbp == NULL)
793                                 goto failure;
794                         cbp->bio_offset = offset + start;
795                         cbp->bio_length = length;
796                         if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
797                             bp->bio_cmd != BIO_DELETE) {
798                                 cbp->bio_ma_offset += (uintptr_t)addr;
799                                 cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
800                                 cbp->bio_ma_offset %= PAGE_SIZE;
801                                 cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
802                                     cbp->bio_length) / PAGE_SIZE;
803                         } else
804                                 cbp->bio_data = addr;
805                         cbp->bio_caller1 = sd;
806                         bioq_insert_tail(&queue, cbp);
807 nextdisk:
808                         if (++no >= vol->v_disks_count) {
809                                 no = 0;
810                                 offset += strip_size;
811                         }
812                 }
813                 remain -= length;
814                 if (bp->bio_cmd != BIO_DELETE)
815                         addr += length;
816                 start = 0;
817         }
818         while ((cbp = bioq_takefirst(&queue)) != NULL) {
819                 sd = cbp->bio_caller1;
820                 cbp->bio_caller1 = NULL;
821                 g_raid_subdisk_iostart(sd, cbp);
822         }
823         return;
824 failure:
825         while ((cbp = bioq_takefirst(&queue)) != NULL)
826                 g_destroy_bio(cbp);
827         if (bp->bio_error == 0)
828                 bp->bio_error = ENOMEM;
829         g_raid_iodone(bp, bp->bio_error);
830 }
831
832 static void
833 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
834 {
835         struct g_raid_volume *vol;
836         struct g_raid_tr_raid1e_object *trs;
837
838         vol = tr->tro_volume;
839         trs = (struct g_raid_tr_raid1e_object *)tr;
840         if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
841             vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
842             vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
843                 g_raid_iodone(bp, EIO);
844                 return;
845         }
846         /*
847          * If we're rebuilding, squeeze in rebuild activity every so often,
848          * even when the disk is busy.  Be sure to only count real I/O
849          * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
850          * by this module.
851          */
852         if (trs->trso_failed_sd != NULL &&
853             !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
854                 /* Make this new or currently running round short. */
855                 trs->trso_recover_slabs = 0;
856                 if (--trs->trso_fair_io <= 0) {
857                         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
858                         g_raid_tr_raid1e_rebuild_some(tr);
859                 }
860         }
861         switch (bp->bio_cmd) {
862         case BIO_READ:
863                 g_raid_tr_iostart_raid1e_read(tr, bp);
864                 break;
865         case BIO_WRITE:
866         case BIO_DELETE:
867                 g_raid_tr_iostart_raid1e_write(tr, bp);
868                 break;
869         case BIO_FLUSH:
870                 g_raid_tr_flush_common(tr, bp);
871                 break;
872         default:
873                 KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
874                     bp->bio_cmd, vol->v_name));
875                 break;
876         }
877 }
878
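/*
 * I/O completion.  Rebuild traffic is recognized by G_RAID_BIO_FLAG_SYNC: a
 * finished rebuild read is turned into the corresponding write to the failed
 * subdisk, and a finished write advances the rebuild position and
 * periodically triggers a metadata update.  For regular reads that failed,
 * the request is retried on another copy and the recovered data may be
 * written back to remap a bad sector; write errors fail the subdisk.
 */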
879 static void
880 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
881     struct g_raid_subdisk *sd, struct bio *bp)
882 {
883         struct bio *cbp;
884         struct g_raid_subdisk *nsd;
885         struct g_raid_volume *vol;
886         struct bio *pbp;
887         struct g_raid_tr_raid1e_object *trs;
888         off_t virtual, offset, start;
889         uintptr_t mask;
890         int error, do_write, copy, disk, best;
891
892         trs = (struct g_raid_tr_raid1e_object *)tr;
893         vol = tr->tro_volume;
894         if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
895                 if (trs->trso_type == TR_RAID1E_REBUILD) {
896                         nsd = trs->trso_failed_sd;
897                         if (bp->bio_cmd == BIO_READ) {
898
899                                 /* Immediately abort rebuild, if requested. */
900                                 if (trs->trso_flags & TR_RAID1E_F_ABORT) {
901                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
902                                         g_raid_tr_raid1e_rebuild_abort(tr);
903                                         return;
904                                 }
905
906                                 /* On read error, skip and cross fingers. */
907                                 if (bp->bio_error != 0) {
908                                         G_RAID_LOGREQ(0, bp,
909                                             "Read error during rebuild (%d), "
910                                             "possible data loss!",
911                                             bp->bio_error);
912                                         goto rebuild_round_done;
913                                 }
914
915                                 /*
916                                  * The read operation finished, queue the
917                                  * write and get out.
918                                  */
919                                 G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
920                                     bp->bio_error);
921                                 bp->bio_cmd = BIO_WRITE;
922                                 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
923                                 bp->bio_offset = nsd->sd_rebuild_pos;
924                                 G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
925                                 g_raid_subdisk_iostart(nsd, bp);
926                         } else {
927                                 /*
928                                  * The write operation just finished.  Do
929                                  * another.  We keep cloning the master bio
930                                  * since it has the right buffers allocated to
931                                  * it.
932                                  */
933                                 G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
934                                     bp->bio_error);
935                                 if (bp->bio_error != 0 ||
936                                     trs->trso_flags & TR_RAID1E_F_ABORT) {
937                                         if ((trs->trso_flags &
938                                             TR_RAID1E_F_ABORT) == 0) {
939                                                 g_raid_tr_raid1e_fail_disk(sd->sd_softc,
940                                                     nsd, nsd->sd_disk);
941                                         }
942                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
943                                         g_raid_tr_raid1e_rebuild_abort(tr);
944                                         return;
945                                 }
946 rebuild_round_done:
947                                 trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
948                                 g_raid_unlock_range(tr->tro_volume,
949                                     trs->trso_lock_pos, trs->trso_lock_len);
950                                 nsd->sd_rebuild_pos += bp->bio_length;
951                                 if (nsd->sd_rebuild_pos >= nsd->sd_size) {
952                                         g_raid_tr_raid1e_rebuild_finish(tr);
953                                         return;
954                                 }
955
956                                 /* Abort rebuild if we are stopping */
957                                 if (trs->trso_stopping) {
958                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
959                                         g_raid_tr_raid1e_rebuild_abort(tr);
960                                         return;
961                                 }
962
963                                 if (--trs->trso_meta_update <= 0) {
964                                         g_raid_write_metadata(vol->v_softc,
965                                             vol, nsd, nsd->sd_disk);
966                                         trs->trso_meta_update =
967                                             g_raid1e_rebuild_meta_update;
968                                         /* Compensate for short rebuild I/Os. */
969                                         if ((vol->v_disks_count % N) != 0 &&
970                                             vol->v_strip_size <
971                                              g_raid1e_rebuild_slab) {
972                                                 trs->trso_meta_update *=
973                                                     g_raid1e_rebuild_slab;
974                                                 trs->trso_meta_update /=
975                                                     vol->v_strip_size;
976                                         }
977                                 }
978                                 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
979                                 if (--trs->trso_recover_slabs <= 0)
980                                         return;
981                                 /* Run next rebuild iteration. */
982                                 g_raid_tr_raid1e_rebuild_some(tr);
983                         }
984                 } else if (trs->trso_type == TR_RAID1E_RESYNC) {
985                         /*
986                          * read good sd, read bad sd in parallel.  when both
987                          * done, compare the buffers.  write good to the bad
988                          * if different.  do the next bit of work.
989                          */
990                         panic("Somehow, we think we're doing a resync");
991                 }
992                 return;
993         }
994         pbp = bp->bio_parent;
995         pbp->bio_inbed++;
996         mask = (intptr_t)bp->bio_caller2;
997         if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
998                 /*
999                  * Read failed on the first drive.  Retry the read on
1000                  * another disk, if available, before erroring out the
1001                  * read.
1002                  */
1003                 sd->sd_disk->d_read_errs++;
1004                 G_RAID_LOGREQ(0, bp,
1005                     "Read error (%d), %d read errors total",
1006                     bp->bio_error, sd->sd_disk->d_read_errs);
1007
1008                 /*
1009                  * If there are too many read errors, we move to degraded.
1010                  * XXX Do we want to FAIL the drive (eg, make the user redo
1011                  * everything to get it back in sync), or just degrade the
1012                  * drive, which kicks off a resync?
1013                  */
1014                 do_write = 0;
1015                 if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
1016                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1017                 else if (mask == 0)
1018                         do_write = 1;
1019
1020                 /* Restore what we were doing. */
1021                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1022                 V2P(vol, virtual, &disk, &offset, &start);
1023
1024                 /* Find the other disk, and try to do the I/O to it. */
1025                 mask |= 1 << copy;
1026                 best = g_raid_tr_raid1e_select_read_disk(vol,
1027                     disk, offset, start, mask);
1028                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1029                         disk += best;
1030                         if (disk >= vol->v_disks_count) {
1031                                 disk -= vol->v_disks_count;
1032                                 offset += vol->v_strip_size;
1033                         }
1034                         cbp->bio_offset = offset + start;
1035                         cbp->bio_length = bp->bio_length;
1036                         cbp->bio_data = bp->bio_data;
1037                         cbp->bio_ma = bp->bio_ma;
1038                         cbp->bio_ma_offset = bp->bio_ma_offset;
1039                         cbp->bio_ma_n = bp->bio_ma_n;
1040                         g_destroy_bio(bp);
1041                         nsd = &vol->v_subdisks[disk];
1042                         G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
1043                             nsd->sd_pos);
1044                         if (do_write)
1045                                 mask |= 1 << 31;
1046                         if ((mask & (1U << 31)) != 0)
1047                                 sd->sd_recovery++;
1048                         cbp->bio_caller2 = (void *)mask;
1049                         if (do_write) {
1050                                 cbp->bio_caller1 = nsd;
1051                                 /* Lock callback starts I/O */
1052                                 g_raid_lock_range(sd->sd_volume,
1053                                     virtual, cbp->bio_length, pbp, cbp);
1054                         } else {
1055                                 g_raid_subdisk_iostart(nsd, cbp);
1056                         }
1057                         return;
1058                 }
1059                 /*
1060                  * We can't retry.  Return the original error by falling
1061                  * through.  This will happen when there's only one good disk.
1062                  * We don't need to fail the raid, since its actual state is
1063                  * based on the state of the subdisks.
1064                  */
1065                 G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
1066         }
1067         if (bp->bio_cmd == BIO_READ &&
1068             bp->bio_error == 0 &&
1069             (mask & (1U << 31)) != 0) {
1070                 G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
1071
1072                 /* Restore what we were doing. */
1073                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1074                 V2P(vol, virtual, &disk, &offset, &start);
1075
1076                 /* Find best disk to write. */
1077                 best = g_raid_tr_raid1e_select_read_disk(vol,
1078                     disk, offset, start, ~mask);
1079                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1080                         disk += best;
1081                         if (disk >= vol->v_disks_count) {
1082                                 disk -= vol->v_disks_count;
1083                                 offset += vol->v_strip_size;
1084                         }
1085                         cbp->bio_offset = offset + start;
1086                         cbp->bio_cmd = BIO_WRITE;
1087                         cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
1088                         cbp->bio_caller2 = (void *)mask;
1089                         g_destroy_bio(bp);
1090                         G_RAID_LOGREQ(2, cbp,
1091                             "Attempting bad sector remap on failing drive.");
1092                         g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
1093                         return;
1094                 }
1095         }
1096         if ((mask & (1U << 31)) != 0) {
1097                 /*
1098                  * We're done with a recovery, mark the range as unlocked.
1099                  * For any write errors, we aggressively fail the disk since
1100                  * there was both a READ and a WRITE error at this location.
1101                  * Both types of errors generally indicate the drive is on
1102                  * the verge of total failure anyway.  Better to stop trusting
1103                  * it now.  However, we need to reset error to 0 in that case
1104                  * because we're not failing the original I/O which succeeded.
1105                  */
1106
1107                 /* Restore what we were doing. */
1108                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1109                 V2P(vol, virtual, &disk, &offset, &start);
1110
1111                 for (copy = 0; copy < N; copy++) {
1112                         if ((mask & (1 << copy) ) != 0)
1113                                 vol->v_subdisks[(disk + copy) %
1114                                     vol->v_disks_count].sd_recovery--;
1115                 }
1116
1117                 if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
1118                         G_RAID_LOGREQ(0, bp, "Remap write failed: "
1119                             "failing subdisk.");
1120                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1121                         bp->bio_error = 0;
1122                 }
1123                 G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
1124                 g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
1125         }
1126         if (pbp->bio_cmd != BIO_READ) {
1127                 if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
1128                         pbp->bio_error = bp->bio_error;
1129                 if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
1130                         G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
1131                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1132                 }
1133                 error = pbp->bio_error;
1134         } else
1135                 error = bp->bio_error;
1136         g_destroy_bio(bp);
1137         if (pbp->bio_children == pbp->bio_inbed) {
1138                 pbp->bio_completed = pbp->bio_length;
1139                 g_raid_iodone(pbp, error);
1140         }
1141 }
1142
1143 static int
1144 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
1145     void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
1146 {
1147         struct g_raid_volume *vol;
1148         struct g_raid_subdisk *sd;
1149         struct bio_queue_head queue;
1150         char *addr;
1151         off_t offset, start, length, remain;
1152         u_int no, strip_size;
1153         int i, error;
1154
1155         vol = tr->tro_volume;
1156         addr = virtual;
1157         strip_size = vol->v_strip_size;
1158         V2P(vol, boffset, &no, &offset, &start);
1159         remain = blength;
1160         bioq_init(&queue);
1161         while (remain > 0) {
1162                 length = MIN(strip_size - start, remain);
1163                 for (i = 0; i < N; i++) {
1164                         sd = &vol->v_subdisks[no];
1165                         switch (sd->sd_state) {
1166                         case G_RAID_SUBDISK_S_ACTIVE:
1167                         case G_RAID_SUBDISK_S_STALE:
1168                         case G_RAID_SUBDISK_S_RESYNC:
1169                                 break;
1170                         case G_RAID_SUBDISK_S_REBUILD:
1171                                 if (offset + start >= sd->sd_rebuild_pos)
1172                                         goto nextdisk;
1173                                 break;
1174                         default:
1175                                 goto nextdisk;
1176                         }
1177                         error = g_raid_subdisk_kerneldump(sd,
1178                             addr, 0, offset + start, length);
1179                         if (error != 0)
1180                                 return (error);
1181 nextdisk:
1182                         if (++no >= vol->v_disks_count) {
1183                                 no = 0;
1184                                 offset += strip_size;
1185                         }
1186                 }
1187                 remain -= length;
1188                 addr += length;
1189                 start = 0;
1190         }
1191         return (0);
1192 }
1193
1194 static int
1195 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
1196 {
1197         struct bio *bp;
1198         struct g_raid_subdisk *sd;
1199
1200         bp = (struct bio *)argp;
1201         sd = (struct g_raid_subdisk *)bp->bio_caller1;
1202         g_raid_subdisk_iostart(sd, bp);
1203
1204         return (0);
1205 }
1206
1207 static int
1208 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
1209 {
1210         struct g_raid_tr_raid1e_object *trs;
1211         struct g_raid_volume *vol;
1212
1213         vol = tr->tro_volume;
1214         trs = (struct g_raid_tr_raid1e_object *)tr;
1215         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
1216         trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
1217         /* Compensate for short rebuild I/Os. */
1218         if ((vol->v_disks_count % N) != 0 &&
1219             vol->v_strip_size < g_raid1e_rebuild_slab) {
1220                 trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
1221                 trs->trso_recover_slabs /= vol->v_strip_size;
1222         }
1223         if (trs->trso_type == TR_RAID1E_REBUILD)
1224                 g_raid_tr_raid1e_rebuild_some(tr);
1225         return (0);
1226 }
1227
1228 static int
1229 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
1230 {
1231         struct g_raid_tr_raid1e_object *trs;
1232
1233         trs = (struct g_raid_tr_raid1e_object *)tr;
1234
1235         if (trs->trso_buffer != NULL) {
1236                 free(trs->trso_buffer, M_TR_RAID1E);
1237                 trs->trso_buffer = NULL;
1238         }
1239         return (0);
1240 }
1241
1242 G_RAID_TR_DECLARE(raid1e, "RAID1E");