/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include "geom/raid/g_raid.h"
#include "g_raid_tr_if.h"

#define N	2

SYSCTL_DECL(_kern_geom_raid_raid1e);

#define RAID1E_REBUILD_SLAB	(1 << 20) /* One transaction in a rebuild */
static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_slab, 0,
    "Amount of the disk to rebuild each read/write cycle of the rebuild.");

#define RAID1E_REBUILD_FAIR_IO	20 /* use 1/x of the available I/O */
static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_fair_io, 0,
    "Fraction of the I/O bandwidth to use when the disk is busy for rebuild.");

#define RAID1E_REBUILD_CLUSTER_IDLE	100
static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_cluster_idle, 0,
    "Number of slabs to do each time we trigger a rebuild cycle.");

#define RAID1E_REBUILD_META_UPDATE	1024 /* update metadata every 1GB or so */
static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_meta_update, 0,
    "When to update the metadata.");

static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");

#define TR_RAID1E_NONE		0
#define TR_RAID1E_REBUILD	1
#define TR_RAID1E_RESYNC	2

#define TR_RAID1E_F_DOING_SOME	0x1
#define TR_RAID1E_F_LOCKED	0x2
#define TR_RAID1E_F_ABORT	0x4

struct g_raid_tr_raid1e_object {
	struct g_raid_tr_object	 trso_base;
	int			 trso_starting;
	int			 trso_stopping;
	int			 trso_type;
	int			 trso_recover_slabs; /* slabs before rest */
	int			 trso_fair_io;
	int			 trso_meta_update;
	int			 trso_flags;
	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
	void			*trso_buffer;	 /* Buffer space */
	off_t			 trso_lock_pos;	 /* Locked range start. */
	off_t			 trso_lock_len;	 /* Locked range length. */
	struct bio		 trso_bio;
};

static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
static g_raid_tr_event_t g_raid_tr_event_raid1e;
static g_raid_tr_start_t g_raid_tr_start_raid1e;
static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
static g_raid_tr_free_t g_raid_tr_free_raid1e;

static kobj_method_t g_raid_tr_raid1e_methods[] = {
	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
	KOBJMETHOD_END
};

static struct g_raid_tr_class g_raid_tr_raid1e_class = {
	"RAID1E",
	g_raid_tr_raid1e_methods,
	sizeof(struct g_raid_tr_raid1e_object),
	.trc_enable = 1,
	.trc_priority = 200,
	.trc_accept_unmapped = 1
};

static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd);
static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask);

static void
V2P(struct g_raid_volume *vol, off_t virt,
    int *disk, off_t *offset, off_t *start)
{
	off_t nstrip;
	u_int strip_size;

	strip_size = vol->v_strip_size;
	/* Strip number. */
	nstrip = virt / strip_size;
	/* Start position in strip. */
	*start = virt % strip_size;
	/* Disk number. */
	*disk = (nstrip * N) % vol->v_disks_count;
	/* Strip start position in disk. */
	*offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
}

static void
P2V(struct g_raid_volume *vol, int disk, off_t offset,
    off_t *virt, int *copy)
{
	off_t nstrip, start;
	u_int strip_size;

	strip_size = vol->v_strip_size;
	/* Start position in strip. */
	start = offset % strip_size;
	/* Physical strip number. */
	nstrip = (offset / strip_size) * vol->v_disks_count + disk;
	/* Number of physical strip (copy) inside virtual strip. */
	*copy = nstrip % N;
	/* Offset in virtual space. */
	*virt = (nstrip / N) * strip_size + start;
}

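/*
 * Illustrative example (not in the original source) of the RAID1E-A mapping
 * implemented by V2P()/P2V() above: with v_disks_count = 3, N = 2 copies
 * and a 65536-byte strip, virtual offset 200000 lies in virtual strip 3 at
 * in-strip offset 3392.  V2P() gives physical strip 3 * 2 = 6, i.e. disk
 * 6 % 3 = 0 at strip offset (6 / 3) * 65536 = 131072; the mirror copy is
 * physical strip 7 on disk 1 at the same offset.  P2V(disk 0, 131072 + 3392)
 * recovers virtual offset 200000, copy 0.
 */
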
static int
g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
{
	struct g_raid_tr_raid1e_object *trs;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
		return (G_RAID_TR_TASTE_FAIL);
	trs->trso_starting = 1;
	return (G_RAID_TR_TASTE_SUCCEED);
}

static int
g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count / N; i++) {
		bestsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
		}
		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
			/* We found a reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to ACTIVE.",
			    vol->v_name, bestsd->sd_pos,
			    g_raid_subdisk_state2str(bestsd->sd_state));
			g_raid_change_subdisk_state(bestsd,
			    G_RAID_SUBDISK_S_ACTIVE);
			g_raid_write_metadata(sc,
			    vol, bestsd, bestsd->sd_disk);
		}
		worstsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}

static int
g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
	    vol->v_disks_count)
		return (G_RAID_VOLUME_S_OPTIMAL);
	for (i = 0; i < vol->v_disks_count; i++) {
		sd = &vol->v_subdisks[i];
		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
			/* We found a reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to STALE.",
			    vol->v_name, sd->sd_pos,
			    g_raid_subdisk_state2str(sd->sd_state));
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_STALE);
			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
		}
	}
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count; i++) {
		bestsd = &vol->v_subdisks[i];
		worstsd = &vol->v_subdisks[i];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}

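/*
 * Clarifying note (added): with an odd number of disks the copies of
 * consecutive virtual strips rotate around the disk set, so no fixed disk
 * pairing exists.  The loop above therefore evaluates every window of N
 * consecutive subdisks (i .. i + N - 1, modulo v_disks_count), since each
 * such window holds both copies of some strips, and the volume state is the
 * worst state found over all windows.
 */
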
static int
g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
    struct g_raid_subdisk *sd)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	u_int s;

	sc = vol->v_softc;
	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
	if (trs->trso_stopping &&
	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
		s = G_RAID_VOLUME_S_STOPPED;
	else if (trs->trso_starting)
		s = G_RAID_VOLUME_S_STARTING;
	else {
		if ((vol->v_disks_count % N) == 0)
			s = g_raid_tr_update_state_raid1e_even(vol);
		else
			s = g_raid_tr_update_state_raid1e_odd(vol);
	}
	if (s != vol->v_state) {
		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
		    G_RAID_EVENT_VOLUME);
		g_raid_change_volume_state(vol, s);
		if (!trs->trso_starting && !trs->trso_stopping)
			g_raid_write_metadata(sc, vol, NULL, NULL);
	}
	if (!trs->trso_starting && !trs->trso_stopping)
		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
	return (0);
}

static void
g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
    struct g_raid_disk *disk)
{
	struct g_raid_volume *vol;

	vol = sd->sd_volume;
	/*
	 * We don't fail the last disk in the pack, since it still has decent
	 * data on it and that's better than failing the disk if it is the
	 * root file system.
	 *
	 * XXX should this be controlled via a tunable?  It makes sense for
	 * the volume that has / on it.  I can't think of a case where we'd
	 * want the volume to go away on this kind of event.
	 */
	if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
	     vol->v_disks_count) &&
	    (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
		return;
	g_raid_fail_disk(sc, sd, disk);
}

static void
g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;

	vol = trs->trso_base.tro_volume;
	sd = trs->trso_failed_sd;
	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
	free(trs->trso_buffer, M_TR_RAID1E);
	trs->trso_buffer = NULL;
	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
	trs->trso_type = TR_RAID1E_NONE;
	trs->trso_recover_slabs = 0;
	trs->trso_failed_sd = NULL;
	g_raid_tr_update_state_raid1e(vol, NULL);
}

static void
g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
	    "Subdisk %s:%d-%s rebuild completed.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
	sd->sd_rebuild_pos = 0;
	g_raid_tr_raid1e_rebuild_done(trs);
}

static void
g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild is aborting.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags |= TR_RAID1E_F_ABORT;
	} else {
		G_RAID_DEBUG1(0, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild aborted.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
			g_raid_unlock_range(tr->tro_volume,
			    trs->trso_lock_pos, trs->trso_lock_len);
		}
		g_raid_tr_raid1e_rebuild_done(trs);
	}
}

static void
g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio *bp;
	off_t len, virtual, vend, offset, start;
	int disk, copy, best;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
		return;
	vol = tr->tro_volume;
	sc = vol->v_softc;
	sd = trs->trso_failed_sd;

	while (1) {
		if (sd->sd_rebuild_pos >= sd->sd_size) {
			g_raid_tr_raid1e_rebuild_finish(tr);
			return;
		}
		/* Get virtual offset from physical rebuild position. */
		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
		/* Get physical offset back to get first stripe position. */
		V2P(vol, virtual, &disk, &offset, &start);
		/* Calculate contiguous data length. */
		len = MIN(g_raid1e_rebuild_slab,
		    sd->sd_size - sd->sd_rebuild_pos);
		if ((vol->v_disks_count % N) != 0)
			len = MIN(len, vol->v_strip_size - start);
		/* Find disk with most accurate data. */
		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
		    offset + start, len, 0);
		if (best < 0) {
			/* There is no valid disk. */
			g_raid_tr_raid1e_rebuild_abort(tr);
			return;
		} else if (best != copy) {
			/* Some other disk has better data. */
			break;
		}
		/* We have the most accurate data. Skip the range. */
		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
		    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
		sd->sd_rebuild_pos += len;
	}

	bp = &trs->trso_bio;
	memset(bp, 0, sizeof(*bp));
	bp->bio_offset = offset + start +
	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
	bp->bio_length = len;
	bp->bio_data = trs->trso_buffer;
	bp->bio_cmd = BIO_READ;
	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
	/*
	 * If we are crossing stripe boundary, correct affected virtual
	 * range we should lock.
	 */
	if (start + len > vol->v_strip_size) {
		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
		len = vend - virtual;
	}
	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
	trs->trso_flags |= TR_RAID1E_F_LOCKED;
	trs->trso_lock_pos = virtual;
	trs->trso_lock_len = len;
	/* Lock callback starts I/O */
	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
}

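/*
 * Summary of the rebuild cycle (added for clarity): each pass copies up to
 * g_raid1e_rebuild_slab bytes.  P2V()/V2P() locate the virtual range backing
 * the failed subdisk's rebuild position; if another copy is more accurate,
 * that copy is read into trso_buffer under a range lock, and
 * g_raid_tr_iodone_raid1e() later rewrites it to the failed subdisk and
 * advances sd_rebuild_pos.  If the failed subdisk already holds the best
 * copy, the range is simply skipped.
 */
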
static void
g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_failed_sd) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Already rebuild in start rebuild. pos %jd\n",
		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
		return;
	}
	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
	if (sd == NULL)
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
	if (sd == NULL) {
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
		if (sd != NULL) {
			sd->sd_rebuild_pos = 0;
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_RESYNC);
			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
		} else {
			sd = g_raid_get_subdisk(vol,
			    G_RAID_SUBDISK_S_UNINITIALIZED);
			if (sd == NULL)
				sd = g_raid_get_subdisk(vol,
				    G_RAID_SUBDISK_S_NEW);
			if (sd != NULL) {
				sd->sd_rebuild_pos = 0;
				g_raid_change_subdisk_state(sd,
				    G_RAID_SUBDISK_S_REBUILD);
				g_raid_write_metadata(vol->v_softc,
				    vol, sd, NULL);
			}
		}
	}
	if (sd == NULL) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "No failed disk to rebuild.  night night.");
		return;
	}
	trs->trso_failed_sd = sd;
	G_RAID_DEBUG1(0, vol->v_softc,
	    "Subdisk %s:%d-%s rebuild start at %jd.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
	    trs->trso_failed_sd->sd_rebuild_pos);
	trs->trso_type = TR_RAID1E_REBUILD;
	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
	g_raid_tr_raid1e_rebuild_some(tr);
}

static void
g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	int nr;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_stopping)
		return;
	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
	switch(trs->trso_type) {
	case TR_RAID1E_NONE:
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
			return;
		if (nr == 0) {
			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
			if (nr == 0)
				return;
		}
		g_raid_tr_raid1e_rebuild_start(tr);
		break;
	case TR_RAID1E_REBUILD:
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
		    trs->trso_failed_sd == sd)
			g_raid_tr_raid1e_rebuild_abort(tr);
		break;
	case TR_RAID1E_RESYNC:
		break;
	}
}

static int
g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{

	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
	return (0);
}

static int
g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	g_raid_tr_update_state_raid1e(vol, NULL);
	return (0);
}

static int
g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	trs->trso_stopping = 1;
	g_raid_tr_update_state_raid1e(vol, NULL);
	return (0);
}

/*
 * Select the disk to read from.  Take into account: subdisk state, running
 * error recovery, average disk load, head position and possible cache hits.
 */
#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
static int
g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask)
{
	struct g_raid_subdisk *sd;
	off_t offset;
	int i, best, prio, bestprio;

	best = -1;
	bestprio = INT_MAX;
	for (i = 0; i < N; i++) {
		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
		offset = off;
		if (no + i >= vol->v_disks_count)
			offset += vol->v_strip_size;

		prio = G_RAID_SUBDISK_LOAD(sd);
		if ((mask & (1 << sd->sd_pos)) != 0)
			continue;
		switch (sd->sd_state) {
		case G_RAID_SUBDISK_S_ACTIVE:
			break;
		case G_RAID_SUBDISK_S_RESYNC:
			if (offset + len < sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		case G_RAID_SUBDISK_S_STALE:
			prio += i << 24;
			break;
		case G_RAID_SUBDISK_S_REBUILD:
			if (offset + len < sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		default:
			continue;
		}
		prio += min(sd->sd_recovery, 255) << 16;
		/* If disk head is precisely in position - highly prefer it. */
		if (G_RAID_SUBDISK_POS(sd) == offset)
			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
		else
		/* If disk head is close to position - prefer it. */
		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
		    G_RAID_SUBDISK_TRACK_SIZE)
			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
		if (prio < bestprio) {
			best = i;
			bestprio = prio;
		}
	}
	return (best);
}

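/*
 * Illustrative note on the priority computation above (added): lower prio
 * wins.  An idle ACTIVE copy whose head sits exactly at the target offset
 * gets a 2 * G_RAID_SUBDISK_LOAD_SCALE bonus, so it beats an equally idle
 * copy whose head is elsewhere.  Copies busy with error recovery are
 * penalized by up to 255 << 16, and STALE copies (or resyncing ones past
 * their rebuild position) by i << 24, so they are chosen only when nothing
 * better qualifies.
 */
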
static void
g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int best;

	vol = tr->tro_volume;
	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
		addr = NULL;
	else
		addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    no, offset, length, 0);
		KASSERT(best >= 0, ("No readable disk in volume %s!",
		    vol->v_name));
		no += best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			goto failure;
		cbp->bio_offset = offset + start;
		cbp->bio_length = length;
		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
			cbp->bio_ma_offset += (uintptr_t)addr;
			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
			cbp->bio_ma_offset %= PAGE_SIZE;
			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
			    cbp->bio_length) / PAGE_SIZE;
		} else
			cbp->bio_data = addr;
		cbp->bio_caller1 = &vol->v_subdisks[no];
		bioq_insert_tail(&queue, cbp);
		no += N - best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	while ((cbp = bioq_takefirst(&queue)) != NULL)
		g_destroy_bio(cbp);
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

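/*
 * Worked example for the read splitting above (added; same 3-disk, N = 2,
 * 64 KB-strip layout as the V2P() example): a 128 KB read at virtual offset
 * 0 becomes two 64 KB clones.  Virtual strip 0 is read from disk 0 or its
 * mirror on disk 1; virtual strip 1 from disk 2 or its mirror back on disk
 * 0 at offset 65536, which is why "no" wraps and "offset" advances by one
 * strip per turn around the disk set.
 */
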
static void
g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int i;

	vol = tr->tro_volume;
	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
		addr = NULL;
	else
		addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				goto failure;
			cbp->bio_offset = offset + start;
			cbp->bio_length = length;
			if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
			    bp->bio_cmd != BIO_DELETE) {
				cbp->bio_ma_offset += (uintptr_t)addr;
				cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
				cbp->bio_ma_offset %= PAGE_SIZE;
				cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
				    cbp->bio_length) / PAGE_SIZE;
			} else
				cbp->bio_data = addr;
			cbp->bio_caller1 = sd;
			bioq_insert_tail(&queue, cbp);
nextdisk:
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		if (bp->bio_cmd != BIO_DELETE)
			addr += length;
		start = 0;
	}
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	while ((cbp = bioq_takefirst(&queue)) != NULL)
		g_destroy_bio(cbp);
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

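/*
 * Added note: writes are cloned to all N copies of each strip, but a
 * REBUILD subdisk only receives writes below sd_rebuild_pos; anything
 * beyond it will be copied by the rebuild pass anyway, so those clones are
 * skipped.  BIO_DELETE requests carry no payload, hence addr is not
 * advanced for them.
 */
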
static void
g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
		g_raid_iodone(bp, EIO);
		return;
	}
	/*
	 * If we're rebuilding, squeeze in rebuild activity every so often,
	 * even when the disk is busy.  Be sure to only count real I/O
	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
	 * as part of a rebuild.
	 */
	if (trs->trso_failed_sd != NULL &&
	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
		/* Cut the current rebuild round short. */
		trs->trso_recover_slabs = 0;
		if (--trs->trso_fair_io <= 0) {
			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
			g_raid_tr_raid1e_rebuild_some(tr);
		}
	}
	switch (bp->bio_cmd) {
	case BIO_READ:
		g_raid_tr_iostart_raid1e_read(tr, bp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		g_raid_tr_iostart_raid1e_write(tr, bp);
		break;
	case BIO_FLUSH:
		g_raid_tr_flush_common(tr, bp);
		break;
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
		    bp->bio_cmd, vol->v_name));
		break;
	}
}

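/*
 * Added note on the pacing arithmetic: with the default
 * g_raid1e_rebuild_fair_io of 20, every 20th regular (non-SPECIAL) request
 * lets one rebuild transaction through, i.e. roughly 1/20th of a busy
 * volume's bandwidth goes to the rebuild; idle periods instead run
 * g_raid1e_rebuild_cluster_idle slabs per idle callback.
 */
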
static void
g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, struct bio *bp)
{
	struct g_raid_subdisk *nsd;
	struct g_raid_volume *vol;
	struct bio *cbp, *pbp;
	struct g_raid_tr_raid1e_object *trs;
	off_t virtual, offset, start;
	uintptr_t mask;
	int error, do_write, copy, disk, best;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
		if (trs->trso_type == TR_RAID1E_REBUILD) {
			nsd = trs->trso_failed_sd;
			if (bp->bio_cmd == BIO_READ) {

				/* Immediately abort rebuild, if requested. */
				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				/* On read error, skip and cross fingers. */
				if (bp->bio_error != 0) {
					G_RAID_LOGREQ(0, bp,
					    "Read error during rebuild (%d), "
					    "possible data loss!",
					    bp->bio_error);
					goto rebuild_round_done;
				}

				/*
				 * The read operation finished, queue the
				 * write and get out.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
				    bp->bio_error);
				bp->bio_cmd = BIO_WRITE;
				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
				bp->bio_offset = nsd->sd_rebuild_pos;
				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
				g_raid_subdisk_iostart(nsd, bp);
			} else {
				/*
				 * The write operation just finished.  Do
				 * another.  We keep cloning the master bio
				 * since it has the right buffers allocated to
				 * it.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
				    bp->bio_error);
				if (bp->bio_error != 0 ||
				    trs->trso_flags & TR_RAID1E_F_ABORT) {
					if ((trs->trso_flags &
					    TR_RAID1E_F_ABORT) == 0) {
						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
						    nsd, nsd->sd_disk);
					}
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}
rebuild_round_done:
				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
				g_raid_unlock_range(tr->tro_volume,
				    trs->trso_lock_pos, trs->trso_lock_len);
				nsd->sd_rebuild_pos += bp->bio_length;
				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
					g_raid_tr_raid1e_rebuild_finish(tr);
					return;
				}

				/* Abort rebuild if we are stopping */
				if (trs->trso_stopping) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				if (--trs->trso_meta_update <= 0) {
					g_raid_write_metadata(vol->v_softc,
					    vol, nsd, nsd->sd_disk);
					trs->trso_meta_update =
					    g_raid1e_rebuild_meta_update;
					/* Compensate short rebuild I/Os. */
					if ((vol->v_disks_count % N) != 0 &&
					    vol->v_strip_size <
					     g_raid1e_rebuild_slab) {
						trs->trso_meta_update *=
						    g_raid1e_rebuild_slab;
						trs->trso_meta_update /=
						    vol->v_strip_size;
					}
				}
				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
				if (--trs->trso_recover_slabs <= 0)
					return;
				/* Run next rebuild iteration. */
				g_raid_tr_raid1e_rebuild_some(tr);
			}
		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
			/*
			 * read good sd, read bad sd in parallel.  when both
			 * done, compare the buffers.  write good to the bad
			 * if different.  do the next bit of work.
			 */
			panic("Somehow, we think we're doing a resync");
		}
		return;
	}
	pbp = bp->bio_parent;
	pbp->bio_inbed++;
	mask = (intptr_t)bp->bio_caller2;
	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
		/*
		 * Read failed on first drive.  Retry the read error on
		 * another disk drive, if available, before erroring out the
		 * read.
		 */
		sd->sd_disk->d_read_errs++;
		G_RAID_LOGREQ(0, bp,
		    "Read error (%d), %d read errors total",
		    bp->bio_error, sd->sd_disk->d_read_errs);

		/*
		 * If there are too many read errors, we move to degraded.
		 * XXX Do we want to FAIL the drive (e.g., make the user redo
		 * everything to get it back in sync), or just degrade the
		 * drive, which kicks off a resync?
		 */
		do_write = 0;
		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		else if (mask == 0)
			do_write = 1;

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find the other disk, and try to do the I/O to it. */
		mask |= 1 << copy;
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_length = bp->bio_length;
			cbp->bio_data = bp->bio_data;
			cbp->bio_ma = bp->bio_ma;
			cbp->bio_ma_offset = bp->bio_ma_offset;
			cbp->bio_ma_n = bp->bio_ma_n;
			g_destroy_bio(bp);
			nsd = &vol->v_subdisks[disk];
			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
			    nsd->sd_pos);
			if (do_write)
				mask |= 1U << 31;
			if ((mask & (1U << 31)) != 0)
				sd->sd_recovery++;
			cbp->bio_caller2 = (void *)mask;
			if (do_write) {
				cbp->bio_caller1 = nsd;
				/* Lock callback starts I/O */
				g_raid_lock_range(sd->sd_volume,
				    virtual, cbp->bio_length, pbp, cbp);
			} else
				g_raid_subdisk_iostart(nsd, cbp);
			return;
		}
		/*
		 * We can't retry.  Return the original error by falling
		 * through.  This will happen when there's only one good disk.
		 * We don't need to fail the raid, since its actual state is
		 * based on the state of the subdisks.
		 */
		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
	}
	if (bp->bio_cmd == BIO_READ &&
	    bp->bio_error == 0 &&
	    (mask & (1U << 31)) != 0) {
		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find best disk to write. */
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, ~mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_cmd = BIO_WRITE;
			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
			cbp->bio_caller2 = (void *)mask;
			g_destroy_bio(bp);
			G_RAID_LOGREQ(2, cbp,
			    "Attempting bad sector remap on failing drive.");
			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
			return;
		}
	}
	if ((mask & (1U << 31)) != 0) {
		/*
		 * We're done with a recovery, mark the range as unlocked.
		 * For any write errors, we aggressively fail the disk since
		 * there was both a READ and a WRITE error at this location.
		 * Both types of errors generally indicate the drive is on
		 * the verge of total failure anyway.  Better to stop trusting
		 * it now.  However, we need to reset error to 0 in that case
		 * because we're not failing the original I/O which succeeded.
		 */

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		for (copy = 0; copy < N; copy++) {
			if ((mask & (1 << copy)) != 0)
				vol->v_subdisks[(disk + copy) %
				    vol->v_disks_count].sd_recovery--;
		}

		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
			G_RAID_LOGREQ(0, bp, "Remap write failed: "
			    "failing subdisk.");
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
			bp->bio_error = 0;
		}
		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
	}
	if (pbp->bio_cmd != BIO_READ) {
		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
			pbp->bio_error = bp->bio_error;
		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		}
		error = pbp->bio_error;
	} else
		error = bp->bio_error;
	if (pbp->bio_children == pbp->bio_inbed) {
		pbp->bio_completed = pbp->bio_length;
		g_raid_iodone(pbp, error);
	}
}

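/*
 * Added note: bio_caller2 carries the recovery state between retries as a
 * bit mask.  The low bits record which copies have already been tried
 * (mask |= 1 << copy), and bit 31 marks an in-flight recovery that must
 * eventually write the recovered data back and unlock the range, which is
 * why the code above tests (mask & (1U << 31)) before dropping sd_recovery
 * counts and unlocking.
 */
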
static int
g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int i, error;

	vol = tr->tro_volume;
	addr = virtual;
	strip_size = vol->v_strip_size;
	V2P(vol, boffset, &no, &offset, &start);
	remain = blength;
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			error = g_raid_subdisk_kerneldump(sd,
			    addr, 0, offset + start, length);
			if (error != 0)
				return (error);
nextdisk:
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	return (0);
}

static int
g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
{
	struct bio *bp;
	struct g_raid_subdisk *sd;

	bp = (struct bio *)argp;
	sd = (struct g_raid_subdisk *)bp->bio_caller1;
	g_raid_subdisk_iostart(sd, bp);

	return (0);
}

static int
g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
	/* Compensate short rebuild I/Os. */
	if ((vol->v_disks_count % N) != 0 &&
	    vol->v_strip_size < g_raid1e_rebuild_slab) {
		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
		trs->trso_recover_slabs /= vol->v_strip_size;
	}
	if (trs->trso_type == TR_RAID1E_REBUILD)
		g_raid_tr_raid1e_rebuild_some(tr);
	return (0);
}

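/*
 * Added note: with an odd disk count each rebuild transaction is capped at
 * one strip, so the scaling above converts "slabs per idle burst" into an
 * equivalent number of strip-sized transactions, e.g. a 1 MB slab with a
 * 64 KB strip multiplies trso_recover_slabs by 16.
 */
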
static int
g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;

	trs = (struct g_raid_tr_raid1e_object *)tr;

	if (trs->trso_buffer != NULL) {
		free(trs->trso_buffer, M_TR_RAID1E);
		trs->trso_buffer = NULL;
	}
	return (0);
}

G_RAID_TR_DECLARE(raid1e, "RAID1E");