4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
28 #include <sys/zfs_context.h>
30 #include <sys/vdev_impl.h>
32 #include <sys/vdev_disk.h>
34 #include <sys/vdev_file.h>
35 #include <sys/vdev_raidz.h>
37 #include <sys/zio_checksum.h>
38 #include <sys/fs/zfs.h>
39 #include <sys/fm/fs/zfs.h>
43 * Virtual device vector for RAID-Z.
45 * This vdev supports single, double, and triple parity. For single parity,
46 * we use a simple XOR of all the data columns. For double or triple parity,
47 * we use a special case of Reed-Solomon coding. This extends the
48 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
49 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
50 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
51 * former is also based. The latter is designed to provide higher performance
54 * Note that the Plank paper claimed to support arbitrary N+M, but was then
55 * amended six years later identifying a critical flaw that invalidates its
56 * claims. Nevertheless, the technique can be adapted to work for up to
57 * triple parity. For additional parity, the amendment "Note: Correction to
58 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
59 * is viable, but the additional complexity means that write performance will
62 * All of the methods above operate on a Galois field, defined over the
63 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
64 * can be expressed with a single byte. Briefly, the operations on the
65 * field are defined as follows:
67 * o addition (+) is represented by a bitwise XOR
68 * o subtraction (-) is therefore identical to addition: A + B = A - B
69 * o multiplication of A by 2 is defined by the following bitwise expression:
74 * (A * 2)_4 = A_3 + A_7
75 * (A * 2)_3 = A_2 + A_7
76 * (A * 2)_2 = A_1 + A_7
80 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
81 * As an aside, this multiplication is derived from the error correcting
82 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
84 * Observe that any number in the field (except for 0) can be expressed as a
85 * power of 2 -- a generator for the field. We store a table of the powers of
86 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
87 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
88 * than field addition). The inverse of a field element A (A^-1) is therefore
89 * A ^ (255 - 1) = A^254.
91 * The up-to-three parity columns, P, Q, R over several data columns,
92 * D_0, ... D_n-1, can be expressed by field operations:
94 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
95 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
96 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
97 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
98 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
100 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
101 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
102 * independent coefficients. (There are no additional coefficients that have
103 * this property which is why the uncorrected Plank method breaks down.)
105 * See the reconstruction code below for how P, Q and R can be used individually
106 * or in concert to recover missing data columns.
/*
 * Per-child-device state for one column of a RAID-Z I/O: which child the
 * column targets, where it lands on that child, its buffers, and the status
 * of the attempted I/O.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
120 typedef struct raidz_map {
121 uint64_t rm_cols; /* Regular column count */
122 uint64_t rm_scols; /* Count including skipped columns */
123 uint64_t rm_bigcols; /* Number of oversized columns */
124 uint64_t rm_asize; /* Actual total I/O size */
125 uint64_t rm_missingdata; /* Count of missing data devices */
126 uint64_t rm_missingparity; /* Count of missing parity devices */
127 uint64_t rm_firstdatacol; /* First data column/parity count */
128 uint64_t rm_nskip; /* Skipped sectors for padding */
129 uint64_t rm_skipstart; /* Column index of padding start */
130 void *rm_datacopy; /* rm_asize-buffer of copied data */
131 uintptr_t rm_reports; /* # of referencing checksum reports */
132 uint8_t rm_freed; /* map no longer has referencing ZIO */
133 uint8_t rm_ecksuminjected; /* checksum error was injected */
134 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
/* Indices of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single GF(2^8) element by 2 (respectively 4): shift left and,
 * if the top bit was set, reduce by the primitive polynomial (0x1d).
 * The result is meaningful modulo 256; assign it to a uint8_t.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
/* Translate a physical offset to its location past the front vdev labels. */
#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;
/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
242 static void vdev_raidz_generate_parity(raidz_map_t *rm);
245 * Multiply a given number by 2 raised to the given power.
248 vdev_raidz_exp2(uint_t a, int exp)
254 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
256 exp += vdev_raidz_log2[a];
260 return (vdev_raidz_pow2[exp]);
264 vdev_raidz_map_free(raidz_map_t *rm)
269 for (c = 0; c < rm->rm_firstdatacol; c++) {
270 if (rm->rm_col[c].rc_data != NULL)
271 zio_buf_free(rm->rm_col[c].rc_data,
272 rm->rm_col[c].rc_size);
274 if (rm->rm_col[c].rc_gdata != NULL)
275 zio_buf_free(rm->rm_col[c].rc_gdata,
276 rm->rm_col[c].rc_size);
280 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
281 size += rm->rm_col[c].rc_size;
283 if (rm->rm_datacopy != NULL)
284 zio_buf_free(rm->rm_datacopy, size);
286 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
290 vdev_raidz_map_free_vsd(zio_t *zio)
292 raidz_map_t *rm = zio->io_vsd;
294 ASSERT0(rm->rm_freed);
297 if (rm->rm_reports == 0)
298 vdev_raidz_map_free(rm);
303 vdev_raidz_cksum_free(void *arg, size_t ignored)
305 raidz_map_t *rm = arg;
307 ASSERT3U(rm->rm_reports, >, 0);
309 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
310 vdev_raidz_map_free(rm);
314 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
316 raidz_map_t *rm = zcr->zcr_cbdata;
317 size_t c = zcr->zcr_cbinfo;
320 const char *good = NULL;
321 const char *bad = rm->rm_col[c].rc_data;
323 if (good_data == NULL) {
324 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
328 if (c < rm->rm_firstdatacol) {
330 * The first time through, calculate the parity blocks for
331 * the good data (this relies on the fact that the good
332 * data never changes for a given logical ZIO)
334 if (rm->rm_col[0].rc_gdata == NULL) {
335 char *bad_parity[VDEV_RAIDZ_MAXPARITY];
339 * Set up the rm_col[]s to generate the parity for
340 * good_data, first saving the parity bufs and
341 * replacing them with buffers to hold the result.
343 for (x = 0; x < rm->rm_firstdatacol; x++) {
344 bad_parity[x] = rm->rm_col[x].rc_data;
345 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
346 zio_buf_alloc(rm->rm_col[x].rc_size);
349 /* fill in the data columns from good_data */
350 buf = (char *)good_data;
351 for (; x < rm->rm_cols; x++) {
352 rm->rm_col[x].rc_data = buf;
353 buf += rm->rm_col[x].rc_size;
357 * Construct the parity from the good data.
359 vdev_raidz_generate_parity(rm);
361 /* restore everything back to its original state */
362 for (x = 0; x < rm->rm_firstdatacol; x++)
363 rm->rm_col[x].rc_data = bad_parity[x];
365 buf = rm->rm_datacopy;
366 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
367 rm->rm_col[x].rc_data = buf;
368 buf += rm->rm_col[x].rc_size;
372 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
373 good = rm->rm_col[c].rc_gdata;
375 /* adjust good_data to point at the start of our column */
378 for (x = rm->rm_firstdatacol; x < c; x++)
379 good += rm->rm_col[x].rc_size;
382 /* we drop the ereport if it ends up that the data was good */
383 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
387 * Invoked indirectly by zfs_ereport_start_checksum(), called
388 * below when our read operation fails completely. The main point
389 * is to keep a copy of everything we read from disk, so that at
390 * vdev_raidz_cksum_finish() time we can compare it with the good data.
393 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
395 size_t c = (size_t)(uintptr_t)arg;
398 raidz_map_t *rm = zio->io_vsd;
401 /* set up the report and bump the refcount */
402 zcr->zcr_cbdata = rm;
404 zcr->zcr_finish = vdev_raidz_cksum_finish;
405 zcr->zcr_free = vdev_raidz_cksum_free;
408 ASSERT3U(rm->rm_reports, >, 0);
410 if (rm->rm_datacopy != NULL)
414 * It's the first time we're called for this raidz_map_t, so we need
415 * to copy the data aside; there's no guarantee that our zio's buffer
416 * won't be re-used for something else.
418 * Our parity data is already in separate buffers, so there's no need
423 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
424 size += rm->rm_col[c].rc_size;
426 buf = rm->rm_datacopy = zio_buf_alloc(size);
428 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
429 raidz_col_t *col = &rm->rm_col[c];
431 bcopy(col->rc_data, buf, col->rc_size);
436 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
439 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
440 vdev_raidz_map_free_vsd,
441 vdev_raidz_cksum_report
445 * Divides the IO evenly across all child vdevs; usually, dcols is
446 * the number of children in the target vdev.
449 vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
450 uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
453 /* The starting RAIDZ (parent) vdev sector of the block. */
454 uint64_t b = offset >> unit_shift;
455 /* The zio's size in units of the vdev's minimum sector size. */
456 uint64_t s = size >> unit_shift;
457 /* The first column for this stripe. */
458 uint64_t f = b % dcols;
459 /* The starting byte offset on each child vdev. */
460 uint64_t o = (b / dcols) << unit_shift;
461 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
464 * "Quotient": The number of data sectors for this stripe on all but
465 * the "big column" child vdevs that also contain "remainder" data.
467 q = s / (dcols - nparity);
470 * "Remainder": The number of partial stripe data sectors in this I/O.
471 * This will add a sector to some, but not all, child vdevs.
473 r = s - q * (dcols - nparity);
475 /* The number of "big columns" - those which contain remainder data. */
476 bc = (r == 0 ? 0 : r + nparity);
479 * The total number of data and parity sectors associated with
482 tot = s + nparity * (q + (r == 0 ? 0 : 1));
484 /* acols: The columns that will be accessed. */
485 /* scols: The columns that will be accessed or skipped. */
487 /* Our I/O request doesn't span all child vdevs. */
489 scols = MIN(dcols, roundup(bc, nparity + 1));
495 ASSERT3U(acols, <=, scols);
497 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
500 rm->rm_scols = scols;
502 rm->rm_skipstart = bc;
503 rm->rm_missingdata = 0;
504 rm->rm_missingparity = 0;
505 rm->rm_firstdatacol = nparity;
506 rm->rm_datacopy = NULL;
509 rm->rm_ecksuminjected = 0;
513 for (c = 0; c < scols; c++) {
518 coff += 1ULL << unit_shift;
520 rm->rm_col[c].rc_devidx = col;
521 rm->rm_col[c].rc_offset = coff;
522 rm->rm_col[c].rc_data = NULL;
523 rm->rm_col[c].rc_gdata = NULL;
524 rm->rm_col[c].rc_error = 0;
525 rm->rm_col[c].rc_tried = 0;
526 rm->rm_col[c].rc_skipped = 0;
529 rm->rm_col[c].rc_size = 0;
531 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
533 rm->rm_col[c].rc_size = q << unit_shift;
535 asize += rm->rm_col[c].rc_size;
538 ASSERT3U(asize, ==, tot << unit_shift);
539 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
540 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
541 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
542 ASSERT3U(rm->rm_nskip, <=, nparity);
545 for (c = 0; c < rm->rm_firstdatacol; c++) {
546 rm->rm_col[c].rc_data =
547 zio_buf_alloc(rm->rm_col[c].rc_size);
550 rm->rm_col[c].rc_data = data;
552 for (c = c + 1; c < acols; c++) {
553 rm->rm_col[c].rc_data =
554 (char *)rm->rm_col[c - 1].rc_data +
555 rm->rm_col[c - 1].rc_size;
560 * If all data stored spans all columns, there's a danger that parity
561 * will always be on the same device and, since parity isn't read
562 * during normal operation, that that device's I/O bandwidth won't be
563 * used effectively. We therefore switch the parity every 1MB.
565 * ... at least that was, ostensibly, the theory. As a practical
566 * matter unless we juggle the parity between all devices evenly, we
567 * won't see any benefit. Further, occasional writes that aren't a
568 * multiple of the LCM of the number of children and the minimum
569 * stripe width are sufficient to avoid pessimal behavior.
570 * Unfortunately, this decision created an implicit on-disk format
571 * requirement that we need to support for all eternity, but only
572 * for single-parity RAID-Z.
574 * If we intend to skip a sector in the zeroth column for padding
575 * we must make sure to note this swap. We will never intend to
576 * skip the first column since at least one data and one parity
577 * column must appear in each row.
579 ASSERT(rm->rm_cols >= 2);
580 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
582 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
583 devidx = rm->rm_col[0].rc_devidx;
584 o = rm->rm_col[0].rc_offset;
585 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
586 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
587 rm->rm_col[1].rc_devidx = devidx;
588 rm->rm_col[1].rc_offset = o;
590 if (rm->rm_skipstart == 0)
591 rm->rm_skipstart = 1;
598 vdev_raidz_generate_parity_p(raidz_map_t *rm)
600 uint64_t *p, *src, pcount, ccount, i;
603 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
605 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
606 src = rm->rm_col[c].rc_data;
607 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
608 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
610 if (c == rm->rm_firstdatacol) {
611 ASSERT(ccount == pcount);
612 for (i = 0; i < ccount; i++, src++, p++) {
616 ASSERT(ccount <= pcount);
617 for (i = 0; i < ccount; i++, src++, p++) {
625 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
627 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
630 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
631 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
632 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
634 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
635 src = rm->rm_col[c].rc_data;
636 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
637 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
639 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
641 if (c == rm->rm_firstdatacol) {
642 ASSERT(ccnt == pcnt || ccnt == 0);
643 for (i = 0; i < ccnt; i++, src++, p++, q++) {
647 for (; i < pcnt; i++, src++, p++, q++) {
652 ASSERT(ccnt <= pcnt);
655 * Apply the algorithm described above by multiplying
656 * the previous result and adding in the new value.
658 for (i = 0; i < ccnt; i++, src++, p++, q++) {
661 VDEV_RAIDZ_64MUL_2(*q, mask);
666 * Treat short columns as though they are full of 0s.
667 * Note that there's therefore nothing needed for P.
669 for (; i < pcnt; i++, q++) {
670 VDEV_RAIDZ_64MUL_2(*q, mask);
677 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
679 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
682 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
683 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
684 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
685 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
686 rm->rm_col[VDEV_RAIDZ_R].rc_size);
688 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
689 src = rm->rm_col[c].rc_data;
690 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
691 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
692 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
694 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
696 if (c == rm->rm_firstdatacol) {
697 ASSERT(ccnt == pcnt || ccnt == 0);
698 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
703 for (; i < pcnt; i++, src++, p++, q++, r++) {
709 ASSERT(ccnt <= pcnt);
712 * Apply the algorithm described above by multiplying
713 * the previous result and adding in the new value.
715 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
718 VDEV_RAIDZ_64MUL_2(*q, mask);
721 VDEV_RAIDZ_64MUL_4(*r, mask);
726 * Treat short columns as though they are full of 0s.
727 * Note that there's therefore nothing needed for P.
729 for (; i < pcnt; i++, q++, r++) {
730 VDEV_RAIDZ_64MUL_2(*q, mask);
731 VDEV_RAIDZ_64MUL_4(*r, mask);
738 * Generate RAID parity in the first virtual columns according to the number of
739 * parity columns available.
742 vdev_raidz_generate_parity(raidz_map_t *rm)
744 switch (rm->rm_firstdatacol) {
746 vdev_raidz_generate_parity_p(rm);
749 vdev_raidz_generate_parity_pq(rm);
752 vdev_raidz_generate_parity_pqr(rm);
755 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
760 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
762 uint64_t *dst, *src, xcount, ccount, count, i;
767 ASSERT(x >= rm->rm_firstdatacol);
768 ASSERT(x < rm->rm_cols);
770 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
771 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
774 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
775 dst = rm->rm_col[x].rc_data;
776 for (i = 0; i < xcount; i++, dst++, src++) {
780 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
781 src = rm->rm_col[c].rc_data;
782 dst = rm->rm_col[x].rc_data;
787 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
788 count = MIN(ccount, xcount);
790 for (i = 0; i < count; i++, dst++, src++) {
795 return (1 << VDEV_RAIDZ_P);
799 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
801 uint64_t *dst, *src, xcount, ccount, count, mask, i;
808 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
809 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
811 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
812 src = rm->rm_col[c].rc_data;
813 dst = rm->rm_col[x].rc_data;
818 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
820 count = MIN(ccount, xcount);
822 if (c == rm->rm_firstdatacol) {
823 for (i = 0; i < count; i++, dst++, src++) {
826 for (; i < xcount; i++, dst++) {
831 for (i = 0; i < count; i++, dst++, src++) {
832 VDEV_RAIDZ_64MUL_2(*dst, mask);
836 for (; i < xcount; i++, dst++) {
837 VDEV_RAIDZ_64MUL_2(*dst, mask);
842 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
843 dst = rm->rm_col[x].rc_data;
844 exp = 255 - (rm->rm_cols - 1 - x);
846 for (i = 0; i < xcount; i++, dst++, src++) {
848 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
849 *b = vdev_raidz_exp2(*b, exp);
853 return (1 << VDEV_RAIDZ_Q);
857 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
859 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
861 uint64_t xsize, ysize, i;
867 ASSERT(x >= rm->rm_firstdatacol);
868 ASSERT(y < rm->rm_cols);
870 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
873 * Move the parity data aside -- we're going to compute parity as
874 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
875 * reuse the parity generation mechanism without trashing the actual
876 * parity so we make those columns appear to be full of zeros by
877 * setting their lengths to zero.
879 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
880 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
881 xsize = rm->rm_col[x].rc_size;
882 ysize = rm->rm_col[y].rc_size;
884 rm->rm_col[VDEV_RAIDZ_P].rc_data =
885 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
886 rm->rm_col[VDEV_RAIDZ_Q].rc_data =
887 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
888 rm->rm_col[x].rc_size = 0;
889 rm->rm_col[y].rc_size = 0;
891 vdev_raidz_generate_parity_pq(rm);
893 rm->rm_col[x].rc_size = xsize;
894 rm->rm_col[y].rc_size = ysize;
898 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
899 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
900 xd = rm->rm_col[x].rc_data;
901 yd = rm->rm_col[y].rc_data;
905 * Pxy = P + D_x + D_y
906 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
908 * We can then solve for D_x:
909 * D_x = A * (P + Pxy) + B * (Q + Qxy)
911 * A = 2^(x - y) * (2^(x - y) + 1)^-1
912 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
914 * With D_x in hand, we can easily solve for D_y:
915 * D_y = P + Pxy + D_x
918 a = vdev_raidz_pow2[255 + x - y];
919 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
920 tmp = 255 - vdev_raidz_log2[a ^ 1];
922 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
923 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
925 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
926 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
927 vdev_raidz_exp2(*q ^ *qxy, bexp);
930 *yd = *p ^ *pxy ^ *xd;
933 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
934 rm->rm_col[VDEV_RAIDZ_P].rc_size);
935 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
936 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
939 * Restore the saved parity data.
941 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
942 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
944 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
949 * In the general case of reconstruction, we must solve the system of linear
950 * equations defined by the coefficients used to generate parity as well as
951 * the contents of the data and parity disks. This can be expressed with
952 * vectors for the original data (D) and the actual data (d) and parity (p)
953 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
957 * | V | | D_0 | | p_m-1 |
958 * | | x | : | = | d_0 |
959 * | I | | D_n-1 | | : |
960 * | | ~~ ~~ | d_n-1 |
963 * I is simply a square identity matrix of size n, and V is a vandermonde
964 * matrix defined by the coefficients we chose for the various parity columns
965 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
966 * computation as well as linear separability.
969 * | 1 .. 1 1 1 | | p_0 |
970 * | 2^n-1 .. 4 2 1 | __ __ | : |
971 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
972 * | 1 .. 0 0 0 | | D_1 | | d_0 |
973 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
974 * | : : : : | | : | | d_2 |
975 * | 0 .. 1 0 0 | | D_n-1 | | : |
976 * | 0 .. 0 1 0 | ~~ ~~ | : |
977 * | 0 .. 0 0 1 | | d_n-1 |
980 * Note that I, V, d, and p are known. To compute D, we must invert the
981 * matrix and use the known data and parity values to reconstruct the unknown
982 * data values. We begin by removing the rows in V|I and d|p that correspond
983 * to failed or missing columns; we then make V|I square (n x n) and d|p
984 * sized n by removing rows corresponding to unused parity from the bottom up
985 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
986 * using Gauss-Jordan elimination. In the example below we use m=3 parity
987 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
989 * | 1 1 1 1 1 1 1 1 |
990 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
991 * | 19 205 116 29 64 16 4 1 | / /
992 * | 1 0 0 0 0 0 0 0 | / /
993 * | 0 1 0 0 0 0 0 0 | <--' /
994 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
995 * | 0 0 0 1 0 0 0 0 |
996 * | 0 0 0 0 1 0 0 0 |
997 * | 0 0 0 0 0 1 0 0 |
998 * | 0 0 0 0 0 0 1 0 |
999 * | 0 0 0 0 0 0 0 1 |
1002 * | 1 1 1 1 1 1 1 1 |
1003 * | 19 205 116 29 64 16 4 1 |
1004 * | 1 0 0 0 0 0 0 0 |
1005 * (V|I)' = | 0 0 0 1 0 0 0 0 |
1006 * | 0 0 0 0 1 0 0 0 |
1007 * | 0 0 0 0 0 1 0 0 |
1008 * | 0 0 0 0 0 0 1 0 |
1009 * | 0 0 0 0 0 0 0 1 |
1012 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1013 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1014 * matrix is not singular.
1016 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1017 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1018 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1019 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1020 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1021 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1022 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1023 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1026 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1027 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1028 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1029 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1030 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1031 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1032 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1033 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1036 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1037 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1038 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1039 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1040 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1041 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1042 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1043 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1046 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1047 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1048 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1049 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1050 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1051 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1052 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1053 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1056 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1057 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1058 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1059 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1060 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1061 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1062 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1063 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1066 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1067 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1068 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1069 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1070 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1071 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1072 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1073 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1076 * | 0 0 1 0 0 0 0 0 |
1077 * | 167 100 5 41 159 169 217 208 |
1078 * | 166 100 4 40 158 168 216 209 |
1079 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1080 * | 0 0 0 0 1 0 0 0 |
1081 * | 0 0 0 0 0 1 0 0 |
1082 * | 0 0 0 0 0 0 1 0 |
1083 * | 0 0 0 0 0 0 0 1 |
1086 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1087 * of the missing data.
1089 * As is apparent from the example above, the only non-trivial rows in the
1090 * inverse matrix correspond to the data disks that we're trying to
1091 * reconstruct. Indeed, those are the only rows we need as the others would
1092 * only be useful for reconstructing data known or assumed to be valid. For
1093 * that reason, we only build the coefficients in the rows that correspond to
1099 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1105 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1108 * Fill in the missing rows of interest.
1110 for (i = 0; i < nmap; i++) {
1111 ASSERT3S(0, <=, map[i]);
1112 ASSERT3S(map[i], <=, 2);
1119 for (j = 0; j < n; j++) {
1123 rows[i][j] = vdev_raidz_pow2[pow];
1129 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1130 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1136 * Assert that the first nmissing entries from the array of used
1137 * columns correspond to parity columns and that subsequent entries
1138 * correspond to data columns.
1140 for (i = 0; i < nmissing; i++) {
1141 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1143 for (; i < n; i++) {
1144 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1148 * First initialize the storage where we'll compute the inverse rows.
1150 for (i = 0; i < nmissing; i++) {
1151 for (j = 0; j < n; j++) {
1152 invrows[i][j] = (i == j) ? 1 : 0;
1157 * Subtract all trivial rows from the rows of consequence.
1159 for (i = 0; i < nmissing; i++) {
1160 for (j = nmissing; j < n; j++) {
1161 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1162 jj = used[j] - rm->rm_firstdatacol;
1164 invrows[i][j] = rows[i][jj];
1170 * For each of the rows of interest, we must normalize it and subtract
1171 * a multiple of it from the other rows.
1173 for (i = 0; i < nmissing; i++) {
1174 for (j = 0; j < missing[i]; j++) {
1175 ASSERT0(rows[i][j]);
1177 ASSERT3U(rows[i][missing[i]], !=, 0);
1180 * Compute the inverse of the first element and multiply each
1181 * element in the row by that value.
1183 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1185 for (j = 0; j < n; j++) {
1186 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1187 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1190 for (ii = 0; ii < nmissing; ii++) {
1194 ASSERT3U(rows[ii][missing[i]], !=, 0);
1196 log = vdev_raidz_log2[rows[ii][missing[i]]];
1198 for (j = 0; j < n; j++) {
1200 vdev_raidz_exp2(rows[i][j], log);
1202 vdev_raidz_exp2(invrows[i][j], log);
1208 * Verify that the data that is left in the rows are properly part of
1209 * an identity matrix.
1211 for (i = 0; i < nmissing; i++) {
1212 for (j = 0; j < n; j++) {
1213 if (j == missing[i]) {
1214 ASSERT3U(rows[i][j], ==, 1);
1216 ASSERT0(rows[i][j]);
1223 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1224 int *missing, uint8_t **invrows, const uint8_t *used)
1229 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1230 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1234 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1238 psize = sizeof (invlog[0][0]) * n * nmissing;
1239 p = kmem_alloc(psize, KM_SLEEP);
1241 for (pp = p, i = 0; i < nmissing; i++) {
1246 for (i = 0; i < nmissing; i++) {
1247 for (j = 0; j < n; j++) {
1248 ASSERT3U(invrows[i][j], !=, 0);
1249 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1253 for (i = 0; i < n; i++) {
1255 ASSERT3U(c, <, rm->rm_cols);
1257 src = rm->rm_col[c].rc_data;
1258 ccount = rm->rm_col[c].rc_size;
1259 for (j = 0; j < nmissing; j++) {
1260 cc = missing[j] + rm->rm_firstdatacol;
1261 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1262 ASSERT3U(cc, <, rm->rm_cols);
1263 ASSERT3U(cc, !=, c);
1265 dst[j] = rm->rm_col[cc].rc_data;
1266 dcount[j] = rm->rm_col[cc].rc_size;
1269 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1271 for (x = 0; x < ccount; x++, src++) {
1273 log = vdev_raidz_log2[*src];
1275 for (cc = 0; cc < nmissing; cc++) {
1276 if (x >= dcount[cc])
1282 if ((ll = log + invlog[cc][i]) >= 255)
1284 val = vdev_raidz_pow2[ll];
1295 kmem_free(p, psize);
1299 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1303 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1304 int parity_map[VDEV_RAIDZ_MAXPARITY];
1309 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1310 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1316 n = rm->rm_cols - rm->rm_firstdatacol;
1319 * Figure out which data columns are missing.
1322 for (t = 0; t < ntgts; t++) {
1323 if (tgts[t] >= rm->rm_firstdatacol) {
1324 missing_rows[nmissing_rows++] =
1325 tgts[t] - rm->rm_firstdatacol;
1330 * Figure out which parity columns to use to help generate the missing
1333 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1335 ASSERT(c < rm->rm_firstdatacol);
1338 * Skip any targeted parity columns.
1340 if (c == tgts[tt]) {
1352 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1354 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1355 nmissing_rows * n + sizeof (used[0]) * n;
1356 p = kmem_alloc(psize, KM_SLEEP);
1358 for (pp = p, i = 0; i < nmissing_rows; i++) {
1366 for (i = 0; i < nmissing_rows; i++) {
1367 used[i] = parity_map[i];
1370 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1371 if (tt < nmissing_rows &&
1372 c == missing_rows[tt] + rm->rm_firstdatacol) {
1383 * Initialize the interesting rows of the matrix.
1385 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1388 * Invert the matrix.
1390 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1394 * Reconstruct the missing data using the generated matrix.
1396 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1399 kmem_free(p, psize);
1405 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1407 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1411 int nbadparity, nbaddata;
1412 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1415 * The tgts list must already be sorted.
1417 for (i = 1; i < nt; i++) {
1418 ASSERT(t[i] > t[i - 1]);
1421 nbadparity = rm->rm_firstdatacol;
1422 nbaddata = rm->rm_cols - nbadparity;
1424 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1425 if (c < rm->rm_firstdatacol)
1426 parity_valid[c] = B_FALSE;
1428 if (i < nt && c == t[i]) {
1431 } else if (rm->rm_col[c].rc_error != 0) {
1433 } else if (c >= rm->rm_firstdatacol) {
1436 parity_valid[c] = B_TRUE;
1441 ASSERT(ntgts >= nt);
1442 ASSERT(nbaddata >= 0);
1443 ASSERT(nbaddata + nbadparity == ntgts);
1445 dt = &tgts[nbadparity];
1448 * See if we can use any of our optimized reconstruction routines.
1450 if (!vdev_raidz_default_to_general) {
1453 if (parity_valid[VDEV_RAIDZ_P])
1454 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1456 ASSERT(rm->rm_firstdatacol > 1);
1458 if (parity_valid[VDEV_RAIDZ_Q])
1459 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1461 ASSERT(rm->rm_firstdatacol > 2);
1465 ASSERT(rm->rm_firstdatacol > 1);
1467 if (parity_valid[VDEV_RAIDZ_P] &&
1468 parity_valid[VDEV_RAIDZ_Q])
1469 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1471 ASSERT(rm->rm_firstdatacol > 2);
1477 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1478 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1484 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1485 uint64_t *logical_ashift, uint64_t *physical_ashift)
1488 uint64_t nparity = vd->vdev_nparity;
1493 ASSERT(nparity > 0);
1495 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1496 vd->vdev_children < nparity + 1) {
1497 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1498 return (SET_ERROR(EINVAL));
1501 vdev_open_children(vd);
1503 for (c = 0; c < vd->vdev_children; c++) {
1504 cvd = vd->vdev_child[c];
1506 if (cvd->vdev_open_error != 0) {
1507 lasterror = cvd->vdev_open_error;
1512 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1513 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1514 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1515 *physical_ashift = MAX(*physical_ashift,
1516 cvd->vdev_physical_ashift);
1519 *asize *= vd->vdev_children;
1520 *max_asize *= vd->vdev_children;
1522 if (numerrors > nparity) {
1523 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1531 vdev_raidz_close(vdev_t *vd)
1535 for (c = 0; c < vd->vdev_children; c++)
1536 vdev_close(vd->vdev_child[c]);
1541 * Handle a read or write I/O to a RAID-Z dump device.
1543 * The dump device is in a unique situation compared to other ZFS datasets:
1544 * writing to this device should be as simple and fast as possible. In
1545 * addition, durability matters much less since the dump will be extracted
1546 * once the machine reboots. For that reason, this function eschews parity for
1547 * performance and simplicity. The dump device uses the checksum setting
1548 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1551 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
1552 * 128 KB will not fill an entire block; in addition, they may not be properly
1553 * aligned. In that case, this function uses the preallocated 128 KB block and
1554 * omits reading or writing any "empty" portions of that block, as opposed to
1555 * allocating a fresh appropriately-sized block.
1557 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1559 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1561 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1562 * allocated which spans all five child vdevs. 8 KB of data would be written to
1563 * each of four vdevs, with the fifth containing the parity bits.
1565 * parity data data data data
1566 * | PP | XX | XX | XX | XX |
1569 * 8 KB parity ------8 KB data blocks------
1571 * However, when writing to the dump device, the behavior is different:
1573 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1575 * Unlike the normal RAID-Z case in which the block is allocated based on the
1576 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
1577 * I/O size is less than 128 KB, only the actual portions of data are written.
1578 * In this example the data is written to the third data vdev since that vdev
1579 * contains the offset [64 KB, 96 KB).
1581 * parity data data data data
1587 * As a result, an individual I/O may not span all child vdevs; moreover, a
1588 * small I/O may only operate on a single child vdev.
1590 * Note that since there are no parity bits calculated or written, this format
1591 * remains the same no matter how many parity bits are used in a normal RAID-Z
1592 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
1595 * parity parity parity data data data data
1596 * | | | | | | XX | |
1602 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1603 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1605 vdev_t *tvd = vd->vdev_top;
1611 uint64_t start, end, colstart, colend;
1612 uint64_t coloffset, colsize, colskip;
1614 int flags = doread ? BIO_READ : BIO_WRITE;
1619 * Don't write past the end of the block
1621 VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE);
1627 * Allocate a RAID-Z map for this block. Note that this block starts
1628 * from the "original" offset, that is, the offset of the extent which
1629 * contains the requisite offset of the data being read or written.
1631 * Even if this I/O operation doesn't span the full block size, let's
1632 * treat the on-disk format as if the only blocks are the complete 128
1635 rm = vdev_raidz_map_alloc(data - (offset - origoffset),
1636 SPA_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, vd->vdev_children,
1639 coloffset = origoffset;
1641 for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1642 c++, coloffset += rc->rc_size) {
1643 rc = &rm->rm_col[c];
1644 cvd = vd->vdev_child[rc->rc_devidx];
1647 * Find the start and end of this column in the RAID-Z map,
1648 * keeping in mind that the stated size and offset of the
1649 * operation may not fill the entire column for this vdev.
1651 * If any portion of the data spans this column, issue the
1652 * appropriate operation to the vdev.
1654 if (coloffset + rc->rc_size <= start)
1656 if (coloffset >= end)
1659 colstart = MAX(coloffset, start);
1660 colend = MIN(end, coloffset + rc->rc_size);
1661 colsize = colend - colstart;
1662 colskip = colstart - coloffset;
1664 VERIFY3U(colsize, <=, rc->rc_size);
1665 VERIFY3U(colskip, <=, rc->rc_size);
1668 * Note that the child vdev will have a vdev label at the start
1669 * of its range of offsets, hence the need for
1670 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
1671 * example of why this calculation is needed.
1673 if ((err = vdev_disk_physio(cvd,
1674 ((char *)rc->rc_data) + colskip, colsize,
1675 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1676 flags, isdump)) != 0)
1680 vdev_raidz_map_free(rm);
1688 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1691 uint64_t ashift = vd->vdev_top->vdev_ashift;
1692 uint64_t cols = vd->vdev_children;
1693 uint64_t nparity = vd->vdev_nparity;
1695 asize = ((psize - 1) >> ashift) + 1;
1696 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1697 asize = roundup(asize, nparity + 1) << ashift;
1703 vdev_raidz_child_done(zio_t *zio)
1705 raidz_col_t *rc = zio->io_private;
1707 rc->rc_error = zio->io_error;
1713 * Start an IO operation on a RAIDZ VDev
1716 * - For write operations:
1717 * 1. Generate the parity data
1718 * 2. Create child zio write operations to each column's vdev, for both
1720 * 3. If the column skips any sectors for padding, create optional dummy
1721 * write zio children for those areas to improve aggregation continuity.
1722 * - For read operations:
1723 * 1. Create child zio read operations to each data column's vdev to read
1724 * the range of data required for zio.
1725 * 2. If this is a scrub or resilver operation, or if any of the data
1726 * vdevs have had errors, then create zio read operations to the parity
1727 * columns' VDevs as well.
1730 vdev_raidz_io_start(zio_t *zio)
1732 vdev_t *vd = zio->io_vd;
1733 vdev_t *tvd = vd->vdev_top;
1739 rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
1740 zio->io_type == ZIO_TYPE_FREE,
1741 tvd->vdev_ashift, vd->vdev_children,
1745 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1747 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1749 if (zio->io_type == ZIO_TYPE_FREE) {
1750 for (c = 0; c < rm->rm_cols; c++) {
1751 rc = &rm->rm_col[c];
1752 cvd = vd->vdev_child[rc->rc_devidx];
1753 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1754 rc->rc_offset, rc->rc_data, rc->rc_size,
1755 zio->io_type, zio->io_priority, 0,
1756 vdev_raidz_child_done, rc));
1758 return (ZIO_PIPELINE_CONTINUE);
1761 if (zio->io_type == ZIO_TYPE_WRITE) {
1762 vdev_raidz_generate_parity(rm);
1764 for (c = 0; c < rm->rm_cols; c++) {
1765 rc = &rm->rm_col[c];
1766 cvd = vd->vdev_child[rc->rc_devidx];
1767 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1768 rc->rc_offset, rc->rc_data, rc->rc_size,
1769 zio->io_type, zio->io_priority, 0,
1770 vdev_raidz_child_done, rc));
1774 * Generate optional I/Os for any skipped sectors to improve
1775 * aggregation contiguity.
1777 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1778 ASSERT(c <= rm->rm_scols);
1779 if (c == rm->rm_scols)
1781 rc = &rm->rm_col[c];
1782 cvd = vd->vdev_child[rc->rc_devidx];
1783 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1784 rc->rc_offset + rc->rc_size, NULL,
1785 1 << tvd->vdev_ashift,
1786 zio->io_type, zio->io_priority,
1787 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1790 return (ZIO_PIPELINE_CONTINUE);
1793 ASSERT(zio->io_type == ZIO_TYPE_READ);
1796 * Iterate over the columns in reverse order so that we hit the parity
1797 * last -- any errors along the way will force us to read the parity.
1799 for (c = rm->rm_cols - 1; c >= 0; c--) {
1800 rc = &rm->rm_col[c];
1801 cvd = vd->vdev_child[rc->rc_devidx];
1802 if (!vdev_readable(cvd)) {
1803 if (c >= rm->rm_firstdatacol)
1804 rm->rm_missingdata++;
1806 rm->rm_missingparity++;
1807 rc->rc_error = SET_ERROR(ENXIO);
1808 rc->rc_tried = 1; /* don't even try */
1812 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1813 if (c >= rm->rm_firstdatacol)
1814 rm->rm_missingdata++;
1816 rm->rm_missingparity++;
1817 rc->rc_error = SET_ERROR(ESTALE);
1821 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1822 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1823 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1824 rc->rc_offset, rc->rc_data, rc->rc_size,
1825 zio->io_type, zio->io_priority, 0,
1826 vdev_raidz_child_done, rc));
1830 return (ZIO_PIPELINE_CONTINUE);
1835 * Report a checksum error for a child of a RAID-Z device.
1838 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1840 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1842 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1843 zio_bad_cksum_t zbc;
1844 raidz_map_t *rm = zio->io_vsd;
1846 mutex_enter(&vd->vdev_stat_lock);
1847 vd->vdev_stat.vs_checksum_errors++;
1848 mutex_exit(&vd->vdev_stat_lock);
1850 zbc.zbc_has_cksum = 0;
1851 zbc.zbc_injected = rm->rm_ecksuminjected;
1853 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1854 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1860 * We keep track of whether or not there were any injected errors, so that
1861 * any ereports we generate can note it.
1864 raidz_checksum_verify(zio_t *zio)
1866 zio_bad_cksum_t zbc;
1867 raidz_map_t *rm = zio->io_vsd;
1869 int ret = zio_checksum_error(zio, &zbc);
1870 if (ret != 0 && zbc.zbc_injected != 0)
1871 rm->rm_ecksuminjected = 1;
1877 * Generate the parity from the data columns. If we tried and were able to
1878 * read the parity without error, verify that the generated parity matches the
1879 * data we read. If it doesn't, we fire off a checksum error. Return the
1880 * number of such failures.
1883 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1885 void *orig[VDEV_RAIDZ_MAXPARITY];
1889 blkptr_t *bp = zio->io_bp;
1890 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1891 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1893 if (checksum == ZIO_CHECKSUM_NOPARITY)
1896 for (c = 0; c < rm->rm_firstdatacol; c++) {
1897 rc = &rm->rm_col[c];
1898 if (!rc->rc_tried || rc->rc_error != 0)
1900 orig[c] = zio_buf_alloc(rc->rc_size);
1901 bcopy(rc->rc_data, orig[c], rc->rc_size);
1904 vdev_raidz_generate_parity(rm);
1906 for (c = 0; c < rm->rm_firstdatacol; c++) {
1907 rc = &rm->rm_col[c];
1908 if (!rc->rc_tried || rc->rc_error != 0)
1910 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1911 raidz_checksum_error(zio, rc, orig[c]);
1912 rc->rc_error = SET_ERROR(ECKSUM);
1915 zio_buf_free(orig[c], rc->rc_size);
1922 * Keep statistics on all the ways that we used parity to correct data.
1924 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1927 vdev_raidz_worst_error(raidz_map_t *rm)
1931 for (int c = 0; c < rm->rm_cols; c++)
1932 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1938 * Iterate over all combinations of bad data and attempt a reconstruction.
1939 * Note that the algorithm below is non-optimal because it doesn't take into
1940 * account how reconstruction is actually performed. For example, with
1941 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1942 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1943 * cases we'd only use parity information in column 0.
1946 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1948 raidz_map_t *rm = zio->io_vsd;
1950 void *orig[VDEV_RAIDZ_MAXPARITY];
1951 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1952 int *tgts = &tstore[1];
1953 int current, next, i, c, n;
1956 ASSERT(total_errors < rm->rm_firstdatacol);
1959 * This simplifies one edge condition.
1963 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1965 * Initialize the targets array by finding the first n columns
1966 * that contain no error.
1968 * If there were no data errors, we need to ensure that we're
1969 * always explicitly attempting to reconstruct at least one
1970 * data column. To do this, we simply push the highest target
1971 * up into the data columns.
1973 for (c = 0, i = 0; i < n; i++) {
1974 if (i == n - 1 && data_errors == 0 &&
1975 c < rm->rm_firstdatacol) {
1976 c = rm->rm_firstdatacol;
1979 while (rm->rm_col[c].rc_error != 0) {
1981 ASSERT3S(c, <, rm->rm_cols);
1988 * Setting tgts[n] simplifies the other edge condition.
1990 tgts[n] = rm->rm_cols;
1993 * These buffers were allocated in previous iterations.
1995 for (i = 0; i < n - 1; i++) {
1996 ASSERT(orig[i] != NULL);
1999 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2002 next = tgts[current];
2004 while (current != n) {
2005 tgts[current] = next;
2009 * Save off the original data that we're going to
2010 * attempt to reconstruct.
2012 for (i = 0; i < n; i++) {
2013 ASSERT(orig[i] != NULL);
2016 ASSERT3S(c, <, rm->rm_cols);
2017 rc = &rm->rm_col[c];
2018 bcopy(rc->rc_data, orig[i], rc->rc_size);
2022 * Attempt a reconstruction and exit the outer loop on
2025 code = vdev_raidz_reconstruct(rm, tgts, n);
2026 if (raidz_checksum_verify(zio) == 0) {
2027 atomic_inc_64(&raidz_corrected[code]);
2029 for (i = 0; i < n; i++) {
2031 rc = &rm->rm_col[c];
2032 ASSERT(rc->rc_error == 0);
2034 raidz_checksum_error(zio, rc,
2036 rc->rc_error = SET_ERROR(ECKSUM);
2044 * Restore the original data.
2046 for (i = 0; i < n; i++) {
2048 rc = &rm->rm_col[c];
2049 bcopy(orig[i], rc->rc_data, rc->rc_size);
2054 * Find the next valid column after the current
2057 for (next = tgts[current] + 1;
2058 next < rm->rm_cols &&
2059 rm->rm_col[next].rc_error != 0; next++)
2062 ASSERT(next <= tgts[current + 1]);
2065 * If that spot is available, we're done here.
2067 if (next != tgts[current + 1])
2071 * Otherwise, find the next valid column after
2072 * the previous position.
2074 for (c = tgts[current - 1] + 1;
2075 rm->rm_col[c].rc_error != 0; c++)
2081 } while (current != n);
2086 for (i = 0; i < n; i++) {
2087 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2094 * Complete an IO operation on a RAIDZ VDev
2097 * - For write operations:
2098 * 1. Check for errors on the child IOs.
2099 * 2. Return, setting an error code if too few child VDevs were written
2100 * to reconstruct the data later. Note that partial writes are
2101 * considered successful if they can be reconstructed at all.
2102 * - For read operations:
2103 * 1. Check for errors on the child IOs.
2104 * 2. If data errors occurred:
2105 * a. Try to reassemble the data from the parity available.
2106 * b. If we haven't yet read the parity drives, read them now.
2107 * c. If all parity drives have been read but the data still doesn't
2108 * reassemble with a correct checksum, then try combinatorial
2110 * d. If that doesn't work, return an error.
2111 * 3. If there were unexpected errors or this is a resilver operation,
2112 * rewrite the vdevs that had errors.
2115 vdev_raidz_io_done(zio_t *zio)
2117 vdev_t *vd = zio->io_vd;
2119 raidz_map_t *rm = zio->io_vsd;
2121 int unexpected_errors = 0;
2122 int parity_errors = 0;
2123 int parity_untried = 0;
2124 int data_errors = 0;
2125 int total_errors = 0;
2127 int tgts[VDEV_RAIDZ_MAXPARITY];
2130 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
2132 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2133 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2135 for (c = 0; c < rm->rm_cols; c++) {
2136 rc = &rm->rm_col[c];
2139 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
2141 if (c < rm->rm_firstdatacol)
2146 if (!rc->rc_skipped)
2147 unexpected_errors++;
2150 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2155 if (zio->io_type == ZIO_TYPE_WRITE) {
2157 * XXX -- for now, treat partial writes as a success.
2158 * (If we couldn't write enough columns to reconstruct
2159 * the data, the I/O failed. Otherwise, good enough.)
2161 * Now that we support write reallocation, it would be better
2162 * to treat partial failure as real failure unless there are
2163 * no non-degraded top-level vdevs left, and not update DTLs
2164 * if we intend to reallocate.
2167 if (total_errors > rm->rm_firstdatacol)
2168 zio->io_error = vdev_raidz_worst_error(rm);
2171 } else if (zio->io_type == ZIO_TYPE_FREE) {
2175 ASSERT(zio->io_type == ZIO_TYPE_READ);
2177 * There are three potential phases for a read:
2178 * 1. produce valid data from the columns read
2179 * 2. read all disks and try again
2180 * 3. perform combinatorial reconstruction
2182 * Each phase is progressively both more expensive and less likely to
2183 * occur. If we encounter more errors than we can repair or all phases
2184 * fail, we have no choice but to return an error.
2188 * If the number of errors we saw was correctable -- less than or equal
2189 * to the number of parity disks read -- attempt to produce data that
2190 * has a valid checksum. Naturally, this case applies in the absence of
2193 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2194 if (data_errors == 0) {
2195 if (raidz_checksum_verify(zio) == 0) {
2197 * If we read parity information (unnecessarily
2198 * as it happens since no reconstruction was
2199 * needed) regenerate and verify the parity.
2200 * We also regenerate parity when resilvering
2201 * so we can write it out to the failed device
2204 if (parity_errors + parity_untried <
2205 rm->rm_firstdatacol ||
2206 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2207 n = raidz_parity_verify(zio, rm);
2208 unexpected_errors += n;
2209 ASSERT(parity_errors + n <=
2210 rm->rm_firstdatacol);
2216 * We either attempt to read all the parity columns or
2217 * none of them. If we didn't try to read parity, we
2218 * wouldn't be here in the correctable case. There must
2219 * also have been fewer parity errors than parity
2220 * columns or, again, we wouldn't be in this code path.
2222 ASSERT(parity_untried == 0);
2223 ASSERT(parity_errors < rm->rm_firstdatacol);
2226 * Identify the data columns that reported an error.
2229 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2230 rc = &rm->rm_col[c];
2231 if (rc->rc_error != 0) {
2232 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2237 ASSERT(rm->rm_firstdatacol >= n);
2239 code = vdev_raidz_reconstruct(rm, tgts, n);
2241 if (raidz_checksum_verify(zio) == 0) {
2242 atomic_inc_64(&raidz_corrected[code]);
2245 * If we read more parity disks than were used
2246 * for reconstruction, confirm that the other
2247 * parity disks produced correct data. This
2248 * routine is suboptimal in that it regenerates
2249 * the parity that we already used in addition
2250 * to the parity that we're attempting to
2251 * verify, but this should be a relatively
2252 * uncommon case, and can be optimized if it
2253 * becomes a problem. Note that we regenerate
2254 * parity when resilvering so we can write it
2255 * out to failed devices later.
2257 if (parity_errors < rm->rm_firstdatacol - n ||
2258 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2259 n = raidz_parity_verify(zio, rm);
2260 unexpected_errors += n;
2261 ASSERT(parity_errors + n <=
2262 rm->rm_firstdatacol);
2271 * This isn't a typical situation -- either we got a read error or
2272 * a child silently returned bad data. Read every block so we can
2273 * try again with as much data and parity as we can track down. If
2274 * we've already been through once before, all children will be marked
2275 * as tried so we'll proceed to combinatorial reconstruction.
2277 unexpected_errors = 1;
2278 rm->rm_missingdata = 0;
2279 rm->rm_missingparity = 0;
2281 for (c = 0; c < rm->rm_cols; c++) {
2282 if (rm->rm_col[c].rc_tried)
2285 zio_vdev_io_redone(zio);
2287 rc = &rm->rm_col[c];
2290 zio_nowait(zio_vdev_child_io(zio, NULL,
2291 vd->vdev_child[rc->rc_devidx],
2292 rc->rc_offset, rc->rc_data, rc->rc_size,
2293 zio->io_type, zio->io_priority, 0,
2294 vdev_raidz_child_done, rc));
2295 } while (++c < rm->rm_cols);
2301 * At this point we've attempted to reconstruct the data given the
2302 * errors we detected, and we've attempted to read all columns. There
2303 * must, therefore, be one or more additional problems -- silent errors
2304 * resulting in invalid data rather than explicit I/O errors resulting
2305 * in absent data. We check if there is enough additional data to
2306 * possibly reconstruct the data and then perform combinatorial
2307 * reconstruction over all possible combinations. If that fails,
2310 if (total_errors > rm->rm_firstdatacol) {
2311 zio->io_error = vdev_raidz_worst_error(rm);
2313 } else if (total_errors < rm->rm_firstdatacol &&
2314 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2316 * If we didn't use all the available parity for the
2317 * combinatorial reconstruction, verify that the remaining
2318 * parity is correct.
2320 if (code != (1 << rm->rm_firstdatacol) - 1)
2321 (void) raidz_parity_verify(zio, rm);
2324 * We're here because either:
2326 * total_errors == rm_firstdatacol, or
2327 * vdev_raidz_combrec() failed
2329 * In either case, there is enough bad data to prevent
2332 * Start checksum ereports for all children which haven't
2333 * failed, and the IO wasn't speculative.
2335 zio->io_error = SET_ERROR(ECKSUM);
2337 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2338 for (c = 0; c < rm->rm_cols; c++) {
2339 rc = &rm->rm_col[c];
2340 if (rc->rc_error == 0) {
2341 zio_bad_cksum_t zbc;
2342 zbc.zbc_has_cksum = 0;
2344 rm->rm_ecksuminjected;
2346 zfs_ereport_start_checksum(
2348 vd->vdev_child[rc->rc_devidx],
2349 zio, rc->rc_offset, rc->rc_size,
2350 (void *)(uintptr_t)c, &zbc);
2357 zio_checksum_verified(zio);
2359 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2360 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2362 * Use the good data we have in hand to repair damaged children.
2364 for (c = 0; c < rm->rm_cols; c++) {
2365 rc = &rm->rm_col[c];
2366 cvd = vd->vdev_child[rc->rc_devidx];
2368 if (rc->rc_error == 0)
2371 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2372 rc->rc_offset, rc->rc_data, rc->rc_size,
2373 ZIO_TYPE_WRITE, zio->io_priority,
2374 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2375 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2381 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2383 if (faulted > vd->vdev_nparity)
2384 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2385 VDEV_AUX_NO_REPLICAS);
2386 else if (degraded + faulted != 0)
2387 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2389 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2392 vdev_ops_t vdev_raidz_ops = {
2396 vdev_raidz_io_start,
2398 vdev_raidz_state_change,
2401 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2402 B_FALSE /* not a leaf vdev */