sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2013 by Delphix. All rights reserved.
  25  */
  26
  27 #include <sys/zfs_context.h>
  28 #include <sys/spa.h>
  29 #include <sys/vdev_impl.h>
  30 #include <sys/zio.h>
  31 #include <sys/zio_checksum.h>
  32 #include <sys/fs/zfs.h>
  33 #include <sys/fm/fs/zfs.h>
  34
  35 /*
  36  * Virtual device vector for RAID-Z.
  37  *
  38  * This vdev supports single, double, and triple parity. For single parity,
  39  * we use a simple XOR of all the data columns. For double or triple parity,
  40  * we use a special case of Reed-Solomon coding. This extends the
  41  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  42  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  43  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  44  * former is also based. The latter is designed to provide higher performance
  45  * for writes.
  46  *
  47  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  48  * amended six years later identifying a critical flaw that invalidates its
  49  * claims. Nevertheless, the technique can be adapted to work for up to
  50  * triple parity. For additional parity, the amendment "Note: Correction to
  51  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  52  * is viable, but the additional complexity means that write performance will
  53  * suffer.
  54  *
  55  * All of the methods above operate on a Galois field, defined over the
  56  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
  57  * can be expressed with a single byte. Briefly, the operations on the
  58  * field are defined as follows:
  59  *
  60  *   o addition (+) is represented by a bitwise XOR
  61  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  62  *   o multiplication of A by 2 is defined by the following bitwise expression:
  63  *
  64  *      (A * 2)_7 = A_6
  65  *      (A * 2)_6 = A_5
  66  *      (A * 2)_5 = A_4
  67  *      (A * 2)_4 = A_3 + A_7
  68  *      (A * 2)_3 = A_2 + A_7
  69  *      (A * 2)_2 = A_1 + A_7
  70  *      (A * 2)_1 = A_0
  71  *      (A * 2)_0 = A_7
  72  *
  73  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
  74  * As an aside, this multiplication is derived from the error correcting
  75  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  76  *
  77  * Observe that any number in the field (except for 0) can be expressed as a
  78  * power of 2 -- a generator for the field. We store a table of the powers of
  79  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  80  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
  81  * than field addition). The inverse of a field element A (A^-1) is therefore
  82  * A ^ (255 - 1) = A^254.
  83  *
  84  * The up-to-three parity columns, P, Q, R over several data columns,
  85  * D_0, ... D_n-1, can be expressed by field operations:
  86  *
  87  *      P = D_0 + D_1 + ... + D_n-2 + D_n-1
  88  *      Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  89  *        = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
  90  *      R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
  91  *        = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
  92  *
  93  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
  94  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
  95  * independent coefficients. (There are no additional coefficients that have
  96  * this property which is why the uncorrected Plank method breaks down.)
  97  *
  98  * See the reconstruction code below for how P, Q and R can used individually
  99  * or in concert to recover missing data columns.
 100  */
 101
 102 typedef struct raidz_col {
 103         uint64_t rc_devidx;             /* child device index for I/O */
 104         uint64_t rc_offset;             /* device offset */
 105         uint64_t rc_size;               /* I/O size */
 106         void *rc_data;                  /* I/O data */
 107         void *rc_gdata;                 /* used to store the "good" version */
 108         int rc_error;                   /* I/O error for this device */
 109         uint8_t rc_tried;               /* Did we attempt this I/O column? */
 110         uint8_t rc_skipped;             /* Did we skip this I/O column? */
 111 } raidz_col_t;
 112
 113 typedef struct raidz_map {
 114         uint64_t rm_cols;               /* Regular column count */
 115         uint64_t rm_scols;              /* Count including skipped columns */
 116         uint64_t rm_bigcols;            /* Number of oversized columns */
 117         uint64_t rm_asize;              /* Actual total I/O size */
 118         uint64_t rm_missingdata;        /* Count of missing data devices */
 119         uint64_t rm_missingparity;      /* Count of missing parity devices */
 120         uint64_t rm_firstdatacol;       /* First data column/parity count */
 121         uint64_t rm_nskip;              /* Skipped sectors for padding */
 122         uint64_t rm_skipstart;          /* Column index of padding start */
 123         void *rm_datacopy;              /* rm_asize-buffer of copied data */
 124         uintptr_t rm_reports;           /* # of referencing checksum reports */
 125         uint8_t rm_freed;               /* map no longer has referencing ZIO */
 126         uint8_t rm_ecksuminjected;      /* checksum error was injected */
 127         raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
 128 } raidz_map_t;
 129
 130 #define VDEV_RAIDZ_P            0
 131 #define VDEV_RAIDZ_Q            1
 132 #define VDEV_RAIDZ_R            2
 133
 134 #define VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
 135 #define VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 136
 137 /*
 138  * We provide a mechanism to perform the field multiplication operation on a
 139  * 64-bit value all at once rather than a byte at a time. This works by
 140  * creating a mask from the top bit in each byte and using that to
 141  * conditionally apply the XOR of 0x1d.
 142  */
 143 #define VDEV_RAIDZ_64MUL_2(x, mask) \
 144 { \
 145         (mask) = (x) & 0x8080808080808080ULL; \
 146         (mask) = ((mask) << 1) - ((mask) >> 7); \
 147         (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 148             ((mask) & 0x1d1d1d1d1d1d1d1d); \
 149 }
 150
 151 #define VDEV_RAIDZ_64MUL_4(x, mask) \
 152 { \
 153         VDEV_RAIDZ_64MUL_2((x), mask); \
 154         VDEV_RAIDZ_64MUL_2((x), mask); \
 155 }
 156
 157 /*
 158  * Force reconstruction to use the general purpose method.
 159  */
 160 int vdev_raidz_default_to_general;
 161
 162 /* Powers of 2 in the Galois field defined above. */
 163 static const uint8_t vdev_raidz_pow2[256] = {
 164         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
 165         0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
 166         0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
 167         0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
 168         0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
 169         0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
 170         0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
 171         0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
 172         0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
 173         0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
 174         0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
 175         0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
 176         0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
 177         0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
 178         0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
 179         0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
 180         0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
 181         0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
 182         0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
 183         0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
 184         0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
 185         0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
 186         0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
 187         0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
 188         0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
 189         0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
 190         0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
 191         0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
 192         0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
 193         0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
 194         0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
 195         0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
 196 };
 197 /* Logs of 2 in the Galois field defined above. */
 198 static const uint8_t vdev_raidz_log2[256] = {
 199         0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
 200         0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
 201         0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
 202         0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
 203         0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
 204         0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
 205         0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
 206         0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
 207         0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
 208         0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
 209         0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
 210         0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
 211         0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
 212         0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
 213         0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
 214         0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
 215         0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
 216         0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
 217         0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
 218         0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
 219         0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
 220         0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
 221         0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
 222         0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
 223         0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
 224         0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
 225         0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
 226         0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
 227         0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
 228         0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
 229         0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
 230         0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
 231 };
 232
 233 static void vdev_raidz_generate_parity(raidz_map_t *rm);
 234
 235 /*
 236  * Multiply a given number by 2 raised to the given power.
 237  */
 238 static uint8_t
 239 vdev_raidz_exp2(uint_t a, int exp)
 240 {
 241         if (a == 0)
 242                 return (0);
 243
 244         ASSERT(exp >= 0);
 245         ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
 246
 247         exp += vdev_raidz_log2[a];
 248         if (exp > 255)
 249                 exp -= 255;
 250
 251         return (vdev_raidz_pow2[exp]);
 252 }
 253
 254 static void
 255 vdev_raidz_map_free(raidz_map_t *rm)
 256 {
 257         int c;
 258         size_t size;
 259
 260         for (c = 0; c < rm->rm_firstdatacol; c++) {
 261                 if (rm->rm_col[c].rc_data != NULL)
 262                         zio_buf_free(rm->rm_col[c].rc_data,
 263                             rm->rm_col[c].rc_size);
 264
 265                 if (rm->rm_col[c].rc_gdata != NULL)
 266                         zio_buf_free(rm->rm_col[c].rc_gdata,
 267                             rm->rm_col[c].rc_size);
 268         }
 269
 270         size = 0;
 271         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
 272                 size += rm->rm_col[c].rc_size;
 273
 274         if (rm->rm_datacopy != NULL)
 275                 zio_buf_free(rm->rm_datacopy, size);
 276
 277         kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 278 }
 279
 280 static void
 281 vdev_raidz_map_free_vsd(zio_t *zio)
 282 {
 283         raidz_map_t *rm = zio->io_vsd;
 284
 285         ASSERT0(rm->rm_freed);
 286         rm->rm_freed = 1;
 287
 288         if (rm->rm_reports == 0)
 289                 vdev_raidz_map_free(rm);
 290 }
 291
 292 /*ARGSUSED*/
 293 static void
 294 vdev_raidz_cksum_free(void *arg, size_t ignored)
 295 {
 296         raidz_map_t *rm = arg;
 297
 298         ASSERT3U(rm->rm_reports, >, 0);
 299
 300         if (--rm->rm_reports == 0 && rm->rm_freed != 0)
 301                 vdev_raidz_map_free(rm);
 302 }
 303
 304 static void
 305 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
 306 {
 307         raidz_map_t *rm = zcr->zcr_cbdata;
 308         size_t c = zcr->zcr_cbinfo;
 309         size_t x;
 310
 311         const char *good = NULL;
 312         const char *bad = rm->rm_col[c].rc_data;
 313
 314         if (good_data == NULL) {
 315                 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
 316                 return;
 317         }
 318
 319         if (c < rm->rm_firstdatacol) {
 320                 /*
 321                  * The first time through, calculate the parity blocks for
 322                  * the good data (this relies on the fact that the good
 323                  * data never changes for a given logical ZIO)
 324                  */
 325                 if (rm->rm_col[0].rc_gdata == NULL) {
 326                         char *bad_parity[VDEV_RAIDZ_MAXPARITY];
 327                         char *buf;
 328
 329                         /*
 330                          * Set up the rm_col[]s to generate the parity for
 331                          * good_data, first saving the parity bufs and
 332                          * replacing them with buffers to hold the result.
 333                          */
 334                         for (x = 0; x < rm->rm_firstdatacol; x++) {
 335                                 bad_parity[x] = rm->rm_col[x].rc_data;
 336                                 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
 337                                     zio_buf_alloc(rm->rm_col[x].rc_size);
 338                         }
 339
 340                         /* fill in the data columns from good_data */
 341                         buf = (char *)good_data;
 342                         for (; x < rm->rm_cols; x++) {
 343                                 rm->rm_col[x].rc_data = buf;
 344                                 buf += rm->rm_col[x].rc_size;
 345                         }
 346
 347                         /*
 348                          * Construct the parity from the good data.
 349                          */
 350                         vdev_raidz_generate_parity(rm);
 351
 352                         /* restore everything back to its original state */
 353                         for (x = 0; x < rm->rm_firstdatacol; x++)
 354                                 rm->rm_col[x].rc_data = bad_parity[x];
 355
 356                         buf = rm->rm_datacopy;
 357                         for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
 358                                 rm->rm_col[x].rc_data = buf;
 359                                 buf += rm->rm_col[x].rc_size;
 360                         }
 361                 }
 362
 363                 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
 364                 good = rm->rm_col[c].rc_gdata;
 365         } else {
 366                 /* adjust good_data to point at the start of our column */
 367                 good = good_data;
 368
 369                 for (x = rm->rm_firstdatacol; x < c; x++)
 370                         good += rm->rm_col[x].rc_size;
 371         }
 372
 373         /* we drop the ereport if it ends up that the data was good */
 374         zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
 375 }
 376
 377 /*
 378  * Invoked indirectly by zfs_ereport_start_checksum(), called
 379  * below when our read operation fails completely.  The main point
 380  * is to keep a copy of everything we read from disk, so that at
 381  * vdev_raidz_cksum_finish() time we can compare it with the good data.
 382  */
 383 static void
 384 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
 385 {
 386         size_t c = (size_t)(uintptr_t)arg;
 387         caddr_t buf;
 388
 389         raidz_map_t *rm = zio->io_vsd;
 390         size_t size;
 391
 392         /* set up the report and bump the refcount  */
 393         zcr->zcr_cbdata = rm;
 394         zcr->zcr_cbinfo = c;
 395         zcr->zcr_finish = vdev_raidz_cksum_finish;
 396         zcr->zcr_free = vdev_raidz_cksum_free;
 397
 398         rm->rm_reports++;
 399         ASSERT3U(rm->rm_reports, >, 0);
 400
 401         if (rm->rm_datacopy != NULL)
 402                 return;
 403
 404         /*
 405          * It's the first time we're called for this raidz_map_t, so we need
 406          * to copy the data aside; there's no guarantee that our zio's buffer
 407          * won't be re-used for something else.
 408          *
 409          * Our parity data is already in separate buffers, so there's no need
 410          * to copy them.
 411          */
 412
 413         size = 0;
 414         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
 415                 size += rm->rm_col[c].rc_size;
 416
 417         buf = rm->rm_datacopy = zio_buf_alloc(size);
 418
 419         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 420                 raidz_col_t *col = &rm->rm_col[c];
 421
 422                 bcopy(col->rc_data, buf, col->rc_size);
 423                 col->rc_data = buf;
 424
 425                 buf += col->rc_size;
 426         }
 427         ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
 428 }
 429
 430 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 431         vdev_raidz_map_free_vsd,
 432         vdev_raidz_cksum_report
 433 };
 434
 435 /*
 436  * Divides the IO evenly across all child vdevs; usually, dcols is
 437  * the number of children in the target vdev.
 438  */
 439 static raidz_map_t *
 440 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
 441     uint64_t nparity)
 442 {
 443         raidz_map_t *rm;
 444         /* The starting RAIDZ (parent) vdev sector of the block. */
 445         uint64_t b = zio->io_offset >> unit_shift;
 446         /* The zio's size in units of the vdev's minimum sector size. */
 447         uint64_t s = zio->io_size >> unit_shift;
 448         /* The first column for this stripe. */
 449         uint64_t f = b % dcols;
 450         /* The starting byte offset on each child vdev. */
 451         uint64_t o = (b / dcols) << unit_shift;
 452         uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 453
 454         /*
 455          * "Quotient": The number of data sectors for this stripe on all but
 456          * the "big column" child vdevs that also contain "remainder" data.
 457          */
 458         q = s / (dcols - nparity);
 459
 460         /*
 461          * "Remainder": The number of partial stripe data sectors in this I/O.
 462          * This will add a sector to some, but not all, child vdevs.
 463          */
 464         r = s - q * (dcols - nparity);
 465
 466         /* The number of "big columns" - those which contain remainder data. */
 467         bc = (r == 0 ? 0 : r + nparity);
 468
 469         /*
 470          * The total number of data and parity sectors associated with
 471          * this I/O.
 472          */
 473         tot = s + nparity * (q + (r == 0 ? 0 : 1));
 474
 475         /* acols: The columns that will be accessed. */
 476         /* scols: The columns that will be accessed or skipped. */
 477         if (q == 0) {
 478                 /* Our I/O request doesn't span all child vdevs. */
 479                 acols = bc;
 480                 scols = MIN(dcols, roundup(bc, nparity + 1));
 481         } else {
 482                 acols = dcols;
 483                 scols = dcols;
 484         }
 485
 486         ASSERT3U(acols, <=, scols);
 487
 488         rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
 489
 490         rm->rm_cols = acols;
 491         rm->rm_scols = scols;
 492         rm->rm_bigcols = bc;
 493         rm->rm_skipstart = bc;
 494         rm->rm_missingdata = 0;
 495         rm->rm_missingparity = 0;
 496         rm->rm_firstdatacol = nparity;
 497         rm->rm_datacopy = NULL;
 498         rm->rm_reports = 0;
 499         rm->rm_freed = 0;
 500         rm->rm_ecksuminjected = 0;
 501
 502         asize = 0;
 503
 504         for (c = 0; c < scols; c++) {
 505                 col = f + c;
 506                 coff = o;
 507                 if (col >= dcols) {
 508                         col -= dcols;
 509                         coff += 1ULL << unit_shift;
 510                 }
 511                 rm->rm_col[c].rc_devidx = col;
 512                 rm->rm_col[c].rc_offset = coff;
 513                 rm->rm_col[c].rc_data = NULL;
 514                 rm->rm_col[c].rc_gdata = NULL;
 515                 rm->rm_col[c].rc_error = 0;
 516                 rm->rm_col[c].rc_tried = 0;
 517                 rm->rm_col[c].rc_skipped = 0;
 518
 519                 if (c >= acols)
 520                         rm->rm_col[c].rc_size = 0;
 521                 else if (c < bc)
 522                         rm->rm_col[c].rc_size = (q + 1) << unit_shift;
 523                 else
 524                         rm->rm_col[c].rc_size = q << unit_shift;
 525
 526                 asize += rm->rm_col[c].rc_size;
 527         }
 528
 529         ASSERT3U(asize, ==, tot << unit_shift);
 530         rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
 531         rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 532         ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
 533         ASSERT3U(rm->rm_nskip, <=, nparity);
 534
 535         if (zio->io_type != ZIO_TYPE_FREE) {
 536                 for (c = 0; c < rm->rm_firstdatacol; c++) {
 537                         rm->rm_col[c].rc_data =
 538                             zio_buf_alloc(rm->rm_col[c].rc_size);
 539                 }
 540
 541                 rm->rm_col[c].rc_data = zio->io_data;
 542
 543                 for (c = c + 1; c < acols; c++) {
 544                         rm->rm_col[c].rc_data =
 545                             (char *)rm->rm_col[c - 1].rc_data +
 546                             rm->rm_col[c - 1].rc_size;
 547                 }
 548         }
 549
 550         /*
 551          * If all data stored spans all columns, there's a danger that parity
 552          * will always be on the same device and, since parity isn't read
 553          * during normal operation, that that device's I/O bandwidth won't be
 554          * used effectively. We therefore switch the parity every 1MB.
 555          *
 556          * ... at least that was, ostensibly, the theory. As a practical
 557          * matter unless we juggle the parity between all devices evenly, we
 558          * won't see any benefit. Further, occasional writes that aren't a
 559          * multiple of the LCM of the number of children and the minimum
 560          * stripe width are sufficient to avoid pessimal behavior.
 561          * Unfortunately, this decision created an implicit on-disk format
 562          * requirement that we need to support for all eternity, but only
 563          * for single-parity RAID-Z.
 564          *
 565          * If we intend to skip a sector in the zeroth column for padding
 566          * we must make sure to note this swap. We will never intend to
 567          * skip the first column since at least one data and one parity
 568          * column must appear in each row.
 569          */
 570         ASSERT(rm->rm_cols >= 2);
 571         ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
 572
 573         if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
 574                 devidx = rm->rm_col[0].rc_devidx;
 575                 o = rm->rm_col[0].rc_offset;
 576                 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
 577                 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
 578                 rm->rm_col[1].rc_devidx = devidx;
 579                 rm->rm_col[1].rc_offset = o;
 580
 581                 if (rm->rm_skipstart == 0)
 582                         rm->rm_skipstart = 1;
 583         }
 584
 585         zio->io_vsd = rm;
 586         zio->io_vsd_ops = &vdev_raidz_vsd_ops;
 587         return (rm);
 588 }
 589
 590 static void
 591 vdev_raidz_generate_parity_p(raidz_map_t *rm)
 592 {
 593         uint64_t *p, *src, pcount, ccount, i;
 594         int c;
 595
 596         pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 597
 598         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 599                 src = rm->rm_col[c].rc_data;
 600                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 601                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
 602
 603                 if (c == rm->rm_firstdatacol) {
 604                         ASSERT(ccount == pcount);
 605                         for (i = 0; i < ccount; i++, src++, p++) {
 606                                 *p = *src;
 607                         }
 608                 } else {
 609                         ASSERT(ccount <= pcount);
 610                         for (i = 0; i < ccount; i++, src++, p++) {
 611                                 *p ^= *src;
 612                         }
 613                 }
 614         }
 615 }
 616
 617 static void
 618 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 619 {
 620         uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
 621         int c;
 622
 623         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 624         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 625             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 626
 627         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 628                 src = rm->rm_col[c].rc_data;
 629                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 630                 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 631
 632                 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
 633
 634                 if (c == rm->rm_firstdatacol) {
 635                         ASSERT(ccnt == pcnt || ccnt == 0);
 636                         for (i = 0; i < ccnt; i++, src++, p++, q++) {
 637                                 *p = *src;
 638                                 *q = *src;
 639                         }
 640                         for (; i < pcnt; i++, src++, p++, q++) {
 641                                 *p = 0;
 642                                 *q = 0;
 643                         }
 644                 } else {
 645                         ASSERT(ccnt <= pcnt);
 646
 647                         /*
 648                          * Apply the algorithm described above by multiplying
 649                          * the previous result and adding in the new value.
 650                          */
 651                         for (i = 0; i < ccnt; i++, src++, p++, q++) {
 652                                 *p ^= *src;
 653
 654                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 655                                 *q ^= *src;
 656                         }
 657
 658                         /*
 659                          * Treat short columns as though they are full of 0s.
 660                          * Note that there's therefore nothing needed for P.
 661                          */
 662                         for (; i < pcnt; i++, q++) {
 663                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 664                         }
 665                 }
 666         }
 667 }
 668
 669 static void
 670 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
 671 {
 672         uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
 673         int c;
 674
 675         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 676         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 677             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 678         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 679             rm->rm_col[VDEV_RAIDZ_R].rc_size);
 680
 681         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 682                 src = rm->rm_col[c].rc_data;
 683                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 684                 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 685                 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
 686
 687                 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
 688
 689                 if (c == rm->rm_firstdatacol) {
 690                         ASSERT(ccnt == pcnt || ccnt == 0);
 691                         for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
 692                                 *p = *src;
 693                                 *q = *src;
 694                                 *r = *src;
 695                         }
 696                         for (; i < pcnt; i++, src++, p++, q++, r++) {
 697                                 *p = 0;
 698                                 *q = 0;
 699                                 *r = 0;
 700                         }
 701                 } else {
 702                         ASSERT(ccnt <= pcnt);
 703
 704                         /*
 705                          * Apply the algorithm described above by multiplying
 706                          * the previous result and adding in the new value.
 707                          */
 708                         for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
 709                                 *p ^= *src;
 710
 711                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 712                                 *q ^= *src;
 713
 714                                 VDEV_RAIDZ_64MUL_4(*r, mask);
 715                                 *r ^= *src;
 716                         }
 717
 718                         /*
 719                          * Treat short columns as though they are full of 0s.
 720                          * Note that there's therefore nothing needed for P.
 721                          */
 722                         for (; i < pcnt; i++, q++, r++) {
 723                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 724                                 VDEV_RAIDZ_64MUL_4(*r, mask);
 725                         }
 726                 }
 727         }
 728 }
 729
 730 /*
 731  * Generate RAID parity in the first virtual columns according to the number of
 732  * parity columns available.
 733  */
 734 static void
 735 vdev_raidz_generate_parity(raidz_map_t *rm)
 736 {
 737         switch (rm->rm_firstdatacol) {
 738         case 1:
 739                 vdev_raidz_generate_parity_p(rm);
 740                 break;
 741         case 2:
 742                 vdev_raidz_generate_parity_pq(rm);
 743                 break;
 744         case 3:
 745                 vdev_raidz_generate_parity_pqr(rm);
 746                 break;
 747         default:
 748                 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
 749         }
 750 }
 751
 752 static int
 753 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
 754 {
 755         uint64_t *dst, *src, xcount, ccount, count, i;
 756         int x = tgts[0];
 757         int c;
 758
 759         ASSERT(ntgts == 1);
 760         ASSERT(x >= rm->rm_firstdatacol);
 761         ASSERT(x < rm->rm_cols);
 762
 763         xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
 764         ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
 765         ASSERT(xcount > 0);
 766
 767         src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 768         dst = rm->rm_col[x].rc_data;
 769         for (i = 0; i < xcount; i++, dst++, src++) {
 770                 *dst = *src;
 771         }
 772
 773         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 774                 src = rm->rm_col[c].rc_data;
 775                 dst = rm->rm_col[x].rc_data;
 776
 777                 if (c == x)
 778                         continue;
 779
 780                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
 781                 count = MIN(ccount, xcount);
 782
 783                 for (i = 0; i < count; i++, dst++, src++) {
 784                         *dst ^= *src;
 785                 }
 786         }
 787
 788         return (1 << VDEV_RAIDZ_P);
 789 }
 790
 791 static int
 792 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
 793 {
 794         uint64_t *dst, *src, xcount, ccount, count, mask, i;
 795         uint8_t *b;
 796         int x = tgts[0];
 797         int c, j, exp;
 798
 799         ASSERT(ntgts == 1);
 800
 801         xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
 802         ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
 803
 804         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 805                 src = rm->rm_col[c].rc_data;
 806                 dst = rm->rm_col[x].rc_data;
 807
 808                 if (c == x)
 809                         ccount = 0;
 810                 else
 811                         ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
 812
 813                 count = MIN(ccount, xcount);
 814
 815                 if (c == rm->rm_firstdatacol) {
 816                         for (i = 0; i < count; i++, dst++, src++) {
 817                                 *dst = *src;
 818                         }
 819                         for (; i < xcount; i++, dst++) {
 820                                 *dst = 0;
 821                         }
 822
 823                 } else {
 824                         for (i = 0; i < count; i++, dst++, src++) {
 825                                 VDEV_RAIDZ_64MUL_2(*dst, mask);
 826                                 *dst ^= *src;
 827                         }
 828
 829                         for (; i < xcount; i++, dst++) {
 830                                 VDEV_RAIDZ_64MUL_2(*dst, mask);
 831                         }
 832                 }
 833         }
 834
 835         src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 836         dst = rm->rm_col[x].rc_data;
 837         exp = 255 - (rm->rm_cols - 1 - x);
 838
 839         for (i = 0; i < xcount; i++, dst++, src++) {
 840                 *dst ^= *src;
 841                 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
 842                         *b = vdev_raidz_exp2(*b, exp);
 843                 }
 844         }
 845
 846         return (1 << VDEV_RAIDZ_Q);
 847 }
 848
 849 static int
 850 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
 851 {
 852         uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
 853         void *pdata, *qdata;
 854         uint64_t xsize, ysize, i;
 855         int x = tgts[0];
 856         int y = tgts[1];
 857
 858         ASSERT(ntgts == 2);
 859         ASSERT(x < y);
 860         ASSERT(x >= rm->rm_firstdatacol);
 861         ASSERT(y < rm->rm_cols);
 862
 863         ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
 864
 865         /*
 866          * Move the parity data aside -- we're going to compute parity as
 867          * though columns x and y were full of zeros -- Pxy and Qxy. We want to
 868          * reuse the parity generation mechanism without trashing the actual
 869          * parity so we make those columns appear to be full of zeros by
 870          * setting their lengths to zero.
 871          */
 872         pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 873         qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 874         xsize = rm->rm_col[x].rc_size;
 875         ysize = rm->rm_col[y].rc_size;
 876
 877         rm->rm_col[VDEV_RAIDZ_P].rc_data =
 878             zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
 879         rm->rm_col[VDEV_RAIDZ_Q].rc_data =
 880             zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 881         rm->rm_col[x].rc_size = 0;
 882         rm->rm_col[y].rc_size = 0;
 883
 884         vdev_raidz_generate_parity_pq(rm);
 885
 886         rm->rm_col[x].rc_size = xsize;
 887         rm->rm_col[y].rc_size = ysize;
 888
 889         p = pdata;
 890         q = qdata;
 891         pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 892         qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 893         xd = rm->rm_col[x].rc_data;
 894         yd = rm->rm_col[y].rc_data;
 895
 896         /*
 897          * We now have:
 898          *      Pxy = P + D_x + D_y
 899          *      Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
 900          *
 901          * We can then solve for D_x:
 902          *      D_x = A * (P + Pxy) + B * (Q + Qxy)
 903          * where
 904          *      A = 2^(x - y) * (2^(x - y) + 1)^-1
 905          *      B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
 906          *
 907          * With D_x in hand, we can easily solve for D_y:
 908          *      D_y = P + Pxy + D_x
 909          */
 910
 911         a = vdev_raidz_pow2[255 + x - y];
 912         b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
 913         tmp = 255 - vdev_raidz_log2[a ^ 1];
 914
 915         aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
 916         bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
 917
 918         for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
 919                 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
 920                     vdev_raidz_exp2(*q ^ *qxy, bexp);
 921
 922                 if (i < ysize)
 923                         *yd = *p ^ *pxy ^ *xd;
 924         }
 925
 926         zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
 927             rm->rm_col[VDEV_RAIDZ_P].rc_size);
 928         zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
 929             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 930
 931         /*
 932          * Restore the saved parity data.
 933          */
 934         rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
 935         rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
 936
 937         return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
 938 }
 939
 940 /* BEGIN CSTYLED */
 941 /*
 942  * In the general case of reconstruction, we must solve the system of linear
 943  * equations defined by the coeffecients used to generate parity as well as
 944  * the contents of the data and parity disks. This can be expressed with
 945  * vectors for the original data (D) and the actual data (d) and parity (p)
 946  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 947  *
 948  *            __   __                     __     __
 949  *            |     |         __     __   |  p_0  |
 950  *            |  V  |         |  D_0  |   | p_m-1 |
 951  *            |     |    x    |   :   | = |  d_0  |
 952  *            |  I  |         | D_n-1 |   |   :   |
 953  *            |     |         ~~     ~~   | d_n-1 |
 954  *            ~~   ~~                     ~~     ~~
 955  *
 956  * I is simply a square identity matrix of size n, and V is a vandermonde
 957  * matrix defined by the coeffecients we chose for the various parity columns
 958  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 959  * computation as well as linear separability.
 960  *
 961  *      __               __               __     __
 962  *      |   1   ..  1 1 1 |               |  p_0  |
 963  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 964  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 965  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 966  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 967  *      |   :       : : : |   |   :   |   |  d_2  |
 968  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 969  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 970  *      |   0   ..  0 0 1 |               | d_n-1 |
 971  *      ~~               ~~               ~~     ~~
 972  *
 973  * Note that I, V, d, and p are known. To compute D, we must invert the
 974  * matrix and use the known data and parity values to reconstruct the unknown
 975  * data values. We begin by removing the rows in V|I and d|p that correspond
 976  * to failed or missing columns; we then make V|I square (n x n) and d|p
 977  * sized n by removing rows corresponding to unused parity from the bottom up
 978  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 979  * using Gauss-Jordan elimination. In the example below we use m=3 parity
 980  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 981  *           __                               __
 982  *           |  1   1   1   1   1   1   1   1  |
 983  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 984  *           |  19 205 116  29  64  16  4   1  |      / /
 985  *           |  1   0   0   0   0   0   0   0  |     / /
 986  *           |  0   1   0   0   0   0   0   0  | <--' /
 987  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 988  *           |  0   0   0   1   0   0   0   0  |
 989  *           |  0   0   0   0   1   0   0   0  |
 990  *           |  0   0   0   0   0   1   0   0  |
 991  *           |  0   0   0   0   0   0   1   0  |
 992  *           |  0   0   0   0   0   0   0   1  |
 993  *           ~~                               ~~
 994  *           __                               __
 995  *           |  1   1   1   1   1   1   1   1  |
 996  *           | 128  64  32  16  8   4   2   1  |
 997  *           |  19 205 116  29  64  16  4   1  |
 998  *           |  1   0   0   0   0   0   0   0  |
 999  *           |  0   1   0   0   0   0   0   0  |
1000  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
1001  *           |  0   0   0   1   0   0   0   0  |
1002  *           |  0   0   0   0   1   0   0   0  |
1003  *           |  0   0   0   0   0   1   0   0  |
1004  *           |  0   0   0   0   0   0   1   0  |
1005  *           |  0   0   0   0   0   0   0   1  |
1006  *           ~~                               ~~
1007  *
1008  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1009  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1010  * matrix is not singular.
1011  * __                                                                 __
1012  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1013  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1014  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1015  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1016  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1017  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1018  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1019  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1020  * ~~                                                                 ~~
1021  * __                                                                 __
1022  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1023  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1024  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1025  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1026  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1027  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1028  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1029  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1030  * ~~                                                                 ~~
1031  * __                                                                 __
1032  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1033  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1034  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1035  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1036  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1037  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1038  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1039  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1040  * ~~                                                                 ~~
1041  * __                                                                 __
1042  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1043  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1044  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1045  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1046  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1047  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1048  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1049  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1050  * ~~                                                                 ~~
1051  * __                                                                 __
1052  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1053  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1054  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1055  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1056  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1057  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1058  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1059  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1060  * ~~                                                                 ~~
1061  * __                                                                 __
1062  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1063  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1064  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1065  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1066  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1067  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1068  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1069  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1070  * ~~                                                                 ~~
1071  *                   __                               __
1072  *                   |  0   0   1   0   0   0   0   0  |
1073  *                   | 167 100  5   41 159 169 217 208 |
1074  *                   | 166 100  4   40 158 168 216 209 |
1075  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1076  *                   |  0   0   0   0   1   0   0   0  |
1077  *                   |  0   0   0   0   0   1   0   0  |
1078  *                   |  0   0   0   0   0   0   1   0  |
1079  *                   |  0   0   0   0   0   0   0   1  |
1080  *                   ~~                               ~~
1081  *
1082  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1083  * of the missing data.
1084  *
1085  * As is apparent from the example above, the only non-trivial rows in the
1086  * inverse matrix correspond to the data disks that we're trying to
1087  * reconstruct. Indeed, those are the only rows we need as the others would
1088  * only be useful for reconstructing data known or assumed to be valid. For
1089  * that reason, we only build the coefficients in the rows that correspond to
1090  * targeted columns.
1091  */
1092 /* END CSTYLED */
1093
1094 static void
1095 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1096     uint8_t **rows)
1097 {
1098         int i, j;
1099         int pow;
1100
1101         ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1102
1103         /*
1104          * Fill in the missing rows of interest.
1105          */
1106         for (i = 0; i < nmap; i++) {
1107                 ASSERT3S(0, <=, map[i]);
1108                 ASSERT3S(map[i], <=, 2);
1109
1110                 pow = map[i] * n;
1111                 if (pow > 255)
1112                         pow -= 255;
1113                 ASSERT(pow <= 255);
1114
1115                 for (j = 0; j < n; j++) {
1116                         pow -= map[i];
1117                         if (pow < 0)
1118                                 pow += 255;
1119                         rows[i][j] = vdev_raidz_pow2[pow];
1120                 }
1121         }
1122 }
1123
1124 static void
1125 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1126     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1127 {
1128         int i, j, ii, jj;
1129         uint8_t log;
1130
1131         /*
1132          * Assert that the first nmissing entries from the array of used
1133          * columns correspond to parity columns and that subsequent entries
1134          * correspond to data columns.
1135          */
1136         for (i = 0; i < nmissing; i++) {
1137                 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1138         }
1139         for (; i < n; i++) {
1140                 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1141         }
1142
1143         /*
1144          * First initialize the storage where we'll compute the inverse rows.
1145          */
1146         for (i = 0; i < nmissing; i++) {
1147                 for (j = 0; j < n; j++) {
1148                         invrows[i][j] = (i == j) ? 1 : 0;
1149                 }
1150         }
1151
1152         /*
1153          * Subtract all trivial rows from the rows of consequence.
1154          */
1155         for (i = 0; i < nmissing; i++) {
1156                 for (j = nmissing; j < n; j++) {
1157                         ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1158                         jj = used[j] - rm->rm_firstdatacol;
1159                         ASSERT3S(jj, <, n);
1160                         invrows[i][j] = rows[i][jj];
1161                         rows[i][jj] = 0;
1162                 }
1163         }
1164
1165         /*
1166          * For each of the rows of interest, we must normalize it and subtract
1167          * a multiple of it from the other rows.
1168          */
1169         for (i = 0; i < nmissing; i++) {
1170                 for (j = 0; j < missing[i]; j++) {
1171                         ASSERT0(rows[i][j]);
1172                 }
1173                 ASSERT3U(rows[i][missing[i]], !=, 0);
1174
1175                 /*
1176                  * Compute the inverse of the first element and multiply each
1177                  * element in the row by that value.
1178                  */
1179                 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1180
1181                 for (j = 0; j < n; j++) {
1182                         rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1183                         invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1184                 }
1185
1186                 for (ii = 0; ii < nmissing; ii++) {
1187                         if (i == ii)
1188                                 continue;
1189
1190                         ASSERT3U(rows[ii][missing[i]], !=, 0);
1191
1192                         log = vdev_raidz_log2[rows[ii][missing[i]]];
1193
1194                         for (j = 0; j < n; j++) {
1195                                 rows[ii][j] ^=
1196                                     vdev_raidz_exp2(rows[i][j], log);
1197                                 invrows[ii][j] ^=
1198                                     vdev_raidz_exp2(invrows[i][j], log);
1199                         }
1200                 }
1201         }
1202
1203         /*
1204          * Verify that the data that is left in the rows are properly part of
1205          * an identity matrix.
1206          */
1207         for (i = 0; i < nmissing; i++) {
1208                 for (j = 0; j < n; j++) {
1209                         if (j == missing[i]) {
1210                                 ASSERT3U(rows[i][j], ==, 1);
1211                         } else {
1212                                 ASSERT0(rows[i][j]);
1213                         }
1214                 }
1215         }
1216 }
1217
1218 static void
1219 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1220     int *missing, uint8_t **invrows, const uint8_t *used)
1221 {
1222         int i, j, x, cc, c;
1223         uint8_t *src;
1224         uint64_t ccount;
1225         uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1226         uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1227         uint8_t log = 0;
1228         uint8_t val;
1229         int ll;
1230         uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1231         uint8_t *p, *pp;
1232         size_t psize;
1233
1234         psize = sizeof (invlog[0][0]) * n * nmissing;
1235         p = kmem_alloc(psize, KM_SLEEP);
1236
1237         for (pp = p, i = 0; i < nmissing; i++) {
1238                 invlog[i] = pp;
1239                 pp += n;
1240         }
1241
1242         for (i = 0; i < nmissing; i++) {
1243                 for (j = 0; j < n; j++) {
1244                         ASSERT3U(invrows[i][j], !=, 0);
1245                         invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1246                 }
1247         }
1248
1249         for (i = 0; i < n; i++) {
1250                 c = used[i];
1251                 ASSERT3U(c, <, rm->rm_cols);
1252
1253                 src = rm->rm_col[c].rc_data;
1254                 ccount = rm->rm_col[c].rc_size;
1255                 for (j = 0; j < nmissing; j++) {
1256                         cc = missing[j] + rm->rm_firstdatacol;
1257                         ASSERT3U(cc, >=, rm->rm_firstdatacol);
1258                         ASSERT3U(cc, <, rm->rm_cols);
1259                         ASSERT3U(cc, !=, c);
1260
1261                         dst[j] = rm->rm_col[cc].rc_data;
1262                         dcount[j] = rm->rm_col[cc].rc_size;
1263                 }
1264
1265                 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1266
1267                 for (x = 0; x < ccount; x++, src++) {
1268                         if (*src != 0)
1269                                 log = vdev_raidz_log2[*src];
1270
1271                         for (cc = 0; cc < nmissing; cc++) {
1272                                 if (x >= dcount[cc])
1273                                         continue;
1274
1275                                 if (*src == 0) {
1276                                         val = 0;
1277                                 } else {
1278                                         if ((ll = log + invlog[cc][i]) >= 255)
1279                                                 ll -= 255;
1280                                         val = vdev_raidz_pow2[ll];
1281                                 }
1282
1283                                 if (i == 0)
1284                                         dst[cc][x] = val;
1285                                 else
1286                                         dst[cc][x] ^= val;
1287                         }
1288                 }
1289         }
1290
1291         kmem_free(p, psize);
1292 }
1293
1294 static int
1295 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1296 {
1297         int n, i, c, t, tt;
1298         int nmissing_rows;
1299         int missing_rows[VDEV_RAIDZ_MAXPARITY];
1300         int parity_map[VDEV_RAIDZ_MAXPARITY];
1301
1302         uint8_t *p, *pp;
1303         size_t psize;
1304
1305         uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1306         uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1307         uint8_t *used;
1308
1309         int code = 0;
1310
1311
1312         n = rm->rm_cols - rm->rm_firstdatacol;
1313
1314         /*
1315          * Figure out which data columns are missing.
1316          */
1317         nmissing_rows = 0;
1318         for (t = 0; t < ntgts; t++) {
1319                 if (tgts[t] >= rm->rm_firstdatacol) {
1320                         missing_rows[nmissing_rows++] =
1321                             tgts[t] - rm->rm_firstdatacol;
1322                 }
1323         }
1324
1325         /*
1326          * Figure out which parity columns to use to help generate the missing
1327          * data columns.
1328          */
1329         for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1330                 ASSERT(tt < ntgts);
1331                 ASSERT(c < rm->rm_firstdatacol);
1332
1333                 /*
1334                  * Skip any targeted parity columns.
1335                  */
1336                 if (c == tgts[tt]) {
1337                         tt++;
1338                         continue;
1339                 }
1340
1341                 code |= 1 << c;
1342
1343                 parity_map[i] = c;
1344                 i++;
1345         }
1346
1347         ASSERT(code != 0);
1348         ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1349
1350         psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1351             nmissing_rows * n + sizeof (used[0]) * n;
1352         p = kmem_alloc(psize, KM_SLEEP);
1353
1354         for (pp = p, i = 0; i < nmissing_rows; i++) {
1355                 rows[i] = pp;
1356                 pp += n;
1357                 invrows[i] = pp;
1358                 pp += n;
1359         }
1360         used = pp;
1361
1362         for (i = 0; i < nmissing_rows; i++) {
1363                 used[i] = parity_map[i];
1364         }
1365
1366         for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1367                 if (tt < nmissing_rows &&
1368                     c == missing_rows[tt] + rm->rm_firstdatacol) {
1369                         tt++;
1370                         continue;
1371                 }
1372
1373                 ASSERT3S(i, <, n);
1374                 used[i] = c;
1375                 i++;
1376         }
1377
1378         /*
1379          * Initialize the interesting rows of the matrix.
1380          */
1381         vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1382
1383         /*
1384          * Invert the matrix.
1385          */
1386         vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1387             invrows, used);
1388
1389         /*
1390          * Reconstruct the missing data using the generated matrix.
1391          */
1392         vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1393             invrows, used);
1394
1395         kmem_free(p, psize);
1396
1397         return (code);
1398 }
1399
1400 static int
1401 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1402 {
1403         int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1404         int ntgts;
1405         int i, c;
1406         int code;
1407         int nbadparity, nbaddata;
1408         int parity_valid[VDEV_RAIDZ_MAXPARITY];
1409
1410         /*
1411          * The tgts list must already be sorted.
1412          */
1413         for (i = 1; i < nt; i++) {
1414                 ASSERT(t[i] > t[i - 1]);
1415         }
1416
1417         nbadparity = rm->rm_firstdatacol;
1418         nbaddata = rm->rm_cols - nbadparity;
1419         ntgts = 0;
1420         for (i = 0, c = 0; c < rm->rm_cols; c++) {
1421                 if (c < rm->rm_firstdatacol)
1422                         parity_valid[c] = B_FALSE;
1423
1424                 if (i < nt && c == t[i]) {
1425                         tgts[ntgts++] = c;
1426                         i++;
1427                 } else if (rm->rm_col[c].rc_error != 0) {
1428                         tgts[ntgts++] = c;
1429                 } else if (c >= rm->rm_firstdatacol) {
1430                         nbaddata--;
1431                 } else {
1432                         parity_valid[c] = B_TRUE;
1433                         nbadparity--;
1434                 }
1435         }
1436
1437         ASSERT(ntgts >= nt);
1438         ASSERT(nbaddata >= 0);
1439         ASSERT(nbaddata + nbadparity == ntgts);
1440
1441         dt = &tgts[nbadparity];
1442
1443         /*
1444          * See if we can use any of our optimized reconstruction routines.
1445          */
1446         if (!vdev_raidz_default_to_general) {
1447                 switch (nbaddata) {
1448                 case 1:
1449                         if (parity_valid[VDEV_RAIDZ_P])
1450                                 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1451
1452                         ASSERT(rm->rm_firstdatacol > 1);
1453
1454                         if (parity_valid[VDEV_RAIDZ_Q])
1455                                 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1456
1457                         ASSERT(rm->rm_firstdatacol > 2);
1458                         break;
1459
1460                 case 2:
1461                         ASSERT(rm->rm_firstdatacol > 1);
1462
1463                         if (parity_valid[VDEV_RAIDZ_P] &&
1464                             parity_valid[VDEV_RAIDZ_Q])
1465                                 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1466
1467                         ASSERT(rm->rm_firstdatacol > 2);
1468
1469                         break;
1470                 }
1471         }
1472
1473         code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1474         ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1475         ASSERT(code > 0);
1476         return (code);
1477 }
1478
1479 static int
1480 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1481     uint64_t *logical_ashift, uint64_t *physical_ashift)
1482 {
1483         vdev_t *cvd;
1484         uint64_t nparity = vd->vdev_nparity;
1485         int c;
1486         int lasterror = 0;
1487         int numerrors = 0;
1488
1489         ASSERT(nparity > 0);
1490
1491         if (nparity > VDEV_RAIDZ_MAXPARITY ||
1492             vd->vdev_children < nparity + 1) {
1493                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1494                 return (SET_ERROR(EINVAL));
1495         }
1496
1497         vdev_open_children(vd);
1498
1499         for (c = 0; c < vd->vdev_children; c++) {
1500                 cvd = vd->vdev_child[c];
1501
1502                 if (cvd->vdev_open_error != 0) {
1503                         lasterror = cvd->vdev_open_error;
1504                         numerrors++;
1505                         continue;
1506                 }
1507
1508                 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1509                 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1510                 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1511                 *physical_ashift = MAX(*physical_ashift,
1512                     cvd->vdev_physical_ashift);
1513         }
1514
1515         *asize *= vd->vdev_children;
1516         *max_asize *= vd->vdev_children;
1517
1518         if (numerrors > nparity) {
1519                 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1520                 return (lasterror);
1521         }
1522
1523         return (0);
1524 }
1525
1526 static void
1527 vdev_raidz_close(vdev_t *vd)
1528 {
1529         int c;
1530
1531         for (c = 0; c < vd->vdev_children; c++)
1532                 vdev_close(vd->vdev_child[c]);
1533 }
1534
1535 static uint64_t
1536 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1537 {
1538         uint64_t asize;
1539         uint64_t ashift = vd->vdev_top->vdev_ashift;
1540         uint64_t cols = vd->vdev_children;
1541         uint64_t nparity = vd->vdev_nparity;
1542
1543         asize = ((psize - 1) >> ashift) + 1;
1544         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1545         asize = roundup(asize, nparity + 1) << ashift;
1546
1547         return (asize);
1548 }
1549
1550 static void
1551 vdev_raidz_child_done(zio_t *zio)
1552 {
1553         raidz_col_t *rc = zio->io_private;
1554
1555         rc->rc_error = zio->io_error;
1556         rc->rc_tried = 1;
1557         rc->rc_skipped = 0;
1558 }
1559
1560 /*
1561  * Start an IO operation on a RAIDZ VDev
1562  *
1563  * Outline:
1564  * - For write operations:
1565  *   1. Generate the parity data
1566  *   2. Create child zio write operations to each column's vdev, for both
1567  *      data and parity.
1568  *   3. If the column skips any sectors for padding, create optional dummy
1569  *      write zio children for those areas to improve aggregation continuity.
1570  * - For read operations:
1571  *   1. Create child zio read operations to each data column's vdev to read
1572  *      the range of data required for zio.
1573  *   2. If this is a scrub or resilver operation, or if any of the data
1574  *      vdevs have had errors, then create zio read operations to the parity
1575  *      columns' VDevs as well.
1576  */
1577 static int
1578 vdev_raidz_io_start(zio_t *zio)
1579 {
1580         vdev_t *vd = zio->io_vd;
1581         vdev_t *tvd = vd->vdev_top;
1582         vdev_t *cvd;
1583         raidz_map_t *rm;
1584         raidz_col_t *rc;
1585         int c, i;
1586
1587         rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1588             vd->vdev_nparity);
1589
1590         ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1591
1592         if (zio->io_type == ZIO_TYPE_FREE) {
1593                 for (c = 0; c < rm->rm_cols; c++) {
1594                         rc = &rm->rm_col[c];
1595                         cvd = vd->vdev_child[rc->rc_devidx];
1596                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1597                             rc->rc_offset, rc->rc_data, rc->rc_size,
1598                             zio->io_type, zio->io_priority, 0,
1599                             vdev_raidz_child_done, rc));
1600                 }
1601                 return (ZIO_PIPELINE_CONTINUE);
1602         }
1603
1604         if (zio->io_type == ZIO_TYPE_WRITE) {
1605                 vdev_raidz_generate_parity(rm);
1606
1607                 for (c = 0; c < rm->rm_cols; c++) {
1608                         rc = &rm->rm_col[c];
1609                         cvd = vd->vdev_child[rc->rc_devidx];
1610                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1611                             rc->rc_offset, rc->rc_data, rc->rc_size,
1612                             zio->io_type, zio->io_priority, 0,
1613                             vdev_raidz_child_done, rc));
1614                 }
1615
1616                 /*
1617                  * Generate optional I/Os for any skipped sectors to improve
1618                  * aggregation contiguity.
1619                  */
1620                 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1621                         ASSERT(c <= rm->rm_scols);
1622                         if (c == rm->rm_scols)
1623                                 c = 0;
1624                         rc = &rm->rm_col[c];
1625                         cvd = vd->vdev_child[rc->rc_devidx];
1626                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1627                             rc->rc_offset + rc->rc_size, NULL,
1628                             1 << tvd->vdev_ashift,
1629                             zio->io_type, zio->io_priority,
1630                             ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1631                 }
1632
1633                 return (ZIO_PIPELINE_CONTINUE);
1634         }
1635
1636         ASSERT(zio->io_type == ZIO_TYPE_READ);
1637
1638         /*
1639          * Iterate over the columns in reverse order so that we hit the parity
1640          * last -- any errors along the way will force us to read the parity.
1641          */
1642         for (c = rm->rm_cols - 1; c >= 0; c--) {
1643                 rc = &rm->rm_col[c];
1644                 cvd = vd->vdev_child[rc->rc_devidx];
1645                 if (!vdev_readable(cvd)) {
1646                         if (c >= rm->rm_firstdatacol)
1647                                 rm->rm_missingdata++;
1648                         else
1649                                 rm->rm_missingparity++;
1650                         rc->rc_error = SET_ERROR(ENXIO);
1651                         rc->rc_tried = 1;       /* don't even try */
1652                         rc->rc_skipped = 1;
1653                         continue;
1654                 }
1655                 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1656                         if (c >= rm->rm_firstdatacol)
1657                                 rm->rm_missingdata++;
1658                         else
1659                                 rm->rm_missingparity++;
1660                         rc->rc_error = SET_ERROR(ESTALE);
1661                         rc->rc_skipped = 1;
1662                         continue;
1663                 }
1664                 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1665                     (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1666                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1667                             rc->rc_offset, rc->rc_data, rc->rc_size,
1668                             zio->io_type, zio->io_priority, 0,
1669                             vdev_raidz_child_done, rc));
1670                 }
1671         }
1672
1673         return (ZIO_PIPELINE_CONTINUE);
1674 }
1675
1676
1677 /*
1678  * Report a checksum error for a child of a RAID-Z device.
1679  */
1680 static void
1681 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1682 {
1683         vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1684
1685         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1686                 zio_bad_cksum_t zbc;
1687                 raidz_map_t *rm = zio->io_vsd;
1688
1689                 mutex_enter(&vd->vdev_stat_lock);
1690                 vd->vdev_stat.vs_checksum_errors++;
1691                 mutex_exit(&vd->vdev_stat_lock);
1692
1693                 zbc.zbc_has_cksum = 0;
1694                 zbc.zbc_injected = rm->rm_ecksuminjected;
1695
1696                 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1697                     rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1698                     &zbc);
1699         }
1700 }
1701
1702 /*
1703  * We keep track of whether or not there were any injected errors, so that
1704  * any ereports we generate can note it.
1705  */
1706 static int
1707 raidz_checksum_verify(zio_t *zio)
1708 {
1709         zio_bad_cksum_t zbc;
1710         raidz_map_t *rm = zio->io_vsd;
1711
1712         int ret = zio_checksum_error(zio, &zbc);
1713         if (ret != 0 && zbc.zbc_injected != 0)
1714                 rm->rm_ecksuminjected = 1;
1715
1716         return (ret);
1717 }
1718
1719 /*
1720  * Generate the parity from the data columns. If we tried and were able to
1721  * read the parity without error, verify that the generated parity matches the
1722  * data we read. If it doesn't, we fire off a checksum error. Return the
1723  * number such failures.
1724  */
1725 static int
1726 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1727 {
1728         void *orig[VDEV_RAIDZ_MAXPARITY];
1729         int c, ret = 0;
1730         raidz_col_t *rc;
1731
1732         for (c = 0; c < rm->rm_firstdatacol; c++) {
1733                 rc = &rm->rm_col[c];
1734                 if (!rc->rc_tried || rc->rc_error != 0)
1735                         continue;
1736                 orig[c] = zio_buf_alloc(rc->rc_size);
1737                 bcopy(rc->rc_data, orig[c], rc->rc_size);
1738         }
1739
1740         vdev_raidz_generate_parity(rm);
1741
1742         for (c = 0; c < rm->rm_firstdatacol; c++) {
1743                 rc = &rm->rm_col[c];
1744                 if (!rc->rc_tried || rc->rc_error != 0)
1745                         continue;
1746                 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1747                         raidz_checksum_error(zio, rc, orig[c]);
1748                         rc->rc_error = SET_ERROR(ECKSUM);
1749                         ret++;
1750                 }
1751                 zio_buf_free(orig[c], rc->rc_size);
1752         }
1753
1754         return (ret);
1755 }
1756
1757 /*
1758  * Keep statistics on all the ways that we used parity to correct data.
1759  */
1760 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1761
1762 static int
1763 vdev_raidz_worst_error(raidz_map_t *rm)
1764 {
1765         int error = 0;
1766
1767         for (int c = 0; c < rm->rm_cols; c++)
1768                 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1769
1770         return (error);
1771 }
1772
1773 /*
1774  * Iterate over all combinations of bad data and attempt a reconstruction.
1775  * Note that the algorithm below is non-optimal because it doesn't take into
1776  * account how reconstruction is actually performed. For example, with
1777  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1778  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1779  * cases we'd only use parity information in column 0.
1780  */
1781 static int
1782 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1783 {
1784         raidz_map_t *rm = zio->io_vsd;
1785         raidz_col_t *rc;
1786         void *orig[VDEV_RAIDZ_MAXPARITY];
1787         int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1788         int *tgts = &tstore[1];
1789         int current, next, i, c, n;
1790         int code, ret = 0;
1791
1792         ASSERT(total_errors < rm->rm_firstdatacol);
1793
1794         /*
1795          * This simplifies one edge condition.
1796          */
1797         tgts[-1] = -1;
1798
1799         for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1800                 /*
1801                  * Initialize the targets array by finding the first n columns
1802                  * that contain no error.
1803                  *
1804                  * If there were no data errors, we need to ensure that we're
1805                  * always explicitly attempting to reconstruct at least one
1806                  * data column. To do this, we simply push the highest target
1807                  * up into the data columns.
1808                  */
1809                 for (c = 0, i = 0; i < n; i++) {
1810                         if (i == n - 1 && data_errors == 0 &&
1811                             c < rm->rm_firstdatacol) {
1812                                 c = rm->rm_firstdatacol;
1813                         }
1814
1815                         while (rm->rm_col[c].rc_error != 0) {
1816                                 c++;
1817                                 ASSERT3S(c, <, rm->rm_cols);
1818                         }
1819
1820                         tgts[i] = c++;
1821                 }
1822
1823                 /*
1824                  * Setting tgts[n] simplifies the other edge condition.
1825                  */
1826                 tgts[n] = rm->rm_cols;
1827
1828                 /*
1829                  * These buffers were allocated in previous iterations.
1830                  */
1831                 for (i = 0; i < n - 1; i++) {
1832                         ASSERT(orig[i] != NULL);
1833                 }
1834
1835                 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1836
1837                 current = 0;
1838                 next = tgts[current];
1839
1840                 while (current != n) {
1841                         tgts[current] = next;
1842                         current = 0;
1843
1844                         /*
1845                          * Save off the original data that we're going to
1846                          * attempt to reconstruct.
1847                          */
1848                         for (i = 0; i < n; i++) {
1849                                 ASSERT(orig[i] != NULL);
1850                                 c = tgts[i];
1851                                 ASSERT3S(c, >=, 0);
1852                                 ASSERT3S(c, <, rm->rm_cols);
1853                                 rc = &rm->rm_col[c];
1854                                 bcopy(rc->rc_data, orig[i], rc->rc_size);
1855                         }
1856
1857                         /*
1858                          * Attempt a reconstruction and exit the outer loop on
1859                          * success.
1860                          */
1861                         code = vdev_raidz_reconstruct(rm, tgts, n);
1862                         if (raidz_checksum_verify(zio) == 0) {
1863                                 atomic_inc_64(&raidz_corrected[code]);
1864
1865                                 for (i = 0; i < n; i++) {
1866                                         c = tgts[i];
1867                                         rc = &rm->rm_col[c];
1868                                         ASSERT(rc->rc_error == 0);
1869                                         if (rc->rc_tried)
1870                                                 raidz_checksum_error(zio, rc,
1871                                                     orig[i]);
1872                                         rc->rc_error = SET_ERROR(ECKSUM);
1873                                 }
1874
1875                                 ret = code;
1876                                 goto done;
1877                         }
1878
1879                         /*
1880                          * Restore the original data.
1881                          */
1882                         for (i = 0; i < n; i++) {
1883                                 c = tgts[i];
1884                                 rc = &rm->rm_col[c];
1885                                 bcopy(orig[i], rc->rc_data, rc->rc_size);
1886                         }
1887
1888                         do {
1889                                 /*
1890                                  * Find the next valid column after the current
1891                                  * position..
1892                                  */
1893                                 for (next = tgts[current] + 1;
1894                                     next < rm->rm_cols &&
1895                                     rm->rm_col[next].rc_error != 0; next++)
1896                                         continue;
1897
1898                                 ASSERT(next <= tgts[current + 1]);
1899
1900                                 /*
1901                                  * If that spot is available, we're done here.
1902                                  */
1903                                 if (next != tgts[current + 1])
1904                                         break;
1905
1906                                 /*
1907                                  * Otherwise, find the next valid column after
1908                                  * the previous position.
1909                                  */
1910                                 for (c = tgts[current - 1] + 1;
1911                                     rm->rm_col[c].rc_error != 0; c++)
1912                                         continue;
1913
1914                                 tgts[current] = c;
1915                                 current++;
1916
1917                         } while (current != n);
1918                 }
1919         }
1920         n--;
1921 done:
1922         for (i = 0; i < n; i++) {
1923                 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1924         }
1925
1926         return (ret);
1927 }
1928
1929 /*
1930  * Complete an IO operation on a RAIDZ VDev
1931  *
1932  * Outline:
1933  * - For write operations:
1934  *   1. Check for errors on the child IOs.
1935  *   2. Return, setting an error code if too few child VDevs were written
1936  *      to reconstruct the data later.  Note that partial writes are
1937  *      considered successful if they can be reconstructed at all.
1938  * - For read operations:
1939  *   1. Check for errors on the child IOs.
1940  *   2. If data errors occurred:
1941  *      a. Try to reassemble the data from the parity available.
1942  *      b. If we haven't yet read the parity drives, read them now.
1943  *      c. If all parity drives have been read but the data still doesn't
1944  *         reassemble with a correct checksum, then try combinatorial
1945  *         reconstruction.
1946  *      d. If that doesn't work, return an error.
1947  *   3. If there were unexpected errors or this is a resilver operation,
1948  *      rewrite the vdevs that had errors.
1949  */
1950 static void
1951 vdev_raidz_io_done(zio_t *zio)
1952 {
1953         vdev_t *vd = zio->io_vd;
1954         vdev_t *cvd;
1955         raidz_map_t *rm = zio->io_vsd;
1956         raidz_col_t *rc;
1957         int unexpected_errors = 0;
1958         int parity_errors = 0;
1959         int parity_untried = 0;
1960         int data_errors = 0;
1961         int total_errors = 0;
1962         int n, c;
1963         int tgts[VDEV_RAIDZ_MAXPARITY];
1964         int code;
1965
1966         ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
1967
1968         ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1969         ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1970
1971         for (c = 0; c < rm->rm_cols; c++) {
1972                 rc = &rm->rm_col[c];
1973
1974                 if (rc->rc_error) {
1975                         ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1976
1977                         if (c < rm->rm_firstdatacol)
1978                                 parity_errors++;
1979                         else
1980                                 data_errors++;
1981
1982                         if (!rc->rc_skipped)
1983                                 unexpected_errors++;
1984
1985                         total_errors++;
1986                 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1987                         parity_untried++;
1988                 }
1989         }
1990
1991         if (zio->io_type == ZIO_TYPE_WRITE) {
1992                 /*
1993                  * XXX -- for now, treat partial writes as a success.
1994                  * (If we couldn't write enough columns to reconstruct
1995                  * the data, the I/O failed.  Otherwise, good enough.)
1996                  *
1997                  * Now that we support write reallocation, it would be better
1998                  * to treat partial failure as real failure unless there are
1999                  * no non-degraded top-level vdevs left, and not update DTLs
2000                  * if we intend to reallocate.
2001                  */
2002                 /* XXPOLICY */
2003                 if (total_errors > rm->rm_firstdatacol)
2004                         zio->io_error = vdev_raidz_worst_error(rm);
2005
2006                 return;
2007         } else if (zio->io_type == ZIO_TYPE_FREE) {
2008                 return;
2009         }
2010
2011         ASSERT(zio->io_type == ZIO_TYPE_READ);
2012         /*
2013          * There are three potential phases for a read:
2014          *      1. produce valid data from the columns read
2015          *      2. read all disks and try again
2016          *      3. perform combinatorial reconstruction
2017          *
2018          * Each phase is progressively both more expensive and less likely to
2019          * occur. If we encounter more errors than we can repair or all phases
2020          * fail, we have no choice but to return an error.
2021          */
2022
2023         /*
2024          * If the number of errors we saw was correctable -- less than or equal
2025          * to the number of parity disks read -- attempt to produce data that
2026          * has a valid checksum. Naturally, this case applies in the absence of
2027          * any errors.
2028          */
2029         if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2030                 if (data_errors == 0) {
2031                         if (raidz_checksum_verify(zio) == 0) {
2032                                 /*
2033                                  * If we read parity information (unnecessarily
2034                                  * as it happens since no reconstruction was
2035                                  * needed) regenerate and verify the parity.
2036                                  * We also regenerate parity when resilvering
2037                                  * so we can write it out to the failed device
2038                                  * later.
2039                                  */
2040                                 if (parity_errors + parity_untried <
2041                                     rm->rm_firstdatacol ||
2042                                     (zio->io_flags & ZIO_FLAG_RESILVER)) {
2043                                         n = raidz_parity_verify(zio, rm);
2044                                         unexpected_errors += n;
2045                                         ASSERT(parity_errors + n <=
2046                                             rm->rm_firstdatacol);
2047                                 }
2048                                 goto done;
2049                         }
2050                 } else {
2051                         /*
2052                          * We either attempt to read all the parity columns or
2053                          * none of them. If we didn't try to read parity, we
2054                          * wouldn't be here in the correctable case. There must
2055                          * also have been fewer parity errors than parity
2056                          * columns or, again, we wouldn't be in this code path.
2057                          */
2058                         ASSERT(parity_untried == 0);
2059                         ASSERT(parity_errors < rm->rm_firstdatacol);
2060
2061                         /*
2062                          * Identify the data columns that reported an error.
2063                          */
2064                         n = 0;
2065                         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2066                                 rc = &rm->rm_col[c];
2067                                 if (rc->rc_error != 0) {
2068                                         ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2069                                         tgts[n++] = c;
2070                                 }
2071                         }
2072
2073                         ASSERT(rm->rm_firstdatacol >= n);
2074
2075                         code = vdev_raidz_reconstruct(rm, tgts, n);
2076
2077                         if (raidz_checksum_verify(zio) == 0) {
2078                                 atomic_inc_64(&raidz_corrected[code]);
2079
2080                                 /*
2081                                  * If we read more parity disks than were used
2082                                  * for reconstruction, confirm that the other
2083                                  * parity disks produced correct data. This
2084                                  * routine is suboptimal in that it regenerates
2085                                  * the parity that we already used in addition
2086                                  * to the parity that we're attempting to
2087                                  * verify, but this should be a relatively
2088                                  * uncommon case, and can be optimized if it
2089                                  * becomes a problem. Note that we regenerate
2090                                  * parity when resilvering so we can write it
2091                                  * out to failed devices later.
2092                                  */
2093                                 if (parity_errors < rm->rm_firstdatacol - n ||
2094                                     (zio->io_flags & ZIO_FLAG_RESILVER)) {
2095                                         n = raidz_parity_verify(zio, rm);
2096                                         unexpected_errors += n;
2097                                         ASSERT(parity_errors + n <=
2098                                             rm->rm_firstdatacol);
2099                                 }
2100
2101                                 goto done;
2102                         }
2103                 }
2104         }
2105
2106         /*
2107          * This isn't a typical situation -- either we got a read error or
2108          * a child silently returned bad data. Read every block so we can
2109          * try again with as much data and parity as we can track down. If
2110          * we've already been through once before, all children will be marked
2111          * as tried so we'll proceed to combinatorial reconstruction.
2112          */
2113         unexpected_errors = 1;
2114         rm->rm_missingdata = 0;
2115         rm->rm_missingparity = 0;
2116
2117         for (c = 0; c < rm->rm_cols; c++) {
2118                 if (rm->rm_col[c].rc_tried)
2119                         continue;
2120
2121                 zio_vdev_io_redone(zio);
2122                 do {
2123                         rc = &rm->rm_col[c];
2124                         if (rc->rc_tried)
2125                                 continue;
2126                         zio_nowait(zio_vdev_child_io(zio, NULL,
2127                             vd->vdev_child[rc->rc_devidx],
2128                             rc->rc_offset, rc->rc_data, rc->rc_size,
2129                             zio->io_type, zio->io_priority, 0,
2130                             vdev_raidz_child_done, rc));
2131                 } while (++c < rm->rm_cols);
2132
2133                 return;
2134         }
2135
2136         /*
2137          * At this point we've attempted to reconstruct the data given the
2138          * errors we detected, and we've attempted to read all columns. There
2139          * must, therefore, be one or more additional problems -- silent errors
2140          * resulting in invalid data rather than explicit I/O errors resulting
2141          * in absent data. We check if there is enough additional data to
2142          * possibly reconstruct the data and then perform combinatorial
2143          * reconstruction over all possible combinations. If that fails,
2144          * we're cooked.
2145          */
2146         if (total_errors > rm->rm_firstdatacol) {
2147                 zio->io_error = vdev_raidz_worst_error(rm);
2148
2149         } else if (total_errors < rm->rm_firstdatacol &&
2150             (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2151                 /*
2152                  * If we didn't use all the available parity for the
2153                  * combinatorial reconstruction, verify that the remaining
2154                  * parity is correct.
2155                  */
2156                 if (code != (1 << rm->rm_firstdatacol) - 1)
2157                         (void) raidz_parity_verify(zio, rm);
2158         } else {
2159                 /*
2160                  * We're here because either:
2161                  *
2162                  *      total_errors == rm_first_datacol, or
2163                  *      vdev_raidz_combrec() failed
2164                  *
2165                  * In either case, there is enough bad data to prevent
2166                  * reconstruction.
2167                  *
2168                  * Start checksum ereports for all children which haven't
2169                  * failed, and the IO wasn't speculative.
2170                  */
2171                 zio->io_error = SET_ERROR(ECKSUM);
2172
2173                 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2174                         for (c = 0; c < rm->rm_cols; c++) {
2175                                 rc = &rm->rm_col[c];
2176                                 if (rc->rc_error == 0) {
2177                                         zio_bad_cksum_t zbc;
2178                                         zbc.zbc_has_cksum = 0;
2179                                         zbc.zbc_injected =
2180                                             rm->rm_ecksuminjected;
2181
2182                                         zfs_ereport_start_checksum(
2183                                             zio->io_spa,
2184                                             vd->vdev_child[rc->rc_devidx],
2185                                             zio, rc->rc_offset, rc->rc_size,
2186                                             (void *)(uintptr_t)c, &zbc);
2187                                 }
2188                         }
2189                 }
2190         }
2191
2192 done:
2193         zio_checksum_verified(zio);
2194
2195         if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2196             (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2197                 /*
2198                  * Use the good data we have in hand to repair damaged children.
2199                  */
2200                 for (c = 0; c < rm->rm_cols; c++) {
2201                         rc = &rm->rm_col[c];
2202                         cvd = vd->vdev_child[rc->rc_devidx];
2203
2204                         if (rc->rc_error == 0)
2205                                 continue;
2206
2207                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2208                             rc->rc_offset, rc->rc_data, rc->rc_size,
2209                             ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2210                             ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2211                             ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2212                 }
2213         }
2214 }
2215
2216 static void
2217 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2218 {
2219         if (faulted > vd->vdev_nparity)
2220                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2221                     VDEV_AUX_NO_REPLICAS);
2222         else if (degraded + faulted != 0)
2223                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2224         else
2225                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2226 }
2227
2228 vdev_ops_t vdev_raidz_ops = {
2229         vdev_raidz_open,
2230         vdev_raidz_close,
2231         vdev_raidz_asize,
2232         vdev_raidz_io_start,
2233         vdev_raidz_io_done,
2234         vdev_raidz_state_change,
2235         NULL,
2236         NULL,
2237         VDEV_TYPE_RAIDZ,        /* name of this vdev type */
2238         B_FALSE                 /* not a leaf vdev */
2239 };