4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
29 #include <sys/zfs_context.h>
31 #include <sys/vdev_impl.h>
33 #include <sys/vdev_disk.h>
35 #include <sys/vdev_file.h>
36 #include <sys/vdev_raidz.h>
38 #include <sys/zio_checksum.h>
39 #include <sys/fs/zfs.h>
40 #include <sys/fm/fs/zfs.h>
44 * Virtual device vector for RAID-Z.
46 * This vdev supports single, double, and triple parity. For single parity,
47 * we use a simple XOR of all the data columns. For double or triple parity,
48 * we use a special case of Reed-Solomon coding. This extends the
49 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
50 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
51 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
52 * former is also based. The latter is designed to provide higher performance
55 * Note that the Plank paper claimed to support arbitrary N+M, but was then
56 * amended six years later identifying a critical flaw that invalidates its
57 * claims. Nevertheless, the technique can be adapted to work for up to
58 * triple parity. For additional parity, the amendment "Note: Correction to
59 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
60 * is viable, but the additional complexity means that write performance will
63 * All of the methods above operate on a Galois field, defined over the
64 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
65 * can be expressed with a single byte. Briefly, the operations on the
66 * field are defined as follows:
68 * o addition (+) is represented by a bitwise XOR
69 * o subtraction (-) is therefore identical to addition: A + B = A - B
70 * o multiplication of A by 2 is defined by the following bitwise expression:
75 * (A * 2)_4 = A_3 + A_7
76 * (A * 2)_3 = A_2 + A_7
77 * (A * 2)_2 = A_1 + A_7
81 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
82 * As an aside, this multiplication is derived from the error correcting
83 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
85 * Observe that any number in the field (except for 0) can be expressed as a
86 * power of 2 -- a generator for the field. We store a table of the powers of
87 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
88 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
89 * than field addition). The inverse of a field element A (A^-1) is therefore
90 * A ^ (255 - 1) = A^254.
92 * The up-to-three parity columns, P, Q, R over several data columns,
93 * D_0, ... D_n-1, can be expressed by field operations:
95 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
96 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
97 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
98 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
99 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
101 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
102 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
103 * independent coefficients. (There are no additional coefficients that have
104 * this property which is why the uncorrected Plank method breaks down.)
106 * See the reconstruction code below for how P, Q and R can be used individually
107 * or in concert to recover missing data columns.
/*
 * Per-column state for one mapped RAID-Z I/O: one entry per child vdev
 * touched (or skipped) by the I/O, held in raidz_map_t's rm_col[] array.
 */
110 typedef struct raidz_col {
111 uint64_t rc_devidx; /* child device index for I/O */
112 uint64_t rc_offset; /* device offset */
113 uint64_t rc_size; /* I/O size */
114 void *rc_data; /* I/O data */
115 void *rc_gdata; /* used to store the "good" version */
116 int rc_error; /* I/O error for this device */
117 uint8_t rc_tried; /* Did we attempt this I/O column? */
118 uint8_t rc_skipped; /* Did we skip this I/O column? */
/*
 * Describes how one logical I/O is spread across the child vdevs:
 * column geometry, padding/skip accounting, checksum-report reference
 * counting, and a trailing old-style flexible array of raidz_col_t
 * (allocated via offsetof(raidz_map_t, rm_col[rm_scols])).
 */
121 typedef struct raidz_map {
122 uint64_t rm_cols; /* Regular column count */
123 uint64_t rm_scols; /* Count including skipped columns */
124 uint64_t rm_bigcols; /* Number of oversized columns */
125 uint64_t rm_asize; /* Actual total I/O size */
126 uint64_t rm_missingdata; /* Count of missing data devices */
127 uint64_t rm_missingparity; /* Count of missing parity devices */
128 uint64_t rm_firstdatacol; /* First data column/parity count */
129 uint64_t rm_nskip; /* Skipped sectors for padding */
130 uint64_t rm_skipstart; /* Column index of padding start */
131 void *rm_datacopy; /* rm_asize-buffer of copied data */
132 uintptr_t rm_reports; /* # of referencing checksum reports */
133 uint8_t rm_freed; /* map no longer has referencing ZIO */
134 uint8_t rm_ecksuminjected; /* checksum error was injected */
135 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
/* Indices of the parity columns within rm_col[]. */
138 #define VDEV_RAIDZ_P 0
139 #define VDEV_RAIDZ_Q 1
140 #define VDEV_RAIDZ_R 2
/*
 * Multiply a single byte by 2 in GF(2^8): shift left and, if the high
 * bit was set, reduce by XORing 0x1d (from the primitive polynomial
 * x^8 + x^4 + x^3 + x^2 + 1 described above).  NOTE: the argument is
 * evaluated more than once — do not pass expressions with side effects.
 */
142 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
/* Multiply by 4 = two successive multiplications by 2. */
143 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
146 * We provide a mechanism to perform the field multiplication operation on a
147 * 64-bit value all at once rather than a byte at a time. This works by
148 * creating a mask from the top bit in each byte and using that to
149 * conditionally apply the XOR of 0x1d.
/*
 * 64-bit-wide GF(2^8) multiply-by-2 of eight packed bytes at once (see
 * the comment above): "mask" gets 0xff in each byte whose high bit was
 * set ((m << 1) - (m >> 7) turns 0x80 into 0xff per byte), the shift is
 * masked to stop carries crossing byte lanes, and 0x1d is XORed into
 * exactly the lanes that overflowed.  Both x and mask are written.
 * The _4 variant applies the _2 step twice.
 */
151 #define VDEV_RAIDZ_64MUL_2(x, mask) \
153 (mask) = (x) & 0x8080808080808080ULL; \
154 (mask) = ((mask) << 1) - ((mask) >> 7); \
155 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
156 ((mask) & 0x1d1d1d1d1d1d1d1d); \
159 #define VDEV_RAIDZ_64MUL_4(x, mask) \
161 VDEV_RAIDZ_64MUL_2((x), mask); \
162 VDEV_RAIDZ_64MUL_2((x), mask); \
/*
 * Translate an offset to its on-disk location past the front label area.
 * The argument is parenthesized so an expression argument such as
 * VDEV_LABEL_OFFSET(a + b) expands with the intended precedence.
 */
165 #define VDEV_LABEL_OFFSET(x) ((x) + VDEV_LABEL_START_SIZE)
168 * Force reconstruction to use the general purpose method.
/* Global tunable: nonzero selects the general (matrix) reconstruction path. */
170 int vdev_raidz_default_to_general;
172 /* Powers of 2 in the Galois field defined above. */
/*
 * vdev_raidz_pow2[i] == 2^i in GF(2^8) under the primitive polynomial
 * noted above; the sequence is cyclic with period 255, so the final
 * entry (index 255) wraps back to 0x01.
 */
173 static const uint8_t vdev_raidz_pow2[256] = {
174 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
175 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
176 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
177 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
178 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
179 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
180 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
181 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
182 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
183 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
184 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
185 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
186 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
187 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
188 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
189 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
190 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
191 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
192 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
193 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
194 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
195 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
196 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
197 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
198 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
199 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
200 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
201 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
202 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
203 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
204 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
205 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
207 /* Logs of 2 in the Galois field defined above. */
/*
 * vdev_raidz_log2[i] == log base 2 of i in GF(2^8); the inverse of the
 * pow2 table above.  Index 0 holds 0 as a placeholder (zero has no
 * logarithm), which is why callers assert log2[a] > 0 || a == 1.
 */
208 static const uint8_t vdev_raidz_log2[256] = {
209 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
210 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
211 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
212 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
213 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
214 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
215 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
216 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
217 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
218 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
219 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
220 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
221 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
222 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
223 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
224 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
225 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
226 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
227 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
228 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
229 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
230 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
231 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
232 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
233 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
234 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
235 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
236 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
237 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
238 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
239 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
240 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
/* Forward declaration: needed by vdev_raidz_cksum_finish() below. */
243 static void vdev_raidz_generate_parity(raidz_map_t *rm);
246 * Multiply a given number by 2 raised to the given power.
/*
 * Computed in log space: result = pow2[log2[a] + exp].  The ASSERT
 * documents that log2[a] is only meaningful for nonzero a (log2[1] is
 * the table's only legitimate zero entry).  NOTE(review): the exponent
 * must be reduced mod 255 before the table lookup, and a == 0 must
 * short-circuit to 0 — confirm both happen in the lines not shown here.
 */
249 vdev_raidz_exp2(uint_t a, int exp)
255 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
257 exp += vdev_raidz_log2[a];
261 return (vdev_raidz_pow2[exp]);
/*
 * Free a raidz_map_t and the buffers it owns: the parity columns'
 * rc_data and cached rc_gdata buffers (columns below rm_firstdatacol)
 * and the rm_datacopy buffer, sized as the sum of the data-column
 * sizes.  Data-column rc_data pointers are not freed — they point into
 * caller memory (see vdev_raidz_map_alloc()).
 */
265 vdev_raidz_map_free(raidz_map_t *rm)
270 for (c = 0; c < rm->rm_firstdatacol; c++) {
271 if (rm->rm_col[c].rc_data != NULL)
272 zio_buf_free(rm->rm_col[c].rc_data,
273 rm->rm_col[c].rc_size);
275 if (rm->rm_col[c].rc_gdata != NULL)
276 zio_buf_free(rm->rm_col[c].rc_gdata,
277 rm->rm_col[c].rc_size);
/* rm_datacopy, when present, covers all data columns back to back. */
281 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
282 size += rm->rm_col[c].rc_size;
284 if (rm->rm_datacopy != NULL)
285 zio_buf_free(rm->rm_datacopy, size);
/* The map itself was allocated with rm_scols trailing columns. */
287 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
/*
 * zio vsd destructor.  Only frees the map immediately when no checksum
 * reports still reference it; otherwise the last report's free callback
 * (vdev_raidz_cksum_free) performs the deferred free.  NOTE(review):
 * rm_freed is presumably set here in lines not visible — confirm.
 */
291 vdev_raidz_map_free_vsd(zio_t *zio)
293 raidz_map_t *rm = zio->io_vsd;
295 ASSERT0(rm->rm_freed);
298 if (rm->rm_reports == 0)
299 vdev_raidz_map_free(rm);
/*
 * Checksum-report free callback: drop one reference on the raidz_map_t
 * and free it if this was the last reference and the owning zio has
 * already finished (rm_freed set).
 */
304 vdev_raidz_cksum_free(void *arg, size_t ignored)
306 raidz_map_t *rm = arg;
308 ASSERT3U(rm->rm_reports, >, 0);
310 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
311 vdev_raidz_map_free(rm);
/*
 * Finish a deferred checksum ereport for column c: compute the expected
 * ("good") contents of that column from good_data and hand the
 * good/bad pair to zfs_ereport_finish_checksum().  For parity columns
 * the good parity is regenerated from good_data once and cached in
 * rc_gdata; for data columns the right slice of good_data is used.
 */
315 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
317 raidz_map_t *rm = zcr->zcr_cbdata;
318 size_t c = zcr->zcr_cbinfo;
321 const char *good = NULL;
322 const char *bad = rm->rm_col[c].rc_data;
/* No good data available: file the ereport without a comparison. */
324 if (good_data == NULL) {
325 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
329 if (c < rm->rm_firstdatacol) {
331 * The first time through, calculate the parity blocks for
332 * the good data (this relies on the fact that the good
333 * data never changes for a given logical ZIO)
335 if (rm->rm_col[0].rc_gdata == NULL) {
336 char *bad_parity[VDEV_RAIDZ_MAXPARITY];
340 * Set up the rm_col[]s to generate the parity for
341 * good_data, first saving the parity bufs and
342 * replacing them with buffers to hold the result.
344 for (x = 0; x < rm->rm_firstdatacol; x++) {
345 bad_parity[x] = rm->rm_col[x].rc_data;
346 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
347 zio_buf_alloc(rm->rm_col[x].rc_size);
350 /* fill in the data columns from good_data */
351 buf = (char *)good_data;
352 for (; x < rm->rm_cols; x++) {
353 rm->rm_col[x].rc_data = buf;
354 buf += rm->rm_col[x].rc_size;
358 * Construct the parity from the good data.
360 vdev_raidz_generate_parity(rm);
362 /* restore everything back to its original state */
363 for (x = 0; x < rm->rm_firstdatacol; x++)
364 rm->rm_col[x].rc_data = bad_parity[x];
/* Data columns point back into the saved rm_datacopy snapshot. */
366 buf = rm->rm_datacopy;
367 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
368 rm->rm_col[x].rc_data = buf;
369 buf += rm->rm_col[x].rc_size;
373 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
374 good = rm->rm_col[c].rc_gdata;
376 /* adjust good_data to point at the start of our column */
379 for (x = rm->rm_firstdatacol; x < c; x++)
380 good += rm->rm_col[x].rc_size;
383 /* we drop the ereport if it ends up that the data was good */
384 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
388 * Invoked indirectly by zfs_ereport_start_checksum(), called
389 * below when our read operation fails completely. The main point
390 * is to keep a copy of everything we read from disk, so that at
391 * vdev_raidz_cksum_finish() time we can compare it with the good data.
394 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
396 size_t c = (size_t)(uintptr_t)arg;
399 raidz_map_t *rm = zio->io_vsd;
402 /* set up the report and bump the refcount */
403 zcr->zcr_cbdata = rm;
405 zcr->zcr_finish = vdev_raidz_cksum_finish;
406 zcr->zcr_free = vdev_raidz_cksum_free;
409 ASSERT3U(rm->rm_reports, >, 0);
/* Data already copied aside by an earlier report for this map. */
411 if (rm->rm_datacopy != NULL)
415 * It's the first time we're called for this raidz_map_t, so we need
416 * to copy the data aside; there's no guarantee that our zio's buffer
417 * won't be re-used for something else.
419 * Our parity data is already in separate buffers, so there's no need
/* Size the snapshot as the sum of all data-column sizes. */
424 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
425 size += rm->rm_col[c].rc_size;
427 buf = rm->rm_datacopy = zio_buf_alloc(size);
429 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
430 raidz_col_t *col = &rm->rm_col[c];
432 bcopy(col->rc_data, buf, col->rc_size);
437 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
/* vsd (vdev-specific data) callbacks installed on raidz zios. */
440 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
441 vdev_raidz_map_free_vsd,
442 vdev_raidz_cksum_report
446 * Divides the IO evenly across all child vdevs; usually, dcols is
447 * the number of children in the target vdev.
/*
 * Build and return the raidz_map_t describing how the I/O at
 * (data, size, offset) lays out over dcols children with nparity
 * parity columns; unit_shift is the child's minimum-sector shift.
 * Parity columns get freshly allocated buffers; data columns point
 * directly into the caller's "data" buffer.  NOTE(review): the dofree
 * flag is not referenced in the visible portion of this function.
 */
450 vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
451 uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
454 /* The starting RAIDZ (parent) vdev sector of the block. */
455 uint64_t b = offset >> unit_shift;
456 /* The zio's size in units of the vdev's minimum sector size. */
457 uint64_t s = size >> unit_shift;
458 /* The first column for this stripe. */
459 uint64_t f = b % dcols;
460 /* The starting byte offset on each child vdev. */
461 uint64_t o = (b / dcols) << unit_shift;
462 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
465 * "Quotient": The number of data sectors for this stripe on all but
466 * the "big column" child vdevs that also contain "remainder" data.
468 q = s / (dcols - nparity);
471 * "Remainder": The number of partial stripe data sectors in this I/O.
472 * This will add a sector to some, but not all, child vdevs.
474 r = s - q * (dcols - nparity);
476 /* The number of "big columns" - those which contain remainder data. */
477 bc = (r == 0 ? 0 : r + nparity);
480 * The total number of data and parity sectors associated with
483 tot = s + nparity * (q + (r == 0 ? 0 : 1));
485 /* acols: The columns that will be accessed. */
486 /* scols: The columns that will be accessed or skipped. */
488 /* Our I/O request doesn't span all child vdevs. */
490 scols = MIN(dcols, roundup(bc, nparity + 1));
496 ASSERT3U(acols, <=, scols);
498 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
501 rm->rm_scols = scols;
503 rm->rm_skipstart = bc;
504 rm->rm_missingdata = 0;
505 rm->rm_missingparity = 0;
506 rm->rm_firstdatacol = nparity;
507 rm->rm_datacopy = NULL;
510 rm->rm_ecksuminjected = 0;
/* Initialize each column's device, offset, and bookkeeping fields. */
514 for (c = 0; c < scols; c++) {
519 coff += 1ULL << unit_shift;
521 rm->rm_col[c].rc_devidx = col;
522 rm->rm_col[c].rc_offset = coff;
523 rm->rm_col[c].rc_data = NULL;
524 rm->rm_col[c].rc_gdata = NULL;
525 rm->rm_col[c].rc_error = 0;
526 rm->rm_col[c].rc_tried = 0;
527 rm->rm_col[c].rc_skipped = 0;
/* Big columns carry q+1 sectors, the rest q; skipped columns 0. */
530 rm->rm_col[c].rc_size = 0;
532 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
534 rm->rm_col[c].rc_size = q << unit_shift;
536 asize += rm->rm_col[c].rc_size;
539 ASSERT3U(asize, ==, tot << unit_shift);
540 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
541 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
542 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
543 ASSERT3U(rm->rm_nskip, <=, nparity);
/* Allocate parity buffers; data columns alias the caller's buffer. */
546 for (c = 0; c < rm->rm_firstdatacol; c++) {
547 rm->rm_col[c].rc_data =
548 zio_buf_alloc(rm->rm_col[c].rc_size);
551 rm->rm_col[c].rc_data = data;
553 for (c = c + 1; c < acols; c++) {
554 rm->rm_col[c].rc_data =
555 (char *)rm->rm_col[c - 1].rc_data +
556 rm->rm_col[c - 1].rc_size;
561 * If all data stored spans all columns, there's a danger that parity
562 * will always be on the same device and, since parity isn't read
563 * during normal operation, that that device's I/O bandwidth won't be
564 * used effectively. We therefore switch the parity every 1MB.
566 * ... at least that was, ostensibly, the theory. As a practical
567 * matter unless we juggle the parity between all devices evenly, we
568 * won't see any benefit. Further, occasional writes that aren't a
569 * multiple of the LCM of the number of children and the minimum
570 * stripe width are sufficient to avoid pessimal behavior.
571 * Unfortunately, this decision created an implicit on-disk format
572 * requirement that we need to support for all eternity, but only
573 * for single-parity RAID-Z.
575 * If we intend to skip a sector in the zeroth column for padding
576 * we must make sure to note this swap. We will never intend to
577 * skip the first column since at least one data and one parity
578 * column must appear in each row.
580 ASSERT(rm->rm_cols >= 2);
581 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
583 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
584 devidx = rm->rm_col[0].rc_devidx;
585 o = rm->rm_col[0].rc_offset;
586 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
587 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
588 rm->rm_col[1].rc_devidx = devidx;
589 rm->rm_col[1].rc_offset = o;
591 if (rm->rm_skipstart == 0)
592 rm->rm_skipstart = 1;
/*
 * Generate single (P) parity: the bytewise XOR of all data columns,
 * accumulated 64 bits at a time.  The first data column seeds P; later
 * (possibly shorter) columns are XORed in, with short columns treated
 * as zero-padded.
 */
599 vdev_raidz_generate_parity_p(raidz_map_t *rm)
601 uint64_t *p, *src, pcount, ccount, i;
604 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
606 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
607 src = rm->rm_col[c].rc_data;
608 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
609 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
611 if (c == rm->rm_firstdatacol) {
612 ASSERT(ccount == pcount);
613 for (i = 0; i < ccount; i++, src++, p++) {
617 ASSERT(ccount <= pcount);
618 for (i = 0; i < ccount; i++, src++, p++) {
/*
 * Generate P and Q parity.  P is the plain XOR of the data columns; Q
 * is the Horner's-rule evaluation from the header comment: for each
 * subsequent column the running Q is multiplied by 2 in GF(2^8)
 * (64 bits at a time via VDEV_RAIDZ_64MUL_2) before the new data is
 * XORed in.  Short columns are treated as zero-filled, which still
 * requires the multiply step for Q.
 */
626 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
628 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
631 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
632 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
633 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
635 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
636 src = rm->rm_col[c].rc_data;
637 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
638 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
640 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
642 if (c == rm->rm_firstdatacol) {
643 ASSERT(ccnt == pcnt || ccnt == 0);
644 for (i = 0; i < ccnt; i++, src++, p++, q++) {
648 for (; i < pcnt; i++, src++, p++, q++) {
653 ASSERT(ccnt <= pcnt);
656 * Apply the algorithm described above by multiplying
657 * the previous result and adding in the new value.
659 for (i = 0; i < ccnt; i++, src++, p++, q++) {
662 VDEV_RAIDZ_64MUL_2(*q, mask);
667 * Treat short columns as though they are full of 0s.
668 * Note that there's therefore nothing needed for P.
670 for (; i < pcnt; i++, q++) {
671 VDEV_RAIDZ_64MUL_2(*q, mask);
/*
 * Generate P, Q, and R parity.  Identical in structure to the PQ case
 * above, with a third accumulator R whose generator is 4: each step
 * multiplies the running R by 4 in GF(2^8) (VDEV_RAIDZ_64MUL_4) before
 * XORing in the new column's data.
 */
678 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
680 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
683 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
684 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
685 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
686 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
687 rm->rm_col[VDEV_RAIDZ_R].rc_size);
689 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
690 src = rm->rm_col[c].rc_data;
691 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
692 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
693 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
695 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
697 if (c == rm->rm_firstdatacol) {
698 ASSERT(ccnt == pcnt || ccnt == 0);
699 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
704 for (; i < pcnt; i++, src++, p++, q++, r++) {
710 ASSERT(ccnt <= pcnt);
713 * Apply the algorithm described above by multiplying
714 * the previous result and adding in the new value.
716 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
719 VDEV_RAIDZ_64MUL_2(*q, mask);
722 VDEV_RAIDZ_64MUL_4(*r, mask);
727 * Treat short columns as though they are full of 0s.
728 * Note that there's therefore nothing needed for P.
730 for (; i < pcnt; i++, q++, r++) {
731 VDEV_RAIDZ_64MUL_2(*q, mask);
732 VDEV_RAIDZ_64MUL_4(*r, mask);
739 * Generate RAID parity in the first virtual columns according to the number of
740 * parity columns available.
/*
 * Dispatch on rm_firstdatacol (== parity count): 1 -> P, 2 -> P+Q,
 * 3 -> P+Q+R; anything else panics (the case labels are not visible
 * in this view but the call order below fixes the mapping).
 */
743 vdev_raidz_generate_parity(raidz_map_t *rm)
745 switch (rm->rm_firstdatacol) {
747 vdev_raidz_generate_parity_p(rm);
750 vdev_raidz_generate_parity_pq(rm);
753 vdev_raidz_generate_parity_pqr(rm);
756 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
/*
 * Reconstruct a single missing data column x using only P parity:
 * copy P into the target, then XOR every surviving data column back
 * out of it.  Returns the bitmask of parity columns used (P only).
 */
761 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
763 uint64_t *dst, *src, xcount, ccount, count, i;
768 ASSERT(x >= rm->rm_firstdatacol);
769 ASSERT(x < rm->rm_cols);
771 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
772 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
/* Seed the target column with P. */
775 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
776 dst = rm->rm_col[x].rc_data;
777 for (i = 0; i < xcount; i++, dst++, src++) {
/* XOR out every other data column (the loop body skips column x). */
781 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
782 src = rm->rm_col[c].rc_data;
783 dst = rm->rm_col[x].rc_data;
788 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
789 count = MIN(ccount, xcount);
791 for (i = 0; i < count; i++, dst++, src++) {
796 return (1 << VDEV_RAIDZ_P);
/*
 * Reconstruct a single missing data column x using only Q parity:
 * re-run the Q accumulation over the surviving data columns (treating
 * column x as zero), XOR the result with Q, then divide out x's
 * coefficient by multiplying each byte by 2^(255 - (ndevs - 1 - x)).
 * Returns the bitmask of parity columns used (Q only).
 */
800 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
802 uint64_t *dst, *src, xcount, ccount, count, mask, i;
809 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
810 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
812 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
813 src = rm->rm_col[c].rc_data;
814 dst = rm->rm_col[x].rc_data;
819 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
821 count = MIN(ccount, xcount);
823 if (c == rm->rm_firstdatacol) {
824 for (i = 0; i < count; i++, dst++, src++) {
827 for (; i < xcount; i++, dst++) {
832 for (i = 0; i < count; i++, dst++, src++) {
833 VDEV_RAIDZ_64MUL_2(*dst, mask);
837 for (; i < xcount; i++, dst++) {
838 VDEV_RAIDZ_64MUL_2(*dst, mask);
/* Fold in Q and scale each byte back by the inverse coefficient. */
843 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
844 dst = rm->rm_col[x].rc_data;
845 exp = 255 - (rm->rm_cols - 1 - x);
847 for (i = 0; i < xcount; i++, dst++, src++) {
849 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
850 *b = vdev_raidz_exp2(*b, exp);
854 return (1 << VDEV_RAIDZ_Q);
/*
 * Reconstruct two missing data columns x and y (x < y) using both P
 * and Q, per the closed-form solution in the comment below: compute
 * Pxy/Qxy (parity as if x and y were zero), then solve for D_x and
 * D_y.  Returns the bitmask of parity columns used (P and Q).
 */
858 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
860 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
862 uint64_t xsize, ysize, i;
868 ASSERT(x >= rm->rm_firstdatacol);
869 ASSERT(y < rm->rm_cols);
871 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
874 * Move the parity data aside -- we're going to compute parity as
875 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
876 * reuse the parity generation mechanism without trashing the actual
877 * parity so we make those columns appear to be full of zeros by
878 * setting their lengths to zero.
880 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
881 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
882 xsize = rm->rm_col[x].rc_size;
883 ysize = rm->rm_col[y].rc_size;
885 rm->rm_col[VDEV_RAIDZ_P].rc_data =
886 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
887 rm->rm_col[VDEV_RAIDZ_Q].rc_data =
888 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
889 rm->rm_col[x].rc_size = 0;
890 rm->rm_col[y].rc_size = 0;
892 vdev_raidz_generate_parity_pq(rm);
894 rm->rm_col[x].rc_size = xsize;
895 rm->rm_col[y].rc_size = ysize;
899 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
900 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
901 xd = rm->rm_col[x].rc_data;
902 yd = rm->rm_col[y].rc_data;
906 * Pxy = P + D_x + D_y
907 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
909 * We can then solve for D_x:
910 * D_x = A * (P + Pxy) + B * (Q + Qxy)
912 * A = 2^(x - y) * (2^(x - y) + 1)^-1
913 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
915 * With D_x in hand, we can easily solve for D_y:
916 * D_y = P + Pxy + D_x
/* Coefficients computed in log space using the pow2/log2 tables. */
919 a = vdev_raidz_pow2[255 + x - y];
920 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
921 tmp = 255 - vdev_raidz_log2[a ^ 1];
923 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
924 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
926 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
927 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
928 vdev_raidz_exp2(*q ^ *qxy, bexp);
931 *yd = *p ^ *pxy ^ *xd;
/* Release the scratch Pxy/Qxy buffers allocated above. */
934 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
935 rm->rm_col[VDEV_RAIDZ_P].rc_size);
936 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
937 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
940 * Restore the saved parity data.
942 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
943 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
945 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
950 * In the general case of reconstruction, we must solve the system of linear
951 * equations defined by the coefficients used to generate parity as well as
952 * the contents of the data and parity disks. This can be expressed with
953 * vectors for the original data (D) and the actual data (d) and parity (p)
954 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
958 * | V | | D_0 | | p_m-1 |
959 * | | x | : | = | d_0 |
960 * | I | | D_n-1 | | : |
961 * | | ~~ ~~ | d_n-1 |
964 * I is simply a square identity matrix of size n, and V is a Vandermonde
965 * matrix defined by the coefficients we chose for the various parity columns
966 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
967 * computation as well as linear separability.
970 * | 1 .. 1 1 1 | | p_0 |
971 * | 2^n-1 .. 4 2 1 | __ __ | : |
972 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
973 * | 1 .. 0 0 0 | | D_1 | | d_0 |
974 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
975 * | : : : : | | : | | d_2 |
976 * | 0 .. 1 0 0 | | D_n-1 | | : |
977 * | 0 .. 0 1 0 | ~~ ~~ | : |
978 * | 0 .. 0 0 1 | | d_n-1 |
981 * Note that I, V, d, and p are known. To compute D, we must invert the
982 * matrix and use the known data and parity values to reconstruct the unknown
983 * data values. We begin by removing the rows in V|I and d|p that correspond
984 * to failed or missing columns; we then make V|I square (n x n) and d|p
985 * sized n by removing rows corresponding to unused parity from the bottom up
986 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
987 * using Gauss-Jordan elimination. In the example below we use m=3 parity
988 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
990 * | 1 1 1 1 1 1 1 1 |
991 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
992 * | 19 205 116 29 64 16 4 1 | / /
993 * | 1 0 0 0 0 0 0 0 | / /
994 * | 0 1 0 0 0 0 0 0 | <--' /
995 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
996 * | 0 0 0 1 0 0 0 0 |
997 * | 0 0 0 0 1 0 0 0 |
998 * | 0 0 0 0 0 1 0 0 |
999 * | 0 0 0 0 0 0 1 0 |
1000 * | 0 0 0 0 0 0 0 1 |
1003 * | 1 1 1 1 1 1 1 1 |
1004 * | 19 205 116 29 64 16 4 1 |
1005 * | 1 0 0 0 0 0 0 0 |
1006 * (V|I)' = | 0 0 0 1 0 0 0 0 |
1007 * | 0 0 0 0 1 0 0 0 |
1008 * | 0 0 0 0 0 1 0 0 |
1009 * | 0 0 0 0 0 0 1 0 |
1010 * | 0 0 0 0 0 0 0 1 |
1013 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1014 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1015 * matrix is not singular.
1017 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1018 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1019 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1020 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1021 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1022 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1023 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1024 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1027 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1028 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1029 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1030 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1031 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1032 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1033 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1034 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1037 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1038 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1039 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1040 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1041 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1042 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1043 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1044 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1047 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1048 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1049 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1050 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1051 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1052 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1053 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1054 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1057 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1058 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1059 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1060 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1061 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1062 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1063 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1064 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1067 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1068 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1069 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1070 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1071 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1072 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1073 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1074 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1077 * | 0 0 1 0 0 0 0 0 |
1078 * | 167 100 5 41 159 169 217 208 |
1079 * | 166 100 4 40 158 168 216 209 |
1080 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1081 * | 0 0 0 0 1 0 0 0 |
1082 * | 0 0 0 0 0 1 0 0 |
1083 * | 0 0 0 0 0 0 1 0 |
1084 * | 0 0 0 0 0 0 0 1 |
1087 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1088 * of the missing data.
1090 * As is apparent from the example above, the only non-trivial rows in the
1091 * inverse matrix correspond to the data disks that we're trying to
1092 * reconstruct. Indeed, those are the only rows we need as the others would
1093 * only be useful for reconstructing data known or assumed to be valid. For
1094 * that reason, we only build the coefficients in the rows that correspond to
/*
 * Fill in the rows of the (V|I) matrix (see the comment above) that
 * correspond to the missing columns being reconstructed; each entry is
 * a power of 2 from the Vandermonde coefficients.  map[i] selects which
 * parity row (0=P, 1=Q, 2=R) the i'th row is built from.
 */
1100 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1106 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1109 * Fill in the missing rows of interest.
1111 for (i = 0; i < nmap; i++) {
1112 ASSERT3S(0, <=, map[i]);
1113 ASSERT3S(map[i], <=, 2);
1120 for (j = 0; j < n; j++) {
1124 rows[i][j] = vdev_raidz_pow2[pow];
/*
 * Invert the nmissing rows of interest via Gauss-Jordan elimination over
 * GF(2^8).  rows[] holds the matrix built by vdev_raidz_matrix_init();
 * invrows[] receives the corresponding rows of the inverse.  used[] lists
 * the columns actually read: the first nmissing entries must be parity
 * columns and the remainder data columns (asserted below).  missing[]
 * gives, for each row, the data-column index being reconstructed, which
 * serves as the pivot position.
 * NOTE(review): this excerpt is elided; some loop bodies and local
 * declarations are not visible here.
 */
1130 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1131 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1137 * Assert that the first nmissing entries from the array of used
1138 * columns correspond to parity columns and that subsequent entries
1139 * correspond to data columns.
1141 for (i = 0; i < nmissing; i++) {
1142 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1144 for (; i < n; i++) {
1145 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1149 * First initialize the storage where we'll compute the inverse rows.
/* Start invrows as the identity so it accumulates the inverse. */
1151 for (i = 0; i < nmissing; i++) {
1152 for (j = 0; j < n; j++) {
1153 invrows[i][j] = (i == j) ? 1 : 0;
1158 * Subtract all trivial rows from the rows of consequence.
1160 for (i = 0; i < nmissing; i++) {
1161 for (j = nmissing; j < n; j++) {
1162 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1163 jj = used[j] - rm->rm_firstdatacol;
1165 invrows[i][j] = rows[i][jj];
1171 * For each of the rows of interest, we must normalize it and subtract
1172 * a multiple of it from the other rows.
1174 for (i = 0; i < nmissing; i++) {
/* Everything left of the pivot must already be eliminated. */
1175 for (j = 0; j < missing[i]; j++) {
1176 ASSERT0(rows[i][j]);
1178 ASSERT3U(rows[i][missing[i]], !=, 0);
1181 * Compute the inverse of the first element and multiply each
1182 * element in the row by that value.
/*
 * In GF(2^8), 1/x == x^254 == exp(255 - log(x)); multiplying by
 * exp2(.., log) scales the row so the pivot becomes 1.
 */
1184 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1186 for (j = 0; j < n; j++) {
1187 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1188 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
/* Eliminate this pivot column from every other row of interest. */
1191 for (ii = 0; ii < nmissing; ii++) {
1195 ASSERT3U(rows[ii][missing[i]], !=, 0);
1197 log = vdev_raidz_log2[rows[ii][missing[i]]];
1199 for (j = 0; j < n; j++) {
1201 vdev_raidz_exp2(rows[i][j], log);
1203 vdev_raidz_exp2(invrows[i][j], log);
1209 * Verify that the data that is left in the rows are properly part of
1210 * an identity matrix.
1212 for (i = 0; i < nmissing; i++) {
1213 for (j = 0; j < n; j++) {
1214 if (j == missing[i]) {
1215 ASSERT3U(rows[i][j], ==, 1);
1217 ASSERT0(rows[i][j]);
/*
 * Multiply the inverted matrix by the surviving columns to regenerate
 * the missing data in place: D = (V|I)'^-1 x (d|p)' (see the block
 * comment above).  invrows[] is converted to log form up front (invlog)
 * so each per-byte multiply is a table lookup plus an add.  used[] maps
 * matrix row i to the raidz-map column supplying the source data;
 * missing[] maps output j to the destination data column.
 * NOTE(review): this excerpt is elided; the accumulation into dst[] and
 * several declarations are not visible here.
 */
1224 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1225 int *missing, uint8_t **invrows, const uint8_t *used)
1230 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1231 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1235 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
/* Scratch space for the log form of the inverse matrix. */
1239 psize = sizeof (invlog[0][0]) * n * nmissing;
1240 p = kmem_alloc(psize, KM_SLEEP);
1242 for (pp = p, i = 0; i < nmissing; i++) {
/* Precompute logs; inverse entries must be non-zero for log2[] lookup. */
1247 for (i = 0; i < nmissing; i++) {
1248 for (j = 0; j < n; j++) {
1249 ASSERT3U(invrows[i][j], !=, 0);
1250 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
/* Walk each surviving source column and scatter into the targets. */
1254 for (i = 0; i < n; i++) {
1256 ASSERT3U(c, <, rm->rm_cols);
1258 src = rm->rm_col[c].rc_data;
1259 ccount = rm->rm_col[c].rc_size;
1260 for (j = 0; j < nmissing; j++) {
1261 cc = missing[j] + rm->rm_firstdatacol;
1262 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1263 ASSERT3U(cc, <, rm->rm_cols);
1264 ASSERT3U(cc, !=, c);
1266 dst[j] = rm->rm_col[cc].rc_data;
1267 dcount[j] = rm->rm_col[cc].rc_size;
1270 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1272 for (x = 0; x < ccount; x++, src++) {
1274 log = vdev_raidz_log2[*src];
1276 for (cc = 0; cc < nmissing; cc++) {
/* Skip destinations shorter than the current offset. */
1277 if (x >= dcount[cc])
/* Multiply in log space; reduce the exponent mod 255. */
1283 if ((ll = log + invlog[cc][i]) >= 255)
1285 val = vdev_raidz_pow2[ll];
1296 kmem_free(p, psize);
/*
 * General-case reconstruction for an arbitrary set of ntgts failed
 * columns (tgts[] holds raidz-map column indices).  Builds the matrix
 * rows for the chosen parity columns, inverts them, and regenerates the
 * missing data.  Scratch storage for rows/invrows/used is carved out of
 * a single kmem_alloc'd buffer, freed at the end.
 * NOTE(review): this excerpt is elided; the construction of `code`, the
 * used[] tail, and the return are not visible here.  `code` appears to
 * encode which parity columns were used (asserted < 2^MAXPARITY) —
 * confirm against the full source.
 */
1300 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1304 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1305 int parity_map[VDEV_RAIDZ_MAXPARITY];
1310 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1311 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
/* n is the number of data columns. */
1317 n = rm->rm_cols - rm->rm_firstdatacol;
1320 * Figure out which data columns are missing.
1323 for (t = 0; t < ntgts; t++) {
1324 if (tgts[t] >= rm->rm_firstdatacol) {
1325 missing_rows[nmissing_rows++] =
1326 tgts[t] - rm->rm_firstdatacol;
1331 * Figure out which parity columns to use to help generate the missing
1334 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1336 ASSERT(c < rm->rm_firstdatacol);
1339 * Skip any targeted parity columns.
1341 if (c == tgts[tt]) {
1353 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
/* One allocation covers rows, invrows, and the used[] column list. */
1355 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1356 nmissing_rows * n + sizeof (used[0]) * n;
1357 p = kmem_alloc(psize, KM_SLEEP);
1359 for (pp = p, i = 0; i < nmissing_rows; i++) {
/* used[]: first the chosen parity columns, then the surviving data. */
1367 for (i = 0; i < nmissing_rows; i++) {
1368 used[i] = parity_map[i];
1371 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1372 if (tt < nmissing_rows &&
1373 c == missing_rows[tt] + rm->rm_firstdatacol) {
1384 * Initialize the interesting rows of the matrix.
1386 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1389 * Invert the matrix.
1391 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1395 * Reconstruct the missing data using the generated matrix.
1397 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1400 kmem_free(p, psize);
/*
 * Top-level reconstruction dispatcher.  t[] is the sorted list of nt
 * known-bad columns; additional columns with recorded errors are folded
 * in below.  tgts[] is ordered parity-first so dt points at the first
 * bad data column.  When possible we use the optimized P, Q, or P+Q
 * routines (unless vdev_raidz_default_to_general forces the general
 * path); otherwise we fall back to vdev_raidz_reconstruct_general().
 * The return value is the code identifying which parity was used.
 * NOTE(review): this excerpt is elided; the switch/case structure that
 * selects among the optimized routines is not fully visible.
 */
1406 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1408 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1412 int nbadparity, nbaddata;
1413 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1416 * The tgts list must already be sorted.
1418 for (i = 1; i < nt; i++) {
1419 ASSERT(t[i] > t[i - 1]);
/* Start pessimistic: all parity bad, all data good; adjust below. */
1422 nbadparity = rm->rm_firstdatacol;
1423 nbaddata = rm->rm_cols - nbadparity;
1425 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1426 if (c < rm->rm_firstdatacol)
1427 parity_valid[c] = B_FALSE;
1429 if (i < nt && c == t[i]) {
1432 } else if (rm->rm_col[c].rc_error != 0) {
1434 } else if (c >= rm->rm_firstdatacol) {
1437 parity_valid[c] = B_TRUE;
1442 ASSERT(ntgts >= nt);
1443 ASSERT(nbaddata >= 0);
1444 ASSERT(nbaddata + nbadparity == ntgts);
/* dt skips past the bad parity entries to the bad data columns. */
1446 dt = &tgts[nbadparity];
1449 * See if we can use any of our optimized reconstruction routines.
1451 if (!vdev_raidz_default_to_general) {
1454 if (parity_valid[VDEV_RAIDZ_P])
1455 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1457 ASSERT(rm->rm_firstdatacol > 1);
1459 if (parity_valid[VDEV_RAIDZ_Q])
1460 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1462 ASSERT(rm->rm_firstdatacol > 2);
1466 ASSERT(rm->rm_firstdatacol > 1);
1468 if (parity_valid[VDEV_RAIDZ_P] &&
1469 parity_valid[VDEV_RAIDZ_Q])
1470 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1472 ASSERT(rm->rm_firstdatacol > 2);
/* Fall back to the general matrix-based reconstruction. */
1478 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1479 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
/*
 * Open a RAID-Z vdev: validate the configuration, open all children,
 * and derive the aggregate asize/max_asize and ashift values.  The
 * vdev remains usable as long as no more than nparity children fail
 * to open.
 * NOTE(review): this excerpt is elided; the declarations of lasterror,
 * numerrors, c, cvd and the final return are not visible.
 */
1485 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1486 uint64_t *logical_ashift, uint64_t *physical_ashift)
1489 uint64_t nparity = vd->vdev_nparity;
1494 ASSERT(nparity > 0);
/* Reject impossible configs: too much parity or too few children. */
1496 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1497 vd->vdev_children < nparity + 1) {
1498 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1499 return (SET_ERROR(EINVAL));
1502 vdev_open_children(vd);
1504 for (c = 0; c < vd->vdev_children; c++) {
1505 cvd = vd->vdev_child[c];
1507 if (cvd->vdev_open_error != 0) {
1508 lasterror = cvd->vdev_open_error;
/*
 * The usable size is limited by the smallest child; the -1/+1
 * dance keeps the MIN well-defined for the initial value.
 */
1513 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1514 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1515 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1516 *physical_ashift = MAX(*physical_ashift,
1517 cvd->vdev_physical_ashift);
1520 *asize *= vd->vdev_children;
1521 *max_asize *= vd->vdev_children;
/* More failed children than parity means no replicas are available. */
1523 if (numerrors > nparity) {
1524 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
/*
 * Close a RAID-Z vdev by closing each of its children.
 */
1532 vdev_raidz_close(vdev_t *vd)
1536 for (c = 0; c < vd->vdev_children; c++)
1537 vdev_close(vd->vdev_child[c]);
1542 * Handle a read or write I/O to a RAID-Z dump device.
1544 * The dump device is in a unique situation compared to other ZFS datasets:
1545 * writing to this device should be as simple and fast as possible. In
1546 * addition, durability matters much less since the dump will be extracted
1547 * once the machine reboots. For that reason, this function eschews parity for
1548 * performance and simplicity. The dump device uses the checksum setting
1549 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1552 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
1553 * 128 KB will not fill an entire block; in addition, they may not be properly
1554 * aligned. In that case, this function uses the preallocated 128 KB block and
1555 * omits reading or writing any "empty" portions of that block, as opposed to
1556 * allocating a fresh appropriately-sized block.
1558 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1560 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1562 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1563 * allocated which spans all five child vdevs. 8 KB of data would be written to
1564 * each of four vdevs, with the fifth containing the parity bits.
1566 * parity data data data data
1567 * | PP | XX | XX | XX | XX |
1570 * 8 KB parity ------8 KB data blocks------
1572 * However, when writing to the dump device, the behavior is different:
1574 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1576 * Unlike the normal RAID-Z case in which the block is allocated based on the
1577 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
1578 * I/O size is less than 128 KB, only the actual portions of data are written.
1579 * In this example the data is written to the third data vdev since that vdev
1580 * contains the offset [64 KB, 96 KB).
1582 * parity data data data data
1588 * As a result, an individual I/O may not span all child vdevs; moreover, a
1589 * small I/O may only operate on a single child vdev.
1591 * Note that since there are no parity bits calculated or written, this format
1592 * remains the same no matter how many parity bits are used in a normal RAID-Z
1593 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
1596 * parity parity parity data data data data
1597 * | | | | | | XX | |
/*
 * Perform a direct (dump-path) read or write against a RAID-Z vdev.
 * See the block comment above: no parity is generated; the I/O is
 * mapped onto a full SPA_OLD_MAXBLOCKSIZE (128 KB) raidz map starting
 * at origoffset, and only the columns overlapping [offset, offset+size)
 * are touched via vdev_disk_physio().
 * NOTE(review): this excerpt is elided; the initialization of start/end
 * and the final return value are not visible.
 */
1603 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1604 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1606 vdev_t *tvd = vd->vdev_top;
1612 uint64_t start, end, colstart, colend;
1613 uint64_t coloffset, colsize, colskip;
1615 int flags = doread ? BIO_READ : BIO_WRITE;
1620 * Don't write past the end of the block
1622 VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1628 * Allocate a RAID-Z map for this block. Note that this block starts
1629 * from the "original" offset, that is, the offset of the extent which
1630 * contains the requisite offset of the data being read or written.
1632 * Even if this I/O operation doesn't span the full block size, let's
1633 * treat the on-disk format as if the only blocks are the complete 128
1636 rm = vdev_raidz_map_alloc(data - (offset - origoffset),
1637 SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
1638 vd->vdev_children, vd->vdev_nparity);
1640 coloffset = origoffset;
/* Walk the data columns only; parity columns are skipped entirely. */
1642 for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1643 c++, coloffset += rc->rc_size) {
1644 rc = &rm->rm_col[c];
1645 cvd = vd->vdev_child[rc->rc_devidx];
1648 * Find the start and end of this column in the RAID-Z map,
1649 * keeping in mind that the stated size and offset of the
1650 * operation may not fill the entire column for this vdev.
1652 * If any portion of the data spans this column, issue the
1653 * appropriate operation to the vdev.
1655 if (coloffset + rc->rc_size <= start)
1657 if (coloffset >= end)
1660 colstart = MAX(coloffset, start);
1661 colend = MIN(end, coloffset + rc->rc_size);
1662 colsize = colend - colstart;
1663 colskip = colstart - coloffset;
1665 VERIFY3U(colsize, <=, rc->rc_size);
1666 VERIFY3U(colskip, <=, rc->rc_size);
1669 * Note that the child vdev will have a vdev label at the start
1670 * of its range of offsets, hence the need for
1671 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
1672 * example of why this calculation is needed.
1674 if ((err = vdev_disk_physio(cvd,
1675 ((char *)rc->rc_data) + colskip, colsize,
1676 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1677 flags, isdump)) != 0)
1681 vdev_raidz_map_free(rm);
/*
 * Convert a logical (physical data) size to the allocated size on a
 * RAID-Z vdev: round psize up to whole sectors, add one parity sector
 * per stripe of (cols - nparity) data sectors, then round the total up
 * to a multiple of (nparity + 1) sectors so a stripe can never be
 * smaller than nparity + 1 devices.
 */
1689 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1692 uint64_t ashift = vd->vdev_top->vdev_ashift;
1693 uint64_t cols = vd->vdev_children;
1694 uint64_t nparity = vd->vdev_nparity;
1696 asize = ((psize - 1) >> ashift) + 1;
1697 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1698 asize = roundup(asize, nparity + 1) << ashift;
/*
 * Child-I/O completion callback: record the child's error status in its
 * raidz column (rc was passed as the child zio's private data).
 */
1704 vdev_raidz_child_done(zio_t *zio)
1706 raidz_col_t *rc = zio->io_private;
1708 rc->rc_error = zio->io_error;
1714 * Start an IO operation on a RAIDZ VDev
1717 * - For write operations:
1718 * 1. Generate the parity data
1719 * 2. Create child zio write operations to each column's vdev, for both
1721 * 3. If the column skips any sectors for padding, create optional dummy
1722 * write zio children for those areas to improve aggregation continuity.
1723 * - For read operations:
1724 * 1. Create child zio read operations to each data column's vdev to read
1725 * the range of data required for zio.
1726 * 2. If this is a scrub or resilver operation, or if any of the data
1727 * vdevs have had errors, then create zio read operations to the parity
1728 * columns' VDevs as well.
/*
 * Start a RAID-Z I/O (see the block comment above for the per-type
 * strategy).  Allocates the raidz map for this zio, then issues child
 * I/Os: all columns for FREE and WRITE (plus optional skip-sector
 * writes), and for READ the data columns plus parity when needed.
 * NOTE(review): this excerpt is elided; the zio->io_vsd assignment and
 * the zio_execute/return tail are not visible.
 */
1731 vdev_raidz_io_start(zio_t *zio)
1733 vdev_t *vd = zio->io_vd;
1734 vdev_t *tvd = vd->vdev_top;
1740 rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
1741 zio->io_type == ZIO_TYPE_FREE,
1742 tvd->vdev_ashift, vd->vdev_children,
1746 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1748 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
/* FREE: forward the free to every column's child vdev. */
1750 if (zio->io_type == ZIO_TYPE_FREE) {
1751 for (c = 0; c < rm->rm_cols; c++) {
1752 rc = &rm->rm_col[c];
1753 cvd = vd->vdev_child[rc->rc_devidx];
1754 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1755 rc->rc_offset, rc->rc_data, rc->rc_size,
1756 zio->io_type, zio->io_priority, 0,
1757 vdev_raidz_child_done, rc));
/* WRITE: generate parity, then write every column. */
1764 if (zio->io_type == ZIO_TYPE_WRITE) {
1765 vdev_raidz_generate_parity(rm);
1767 for (c = 0; c < rm->rm_cols; c++) {
1768 rc = &rm->rm_col[c];
1769 cvd = vd->vdev_child[rc->rc_devidx];
1770 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1771 rc->rc_offset, rc->rc_data, rc->rc_size,
1772 zio->io_type, zio->io_priority, 0,
1773 vdev_raidz_child_done, rc));
1777 * Generate optional I/Os for any skipped sectors to improve
1778 * aggregation contiguity.
1780 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1781 ASSERT(c <= rm->rm_scols);
1782 if (c == rm->rm_scols)
1784 rc = &rm->rm_col[c];
1785 cvd = vd->vdev_child[rc->rc_devidx];
1786 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1787 rc->rc_offset + rc->rc_size, NULL,
1788 1 << tvd->vdev_ashift,
1789 zio->io_type, zio->io_priority,
1790 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1797 ASSERT(zio->io_type == ZIO_TYPE_READ);
1800 * Iterate over the columns in reverse order so that we hit the parity
1801 * last -- any errors along the way will force us to read the parity.
1803 for (c = rm->rm_cols - 1; c >= 0; c--) {
1804 rc = &rm->rm_col[c];
1805 cvd = vd->vdev_child[rc->rc_devidx];
/* Unreadable child: mark it missing and don't issue an I/O. */
1806 if (!vdev_readable(cvd)) {
1807 if (c >= rm->rm_firstdatacol)
1808 rm->rm_missingdata++;
1810 rm->rm_missingparity++;
1811 rc->rc_error = SET_ERROR(ENXIO);
1812 rc->rc_tried = 1; /* don't even try */
/* Child may have stale data for this txg per its DTL. */
1816 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1817 if (c >= rm->rm_firstdatacol)
1818 rm->rm_missingdata++;
1820 rm->rm_missingparity++;
1821 rc->rc_error = SET_ERROR(ESTALE);
/*
 * Read data columns always; read parity too if data is already
 * known missing or this is a scrub/resilver.
 */
1825 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1826 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1827 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1828 rc->rc_offset, rc->rc_data, rc->rc_size,
1829 zio->io_type, zio->io_priority, 0,
1830 vdev_raidz_child_done, rc));
1839 * Report a checksum error for a child of a RAID-Z device.
/*
 * Bumps the child's checksum-error count and posts a checksum ereport
 * (with the bad data for comparison) unless the zio is speculative.
 */
1842 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1844 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1846 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1847 zio_bad_cksum_t zbc;
1848 raidz_map_t *rm = zio->io_vsd;
1850 mutex_enter(&vd->vdev_stat_lock);
1851 vd->vdev_stat.vs_checksum_errors++;
1852 mutex_exit(&vd->vdev_stat_lock);
/* No actual checksum available; note any injected errors. */
1854 zbc.zbc_has_cksum = 0;
1855 zbc.zbc_injected = rm->rm_ecksuminjected;
1857 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1858 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1864 * We keep track of whether or not there were any injected errors, so that
1865 * any ereports we generate can note it.
/*
 * Verify the zio's checksum; returns zio_checksum_error()'s result
 * (0 on success).  Records on the raidz map whether the failure was
 * due to an injected error.
 */
1868 raidz_checksum_verify(zio_t *zio)
1870 zio_bad_cksum_t zbc;
1871 raidz_map_t *rm = zio->io_vsd;
1873 int ret = zio_checksum_error(zio, &zbc);
1874 if (ret != 0 && zbc.zbc_injected != 0)
1875 rm->rm_ecksuminjected = 1;
1881 * Generate the parity from the data columns. If we tried and were able to
1882 * read the parity without error, verify that the generated parity matches the
1883 * data we read. If it doesn't, we fire off a checksum error. Return the
1884 * number of such failures.
1887 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1889 void *orig[VDEV_RAIDZ_MAXPARITY];
1893 blkptr_t *bp = zio->io_bp;
1894 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1895 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
/* Dump-device blocks carry no parity, so there is nothing to verify. */
1897 if (checksum == ZIO_CHECKSUM_NOPARITY)
/* Save a copy of each parity column that was read successfully. */
1900 for (c = 0; c < rm->rm_firstdatacol; c++) {
1901 rc = &rm->rm_col[c];
1902 if (!rc->rc_tried || rc->rc_error != 0)
1904 orig[c] = zio_buf_alloc(rc->rc_size);
1905 bcopy(rc->rc_data, orig[c], rc->rc_size);
/* Regenerate parity in place from the data columns. */
1908 vdev_raidz_generate_parity(rm);
/* Compare regenerated parity to what was read; report mismatches. */
1910 for (c = 0; c < rm->rm_firstdatacol; c++) {
1911 rc = &rm->rm_col[c];
1912 if (!rc->rc_tried || rc->rc_error != 0)
1914 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1915 raidz_checksum_error(zio, rc, orig[c]);
1916 rc->rc_error = SET_ERROR(ECKSUM);
1919 zio_buf_free(orig[c], rc->rc_size);
1926 * Keep statistics on all the ways that we used parity to correct data.
/* Indexed by the parity-combination code returned by reconstruction. */
1928 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
/*
 * Return the most severe error recorded across all columns of the map,
 * as ranked by zio_worst_error().
 */
1931 vdev_raidz_worst_error(raidz_map_t *rm)
1935 for (int c = 0; c < rm->rm_cols; c++)
1936 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1942 * Iterate over all combinations of bad data and attempt a reconstruction.
1943 * Note that the algorithm below is non-optimal because it doesn't take into
1944 * account how reconstruction is actually performed. For example, with
1945 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1946 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1947 * cases we'd only use parity information in column 0.
/*
 * For n = 1 .. (parity - total_errors) extra assumed-bad columns, try
 * every combination of n error-free columns as reconstruction targets;
 * on each attempt, save the original column data, reconstruct, and
 * check the zio checksum.  Returns the successful reconstruction code,
 * or (per the structure visible here) falls through when all
 * combinations fail.
 * NOTE(review): this excerpt is elided; the loop bookkeeping around
 * `current`/`next` and the success/failure exits are only partially
 * visible.
 */
1950 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1952 raidz_map_t *rm = zio->io_vsd;
1954 void *orig[VDEV_RAIDZ_MAXPARITY];
/* tstore[0] is a sentinel so tgts[-1] is valid; tgts[n] is set below. */
1955 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1956 int *tgts = &tstore[1];
1957 int current, next, i, c, n;
1960 ASSERT(total_errors < rm->rm_firstdatacol);
1963 * This simplifies one edge condition.
1967 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1969 * Initialize the targets array by finding the first n columns
1970 * that contain no error.
1972 * If there were no data errors, we need to ensure that we're
1973 * always explicitly attempting to reconstruct at least one
1974 * data column. To do this, we simply push the highest target
1975 * up into the data columns.
1977 for (c = 0, i = 0; i < n; i++) {
1978 if (i == n - 1 && data_errors == 0 &&
1979 c < rm->rm_firstdatacol) {
1980 c = rm->rm_firstdatacol;
1983 while (rm->rm_col[c].rc_error != 0) {
1985 ASSERT3S(c, <, rm->rm_cols);
1992 * Setting tgts[n] simplifies the other edge condition.
1994 tgts[n] = rm->rm_cols;
1997 * These buffers were allocated in previous iterations.
1999 for (i = 0; i < n - 1; i++) {
2000 ASSERT(orig[i] != NULL);
/* All columns share rc_size, so column 0's size works for any. */
2003 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2006 next = tgts[current];
2008 while (current != n) {
2009 tgts[current] = next;
2013 * Save off the original data that we're going to
2014 * attempt to reconstruct.
2016 for (i = 0; i < n; i++) {
2017 ASSERT(orig[i] != NULL);
2020 ASSERT3S(c, <, rm->rm_cols);
2021 rc = &rm->rm_col[c];
2022 bcopy(rc->rc_data, orig[i], rc->rc_size);
2026 * Attempt a reconstruction and exit the outer loop on
2029 code = vdev_raidz_reconstruct(rm, tgts, n);
2030 if (raidz_checksum_verify(zio) == 0) {
2031 atomic_inc_64(&raidz_corrected[code]);
/* Flag each reconstructed column as a checksum error. */
2033 for (i = 0; i < n; i++) {
2035 rc = &rm->rm_col[c];
2036 ASSERT(rc->rc_error == 0);
2038 raidz_checksum_error(zio, rc,
2040 rc->rc_error = SET_ERROR(ECKSUM);
2048 * Restore the original data.
2050 for (i = 0; i < n; i++) {
2052 rc = &rm->rm_col[c];
2053 bcopy(orig[i], rc->rc_data, rc->rc_size);
2058 * Find the next valid column after the current
2061 for (next = tgts[current] + 1;
2062 next < rm->rm_cols &&
2063 rm->rm_col[next].rc_error != 0; next++)
2066 ASSERT(next <= tgts[current + 1]);
2069 * If that spot is available, we're done here.
2071 if (next != tgts[current + 1])
2075 * Otherwise, find the next valid column after
2076 * the previous position.
2078 for (c = tgts[current - 1] + 1;
2079 rm->rm_col[c].rc_error != 0; c++)
2085 } while (current != n);
/* Release the per-combination scratch buffers. */
2090 for (i = 0; i < n; i++) {
2091 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2098 * Complete an IO operation on a RAIDZ VDev
2101 * - For write operations:
2102 * 1. Check for errors on the child IOs.
2103 * 2. Return, setting an error code if too few child VDevs were written
2104 * to reconstruct the data later. Note that partial writes are
2105 * considered successful if they can be reconstructed at all.
2106 * - For read operations:
2107 * 1. Check for errors on the child IOs.
2108 * 2. If data errors occurred:
2109 * a. Try to reassemble the data from the parity available.
2110 * b. If we haven't yet read the parity drives, read them now.
2111 * c. If all parity drives have been read but the data still doesn't
2112 * reassemble with a correct checksum, then try combinatorial
2114 * d. If that doesn't work, return an error.
2115 * 3. If there were unexpected errors or this is a resilver operation,
2116 * rewrite the vdevs that had errors.
/*
 * Complete a RAID-Z I/O (see the block comment above for the per-type
 * strategy): tally per-column errors, then for reads walk the three
 * recovery phases (use what we read, re-read everything, combinatorial
 * reconstruction), and finally self-heal any damaged children when the
 * pool is writeable.
 * NOTE(review): this excerpt is elided; several declarations, the error
 * tally details, and some control-flow lines are not visible.
 */
2119 vdev_raidz_io_done(zio_t *zio)
2121 vdev_t *vd = zio->io_vd;
2123 raidz_map_t *rm = zio->io_vsd;
2125 int unexpected_errors = 0;
2126 int parity_errors = 0;
2127 int parity_untried = 0;
2128 int data_errors = 0;
2129 int total_errors = 0;
2131 int tgts[VDEV_RAIDZ_MAXPARITY];
2134 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
2136 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2137 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
/* Classify each column's outcome into the error counters above. */
2139 for (c = 0; c < rm->rm_cols; c++) {
2140 rc = &rm->rm_col[c];
2143 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
2145 if (c < rm->rm_firstdatacol)
2150 if (!rc->rc_skipped)
2151 unexpected_errors++;
2154 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2159 if (zio->io_type == ZIO_TYPE_WRITE) {
2161 * XXX -- for now, treat partial writes as a success.
2162 * (If we couldn't write enough columns to reconstruct
2163 * the data, the I/O failed. Otherwise, good enough.)
2165 * Now that we support write reallocation, it would be better
2166 * to treat partial failure as real failure unless there are
2167 * no non-degraded top-level vdevs left, and not update DTLs
2168 * if we intend to reallocate.
2171 if (total_errors > rm->rm_firstdatacol)
2172 zio->io_error = vdev_raidz_worst_error(rm);
2175 } else if (zio->io_type == ZIO_TYPE_FREE) {
2179 ASSERT(zio->io_type == ZIO_TYPE_READ);
2181 * There are three potential phases for a read:
2182 * 1. produce valid data from the columns read
2183 * 2. read all disks and try again
2184 * 3. perform combinatorial reconstruction
2186 * Each phase is progressively both more expensive and less likely to
2187 * occur. If we encounter more errors than we can repair or all phases
2188 * fail, we have no choice but to return an error.
2192 * If the number of errors we saw was correctable -- less than or equal
2193 * to the number of parity disks read -- attempt to produce data that
2194 * has a valid checksum. Naturally, this case applies in the absence of
2197 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2198 if (data_errors == 0) {
2199 if (raidz_checksum_verify(zio) == 0) {
2201 * If we read parity information (unnecessarily
2202 * as it happens since no reconstruction was
2203 * needed) regenerate and verify the parity.
2204 * We also regenerate parity when resilvering
2205 * so we can write it out to the failed device
2208 if (parity_errors + parity_untried <
2209 rm->rm_firstdatacol ||
2210 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2211 n = raidz_parity_verify(zio, rm);
2212 unexpected_errors += n;
2213 ASSERT(parity_errors + n <=
2214 rm->rm_firstdatacol);
2220 * We either attempt to read all the parity columns or
2221 * none of them. If we didn't try to read parity, we
2222 * wouldn't be here in the correctable case. There must
2223 * also have been fewer parity errors than parity
2224 * columns or, again, we wouldn't be in this code path.
2226 ASSERT(parity_untried == 0);
2227 ASSERT(parity_errors < rm->rm_firstdatacol);
2230 * Identify the data columns that reported an error.
2233 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2234 rc = &rm->rm_col[c];
2235 if (rc->rc_error != 0) {
2236 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2241 ASSERT(rm->rm_firstdatacol >= n);
2243 code = vdev_raidz_reconstruct(rm, tgts, n);
2245 if (raidz_checksum_verify(zio) == 0) {
2246 atomic_inc_64(&raidz_corrected[code]);
2249 * If we read more parity disks than were used
2250 * for reconstruction, confirm that the other
2251 * parity disks produced correct data. This
2252 * routine is suboptimal in that it regenerates
2253 * the parity that we already used in addition
2254 * to the parity that we're attempting to
2255 * verify, but this should be a relatively
2256 * uncommon case, and can be optimized if it
2257 * becomes a problem. Note that we regenerate
2258 * parity when resilvering so we can write it
2259 * out to failed devices later.
2261 if (parity_errors < rm->rm_firstdatacol - n ||
2262 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2263 n = raidz_parity_verify(zio, rm);
2264 unexpected_errors += n;
2265 ASSERT(parity_errors + n <=
2266 rm->rm_firstdatacol);
2275 * This isn't a typical situation -- either we got a read error or
2276 * a child silently returned bad data. Read every block so we can
2277 * try again with as much data and parity as we can track down. If
2278 * we've already been through once before, all children will be marked
2279 * as tried so we'll proceed to combinatorial reconstruction.
2281 unexpected_errors = 1;
2282 rm->rm_missingdata = 0;
2283 rm->rm_missingparity = 0;
2285 for (c = 0; c < rm->rm_cols; c++) {
2286 if (rm->rm_col[c].rc_tried)
/* Re-queue this zio and issue reads for the untried columns. */
2289 zio_vdev_io_redone(zio);
2291 rc = &rm->rm_col[c];
2294 zio_nowait(zio_vdev_child_io(zio, NULL,
2295 vd->vdev_child[rc->rc_devidx],
2296 rc->rc_offset, rc->rc_data, rc->rc_size,
2297 zio->io_type, zio->io_priority, 0,
2298 vdev_raidz_child_done, rc));
2299 } while (++c < rm->rm_cols);
2305 * At this point we've attempted to reconstruct the data given the
2306 * errors we detected, and we've attempted to read all columns. There
2307 * must, therefore, be one or more additional problems -- silent errors
2308 * resulting in invalid data rather than explicit I/O errors resulting
2309 * in absent data. We check if there is enough additional data to
2310 * possibly reconstruct the data and then perform combinatorial
2311 * reconstruction over all possible combinations. If that fails,
2314 if (total_errors > rm->rm_firstdatacol) {
2315 zio->io_error = vdev_raidz_worst_error(rm);
2317 } else if (total_errors < rm->rm_firstdatacol &&
2318 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2320 * If we didn't use all the available parity for the
2321 * combinatorial reconstruction, verify that the remaining
2322 * parity is correct.
2324 if (code != (1 << rm->rm_firstdatacol) - 1)
2325 (void) raidz_parity_verify(zio, rm);
2328 * We're here because either:
2330 * total_errors == rm_firstdatacol, or
2331 * vdev_raidz_combrec() failed
2333 * In either case, there is enough bad data to prevent
2336 * Start checksum ereports for all children which haven't
2337 * failed, and the IO wasn't speculative.
2339 zio->io_error = SET_ERROR(ECKSUM);
2341 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2342 for (c = 0; c < rm->rm_cols; c++) {
2343 rc = &rm->rm_col[c];
2344 if (rc->rc_error == 0) {
2345 zio_bad_cksum_t zbc;
2346 zbc.zbc_has_cksum = 0;
2348 rm->rm_ecksuminjected;
2350 zfs_ereport_start_checksum(
2352 vd->vdev_child[rc->rc_devidx],
2353 zio, rc->rc_offset, rc->rc_size,
2354 (void *)(uintptr_t)c, &zbc);
2361 zio_checksum_verified(zio);
/* Self-heal: rewrite bad columns from the reconstructed data. */
2363 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2364 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2366 * Use the good data we have in hand to repair damaged children.
2368 for (c = 0; c < rm->rm_cols; c++) {
2369 rc = &rm->rm_col[c];
2370 cvd = vd->vdev_child[rc->rc_devidx];
2372 if (rc->rc_error == 0)
2375 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2376 rc->rc_offset, rc->rc_data, rc->rc_size,
2377 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2378 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2379 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
/*
 * Propagate child state changes to the RAID-Z vdev: can't open if more
 * children are faulted than parity can cover, degraded if any child is
 * faulted or degraded, healthy otherwise.
 */
2385 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2387 if (faulted > vd->vdev_nparity)
2388 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2389 VDEV_AUX_NO_REPLICAS);
2390 else if (degraded + faulted != 0)
2391 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2393 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
/*
 * Operations vector registering the RAID-Z vdev type.
 * NOTE(review): this excerpt is elided; only some of the function
 * pointers (open/close/asize/io_done, etc.) are visible here.
 */
2396 vdev_ops_t vdev_raidz_ops = {
2400 vdev_raidz_io_start,
2402 vdev_raidz_state_change,
2405 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2406 B_FALSE /* not a leaf vdev */