4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
29 #include <sys/zfs_context.h>
31 #include <sys/vdev_impl.h>
33 #include <sys/vdev_disk.h>
35 #include <sys/vdev_file.h>
36 #include <sys/vdev_raidz.h>
38 #include <sys/zio_checksum.h>
40 #include <sys/fs/zfs.h>
41 #include <sys/fm/fs/zfs.h>
45 * Virtual device vector for RAID-Z.
47 * This vdev supports single, double, and triple parity. For single parity,
48 * we use a simple XOR of all the data columns. For double or triple parity,
49 * we use a special case of Reed-Solomon coding. This extends the
50 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
51 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
52 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
53 * former is also based. The latter is designed to provide higher performance
56 * Note that the Plank paper claimed to support arbitrary N+M, but was then
57 * amended six years later identifying a critical flaw that invalidates its
58 * claims. Nevertheless, the technique can be adapted to work for up to
59 * triple parity. For additional parity, the amendment "Note: Correction to
60 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
61 * is viable, but the additional complexity means that write performance will
64 * All of the methods above operate on a Galois field, defined over the
65 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
66 * can be expressed with a single byte. Briefly, the operations on the
67 * field are defined as follows:
69 * o addition (+) is represented by a bitwise XOR
70 * o subtraction (-) is therefore identical to addition: A + B = A - B
71 * o multiplication of A by 2 is defined by the following bitwise expression:
76 * (A * 2)_4 = A_3 + A_7
77 * (A * 2)_3 = A_2 + A_7
78 * (A * 2)_2 = A_1 + A_7
82 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
83 * As an aside, this multiplication is derived from the error correcting
84 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
86 * Observe that any number in the field (except for 0) can be expressed as a
87 * power of 2 -- a generator for the field. We store a table of the powers of
88 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
89 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
90 * than field addition). The inverse of a field element A (A^-1) is therefore
91 * A ^ (255 - 1) = A^254.
93 * The up-to-three parity columns, P, Q, R over several data columns,
94 * D_0, ... D_n-1, can be expressed by field operations:
96 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
97 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
98 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
99 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
100 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
102 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
103 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
104 * independent coefficients. (There are no additional coefficients that have
105 * this property which is why the uncorrected Plank method breaks down.)
107 * See the reconstruction code below for how P, Q and R can be used individually
108 * or in concert to recover missing data columns.
/*
 * Per-column state for a single RAID-Z I/O.  Each column maps to one
 * child vdev and carries its own buffer, error status, and the
 * tried/skipped bookkeeping used by combinatorial reconstruction.
 * NOTE(review): this chunk is missing lines -- the closing
 * "} raidz_col_t;" is not visible here.
 */
111 typedef struct raidz_col {
112 uint64_t rc_devidx; /* child device index for I/O */
113 uint64_t rc_offset; /* device offset */
114 uint64_t rc_size; /* I/O size */
115 abd_t *rc_abd; /* I/O data */
116 void *rc_gdata; /* used to store the "good" version */
117 int rc_error; /* I/O error for this device */
118 uint8_t rc_tried; /* Did we attempt this I/O column? */
119 uint8_t rc_skipped; /* Did we skip this I/O column? */
/*
 * Geometry and state for one entire RAID-Z I/O: the array of columns
 * (parity columns first, then data columns), size/padding bookkeeping,
 * and the copy of read data retained for checksum ereports.
 * rm_col[1] is the pre-C99 trailing-array idiom; the map is allocated
 * with offsetof(raidz_map_t, rm_col[rm_scols]) (see vdev_raidz_map_alloc
 * and vdev_raidz_map_free below).
 * NOTE(review): missing lines -- the closing "} raidz_map_t;" is not
 * visible in this chunk.
 */
122 typedef struct raidz_map {
123 uint64_t rm_cols; /* Regular column count */
124 uint64_t rm_scols; /* Count including skipped columns */
125 uint64_t rm_bigcols; /* Number of oversized columns */
126 uint64_t rm_asize; /* Actual total I/O size */
127 uint64_t rm_missingdata; /* Count of missing data devices */
128 uint64_t rm_missingparity; /* Count of missing parity devices */
129 uint64_t rm_firstdatacol; /* First data column/parity count */
130 uint64_t rm_nskip; /* Skipped sectors for padding */
131 uint64_t rm_skipstart; /* Column index of padding start */
132 abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
133 uintptr_t rm_reports; /* # of referencing checksum reports */
134 uint8_t rm_freed; /* map no longer has referencing ZIO */
135 uint8_t rm_ecksuminjected; /* checksum error was injected */
136 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
/* Parity column indices within rm_col[] -- parity columns come first. */
139 #define VDEV_RAIDZ_P 0
140 #define VDEV_RAIDZ_Q 1
141 #define VDEV_RAIDZ_R 2
/*
 * Single-byte GF(2^8) multiply-by-2 (and by-4): shift left and, when the
 * high bit was set, reduce by XORing the primitive polynomial's low byte
 * (0x1d, from x^8 + x^4 + x^3 + x^2 + 1 -- see the block comment above).
 */
143 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
144 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
147 * We provide a mechanism to perform the field multiplication operation on a
148 * 64-bit value all at once rather than a byte at a time. This works by
149 * creating a mask from the top bit in each byte and using that to
150 * conditionally apply the XOR of 0x1d.
/*
 * NOTE(review): the "do {" / "} while (0)" wrapper lines of these two
 * multi-statement macros are missing from this chunk; they are required
 * for the macros to be statement-safe.
 */
152 #define VDEV_RAIDZ_64MUL_2(x, mask) \
154 (mask) = (x) & 0x8080808080808080ULL; \
/* ((mask) << 1) - ((mask) >> 7) turns each 0x80 byte into 0xff, 0x00 stays 0x00 */
155 (mask) = ((mask) << 1) - ((mask) >> 7); \
156 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
157 ((mask) & 0x1d1d1d1d1d1d1d1d); \
160 #define VDEV_RAIDZ_64MUL_4(x, mask) \
162 VDEV_RAIDZ_64MUL_2((x), mask); \
163 VDEV_RAIDZ_64MUL_2((x), mask); \
/* Translate an offset within the vdev label area to a device offset. */
166 #define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE)
169 * Force reconstruction to use the general purpose method.
/* Tunable: nonzero forces the matrix-based general reconstruction path. */
171 int vdev_raidz_default_to_general;
173 /* Powers of 2 in the Galois field defined above. */
/*
 * vdev_raidz_pow2[i] == 2^i in GF(2^8).  Since the multiplicative order
 * of 2 is 255, the table wraps: pow2[255] == pow2[0] == 0x01.
 * NOTE(review): the closing "};" line is not visible in this chunk.
 */
174 static const uint8_t vdev_raidz_pow2[256] = {
175 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
176 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
177 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
178 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
179 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
180 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
181 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
182 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
183 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
184 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
185 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
186 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
187 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
188 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
189 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
190 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
191 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
192 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
193 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
194 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
195 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
196 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
197 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
198 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
199 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
200 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
201 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
202 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
203 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
204 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
205 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
206 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
208 /* Logs of 2 in the Galois field defined above. */
/*
 * vdev_raidz_log2[v] == log base 2 of v in GF(2^8), i.e. the inverse
 * lookup of vdev_raidz_pow2[].  log2[0] is a placeholder (log of zero
 * is undefined); callers must not rely on it -- see the ASSERT in
 * vdev_raidz_exp2().
 * NOTE(review): the closing "};" line is not visible in this chunk.
 */
209 static const uint8_t vdev_raidz_log2[256] = {
210 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
211 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
212 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
213 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
214 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
215 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
216 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
217 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
218 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
219 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
220 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
221 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
222 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
223 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
224 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
225 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
226 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
227 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
228 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
229 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
230 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
231 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
232 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
233 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
234 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
235 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
236 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
237 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
238 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
239 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
240 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
241 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
/* Forward declaration: vdev_raidz_cksum_finish() below needs it. */
244 static void vdev_raidz_generate_parity(raidz_map_t *rm);
247 * Multiply a given number by 2 raised to the given power.
/*
 * Computes a * 2^exp in GF(2^8) via the log/pow tables:
 * a * 2^exp == pow2[log2[a] + exp].
 * NOTE(review): lines are missing from this chunk -- the return type,
 * braces, the a == 0 early return, and the exp > 255 wrap-around
 * (reducing mod the field's multiplicative order, 255) are not visible.
 */
250 vdev_raidz_exp2(uint_t a, int exp)
/* log2[a] is only meaningful for nonzero a; log2[1] == 0 is the one valid 0. */
256 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
258 exp += vdev_raidz_log2[a];
262 return (vdev_raidz_pow2[exp]);
/*
 * Release all resources held by a raidz_map_t.  Parity columns own
 * their buffers (abd_free + the "good data" shadow buffers); data
 * columns only borrowed views of the zio's buffer (abd_put).  Finally
 * free the ereport data copy and the map itself.
 * NOTE(review): the function header/braces and some declarations are
 * missing from this chunk.
 */
266 vdev_raidz_map_free(raidz_map_t *rm)
/* Parity columns: allocated in vdev_raidz_map_alloc(), so really freed. */
271 for (c = 0; c < rm->rm_firstdatacol; c++) {
272 if (rm->rm_col[c].rc_abd != NULL)
273 abd_free(rm->rm_col[c].rc_abd);
275 if (rm->rm_col[c].rc_gdata != NULL)
276 zio_buf_free(rm->rm_col[c].rc_gdata,
277 rm->rm_col[c].rc_size);
/* Data columns: views into the caller's buffer, only drop the reference. */
281 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
282 if (rm->rm_col[c].rc_abd != NULL)
283 abd_put(rm->rm_col[c].rc_abd);
284 size += rm->rm_col[c].rc_size;
287 if (rm->rm_abd_copy != NULL)
288 abd_free(rm->rm_abd_copy);
/* Map was allocated with rm_scols trailing columns -- free the same size. */
290 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
/*
 * zio vsd destructor.  If checksum reports still reference the map,
 * freeing is deferred to vdev_raidz_cksum_free() (the visible body
 * presumably sets rm_freed before the rm_reports check -- NOTE(review):
 * that line is missing from this chunk).
 */
294 vdev_raidz_map_free_vsd(zio_t *zio)
296 raidz_map_t *rm = zio->io_vsd;
298 ASSERT0(rm->rm_freed);
301 if (rm->rm_reports == 0)
302 vdev_raidz_map_free(rm);
/*
 * Checksum-report destructor: drop one reference on the raidz_map_t;
 * the last reference frees the map iff the owning ZIO already released
 * it (rm_freed set by vdev_raidz_map_free_vsd()).
 */
307 vdev_raidz_cksum_free(void *arg, size_t ignored)
309 raidz_map_t *rm = arg;
311 ASSERT3U(rm->rm_reports, >, 0);
313 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
314 vdev_raidz_map_free(rm);
/*
 * Complete a deferred checksum ereport for column c (zcr_cbinfo):
 * given the known-good logical data, derive the "good" version of this
 * column (for parity columns by regenerating parity from good_data;
 * for data columns by indexing into good_data), then hand good/bad
 * pairs to zfs_ereport_finish_checksum().
 * NOTE(review): several lines (braces, declarations of x/offset/buf/bad,
 * early return after the NULL check) are missing from this chunk.
 */
318 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
320 raidz_map_t *rm = zcr->zcr_cbdata;
321 size_t c = zcr->zcr_cbinfo;
324 const char *good = NULL;
/* No good data available: finish the ereport without a comparison. */
327 if (good_data == NULL) {
328 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
/* Parity column: compare against parity regenerated from good_data. */
332 if (c < rm->rm_firstdatacol) {
334 * The first time through, calculate the parity blocks for
335 * the good data (this relies on the fact that the good
336 * data never changes for a given logical ZIO)
338 if (rm->rm_col[0].rc_gdata == NULL) {
339 abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
344 * Set up the rm_col[]s to generate the parity for
345 * good_data, first saving the parity bufs and
346 * replacing them with buffers to hold the result.
348 for (x = 0; x < rm->rm_firstdatacol; x++) {
349 bad_parity[x] = rm->rm_col[x].rc_abd;
350 rm->rm_col[x].rc_gdata =
351 zio_buf_alloc(rm->rm_col[x].rc_size);
352 rm->rm_col[x].rc_abd =
353 abd_get_from_buf(rm->rm_col[x].rc_gdata,
354 rm->rm_col[x].rc_size);
357 /* fill in the data columns from good_data */
358 buf = (char *)good_data;
359 for (; x < rm->rm_cols; x++) {
360 abd_put(rm->rm_col[x].rc_abd);
361 rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
362 rm->rm_col[x].rc_size);
363 buf += rm->rm_col[x].rc_size;
367 * Construct the parity from the good data.
369 vdev_raidz_generate_parity(rm);
371 /* restore everything back to its original state */
372 for (x = 0; x < rm->rm_firstdatacol; x++) {
373 abd_put(rm->rm_col[x].rc_abd);
374 rm->rm_col[x].rc_abd = bad_parity[x];
/* Re-point data columns at the copy saved by vdev_raidz_cksum_report(). */
378 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
379 abd_put(rm->rm_col[x].rc_abd);
380 rm->rm_col[x].rc_abd = abd_get_offset(
381 rm->rm_abd_copy, offset);
382 offset += rm->rm_col[x].rc_size;
386 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
387 good = rm->rm_col[c].rc_gdata;
389 /* adjust good_data to point at the start of our column */
392 for (x = rm->rm_firstdatacol; x < c; x++)
393 good += rm->rm_col[x].rc_size;
/* Borrow a linear view of the bad column, report, and return it. */
396 bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
397 /* we drop the ereport if it ends up that the data was good */
398 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
399 abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
403 * Invoked indirectly by zfs_ereport_start_checksum(), called
404 * below when our read operation fails completely. The main point
405 * is to keep a copy of everything we read from disk, so that at
406 * vdev_raidz_cksum_finish() time we can compare it with the good data.
/*
 * NOTE(review): lines are missing from this chunk -- the rm_reports
 * increment, the early return after the rm_abd_copy != NULL check, the
 * declarations of size/offset, and the rm_abd_copy assignment line are
 * not visible.
 */
409 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
411 size_t c = (size_t)(uintptr_t)arg;
414 raidz_map_t *rm = zio->io_vsd;
417 /* set up the report and bump the refcount */
418 zcr->zcr_cbdata = rm;
420 zcr->zcr_finish = vdev_raidz_cksum_finish;
421 zcr->zcr_free = vdev_raidz_cksum_free;
424 ASSERT3U(rm->rm_reports, >, 0);
/* Data already copied aside by an earlier report for this map. */
426 if (rm->rm_abd_copy != NULL)
430 * It's the first time we're called for this raidz_map_t, so we need
431 * to copy the data aside; there's no guarantee that our zio's buffer
432 * won't be re-used for something else.
434 * Our parity data is already in separate buffers, so there's no need
/* Total size of all data columns, used to size the copy buffer. */
439 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
440 size += rm->rm_col[c].rc_size;
443 abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
/* Copy each data column into the aside buffer and re-point the column. */
445 for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
446 raidz_col_t *col = &rm->rm_col[c];
447 abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
449 abd_copy(tmp, col->rc_abd, col->rc_size);
450 abd_put(col->rc_abd);
453 offset += col->rc_size;
455 ASSERT3U(offset, ==, size);
/* vsd (vdev-specific data) callbacks installed on RAID-Z read zios. */
458 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
459 vdev_raidz_map_free_vsd,
460 vdev_raidz_cksum_report
464 * Divides the IO evenly across all child vdevs; usually, dcols is
465 * the number of children in the target vdev.
/*
 * Builds the raidz_map_t describing how a logical I/O of `size` bytes at
 * `offset` maps onto the children: column sizes, device offsets, parity
 * buffers, and the historical parity/data swap for single-parity maps.
 * NOTE(review): lines are missing from this chunk (braces, some
 * declarations, the acols computation, per-column coff/col setup, the
 * dofree branch, and the trailing "return (rm);").  The `dofree`
 * parameter presumably skips buffer attachment for free zios -- its
 * visible use is absent here; confirm against the full source.
 */
468 vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree,
469 uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
472 /* The starting RAIDZ (parent) vdev sector of the block. */
473 uint64_t b = offset >> unit_shift;
474 /* The zio's size in units of the vdev's minimum sector size. */
475 uint64_t s = size >> unit_shift;
476 /* The first column for this stripe. */
477 uint64_t f = b % dcols;
478 /* The starting byte offset on each child vdev. */
479 uint64_t o = (b / dcols) << unit_shift;
480 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
484 * "Quotient": The number of data sectors for this stripe on all but
485 * the "big column" child vdevs that also contain "remainder" data.
487 q = s / (dcols - nparity);
490 * "Remainder": The number of partial stripe data sectors in this I/O.
491 * This will add a sector to some, but not all, child vdevs.
493 r = s - q * (dcols - nparity);
495 /* The number of "big columns" - those which contain remainder data. */
496 bc = (r == 0 ? 0 : r + nparity);
499 * The total number of data and parity sectors associated with
502 tot = s + nparity * (q + (r == 0 ? 0 : 1));
504 /* acols: The columns that will be accessed. */
505 /* scols: The columns that will be accessed or skipped. */
507 /* Our I/O request doesn't span all child vdevs. */
509 scols = MIN(dcols, roundup(bc, nparity + 1));
515 ASSERT3U(acols, <=, scols);
/* Allocate the map with scols trailing raidz_col_t slots. */
517 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
520 rm->rm_scols = scols;
522 rm->rm_skipstart = bc;
523 rm->rm_missingdata = 0;
524 rm->rm_missingparity = 0;
525 rm->rm_firstdatacol = nparity;
526 rm->rm_abd_copy = NULL;
529 rm->rm_ecksuminjected = 0;
/* Initialize every column's geometry; big columns get one extra sector. */
533 for (c = 0; c < scols; c++) {
538 coff += 1ULL << unit_shift;
540 rm->rm_col[c].rc_devidx = col;
541 rm->rm_col[c].rc_offset = coff;
542 rm->rm_col[c].rc_abd = NULL;
543 rm->rm_col[c].rc_gdata = NULL;
544 rm->rm_col[c].rc_error = 0;
545 rm->rm_col[c].rc_tried = 0;
546 rm->rm_col[c].rc_skipped = 0;
549 rm->rm_col[c].rc_size = 0;
551 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
553 rm->rm_col[c].rc_size = q << unit_shift;
555 asize += rm->rm_col[c].rc_size;
558 ASSERT3U(asize, ==, tot << unit_shift);
/* Round the allocated size up to a full (nparity + 1)-sector multiple. */
559 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
560 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
561 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
562 ASSERT3U(rm->rm_nskip, <=, nparity);
/* Parity columns get freshly allocated buffers ... */
565 for (c = 0; c < rm->rm_firstdatacol; c++) {
566 rm->rm_col[c].rc_abd =
567 abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
/* ... data columns are views at increasing offsets into the zio's abd. */
570 rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
571 off = rm->rm_col[c].rc_size;
573 for (c = c + 1; c < acols; c++) {
574 rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
575 off += rm->rm_col[c].rc_size;
580 * If all data stored spans all columns, there's a danger that parity
581 * will always be on the same device and, since parity isn't read
582 * during normal operation, that that device's I/O bandwidth won't be
583 * used effectively. We therefore switch the parity every 1MB.
585 * ... at least that was, ostensibly, the theory. As a practical
586 * matter unless we juggle the parity between all devices evenly, we
587 * won't see any benefit. Further, occasional writes that aren't a
588 * multiple of the LCM of the number of children and the minimum
589 * stripe width are sufficient to avoid pessimal behavior.
590 * Unfortunately, this decision created an implicit on-disk format
591 * requirement that we need to support for all eternity, but only
592 * for single-parity RAID-Z.
594 * If we intend to skip a sector in the zeroth column for padding
595 * we must make sure to note this swap. We will never intend to
596 * skip the first column since at least one data and one parity
597 * column must appear in each row.
599 ASSERT(rm->rm_cols >= 2);
600 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
/* Single-parity only: swap columns 0 and 1 on odd 1MB regions. */
602 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
603 devidx = rm->rm_col[0].rc_devidx;
604 o = rm->rm_col[0].rc_offset;
605 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
606 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
607 rm->rm_col[1].rc_devidx = devidx;
608 rm->rm_col[1].rc_offset = o;
610 if (rm->rm_skipstart == 0)
611 rm->rm_skipstart = 1;
/*
 * abd iterator callback: XOR-accumulate this buffer into the P parity
 * stream (pqr->p).  NOTE(review): the struct pqr_struct definition and
 * this function's braces/loop body (presumably "*pqr->p ^= *src;" and a
 * return -- confirm against full source) are missing from this chunk.
 */
624 vdev_raidz_p_func(void *buf, size_t size, void *private)
626 struct pqr_struct *pqr = private;
627 const uint64_t *src = buf;
628 int i, cnt = size / sizeof (src[0]);
/* This callback is only valid for P-only parity generation. */
630 ASSERT(pqr->p && !pqr->q && !pqr->r);
632 for (i = 0; i < cnt; i++, src++, pqr->p++)
/*
 * abd iterator callback: accumulate this buffer into both P (XOR) and Q
 * (multiply the running Q by 2 in GF(2^8), then XOR) parity streams.
 * NOTE(review): braces, the mask declaration, and the XOR statements in
 * the loop body are missing from this chunk.
 */
639 vdev_raidz_pq_func(void *buf, size_t size, void *private)
641 struct pqr_struct *pqr = private;
642 const uint64_t *src = buf;
644 int i, cnt = size / sizeof (src[0]);
/* Valid only when generating P and Q (no R). */
646 ASSERT(pqr->p && pqr->q && !pqr->r);
648 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
650 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
/*
 * abd iterator callback: accumulate this buffer into P (XOR), Q (running
 * multiply by 2) and R (running multiply by 4) parity streams.
 * NOTE(review): braces, the mask declaration, and the XOR statements in
 * the loop body are missing from this chunk.
 */
658 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
660 struct pqr_struct *pqr = private;
661 const uint64_t *src = buf;
663 int i, cnt = size / sizeof (src[0]);
/* Valid only for triple-parity generation (P, Q and R all present). */
665 ASSERT(pqr->p && pqr->q && pqr->r);
667 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
669 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
671 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
/*
 * Generate single (P) parity: the first data column is copied into P,
 * every subsequent column is XORed in via vdev_raidz_p_func().
 * NOTE(review): braces and declarations (c, src, p) are missing from
 * this chunk.
 */
679 vdev_raidz_generate_parity_p(raidz_map_t *rm)
685 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
686 src = rm->rm_col[c].rc_abd;
687 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
/* First data column seeds P; later columns accumulate into it. */
689 if (c == rm->rm_firstdatacol) {
690 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
692 struct pqr_struct pqr = { p, NULL, NULL };
693 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
694 vdev_raidz_p_func, &pqr);
/*
 * Generate double (P + Q) parity.  The first data column seeds both P
 * and Q; each later column is folded in by vdev_raidz_pq_func().  Data
 * columns may be shorter than the parity columns; the shortfall is
 * treated as zeros, which for Q still requires the multiply-by-2 step.
 * NOTE(review): braces, some declarations, and the q[i] ^= / p XOR
 * statements inside the short-column loops are missing from this chunk.
 */
700 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
702 uint64_t *p, *q, pcnt, ccnt, mask, i;
706 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
707 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
708 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
710 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
711 src = rm->rm_col[c].rc_abd;
712 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
713 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
715 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
717 if (c == rm->rm_firstdatacol) {
718 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
719 (void) memcpy(q, p, rm->rm_col[c].rc_size);
721 struct pqr_struct pqr = { p, q, NULL };
722 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
723 vdev_raidz_pq_func, &pqr);
/* Pad out the tail of P/Q when the first column is short. */
726 if (c == rm->rm_firstdatacol) {
727 for (i = ccnt; i < pcnt; i++) {
733 * Treat short columns as though they are full of 0s.
734 * Note that there's therefore nothing needed for P.
736 for (i = ccnt; i < pcnt; i++) {
737 VDEV_RAIDZ_64MUL_2(q[i], mask);
/*
 * Generate triple (P + Q + R) parity.  Identical structure to the PQ
 * case, with R accumulated using multiply-by-4 (generator 4).
 * NOTE(review): braces, some declarations, and the zero-fill statements
 * inside the short-column loops are missing from this chunk.
 */
744 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
746 uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
750 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
/* All three parity columns must be the same size. */
751 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
752 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
753 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
754 rm->rm_col[VDEV_RAIDZ_R].rc_size);
756 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
757 src = rm->rm_col[c].rc_abd;
758 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
759 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
760 r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
762 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
764 if (c == rm->rm_firstdatacol) {
765 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
766 (void) memcpy(q, p, rm->rm_col[c].rc_size);
767 (void) memcpy(r, p, rm->rm_col[c].rc_size);
769 struct pqr_struct pqr = { p, q, r };
770 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
771 vdev_raidz_pqr_func, &pqr);
/* Pad out the tail of P/Q/R when the first column is short. */
774 if (c == rm->rm_firstdatacol) {
775 for (i = ccnt; i < pcnt; i++) {
782 * Treat short columns as though they are full of 0s.
783 * Note that there's therefore nothing needed for P.
785 for (i = ccnt; i < pcnt; i++) {
786 VDEV_RAIDZ_64MUL_2(q[i], mask);
787 VDEV_RAIDZ_64MUL_4(r[i], mask);
794 * Generate RAID parity in the first virtual columns according to the number of
795 * parity columns available.
/*
 * Dispatch on rm_firstdatacol (== parity count): 1 -> P, 2 -> PQ,
 * 3 -> PQR; anything else panics.  NOTE(review): the case labels and
 * braces are missing from this chunk.
 */
798 vdev_raidz_generate_parity(raidz_map_t *rm)
800 switch (rm->rm_firstdatacol) {
802 vdev_raidz_generate_parity_p(rm);
805 vdev_raidz_generate_parity_pq(rm);
808 vdev_raidz_generate_parity_pqr(rm);
811 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
/*
 * Pairwise iterator callback for P-based reconstruction: XOR the source
 * buffer into the destination (the column being rebuilt).
 * NOTE(review): braces and the loop body ("*dst++ ^= *src++;" per the
 * pattern here -- confirm against full source) are missing.
 */
817 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
819 uint64_t *dst = dbuf;
820 uint64_t *src = sbuf;
821 int cnt = size / sizeof (src[0]);
823 for (int i = 0; i < cnt; i++) {
/*
 * Q-reconstruction pre-pass (pairwise): multiply the accumulator by 2
 * in GF(2^8), then XOR in the next data column -- Horner's-rule
 * evaluation of the Q polynomial.  NOTE(review): braces, the mask
 * declaration, and the XOR statement are missing from this chunk.
 */
832 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
835 uint64_t *dst = dbuf;
836 uint64_t *src = sbuf;
838 int cnt = size / sizeof (dst[0]);
840 for (int i = 0; i < cnt; i++, dst++, src++) {
841 VDEV_RAIDZ_64MUL_2(*dst, mask);
/*
 * Q-reconstruction pre-pass tail (single-buffer): same multiply-by-2 as
 * vdev_raidz_reconst_q_pre_func() for the region beyond a short source
 * column (implicit zeros, so no XOR).  NOTE(review): braces, the dst
 * and mask declarations are missing from this chunk.
 */
850 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
854 int cnt = size / sizeof (dst[0]);
856 for (int i = 0; i < cnt; i++, dst++) {
857 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
858 VDEV_RAIDZ_64MUL_2(*dst, mask);
/*
 * Context for the Q post-pass: the Q parity stream and the exponent to
 * undo the generator power for the missing column.
 * NOTE(review): the struct's member lines and closing "};" are missing.
 */
864 struct reconst_q_struct {
/*
 * Q-reconstruction post-pass: XOR the actual Q parity into the
 * accumulator, then multiply each byte by 2^exp to recover the missing
 * data column.  NOTE(review): braces, declarations (dst, j, b) and the
 * "*dst ^= *rq->q;" step are missing from this chunk.
 */
870 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
872 struct reconst_q_struct *rq = private;
874 int cnt = size / sizeof (dst[0]);
876 for (int i = 0; i < cnt; i++, dst++, rq->q++) {
/* Apply the inverse generator power byte-by-byte within the word. */
881 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
882 *b = vdev_raidz_exp2(*b, rq->exp);
/*
 * Context for PQ reconstruction: pointers into the real parity (p, q),
 * the parity computed with the two failed columns zeroed (pxy, qxy),
 * and the log-domain coefficients aexp/bexp.
 * NOTE(review): the struct's member lines and closing "};" are missing.
 */
889 struct reconst_pq_struct {
/*
 * Pairwise PQ callback: recover both missing data columns byte-by-byte:
 *   D_x = A*(P + Pxy) + B*(Q + Qxy)   (aexp/bexp are log_2 A / log_2 B)
 *   D_y = P + Pxy + D_x
 * NOTE(review): braces and the xd/yd declarations are missing.
 */
899 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
901 struct reconst_pq_struct *rpq = private;
905 for (int i = 0; i < size;
906 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
907 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
908 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
909 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
/*
 * Tail of PQ reconstruction for the region where column x is longer
 * than column y: only D_x needs to be recovered (y has no data here).
 * NOTE(review): braces and the xd declaration are missing.
 */
916 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
918 struct reconst_pq_struct *rpq = private;
921 for (int i = 0; i < size;
922 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
923 /* same operation as vdev_raidz_reconst_pq_func() on xd */
924 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
925 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
/*
 * Rebuild a single data column x from P parity: copy P into x, then
 * XOR in every other data column.  Returns a bitmask of the parity
 * columns used (P only).
 * NOTE(review): braces, declarations (x, c, src, dst) and the
 * "if (c == x) continue;" style skip inside the loop are missing from
 * this chunk.
 */
932 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
939 ASSERT(x >= rm->rm_firstdatacol);
940 ASSERT(x < rm->rm_cols);
942 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
943 ASSERT(rm->rm_col[x].rc_size > 0);
945 src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
946 dst = rm->rm_col[x].rc_abd;
/* Seed the target with P, then XOR in the surviving data columns. */
948 abd_copy(dst, src, rm->rm_col[x].rc_size);
950 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
951 uint64_t size = MIN(rm->rm_col[x].rc_size,
952 rm->rm_col[c].rc_size);
954 src = rm->rm_col[c].rc_abd;
955 dst = rm->rm_col[x].rc_abd;
960 (void) abd_iterate_func2(dst, src, 0, 0, size,
961 vdev_raidz_reconst_p_func, NULL);
964 return (1 << VDEV_RAIDZ_P);
/*
 * Rebuild a single data column x from Q parity: re-evaluate the Q
 * polynomial with column x treated as zero (pre pass), XOR in the real
 * Q, then divide out the generator power 2^(ndevs - 1 - x) (post pass).
 * Returns a bitmask of the parity columns used (Q only).
 * NOTE(review): braces and declarations (x, c, exp, src, dst) are
 * missing from this chunk.
 */
968 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
976 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
978 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
/* Column x contributes zeros -- its size is forced to 0 here. */
979 uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
980 rm->rm_col[c].rc_size);
982 src = rm->rm_col[c].rc_abd;
983 dst = rm->rm_col[x].rc_abd;
985 if (c == rm->rm_firstdatacol) {
986 abd_copy(dst, src, size);
987 if (rm->rm_col[x].rc_size > size)
988 abd_zero_off(dst, size,
989 rm->rm_col[x].rc_size - size);
991 ASSERT3U(size, <=, rm->rm_col[x].rc_size);
992 (void) abd_iterate_func2(dst, src, 0, 0, size,
993 vdev_raidz_reconst_q_pre_func, NULL);
994 (void) abd_iterate_func(dst,
995 size, rm->rm_col[x].rc_size - size,
996 vdev_raidz_reconst_q_pre_tail_func, NULL);
1000 src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1001 dst = rm->rm_col[x].rc_abd;
/* exp = 255 - (ndevs - 1 - x): log-domain inverse of 2^(ndevs - 1 - x). */
1002 exp = 255 - (rm->rm_cols - 1 - x);
1004 struct reconst_q_struct rq = { abd_to_buf(src), exp };
1005 (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
1006 vdev_raidz_reconst_q_post_func, &rq);
1008 return (1 << VDEV_RAIDZ_Q);
/*
 * Rebuild two missing data columns x and y (x < y by column order,
 * xsize >= ysize) using both P and Q.  Strategy: temporarily zero the
 * two columns (by setting their sizes to 0), regenerate parity into
 * scratch buffers to get Pxy/Qxy, then solve the 2x2 linear system per
 * byte (see the equations in the body).  Returns the bitmask P|Q.
 * NOTE(review): braces and declarations (x, y, xd, yd) are missing from
 * this chunk, as is the assignment of x/y from tgts[].
 */
1012 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
1014 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1015 abd_t *pdata, *qdata;
1016 uint64_t xsize, ysize;
1023 ASSERT(x >= rm->rm_firstdatacol);
1024 ASSERT(y < rm->rm_cols);
1026 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
1029 * Move the parity data aside -- we're going to compute parity as
1030 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1031 * reuse the parity generation mechanism without trashing the actual
1032 * parity so we make those columns appear to be full of zeros by
1033 * setting their lengths to zero.
1035 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
1036 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1037 xsize = rm->rm_col[x].rc_size;
1038 ysize = rm->rm_col[y].rc_size;
1040 rm->rm_col[VDEV_RAIDZ_P].rc_abd =
1041 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1042 rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
1043 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1044 rm->rm_col[x].rc_size = 0;
1045 rm->rm_col[y].rc_size = 0;
/* Compute Pxy/Qxy into the scratch parity buffers. */
1047 vdev_raidz_generate_parity_pq(rm);
1049 rm->rm_col[x].rc_size = xsize;
1050 rm->rm_col[y].rc_size = ysize;
1052 p = abd_to_buf(pdata);
1053 q = abd_to_buf(qdata);
1054 pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1055 qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1056 xd = rm->rm_col[x].rc_abd;
1057 yd = rm->rm_col[y].rc_abd;
1061 * Pxy = P + D_x + D_y
1062 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1064 * We can then solve for D_x:
1065 * D_x = A * (P + Pxy) + B * (Q + Qxy)
1067 * A = 2^(x - y) * (2^(x - y) + 1)^-1
1068 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1070 * With D_x in hand, we can easily solve for D_y:
1071 * D_y = P + Pxy + D_x
/* Coefficients computed in the log domain; tmp = log of (2^(x-y)+1)^-1. */
1074 a = vdev_raidz_pow2[255 + x - y];
1075 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
1076 tmp = 255 - vdev_raidz_log2[a ^ 1];
1078 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1079 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1081 ASSERT3U(xsize, >=, ysize);
1082 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
/* Solve for both columns where they overlap, then x's tail alone. */
1083 (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1084 vdev_raidz_reconst_pq_func, &rpq);
1085 (void) abd_iterate_func(xd, ysize, xsize - ysize,
1086 vdev_raidz_reconst_pq_tail_func, &rpq);
1088 abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1089 abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1092 * Restore the saved parity data.
1094 rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
1095 rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1097 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
1102 * In the general case of reconstruction, we must solve the system of linear
1103 * equations defined by the coefficients used to generate parity as well as
1104 * the contents of the data and parity disks. This can be expressed with
1105 * vectors for the original data (D) and the actual data (d) and parity (p)
1106 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1110 * | V | | D_0 | | p_m-1 |
1111 * | | x | : | = | d_0 |
1112 * | I | | D_n-1 | | : |
1113 * | | ~~ ~~ | d_n-1 |
1116 * I is simply a square identity matrix of size n, and V is a vandermonde
1117 * matrix defined by the coefficients we chose for the various parity columns
1118 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
1119 * computation as well as linear separability.
1122 * | 1 .. 1 1 1 | | p_0 |
1123 * | 2^n-1 .. 4 2 1 | __ __ | : |
1124 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
1125 * | 1 .. 0 0 0 | | D_1 | | d_0 |
1126 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
1127 * | : : : : | | : | | d_2 |
1128 * | 0 .. 1 0 0 | | D_n-1 | | : |
1129 * | 0 .. 0 1 0 | ~~ ~~ | : |
1130 * | 0 .. 0 0 1 | | d_n-1 |
1133 * Note that I, V, d, and p are known. To compute D, we must invert the
1134 * matrix and use the known data and parity values to reconstruct the unknown
1135 * data values. We begin by removing the rows in V|I and d|p that correspond
1136 * to failed or missing columns; we then make V|I square (n x n) and d|p
1137 * sized n by removing rows corresponding to unused parity from the bottom up
1138 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1139 * using Gauss-Jordan elimination. In the example below we use m=3 parity
1140 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1142 * | 1 1 1 1 1 1 1 1 |
1143 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
1144 * | 19 205 116 29 64 16 4 1 | / /
1145 * | 1 0 0 0 0 0 0 0 | / /
1146 * | 0 1 0 0 0 0 0 0 | <--' /
1147 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
1148 * | 0 0 0 1 0 0 0 0 |
1149 * | 0 0 0 0 1 0 0 0 |
1150 * | 0 0 0 0 0 1 0 0 |
1151 * | 0 0 0 0 0 0 1 0 |
1152 * | 0 0 0 0 0 0 0 1 |
1155 * | 1 1 1 1 1 1 1 1 |
1156 * | 19 205 116 29 64 16 4 1 |
1157 * | 1 0 0 0 0 0 0 0 |
1158 * (V|I)' = | 0 0 0 1 0 0 0 0 |
1159 * | 0 0 0 0 1 0 0 0 |
1160 * | 0 0 0 0 0 1 0 0 |
1161 * | 0 0 0 0 0 0 1 0 |
1162 * | 0 0 0 0 0 0 0 1 |
1165 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1166 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1167 * matrix is not singular.
1169 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1170 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1171 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1172 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1173 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1174 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1175 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1176 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1179 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1180 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1181 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1182 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1183 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1184 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1185 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1186 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1189 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1190 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1191 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1192 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1193 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1194 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1195 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1196 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1199 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1200 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1201 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1202 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1203 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1204 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1205 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1206 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1209 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1210 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1211 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1212 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1213 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1214 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1215 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1216 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1219 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1220 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1221 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1222 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1223 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1224 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1225 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1226 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1229 * | 0 0 1 0 0 0 0 0 |
1230 * | 167 100 5 41 159 169 217 208 |
1231 * | 166 100 4 40 158 168 216 209 |
1232 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1233 * | 0 0 0 0 1 0 0 0 |
1234 * | 0 0 0 0 0 1 0 0 |
1235 * | 0 0 0 0 0 0 1 0 |
1236 * | 0 0 0 0 0 0 0 1 |
1239 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1240 * of the missing data.
1242 * As is apparent from the example above, the only non-trivial rows in the
1243 * inverse matrix correspond to the data disks that we're trying to
1244 * reconstruct. Indeed, those are the only rows we need as the others would
1245 * only be useful for reconstructing data known or assumed to be valid. For
1246 * that reason, we only build the coefficients in the rows that correspond to
/*
 * Build the nmap rows of the parity matrix selected by map[]: each entry is
 * a parity index in 0..2 (P, Q or R per the ASSERTs below), and each row
 * coefficient is drawn from the GF(2^8) power table.  n is the number of
 * data columns in the map.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
1252 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
/* n must equal the number of data columns in this map. */
1258 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1261 * Fill in the missing rows of interest.
1263 for (i = 0; i < nmap; i++) {
/* Only parity indices 0..2 are valid (triple parity maximum). */
1264 ASSERT3S(0, <=, map[i]);
1265 ASSERT3S(map[i], <=, 2);
1272 for (j = 0; j < n; j++) {
/* Coefficient is a power of 2 in GF(2^8), looked up by exponent. */
1276 rows[i][j] = vdev_raidz_pow2[pow];
/*
 * Invert the nmissing x nmissing submatrix of interest in `rows` using
 * Gauss-Jordan elimination over GF(2^8), accumulating the inverse into
 * `invrows`.  `used` lists the columns backing the matrix: the first
 * nmissing entries are parity columns, the rest data columns.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
1282 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1283 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1289 * Assert that the first nmissing entries from the array of used
1290 * columns correspond to parity columns and that subsequent entries
1291 * correspond to data columns.
1293 for (i = 0; i < nmissing; i++) {
1294 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1296 for (; i < n; i++) {
1297 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1301 * First initialize the storage where we'll compute the inverse rows.
/* Start invrows as the identity matrix. */
1303 for (i = 0; i < nmissing; i++) {
1304 for (j = 0; j < n; j++) {
1305 invrows[i][j] = (i == j) ? 1 : 0;
1310 * Subtract all trivial rows from the rows of consequence.
1312 for (i = 0; i < nmissing; i++) {
1313 for (j = nmissing; j < n; j++) {
1314 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1315 jj = used[j] - rm->rm_firstdatacol;
1317 invrows[i][j] = rows[i][jj];
1323 * For each of the rows of interest, we must normalize it and subtract
1324 * a multiple of it from the other rows.
1326 for (i = 0; i < nmissing; i++) {
1327 for (j = 0; j < missing[i]; j++) {
1328 ASSERT0(rows[i][j]);
/* Pivot must be non-zero or the matrix would be singular. */
1330 ASSERT3U(rows[i][missing[i]], !=, 0);
1333 * Compute the inverse of the first element and multiply each
1334 * element in the row by that value.
/* 255 - log2(x) is the log of the multiplicative inverse in GF(2^8). */
1336 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1338 for (j = 0; j < n; j++) {
1339 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1340 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1343 for (ii = 0; ii < nmissing; ii++) {
1347 ASSERT3U(rows[ii][missing[i]], !=, 0);
1349 log = vdev_raidz_log2[rows[ii][missing[i]]];
1351 for (j = 0; j < n; j++) {
1353 vdev_raidz_exp2(rows[i][j], log);
1355 vdev_raidz_exp2(invrows[i][j], log);
1361 * Verify that the data that is left in the rows is properly part of
1362 * an identity matrix.
1364 for (i = 0; i < nmissing; i++) {
1365 for (j = 0; j < n; j++) {
1366 if (j == missing[i]) {
1367 ASSERT3U(rows[i][j], ==, 1);
1369 ASSERT0(rows[i][j]);
/*
 * Recompute the missing data columns by applying the inverted matrix
 * (invrows) to the surviving columns.  Multiplication in GF(2^8) is done
 * by adding logarithms (invlog) and exponentiating via the power table.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
1376 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1377 int *missing, uint8_t **invrows, const uint8_t *used)
1382 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1383 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1387 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
/* Scratch space for the log form of the inverse matrix; freed below. */
1391 psize = sizeof (invlog[0][0]) * n * nmissing;
1392 p = kmem_alloc(psize, KM_SLEEP);
1394 for (pp = p, i = 0; i < nmissing; i++) {
/* Precompute logs of the inverse coefficients (all must be non-zero). */
1399 for (i = 0; i < nmissing; i++) {
1400 for (j = 0; j < n; j++) {
1401 ASSERT3U(invrows[i][j], !=, 0);
1402 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1406 for (i = 0; i < n; i++) {
1408 ASSERT3U(c, <, rm->rm_cols);
1410 src = abd_to_buf(rm->rm_col[c].rc_abd);
1411 ccount = rm->rm_col[c].rc_size;
1412 for (j = 0; j < nmissing; j++) {
1413 cc = missing[j] + rm->rm_firstdatacol;
1414 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1415 ASSERT3U(cc, <, rm->rm_cols);
1416 ASSERT3U(cc, !=, c);
1418 dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
1419 dcount[j] = rm->rm_col[cc].rc_size;
1422 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1424 for (x = 0; x < ccount; x++, src++) {
1426 log = vdev_raidz_log2[*src];
1428 for (cc = 0; cc < nmissing; cc++) {
/* Destination columns may be shorter than the source column. */
1429 if (x >= dcount[cc])
/* Reduce the exponent mod 255 (the GF(2^8) multiplicative order). */
1435 if ((ll = log + invlog[cc][i]) >= 255)
1437 val = vdev_raidz_pow2[ll];
1448 kmem_free(p, psize);
/*
 * General matrix-based reconstruction for any combination of up to
 * VDEV_RAIDZ_MAXPARITY missing columns: pick parity columns, build and
 * invert the matrix, then regenerate the missing data.  Returns a code
 * (asserted < 1 << VDEV_RAIDZ_MAXPARITY) identifying the parity used.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
1452 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1456 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1457 int parity_map[VDEV_RAIDZ_MAXPARITY];
1462 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1463 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1466 abd_t **bufs = NULL;
1471 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1472 * temporary linear ABDs.
1474 if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
1475 bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
1477 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1478 raidz_col_t *col = &rm->rm_col[c];
/* Stash the original ABD and work on a linear copy. */
1480 bufs[c] = col->rc_abd;
1481 col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
1482 abd_copy(col->rc_abd, bufs[c], col->rc_size);
1486 n = rm->rm_cols - rm->rm_firstdatacol;
1489 * Figure out which data columns are missing.
1492 for (t = 0; t < ntgts; t++) {
1493 if (tgts[t] >= rm->rm_firstdatacol) {
1494 missing_rows[nmissing_rows++] =
1495 tgts[t] - rm->rm_firstdatacol;
1500 * Figure out which parity columns to use to help generate the missing
1503 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1505 ASSERT(c < rm->rm_firstdatacol);
1508 * Skip any targeted parity columns.
1510 if (c == tgts[tt]) {
1522 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
/* One allocation covers rows, invrows and the `used` array. */
1524 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1525 nmissing_rows * n + sizeof (used[0]) * n;
1526 p = kmem_alloc(psize, KM_SLEEP);
1528 for (pp = p, i = 0; i < nmissing_rows; i++) {
1536 for (i = 0; i < nmissing_rows; i++) {
1537 used[i] = parity_map[i];
1540 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1541 if (tt < nmissing_rows &&
1542 c == missing_rows[tt] + rm->rm_firstdatacol) {
1553 * Initialize the interesting rows of the matrix.
1555 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1558 * Invert the matrix.
1560 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1564 * Reconstruct the missing data using the generated matrix.
1566 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1569 kmem_free(p, psize);
1572 * copy back from temporary linear abds and free them
1575 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1576 raidz_col_t *col = &rm->rm_col[c];
1578 abd_copy(bufs[c], col->rc_abd, col->rc_size);
1579 abd_free(col->rc_abd);
1580 col->rc_abd = bufs[c];
1582 kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
/*
 * Dispatch reconstruction: classify the (sorted) target list into bad
 * parity vs. bad data columns, try the specialized P/Q/PQ routines when
 * applicable, and otherwise fall back to the general matrix path.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
1589 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1591 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1595 int nbadparity, nbaddata;
1596 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1599 * The tgts list must already be sorted.
1601 for (i = 1; i < nt; i++) {
1602 ASSERT(t[i] > t[i - 1]);
/* Assume everything is bad, then subtract what proves healthy below. */
1605 nbadparity = rm->rm_firstdatacol;
1606 nbaddata = rm->rm_cols - nbadparity;
1608 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1609 if (c < rm->rm_firstdatacol)
1610 parity_valid[c] = B_FALSE;
1612 if (i < nt && c == t[i]) {
1615 } else if (rm->rm_col[c].rc_error != 0) {
1617 } else if (c >= rm->rm_firstdatacol) {
1620 parity_valid[c] = B_TRUE;
1625 ASSERT(ntgts >= nt);
1626 ASSERT(nbaddata >= 0);
1627 ASSERT(nbaddata + nbadparity == ntgts);
/* dt points at the data-column portion of the target list. */
1629 dt = &tgts[nbadparity];
1632 * See if we can use any of our optimized reconstruction routines.
1634 if (!vdev_raidz_default_to_general) {
1637 if (parity_valid[VDEV_RAIDZ_P])
1638 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1640 ASSERT(rm->rm_firstdatacol > 1);
1642 if (parity_valid[VDEV_RAIDZ_Q])
1643 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1645 ASSERT(rm->rm_firstdatacol > 2);
1649 ASSERT(rm->rm_firstdatacol > 1);
1651 if (parity_valid[VDEV_RAIDZ_P] &&
1652 parity_valid[VDEV_RAIDZ_Q])
1653 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1655 ASSERT(rm->rm_firstdatacol > 2);
/* Fall back to the general matrix-based reconstruction. */
1661 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1662 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
/*
 * Open a RAID-Z vdev: validate the parity/child-count geometry, open all
 * children, and derive asize/max_asize/ashift from them.  More child open
 * failures than parity disks makes the vdev unopenable.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
1668 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1669 uint64_t *logical_ashift, uint64_t *physical_ashift)
1672 uint64_t nparity = vd->vdev_nparity;
1677 ASSERT(nparity > 0);
/* Need at least nparity + 1 children for the geometry to make sense. */
1679 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1680 vd->vdev_children < nparity + 1) {
1681 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1682 return (SET_ERROR(EINVAL));
1685 vdev_open_children(vd);
1687 for (c = 0; c < vd->vdev_children; c++) {
1688 cvd = vd->vdev_child[c];
1690 if (cvd->vdev_open_error != 0) {
1691 lasterror = cvd->vdev_open_error;
/* Track the minimum child size and the maximum alignment shift. */
1696 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1697 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1698 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1699 *physical_ashift = MAX(*physical_ashift,
1700 cvd->vdev_physical_ashift);
1703 *asize *= vd->vdev_children;
1704 *max_asize *= vd->vdev_children;
/* More failed children than parity disks: no way to open. */
1706 if (numerrors > nparity) {
1707 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
/*
 * Close a RAID-Z vdev by closing each of its children.
 */
1715 vdev_raidz_close(vdev_t *vd)
1719 for (c = 0; c < vd->vdev_children; c++)
1720 vdev_close(vd->vdev_child[c]);
1725 * Handle a read or write I/O to a RAID-Z dump device.
1727 * The dump device is in a unique situation compared to other ZFS datasets:
1728 * writing to this device should be as simple and fast as possible. In
1729 * addition, durability matters much less since the dump will be extracted
1730 * once the machine reboots. For that reason, this function eschews parity for
1731 * performance and simplicity. The dump device uses the checksum setting
1732 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1735 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
1736 * 128 KB will not fill an entire block; in addition, they may not be properly
1737 * aligned. In that case, this function uses the preallocated 128 KB block and
1738 * omits reading or writing any "empty" portions of that block, as opposed to
1739 * allocating a fresh appropriately-sized block.
1741 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1743 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1745 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1746 * allocated which spans all five child vdevs. 8 KB of data would be written to
1747 * each of four vdevs, with the fifth containing the parity bits.
1749 * parity data data data data
1750 * | PP | XX | XX | XX | XX |
1753 * 8 KB parity ------8 KB data blocks------
1755 * However, when writing to the dump device, the behavior is different:
1757 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1759 * Unlike the normal RAID-Z case in which the block is allocated based on the
1760 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
1761 * I/O size is less than 128 KB, only the actual portions of data are written.
1762 * In this example the data is written to the third data vdev since that vdev
1763 * contains the offset [64 KB, 96 KB).
1765 * parity data data data data
1771 * As a result, an individual I/O may not span all child vdevs; moreover, a
1772 * small I/O may only operate on a single child vdev.
1774 * Note that since there are no parity bits calculated or written, this format
1775 * remains the same no matter how many parity bits are used in a normal RAID-Z
1776 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
1779 * parity parity parity data data data data
1780 * | | | | | | XX | |
/*
 * Perform a raw (dump-device) read or write against the RAID-Z layout.
 * Maps the [offset, offset+size) range onto the columns of a full
 * SPA_OLD_MAXBLOCKSIZE map and issues vdev_disk_physio() to each child
 * column the range touches.  See the block comment above for the format.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
1786 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1787 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1789 vdev_t *tvd = vd->vdev_top;
1795 uint64_t start, end, colstart, colend;
1796 uint64_t coloffset, colsize, colskip;
1798 int flags = doread ? BIO_READ : BIO_WRITE;
1803 * Don't write past the end of the block
1805 VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1811 * Allocate a RAID-Z map for this block. Note that this block starts
1812 * from the "original" offset, that is, the offset of the extent which
1813 * contains the requisite offset of the data being read or written.
1815 * Even if this I/O operation doesn't span the full block size, let's
1816 * treat the on-disk format as if the only blocks are the complete 128
1819 abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
1820 SPA_OLD_MAXBLOCKSIZE);
1821 rm = vdev_raidz_map_alloc(abd,
1822 SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
1823 vd->vdev_children, vd->vdev_nparity);
1825 coloffset = origoffset;
/* Walk data columns only; parity is not maintained for dump I/O. */
1827 for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1828 c++, coloffset += rc->rc_size) {
1829 rc = &rm->rm_col[c];
1830 cvd = vd->vdev_child[rc->rc_devidx];
1833 * Find the start and end of this column in the RAID-Z map,
1834 * keeping in mind that the stated size and offset of the
1835 * operation may not fill the entire column for this vdev.
1837 * If any portion of the data spans this column, issue the
1838 * appropriate operation to the vdev.
1840 if (coloffset + rc->rc_size <= start)
1842 if (coloffset >= end)
/* Clamp the operation to this column's extent. */
1845 colstart = MAX(coloffset, start);
1846 colend = MIN(end, coloffset + rc->rc_size);
1847 colsize = colend - colstart;
1848 colskip = colstart - coloffset;
1850 VERIFY3U(colsize, <=, rc->rc_size);
1851 VERIFY3U(colskip, <=, rc->rc_size);
1854 * Note that the child vdev will have a vdev label at the start
1855 * of its range of offsets, hence the need for
1856 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
1857 * example of why this calculation is needed.
1859 if ((err = vdev_disk_physio(cvd,
1860 ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
1861 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1862 flags, isdump)) != 0)
1866 vdev_raidz_map_free(rm);
/*
 * Convert a physical (logical data) size into the allocated size on a
 * RAID-Z vdev: round up to sectors, add parity sectors for every run of
 * (cols - nparity) data sectors, then round to a multiple of nparity + 1.
 */
1875 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1878 uint64_t ashift = vd->vdev_top->vdev_ashift;
1879 uint64_t cols = vd->vdev_children;
1880 uint64_t nparity = vd->vdev_nparity;
/* Number of ashift-sized sectors needed for the data. */
1882 asize = ((psize - 1) >> ashift) + 1;
/* One parity sector per (cols - nparity) data sectors, rounded up. */
1883 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1884 asize = roundup(asize, nparity + 1) << ashift;
/*
 * Child zio completion callback: record the child's error in its
 * raidz_col_t (passed via io_private).
 */
1890 vdev_raidz_child_done(zio_t *zio)
1892 raidz_col_t *rc = zio->io_private;
1894 rc->rc_error = zio->io_error;
1900 * Start an IO operation on a RAIDZ VDev
1903 * - For write operations:
1904 * 1. Generate the parity data
1905 * 2. Create child zio write operations to each column's vdev, for both
1907 * 3. If the column skips any sectors for padding, create optional dummy
1908 * write zio children for those areas to improve aggregation continuity.
1909 * - For read operations:
1910 * 1. Create child zio read operations to each data column's vdev to read
1911 * the range of data required for zio.
1912 * 2. If this is a scrub or resilver operation, or if any of the data
1913 * vdevs have had errors, then create zio read operations to the parity
1914 * columns' VDevs as well.
/*
 * Issue child I/Os for a RAID-Z zio (see the block comment above for the
 * per-type behavior: free, write with parity + optional skip-sector I/Os,
 * or read with parity fetched on demand).
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
1917 vdev_raidz_io_start(zio_t *zio)
1919 vdev_t *vd = zio->io_vd;
1920 vdev_t *tvd = vd->vdev_top;
1926 rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
1927 zio->io_type == ZIO_TYPE_FREE,
1928 tvd->vdev_ashift, vd->vdev_children,
1932 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1934 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
/* FREE: forward the free to every column, parity included. */
1936 if (zio->io_type == ZIO_TYPE_FREE) {
1937 for (c = 0; c < rm->rm_cols; c++) {
1938 rc = &rm->rm_col[c];
1939 cvd = vd->vdev_child[rc->rc_devidx];
1940 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1941 rc->rc_offset, rc->rc_abd, rc->rc_size,
1942 zio->io_type, zio->io_priority, 0,
1943 vdev_raidz_child_done, rc));
/* WRITE: generate parity first, then write all columns. */
1950 if (zio->io_type == ZIO_TYPE_WRITE) {
1951 vdev_raidz_generate_parity(rm);
1953 for (c = 0; c < rm->rm_cols; c++) {
1954 rc = &rm->rm_col[c];
1955 cvd = vd->vdev_child[rc->rc_devidx];
1956 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1957 rc->rc_offset, rc->rc_abd, rc->rc_size,
1958 zio->io_type, zio->io_priority, 0,
1959 vdev_raidz_child_done, rc));
1963 * Generate optional I/Os for any skipped sectors to improve
1964 * aggregation contiguity.
1966 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1967 ASSERT(c <= rm->rm_scols);
1968 if (c == rm->rm_scols)
1970 rc = &rm->rm_col[c];
1971 cvd = vd->vdev_child[rc->rc_devidx];
1972 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1973 rc->rc_offset + rc->rc_size, NULL,
1974 1 << tvd->vdev_ashift,
1975 zio->io_type, zio->io_priority,
1976 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1983 ASSERT(zio->io_type == ZIO_TYPE_READ);
1986 * Iterate over the columns in reverse order so that we hit the parity
1987 * last -- any errors along the way will force us to read the parity.
1989 for (c = rm->rm_cols - 1; c >= 0; c--) {
1990 rc = &rm->rm_col[c];
1991 cvd = vd->vdev_child[rc->rc_devidx];
1992 if (!vdev_readable(cvd)) {
1993 if (c >= rm->rm_firstdatacol)
1994 rm->rm_missingdata++;
1996 rm->rm_missingparity++;
1997 rc->rc_error = SET_ERROR(ENXIO);
1998 rc->rc_tried = 1; /* don't even try */
/* Device may be missing data for this txg per its DTL. */
2002 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2003 if (c >= rm->rm_firstdatacol)
2004 rm->rm_missingdata++;
2006 rm->rm_missingparity++;
2007 rc->rc_error = SET_ERROR(ESTALE);
/* Read parity columns only when needed (errors, scrub, resilver). */
2011 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
2012 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2013 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2014 rc->rc_offset, rc->rc_abd, rc->rc_size,
2015 zio->io_type, zio->io_priority, 0,
2016 vdev_raidz_child_done, rc));
2025 * Report a checksum error for a child of a RAID-Z device.
/*
 * Account and report a checksum error against the child vdev backing a
 * column.  Speculative I/Os are exempt; `bad_data` is the data we believe
 * to be wrong, posted alongside the on-disk contents in the ereport.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
2028 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
2031 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2033 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2034 zio_bad_cksum_t zbc;
2035 raidz_map_t *rm = zio->io_vsd;
2037 mutex_enter(&vd->vdev_stat_lock);
2038 vd->vdev_stat.vs_checksum_errors++;
2039 mutex_exit(&vd->vdev_stat_lock);
2041 zbc.zbc_has_cksum = 0;
2042 zbc.zbc_injected = rm->rm_ecksuminjected;
/* Borrow a linear copy of the column for the ereport payload. */
2044 buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
2045 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
2046 rc->rc_offset, rc->rc_size, buf, bad_data,
2048 abd_return_buf(rc->rc_abd, buf, rc->rc_size);
2053 * We keep track of whether or not there were any injected errors, so that
2054 * any ereports we generate can note it.
/*
 * Verify the zio's checksum, noting in the map whether any failure was
 * caused by injected errors so later ereports can record it.
 * Returns the result of zio_checksum_error() (0 on success).
 */
2057 raidz_checksum_verify(zio_t *zio)
2059 zio_bad_cksum_t zbc;
2060 raidz_map_t *rm = zio->io_vsd;
2062 int ret = zio_checksum_error(zio, &zbc);
2063 if (ret != 0 && zbc.zbc_injected != 0)
2064 rm->rm_ecksuminjected = 1;
2070 * Generate the parity from the data columns. If we tried and were able to
2071 * read the parity without error, verify that the generated parity matches the
2072 * data we read. If it doesn't, we fire off a checksum error. Return the
2073 * number of such failures.
/*
 * Regenerate parity from the data columns and compare it against the
 * parity actually read, flagging mismatched columns with ECKSUM.
 * Skipped entirely for ZIO_CHECKSUM_NOPARITY (dump device) blocks.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
2076 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
2078 void *orig[VDEV_RAIDZ_MAXPARITY];
2082 blkptr_t *bp = zio->io_bp;
2083 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2084 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2086 if (checksum == ZIO_CHECKSUM_NOPARITY)
/* Save a copy of each parity column that was read successfully. */
2089 for (c = 0; c < rm->rm_firstdatacol; c++) {
2090 rc = &rm->rm_col[c];
2091 if (!rc->rc_tried || rc->rc_error != 0)
2093 orig[c] = zio_buf_alloc(rc->rc_size);
2094 abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
2097 vdev_raidz_generate_parity(rm);
/* Compare the regenerated parity with what the disk returned. */
2099 for (c = 0; c < rm->rm_firstdatacol; c++) {
2100 rc = &rm->rm_col[c];
2101 if (!rc->rc_tried || rc->rc_error != 0)
2103 if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
2104 raidz_checksum_error(zio, rc, orig[c]);
2105 rc->rc_error = SET_ERROR(ECKSUM);
2108 zio_buf_free(orig[c], rc->rc_size);
2115 * Keep statistics on all the ways that we used parity to correct data.
/* Indexed by the reconstruction code returned by vdev_raidz_reconstruct(). */
2117 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
/*
 * Return the most severe error recorded across all columns of the map,
 * as ranked by zio_worst_error().
 */
2120 vdev_raidz_worst_error(raidz_map_t *rm)
2124 for (int c = 0; c < rm->rm_cols; c++)
2125 error = zio_worst_error(error, rm->rm_col[c].rc_error);
2131 * Iterate over all combinations of bad data and attempt a reconstruction.
2132 * Note that the algorithm below is non-optimal because it doesn't take into
2133 * account how reconstruction is actually performed. For example, with
2134 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2135 * is targeted as invalid as if columns 1 and 4 are targeted since in both
2136 * cases we'd only use parity information in column 0.
/*
 * Combinatorial reconstruction: for n = 1 .. (parity - total_errors),
 * enumerate every combination of n columns as hypothetical silent-error
 * targets, attempt reconstruction, and keep the first result that passes
 * checksum verification.  Returns the reconstruction code on success.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
2139 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
2141 raidz_map_t *rm = zio->io_vsd;
2143 void *orig[VDEV_RAIDZ_MAXPARITY];
/* tstore has sentinel slots before and after the real target list. */
2144 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
2145 int *tgts = &tstore[1];
2146 int current, next, i, c, n;
2149 ASSERT(total_errors < rm->rm_firstdatacol);
2152 * This simplifies one edge condition.
2156 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
2158 * Initialize the targets array by finding the first n columns
2159 * that contain no error.
2161 * If there were no data errors, we need to ensure that we're
2162 * always explicitly attempting to reconstruct at least one
2163 * data column. To do this, we simply push the highest target
2164 * up into the data columns.
2166 for (c = 0, i = 0; i < n; i++) {
2167 if (i == n - 1 && data_errors == 0 &&
2168 c < rm->rm_firstdatacol) {
2169 c = rm->rm_firstdatacol;
2172 while (rm->rm_col[c].rc_error != 0) {
2174 ASSERT3S(c, <, rm->rm_cols);
2181 * Setting tgts[n] simplifies the other edge condition.
2183 tgts[n] = rm->rm_cols;
2186 * These buffers were allocated in previous iterations.
2188 for (i = 0; i < n - 1; i++) {
2189 ASSERT(orig[i] != NULL);
2192 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2195 next = tgts[current];
2197 while (current != n) {
2198 tgts[current] = next;
2202 * Save off the original data that we're going to
2203 * attempt to reconstruct.
2205 for (i = 0; i < n; i++) {
2206 ASSERT(orig[i] != NULL);
2209 ASSERT3S(c, <, rm->rm_cols);
2210 rc = &rm->rm_col[c];
2211 abd_copy_to_buf(orig[i], rc->rc_abd,
2216 * Attempt a reconstruction and exit the outer loop on
2219 code = vdev_raidz_reconstruct(rm, tgts, n);
2220 if (raidz_checksum_verify(zio) == 0) {
2221 atomic_inc_64(&raidz_corrected[code]);
/* Report a checksum error against each reconstructed column. */
2223 for (i = 0; i < n; i++) {
2225 rc = &rm->rm_col[c];
2226 ASSERT(rc->rc_error == 0);
2228 raidz_checksum_error(zio, rc,
2230 rc->rc_error = SET_ERROR(ECKSUM);
2238 * Restore the original data.
2240 for (i = 0; i < n; i++) {
2242 rc = &rm->rm_col[c];
2243 abd_copy_from_buf(rc->rc_abd, orig[i],
2249 * Find the next valid column after the current
2252 for (next = tgts[current] + 1;
2253 next < rm->rm_cols &&
2254 rm->rm_col[next].rc_error != 0; next++)
2257 ASSERT(next <= tgts[current + 1]);
2260 * If that spot is available, we're done here.
2262 if (next != tgts[current + 1])
2266 * Otherwise, find the next valid column after
2267 * the previous position.
2269 for (c = tgts[current - 1] + 1;
2270 rm->rm_col[c].rc_error != 0; c++)
2276 } while (current != n);
2281 for (i = 0; i < n; i++) {
2282 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2289 * Complete an IO operation on a RAIDZ VDev
2292 * - For write operations:
2293 * 1. Check for errors on the child IOs.
2294 * 2. Return, setting an error code if too few child VDevs were written
2295 * to reconstruct the data later. Note that partial writes are
2296 * considered successful if they can be reconstructed at all.
2297 * - For read operations:
2298 * 1. Check for errors on the child IOs.
2299 * 2. If data errors occurred:
2300 * a. Try to reassemble the data from the parity available.
2301 * b. If we haven't yet read the parity drives, read them now.
2302 * c. If all parity drives have been read but the data still doesn't
2303 * reassemble with a correct checksum, then try combinatorial
2305 * d. If that doesn't work, return an error.
2306 * 3. If there were unexpected errors or this is a resilver operation,
2307 * rewrite the vdevs that had errors.
/*
 * Completion handler for RAID-Z zios; see the block comment above for the
 * overall read/write flow.
 * NOTE(review): this chunk is shown with elisions; code reproduced verbatim.
 */
2310 vdev_raidz_io_done(zio_t *zio)
2312 vdev_t *vd = zio->io_vd;
2314 raidz_map_t *rm = zio->io_vsd;
2316 int unexpected_errors = 0;
2317 int parity_errors = 0;
2318 int parity_untried = 0;
2319 int data_errors = 0;
2320 int total_errors = 0;
2322 int tgts[VDEV_RAIDZ_MAXPARITY];
2325 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
2327 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2328 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
/* Tally parity/data errors and untried parity columns. */
2330 for (c = 0; c < rm->rm_cols; c++) {
2331 rc = &rm->rm_col[c];
2334 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
2336 if (c < rm->rm_firstdatacol)
2341 if (!rc->rc_skipped)
2342 unexpected_errors++;
2345 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2350 if (zio->io_type == ZIO_TYPE_WRITE) {
2352 * XXX -- for now, treat partial writes as a success.
2353 * (If we couldn't write enough columns to reconstruct
2354 * the data, the I/O failed. Otherwise, good enough.)
2356 * Now that we support write reallocation, it would be better
2357 * to treat partial failure as real failure unless there are
2358 * no non-degraded top-level vdevs left, and not update DTLs
2359 * if we intend to reallocate.
2362 if (total_errors > rm->rm_firstdatacol)
2363 zio->io_error = vdev_raidz_worst_error(rm);
2366 } else if (zio->io_type == ZIO_TYPE_FREE) {
2370 ASSERT(zio->io_type == ZIO_TYPE_READ);
2372 * There are three potential phases for a read:
2373 * 1. produce valid data from the columns read
2374 * 2. read all disks and try again
2375 * 3. perform combinatorial reconstruction
2377 * Each phase is progressively both more expensive and less likely to
2378 * occur. If we encounter more errors than we can repair or all phases
2379 * fail, we have no choice but to return an error.
2383 * If the number of errors we saw was correctable -- less than or equal
2384 * to the number of parity disks read -- attempt to produce data that
2385 * has a valid checksum. Naturally, this case applies in the absence of
2388 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2389 if (data_errors == 0) {
2390 if (raidz_checksum_verify(zio) == 0) {
2392 * If we read parity information (unnecessarily
2393 * as it happens since no reconstruction was
2394 * needed) regenerate and verify the parity.
2395 * We also regenerate parity when resilvering
2396 * so we can write it out to the failed device
2399 if (parity_errors + parity_untried <
2400 rm->rm_firstdatacol ||
2401 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2402 n = raidz_parity_verify(zio, rm);
2403 unexpected_errors += n;
2404 ASSERT(parity_errors + n <=
2405 rm->rm_firstdatacol);
2411 * We either attempt to read all the parity columns or
2412 * none of them. If we didn't try to read parity, we
2413 * wouldn't be here in the correctable case. There must
2414 * also have been fewer parity errors than parity
2415 * columns or, again, we wouldn't be in this code path.
2417 ASSERT(parity_untried == 0);
2418 ASSERT(parity_errors < rm->rm_firstdatacol);
2421 * Identify the data columns that reported an error.
2424 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2425 rc = &rm->rm_col[c];
2426 if (rc->rc_error != 0) {
2427 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2432 ASSERT(rm->rm_firstdatacol >= n);
2434 code = vdev_raidz_reconstruct(rm, tgts, n);
2436 if (raidz_checksum_verify(zio) == 0) {
2437 atomic_inc_64(&raidz_corrected[code]);
2440 * If we read more parity disks than were used
2441 * for reconstruction, confirm that the other
2442 * parity disks produced correct data. This
2443 * routine is suboptimal in that it regenerates
2444 * the parity that we already used in addition
2445 * to the parity that we're attempting to
2446 * verify, but this should be a relatively
2447 * uncommon case, and can be optimized if it
2448 * becomes a problem. Note that we regenerate
2449 * parity when resilvering so we can write it
2450 * out to failed devices later.
2452 if (parity_errors < rm->rm_firstdatacol - n ||
2453 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2454 n = raidz_parity_verify(zio, rm);
2455 unexpected_errors += n;
2456 ASSERT(parity_errors + n <=
2457 rm->rm_firstdatacol);
2466 * This isn't a typical situation -- either we got a read error or
2467 * a child silently returned bad data. Read every block so we can
2468 * try again with as much data and parity as we can track down. If
2469 * we've already been through once before, all children will be marked
2470 * as tried so we'll proceed to combinatorial reconstruction.
2472 unexpected_errors = 1;
2473 rm->rm_missingdata = 0;
2474 rm->rm_missingparity = 0;
2476 for (c = 0; c < rm->rm_cols; c++) {
2477 if (rm->rm_col[c].rc_tried)
2480 zio_vdev_io_redone(zio);
2482 rc = &rm->rm_col[c];
2485 zio_nowait(zio_vdev_child_io(zio, NULL,
2486 vd->vdev_child[rc->rc_devidx],
2487 rc->rc_offset, rc->rc_abd, rc->rc_size,
2488 zio->io_type, zio->io_priority, 0,
2489 vdev_raidz_child_done, rc));
2490 } while (++c < rm->rm_cols);
2496 * At this point we've attempted to reconstruct the data given the
2497 * errors we detected, and we've attempted to read all columns. There
2498 * must, therefore, be one or more additional problems -- silent errors
2499 * resulting in invalid data rather than explicit I/O errors resulting
2500 * in absent data. We check if there is enough additional data to
2501 * possibly reconstruct the data and then perform combinatorial
2502 * reconstruction over all possible combinations. If that fails,
2505 if (total_errors > rm->rm_firstdatacol) {
2506 zio->io_error = vdev_raidz_worst_error(rm);
2508 } else if (total_errors < rm->rm_firstdatacol &&
2509 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2511 * If we didn't use all the available parity for the
2512 * combinatorial reconstruction, verify that the remaining
2513 * parity is correct.
2515 if (code != (1 << rm->rm_firstdatacol) - 1)
2516 (void) raidz_parity_verify(zio, rm);
2519 * We're here because either:
2521 * total_errors == rm_firstdatacol, or
2522 * vdev_raidz_combrec() failed
2524 * In either case, there is enough bad data to prevent
2527 * Start checksum ereports for all children which haven't
2528 * failed, and the IO wasn't speculative.
2530 zio->io_error = SET_ERROR(ECKSUM);
2532 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2533 for (c = 0; c < rm->rm_cols; c++) {
2534 rc = &rm->rm_col[c];
2535 if (rc->rc_error == 0) {
2536 zio_bad_cksum_t zbc;
2537 zbc.zbc_has_cksum = 0;
2539 rm->rm_ecksuminjected;
2541 zfs_ereport_start_checksum(
2543 vd->vdev_child[rc->rc_devidx],
2544 zio, rc->rc_offset, rc->rc_size,
2545 (void *)(uintptr_t)c, &zbc);
2552 zio_checksum_verified(zio);
/* Self-heal: rewrite any columns that reported errors. */
2554 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2555 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2557 * Use the good data we have in hand to repair damaged children.
2559 for (c = 0; c < rm->rm_cols; c++) {
2560 rc = &rm->rm_col[c];
2561 cvd = vd->vdev_child[rc->rc_devidx];
2563 if (rc->rc_error == 0)
2566 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2567 rc->rc_offset, rc->rc_abd, rc->rc_size,
2568 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2569 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2570 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2576 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2578 if (faulted > vd->vdev_nparity)
2579 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2580 VDEV_AUX_NO_REPLICAS);
2581 else if (degraded + faulted != 0)
2582 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2584 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2587 vdev_ops_t vdev_raidz_ops = {
2591 vdev_raidz_io_start,
2593 vdev_raidz_state_change,
2597 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2598 B_FALSE /* not a leaf vdev */