4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
29 #include <sys/zfs_context.h>
31 #include <sys/vdev_impl.h>
33 #include <sys/vdev_disk.h>
35 #include <sys/vdev_file.h>
36 #include <sys/vdev_raidz.h>
38 #include <sys/zio_checksum.h>
40 #include <sys/fs/zfs.h>
41 #include <sys/fm/fs/zfs.h>
45 #include <sys/vdev_initialize.h> /* vdev_xlate testing */
49 * Virtual device vector for RAID-Z.
51 * This vdev supports single, double, and triple parity. For single parity,
52 * we use a simple XOR of all the data columns. For double or triple parity,
53 * we use a special case of Reed-Solomon coding. This extends the
54 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
55 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
56 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
57 * former is also based. The latter is designed to provide higher performance
58 * for writes.
60 * Note that the Plank paper claimed to support arbitrary N+M, but was then
61 * amended six years later identifying a critical flaw that invalidates its
62 * claims. Nevertheless, the technique can be adapted to work for up to
63 * triple parity. For additional parity, the amendment "Note: Correction to
64 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
65 * is viable, but the additional complexity means that write performance will
66 * suffer.
68 * All of the methods above operate on a Galois field, defined over the
69 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
70 * can be expressed with a single byte. Briefly, the operations on the
71 * field are defined as follows:
73 * o addition (+) is represented by a bitwise XOR
74 * o subtraction (-) is therefore identical to addition: A + B = A - B
75 * o multiplication of A by 2 is defined by the following bitwise expression:
77 * (A * 2)_7 = A_6
78 * (A * 2)_6 = A_5
79 * (A * 2)_5 = A_4
80 * (A * 2)_4 = A_3 + A_7
81 * (A * 2)_3 = A_2 + A_7
82 * (A * 2)_2 = A_1 + A_7
83 * (A * 2)_1 = A_0
84 * (A * 2)_0 = A_7
86 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
87 * As an aside, this multiplication is derived from the error correcting
88 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
90 * Observe that any number in the field (except for 0) can be expressed as a
91 * power of 2 -- a generator for the field. We store a table of the powers of
92 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
93 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
94 * than field addition). The inverse of a field element A (A^-1) is therefore
95 * A ^ (255 - 1) = A^254.
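 * For example (our illustration, using the tables defined later in this
 * file): 3 * 5 = 2^(log_2(3) + log_2(5)) = 2^(25 + 50) =
 * vdev_raidz_pow2[75] = 0x0f = 15, which matches multiplying the
 * polynomials (x + 1)(x^2 + 1) = x^3 + x^2 + x + 1 over GF(2).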
97 * The up-to-three parity columns, P, Q, R over several data columns,
98 * D_0, ... D_n-1, can be expressed by field operations:
100 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
101 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
102 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
103 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
104 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
106 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
107 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
108 * independent coefficients. (There are no additional coefficients that have
109 * this property, which is why the uncorrected Plank method breaks down.)
111 * See the reconstruction code below for how P, Q and R can be used individually
112 * or in concert to recover missing data columns.
115 typedef struct raidz_col {
116 uint64_t rc_devidx; /* child device index for I/O */
117 uint64_t rc_offset; /* device offset */
118 uint64_t rc_size; /* I/O size */
119 abd_t *rc_abd; /* I/O data */
120 void *rc_gdata; /* used to store the "good" version */
121 int rc_error; /* I/O error for this device */
122 uint8_t rc_tried; /* Did we attempt this I/O column? */
123 uint8_t rc_skipped; /* Did we skip this I/O column? */
126 typedef struct raidz_map {
127 uint64_t rm_cols; /* Regular column count */
128 uint64_t rm_scols; /* Count including skipped columns */
129 uint64_t rm_bigcols; /* Number of oversized columns */
130 uint64_t rm_asize; /* Actual total I/O size */
131 uint64_t rm_missingdata; /* Count of missing data devices */
132 uint64_t rm_missingparity; /* Count of missing parity devices */
133 uint64_t rm_firstdatacol; /* First data column/parity count */
134 uint64_t rm_nskip; /* Skipped sectors for padding */
135 uint64_t rm_skipstart; /* Column index of padding start */
136 abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
137 uintptr_t rm_reports; /* # of referencing checksum reports */
138 uint8_t rm_freed; /* map no longer has referencing ZIO */
139 uint8_t rm_ecksuminjected; /* checksum error was injected */
140 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
143 #define VDEV_RAIDZ_P 0
144 #define VDEV_RAIDZ_Q 1
145 #define VDEV_RAIDZ_R 2
147 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
148 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
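/*
 * Illustrative sketch, not part of the driver (compiled out): one byte of
 * P, Q and R computed over n data bytes d[0..n-1] using the Horner-style
 * recurrences from the block comment at the top of this file. The helper
 * name is ours.
 */
#ifdef VDEV_RAIDZ_EXAMPLES
static void
vdev_raidz_pqr_byte(const uint8_t *d, int n, uint8_t *p, uint8_t *q,
    uint8_t *r)
{
	*p = *q = *r = 0;
	for (int i = 0; i < n; i++) {
		*p ^= d[i];				/* P: plain XOR */
		*q = VDEV_RAIDZ_MUL_2(*q) ^ d[i];	/* Q: shift in base 2 */
		*r = VDEV_RAIDZ_MUL_4(*r) ^ d[i];	/* R: shift in base 4 */
	}
}
#endif	/* VDEV_RAIDZ_EXAMPLES */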
151 * We provide a mechanism to perform the field multiplication operation on a
152 * 64-bit value all at once rather than a byte at a time. This works by
153 * creating a mask from the top bit in each byte and using that to
154 * conditionally apply the XOR of 0x1d.
156 #define VDEV_RAIDZ_64MUL_2(x, mask) \
158 (mask) = (x) & 0x8080808080808080ULL; \
159 (mask) = ((mask) << 1) - ((mask) >> 7); \
160 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
161 ((mask) & 0x1d1d1d1d1d1d1d1d); \
164 #define VDEV_RAIDZ_64MUL_4(x, mask) \
166 VDEV_RAIDZ_64MUL_2((x), mask); \
167 VDEV_RAIDZ_64MUL_2((x), mask); \
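/*
 * Illustrative sketch, not part of the driver (compiled out): the 64-bit
 * SWAR form above is equivalent to applying VDEV_RAIDZ_MUL_2() to each of
 * the eight bytes independently. The helper name is ours.
 */
#ifdef VDEV_RAIDZ_EXAMPLES
static uint64_t
vdev_raidz_64mul_2_bytewise(uint64_t x)
{
	uint64_t out = 0;

	for (int i = 0; i < 8; i++) {
		uint8_t b = (x >> (i * 8)) & 0xff;
		out |= (uint64_t)(VDEV_RAIDZ_MUL_2(b) & 0xff) << (i * 8);
	}
	/* out == x after VDEV_RAIDZ_64MUL_2(x, mask) */
	return (out);
}
#endif	/* VDEV_RAIDZ_EXAMPLES */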
170 #define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE)
173 * Force reconstruction to use the general purpose method.
175 int vdev_raidz_default_to_general;
177 /* Powers of 2 in the Galois field defined above. */
178 static const uint8_t vdev_raidz_pow2[256] = {
179 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
180 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
181 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
182 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
183 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
184 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
185 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
186 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
187 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
188 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
189 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
190 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
191 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
192 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
193 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
194 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
195 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
196 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
197 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
198 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
199 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
200 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
201 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
202 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
203 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
204 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
205 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
206 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
207 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
208 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
209 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
210 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
212 /* Logs of 2 in the Galois field defined above. */
213 static const uint8_t vdev_raidz_log2[256] = {
214 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
215 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
216 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
217 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
218 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
219 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
220 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
221 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
222 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
223 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
224 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
225 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
226 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
227 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
228 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
229 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
230 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
231 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
232 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
233 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
234 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
235 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
236 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
237 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
238 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
239 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
240 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
241 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
242 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
243 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
244 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
245 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
248 static void vdev_raidz_generate_parity(raidz_map_t *rm);
251 * Multiply a given number by 2 raised to the given power.
254 vdev_raidz_exp2(uint_t a, int exp)
260 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
262 exp += vdev_raidz_log2[a];
266 return (vdev_raidz_pow2[exp]);
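/*
 * Illustrative sketch, not part of the driver (compiled out): general
 * GF(2^8) multiplication of two field elements via the log/exp tables,
 * i.e. A * B = 2^(log_2(A) + log_2(B)). The helper name is ours.
 */
#ifdef VDEV_RAIDZ_EXAMPLES
static uint8_t
vdev_raidz_gf_mul(uint8_t a, uint8_t b)
{
	if (a == 0 || b == 0)
		return (0);
	return (vdev_raidz_exp2(a, vdev_raidz_log2[b]));
}
#endif	/* VDEV_RAIDZ_EXAMPLES */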
270 vdev_raidz_map_free(raidz_map_t *rm)
275 for (c = 0; c < rm->rm_firstdatacol; c++) {
276 if (rm->rm_col[c].rc_abd != NULL)
277 abd_free(rm->rm_col[c].rc_abd);
279 if (rm->rm_col[c].rc_gdata != NULL)
280 zio_buf_free(rm->rm_col[c].rc_gdata,
281 rm->rm_col[c].rc_size);
285 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
286 if (rm->rm_col[c].rc_abd != NULL)
287 abd_put(rm->rm_col[c].rc_abd);
288 size += rm->rm_col[c].rc_size;
291 if (rm->rm_abd_copy != NULL)
292 abd_free(rm->rm_abd_copy);
294 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
298 vdev_raidz_map_free_vsd(zio_t *zio)
300 raidz_map_t *rm = zio->io_vsd;
302 ASSERT0(rm->rm_freed);
305 if (rm->rm_reports == 0)
306 vdev_raidz_map_free(rm);
311 vdev_raidz_cksum_free(void *arg, size_t ignored)
313 raidz_map_t *rm = arg;
315 ASSERT3U(rm->rm_reports, >, 0);
317 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
318 vdev_raidz_map_free(rm);
322 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
324 raidz_map_t *rm = zcr->zcr_cbdata;
325 size_t c = zcr->zcr_cbinfo;
328 const char *good = NULL;
331 if (good_data == NULL) {
332 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
336 if (c < rm->rm_firstdatacol) {
338 * The first time through, calculate the parity blocks for
339 * the good data (this relies on the fact that the good
340 * data never changes for a given logical ZIO)
342 if (rm->rm_col[0].rc_gdata == NULL) {
343 abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
348 * Set up the rm_col[]s to generate the parity for
349 * good_data, first saving the parity bufs and
350 * replacing them with buffers to hold the result.
352 for (x = 0; x < rm->rm_firstdatacol; x++) {
353 bad_parity[x] = rm->rm_col[x].rc_abd;
354 rm->rm_col[x].rc_gdata =
355 zio_buf_alloc(rm->rm_col[x].rc_size);
356 rm->rm_col[x].rc_abd =
357 abd_get_from_buf(rm->rm_col[x].rc_gdata,
358 rm->rm_col[x].rc_size);
361 /* fill in the data columns from good_data */
362 buf = (char *)good_data;
363 for (; x < rm->rm_cols; x++) {
364 abd_put(rm->rm_col[x].rc_abd);
365 rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
366 rm->rm_col[x].rc_size);
367 buf += rm->rm_col[x].rc_size;
371 * Construct the parity from the good data.
373 vdev_raidz_generate_parity(rm);
375 /* restore everything back to its original state */
376 for (x = 0; x < rm->rm_firstdatacol; x++) {
377 abd_put(rm->rm_col[x].rc_abd);
378 rm->rm_col[x].rc_abd = bad_parity[x];
382 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
383 abd_put(rm->rm_col[x].rc_abd);
384 rm->rm_col[x].rc_abd = abd_get_offset(
385 rm->rm_abd_copy, offset);
386 offset += rm->rm_col[x].rc_size;
390 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
391 good = rm->rm_col[c].rc_gdata;
393 /* adjust good_data to point at the start of our column */
396 for (x = rm->rm_firstdatacol; x < c; x++)
397 good += rm->rm_col[x].rc_size;
400 bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
401 /* we drop the ereport if it ends up that the data was good */
402 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
403 abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
407 * Invoked indirectly by zfs_ereport_start_checksum(), called
408 * below when our read operation fails completely. The main point
409 * is to keep a copy of everything we read from disk, so that at
410 * vdev_raidz_cksum_finish() time we can compare it with the good data.
413 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
415 size_t c = (size_t)(uintptr_t)arg;
418 raidz_map_t *rm = zio->io_vsd;
421 /* set up the report and bump the refcount */
422 zcr->zcr_cbdata = rm;
424 zcr->zcr_finish = vdev_raidz_cksum_finish;
425 zcr->zcr_free = vdev_raidz_cksum_free;
428 ASSERT3U(rm->rm_reports, >, 0);
430 if (rm->rm_abd_copy != NULL)
434 * It's the first time we're called for this raidz_map_t, so we need
435 * to copy the data aside; there's no guarantee that our zio's buffer
436 * won't be re-used for something else.
438 * Our parity data is already in separate buffers, so there's no need
439 * to copy them.
443 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
444 size += rm->rm_col[c].rc_size;
447 abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
449 for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
450 raidz_col_t *col = &rm->rm_col[c];
451 abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
453 abd_copy(tmp, col->rc_abd, col->rc_size);
454 abd_put(col->rc_abd);
457 offset += col->rc_size;
459 ASSERT3U(offset, ==, size);
462 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
463 vdev_raidz_map_free_vsd,
464 vdev_raidz_cksum_report
468 * Divides the IO evenly across all child vdevs; usually, dcols is
469 * the number of children in the target vdev.
472 vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree,
473 uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
476 /* The starting RAIDZ (parent) vdev sector of the block. */
477 uint64_t b = offset >> unit_shift;
478 /* The zio's size in units of the vdev's minimum sector size. */
479 uint64_t s = size >> unit_shift;
480 /* The first column for this stripe. */
481 uint64_t f = b % dcols;
482 /* The starting byte offset on each child vdev. */
483 uint64_t o = (b / dcols) << unit_shift;
484 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
488 * "Quotient": The number of data sectors for this stripe on all but
489 * the "big column" child vdevs that also contain "remainder" data.
491 q = s / (dcols - nparity);
494 * "Remainder": The number of partial stripe data sectors in this I/O.
495 * This will add a sector to some, but not all, child vdevs.
497 r = s - q * (dcols - nparity);
499 /* The number of "big columns" - those which contain remainder data. */
500 bc = (r == 0 ? 0 : r + nparity);
503 * The total number of data and parity sectors associated with
506 tot = s + nparity * (q + (r == 0 ? 0 : 1));
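	/*
	 * Worked example (ours, for illustration): a 32 KB write to a
	 * 5-wide raidz1 with unit_shift = 12 (4 KB sectors) gives
	 * s = 8, q = 8 / (5 - 1) = 2, r = 0, bc = 0 and
	 * tot = 8 + 1 * (2 + 0) = 10: two full rows, each holding one
	 * parity sector and four data sectors.
	 */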
508 /* acols: The columns that will be accessed. */
509 /* scols: The columns that will be accessed or skipped. */
511 /* Our I/O request doesn't span all child vdevs. */
513 scols = MIN(dcols, roundup(bc, nparity + 1));
519 ASSERT3U(acols, <=, scols);
521 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
524 rm->rm_scols = scols;
526 rm->rm_skipstart = bc;
527 rm->rm_missingdata = 0;
528 rm->rm_missingparity = 0;
529 rm->rm_firstdatacol = nparity;
530 rm->rm_abd_copy = NULL;
533 rm->rm_ecksuminjected = 0;
537 for (c = 0; c < scols; c++) {
542 coff += 1ULL << unit_shift;
544 rm->rm_col[c].rc_devidx = col;
545 rm->rm_col[c].rc_offset = coff;
546 rm->rm_col[c].rc_abd = NULL;
547 rm->rm_col[c].rc_gdata = NULL;
548 rm->rm_col[c].rc_error = 0;
549 rm->rm_col[c].rc_tried = 0;
550 rm->rm_col[c].rc_skipped = 0;
553 rm->rm_col[c].rc_size = 0;
555 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
557 rm->rm_col[c].rc_size = q << unit_shift;
559 asize += rm->rm_col[c].rc_size;
562 ASSERT3U(asize, ==, tot << unit_shift);
563 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
564 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
565 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
566 ASSERT3U(rm->rm_nskip, <=, nparity);
569 for (c = 0; c < rm->rm_firstdatacol; c++) {
570 rm->rm_col[c].rc_abd =
571 abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
574 rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
575 off = rm->rm_col[c].rc_size;
577 for (c = c + 1; c < acols; c++) {
578 rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
579 off += rm->rm_col[c].rc_size;
584 * If all data stored spans all columns, there's a danger that parity
585 * will always be on the same device and, since parity isn't read
586 * during normal operation, that that device's I/O bandwidth won't be
587 * used effectively. We therefore switch the parity every 1MB.
589 * ... at least that was, ostensibly, the theory. As a practical
590 * matter unless we juggle the parity between all devices evenly, we
591 * won't see any benefit. Further, occasional writes that aren't a
592 * multiple of the LCM of the number of children and the minimum
593 * stripe width are sufficient to avoid pessimal behavior.
594 * Unfortunately, this decision created an implicit on-disk format
595 * requirement that we need to support for all eternity, but only
596 * for single-parity RAID-Z.
598 * If we intend to skip a sector in the zeroth column for padding
599 * we must make sure to note this swap. We will never intend to
600 * skip the first column since at least one data and one parity
601 * column must appear in each row.
603 ASSERT(rm->rm_cols >= 2);
604 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
606 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
607 devidx = rm->rm_col[0].rc_devidx;
608 o = rm->rm_col[0].rc_offset;
609 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
610 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
611 rm->rm_col[1].rc_devidx = devidx;
612 rm->rm_col[1].rc_offset = o;
614 if (rm->rm_skipstart == 0)
615 rm->rm_skipstart = 1;
628 vdev_raidz_p_func(void *buf, size_t size, void *private)
630 struct pqr_struct *pqr = private;
631 const uint64_t *src = buf;
632 int i, cnt = size / sizeof (src[0]);
634 ASSERT(pqr->p && !pqr->q && !pqr->r);
636 for (i = 0; i < cnt; i++, src++, pqr->p++)
643 vdev_raidz_pq_func(void *buf, size_t size, void *private)
645 struct pqr_struct *pqr = private;
646 const uint64_t *src = buf;
648 int i, cnt = size / sizeof (src[0]);
650 ASSERT(pqr->p && pqr->q && !pqr->r);
652 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
654 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
662 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
664 struct pqr_struct *pqr = private;
665 const uint64_t *src = buf;
667 int i, cnt = size / sizeof (src[0]);
669 ASSERT(pqr->p && pqr->q && pqr->r);
671 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
673 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
675 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
683 vdev_raidz_generate_parity_p(raidz_map_t *rm)
689 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
690 src = rm->rm_col[c].rc_abd;
691 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
693 if (c == rm->rm_firstdatacol) {
694 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
696 struct pqr_struct pqr = { p, NULL, NULL };
697 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
698 vdev_raidz_p_func, &pqr);
704 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
706 uint64_t *p, *q, pcnt, ccnt, mask, i;
710 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
711 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
712 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
714 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
715 src = rm->rm_col[c].rc_abd;
716 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
717 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
719 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
721 if (c == rm->rm_firstdatacol) {
722 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
723 (void) memcpy(q, p, rm->rm_col[c].rc_size);
725 struct pqr_struct pqr = { p, q, NULL };
726 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
727 vdev_raidz_pq_func, &pqr);
730 if (c == rm->rm_firstdatacol) {
731 for (i = ccnt; i < pcnt; i++) {
737 * Treat short columns as though they are full of 0s.
738 * Note that there's therefore nothing needed for P.
740 for (i = ccnt; i < pcnt; i++) {
741 VDEV_RAIDZ_64MUL_2(q[i], mask);
748 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
750 uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
754 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
755 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
756 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
757 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
758 rm->rm_col[VDEV_RAIDZ_R].rc_size);
760 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
761 src = rm->rm_col[c].rc_abd;
762 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
763 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
764 r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
766 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
768 if (c == rm->rm_firstdatacol) {
769 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
770 (void) memcpy(q, p, rm->rm_col[c].rc_size);
771 (void) memcpy(r, p, rm->rm_col[c].rc_size);
773 struct pqr_struct pqr = { p, q, r };
774 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
775 vdev_raidz_pqr_func, &pqr);
778 if (c == rm->rm_firstdatacol) {
779 for (i = ccnt; i < pcnt; i++) {
786 * Treat short columns as though they are full of 0s.
787 * Note that there's therefore nothing needed for P.
789 for (i = ccnt; i < pcnt; i++) {
790 VDEV_RAIDZ_64MUL_2(q[i], mask);
791 VDEV_RAIDZ_64MUL_4(r[i], mask);
798 * Generate RAID parity in the first virtual columns according to the number of
799 * parity columns available.
802 vdev_raidz_generate_parity(raidz_map_t *rm)
804 switch (rm->rm_firstdatacol) {
806 vdev_raidz_generate_parity_p(rm);
809 vdev_raidz_generate_parity_pq(rm);
812 vdev_raidz_generate_parity_pqr(rm);
815 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
821 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
823 uint64_t *dst = dbuf;
824 uint64_t *src = sbuf;
825 int cnt = size / sizeof (src[0]);
827 for (int i = 0; i < cnt; i++) {
836 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
839 uint64_t *dst = dbuf;
840 uint64_t *src = sbuf;
842 int cnt = size / sizeof (dst[0]);
844 for (int i = 0; i < cnt; i++, dst++, src++) {
845 VDEV_RAIDZ_64MUL_2(*dst, mask);
854 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
858 int cnt = size / sizeof (dst[0]);
860 for (int i = 0; i < cnt; i++, dst++) {
861 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
862 VDEV_RAIDZ_64MUL_2(*dst, mask);
868 struct reconst_q_struct {
874 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
876 struct reconst_q_struct *rq = private;
878 int cnt = size / sizeof (dst[0]);
880 for (int i = 0; i < cnt; i++, dst++, rq->q++) {
885 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
886 *b = vdev_raidz_exp2(*b, rq->exp);
893 struct reconst_pq_struct {
903 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
905 struct reconst_pq_struct *rpq = private;
909 for (int i = 0; i < size;
910 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
911 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
912 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
913 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
920 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
922 struct reconst_pq_struct *rpq = private;
925 for (int i = 0; i < size;
926 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
927 /* same operation as vdev_raidz_reconst_pq_func() on xd */
928 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
929 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
936 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
943 ASSERT(x >= rm->rm_firstdatacol);
944 ASSERT(x < rm->rm_cols);
946 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
947 ASSERT(rm->rm_col[x].rc_size > 0);
949 src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
950 dst = rm->rm_col[x].rc_abd;
952 abd_copy(dst, src, rm->rm_col[x].rc_size);
954 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
955 uint64_t size = MIN(rm->rm_col[x].rc_size,
956 rm->rm_col[c].rc_size);
958 src = rm->rm_col[c].rc_abd;
959 dst = rm->rm_col[x].rc_abd;
964 (void) abd_iterate_func2(dst, src, 0, 0, size,
965 vdev_raidz_reconst_p_func, NULL);
968 return (1 << VDEV_RAIDZ_P);
972 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
980 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
982 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
983 uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
984 rm->rm_col[c].rc_size);
986 src = rm->rm_col[c].rc_abd;
987 dst = rm->rm_col[x].rc_abd;
989 if (c == rm->rm_firstdatacol) {
990 abd_copy(dst, src, size);
991 if (rm->rm_col[x].rc_size > size)
992 abd_zero_off(dst, size,
993 rm->rm_col[x].rc_size - size);
995 ASSERT3U(size, <=, rm->rm_col[x].rc_size);
996 (void) abd_iterate_func2(dst, src, 0, 0, size,
997 vdev_raidz_reconst_q_pre_func, NULL);
998 (void) abd_iterate_func(dst,
999 size, rm->rm_col[x].rc_size - size,
1000 vdev_raidz_reconst_q_pre_tail_func, NULL);
1004 src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1005 dst = rm->rm_col[x].rc_abd;
1006 exp = 255 - (rm->rm_cols - 1 - x);
1008 struct reconst_q_struct rq = { abd_to_buf(src), exp };
1009 (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
1010 vdev_raidz_reconst_q_post_func, &rq);
1012 return (1 << VDEV_RAIDZ_Q);
1016 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
1018 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1019 abd_t *pdata, *qdata;
1020 uint64_t xsize, ysize;
1027 ASSERT(x >= rm->rm_firstdatacol);
1028 ASSERT(y < rm->rm_cols);
1030 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
1033 * Move the parity data aside -- we're going to compute parity as
1034 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1035 * reuse the parity generation mechanism without trashing the actual
1036 * parity so we make those columns appear to be full of zeros by
1037 * setting their lengths to zero.
1039 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
1040 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1041 xsize = rm->rm_col[x].rc_size;
1042 ysize = rm->rm_col[y].rc_size;
1044 rm->rm_col[VDEV_RAIDZ_P].rc_abd =
1045 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1046 rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
1047 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1048 rm->rm_col[x].rc_size = 0;
1049 rm->rm_col[y].rc_size = 0;
1051 vdev_raidz_generate_parity_pq(rm);
1053 rm->rm_col[x].rc_size = xsize;
1054 rm->rm_col[y].rc_size = ysize;
1056 p = abd_to_buf(pdata);
1057 q = abd_to_buf(qdata);
1058 pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1059 qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1060 xd = rm->rm_col[x].rc_abd;
1061 yd = rm->rm_col[y].rc_abd;
1065 * Pxy = P + D_x + D_y
1066 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1068 * We can then solve for D_x:
1069 * D_x = A * (P + Pxy) + B * (Q + Qxy)
1071 * A = 2^(x - y) * (2^(x - y) + 1)^-1
1072 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1074 * With D_x in hand, we can easily solve for D_y:
1075 * D_y = P + Pxy + D_x
1078 a = vdev_raidz_pow2[255 + x - y];
1079 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
1080 tmp = 255 - vdev_raidz_log2[a ^ 1];
1082 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1083 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1085 ASSERT3U(xsize, >=, ysize);
1086 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1087 (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1088 vdev_raidz_reconst_pq_func, &rpq);
1089 (void) abd_iterate_func(xd, ysize, xsize - ysize,
1090 vdev_raidz_reconst_pq_tail_func, &rpq);
1092 abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1093 abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1096 * Restore the saved parity data.
1098 rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
1099 rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1101 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
1106 * In the general case of reconstruction, we must solve the system of linear
1107 * equations defined by the coefficients used to generate parity as well as
1108 * the contents of the data and parity disks. This can be expressed with
1109 * vectors for the original data (D) and the actual data (d) and parity (p)
1110 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1114 * | V | | D_0 | | p_m-1 |
1115 * | | x | : | = | d_0 |
1116 * | I | | D_n-1 | | : |
1117 * | | ~~ ~~ | d_n-1 |
1120 * I is simply a square identity matrix of size n, and V is a Vandermonde
1121 * matrix defined by the coefficients we chose for the various parity columns
1122 * (1, 2, 4). Note that these values were chosen for simplicity and speed of
1123 * computation, as well as for linear separability.
1126 * | 1 .. 1 1 1 | | p_0 |
1127 * | 2^n-1 .. 4 2 1 | __ __ | : |
1128 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
1129 * | 1 .. 0 0 0 | | D_1 | | d_0 |
1130 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
1131 * | : : : : | | : | | d_2 |
1132 * | 0 .. 1 0 0 | | D_n-1 | | : |
1133 * | 0 .. 0 1 0 | ~~ ~~ | : |
1134 * | 0 .. 0 0 1 | | d_n-1 |
1137 * Note that I, V, d, and p are known. To compute D, we must invert the
1138 * matrix and use the known data and parity values to reconstruct the unknown
1139 * data values. We begin by removing the rows in V|I and d|p that correspond
1140 * to failed or missing columns; we then make V|I square (n x n) and d|p
1141 * sized n by removing rows corresponding to unused parity from the bottom up
1142 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1143 * using Gauss-Jordan elimination. In the example below we use m=3 parity
1144 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1146 * | 1 1 1 1 1 1 1 1 |
1147 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
1148 * | 19 205 116 29 64 16 4 1 | / /
1149 * | 1 0 0 0 0 0 0 0 | / /
1150 * | 0 1 0 0 0 0 0 0 | <--' /
1151 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
1152 * | 0 0 0 1 0 0 0 0 |
1153 * | 0 0 0 0 1 0 0 0 |
1154 * | 0 0 0 0 0 1 0 0 |
1155 * | 0 0 0 0 0 0 1 0 |
1156 * | 0 0 0 0 0 0 0 1 |
1159 * | 1 1 1 1 1 1 1 1 |
1160 * | 19 205 116 29 64 16 4 1 |
1161 * | 1 0 0 0 0 0 0 0 |
1162 * (V|I)' = | 0 0 0 1 0 0 0 0 |
1163 * | 0 0 0 0 1 0 0 0 |
1164 * | 0 0 0 0 0 1 0 0 |
1165 * | 0 0 0 0 0 0 1 0 |
1166 * | 0 0 0 0 0 0 0 1 |
1169 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1170 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1171 * matrix is not singular.
1173 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1174 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1175 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1176 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1177 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1178 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1179 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1180 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1183 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1184 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1185 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1186 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1187 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1188 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1189 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1190 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1193 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1194 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1195 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1196 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1197 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1198 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1199 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1200 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1203 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1204 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1205 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1206 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1207 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1208 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1209 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1210 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1213 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1214 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1215 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1216 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1217 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1218 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1219 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1220 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1223 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1224 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1225 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1226 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1227 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1228 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1229 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1230 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1233 * | 0 0 1 0 0 0 0 0 |
1234 * | 167 100 5 41 159 169 217 208 |
1235 * | 166 100 4 40 158 168 216 209 |
1236 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1237 * | 0 0 0 0 1 0 0 0 |
1238 * | 0 0 0 0 0 1 0 0 |
1239 * | 0 0 0 0 0 0 1 0 |
1240 * | 0 0 0 0 0 0 0 1 |
1243 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1244 * of the missing data.
1246 * As is apparent from the example above, the only non-trivial rows in the
1247 * inverse matrix correspond to the data disks that we're trying to
1248 * reconstruct. Indeed, those are the only rows we need as the others would
1249 * only be useful for reconstructing data known or assumed to be valid. For
1250 * that reason, we only build the coefficients in the rows that correspond to
1251 * targeted columns.
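 * As a quick sanity check of the example rows above (ours): with n = 8
 * data columns the R row is 4^7 .. 4^0 = 2^14 .. 2^0, and
 * vdev_raidz_pow2[14] = 0x13 = 19, vdev_raidz_pow2[12] = 0xcd = 205 and
 * vdev_raidz_pow2[10] = 0x74 = 116, matching the leading entries
 * | 19 205 116 ... | shown above.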
1256 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1262 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1265 * Fill in the missing rows of interest.
1267 for (i = 0; i < nmap; i++) {
1268 ASSERT3S(0, <=, map[i]);
1269 ASSERT3S(map[i], <=, 2);
1276 for (j = 0; j < n; j++) {
1280 rows[i][j] = vdev_raidz_pow2[pow];
1286 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1287 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1293 * Assert that the first nmissing entries from the array of used
1294 * columns correspond to parity columns and that subsequent entries
1295 * correspond to data columns.
1297 for (i = 0; i < nmissing; i++) {
1298 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1300 for (; i < n; i++) {
1301 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1305 * First initialize the storage where we'll compute the inverse rows.
1307 for (i = 0; i < nmissing; i++) {
1308 for (j = 0; j < n; j++) {
1309 invrows[i][j] = (i == j) ? 1 : 0;
1314 * Subtract all trivial rows from the rows of consequence.
1316 for (i = 0; i < nmissing; i++) {
1317 for (j = nmissing; j < n; j++) {
1318 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1319 jj = used[j] - rm->rm_firstdatacol;
1321 invrows[i][j] = rows[i][jj];
1327 * For each of the rows of interest, we must normalize it and subtract
1328 * a multiple of it from the other rows.
1330 for (i = 0; i < nmissing; i++) {
1331 for (j = 0; j < missing[i]; j++) {
1332 ASSERT0(rows[i][j]);
1334 ASSERT3U(rows[i][missing[i]], !=, 0);
1337 * Compute the inverse of the first element and multiply each
1338 * element in the row by that value.
1340 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1342 for (j = 0; j < n; j++) {
1343 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1344 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1347 for (ii = 0; ii < nmissing; ii++) {
1351 ASSERT3U(rows[ii][missing[i]], !=, 0);
1353 log = vdev_raidz_log2[rows[ii][missing[i]]];
1355 for (j = 0; j < n; j++) {
1357 vdev_raidz_exp2(rows[i][j], log);
1359 vdev_raidz_exp2(invrows[i][j], log);
1365 * Verify that the data left in the rows is properly part of
1366 * an identity matrix.
1368 for (i = 0; i < nmissing; i++) {
1369 for (j = 0; j < n; j++) {
1370 if (j == missing[i]) {
1371 ASSERT3U(rows[i][j], ==, 1);
1373 ASSERT0(rows[i][j]);
1380 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1381 int *missing, uint8_t **invrows, const uint8_t *used)
1386 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1387 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1391 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1395 psize = sizeof (invlog[0][0]) * n * nmissing;
1396 p = kmem_alloc(psize, KM_SLEEP);
1398 for (pp = p, i = 0; i < nmissing; i++) {
1403 for (i = 0; i < nmissing; i++) {
1404 for (j = 0; j < n; j++) {
1405 ASSERT3U(invrows[i][j], !=, 0);
1406 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1410 for (i = 0; i < n; i++) {
1412 ASSERT3U(c, <, rm->rm_cols);
1414 src = abd_to_buf(rm->rm_col[c].rc_abd);
1415 ccount = rm->rm_col[c].rc_size;
1416 for (j = 0; j < nmissing; j++) {
1417 cc = missing[j] + rm->rm_firstdatacol;
1418 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1419 ASSERT3U(cc, <, rm->rm_cols);
1420 ASSERT3U(cc, !=, c);
1422 dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
1423 dcount[j] = rm->rm_col[cc].rc_size;
1426 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1428 for (x = 0; x < ccount; x++, src++) {
1430 log = vdev_raidz_log2[*src];
1432 for (cc = 0; cc < nmissing; cc++) {
1433 if (x >= dcount[cc])
1439 if ((ll = log + invlog[cc][i]) >= 255)
1441 val = vdev_raidz_pow2[ll];
1452 kmem_free(p, psize);
1456 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1460 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1461 int parity_map[VDEV_RAIDZ_MAXPARITY];
1466 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1467 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1470 abd_t **bufs = NULL;
1475 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1476 * temporary linear ABDs.
1478 if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
1479 bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
1481 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1482 raidz_col_t *col = &rm->rm_col[c];
1484 bufs[c] = col->rc_abd;
1485 col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
1486 abd_copy(col->rc_abd, bufs[c], col->rc_size);
1490 n = rm->rm_cols - rm->rm_firstdatacol;
1493 * Figure out which data columns are missing.
1496 for (t = 0; t < ntgts; t++) {
1497 if (tgts[t] >= rm->rm_firstdatacol) {
1498 missing_rows[nmissing_rows++] =
1499 tgts[t] - rm->rm_firstdatacol;
1504 * Figure out which parity columns to use to help generate the missing
1507 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1509 ASSERT(c < rm->rm_firstdatacol);
1512 * Skip any targeted parity columns.
1514 if (c == tgts[tt]) {
1526 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1528 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1529 nmissing_rows * n + sizeof (used[0]) * n;
1530 p = kmem_alloc(psize, KM_SLEEP);
1532 for (pp = p, i = 0; i < nmissing_rows; i++) {
1540 for (i = 0; i < nmissing_rows; i++) {
1541 used[i] = parity_map[i];
1544 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1545 if (tt < nmissing_rows &&
1546 c == missing_rows[tt] + rm->rm_firstdatacol) {
1557 * Initialize the interesting rows of the matrix.
1559 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1562 * Invert the matrix.
1564 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1568 * Reconstruct the missing data using the generated matrix.
1570 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1573 kmem_free(p, psize);
1576 * copy back from temporary linear abds and free them
1579 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1580 raidz_col_t *col = &rm->rm_col[c];
1582 abd_copy(bufs[c], col->rc_abd, col->rc_size);
1583 abd_free(col->rc_abd);
1584 col->rc_abd = bufs[c];
1586 kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
1593 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1595 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1599 int nbadparity, nbaddata;
1600 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1603 * The tgts list must already be sorted.
1605 for (i = 1; i < nt; i++) {
1606 ASSERT(t[i] > t[i - 1]);
1609 nbadparity = rm->rm_firstdatacol;
1610 nbaddata = rm->rm_cols - nbadparity;
1612 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1613 if (c < rm->rm_firstdatacol)
1614 parity_valid[c] = B_FALSE;
1616 if (i < nt && c == t[i]) {
1619 } else if (rm->rm_col[c].rc_error != 0) {
1621 } else if (c >= rm->rm_firstdatacol) {
1624 parity_valid[c] = B_TRUE;
1629 ASSERT(ntgts >= nt);
1630 ASSERT(nbaddata >= 0);
1631 ASSERT(nbaddata + nbadparity == ntgts);
1633 dt = &tgts[nbadparity];
1636 * See if we can use any of our optimized reconstruction routines.
1638 if (!vdev_raidz_default_to_general) {
1641 if (parity_valid[VDEV_RAIDZ_P])
1642 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1644 ASSERT(rm->rm_firstdatacol > 1);
1646 if (parity_valid[VDEV_RAIDZ_Q])
1647 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1649 ASSERT(rm->rm_firstdatacol > 2);
1653 ASSERT(rm->rm_firstdatacol > 1);
1655 if (parity_valid[VDEV_RAIDZ_P] &&
1656 parity_valid[VDEV_RAIDZ_Q])
1657 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1659 ASSERT(rm->rm_firstdatacol > 2);
1665 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1666 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1672 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1673 uint64_t *logical_ashift, uint64_t *physical_ashift)
1676 uint64_t nparity = vd->vdev_nparity;
1681 ASSERT(nparity > 0);
1683 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1684 vd->vdev_children < nparity + 1) {
1685 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1686 return (SET_ERROR(EINVAL));
1689 vdev_open_children(vd);
1691 for (c = 0; c < vd->vdev_children; c++) {
1692 cvd = vd->vdev_child[c];
1694 if (cvd->vdev_open_error != 0) {
1695 lasterror = cvd->vdev_open_error;
1700 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1701 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1702 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1703 *physical_ashift = MAX(*physical_ashift,
1704 cvd->vdev_physical_ashift);
1707 *asize *= vd->vdev_children;
1708 *max_asize *= vd->vdev_children;
1710 if (numerrors > nparity) {
1711 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1719 vdev_raidz_close(vdev_t *vd)
1723 for (c = 0; c < vd->vdev_children; c++)
1724 vdev_close(vd->vdev_child[c]);
1729 * Handle a read or write I/O to a RAID-Z dump device.
1731 * The dump device is in a unique situation compared to other ZFS datasets:
1732 * writing to this device should be as simple and fast as possible. In
1733 * addition, durability matters much less since the dump will be extracted
1734 * once the machine reboots. For that reason, this function eschews parity for
1735 * performance and simplicity. The dump device uses the checksum setting
1736 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1737 * dataset.
1739 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
1740 * 128 KB will not fill an entire block; in addition, they may not be properly
1741 * aligned. In that case, this function uses the preallocated 128 KB block and
1742 * omits reading or writing any "empty" portions of that block, as opposed to
1743 * allocating a fresh appropriately-sized block.
1745 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1747 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1749 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1750 * allocated which spans all five child vdevs. 8 KB of data would be written to
1751 * each of four vdevs, with the fifth containing the parity bits.
1753 * parity data data data data
1754 * | PP | XX | XX | XX | XX |
1757 * 8 KB parity ------8 KB data blocks------
1759 * However, when writing to the dump device, the behavior is different:
1761 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1763 * Unlike the normal RAID-Z case in which the block is allocated based on the
1764 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
1765 * I/O size is less than 128 KB, only the actual portions of data are written.
1766 * In this example the data is written to the third data vdev since that vdev
1767 * contains the offset [64 KB, 96 KB).
1769 * parity data data data data
1775 * As a result, an individual I/O may not span all child vdevs; moreover, a
1776 * small I/O may only operate on a single child vdev.
1778 * Note that since there are no parity bits calculated or written, this format
1779 * remains the same no matter how many parity bits are used in a normal RAID-Z
1780 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
1781 * would look like:
1783 * parity parity parity data data data data
1784 * | | | | | | XX | |
1790 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1791 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1793 vdev_t *tvd = vd->vdev_top;
1799 uint64_t start, end, colstart, colend;
1800 uint64_t coloffset, colsize, colskip;
1802 int flags = doread ? BIO_READ : BIO_WRITE;
1807 * Don't write past the end of the block
1809 VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1815 * Allocate a RAID-Z map for this block. Note that this block starts
1816 * from the "original" offset, that is, the offset of the extent which
1817 * contains the requisite offset of the data being read or written.
1819 * Even if this I/O operation doesn't span the full block size, let's
1820 * treat the on-disk format as if the only blocks are the complete 128
1821 * KB blocks.
1823 abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
1824 SPA_OLD_MAXBLOCKSIZE);
1825 rm = vdev_raidz_map_alloc(abd,
1826 SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
1827 vd->vdev_children, vd->vdev_nparity);
1829 coloffset = origoffset;
1831 for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1832 c++, coloffset += rc->rc_size) {
1833 rc = &rm->rm_col[c];
1834 cvd = vd->vdev_child[rc->rc_devidx];
1837 * Find the start and end of this column in the RAID-Z map,
1838 * keeping in mind that the stated size and offset of the
1839 * operation may not fill the entire column for this vdev.
1841 * If any portion of the data spans this column, issue the
1842 * appropriate operation to the vdev.
1844 if (coloffset + rc->rc_size <= start)
1846 if (coloffset >= end)
1849 colstart = MAX(coloffset, start);
1850 colend = MIN(end, coloffset + rc->rc_size);
1851 colsize = colend - colstart;
1852 colskip = colstart - coloffset;
1854 VERIFY3U(colsize, <=, rc->rc_size);
1855 VERIFY3U(colskip, <=, rc->rc_size);
1858 * Note that the child vdev will have a vdev label at the start
1859 * of its range of offsets, hence the need for
1860 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
1861 * example of why this calculation is needed.
1863 if ((err = vdev_disk_physio(cvd,
1864 ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
1865 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1866 flags, isdump)) != 0)
1870 vdev_raidz_map_free(rm);
1879 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1882 uint64_t ashift = vd->vdev_top->vdev_ashift;
1883 uint64_t cols = vd->vdev_children;
1884 uint64_t nparity = vd->vdev_nparity;
1886 asize = ((psize - 1) >> ashift) + 1;
1887 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1888 asize = roundup(asize, nparity + 1) << ashift;
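	/*
	 * Worked example (ours, for illustration): psize = 32 KB on a
	 * 5-wide raidz1 with ashift = 12 gives 8 data sectors, plus
	 * 1 * ceil(8 / 4) = 2 parity sectors; roundup(10, 2) << 12 is
	 * 40 KB -- the "at least 40 KB" figure cited in the dump-device
	 * comment above.
	 */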
1894 vdev_raidz_child_done(zio_t *zio)
1896 raidz_col_t *rc = zio->io_private;
1898 rc->rc_error = zio->io_error;
1904 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
1907 vdev_t *vd = zio->io_vd;
1908 vdev_t *tvd = vd->vdev_top;
1910 range_seg_t logical_rs, physical_rs;
1911 logical_rs.rs_start = zio->io_offset;
1912 logical_rs.rs_end = logical_rs.rs_start +
1913 vdev_raidz_asize(zio->io_vd, zio->io_size);
1915 raidz_col_t *rc = &rm->rm_col[col];
1916 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
1918 vdev_xlate(cvd, &logical_rs, &physical_rs);
1919 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
1920 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
1922 * It would be nice to assert that rs_end is equal
1923 * to rc_offset + rc_size but there might be an
1924 * optional I/O at the end that is not accounted in
1925 * rc_size.
1927 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
1928 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
1929 rc->rc_size + (1 << tvd->vdev_ashift));
1931 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
1937 * Start an IO operation on a RAIDZ VDev
1940 * - For write operations:
1941 * 1. Generate the parity data
1942 * 2. Create child zio write operations to each column's vdev, for both
1944 * 3. If the column skips any sectors for padding, create optional dummy
1945 * write zio children for those areas to improve aggregation continuity.
1946 * - For read operations:
1947 * 1. Create child zio read operations to each data column's vdev to read
1948 * the range of data required for zio.
1949 * 2. If this is a scrub or resilver operation, or if any of the data
1950 * vdevs have had errors, then create zio read operations to the parity
1951 * columns' VDevs as well.
1954 vdev_raidz_io_start(zio_t *zio)
1956 vdev_t *vd = zio->io_vd;
1957 vdev_t *tvd = vd->vdev_top;
1963 rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
1964 zio->io_type == ZIO_TYPE_FREE,
1965 tvd->vdev_ashift, vd->vdev_children,
1969 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1971 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1973 if (zio->io_type == ZIO_TYPE_FREE) {
1974 for (c = 0; c < rm->rm_cols; c++) {
1975 rc = &rm->rm_col[c];
1976 cvd = vd->vdev_child[rc->rc_devidx];
1977 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1978 rc->rc_offset, rc->rc_abd, rc->rc_size,
1979 zio->io_type, zio->io_priority, 0,
1980 vdev_raidz_child_done, rc));
1987 if (zio->io_type == ZIO_TYPE_WRITE) {
1988 vdev_raidz_generate_parity(rm);
1990 for (c = 0; c < rm->rm_cols; c++) {
1991 rc = &rm->rm_col[c];
1992 cvd = vd->vdev_child[rc->rc_devidx];
1995 * Verify physical to logical translation.
1997 vdev_raidz_io_verify(zio, rm, c);
1999 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2000 rc->rc_offset, rc->rc_abd, rc->rc_size,
2001 zio->io_type, zio->io_priority, 0,
2002 vdev_raidz_child_done, rc));
2006 * Generate optional I/Os for any skipped sectors to improve
2007 * aggregation contiguity.
2009 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
2010 ASSERT(c <= rm->rm_scols);
2011 if (c == rm->rm_scols)
2013 rc = &rm->rm_col[c];
2014 cvd = vd->vdev_child[rc->rc_devidx];
2015 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2016 rc->rc_offset + rc->rc_size, NULL,
2017 1 << tvd->vdev_ashift,
2018 zio->io_type, zio->io_priority,
2019 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2026 ASSERT(zio->io_type == ZIO_TYPE_READ);
2029 * Iterate over the columns in reverse order so that we hit the parity
2030 * last -- any errors along the way will force us to read the parity.
2032 for (c = rm->rm_cols - 1; c >= 0; c--) {
2033 rc = &rm->rm_col[c];
2034 cvd = vd->vdev_child[rc->rc_devidx];
2035 if (!vdev_readable(cvd)) {
2036 if (c >= rm->rm_firstdatacol)
2037 rm->rm_missingdata++;
2039 rm->rm_missingparity++;
2040 rc->rc_error = SET_ERROR(ENXIO);
2041 rc->rc_tried = 1; /* don't even try */
2045 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2046 if (c >= rm->rm_firstdatacol)
2047 rm->rm_missingdata++;
2049 rm->rm_missingparity++;
2050 rc->rc_error = SET_ERROR(ESTALE);
2054 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
2055 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2056 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2057 rc->rc_offset, rc->rc_abd, rc->rc_size,
2058 zio->io_type, zio->io_priority, 0,
2059 vdev_raidz_child_done, rc));
2068 * Report a checksum error for a child of a RAID-Z device.
2071 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
2074 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2076 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2077 zio_bad_cksum_t zbc;
2078 raidz_map_t *rm = zio->io_vsd;
2080 mutex_enter(&vd->vdev_stat_lock);
2081 vd->vdev_stat.vs_checksum_errors++;
2082 mutex_exit(&vd->vdev_stat_lock);
2084 zbc.zbc_has_cksum = 0;
2085 zbc.zbc_injected = rm->rm_ecksuminjected;
2087 buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
2088 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
2089 rc->rc_offset, rc->rc_size, buf, bad_data,
2091 abd_return_buf(rc->rc_abd, buf, rc->rc_size);
2096 * We keep track of whether or not there were any injected errors, so that
2097 * any ereports we generate can note it.
2100 raidz_checksum_verify(zio_t *zio)
2102 zio_bad_cksum_t zbc;
2103 raidz_map_t *rm = zio->io_vsd;
2105 int ret = zio_checksum_error(zio, &zbc);
2106 if (ret != 0 && zbc.zbc_injected != 0)
2107 rm->rm_ecksuminjected = 1;
2113 * Generate the parity from the data columns. If we tried and were able to
2114 * read the parity without error, verify that the generated parity matches the
2115 * data we read. If it doesn't, we fire off a checksum error. Return the
2116 * number of such failures.
2119 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
2121 void *orig[VDEV_RAIDZ_MAXPARITY];
2125 blkptr_t *bp = zio->io_bp;
2126 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2127 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2129 if (checksum == ZIO_CHECKSUM_NOPARITY)
2132 for (c = 0; c < rm->rm_firstdatacol; c++) {
2133 rc = &rm->rm_col[c];
2134 if (!rc->rc_tried || rc->rc_error != 0)
2136 orig[c] = zio_buf_alloc(rc->rc_size);
2137 abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
2140 vdev_raidz_generate_parity(rm);
2142 for (c = 0; c < rm->rm_firstdatacol; c++) {
2143 rc = &rm->rm_col[c];
2144 if (!rc->rc_tried || rc->rc_error != 0)
2146 if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
2147 raidz_checksum_error(zio, rc, orig[c]);
2148 rc->rc_error = SET_ERROR(ECKSUM);
2151 zio_buf_free(orig[c], rc->rc_size);
2158 * Keep statistics on all the ways that we used parity to correct data.
2160 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
2163 vdev_raidz_worst_error(raidz_map_t *rm)
2167 for (int c = 0; c < rm->rm_cols; c++)
2168 error = zio_worst_error(error, rm->rm_col[c].rc_error);
2174 * Iterate over all combinations of bad data and attempt a reconstruction.
2175 * Note that the algorithm below is non-optimal because it doesn't take into
2176 * account how reconstruction is actually performed. For example, with
2177 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2178 * is targeted as invalid as if columns 1 and 4 are targeted since in both
2179 * cases we'd only use parity information in column 0.
2182 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
2184 raidz_map_t *rm = zio->io_vsd;
2186 void *orig[VDEV_RAIDZ_MAXPARITY];
2187 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
2188 int *tgts = &tstore[1];
2189 int current, next, i, c, n;
	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;
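		/*
		 * Together, tgts[-1] == -1 and tgts[n] == rm->rm_cols act as
		 * sentinels, so the odometer-style advance in the do/while
		 * loop below never needs an explicit bounds check at either
		 * end of the target array.
		 */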
		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				abd_copy_to_buf(orig[i], rc->rc_abd,
				    rc->rc_size);
			}
			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}
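			/*
			 * Reaching this point means the reconstruction above
			 * did not produce data with a valid checksum; the
			 * target columns now hold that incorrect attempt and
			 * must be put back before trying the next combination.
			 */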
			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				abd_copy_from_buf(rc->rc_abd, orig[i],
				    rc->rc_size);
			}
			do {
				/*
				 * Find the next valid column after the
				 * current position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);	/* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}
	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	} else if (zio->io_type == ZIO_TYPE_FREE) {
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */
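	/*
	 * Phase 1 is the correctable-errors block immediately below; phases
	 * 2 and 3 follow further down once the cheaper attempts have been
	 * exhausted.
	 */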
	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);
			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}
	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
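		/*
		 * zio_vdev_io_redone() resets the zio so that this io_done
		 * path runs again once the child reads issued below have
		 * completed.
		 */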
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}
	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct. The code is a bitmask of the parity
		 * columns used; all ones means every parity column already
		 * participated.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, provided the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);
		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}
done:
	zio_checksum_verified(zio);
	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
/*
 * Determine if any portion of the provided block resides on a child vdev
 * with a dirty DTL and therefore needs to be resilvered. The function
 * assumes that at least one DTL is dirty which implies that full stripe
 * width blocks must be resilvered.
 */
static boolean_t
vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
{
	uint64_t dcols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = ((psize - 1) >> ashift) + 1;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
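	/*
	 * A block spanning s data sectors plus nparity parity sectors touches
	 * min(s + nparity, dcols) distinct children starting at column f.
	 * For example (illustrative numbers), with dcols = 6, nparity = 2,
	 * and s = 3, the block touches 5 of the 6 children, so the per-child
	 * DTL checks below apply; once s + nparity >= dcols, every child is
	 * touched and we must resilver unconditionally.
	 */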
	if (s + nparity >= dcols)
		return (B_TRUE);

	for (uint64_t c = 0; c < s + nparity; c++) {
		uint64_t devidx = (f + c) % dcols;
		vdev_t *cvd = vd->vdev_child[devidx];

		/*
		 * dsl_scan_need_resilver() already checked vd with
		 * vdev_dtl_contains(). So here just check cvd with
		 * vdev_dtl_empty(), cheaper and a good approximation.
		 */
		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
			return (B_TRUE);
	}

	return (B_FALSE);
}
static void
vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
{
	vdev_t *raidvd = cvd->vdev_parent;
	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);

	uint64_t width = raidvd->vdev_children;
	uint64_t tgt_col = cvd->vdev_id;
	uint64_t ashift = raidvd->vdev_top->vdev_ashift;

	/* make sure the offsets are block-aligned */
	ASSERT0(in->rs_start % (1 << ashift));
	ASSERT0(in->rs_end % (1 << ashift));
	uint64_t b_start = in->rs_start >> ashift;
	uint64_t b_end = in->rs_end >> ashift;

	uint64_t start_row = 0;
	if (b_start > tgt_col) /* avoid underflow */
		start_row = ((b_start - tgt_col - 1) / width) + 1;

	uint64_t end_row = 0;
	if (b_end > tgt_col)
		end_row = ((b_end - tgt_col - 1) / width) + 1;
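	/*
	 * Parent sector (row * width + tgt_col) is the one this child
	 * contributes to row 'row', and it lands at child sector 'row'.
	 * start_row is thus the first row whose sector for tgt_col falls at
	 * or beyond b_start, and end_row the first at or beyond b_end, so
	 * [start_row, end_row) covers this child's share of the input range.
	 */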
	res->rs_start = start_row << ashift;
	res->rs_end = end_row << ashift;

	ASSERT3U(res->rs_start, <=, in->rs_start);
	ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
}
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	vdev_raidz_need_resilver,
	NULL,			/* vdev_op_hold */
	NULL,			/* vdev_op_rele */
	NULL,			/* vdev_op_remap */
	vdev_raidz_xlate,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};