4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
29 static uint64_t zfs_crc64_table[256];
33 #define ASSERT3S(x, y, z) ((void)0)
34 #define ASSERT3U(x, y, z) ((void)0)
35 #define ASSERT3P(x, y, z) ((void)0)
36 #define ASSERT0(x) ((void)0)
37 #define ASSERT(x) ((void)0)
39 #define panic(...) do { \
40 printf(__VA_ARGS__); \
44 #define kmem_alloc(size, flag) zfs_alloc((size))
45 #define kmem_free(ptr, size) zfs_free((ptr), (size))
54 * Calculate the crc64 table (used for the zap hash function).
57 if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
58 memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
59 for (i = 0; i < 256; i++)
60 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
61 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
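/*
 * Editor's sketch (not part of the original file): once the table is
 * built, a CRC64 over any byte buffer follows the usual table-driven
 * recurrence, exactly as zap_hash() does for strings further below.
 * The buffer name and seed here are illustrative only:
 *
 *	uint64_t crc = -1ULL;
 *	for (i = 0; i < len; i++)
 *		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ buf[i]) & 0xFF];
 */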
66 zio_checksum_off(const void *buf, uint64_t size,
67 const void *ctx_template, zio_cksum_t *zcp)
69 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
73 * Signature for checksum functions.
75 typedef void zio_checksum_t(const void *data, uint64_t size,
76 const void *ctx_template, zio_cksum_t *zcp);
77 typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
78 typedef void zio_checksum_tmpl_free_t(void *ctx_template);
80 typedef enum zio_checksum_flags {
81 /* Strong enough for metadata? */
82 ZCHECKSUM_FLAG_METADATA = (1 << 1),
83 /* ZIO embedded checksum */
84 ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
85 /* Strong enough for dedup (without verification)? */
86 ZCHECKSUM_FLAG_DEDUP = (1 << 3),
88 ZCHECKSUM_FLAG_SALTED = (1 << 4),
89 /* Strong enough for nopwrite? */
90 ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
91 } zio_checksum_flags_t;
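/*
 * Editor's note (illustrative, grounded in the table below): entries in
 * zio_checksum_table OR these flags together; e.g. SHA256 is declared
 * safe for metadata, dedup and nopwrite via
 *
 *	ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 *	    ZCHECKSUM_FLAG_NOPWRITE
 */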
94 * Information about each checksum function.
96 typedef struct zio_checksum_info {
97 /* checksum function for each byteorder */
98 zio_checksum_t *ci_func[2];
99 zio_checksum_tmpl_init_t *ci_tmpl_init;
100 zio_checksum_tmpl_free_t *ci_tmpl_free;
101 zio_checksum_flags_t ci_flags;
102 const char *ci_name; /* descriptive name */
103 } zio_checksum_info_t;
107 #include "fletcher.c"
109 #include "skein_zfs.c"
111 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
112 {{NULL, NULL}, NULL, NULL, 0, "inherit"},
113 {{NULL, NULL}, NULL, NULL, 0, "on"},
114 {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
115 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
116 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
117 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
118 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
119 {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
120 ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
121 {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
123 {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
124 ZCHECKSUM_FLAG_METADATA, "fletcher4"},
125 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
126 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
127 ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
128 {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
129 ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
130 {{zio_checksum_off, zio_checksum_off}, NULL, NULL,
132 {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
133 NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
134 ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
135 {{zio_checksum_skein_native, zio_checksum_skein_byteswap},
136 zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
137 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
138 ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
139 /* no edonr for now */
140 {{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
141 ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
145 * Common signature for all zio compress/decompress functions.
147 typedef size_t zio_compress_func_t(void *src, void *dst,
148 size_t s_len, size_t d_len, int);
149 typedef int zio_decompress_func_t(void *src, void *dst,
150 size_t s_len, size_t d_len, int);
153 * Information about each compression function.
155 typedef struct zio_compress_info {
156 zio_compress_func_t *ci_compress; /* compression function */
157 zio_decompress_func_t *ci_decompress; /* decompression function */
158 int ci_level; /* level parameter */
159 const char *ci_name; /* algorithm name */
160 } zio_compress_info_t;
167 * Compression vectors.
169 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
170 {NULL, NULL, 0, "inherit"},
171 {NULL, NULL, 0, "on"},
172 {NULL, NULL, 0, "uncompressed"},
173 {NULL, lzjb_decompress, 0, "lzjb"},
174 {NULL, NULL, 0, "empty"},
175 {NULL, NULL, 1, "gzip-1"},
176 {NULL, NULL, 2, "gzip-2"},
177 {NULL, NULL, 3, "gzip-3"},
178 {NULL, NULL, 4, "gzip-4"},
179 {NULL, NULL, 5, "gzip-5"},
180 {NULL, NULL, 6, "gzip-6"},
181 {NULL, NULL, 7, "gzip-7"},
182 {NULL, NULL, 8, "gzip-8"},
183 {NULL, NULL, 9, "gzip-9"},
184 {NULL, zle_decompress, 64, "zle"},
185 {NULL, lz4_decompress, 0, "lz4"},
189 byteswap_uint64_array(void *vbuf, size_t size)
191 uint64_t *buf = vbuf;
192 size_t count = size >> 3;
195 ASSERT((size & 7) == 0);
197 for (i = 0; i < count; i++)
198 buf[i] = BSWAP_64(buf[i]);
202 * Set the external verifier for a gang block based on <vdev, offset, txg>,
203 * a tuple which is guaranteed to be unique for the life of the pool.
206 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
208 const dva_t *dva = BP_IDENTITY(bp);
209 uint64_t txg = BP_PHYSICAL_BIRTH(bp);
211 ASSERT(BP_IS_GANG(bp));
213 ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
217 * Set the external verifier for a label block based on its offset.
218 * The vdev is implicit, and the txg is unknowable at pool open time --
219 * hence the logic in vdev_uberblock_load() to find the most recent copy.
222 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
224 ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
228 * Calls the template init function of a checksum which supports context
229 * templates and installs the template into the spa_t.
232 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
234 zio_checksum_info_t *ci = &zio_checksum_table[checksum];
236 if (ci->ci_tmpl_init == NULL)
239 if (spa->spa_cksum_tmpls[checksum] != NULL)
242 if (spa->spa_cksum_tmpls[checksum] == NULL) {
243 spa->spa_cksum_tmpls[checksum] =
244 ci->ci_tmpl_init(&spa->spa_cksum_salt);
249 * Called by a spa_t that's about to be deallocated. This steps through
250 * all of the checksum context templates and deallocates any that were
251 * initialized using the algorithm-specific template init function.
254 zio_checksum_templates_free(spa_t *spa)
256 for (enum zio_checksum checksum = 0;
257 checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
258 if (spa->spa_cksum_tmpls[checksum] != NULL) {
259 zio_checksum_info_t *ci = &zio_checksum_table[checksum];
261 ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
262 spa->spa_cksum_tmpls[checksum] = NULL;
268 zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
271 unsigned int checksum;
272 zio_checksum_info_t *ci;
274 zio_cksum_t actual_cksum, expected_cksum, verifier;
277 checksum = BP_GET_CHECKSUM(bp);
278 size = BP_GET_PSIZE(bp);
280 if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
282 ci = &zio_checksum_table[checksum];
283 if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
287 zio_checksum_template_init(checksum, (spa_t *) spa);
288 ctx = spa->spa_cksum_tmpls[checksum];
291 if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
294 ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
295 checksum == ZIO_CHECKSUM_LABEL);
297 eck = (zio_eck_t *)((char *)data + size) - 1;
299 if (checksum == ZIO_CHECKSUM_GANG_HEADER)
300 zio_checksum_gang_verifier(&verifier, bp);
301 else if (checksum == ZIO_CHECKSUM_LABEL)
302 zio_checksum_label_verifier(&verifier,
303 DVA_GET_OFFSET(BP_IDENTITY(bp)));
305 verifier = bp->blk_cksum;
307 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
310 byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
312 expected_cksum = eck->zec_cksum;
313 eck->zec_cksum = verifier;
314 ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
315 eck->zec_cksum = expected_cksum;
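/*
 * Editor's note: the swap above temporarily writes the verifier over
 * the embedded checksum so the buffer hashes exactly as it did when
 * the checksum was first computed, then the original bytes are
 * restored before the comparison below.
 */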
318 byteswap_uint64_array(&expected_cksum,
319 sizeof (zio_cksum_t));
321 expected_cksum = bp->blk_cksum;
322 ci->ci_func[0](data, size, ctx, &actual_cksum);
325 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
326 /*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
334 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
335 void *dest, uint64_t destsize)
337 zio_compress_info_t *ci;
339 if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
340 printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
344 ci = &zio_compress_table[cpfunc];
345 if (!ci->ci_decompress) {
346 printf("ZFS: unsupported compression algorithm %s\n",
351 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
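/*
 * Editor's usage sketch (hypothetical buffers pbuf/lbuf, not in the
 * original file): a caller that has read a physically compressed block
 * picks the algorithm out of the block pointer and inflates it to its
 * logical size:
 *
 *	if (zio_decompress_data(BP_GET_COMPRESS(bp), pbuf,
 *	    BP_GET_PSIZE(bp), lbuf, BP_GET_LSIZE(bp)) != 0)
 *		return (EIO);
 */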
355 zap_hash(uint64_t salt, const char *name)
362 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
363 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
364 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
367 * Only use 28 bits, since we need 4 bits in the cookie for the
368 * collision differentiator. We MUST use the high bits, since
369 * those are the ones that we first pay attention to when
370 * choosing the bucket.
372 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
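/*
 * Editor's sketch of the resulting split (assuming ZAP_HASHBITS is 28,
 * as in the ZAP code, and a hypothetical 64-bit cookie): the mask above
 * keeps only the high 28 bits for the hash, leaving the low bits free
 * for the collision differentiator:
 *
 *	hash bits:  crc >> (64 - ZAP_HASHBITS)
 *	cd bits:    cookie & ((1ULL << (64 - ZAP_HASHBITS)) - 1)
 */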
377 static void *zfs_alloc(size_t size);
378 static void zfs_free(void *ptr, size_t size);
380 typedef struct raidz_col {
381 uint64_t rc_devidx; /* child device index for I/O */
382 uint64_t rc_offset; /* device offset */
383 uint64_t rc_size; /* I/O size */
384 void *rc_data; /* I/O data */
385 int rc_error; /* I/O error for this device */
386 uint8_t rc_tried; /* Did we attempt this I/O column? */
387 uint8_t rc_skipped; /* Did we skip this I/O column? */
390 typedef struct raidz_map {
391 uint64_t rm_cols; /* Regular column count */
392 uint64_t rm_scols; /* Count including skipped columns */
393 uint64_t rm_bigcols; /* Number of oversized columns */
394 uint64_t rm_asize; /* Actual total I/O size */
395 uint64_t rm_missingdata; /* Count of missing data devices */
396 uint64_t rm_missingparity; /* Count of missing parity devices */
397 uint64_t rm_firstdatacol; /* First data column/parity count */
398 uint64_t rm_nskip; /* Skipped sectors for padding */
399 uint64_t rm_skipstart; /* Column index of padding start */
400 uintptr_t rm_reports; /* # of referencing checksum reports */
401 uint8_t rm_freed; /* map no longer has referencing ZIO */
402 uint8_t rm_ecksuminjected; /* checksum error was injected */
403 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
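/*
 * Editor's note: rm_col is an old-style flexible array, so a map for
 * "scols" columns is sized with the offsetof() idiom that
 * vdev_raidz_map_alloc() and vdev_raidz_map_free() use below:
 *
 *	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));
 *	...
 *	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 */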
406 #define VDEV_RAIDZ_P 0
407 #define VDEV_RAIDZ_Q 1
408 #define VDEV_RAIDZ_R 2
410 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
411 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
414 * We provide a mechanism to perform the field multiplication operation on a
415 * 64-bit value all at once rather than a byte at a time. This works by
416 * creating a mask from the top bit in each byte and using that to
417 * conditionally apply the XOR of 0x1d.
419 #define VDEV_RAIDZ_64MUL_2(x, mask) \
421 (mask) = (x) & 0x8080808080808080ULL; \
422 (mask) = ((mask) << 1) - ((mask) >> 7); \
423 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
424 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
427 #define VDEV_RAIDZ_64MUL_4(x, mask) \
429 VDEV_RAIDZ_64MUL_2((x), mask); \
430 VDEV_RAIDZ_64MUL_2((x), mask); \
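/*
 * Editor's worked example (not in the original file): multiplying the
 * byte 0x80 by 2 in GF(2^8) gives
 *
 *	VDEV_RAIDZ_MUL_2(0x80) = ((0x80 << 1) ^ 0x1d) & 0xff = 0x1d
 *
 * and the 64-bit form reproduces that in every lane: starting from
 * x = 0x8080808080808080ULL, the computed mask is all-ones and x
 * becomes 0x1d1d1d1d1d1d1d1dULL -- eight copies of the same result.
 */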
434 * These two tables represent powers and logs of 2 in the Galois field defined
435 * above. These values were computed by repeatedly multiplying by 2 as above.
437 static const uint8_t vdev_raidz_pow2[256] = {
438 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
439 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
440 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
441 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
442 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
443 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
444 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
445 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
446 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
447 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
448 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
449 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
450 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
451 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
452 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
453 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
454 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
455 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
456 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
457 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
458 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
459 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
460 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
461 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
462 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
463 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
464 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
465 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
466 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
467 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
468 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
469 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
471 static const uint8_t vdev_raidz_log2[256] = {
472 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
473 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
474 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
475 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
476 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
477 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
478 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
479 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
480 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
481 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
482 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
483 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
484 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
485 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
486 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
487 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
488 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
489 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
490 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
491 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
492 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
493 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
494 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
495 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
496 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
497 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
498 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
499 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
500 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
501 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
502 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
503 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
507 * Multiply a given number by 2 raised to the given power.
510 vdev_raidz_exp2(uint8_t a, int exp)
516 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
518 exp += vdev_raidz_log2[a];
522 return (vdev_raidz_pow2[exp]);
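/*
 * Editor's sketch (hypothetical helper, not in the original file):
 * vdev_raidz_exp2() already gives general multiplication when its
 * exponent argument is the log of the other factor, which is exactly
 * how the matrix routines below use it:
 *
 *	static uint8_t
 *	gf_mul(uint8_t a, uint8_t b)
 *	{
 *		if (a == 0 || b == 0)
 *			return (0);
 *		return (vdev_raidz_exp2(a, vdev_raidz_log2[b]));
 *	}
 */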
526 vdev_raidz_generate_parity_p(raidz_map_t *rm)
528 uint64_t *p, *src, pcount, ccount, i;
531 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
533 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
534 src = rm->rm_col[c].rc_data;
535 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
536 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
538 if (c == rm->rm_firstdatacol) {
539 ASSERT(ccount == pcount);
540 for (i = 0; i < ccount; i++, src++, p++) {
544 ASSERT(ccount <= pcount);
545 for (i = 0; i < ccount; i++, src++, p++) {
553 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
555 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
558 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
559 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
560 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
562 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
563 src = rm->rm_col[c].rc_data;
564 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
565 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
567 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
569 if (c == rm->rm_firstdatacol) {
570 ASSERT(ccnt == pcnt || ccnt == 0);
571 for (i = 0; i < ccnt; i++, src++, p++, q++) {
575 for (; i < pcnt; i++, src++, p++, q++) {
580 ASSERT(ccnt <= pcnt);
583 * Apply the algorithm described above by multiplying
584 * the previous result and adding in the new value.
586 for (i = 0; i < ccnt; i++, src++, p++, q++) {
589 VDEV_RAIDZ_64MUL_2(*q, mask);
594 * Treat short columns as though they are full of 0s.
595 * Note that there's therefore nothing needed for P.
597 for (; i < pcnt; i++, q++) {
598 VDEV_RAIDZ_64MUL_2(*q, mask);
605 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
607 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
610 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
611 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
612 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
613 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
614 rm->rm_col[VDEV_RAIDZ_R].rc_size);
616 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
617 src = rm->rm_col[c].rc_data;
618 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
619 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
620 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
622 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
624 if (c == rm->rm_firstdatacol) {
625 ASSERT(ccnt == pcnt || ccnt == 0);
626 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
631 for (; i < pcnt; i++, src++, p++, q++, r++) {
637 ASSERT(ccnt <= pcnt);
640 * Apply the algorithm described above by multiplying
641 * the previous result and adding in the new value.
643 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
646 VDEV_RAIDZ_64MUL_2(*q, mask);
649 VDEV_RAIDZ_64MUL_4(*r, mask);
654 * Treat short columns as though they are full of 0s.
655 * Note that there's therefore nothing needed for P.
657 for (; i < pcnt; i++, q++, r++) {
658 VDEV_RAIDZ_64MUL_2(*q, mask);
659 VDEV_RAIDZ_64MUL_4(*r, mask);
666 * Generate RAID parity in the first virtual columns according to the number of
667 * parity columns available.
670 vdev_raidz_generate_parity(raidz_map_t *rm)
672 switch (rm->rm_firstdatacol) {
674 vdev_raidz_generate_parity_p(rm);
677 vdev_raidz_generate_parity_pq(rm);
680 vdev_raidz_generate_parity_pqr(rm);
683 panic("invalid RAID-Z configuration");
689 * In the general case of reconstruction, we must solve the system of linear
690 * equations defined by the coefficients used to generate parity as well as
691 * the contents of the data and parity disks. This can be expressed with
692 * vectors for the original data (D) and the actual data (d) and parity (p)
693 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
697 *     |  V  |        |  D_0  |   | p_m-1 |
698 *     |     |   x    |   :   | = |  d_0  |
699 *     |  I  |        | D_n-1 |   |   :   |
700 *     |     |        ~~     ~~   | d_n-1 |
703 * I is simply a square identity matrix of size n, and V is a Vandermonde
704 * matrix defined by the coefficients we chose for the various parity columns
705 * (1, 2, 4). Note that these values were chosen for simplicity, for speedy
706 * computation, and for linear separability.
709 *     |   1   ..  1 1 1 |               |  p_0  |
710 *     | 2^n-1 ..  4 2 1 |   __     __   |   :   |
711 *     | 4^n-1 .. 16 4 1 |  |  D_0  |    | p_m-1 |
712 *     |   1   ..  0 0 0 |  |  D_1  |    |  d_0  |
713 *     |   0   ..  0 0 0 | x|  D_2  | =  |  d_1  |
714 *     |   :       : : : |  |   :   |    |  d_2  |
715 *     |   0   ..  1 0 0 |  | D_n-1 |    |   :   |
716 *     |   0   ..  0 1 0 |  ~~     ~~    |   :   |
717 *     |   0   ..  0 0 1 |               | d_n-1 |
720 * Note that I, V, d, and p are known. To compute D, we must invert the
721 * matrix and use the known data and parity values to reconstruct the unknown
722 * data values. We begin by removing the rows in V|I and d|p that correspond
723 * to failed or missing columns; we then make V|I square (n x n) and d|p
724 * sized n by removing rows corresponding to unused parity from the bottom up
725 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
726 * using Gauss-Jordan elimination. In the example below we use m=3 parity
727 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
729 * | 1 1 1 1 1 1 1 1 |
730 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
731 * | 19 205 116 29 64 16 4 1 | / /
732 * | 1 0 0 0 0 0 0 0 | / /
733 * | 0 1 0 0 0 0 0 0 | <--' /
734 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
735 * | 0 0 0 1 0 0 0 0 |
736 * | 0 0 0 0 1 0 0 0 |
737 * | 0 0 0 0 0 1 0 0 |
738 * | 0 0 0 0 0 0 1 0 |
739 * | 0 0 0 0 0 0 0 1 |
742 * | 1 1 1 1 1 1 1 1 |
743 * | 128 64 32 16 8 4 2 1 |
744 * | 19 205 116 29 64 16 4 1 |
745 * | 1 0 0 0 0 0 0 0 |
746 * | 0 1 0 0 0 0 0 0 |
747 * (V|I)' = | 0 0 1 0 0 0 0 0 |
748 * | 0 0 0 1 0 0 0 0 |
749 * | 0 0 0 0 1 0 0 0 |
750 * | 0 0 0 0 0 1 0 0 |
751 * | 0 0 0 0 0 0 1 0 |
752 * | 0 0 0 0 0 0 0 1 |
755 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
756 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
757 * matrix is not singular.
759 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
760 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
761 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
762 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
763 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
764 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
765 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
766 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
769 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
770 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
771 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
772 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
773 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
774 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
775 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
776 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
779 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
780 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
781 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
782 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
783 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
784 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
785 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
786 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
789 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
790 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
791 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
792 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
793 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
794 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
795 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
796 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
799 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
800 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
801 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
802 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
803 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
804 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
805 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
806 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
809 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
810 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
811 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
812 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
813 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
814 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
815 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
816 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
819 * | 0 0 1 0 0 0 0 0 |
820 * | 167 100 5 41 159 169 217 208 |
821 * | 166 100 4 40 158 168 216 209 |
822 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
823 * | 0 0 0 0 1 0 0 0 |
824 * | 0 0 0 0 0 1 0 0 |
825 * | 0 0 0 0 0 0 1 0 |
826 * | 0 0 0 0 0 0 0 1 |
829 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
830 * of the missing data.
832 * As is apparent from the example above, the only non-trivial rows in the
833 * inverse matrix correspond to the data disks that we're trying to
834 * reconstruct. Indeed, those are the only rows we need as the others would
835 * only be useful for reconstructing data known or assumed to be valid. For
836 * that reason, we only build the coefficients in the rows that correspond to targeted columns.
842 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
848 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
851 * Fill in the missing rows of interest.
853 for (i = 0; i < nmap; i++) {
854 ASSERT3S(0, <=, map[i]);
855 ASSERT3S(map[i], <=, 2);
862 for (j = 0; j < n; j++) {
866 rows[i][j] = vdev_raidz_pow2[pow];
872 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
873 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
879 * Assert that the first nmissing entries from the array of used
880 * columns correspond to parity columns and that subsequent entries
881 * correspond to data columns.
883 for (i = 0; i < nmissing; i++) {
884 ASSERT3S(used[i], <, rm->rm_firstdatacol);
887 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
891 * First initialize the storage where we'll compute the inverse rows.
893 for (i = 0; i < nmissing; i++) {
894 for (j = 0; j < n; j++) {
895 invrows[i][j] = (i == j) ? 1 : 0;
900 * Subtract all trivial rows from the rows of consequence.
902 for (i = 0; i < nmissing; i++) {
903 for (j = nmissing; j < n; j++) {
904 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
905 jj = used[j] - rm->rm_firstdatacol;
907 invrows[i][j] = rows[i][jj];
913 * For each row of interest, we must normalize it and subtract
914 * a multiple of it from the other rows.
916 for (i = 0; i < nmissing; i++) {
917 for (j = 0; j < missing[i]; j++) {
918 ASSERT3U(rows[i][j], ==, 0);
920 ASSERT3U(rows[i][missing[i]], !=, 0);
923 * Compute the inverse of the first element and multiply each
924 * element in the row by that value.
926 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
928 for (j = 0; j < n; j++) {
929 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
930 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
933 for (ii = 0; ii < nmissing; ii++) {
937 ASSERT3U(rows[ii][missing[i]], !=, 0);
939 log = vdev_raidz_log2[rows[ii][missing[i]]];
941 for (j = 0; j < n; j++) {
943 vdev_raidz_exp2(rows[i][j], log);
945 vdev_raidz_exp2(invrows[i][j], log);
951 * Verify that the data that is left in the rows is properly part of
952 * an identity matrix.
954 for (i = 0; i < nmissing; i++) {
955 for (j = 0; j < n; j++) {
956 if (j == missing[i]) {
957 ASSERT3U(rows[i][j], ==, 1);
959 ASSERT3U(rows[i][j], ==, 0);
966 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
967 int *missing, uint8_t **invrows, const uint8_t *used)
972 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
973 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
976 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
981 psize = sizeof (invlog[0][0]) * n * nmissing;
982 p = zfs_alloc(psize);
984 for (pp = p, i = 0; i < nmissing; i++) {
989 for (i = 0; i < nmissing; i++) {
990 for (j = 0; j < n; j++) {
991 ASSERT3U(invrows[i][j], !=, 0);
992 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
996 for (i = 0; i < n; i++) {
998 ASSERT3U(c, <, rm->rm_cols);
1000 src = rm->rm_col[c].rc_data;
1001 ccount = rm->rm_col[c].rc_size;
1002 for (j = 0; j < nmissing; j++) {
1003 cc = missing[j] + rm->rm_firstdatacol;
1004 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1005 ASSERT3U(cc, <, rm->rm_cols);
1006 ASSERT3U(cc, !=, c);
1008 dst[j] = rm->rm_col[cc].rc_data;
1009 dcount[j] = rm->rm_col[cc].rc_size;
1012 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1014 for (x = 0; x < ccount; x++, src++) {
1016 log = vdev_raidz_log2[*src];
1018 for (cc = 0; cc < nmissing; cc++) {
1019 if (x >= dcount[cc])
1025 if ((ll = log + invlog[cc][i]) >= 255)
1027 val = vdev_raidz_pow2[ll];
1042 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1046 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1047 int parity_map[VDEV_RAIDZ_MAXPARITY];
1052 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1053 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1059 n = rm->rm_cols - rm->rm_firstdatacol;
1062 * Figure out which data columns are missing.
1065 for (t = 0; t < ntgts; t++) {
1066 if (tgts[t] >= rm->rm_firstdatacol) {
1067 missing_rows[nmissing_rows++] =
1068 tgts[t] - rm->rm_firstdatacol;
1073 * Figure out which parity columns to use to help generate the missing data columns.
1076 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1078 ASSERT(c < rm->rm_firstdatacol);
1081 * Skip any targeted parity columns.
1083 if (c == tgts[tt]) {
1095 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1097 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1098 nmissing_rows * n + sizeof (used[0]) * n;
1099 p = kmem_alloc(psize, KM_SLEEP);
1101 for (pp = p, i = 0; i < nmissing_rows; i++) {
1109 for (i = 0; i < nmissing_rows; i++) {
1110 used[i] = parity_map[i];
1113 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1114 if (tt < nmissing_rows &&
1115 c == missing_rows[tt] + rm->rm_firstdatacol) {
1126 * Initialize the interesting rows of the matrix.
1128 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1131 * Invert the matrix.
1133 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1137 * Reconstruct the missing data using the generated matrix.
1139 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1142 kmem_free(p, psize);
1148 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1150 int tgts[VDEV_RAIDZ_MAXPARITY];
1154 int nbadparity, nbaddata;
1157 * The tgts list must already be sorted.
1159 for (i = 1; i < nt; i++) {
1160 ASSERT(t[i] > t[i - 1]);
1163 nbadparity = rm->rm_firstdatacol;
1164 nbaddata = rm->rm_cols - nbadparity;
1166 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1167 if (i < nt && c == t[i]) {
1170 } else if (rm->rm_col[c].rc_error != 0) {
1172 } else if (c >= rm->rm_firstdatacol) {
1179 ASSERT(ntgts >= nt);
1180 ASSERT(nbaddata >= 0);
1181 ASSERT(nbaddata + nbadparity == ntgts);
1183 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1184 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1189 static raidz_map_t *
1190 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1191 uint64_t dcols, uint64_t nparity)
1194 uint64_t b = offset >> unit_shift;
1195 uint64_t s = size >> unit_shift;
1196 uint64_t f = b % dcols;
1197 uint64_t o = (b / dcols) << unit_shift;
1198 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1200 q = s / (dcols - nparity);
1201 r = s - q * (dcols - nparity);
1202 bc = (r == 0 ? 0 : r + nparity);
1203 tot = s + nparity * (q + (r == 0 ? 0 : 1));
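/*
 * Editor's worked example (values assumed for illustration): a
 * single-parity map over dcols = 5 children for s = 6 sectors gives
 * q = 6 / 4 = 1 full rows, r = 2 leftover sectors, bc = r + nparity
 * = 3 oversized columns, and tot = 6 + 1 * (1 + 1) = 8 sectors of
 * physical I/O in all.
 */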
1207 scols = MIN(dcols, roundup(bc, nparity + 1));
1213 ASSERT3U(acols, <=, scols);
1215 rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));
1217 rm->rm_cols = acols;
1218 rm->rm_scols = scols;
1219 rm->rm_bigcols = bc;
1220 rm->rm_skipstart = bc;
1221 rm->rm_missingdata = 0;
1222 rm->rm_missingparity = 0;
1223 rm->rm_firstdatacol = nparity;
1226 rm->rm_ecksuminjected = 0;
1230 for (c = 0; c < scols; c++) {
1235 coff += 1ULL << unit_shift;
1237 rm->rm_col[c].rc_devidx = col;
1238 rm->rm_col[c].rc_offset = coff;
1239 rm->rm_col[c].rc_data = NULL;
1240 rm->rm_col[c].rc_error = 0;
1241 rm->rm_col[c].rc_tried = 0;
1242 rm->rm_col[c].rc_skipped = 0;
1245 rm->rm_col[c].rc_size = 0;
1247 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1249 rm->rm_col[c].rc_size = q << unit_shift;
1251 asize += rm->rm_col[c].rc_size;
1254 ASSERT3U(asize, ==, tot << unit_shift);
1255 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1256 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1257 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1258 ASSERT3U(rm->rm_nskip, <=, nparity);
1260 for (c = 0; c < rm->rm_firstdatacol; c++)
1261 rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);
1263 rm->rm_col[c].rc_data = data;
1265 for (c = c + 1; c < acols; c++)
1266 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1267 rm->rm_col[c - 1].rc_size;
1270 * If all data stored spans all columns, there's a danger that parity
1271 * will always be on the same device and, since parity isn't read
1272 * during normal operation, that that device's I/O bandwidth won't be
1273 * used effectively. We therefore switch the parity every 1MB.
1275 * ... at least that was, ostensibly, the theory. As a practical
1276 * matter unless we juggle the parity between all devices evenly, we
1277 * won't see any benefit. Further, occasional writes that aren't a
1278 * multiple of the LCM of the number of children and the minimum
1279 * stripe width are sufficient to avoid pessimal behavior.
1280 * Unfortunately, this decision created an implicit on-disk format
1281 * requirement that we need to support for all eternity, but only
1282 * for single-parity RAID-Z.
1284 * If we intend to skip a sector in the zeroth column for padding
1285 * we must make sure to note this swap. We will never intend to
1286 * skip the first column since at least one data and one parity
1287 * column must appear in each row.
1289 ASSERT(rm->rm_cols >= 2);
1290 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1292 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1293 devidx = rm->rm_col[0].rc_devidx;
1294 o = rm->rm_col[0].rc_offset;
1295 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1296 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1297 rm->rm_col[1].rc_devidx = devidx;
1298 rm->rm_col[1].rc_offset = o;
1300 if (rm->rm_skipstart == 0)
1301 rm->rm_skipstart = 1;
1308 vdev_raidz_map_free(raidz_map_t *rm)
1312 for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1313 zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1315 zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
1319 vdev_child(vdev_t *pvd, uint64_t devidx)
1323 STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1324 if (cvd->v_id == devidx)
1332 * We keep track of whether or not there were any injected errors, so that
1333 * any ereports we generate can note it.
1336 raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
1339 return (zio_checksum_verify(spa, bp, data));
1343 * Generate the parity from the data columns. If we tried and were able to
1344 * read the parity without error, verify that the generated parity matches the
1345 * data we read. If it doesn't, we fire off a checksum error. Return the
1346 * number of such failures.
1349 raidz_parity_verify(raidz_map_t *rm)
1351 void *orig[VDEV_RAIDZ_MAXPARITY];
1355 for (c = 0; c < rm->rm_firstdatacol; c++) {
1356 rc = &rm->rm_col[c];
1357 if (!rc->rc_tried || rc->rc_error != 0)
1359 orig[c] = zfs_alloc(rc->rc_size);
1360 bcopy(rc->rc_data, orig[c], rc->rc_size);
1363 vdev_raidz_generate_parity(rm);
1365 for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1366 rc = &rm->rm_col[c];
1367 if (!rc->rc_tried || rc->rc_error != 0)
1369 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1370 rc->rc_error = ECKSUM;
1373 zfs_free(orig[c], rc->rc_size);
1380 * Iterate over all combinations of bad data and attempt a reconstruction.
1381 * Note that the algorithm below is non-optimal because it doesn't take into
1382 * account how reconstruction is actually performed. For example, with
1383 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1384 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1385 * cases we'd only use parity information in column 0.
1388 vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
1389 void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
1392 void *orig[VDEV_RAIDZ_MAXPARITY];
1393 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1394 int *tgts = &tstore[1];
1395 int current, next, i, c, n;
1398 ASSERT(total_errors < rm->rm_firstdatacol);
1401 * This simplifies one edge condition.
1405 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1407 * Initialize the targets array by finding the first n columns
1408 * that contain no error.
1410 * If there were no data errors, we need to ensure that we're
1411 * always explicitly attempting to reconstruct at least one
1412 * data column. To do this, we simply push the highest target
1413 * up into the data columns.
1415 for (c = 0, i = 0; i < n; i++) {
1416 if (i == n - 1 && data_errors == 0 &&
1417 c < rm->rm_firstdatacol) {
1418 c = rm->rm_firstdatacol;
1421 while (rm->rm_col[c].rc_error != 0) {
1423 ASSERT3S(c, <, rm->rm_cols);
1430 * Setting tgts[n] simplifies the other edge condition.
1432 tgts[n] = rm->rm_cols;
1435 * These buffers were allocated in previous iterations.
1437 for (i = 0; i < n - 1; i++) {
1438 ASSERT(orig[i] != NULL);
1441 orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);
1444 next = tgts[current];
1446 while (current != n) {
1447 tgts[current] = next;
1451 * Save off the original data that we're going to
1452 * attempt to reconstruct.
1454 for (i = 0; i < n; i++) {
1455 ASSERT(orig[i] != NULL);
1458 ASSERT3S(c, <, rm->rm_cols);
1459 rc = &rm->rm_col[c];
1460 bcopy(rc->rc_data, orig[i], rc->rc_size);
1464 * Attempt a reconstruction and exit the outer loop on success.
1467 code = vdev_raidz_reconstruct(rm, tgts, n);
1468 if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
1469 for (i = 0; i < n; i++) {
1471 rc = &rm->rm_col[c];
1472 ASSERT(rc->rc_error == 0);
1473 rc->rc_error = ECKSUM;
1481 * Restore the original data.
1483 for (i = 0; i < n; i++) {
1485 rc = &rm->rm_col[c];
1486 bcopy(orig[i], rc->rc_data, rc->rc_size);
1491 * Find the next valid column after the current position.
1494 for (next = tgts[current] + 1;
1495 next < rm->rm_cols &&
1496 rm->rm_col[next].rc_error != 0; next++)
1499 ASSERT(next <= tgts[current + 1]);
1502 * If that spot is available, we're done here.
1504 if (next != tgts[current + 1])
1508 * Otherwise, find the next valid column after
1509 * the previous position.
1511 for (c = tgts[current - 1] + 1;
1512 rm->rm_col[c].rc_error != 0; c++)
1518 } while (current != n);
1523 for (i = n - 1; i >= 0; i--) {
1524 zfs_free(orig[i], rm->rm_col[0].rc_size);
1531 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1532 off_t offset, size_t bytes)
1534 vdev_t *tvd = vd->v_top;
1539 int unexpected_errors;
1545 int tgts[VDEV_RAIDZ_MAXPARITY];
1548 rc = NULL; /* gcc */
1551 rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1552 vd->v_nchildren, vd->v_nparity);
1555 * Iterate over the columns in reverse order so that we hit the parity
1556 * last -- any errors along the way will force us to read the parity.
1558 for (c = rm->rm_cols - 1; c >= 0; c--) {
1559 rc = &rm->rm_col[c];
1560 cvd = vdev_child(vd, rc->rc_devidx);
1561 if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1562 if (c >= rm->rm_firstdatacol)
1563 rm->rm_missingdata++;
1565 rm->rm_missingparity++;
1566 rc->rc_error = ENXIO;
1567 rc->rc_tried = 1; /* don't even try */
1571 #if 0 /* XXX: Too hard for the boot code. */
1572 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1573 if (c >= rm->rm_firstdatacol)
1574 rm->rm_missingdata++;
1576 rm->rm_missingparity++;
1577 rc->rc_error = ESTALE;
1582 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1583 rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1584 rc->rc_offset, rc->rc_size);
1591 unexpected_errors = 0;
1597 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1598 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1600 for (c = 0; c < rm->rm_cols; c++) {
1601 rc = &rm->rm_col[c];
1604 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1606 if (c < rm->rm_firstdatacol)
1611 if (!rc->rc_skipped)
1612 unexpected_errors++;
1615 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1621 * There are three potential phases for a read:
1622 * 1. produce valid data from the columns read
1623 * 2. read all disks and try again
1624 * 3. perform combinatorial reconstruction
1626 * Each phase is progressively both more expensive and less likely to
1627 * occur. If we encounter more errors than we can repair or all phases
1628 * fail, we have no choice but to return an error.
1632 * If the number of errors we saw was correctable -- less than or equal
1633 * to the number of parity disks read -- attempt to produce data that
1634 * has a valid checksum. Naturally, this case applies in the absence of any errors.
1637 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1638 if (data_errors == 0) {
1639 if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
1641 * If we read parity information (unnecessarily
1642 * as it happens since no reconstruction was
1643 * needed) regenerate and verify the parity.
1644 * We also regenerate parity when resilvering
1645 * so we can write it out to the failed device later.
1648 if (parity_errors + parity_untried <
1649 rm->rm_firstdatacol) {
1650 n = raidz_parity_verify(rm);
1651 unexpected_errors += n;
1652 ASSERT(parity_errors + n <=
1653 rm->rm_firstdatacol);
1659 * We either attempt to read all the parity columns or
1660 * none of them. If we didn't try to read parity, we
1661 * wouldn't be here in the correctable case. There must
1662 * also have been fewer parity errors than parity
1663 * columns or, again, we wouldn't be in this code path.
1665 ASSERT(parity_untried == 0);
1666 ASSERT(parity_errors < rm->rm_firstdatacol);
1669 * Identify the data columns that reported an error.
1672 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1673 rc = &rm->rm_col[c];
1674 if (rc->rc_error != 0) {
1675 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1680 ASSERT(rm->rm_firstdatacol >= n);
1682 code = vdev_raidz_reconstruct(rm, tgts, n);
1684 if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
1686 * If we read more parity disks than were used
1687 * for reconstruction, confirm that the other
1688 * parity disks produced correct data. This
1689 * routine is suboptimal in that it regenerates
1690 * the parity that we already used in addition
1691 * to the parity that we're attempting to
1692 * verify, but this should be a relatively
1693 * uncommon case, and can be optimized if it
1694 * becomes a problem. Note that we regenerate
1695 * parity when resilvering so we can write it
1696 * out to failed devices later.
1698 if (parity_errors < rm->rm_firstdatacol - n) {
1699 n = raidz_parity_verify(rm);
1700 unexpected_errors += n;
1701 ASSERT(parity_errors + n <=
1702 rm->rm_firstdatacol);
1711 * This isn't a typical situation -- either we got a read
1712 * error or a child silently returned bad data. Read every
1713 * block so we can try again with as much data and parity as
1714 * we can track down. If we've already been through once
1715 * before, all children will be marked as tried so we'll
1716 * proceed to combinatorial reconstruction.
1718 unexpected_errors = 1;
1719 rm->rm_missingdata = 0;
1720 rm->rm_missingparity = 0;
1723 for (c = 0; c < rm->rm_cols; c++) {
1724 rc = &rm->rm_col[c];
1729 cvd = vdev_child(vd, rc->rc_devidx);
1730 ASSERT(cvd != NULL);
1731 rc->rc_error = cvd->v_read(cvd, NULL,
1732 rc->rc_data, rc->rc_offset, rc->rc_size);
1733 if (rc->rc_error == 0)
1739 * If we managed to read anything more, retry the
1746 * At this point we've attempted to reconstruct the data given the
1747 * errors we detected, and we've attempted to read all columns. There
1748 * must, therefore, be one or more additional problems -- silent errors
1749 * resulting in invalid data rather than explicit I/O errors resulting
1750 * in absent data. We check if there is enough additional data to
1751 * possibly reconstruct the data and then perform combinatorial
1752 * reconstruction over all possible combinations. If that fails, we're cooked.
1755 if (total_errors > rm->rm_firstdatacol) {
1757 } else if (total_errors < rm->rm_firstdatacol &&
1758 (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes,
1759 total_errors, data_errors)) != 0) {
1761 * If we didn't use all the available parity for the
1762 * combinatorial reconstruction, verify that the remaining
1763 * parity is correct.
1765 if (code != (1 << rm->rm_firstdatacol) - 1)
1766 (void) raidz_parity_verify(rm);
1769 * We're here because either:
1771 * total_errors == rm_firstdatacol, or
1772 * vdev_raidz_combrec() failed
1774 * In either case, there is enough bad data to prevent reconstruction.
1777 * Start checksum ereports for all children which haven't
1778 * failed and for which the I/O wasn't speculative.
1784 vdev_raidz_map_free(rm);