/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

static uint64_t zfs_crc64_table[256];

#define	ECKSUM	EBADE

#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))

static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}
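
/*
 * Illustrative sketch (not part of the build): a one-shot CRC-64 over an
 * arbitrary buffer using the table initialized above.  zfs_crc64() is a
 * hypothetical helper -- this file only ever folds the table into a hash
 * in zap_hash() below -- but it shows the right-shifting (reflected)
 * update step the table is built for.
 */
#if 0
static uint64_t
zfs_crc64(const void *buf, size_t len, uint64_t crc)
{
	const uint8_t *cp = buf;

	while (len-- != 0)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ *cp++) & 0xFF];
	return (crc);
}
#endif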

static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder */
	zio_checksum_t *ci_func[2];
	zio_checksum_tmpl_init_t *ci_tmpl_init;
	zio_checksum_tmpl_free_t *ci_tmpl_free;
	zio_checksum_flags_t ci_flags;
	const char *ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "fletcher.c"
#include "sha256.c"
#include "skein_zfs.c"

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	/* no edonr for now */
	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
};

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t *ci_compress;	/* compression function */
	zio_decompress_func_t *ci_decompress;	/* decompression function */
	int ci_level;				/* level parameter */
	const char *ci_name;			/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"
#include "lz4.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL, NULL, 0, "inherit"},
	{NULL, NULL, 0, "on"},
	{NULL, NULL, 0, "uncompressed"},
	{NULL, lzjb_decompress, 0, "lzjb"},
	{NULL, NULL, 0, "empty"},
	{NULL, NULL, 1, "gzip-1"},
	{NULL, NULL, 2, "gzip-2"},
	{NULL, NULL, 3, "gzip-3"},
	{NULL, NULL, 4, "gzip-4"},
	{NULL, NULL, 5, "gzip-5"},
	{NULL, NULL, 6, "gzip-6"},
	{NULL, NULL, 7, "gzip-7"},
	{NULL, NULL, 8, "gzip-8"},
	{NULL, NULL, 9, "gzip-9"},
	{NULL, zle_decompress, 64, "zle"},
	{NULL, lz4_decompress, 0, "lz4"},
};

static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
{
	zio_checksum_info_t *ci = &zio_checksum_table[checksum];

	if (ci->ci_tmpl_init == NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] != NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] == NULL) {
		spa->spa_cksum_tmpls[checksum] =
		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
	}
}

/*
 * Called by a spa_t that's about to be deallocated. This steps through
 * all of the checksum context templates and deallocates any that were
 * initialized using the algorithm-specific template init function.
 */
static void
zio_checksum_templates_free(spa_t *spa)
{
	for (enum zio_checksum checksum = 0;
	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
		if (spa->spa_cksum_tmpls[checksum] != NULL) {
			zio_checksum_info_t *ci = &zio_checksum_table[checksum];

			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
			spa->spa_cksum_tmpls[checksum] = NULL;
		}
	}
}

static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		zio_checksum_template_init(checksum, __DECONST(spa_t *, spa));
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
		return (EIO);
	}

	return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	ci = &zio_compress_table[cpfunc];
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",
		    ci->ci_name);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator. We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}

static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P	0
#define	VDEV_RAIDZ_Q	1
#define	VDEV_RAIDZ_R	2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
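
/*
 * Illustrative sketch (not part of the build): the packed
 * VDEV_RAIDZ_64MUL_2() above computes the same result as applying the
 * byte-at-a-time VDEV_RAIDZ_MUL_2() to each of the eight byte lanes.
 * vdev_raidz_64mul_check() is a hypothetical self-test, not a routine
 * used by this file.
 */
#if 0
static void
vdev_raidz_64mul_check(uint64_t x)
{
	uint64_t packed = x, mask;
	int i;

	VDEV_RAIDZ_64MUL_2(packed, mask);

	for (i = 0; i < 8; i++) {
		uint8_t b = (x >> (8 * i)) & 0xff;

		ASSERT((uint8_t)(packed >> (8 * i)) ==
		    (uint8_t)VDEV_RAIDZ_MUL_2(b));
	}
}
#endif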

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};

static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
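
/*
 * Illustrative sketch (not part of the build): general multiplication in
 * this field via the log/pow tables, the same identity vdev_raidz_exp2()
 * exploits (log2[a*b] = log2[a] + log2[b], modulo 255).  gf_mul() is a
 * hypothetical helper, not a routine used by this file.
 */
#if 0
static uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	int log;

	if (a == 0 || b == 0)
		return (0);

	log = vdev_raidz_log2[a] + vdev_raidz_log2[b];
	if (log > 255)
		log -= 255;

	return (vdev_raidz_pow2[log]);
}
#endif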

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}
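
/*
 * Illustrative sketch (not part of the build): the byte-level recurrences
 * that the packed 64-bit loops above implement.  P is a plain XOR sum;
 * Q is a Horner-style evaluation with coefficient 2, so the first data
 * column ends up scaled by the highest power of 2.  raidz_pq_bytewise()
 * is hypothetical and operates on one byte per column for clarity.
 */
#if 0
static void
raidz_pq_bytewise(const uint8_t *d, int ncols, uint8_t *pp, uint8_t *qp)
{
	uint8_t p = 0, q = 0;
	int c;

	for (c = 0; c < ncols; c++) {
		p ^= d[c];
		q = (uint8_t)VDEV_RAIDZ_MUL_2(q) ^ d[c];
	}

	*pp = p;
	*qp = q;
}
#endif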

/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, for speedy
 * computation, and for linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16   8   4   2   1 | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16   4   1 |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--'  /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * With the rows for the missing disks (p_1, d_1, and d_2) removed and the
 * unused parity trimmed from the bottom, (V|I)' is square:
 *
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16   4   1 |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * | 19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * | 19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208  |
 * |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100   5  41 159 169 217 208 |
 *                   | 166 100   4  40 158 168 216 209 |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;

			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
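
/*
 * Illustrative note (not part of the build): the rows produced above are
 * Vandermonde-style rows of powers, highest power first, i.e. for parity
 * index m = map[i] (0 for P, 1 for Q, 2 for R):
 *
 *	rows[i][j] == 2^(m * (n - 1 - j))  in GF(2^8)
 *
 * so the P row is all 1s, the Q row is 2^(n-1) .. 4 2 1, and the R row is
 * 4^(n-1) .. 16 4 1, matching the dispersal matrix pictured earlier.
 */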

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data left in the rows forms part of an identity
	 * matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int nbadparity, nbaddata;
	int code;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
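
/*
 * Illustrative worked example (not part of the build), under assumed
 * parameters: a RAIDZ1 vdev with dcols = 4 children, 512-byte sectors
 * (unit_shift = 9), and a 3-sector block (s = 3) at an offset where
 * b % dcols == 0.  Then q = 3 / (4 - 1) = 1 and r = 0, so bc = 0 and
 * tot = 3 + 1 * 1 = 4: one full row of acols = scols = 4 columns.
 * Column 0 holds one sector of parity and columns 1-3 one data sector
 * each, giving asize = 4 sectors with no skip sectors (rm_nskip = 0).
 */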

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size)
{
	return (zio_checksum_verify(spa, bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zfs_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		zfs_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		zfs_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0	/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}