sys/cddl/boot/zfs/zfssubr.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 #include <sys/cdefs.h>
  27 __FBSDID("$FreeBSD$");
  28
  29 #include <lz4.h>
  30
/*
 * Lookup table for the CRC-64 used by zap_hash().  It is filled in by
 * zfs_init_crc(), which treats "entry 128 equals ZFS_CRC64_POLY" as the
 * marker that the table has already been computed.
 */
static uint64_t zfs_crc64_table[256];
  32
/*
 * Assertions are compiled out in this file: every ASSERT* variant expands
 * to a no-op, so the asserted expressions are never evaluated.
 */
#define ASSERT3S(x, y, z)       ((void)0)
#define ASSERT3U(x, y, z)       ((void)0)
#define ASSERT3P(x, y, z)       ((void)0)
#define ASSERT0(x)              ((void)0)
#define ASSERT(x)               ((void)0)

/*
 * Print the formatted message, then spin forever; control never returns
 * to the code that invoked the macro.
 */
#define panic(...)      do {                                            \
        printf(__VA_ARGS__);                                            \
        for (;;) ;                                                      \
} while (0)
  43
  44 static void
  45 zfs_init_crc(void)
  46 {
  47         int i, j;
  48         uint64_t *ct;
  49
  50         /*
  51          * Calculate the crc64 table (used for the zap hash
  52          * function).
  53          */
  54         if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
  55                 memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
  56                 for (i = 0; i < 256; i++)
  57                         for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
  58                                 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
  59         }
  60 }
  61
  62 static void
  63 zio_checksum_off(const void *buf, uint64_t size,
  64     const void *ctx_template, zio_cksum_t *zcp)
  65 {
  66         ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
  67 }
  68
/*
 * Signature for checksum functions: checksum `size` bytes starting at
 * `data`, optionally using a context template, and store the result in
 * *zcp.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
/* Builds a context template from a checksum salt and returns it. */
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
/* Releases a context template previously returned by the init function. */
typedef void zio_checksum_tmpl_free_t(void *ctx_template);
  76
  77 typedef enum zio_checksum_flags {
  78         /* Strong enough for metadata? */
  79         ZCHECKSUM_FLAG_METADATA = (1 << 1),
  80         /* ZIO embedded checksum */
  81         ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
  82         /* Strong enough for dedup (without verification)? */
  83         ZCHECKSUM_FLAG_DEDUP = (1 << 3),
  84         /* Uses salt value */
  85         ZCHECKSUM_FLAG_SALTED = (1 << 4),
  86         /* Strong enough for nopwrite? */
  87         ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
  88 } zio_checksum_flags_t;
  89
/*
 * Information about each checksum function; zio_checksum_table holds one
 * of these per checksum id.
 */
typedef struct zio_checksum_info {
        /*
         * checksum function for each byteorder: zio_checksum_verify()
         * indexes this with its 0/1 byteswap flag
         */
        zio_checksum_t                  *ci_func[2];
        /* optional context-template constructor; NULL when unused */
        zio_checksum_tmpl_init_t        *ci_tmpl_init;
        /* releases templates created through ci_tmpl_init */
        zio_checksum_tmpl_free_t        *ci_tmpl_free;
        /* ZCHECKSUM_FLAG_* bits describing the algorithm */
        zio_checksum_flags_t            ci_flags;
        const char                      *ci_name;       /* descriptive name */
} zio_checksum_info_t;
 101
 102 #include "blkptr.c"
 103
 104 #include "fletcher.c"
 105 #include "sha256.c"
 106 #include "skein_zfs.c"
 107
/*
 * Prototype for the zstd decompressor that backs the "zstd" entry of
 * zio_compress_table; its shape matches zio_decompress_func_t.
 */
extern int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n);
 110
 111
 112 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
 113         {{NULL, NULL}, NULL, NULL, 0, "inherit"},
 114         {{NULL, NULL}, NULL, NULL, 0, "on"},
 115         {{zio_checksum_off,     zio_checksum_off}, NULL, NULL, 0, "off"},
 116         {{zio_checksum_SHA256,  zio_checksum_SHA256}, NULL, NULL,
 117             ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
 118         {{zio_checksum_SHA256,  zio_checksum_SHA256}, NULL, NULL,
 119             ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
 120         {{fletcher_2_native,    fletcher_2_byteswap}, NULL, NULL,
 121             ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
 122         {{fletcher_2_native,    fletcher_2_byteswap}, NULL, NULL,
 123             0, "fletcher2"},
 124         {{fletcher_4_native,    fletcher_4_byteswap}, NULL, NULL,
 125             ZCHECKSUM_FLAG_METADATA, "fletcher4"},
 126         {{zio_checksum_SHA256,  zio_checksum_SHA256}, NULL, NULL,
 127             ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 128             ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
 129         {{fletcher_4_native,    fletcher_4_byteswap}, NULL, NULL,
 130             ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
 131         {{zio_checksum_off,     zio_checksum_off}, NULL, NULL,
 132             0, "noparity"},
 133         {{zio_checksum_SHA512_native,   zio_checksum_SHA512_byteswap},
 134             NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 135             ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
 136         {{zio_checksum_skein_native, zio_checksum_skein_byteswap},
 137             zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
 138             ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 139             ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
 140         /* no edonr for now */
 141         {{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
 142             ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
 143 };
 144
/*
 * Common signature for all zio compress/decompress functions: a source
 * buffer and length, a destination buffer and length, and a trailing int
 * (zio_decompress_data() passes the table entry's ci_level there).
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
 152
/*
 * Information about each compression function; zio_compress_table holds
 * one of these per compression id.  A NULL ci_decompress makes
 * zio_decompress_data() report the algorithm as unsupported.
 */
typedef struct zio_compress_info {
        zio_compress_func_t     *ci_compress;   /* compression function */
        zio_decompress_func_t   *ci_decompress; /* decompression function */
        int                     ci_level;       /* level parameter */
        const char              *ci_name;       /* algorithm name */
} zio_compress_info_t;
 162
 163 #include "lzjb.c"
 164 #include "zle.c"
 165
 166 /*
 167  * Compression vectors.
 168  */
 169 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
 170         {NULL,                  NULL,                   0,      "inherit"},
 171         {NULL,                  NULL,                   0,      "on"},
 172         {NULL,                  NULL,                   0,      "uncompressed"},
 173         {NULL,                  lzjb_decompress,        0,      "lzjb"},
 174         {NULL,                  NULL,                   0,      "empty"},
 175         {NULL,                  NULL,                   1,      "gzip-1"},
 176         {NULL,                  NULL,                   2,      "gzip-2"},
 177         {NULL,                  NULL,                   3,      "gzip-3"},
 178         {NULL,                  NULL,                   4,      "gzip-4"},
 179         {NULL,                  NULL,                   5,      "gzip-5"},
 180         {NULL,                  NULL,                   6,      "gzip-6"},
 181         {NULL,                  NULL,                   7,      "gzip-7"},
 182         {NULL,                  NULL,                   8,      "gzip-8"},
 183         {NULL,                  NULL,                   9,      "gzip-9"},
 184         {NULL,                  zle_decompress,         64,     "zle"},
 185         {NULL,                  lz4_decompress,         0,      "lz4"},
 186         {NULL,                  zfs_zstd_decompress, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
 187 };
 188
 189 static void
 190 byteswap_uint64_array(void *vbuf, size_t size)
 191 {
 192         uint64_t *buf = vbuf;
 193         size_t count = size >> 3;
 194         int i;
 195
 196         ASSERT((size & 7) == 0);
 197
 198         for (i = 0; i < count; i++)
 199                 buf[i] = BSWAP_64(buf[i]);
 200 }
 201
 202 /*
 203  * Set the external verifier for a gang block based on <vdev, offset, txg>,
 204  * a tuple which is guaranteed to be unique for the life of the pool.
 205  */
 206 static void
 207 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
 208 {
 209         const dva_t *dva = BP_IDENTITY(bp);
 210         uint64_t txg = BP_PHYSICAL_BIRTH(bp);
 211
 212         ASSERT(BP_IS_GANG(bp));
 213
 214         ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
 215 }
 216
/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 *
 * The offset is passed as the first checksum value; the other three are
 * zero.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
        ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}
 227
 228 /*
 229  * Calls the template init function of a checksum which supports context
 230  * templates and installs the template into the spa_t.
 231  */
 232 static void
 233 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
 234 {
 235         zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 236
 237         if (ci->ci_tmpl_init == NULL)
 238                 return;
 239
 240         if (spa->spa_cksum_tmpls[checksum] != NULL)
 241                 return;
 242
 243         if (spa->spa_cksum_tmpls[checksum] == NULL) {
 244                 spa->spa_cksum_tmpls[checksum] =
 245                     ci->ci_tmpl_init(&spa->spa_cksum_salt);
 246         }
 247 }
 248
 249 /*
 250  * Called by a spa_t that's about to be deallocated. This steps through
 251  * all of the checksum context templates and deallocates any that were
 252  * initialized using the algorithm-specific template init function.
 253  */
 254 static void __unused
 255 zio_checksum_templates_free(spa_t *spa)
 256 {
 257         for (enum zio_checksum checksum = 0;
 258             checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
 259                 if (spa->spa_cksum_tmpls[checksum] != NULL) {
 260                         zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 261
 262                         ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
 263                         spa->spa_cksum_tmpls[checksum] = NULL;
 264                 }
 265         }
 266 }
 267
 268 static int
 269 zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
 270 {
 271         uint64_t size;
 272         unsigned int checksum;
 273         zio_checksum_info_t *ci;
 274         void *ctx = NULL;
 275         zio_cksum_t actual_cksum, expected_cksum, verifier;
 276         int byteswap;
 277
 278         checksum = BP_GET_CHECKSUM(bp);
 279         size = BP_GET_PSIZE(bp);
 280
 281         if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
 282                 return (EINVAL);
 283         ci = &zio_checksum_table[checksum];
 284         if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
 285                 return (EINVAL);
 286
 287         if (spa != NULL) {
 288                 zio_checksum_template_init(checksum, __DECONST(spa_t *,spa));
 289                 ctx = spa->spa_cksum_tmpls[checksum];
 290         }
 291
 292         if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 293                 zio_eck_t *eck;
 294
 295                 ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
 296                     checksum == ZIO_CHECKSUM_LABEL);
 297
 298                 eck = (zio_eck_t *)((char *)data + size) - 1;
 299
 300                 if (checksum == ZIO_CHECKSUM_GANG_HEADER)
 301                         zio_checksum_gang_verifier(&verifier, bp);
 302                 else if (checksum == ZIO_CHECKSUM_LABEL)
 303                         zio_checksum_label_verifier(&verifier,
 304                             DVA_GET_OFFSET(BP_IDENTITY(bp)));
 305                 else
 306                         verifier = bp->blk_cksum;
 307
 308                 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
 309
 310                 if (byteswap)
 311                         byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
 312
 313                 expected_cksum = eck->zec_cksum;
 314                 eck->zec_cksum = verifier;
 315                 ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
 316                 eck->zec_cksum = expected_cksum;
 317
 318                 if (byteswap)
 319                         byteswap_uint64_array(&expected_cksum,
 320                             sizeof (zio_cksum_t));
 321         } else {
 322                 byteswap = BP_SHOULD_BYTESWAP(bp);
 323                 expected_cksum = bp->blk_cksum;
 324                 ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
 325         }
 326
 327         if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
 328                 /*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
 329                 return (EIO);
 330         }
 331
 332         return (0);
 333 }
 334
 335 static int
 336 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
 337         void *dest, uint64_t destsize)
 338 {
 339         zio_compress_info_t *ci;
 340
 341         if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
 342                 printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
 343                 return (EIO);
 344         }
 345
 346         ci = &zio_compress_table[cpfunc];
 347         if (!ci->ci_decompress) {
 348                 printf("ZFS: unsupported compression algorithm %s\n",
 349                     ci->ci_name);
 350                 return (EIO);
 351         }
 352
 353         return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
 354 }
 355
 356 static uint64_t
 357 zap_hash(uint64_t salt, const char *name)
 358 {
 359         const uint8_t *cp;
 360         uint8_t c;
 361         uint64_t crc = salt;
 362
 363         ASSERT(crc != 0);
 364         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 365         for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
 366                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
 367
 368         /*
 369          * Only use 28 bits, since we need 4 bits in the cookie for the
 370          * collision differentiator.  We MUST use the high bits, since
 371          * those are the onces that we first pay attention to when
 372          * chosing the bucket.
 373          */
 374         crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
 375
 376         return (crc);
 377 }
 378
/*
 * One column of a RAID-Z map (an element of raidz_map_t.rm_col).  The
 * parity generators read rc_data as an array of 64-bit words and rc_size
 * as its length in bytes.
 */
typedef struct raidz_col {
        uint64_t rc_devidx;             /* child device index for I/O */
        uint64_t rc_offset;             /* device offset */
        uint64_t rc_size;               /* I/O size */
        void *rc_data;                  /* I/O data */
        int rc_error;                   /* I/O error for this device */
        uint8_t rc_tried;               /* Did we attempt this I/O column? */
        uint8_t rc_skipped;             /* Did we skip this I/O column? */
} raidz_col_t;
 388
/*
 * Layout of one RAID-Z I/O.  Columns [0, rm_firstdatacol) of rm_col hold
 * parity and columns [rm_firstdatacol, rm_cols) hold data, which is how the
 * parity generators in this file walk the map.
 */
typedef struct raidz_map {
        uint64_t rm_cols;               /* Regular column count */
        uint64_t rm_scols;              /* Count including skipped columns */
        uint64_t rm_bigcols;            /* Number of oversized columns */
        uint64_t rm_asize;              /* Actual total I/O size */
        uint64_t rm_missingdata;        /* Count of missing data devices */
        uint64_t rm_missingparity;      /* Count of missing parity devices */
        uint64_t rm_firstdatacol;       /* First data column/parity count */
        uint64_t rm_nskip;              /* Skipped sectors for padding */
        uint64_t rm_skipstart;          /* Column index of padding start */
        uintptr_t rm_reports;           /* # of referencing checksum reports */
        uint8_t rm_freed;               /* map no longer has referencing ZIO */
        uint8_t rm_ecksuminjected;      /* checksum error was injected */
        /*
         * Declared with a single element but indexed up to rm_cols.
         * NOTE(review): presumably the map is allocated with room for
         * rm_scols entries -- confirm at the allocation site.
         */
        raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
} raidz_map_t;
 404
 405 #define VDEV_RAIDZ_P            0
 406 #define VDEV_RAIDZ_Q            1
 407 #define VDEV_RAIDZ_R            2
 408
 409 #define VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
 410 #define VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 411
 412 /*
 413  * We provide a mechanism to perform the field multiplication operation on a
 414  * 64-bit value all at once rather than a byte at a time. This works by
 415  * creating a mask from the top bit in each byte and using that to
 416  * conditionally apply the XOR of 0x1d.
 417  */
 418 #define VDEV_RAIDZ_64MUL_2(x, mask) \
 419 { \
 420         (mask) = (x) & 0x8080808080808080ULL; \
 421         (mask) = ((mask) << 1) - ((mask) >> 7); \
 422         (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 423             ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
 424 }
 425
 426 #define VDEV_RAIDZ_64MUL_4(x, mask) \
 427 { \
 428         VDEV_RAIDZ_64MUL_2((x), mask); \
 429         VDEV_RAIDZ_64MUL_2((x), mask); \
 430 }
 431
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 *
 * vdev_raidz_pow2[i] is 2 raised to the power i.  The sequence repeats
 * after 255 steps, so the final entry (index 255) equals the first (0x01).
 */
static const uint8_t vdev_raidz_pow2[256] = {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
        0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
        0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
        0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
        0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
        0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
        0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
        0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
        0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
        0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
        0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
        0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
        0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
        0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
        0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
        0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
        0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
        0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
        0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
        0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
        0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
        0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
        0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
        0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
        0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
        0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
        0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
        0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
        0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
        0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
        0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * vdev_raidz_log2[a] is the exponent i for which 2 raised to the power i
 * equals a.  Zero has no logarithm; its slot simply holds 0x00, and
 * vdev_raidz_exp2() returns early for a == 0 without consulting it.
 */
static const uint8_t vdev_raidz_log2[256] = {
        0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
        0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
        0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
        0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
        0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
        0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
        0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
        0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
        0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
        0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
        0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
        0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
        0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
        0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
        0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
        0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
        0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
        0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
        0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
        0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
        0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
        0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
        0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
        0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
        0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
        0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
        0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
        0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
        0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
        0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
        0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
        0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
 504
 505 /*
 506  * Multiply a given number by 2 raised to the given power.
 507  */
 508 static uint8_t
 509 vdev_raidz_exp2(uint8_t a, int exp)
 510 {
 511         if (a == 0)
 512                 return (0);
 513
 514         ASSERT(exp >= 0);
 515         ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
 516
 517         exp += vdev_raidz_log2[a];
 518         if (exp > 255)
 519                 exp -= 255;
 520
 521         return (vdev_raidz_pow2[exp]);
 522 }
 523
 524 static void
 525 vdev_raidz_generate_parity_p(raidz_map_t *rm)
 526 {
 527         uint64_t *p, *src, pcount, ccount, i;
 528         int c;
 529
 530         pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 531
 532         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 533                 src = rm->rm_col[c].rc_data;
 534                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 535                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
 536
 537                 if (c == rm->rm_firstdatacol) {
 538                         ASSERT(ccount == pcount);
 539                         for (i = 0; i < ccount; i++, src++, p++) {
 540                                 *p = *src;
 541                         }
 542                 } else {
 543                         ASSERT(ccount <= pcount);
 544                         for (i = 0; i < ccount; i++, src++, p++) {
 545                                 *p ^= *src;
 546                         }
 547                 }
 548         }
 549 }
 550
 551 static void
 552 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 553 {
 554         uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
 555         int c;
 556
 557         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 558         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 559             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 560
 561         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 562                 src = rm->rm_col[c].rc_data;
 563                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 564                 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 565
 566                 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
 567
 568                 if (c == rm->rm_firstdatacol) {
 569                         ASSERT(ccnt == pcnt || ccnt == 0);
 570                         for (i = 0; i < ccnt; i++, src++, p++, q++) {
 571                                 *p = *src;
 572                                 *q = *src;
 573                         }
 574                         for (; i < pcnt; i++, src++, p++, q++) {
 575                                 *p = 0;
 576                                 *q = 0;
 577                         }
 578                 } else {
 579                         ASSERT(ccnt <= pcnt);
 580
 581                         /*
 582                          * Apply the algorithm described above by multiplying
 583                          * the previous result and adding in the new value.
 584                          */
 585                         for (i = 0; i < ccnt; i++, src++, p++, q++) {
 586                                 *p ^= *src;
 587
 588                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 589                                 *q ^= *src;
 590                         }
 591
 592                         /*
 593                          * Treat short columns as though they are full of 0s.
 594                          * Note that there's therefore nothing needed for P.
 595                          */
 596                         for (; i < pcnt; i++, q++) {
 597                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 598                         }
 599                 }
 600         }
 601 }
 602
 603 static void
 604 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
 605 {
 606         uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
 607         int c;
 608
 609         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 610         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 611             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 612         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 613             rm->rm_col[VDEV_RAIDZ_R].rc_size);
 614
 615         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 616                 src = rm->rm_col[c].rc_data;
 617                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 618                 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 619                 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
 620
 621                 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
 622
 623                 if (c == rm->rm_firstdatacol) {
 624                         ASSERT(ccnt == pcnt || ccnt == 0);
 625                         for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
 626                                 *p = *src;
 627                                 *q = *src;
 628                                 *r = *src;
 629                         }
 630                         for (; i < pcnt; i++, src++, p++, q++, r++) {
 631                                 *p = 0;
 632                                 *q = 0;
 633                                 *r = 0;
 634                         }
 635                 } else {
 636                         ASSERT(ccnt <= pcnt);
 637
 638                         /*
 639                          * Apply the algorithm described above by multiplying
 640                          * the previous result and adding in the new value.
 641                          */
 642                         for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
 643                                 *p ^= *src;
 644
 645                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 646                                 *q ^= *src;
 647
 648                                 VDEV_RAIDZ_64MUL_4(*r, mask);
 649                                 *r ^= *src;
 650                         }
 651
 652                         /*
 653                          * Treat short columns as though they are full of 0s.
 654                          * Note that there's therefore nothing needed for P.
 655                          */
 656                         for (; i < pcnt; i++, q++, r++) {
 657                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 658                                 VDEV_RAIDZ_64MUL_4(*r, mask);
 659                         }
 660                 }
 661         }
 662 }
 663
 664 /*
 665  * Generate RAID parity in the first virtual columns according to the number of
 666  * parity columns available.
 667  */
 668 static void
 669 vdev_raidz_generate_parity(raidz_map_t *rm)
 670 {
 671         switch (rm->rm_firstdatacol) {
 672         case 1:
 673                 vdev_raidz_generate_parity_p(rm);
 674                 break;
 675         case 2:
 676                 vdev_raidz_generate_parity_pq(rm);
 677                 break;
 678         case 3:
 679                 vdev_raidz_generate_parity_pqr(rm);
 680                 break;
 681         default:
 682                 panic("invalid RAID-Z configuration");
 683         }
 684 }
 685
 686 /* BEGIN CSTYLED */
 687 /*
 688  * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 690  * the contents of the data and parity disks. This can be expressed with
 691  * vectors for the original data (D) and the actual data (d) and parity (p)
 692  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 693  *
 694  *            __   __                     __     __
 695  *            |     |         __     __   |  p_0  |
 696  *            |  V  |         |  D_0  |   | p_m-1 |
 697  *            |     |    x    |   :   | = |  d_0  |
 698  *            |  I  |         | D_n-1 |   |   :   |
 699  *            |     |         ~~     ~~   | d_n-1 |
 700  *            ~~   ~~                     ~~     ~~
 701  *
 702  * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 704  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 705  * computation as well as linear separability.
 706  *
 707  *      __               __               __     __
 708  *      |   1   ..  1 1 1 |               |  p_0  |
 709  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 710  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 711  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 712  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 713  *      |   :       : : : |   |   :   |   |  d_2  |
 714  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 715  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 716  *      |   0   ..  0 0 1 |               | d_n-1 |
 717  *      ~~               ~~               ~~     ~~
 718  *
 719  * Note that I, V, d, and p are known. To compute D, we must invert the
 720  * matrix and use the known data and parity values to reconstruct the unknown
 721  * data values. We begin by removing the rows in V|I and d|p that correspond
 722  * to failed or missing columns; we then make V|I square (n x n) and d|p
 723  * sized n by removing rows corresponding to unused parity from the bottom up
 724  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 725  * using Gauss-Jordan elimination. In the example below we use m=3 parity
 726  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 727  *           __                               __
 728  *           |  1   1   1   1   1   1   1   1  |
 729  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 730  *           |  19 205 116  29  64  16  4   1  |      / /
 731  *           |  1   0   0   0   0   0   0   0  |     / /
 732  *           |  0   1   0   0   0   0   0   0  | <--' /
 733  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 734  *           |  0   0   0   1   0   0   0   0  |
 735  *           |  0   0   0   0   1   0   0   0  |
 736  *           |  0   0   0   0   0   1   0   0  |
 737  *           |  0   0   0   0   0   0   1   0  |
 738  *           |  0   0   0   0   0   0   0   1  |
 739  *           ~~                               ~~
 740  *           __                               __
 741  *           |  1   1   1   1   1   1   1   1  |
 742  *           | 128  64  32  16  8   4   2   1  |
 743  *           |  19 205 116  29  64  16  4   1  |
 744  *           |  1   0   0   0   0   0   0   0  |
 745  *           |  0   1   0   0   0   0   0   0  |
 746  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
 747  *           |  0   0   0   1   0   0   0   0  |
 748  *           |  0   0   0   0   1   0   0   0  |
 749  *           |  0   0   0   0   0   1   0   0  |
 750  *           |  0   0   0   0   0   0   1   0  |
 751  *           |  0   0   0   0   0   0   0   1  |
 752  *           ~~                               ~~
 753  *
 754  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 755  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 756  * matrix is not singular.
 757  * __                                                                 __
 758  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 759  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 760  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 761  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 762  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 763  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 764  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 765  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 766  * ~~                                                                 ~~
 767  * __                                                                 __
 768  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 769  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 770  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 771  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 772  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 773  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 774  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 775  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 776  * ~~                                                                 ~~
 777  * __                                                                 __
 778  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 779  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 780  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 781  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 782  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 783  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 784  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 785  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 786  * ~~                                                                 ~~
 787  * __                                                                 __
 788  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 789  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 790  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 791  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 792  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 793  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 794  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 795  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 796  * ~~                                                                 ~~
 797  * __                                                                 __
 798  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 799  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 800  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 801  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 802  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 803  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 804  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 805  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 806  * ~~                                                                 ~~
 807  * __                                                                 __
 808  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 809  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
 810  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 811  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 812  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 813  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 814  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 815  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 816  * ~~                                                                 ~~
 817  *                   __                               __
 818  *                   |  0   0   1   0   0   0   0   0  |
 819  *                   | 167 100  5   41 159 169 217 208 |
 820  *                   | 166 100  4   40 158 168 216 209 |
 821  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 822  *                   |  0   0   0   0   1   0   0   0  |
 823  *                   |  0   0   0   0   0   1   0   0  |
 824  *                   |  0   0   0   0   0   0   1   0  |
 825  *                   |  0   0   0   0   0   0   0   1  |
 826  *                   ~~                               ~~
 827  *
 828  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 829  * of the missing data.
 830  *
 831  * As is apparent from the example above, the only non-trivial rows in the
 832  * inverse matrix correspond to the data disks that we're trying to
 833  * reconstruct. Indeed, those are the only rows we need as the others would
 834  * only be useful for reconstructing data known or assumed to be valid. For
 835  * that reason, we only build the coefficients in the rows that correspond to
 836  * targeted columns.
 837  */
 838 /* END CSTYLED */
 839
 840 static void
 841 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
 842     uint8_t **rows)
 843 {
 844         int i, j;
 845         int pow;
 846
 847         ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
 848
 849         /*
 850          * Fill in the missing rows of interest.
 851          */
 852         for (i = 0; i < nmap; i++) {
 853                 ASSERT3S(0, <=, map[i]);
 854                 ASSERT3S(map[i], <=, 2);
 855
 856                 pow = map[i] * n;
 857                 if (pow > 255)
 858                         pow -= 255;
 859                 ASSERT(pow <= 255);
 860
 861                 for (j = 0; j < n; j++) {
 862                         pow -= map[i];
 863                         if (pow < 0)
 864                                 pow += 255;
 865                         rows[i][j] = vdev_raidz_pow2[pow];
 866                 }
 867         }
 868 }
 869
 870 static void
 871 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
 872     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
 873 {
 874         int i, j, ii, jj;
 875         uint8_t log;
 876
 877         /*
 878          * Assert that the first nmissing entries from the array of used
 879          * columns correspond to parity columns and that subsequent entries
 880          * correspond to data columns.
 881          */
 882         for (i = 0; i < nmissing; i++) {
 883                 ASSERT3S(used[i], <, rm->rm_firstdatacol);
 884         }
 885         for (; i < n; i++) {
 886                 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
 887         }
 888
 889         /*
 890          * First initialize the storage where we'll compute the inverse rows.
 891          */
 892         for (i = 0; i < nmissing; i++) {
 893                 for (j = 0; j < n; j++) {
 894                         invrows[i][j] = (i == j) ? 1 : 0;
 895                 }
 896         }
 897
 898         /*
 899          * Subtract all trivial rows from the rows of consequence.
 900          */
 901         for (i = 0; i < nmissing; i++) {
 902                 for (j = nmissing; j < n; j++) {
 903                         ASSERT3U(used[j], >=, rm->rm_firstdatacol);
 904                         jj = used[j] - rm->rm_firstdatacol;
 905                         ASSERT3S(jj, <, n);
 906                         invrows[i][j] = rows[i][jj];
 907                         rows[i][jj] = 0;
 908                 }
 909         }
 910
 911         /*
 912          * For each of the rows of interest, we must normalize it and subtract
 913          * a multiple of it from the other rows.
 914          */
 915         for (i = 0; i < nmissing; i++) {
 916                 for (j = 0; j < missing[i]; j++) {
 917                         ASSERT3U(rows[i][j], ==, 0);
 918                 }
 919                 ASSERT3U(rows[i][missing[i]], !=, 0);
 920
 921                 /*
 922                  * Compute the inverse of the first element and multiply each
 923                  * element in the row by that value.
 924                  */
 925                 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
 926
 927                 for (j = 0; j < n; j++) {
 928                         rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
 929                         invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
 930                 }
 931
 932                 for (ii = 0; ii < nmissing; ii++) {
 933                         if (i == ii)
 934                                 continue;
 935
 936                         ASSERT3U(rows[ii][missing[i]], !=, 0);
 937
 938                         log = vdev_raidz_log2[rows[ii][missing[i]]];
 939
 940                         for (j = 0; j < n; j++) {
 941                                 rows[ii][j] ^=
 942                                     vdev_raidz_exp2(rows[i][j], log);
 943                                 invrows[ii][j] ^=
 944                                     vdev_raidz_exp2(invrows[i][j], log);
 945                         }
 946                 }
 947         }
 948
 949         /*
 950          * Verify that the data that is left in the rows are properly part of
 951          * an identity matrix.
 952          */
 953         for (i = 0; i < nmissing; i++) {
 954                 for (j = 0; j < n; j++) {
 955                         if (j == missing[i]) {
 956                                 ASSERT3U(rows[i][j], ==, 1);
 957                         } else {
 958                                 ASSERT3U(rows[i][j], ==, 0);
 959                         }
 960                 }
 961         }
 962 }
 963
 964 static void
 965 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
 966     int *missing, uint8_t **invrows, const uint8_t *used)
 967 {
 968         int i, j, x, cc, c;
 969         uint8_t *src;
 970         uint64_t ccount;
 971         uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
 972         uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
 973         uint8_t log, val;
 974         int ll;
 975         uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
 976         uint8_t *p, *pp;
 977         size_t psize;
 978
 979         log = 0;        /* gcc */
 980         psize = sizeof (invlog[0][0]) * n * nmissing;
 981         p = malloc(psize);
 982         if (p == NULL) {
 983                 printf("Out of memory\n");
 984                 return;
 985         }
 986
 987         for (pp = p, i = 0; i < nmissing; i++) {
 988                 invlog[i] = pp;
 989                 pp += n;
 990         }
 991
 992         for (i = 0; i < nmissing; i++) {
 993                 for (j = 0; j < n; j++) {
 994                         ASSERT3U(invrows[i][j], !=, 0);
 995                         invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
 996                 }
 997         }
 998
 999         for (i = 0; i < n; i++) {
1000                 c = used[i];
1001                 ASSERT3U(c, <, rm->rm_cols);
1002
1003                 src = rm->rm_col[c].rc_data;
1004                 ccount = rm->rm_col[c].rc_size;
1005                 for (j = 0; j < nmissing; j++) {
1006                         cc = missing[j] + rm->rm_firstdatacol;
1007                         ASSERT3U(cc, >=, rm->rm_firstdatacol);
1008                         ASSERT3U(cc, <, rm->rm_cols);
1009                         ASSERT3U(cc, !=, c);
1010
1011                         dst[j] = rm->rm_col[cc].rc_data;
1012                         dcount[j] = rm->rm_col[cc].rc_size;
1013                 }
1014
1015                 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1016
1017                 for (x = 0; x < ccount; x++, src++) {
1018                         if (*src != 0)
1019                                 log = vdev_raidz_log2[*src];
1020
1021                         for (cc = 0; cc < nmissing; cc++) {
1022                                 if (x >= dcount[cc])
1023                                         continue;
1024
1025                                 if (*src == 0) {
1026                                         val = 0;
1027                                 } else {
1028                                         if ((ll = log + invlog[cc][i]) >= 255)
1029                                                 ll -= 255;
1030                                         val = vdev_raidz_pow2[ll];
1031                                 }
1032
1033                                 if (i == 0)
1034                                         dst[cc][x] = val;
1035                                 else
1036                                         dst[cc][x] ^= val;
1037                         }
1038                 }
1039         }
1040
1041         free(p);
1042 }
1043
1044 static int
1045 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1046 {
1047         int n, i, c, t, tt;
1048         int nmissing_rows;
1049         int missing_rows[VDEV_RAIDZ_MAXPARITY];
1050         int parity_map[VDEV_RAIDZ_MAXPARITY];
1051
1052         uint8_t *p, *pp;
1053         size_t psize;
1054
1055         uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1056         uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1057         uint8_t *used;
1058
1059         int code = 0;
1060
1061
1062         n = rm->rm_cols - rm->rm_firstdatacol;
1063
1064         /*
1065          * Figure out which data columns are missing.
1066          */
1067         nmissing_rows = 0;
1068         for (t = 0; t < ntgts; t++) {
1069                 if (tgts[t] >= rm->rm_firstdatacol) {
1070                         missing_rows[nmissing_rows++] =
1071                             tgts[t] - rm->rm_firstdatacol;
1072                 }
1073         }
1074
1075         /*
1076          * Figure out which parity columns to use to help generate the missing
1077          * data columns.
1078          */
1079         for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1080                 ASSERT(tt < ntgts);
1081                 ASSERT(c < rm->rm_firstdatacol);
1082
1083                 /*
1084                  * Skip any targeted parity columns.
1085                  */
1086                 if (c == tgts[tt]) {
1087                         tt++;
1088                         continue;
1089                 }
1090
1091                 code |= 1 << c;
1092
1093                 parity_map[i] = c;
1094                 i++;
1095         }
1096
1097         ASSERT(code != 0);
1098         ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1099
1100         psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1101             nmissing_rows * n + sizeof (used[0]) * n;
1102         p = malloc(psize);
1103         if (p == NULL) {
1104                 printf("Out of memory\n");
1105                 return (code);
1106         }
1107
1108         for (pp = p, i = 0; i < nmissing_rows; i++) {
1109                 rows[i] = pp;
1110                 pp += n;
1111                 invrows[i] = pp;
1112                 pp += n;
1113         }
1114         used = pp;
1115
1116         for (i = 0; i < nmissing_rows; i++) {
1117                 used[i] = parity_map[i];
1118         }
1119
1120         for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1121                 if (tt < nmissing_rows &&
1122                     c == missing_rows[tt] + rm->rm_firstdatacol) {
1123                         tt++;
1124                         continue;
1125                 }
1126
1127                 ASSERT3S(i, <, n);
1128                 used[i] = c;
1129                 i++;
1130         }
1131
1132         /*
1133          * Initialize the interesting rows of the matrix.
1134          */
1135         vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1136
1137         /*
1138          * Invert the matrix.
1139          */
1140         vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1141             invrows, used);
1142
1143         /*
1144          * Reconstruct the missing data using the generated matrix.
1145          */
1146         vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1147             invrows, used);
1148
1149         free(p);
1150
1151         return (code);
1152 }
1153
1154 static int
1155 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1156 {
1157         int tgts[VDEV_RAIDZ_MAXPARITY];
1158         int ntgts;
1159         int i, c;
1160         int code;
1161         int nbadparity, nbaddata;
1162
1163         /*
1164          * The tgts list must already be sorted.
1165          */
1166         for (i = 1; i < nt; i++) {
1167                 ASSERT(t[i] > t[i - 1]);
1168         }
1169
1170         nbadparity = rm->rm_firstdatacol;
1171         nbaddata = rm->rm_cols - nbadparity;
1172         ntgts = 0;
1173         for (i = 0, c = 0; c < rm->rm_cols; c++) {
1174                 if (i < nt && c == t[i]) {
1175                         tgts[ntgts++] = c;
1176                         i++;
1177                 } else if (rm->rm_col[c].rc_error != 0) {
1178                         tgts[ntgts++] = c;
1179                 } else if (c >= rm->rm_firstdatacol) {
1180                         nbaddata--;
1181                 } else {
1182                         nbadparity--;
1183                 }
1184         }
1185
1186         ASSERT(ntgts >= nt);
1187         ASSERT(nbaddata >= 0);
1188         ASSERT(nbaddata + nbadparity == ntgts);
1189
1190         code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1191         ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1192         ASSERT(code > 0);
1193         return (code);
1194 }
1195
1196 static raidz_map_t *
1197 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1198     uint64_t dcols, uint64_t nparity)
1199 {
1200         raidz_map_t *rm;
1201         uint64_t b = offset >> unit_shift;
1202         uint64_t s = size >> unit_shift;
1203         uint64_t f = b % dcols;
1204         uint64_t o = (b / dcols) << unit_shift;
1205         uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1206
1207         q = s / (dcols - nparity);
1208         r = s - q * (dcols - nparity);
1209         bc = (r == 0 ? 0 : r + nparity);
1210         tot = s + nparity * (q + (r == 0 ? 0 : 1));
1211
1212         if (q == 0) {
1213                 acols = bc;
1214                 scols = MIN(dcols, roundup(bc, nparity + 1));
1215         } else {
1216                 acols = dcols;
1217                 scols = dcols;
1218         }
1219
1220         ASSERT3U(acols, <=, scols);
1221
1222         rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
1223         if (rm == NULL)
1224                 return (rm);
1225
1226         rm->rm_cols = acols;
1227         rm->rm_scols = scols;
1228         rm->rm_bigcols = bc;
1229         rm->rm_skipstart = bc;
1230         rm->rm_missingdata = 0;
1231         rm->rm_missingparity = 0;
1232         rm->rm_firstdatacol = nparity;
1233         rm->rm_reports = 0;
1234         rm->rm_freed = 0;
1235         rm->rm_ecksuminjected = 0;
1236
1237         asize = 0;
1238
1239         for (c = 0; c < scols; c++) {
1240                 col = f + c;
1241                 coff = o;
1242                 if (col >= dcols) {
1243                         col -= dcols;
1244                         coff += 1ULL << unit_shift;
1245                 }
1246                 rm->rm_col[c].rc_devidx = col;
1247                 rm->rm_col[c].rc_offset = coff;
1248                 rm->rm_col[c].rc_data = NULL;
1249                 rm->rm_col[c].rc_error = 0;
1250                 rm->rm_col[c].rc_tried = 0;
1251                 rm->rm_col[c].rc_skipped = 0;
1252
1253                 if (c >= acols)
1254                         rm->rm_col[c].rc_size = 0;
1255                 else if (c < bc)
1256                         rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1257                 else
1258                         rm->rm_col[c].rc_size = q << unit_shift;
1259
1260                 asize += rm->rm_col[c].rc_size;
1261         }
1262
1263         ASSERT3U(asize, ==, tot << unit_shift);
1264         rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1265         rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1266         ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1267         ASSERT3U(rm->rm_nskip, <=, nparity);
1268
1269         for (c = 0; c < rm->rm_firstdatacol; c++) {
1270                 rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
1271                 if (rm->rm_col[c].rc_data == NULL) {
1272                         c++;
1273                         while (c != 0)
1274                                 free(rm->rm_col[--c].rc_data);
1275                         free(rm);
1276                         return (NULL);
1277                 }
1278         }
1279
1280         rm->rm_col[c].rc_data = data;
1281
1282         for (c = c + 1; c < acols; c++)
1283                 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1284                     rm->rm_col[c - 1].rc_size;
1285
1286         /*
1287          * If all data stored spans all columns, there's a danger that parity
1288          * will always be on the same device and, since parity isn't read
1289          * during normal operation, that that device's I/O bandwidth won't be
1290          * used effectively. We therefore switch the parity every 1MB.
1291          *
1292          * ... at least that was, ostensibly, the theory. As a practical
1293          * matter unless we juggle the parity between all devices evenly, we
1294          * won't see any benefit. Further, occasional writes that aren't a
1295          * multiple of the LCM of the number of children and the minimum
1296          * stripe width are sufficient to avoid pessimal behavior.
1297          * Unfortunately, this decision created an implicit on-disk format
1298          * requirement that we need to support for all eternity, but only
1299          * for single-parity RAID-Z.
1300          *
1301          * If we intend to skip a sector in the zeroth column for padding
1302          * we must make sure to note this swap. We will never intend to
1303          * skip the first column since at least one data and one parity
1304          * column must appear in each row.
1305          */
1306         ASSERT(rm->rm_cols >= 2);
1307         ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1308
1309         if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1310                 devidx = rm->rm_col[0].rc_devidx;
1311                 o = rm->rm_col[0].rc_offset;
1312                 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1313                 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1314                 rm->rm_col[1].rc_devidx = devidx;
1315                 rm->rm_col[1].rc_offset = o;
1316
1317                 if (rm->rm_skipstart == 0)
1318                         rm->rm_skipstart = 1;
1319         }
1320
1321         return (rm);
1322 }
1323
1324 static void
1325 vdev_raidz_map_free(raidz_map_t *rm)
1326 {
1327         int c;
1328
1329         for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1330                 free(rm->rm_col[c].rc_data);
1331
1332         free(rm);
1333 }
1334
1335 static vdev_t *
1336 vdev_child(vdev_t *pvd, uint64_t devidx)
1337 {
1338         vdev_t *cvd;
1339
1340         STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1341                 if (cvd->v_id == devidx)
1342                         break;
1343         }
1344
1345         return (cvd);
1346 }
1347
1348 /*
1349  * We keep track of whether or not there were any injected errors, so that
1350  * any ereports we generate can note it.
1351  */
1352 static int
1353 raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
1354     uint64_t size)
1355 {
1356         return (zio_checksum_verify(spa, bp, data));
1357 }
1358
1359 /*
1360  * Generate the parity from the data columns. If we tried and were able to
1361  * read the parity without error, verify that the generated parity matches the
1362  * data we read. If it doesn't, we fire off a checksum error. Return the
1363  * number such failures.
1364  */
1365 static int
1366 raidz_parity_verify(raidz_map_t *rm)
1367 {
1368         void *orig[VDEV_RAIDZ_MAXPARITY];
1369         int c, ret = 0;
1370         raidz_col_t *rc;
1371
1372         for (c = 0; c < rm->rm_firstdatacol; c++) {
1373                 rc = &rm->rm_col[c];
1374                 if (!rc->rc_tried || rc->rc_error != 0)
1375                         continue;
1376                 orig[c] = malloc(rc->rc_size);
1377                 if (orig[c] != NULL) {
1378                         bcopy(rc->rc_data, orig[c], rc->rc_size);
1379                 } else {
1380                         printf("Out of memory\n");
1381                 }
1382         }
1383
1384         vdev_raidz_generate_parity(rm);
1385
1386         for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1387                 rc = &rm->rm_col[c];
1388                 if (!rc->rc_tried || rc->rc_error != 0)
1389                         continue;
1390                 if (orig[c] == NULL ||
1391                     bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1392                         rc->rc_error = ECKSUM;
1393                         ret++;
1394                 }
1395                 free(orig[c]);
1396         }
1397
1398         return (ret);
1399 }
1400
1401 /*
1402  * Iterate over all combinations of bad data and attempt a reconstruction.
1403  * Note that the algorithm below is non-optimal because it doesn't take into
1404  * account how reconstruction is actually performed. For example, with
1405  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1406  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1407  * cases we'd only use parity information in column 0.
1408  */
1409 static int
1410 vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
1411     void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
1412 {
1413         raidz_col_t *rc;
1414         void *orig[VDEV_RAIDZ_MAXPARITY];
1415         int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1416         int *tgts = &tstore[1];
1417         int current, next, i, c, n;
1418         int code, ret = 0;
1419
1420         ASSERT(total_errors < rm->rm_firstdatacol);
1421
1422         /*
1423          * This simplifies one edge condition.
1424          */
1425         tgts[-1] = -1;
1426
1427         for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1428                 /*
1429                  * Initialize the targets array by finding the first n columns
1430                  * that contain no error.
1431                  *
1432                  * If there were no data errors, we need to ensure that we're
1433                  * always explicitly attempting to reconstruct at least one
1434                  * data column. To do this, we simply push the highest target
1435                  * up into the data columns.
1436                  */
1437                 for (c = 0, i = 0; i < n; i++) {
1438                         if (i == n - 1 && data_errors == 0 &&
1439                             c < rm->rm_firstdatacol) {
1440                                 c = rm->rm_firstdatacol;
1441                         }
1442
1443                         while (rm->rm_col[c].rc_error != 0) {
1444                                 c++;
1445                                 ASSERT3S(c, <, rm->rm_cols);
1446                         }
1447
1448                         tgts[i] = c++;
1449                 }
1450
1451                 /*
1452                  * Setting tgts[n] simplifies the other edge condition.
1453                  */
1454                 tgts[n] = rm->rm_cols;
1455
1456                 /*
1457                  * These buffers were allocated in previous iterations.
1458                  */
1459                 for (i = 0; i < n - 1; i++) {
1460                         ASSERT(orig[i] != NULL);
1461                 }
1462
1463                 orig[n - 1] = malloc(rm->rm_col[0].rc_size);
1464                 if (orig[n - 1] == NULL) {
1465                         ret = ENOMEM;
1466                         goto done;
1467                 }
1468
1469                 current = 0;
1470                 next = tgts[current];
1471
1472                 while (current != n) {
1473                         tgts[current] = next;
1474                         current = 0;
1475
1476                         /*
1477                          * Save off the original data that we're going to
1478                          * attempt to reconstruct.
1479                          */
1480                         for (i = 0; i < n; i++) {
1481                                 ASSERT(orig[i] != NULL);
1482                                 c = tgts[i];
1483                                 ASSERT3S(c, >=, 0);
1484                                 ASSERT3S(c, <, rm->rm_cols);
1485                                 rc = &rm->rm_col[c];
1486                                 bcopy(rc->rc_data, orig[i], rc->rc_size);
1487                         }
1488
1489                         /*
1490                          * Attempt a reconstruction and exit the outer loop on
1491                          * success.
1492                          */
1493                         code = vdev_raidz_reconstruct(rm, tgts, n);
1494                         if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
1495                                 for (i = 0; i < n; i++) {
1496                                         c = tgts[i];
1497                                         rc = &rm->rm_col[c];
1498                                         ASSERT(rc->rc_error == 0);
1499                                         rc->rc_error = ECKSUM;
1500                                 }
1501
1502                                 ret = code;
1503                                 goto done;
1504                         }
1505
1506                         /*
1507                          * Restore the original data.
1508                          */
1509                         for (i = 0; i < n; i++) {
1510                                 c = tgts[i];
1511                                 rc = &rm->rm_col[c];
1512                                 bcopy(orig[i], rc->rc_data, rc->rc_size);
1513                         }
1514
1515                         do {
1516                                 /*
1517                                  * Find the next valid column after the current
1518                                  * position..
1519                                  */
1520                                 for (next = tgts[current] + 1;
1521                                     next < rm->rm_cols &&
1522                                     rm->rm_col[next].rc_error != 0; next++)
1523                                         continue;
1524
1525                                 ASSERT(next <= tgts[current + 1]);
1526
1527                                 /*
1528                                  * If that spot is available, we're done here.
1529                                  */
1530                                 if (next != tgts[current + 1])
1531                                         break;
1532
1533                                 /*
1534                                  * Otherwise, find the next valid column after
1535                                  * the previous position.
1536                                  */
1537                                 for (c = tgts[current - 1] + 1;
1538                                     rm->rm_col[c].rc_error != 0; c++)
1539                                         continue;
1540
1541                                 tgts[current] = c;
1542                                 current++;
1543
1544                         } while (current != n);
1545                 }
1546         }
1547         n--;
1548 done:
1549         for (i = n - 1; i >= 0; i--) {
1550                 free(orig[i]);
1551         }
1552
1553         return (ret);
1554 }
1555
1556 static int
1557 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1558     off_t offset, size_t bytes)
1559 {
             /*
              * Read `bytes` bytes at `offset` from the RAID-Z vdev `vd` into
              * `data`, checking the result with raidz_checksum_verify()
              * against `bp`.
              *
              * The request is turned into a column map by
              * vdev_raidz_map_alloc(), each column is read through its child's
              * v_read method, and the buffer is then verified. On read errors
              * or a verification failure the routine falls back, in order, to
              * reconstructing the data columns that reported errors, reading
              * every column not yet tried, and combinatorial reconstruction.
              *
              * Returns 0 on success, ENOMEM if the map cannot be allocated, EIO
              * if more columns are in error than rm_firstdatacol, and ECKSUM if
              * the error count equals rm_firstdatacol or
              * vdev_raidz_combrec() returns 0. Once allocated, the map is
              * freed on every return path (via the `done` label).
              */
1560         vdev_t *tvd = vd->v_top;
1561         vdev_t *cvd;
1562         raidz_map_t *rm;
1563         raidz_col_t *rc;
1564         int c, error;
1565         int unexpected_errors;
1566         int parity_errors;
1567         int parity_untried;
1568         int data_errors;
1569         int total_errors;
1570         int n;
1571         int tgts[VDEV_RAIDZ_MAXPARITY];
1572         int code;
1573
1574         rc = NULL;      /* gcc */
1575         error = 0;
1576
1577         rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1578             vd->v_nchildren, vd->v_nparity);
1579         if (rm == NULL)
1580                 return (ENOMEM);
1581
1582         /*
1583          * Iterate over the columns in reverse order so that we hit the parity
1584          * last -- any errors along the way will force us to read the parity.
1585          */
1586         for (c = rm->rm_cols - 1; c >= 0; c--) {
1587                 rc = &rm->rm_col[c];
1588                 cvd = vdev_child(vd, rc->rc_devidx);
                     /*
                      * A missing or unhealthy child is not read at all: count it
                      * as missing data or parity, flag the column ENXIO, and mark
                      * it tried so the read-everything pass below skips it.
                      */
1589                 if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1590                         if (c >= rm->rm_firstdatacol)
1591                                 rm->rm_missingdata++;
1592                         else
1593                                 rm->rm_missingparity++;
1594                         rc->rc_error = ENXIO;
1595                         rc->rc_tried = 1;       /* don't even try */
1596                         rc->rc_skipped = 1;
1597                         continue;
1598                 }
1599 #if 0           /* XXX: Too hard for the boot code. */
1600                 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1601                         if (c >= rm->rm_firstdatacol)
1602                                 rm->rm_missingdata++;
1603                         else
1604                                 rm->rm_missingparity++;
1605                         rc->rc_error = ESTALE;
1606                         rc->rc_skipped = 1;
1607                         continue;
1608                 }
1609 #endif
                     /*
                      * Data columns are always read. Parity columns, which are
                      * visited last, are read on this pass only if some data
                      * column was already counted as missing above. A v_read
                      * failure does not bump rm_missingdata; those errors are
                      * picked up by the read-everything pass further down.
                      */
1610                 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1611                         rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1612                             rc->rc_offset, rc->rc_size);
1613                         rc->rc_tried = 1;
1614                         rc->rc_skipped = 0;
1615                 }
1616         }
1617
             /*
              * Tally the per-column state. This label is re-entered after the
              * read-everything pass manages to read additional columns.
              * Note that unexpected_errors is updated in several places below
              * but its value is never read in this function.
              */
1618 reconstruct:
1619         unexpected_errors = 0;
1620         parity_errors = 0;
1621         parity_untried = 0;
1622         data_errors = 0;
1623         total_errors = 0;
1624
1625         ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1626         ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1627
1628         for (c = 0; c < rm->rm_cols; c++) {
1629                 rc = &rm->rm_col[c];
1630
1631                 if (rc->rc_error) {
1632                         ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1633
                             /* Columns below rm_firstdatacol are parity. */
1634                         if (c < rm->rm_firstdatacol)
1635                                 parity_errors++;
1636                         else
1637                                 data_errors++;
1638
1639                         if (!rc->rc_skipped)
1640                                 unexpected_errors++;
1641
1642                         total_errors++;
1643                 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
                             /* Error-free parity column that has not been read yet. */
1644                         parity_untried++;
1645                 }
1646         }
1647
1648         /*
1649          * There are three potential phases for a read:
1650          *      1. produce valid data from the columns read
1651          *      2. read all disks and try again
1652          *      3. perform combinatorial reconstruction
1653          *
1654          * Each phase is progressively both more expensive and less likely to
1655          * occur. If we encounter more errors than we can repair or all phases
1656          * fail, we have no choice but to return an error.
1657          */
1658
1659         /*
1660          * If the number of errors we saw was correctable -- less than or equal
1661          * to the number of parity disks read -- attempt to produce data that
1662          * has a valid checksum. Naturally, this case applies in the absence of
1663          * any errors.
1664          */
1665         if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1666                 int rv;
1667
1668                 if (data_errors == 0) {
1669                         rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
1670                         if (rv == 0) {
1671                                 /*
1672                                  * If we read parity information (unnecessarily
1673                                  * as it happens since no reconstruction was
1674                                  * needed) regenerate and verify the parity.
1675                                  * We also regenerate parity when resilvering
1676                                  * so we can write it out to the failed device
1677                                  * later.
1678                                  */
1679                                 if (parity_errors + parity_untried <
1680                                     rm->rm_firstdatacol) {
1681                                         n = raidz_parity_verify(rm);
1682                                         unexpected_errors += n;
1683                                         ASSERT(parity_errors + n <=
1684                                             rm->rm_firstdatacol);
1685                                 }
1686                                 goto done;
1687                         }
1688                 } else {
1689                         /*
1690                          * We either attempt to read all the parity columns or
1691                          * none of them. If we didn't try to read parity, we
1692                          * wouldn't be here in the correctable case. There must
1693                          * also have been fewer parity errors than parity
1694                          * columns or, again, we wouldn't be in this code path.
1695                          */
1696                         ASSERT(parity_untried == 0);
1697                         ASSERT(parity_errors < rm->rm_firstdatacol);
1698
1699                         /*
1700                          * Identify the data columns that reported an error.
1701                          */
1702                         n = 0;
1703                         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1704                                 rc = &rm->rm_col[c];
1705                                 if (rc->rc_error != 0) {
1706                                         ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1707                                         tgts[n++] = c;
1708                                 }
1709                         }
1710
1711                         ASSERT(rm->rm_firstdatacol >= n);
1712
                             /*
                              * The value stored in `code` is not used on this
                              * path; success is judged solely by the checksum
                              * verification that follows.
                              */
1713                         code = vdev_raidz_reconstruct(rm, tgts, n);
1714
1715                         rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
1716                         if (rv == 0) {
1717                                 /*
1718                                  * If we read more parity disks than were used
1719                                  * for reconstruction, confirm that the other
1720                                  * parity disks produced correct data. This
1721                                  * routine is suboptimal in that it regenerates
1722                                  * the parity that we already used in addition
1723                                  * to the parity that we're attempting to
1724                                  * verify, but this should be a relatively
1725                                  * uncommon case, and can be optimized if it
1726                                  * becomes a problem. Note that we regenerate
1727                                  * parity when resilvering so we can write it
1728                                  * out to failed devices later.
1729                                  */
1730                                 if (parity_errors < rm->rm_firstdatacol - n) {
1731                                         n = raidz_parity_verify(rm);
1732                                         unexpected_errors += n;
1733                                         ASSERT(parity_errors + n <=
1734                                             rm->rm_firstdatacol);
1735                                 }
1736
1737                                 goto done;
1738                         }
1739                 }
1740         }
1741
1742         /*
1743          * This isn't a typical situation -- either we got a read
1744          * error or a child silently returned bad data. Read every
1745          * block so we can try again with as much data and parity as
1746          * we can track down. If we've already been through once
1747          * before, all children will be marked as tried so we'll
1748          * proceed to combinatorial reconstruction.
1749          */
1750         unexpected_errors = 1;
1751         rm->rm_missingdata = 0;
1752         rm->rm_missingparity = 0;
1753
             /* From here on, n counts the columns this pass reads successfully. */
1754         n = 0;
1755         for (c = 0; c < rm->rm_cols; c++) {
1756                 rc = &rm->rm_col[c];
1757
1758                 if (rc->rc_tried)
1759                         continue;
1760
                     /*
                      * Columns whose child was NULL were marked tried by the
                      * first loop, so only columns with a child reach this point.
                      * ASSERT() compiles to nothing in this file.
                      */
1761                 cvd = vdev_child(vd, rc->rc_devidx);
1762                 ASSERT(cvd != NULL);
1763                 rc->rc_error = cvd->v_read(cvd, NULL,
1764                     rc->rc_data, rc->rc_offset, rc->rc_size);
1765                 if (rc->rc_error == 0)
1766                         n++;
1767                 rc->rc_tried = 1;
1768                 rc->rc_skipped = 0;
1769         }
1770         /*
1771          * If we managed to read anything more, retry the
1772          * reconstruction.
1773          */
1774         if (n > 0)
1775                 goto reconstruct;
1776
1777         /*
1778          * At this point we've attempted to reconstruct the data given the
1779          * errors we detected, and we've attempted to read all columns. There
1780          * must, therefore, be one or more additional problems -- silent errors
1781          * resulting in invalid data rather than explicit I/O errors resulting
1782          * in absent data. We check if there is enough additional data to
1783          * possibly reconstruct the data and then perform combinatorial
1784          * reconstruction over all possible combinations. If that fails,
1785          * we're cooked.
1786          */
             /*
              * NOTE(review): any nonzero return from vdev_raidz_combrec() is
              * treated below as a successful reconstruction code and leaves
              * `error` at 0. vdev_raidz_combrec() appears to return ENOMEM when
              * its buffer allocation fails, which would be taken for success
              * here -- confirm and, if so, fix its return protocol.
              */
1787         if (total_errors > rm->rm_firstdatacol) {
1788                 error = EIO;
1789         } else if (total_errors < rm->rm_firstdatacol &&
1790             (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
1791              total_errors, data_errors)) != 0) {
1792                 /*
1793                  * If we didn't use all the available parity for the
1794                  * combinatorial reconstruction, verify that the remaining
1795                  * parity is correct.
1796                  */
                     /*
                      * (1 << rm_firstdatacol) - 1 has one bit set per parity
                      * column; presumably `code` is a bitmask of the parity
                      * columns used -- TODO confirm against
                      * vdev_raidz_reconstruct(). The verification result is
                      * deliberately discarded.
                      */
1797                 if (code != (1 << rm->rm_firstdatacol) - 1)
1798                         (void) raidz_parity_verify(rm);
1799         } else {
1800                 /*
1801                  * We're here because either:
1802                  *
1803                  *      total_errors == rm_firstdatacol, or
1804                  *      vdev_raidz_combrec() failed
1805                  *
1806                  * In either case, there is enough bad data to prevent
1807                  * reconstruction.
1808                  *
1809                  * Start checksum ereports for all children which haven't
1810                  * failed, and the IO wasn't speculative.
1811                  */
                     /*
                      * No ereports are actually issued by this code; the branch
                      * only records the error to return.
                      */
1812                 error = ECKSUM;
1813         }
1814
1815 done:
1816         vdev_raidz_map_free(rm);
1817
1818         return (error);
1819 }