4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
31 static uint64_t zfs_crc64_table[256];
/*
 * The boot loader never aborts on assertion failures: every ASSERT*()
 * flavor is compiled out to a no-op.
 */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

/*
 * panic() prints the message and spins forever.  The do/while (0)
 * wrapper (truncated in this copy of the file) is restored so the
 * macro is safe as a single statement in unbraced if/else bodies.
 */
#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)
53 * Calculate the crc64 table (used for the zap hash
56 if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
57 memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
58 for (i = 0; i < 256; i++)
59 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
60 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
65 zio_checksum_off(const void *buf, uint64_t size,
66 const void *ctx_template, zio_cksum_t *zcp)
68 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
72 * Signature for checksum functions.
74 typedef void zio_checksum_t(const void *data, uint64_t size,
75 const void *ctx_template, zio_cksum_t *zcp);
76 typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
77 typedef void zio_checksum_tmpl_free_t(void *ctx_template);
/*
 * Property flags describing the strength and usage of each checksum
 * algorithm.  Bit 0 is deliberately unused.
 */
typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;
93 * Information about each checksum function.
95 typedef struct zio_checksum_info {
96 /* checksum function for each byteorder */
97 zio_checksum_t *ci_func[2];
98 zio_checksum_tmpl_init_t *ci_tmpl_init;
99 zio_checksum_tmpl_free_t *ci_tmpl_free;
100 zio_checksum_flags_t ci_flags;
101 const char *ci_name; /* descriptive name */
102 } zio_checksum_info_t;
106 #include "fletcher.c"
108 #include "skein_zfs.c"
110 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
111 {{NULL, NULL}, NULL, NULL, 0, "inherit"},
112 {{NULL, NULL}, NULL, NULL, 0, "on"},
113 {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
114 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
115 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
116 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
117 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
118 {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
119 ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
120 {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
122 {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
123 ZCHECKSUM_FLAG_METADATA, "fletcher4"},
124 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
125 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
126 ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
127 {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
128 ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
129 {{zio_checksum_off, zio_checksum_off}, NULL, NULL,
131 {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
132 NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
133 ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
134 {{zio_checksum_skein_native, zio_checksum_skein_byteswap},
135 zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
136 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
137 ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
138 /* no edonr for now */
139 {{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
140 ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
/*
 * Common signatures for all zio compress/decompress functions.  The
 * unnamed trailing int carries the algorithm-specific level parameter
 * (e.g. the gzip level or the zle threshold).
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
152 * Information about each compression function.
154 typedef struct zio_compress_info {
155 zio_compress_func_t *ci_compress; /* compression function */
156 zio_decompress_func_t *ci_decompress; /* decompression function */
157 int ci_level; /* level parameter */
158 const char *ci_name; /* algorithm name */
159 } zio_compress_info_t;
165 * Compression vectors.
167 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
168 {NULL, NULL, 0, "inherit"},
169 {NULL, NULL, 0, "on"},
170 {NULL, NULL, 0, "uncompressed"},
171 {NULL, lzjb_decompress, 0, "lzjb"},
172 {NULL, NULL, 0, "empty"},
173 {NULL, NULL, 1, "gzip-1"},
174 {NULL, NULL, 2, "gzip-2"},
175 {NULL, NULL, 3, "gzip-3"},
176 {NULL, NULL, 4, "gzip-4"},
177 {NULL, NULL, 5, "gzip-5"},
178 {NULL, NULL, 6, "gzip-6"},
179 {NULL, NULL, 7, "gzip-7"},
180 {NULL, NULL, 8, "gzip-8"},
181 {NULL, NULL, 9, "gzip-9"},
182 {NULL, zle_decompress, 64, "zle"},
183 {NULL, lz4_decompress, 0, "lz4"},
/*
 * Byte-swap an array of uint64_t in place.  size is in bytes and must
 * be a multiple of 8.  The function header and braces were truncated
 * in this copy and are restored; the loop index is size_t to match
 * count and avoid a signed/unsigned comparison.
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
200 * Set the external verifier for a gang block based on <vdev, offset, txg>,
201 * a tuple which is guaranteed to be unique for the life of the pool.
204 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
206 const dva_t *dva = BP_IDENTITY(bp);
207 uint64_t txg = BP_PHYSICAL_BIRTH(bp);
209 ASSERT(BP_IS_GANG(bp));
211 ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
215 * Set the external verifier for a label block based on its offset.
216 * The vdev is implicit, and the txg is unknowable at pool open time --
217 * hence the logic in vdev_uberblock_load() to find the most recent copy.
220 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
222 ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
226 * Calls the template init function of a checksum which supports context
227 * templates and installs the template into the spa_t.
230 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
232 zio_checksum_info_t *ci = &zio_checksum_table[checksum];
234 if (ci->ci_tmpl_init == NULL)
237 if (spa->spa_cksum_tmpls[checksum] != NULL)
240 if (spa->spa_cksum_tmpls[checksum] == NULL) {
241 spa->spa_cksum_tmpls[checksum] =
242 ci->ci_tmpl_init(&spa->spa_cksum_salt);
247 * Called by a spa_t that's about to be deallocated. This steps through
248 * all of the checksum context templates and deallocates any that were
249 * initialized using the algorithm-specific template init function.
252 zio_checksum_templates_free(spa_t *spa)
254 for (enum zio_checksum checksum = 0;
255 checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
256 if (spa->spa_cksum_tmpls[checksum] != NULL) {
257 zio_checksum_info_t *ci = &zio_checksum_table[checksum];
259 ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
260 spa->spa_cksum_tmpls[checksum] = NULL;
266 zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
269 unsigned int checksum;
270 zio_checksum_info_t *ci;
272 zio_cksum_t actual_cksum, expected_cksum, verifier;
275 checksum = BP_GET_CHECKSUM(bp);
276 size = BP_GET_PSIZE(bp);
278 if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
280 ci = &zio_checksum_table[checksum];
281 if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
285 zio_checksum_template_init(checksum, __DECONST(spa_t *,spa));
286 ctx = spa->spa_cksum_tmpls[checksum];
289 if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
292 ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
293 checksum == ZIO_CHECKSUM_LABEL);
295 eck = (zio_eck_t *)((char *)data + size) - 1;
297 if (checksum == ZIO_CHECKSUM_GANG_HEADER)
298 zio_checksum_gang_verifier(&verifier, bp);
299 else if (checksum == ZIO_CHECKSUM_LABEL)
300 zio_checksum_label_verifier(&verifier,
301 DVA_GET_OFFSET(BP_IDENTITY(bp)));
303 verifier = bp->blk_cksum;
305 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
308 byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
310 expected_cksum = eck->zec_cksum;
311 eck->zec_cksum = verifier;
312 ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
313 eck->zec_cksum = expected_cksum;
316 byteswap_uint64_array(&expected_cksum,
317 sizeof (zio_cksum_t));
319 byteswap = BP_SHOULD_BYTESWAP(bp);
320 expected_cksum = bp->blk_cksum;
321 ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
324 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
325 /*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
333 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
334 void *dest, uint64_t destsize)
336 zio_compress_info_t *ci;
338 if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
339 printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
343 ci = &zio_compress_table[cpfunc];
344 if (!ci->ci_decompress) {
345 printf("ZFS: unsupported compression algorithm %s\n",
350 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
354 zap_hash(uint64_t salt, const char *name)
361 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
362 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
363 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
366 * Only use 28 bits, since we need 4 bits in the cookie for the
367 * collision differentiator. We MUST use the high bits, since
368 * those are the onces that we first pay attention to when
369 * chosing the bucket.
371 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
/*
 * Per-column state of a RAID-Z I/O.  The "} raidz_col_t;" closer,
 * truncated in this copy, is restored (the name is required by
 * raidz_map's rm_col[] member).
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
386 typedef struct raidz_map {
387 uint64_t rm_cols; /* Regular column count */
388 uint64_t rm_scols; /* Count including skipped columns */
389 uint64_t rm_bigcols; /* Number of oversized columns */
390 uint64_t rm_asize; /* Actual total I/O size */
391 uint64_t rm_missingdata; /* Count of missing data devices */
392 uint64_t rm_missingparity; /* Count of missing parity devices */
393 uint64_t rm_firstdatacol; /* First data column/parity count */
394 uint64_t rm_nskip; /* Skipped sectors for padding */
395 uint64_t rm_skipstart; /* Column index of padding start */
396 uintptr_t rm_reports; /* # of referencing checksum reports */
397 uint8_t rm_freed; /* map no longer has referencing ZIO */
398 uint8_t rm_ecksuminjected; /* checksum error was injected */
399 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
/* Indices of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a byte by 2 (and by 4) in GF(2^8) with the 0x11d
 * polynomial: shift left, then fold the carried-out top bit back in
 * as XOR 0x1d.  Note: x is evaluated more than once.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * The statement wrappers were truncated in this copy; they are restored
 * as do/while (0) so the macros are safe as single statements in
 * unbraced if/else bodies.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask)					\
do {									\
	(mask) = (x) & 0x8080808080808080ULL;				\
	(mask) = ((mask) << 1) - ((mask) >> 7);				\
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^			\
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL);				\
} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask)					\
do {									\
	VDEV_RAIDZ_64MUL_2((x), mask);					\
	VDEV_RAIDZ_64MUL_2((x), mask);					\
} while (0)
/*
 * Powers of 2 in the Galois field defined above (polynomial 0x11d).
 * vdev_raidz_pow2[i] == 2^i; the sequence cycles with period 255, so
 * entry 255 wraps back to 0x01.  The closing brace, truncated in this
 * copy, is restored; all 256 entries are as found.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Base-2 logarithms in the same Galois field: the inverse of
 * vdev_raidz_pow2 (log of 0 is undefined; entry 0 is a placeholder).
 * The closing brace, truncated in this copy, is restored; all 256
 * entries are as found.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf
};
503 * Multiply a given number by 2 raised to the given power.
506 vdev_raidz_exp2(uint8_t a, int exp)
512 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
514 exp += vdev_raidz_log2[a];
518 return (vdev_raidz_pow2[exp]);
522 vdev_raidz_generate_parity_p(raidz_map_t *rm)
524 uint64_t *p, *src, pcount, ccount, i;
527 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
529 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
530 src = rm->rm_col[c].rc_data;
531 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
532 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
534 if (c == rm->rm_firstdatacol) {
535 ASSERT(ccount == pcount);
536 for (i = 0; i < ccount; i++, src++, p++) {
540 ASSERT(ccount <= pcount);
541 for (i = 0; i < ccount; i++, src++, p++) {
549 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
551 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
554 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
555 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
556 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
558 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
559 src = rm->rm_col[c].rc_data;
560 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
561 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
563 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
565 if (c == rm->rm_firstdatacol) {
566 ASSERT(ccnt == pcnt || ccnt == 0);
567 for (i = 0; i < ccnt; i++, src++, p++, q++) {
571 for (; i < pcnt; i++, src++, p++, q++) {
576 ASSERT(ccnt <= pcnt);
579 * Apply the algorithm described above by multiplying
580 * the previous result and adding in the new value.
582 for (i = 0; i < ccnt; i++, src++, p++, q++) {
585 VDEV_RAIDZ_64MUL_2(*q, mask);
590 * Treat short columns as though they are full of 0s.
591 * Note that there's therefore nothing needed for P.
593 for (; i < pcnt; i++, q++) {
594 VDEV_RAIDZ_64MUL_2(*q, mask);
601 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
603 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
606 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
607 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
608 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
609 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
610 rm->rm_col[VDEV_RAIDZ_R].rc_size);
612 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
613 src = rm->rm_col[c].rc_data;
614 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
615 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
616 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
618 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
620 if (c == rm->rm_firstdatacol) {
621 ASSERT(ccnt == pcnt || ccnt == 0);
622 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
627 for (; i < pcnt; i++, src++, p++, q++, r++) {
633 ASSERT(ccnt <= pcnt);
636 * Apply the algorithm described above by multiplying
637 * the previous result and adding in the new value.
639 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
642 VDEV_RAIDZ_64MUL_2(*q, mask);
645 VDEV_RAIDZ_64MUL_4(*r, mask);
650 * Treat short columns as though they are full of 0s.
651 * Note that there's therefore nothing needed for P.
653 for (; i < pcnt; i++, q++, r++) {
654 VDEV_RAIDZ_64MUL_2(*q, mask);
655 VDEV_RAIDZ_64MUL_4(*r, mask);
662 * Generate RAID parity in the first virtual columns according to the number of
663 * parity columns available.
666 vdev_raidz_generate_parity(raidz_map_t *rm)
668 switch (rm->rm_firstdatacol) {
670 vdev_raidz_generate_parity_p(rm);
673 vdev_raidz_generate_parity_pq(rm);
676 vdev_raidz_generate_parity_pqr(rm);
679 panic("invalid RAID-Z configuration");
685 * In the general case of reconstruction, we must solve the system of linear
686 * equations defined by the coeffecients used to generate parity as well as
687 * the contents of the data and parity disks. This can be expressed with
688 * vectors for the original data (D) and the actual data (d) and parity (p)
689 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
693 * | V | | D_0 | | p_m-1 |
694 * | | x | : | = | d_0 |
695 * | I | | D_n-1 | | : |
696 * | | ~~ ~~ | d_n-1 |
699 * I is simply a square identity matrix of size n, and V is a vandermonde
700 * matrix defined by the coeffecients we chose for the various parity columns
701 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
702 * computation as well as linear separability.
705 * | 1 .. 1 1 1 | | p_0 |
706 * | 2^n-1 .. 4 2 1 | __ __ | : |
707 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
708 * | 1 .. 0 0 0 | | D_1 | | d_0 |
709 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
710 * | : : : : | | : | | d_2 |
711 * | 0 .. 1 0 0 | | D_n-1 | | : |
712 * | 0 .. 0 1 0 | ~~ ~~ | : |
713 * | 0 .. 0 0 1 | | d_n-1 |
716 * Note that I, V, d, and p are known. To compute D, we must invert the
717 * matrix and use the known data and parity values to reconstruct the unknown
718 * data values. We begin by removing the rows in V|I and d|p that correspond
719 * to failed or missing columns; we then make V|I square (n x n) and d|p
720 * sized n by removing rows corresponding to unused parity from the bottom up
721 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
722 * using Gauss-Jordan elimination. In the example below we use m=3 parity
723 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
725 * | 1 1 1 1 1 1 1 1 |
726 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
727 * | 19 205 116 29 64 16 4 1 | / /
728 * | 1 0 0 0 0 0 0 0 | / /
729 * | 0 1 0 0 0 0 0 0 | <--' /
730 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
731 * | 0 0 0 1 0 0 0 0 |
732 * | 0 0 0 0 1 0 0 0 |
733 * | 0 0 0 0 0 1 0 0 |
734 * | 0 0 0 0 0 0 1 0 |
735 * | 0 0 0 0 0 0 0 1 |
738 * | 1 1 1 1 1 1 1 1 |
739 * | 128 64 32 16 8 4 2 1 |
740 * | 19 205 116 29 64 16 4 1 |
741 * | 1 0 0 0 0 0 0 0 |
742 * | 0 1 0 0 0 0 0 0 |
743 * (V|I)' = | 0 0 1 0 0 0 0 0 |
744 * | 0 0 0 1 0 0 0 0 |
745 * | 0 0 0 0 1 0 0 0 |
746 * | 0 0 0 0 0 1 0 0 |
747 * | 0 0 0 0 0 0 1 0 |
748 * | 0 0 0 0 0 0 0 1 |
751 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
752 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
753 * matrix is not singular.
755 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
756 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
757 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
758 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
759 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
760 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
761 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
762 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
765 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
766 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
767 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
768 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
769 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
770 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
771 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
772 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
775 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
776 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
777 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
778 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
779 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
780 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
781 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
782 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
785 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
786 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
787 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
788 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
789 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
790 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
791 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
792 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
795 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
796 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
797 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
798 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
799 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
800 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
801 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
802 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
805 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
806 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
807 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
808 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
809 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
810 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
811 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
812 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
815 * | 0 0 1 0 0 0 0 0 |
816 * | 167 100 5 41 159 169 217 208 |
817 * | 166 100 4 40 158 168 216 209 |
818 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
819 * | 0 0 0 0 1 0 0 0 |
820 * | 0 0 0 0 0 1 0 0 |
821 * | 0 0 0 0 0 0 1 0 |
822 * | 0 0 0 0 0 0 0 1 |
825 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
826 * of the missing data.
828 * As is apparent from the example above, the only non-trivial rows in the
829 * inverse matrix correspond to the data disks that we're trying to
830 * reconstruct. Indeed, those are the only rows we need as the others would
831 * only be useful for reconstructing data known or assumed to be valid. For
832 * that reason, we only build the coefficients in the rows that correspond to
838 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
844 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
847 * Fill in the missing rows of interest.
849 for (i = 0; i < nmap; i++) {
850 ASSERT3S(0, <=, map[i]);
851 ASSERT3S(map[i], <=, 2);
858 for (j = 0; j < n; j++) {
862 rows[i][j] = vdev_raidz_pow2[pow];
868 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
869 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
875 * Assert that the first nmissing entries from the array of used
876 * columns correspond to parity columns and that subsequent entries
877 * correspond to data columns.
879 for (i = 0; i < nmissing; i++) {
880 ASSERT3S(used[i], <, rm->rm_firstdatacol);
883 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
887 * First initialize the storage where we'll compute the inverse rows.
889 for (i = 0; i < nmissing; i++) {
890 for (j = 0; j < n; j++) {
891 invrows[i][j] = (i == j) ? 1 : 0;
896 * Subtract all trivial rows from the rows of consequence.
898 for (i = 0; i < nmissing; i++) {
899 for (j = nmissing; j < n; j++) {
900 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
901 jj = used[j] - rm->rm_firstdatacol;
903 invrows[i][j] = rows[i][jj];
909 * For each of the rows of interest, we must normalize it and subtract
910 * a multiple of it from the other rows.
912 for (i = 0; i < nmissing; i++) {
913 for (j = 0; j < missing[i]; j++) {
914 ASSERT3U(rows[i][j], ==, 0);
916 ASSERT3U(rows[i][missing[i]], !=, 0);
919 * Compute the inverse of the first element and multiply each
920 * element in the row by that value.
922 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
924 for (j = 0; j < n; j++) {
925 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
926 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
929 for (ii = 0; ii < nmissing; ii++) {
933 ASSERT3U(rows[ii][missing[i]], !=, 0);
935 log = vdev_raidz_log2[rows[ii][missing[i]]];
937 for (j = 0; j < n; j++) {
939 vdev_raidz_exp2(rows[i][j], log);
941 vdev_raidz_exp2(invrows[i][j], log);
947 * Verify that the data that is left in the rows are properly part of
948 * an identity matrix.
950 for (i = 0; i < nmissing; i++) {
951 for (j = 0; j < n; j++) {
952 if (j == missing[i]) {
953 ASSERT3U(rows[i][j], ==, 1);
955 ASSERT3U(rows[i][j], ==, 0);
962 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
963 int *missing, uint8_t **invrows, const uint8_t *used)
968 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
969 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
972 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
977 psize = sizeof (invlog[0][0]) * n * nmissing;
980 printf("Out of memory\n");
984 for (pp = p, i = 0; i < nmissing; i++) {
989 for (i = 0; i < nmissing; i++) {
990 for (j = 0; j < n; j++) {
991 ASSERT3U(invrows[i][j], !=, 0);
992 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
996 for (i = 0; i < n; i++) {
998 ASSERT3U(c, <, rm->rm_cols);
1000 src = rm->rm_col[c].rc_data;
1001 ccount = rm->rm_col[c].rc_size;
1002 for (j = 0; j < nmissing; j++) {
1003 cc = missing[j] + rm->rm_firstdatacol;
1004 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1005 ASSERT3U(cc, <, rm->rm_cols);
1006 ASSERT3U(cc, !=, c);
1008 dst[j] = rm->rm_col[cc].rc_data;
1009 dcount[j] = rm->rm_col[cc].rc_size;
1012 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1014 for (x = 0; x < ccount; x++, src++) {
1016 log = vdev_raidz_log2[*src];
1018 for (cc = 0; cc < nmissing; cc++) {
1019 if (x >= dcount[cc])
1025 if ((ll = log + invlog[cc][i]) >= 255)
1027 val = vdev_raidz_pow2[ll];
1042 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1046 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1047 int parity_map[VDEV_RAIDZ_MAXPARITY];
1052 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1053 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1059 n = rm->rm_cols - rm->rm_firstdatacol;
1062 * Figure out which data columns are missing.
1065 for (t = 0; t < ntgts; t++) {
1066 if (tgts[t] >= rm->rm_firstdatacol) {
1067 missing_rows[nmissing_rows++] =
1068 tgts[t] - rm->rm_firstdatacol;
1073 * Figure out which parity columns to use to help generate the missing
1076 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1078 ASSERT(c < rm->rm_firstdatacol);
1081 * Skip any targeted parity columns.
1083 if (c == tgts[tt]) {
1095 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1097 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1098 nmissing_rows * n + sizeof (used[0]) * n;
1101 printf("Out of memory\n");
1105 for (pp = p, i = 0; i < nmissing_rows; i++) {
1113 for (i = 0; i < nmissing_rows; i++) {
1114 used[i] = parity_map[i];
1117 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1118 if (tt < nmissing_rows &&
1119 c == missing_rows[tt] + rm->rm_firstdatacol) {
1130 * Initialize the interesting rows of the matrix.
1132 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1135 * Invert the matrix.
1137 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1141 * Reconstruct the missing data using the generated matrix.
1143 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1152 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1154 int tgts[VDEV_RAIDZ_MAXPARITY];
1158 int nbadparity, nbaddata;
1161 * The tgts list must already be sorted.
1163 for (i = 1; i < nt; i++) {
1164 ASSERT(t[i] > t[i - 1]);
1167 nbadparity = rm->rm_firstdatacol;
1168 nbaddata = rm->rm_cols - nbadparity;
1170 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1171 if (i < nt && c == t[i]) {
1174 } else if (rm->rm_col[c].rc_error != 0) {
1176 } else if (c >= rm->rm_firstdatacol) {
1183 ASSERT(ntgts >= nt);
1184 ASSERT(nbaddata >= 0);
1185 ASSERT(nbaddata + nbadparity == ntgts);
1187 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1188 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
/*
 * Map a logical I/O (byte offset/size) onto the columns of a RAID-Z group:
 * compute how many sectors land on each child device and at what device
 * offset, allocate a raidz_map_t describing the layout, give the parity
 * columns private scratch buffers, and point the data columns at
 * consecutive slices of the caller's buffer.
 *
 * NOTE(review): several original source lines are missing from this view
 * (extraction gaps); comments on values computed on missing lines are
 * hedged accordingly.
 */
1193 static raidz_map_t *
1194 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1195 uint64_t dcols, uint64_t nparity)
/* b: first logical sector of the I/O (one sector == 1 << unit_shift bytes). */
1198 uint64_t b = offset >> unit_shift;
/* s: I/O size in sectors. */
1199 uint64_t s = size >> unit_shift;
/* f: column on which the first sector lands. */
1200 uint64_t f = b % dcols;
/* o: byte offset on each child device of the row containing sector b. */
1201 uint64_t o = (b / dcols) << unit_shift;
1202 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
/* q: data sectors per column in the full rows; r: leftover data sectors. */
1204 q = s / (dcols - nparity);
1205 r = s - q * (dcols - nparity);
/* bc: columns participating in the final, partial ("big") row, incl. parity. */
1206 bc = (r == 0 ? 0 : r + nparity);
/* tot: total sectors consumed — data plus parity for every row written. */
1207 tot = s + nparity * (q + (r == 0 ? 0 : 1));
/*
 * scols: number of column slots to allocate.  acols is computed on a line
 * missing from this view — presumably the number of columns actually used;
 * TODO(review): confirm against the full source.
 */
1211 scols = MIN(dcols, roundup(bc, nparity + 1));
1217 ASSERT3U(acols, <=, scols);
/* Flexible-array-style allocation: map header plus scols column descriptors. */
1219 rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
1223 rm->rm_cols = acols;
1224 rm->rm_scols = scols;
1225 rm->rm_bigcols = bc;
1226 rm->rm_skipstart = bc;
1227 rm->rm_missingdata = 0;
1228 rm->rm_missingparity = 0;
/* The first nparity columns hold parity; data columns start right after. */
1229 rm->rm_firstdatacol = nparity;
1232 rm->rm_ecksuminjected = 0;
/* Fill in per-column device index, device offset, and size. */
1236 for (c = 0; c < scols; c++) {
/* Columns that wrapped past the last device sit one sector further in. */
1241 coff += 1ULL << unit_shift;
1243 rm->rm_col[c].rc_devidx = col;
1244 rm->rm_col[c].rc_offset = coff;
1245 rm->rm_col[c].rc_data = NULL;
1246 rm->rm_col[c].rc_error = 0;
1247 rm->rm_col[c].rc_tried = 0;
1248 rm->rm_col[c].rc_skipped = 0;
/* Unused slots get size 0; big-row columns get one extra sector (q + 1). */
1251 rm->rm_col[c].rc_size = 0;
1253 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1255 rm->rm_col[c].rc_size = q << unit_shift;
1257 asize += rm->rm_col[c].rc_size;
1260 ASSERT3U(asize, ==, tot << unit_shift);
/* Round the on-disk footprint up to a whole (nparity + 1)-sector multiple;
 * rm_nskip is how many pad ("skip") sectors that rounding implies. */
1261 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1262 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1263 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1264 ASSERT3U(rm->rm_nskip, <=, nparity);
/*
 * Parity columns get private scratch buffers; on allocation failure,
 * unwind the buffers allocated so far (error-return path is on lines
 * missing from this view).
 */
1266 for (c = 0; c < rm->rm_firstdatacol; c++) {
1267 rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
1268 if (rm->rm_col[c].rc_data == NULL) {
1271 free(rm->rm_col[--c].rc_data);
/* Data columns alias consecutive slices of the caller's buffer — no copy. */
1277 rm->rm_col[c].rc_data = data;
1279 for (c = c + 1; c < acols; c++)
1280 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1281 rm->rm_col[c - 1].rc_size;
1284 * If all data stored spans all columns, there's a danger that parity
1285 * will always be on the same device and, since parity isn't read
1286 * during normal operation, that that device's I/O bandwidth won't be
1287 * used effectively. We therefore switch the parity every 1MB.
1289 * ... at least that was, ostensibly, the theory. As a practical
1290 * matter unless we juggle the parity between all devices evenly, we
1291 * won't see any benefit. Further, occasional writes that aren't a
1292 * multiple of the LCM of the number of children and the minimum
1293 * stripe width are sufficient to avoid pessimal behavior.
1294 * Unfortunately, this decision created an implicit on-disk format
1295 * requirement that we need to support for all eternity, but only
1296 * for single-parity RAID-Z.
1298 * If we intend to skip a sector in the zeroth column for padding
1299 * we must make sure to note this swap. We will never intend to
1300 * skip the first column since at least one data and one parity
1301 * column must appear in each row.
1303 ASSERT(rm->rm_cols >= 2);
1304 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
/* RAID-Z1 only: swap parity (column 0) with the first data column on
 * alternating 1MB boundaries of the logical offset (bit 20 of offset). */
1306 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1307 devidx = rm->rm_col[0].rc_devidx;
1308 o = rm->rm_col[0].rc_offset;
1309 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1310 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1311 rm->rm_col[1].rc_devidx = devidx;
1312 rm->rm_col[1].rc_offset = o;
/* A skip sector that would have landed in column 0 moved with the swap. */
1314 if (rm->rm_skipstart == 0)
1315 rm->rm_skipstart = 1;
/*
 * Release a raidz_map_t.  Only the parity columns own their buffers
 * (allocated in vdev_raidz_map_alloc); data columns alias the caller's
 * buffer and are not freed here.  The free of rm itself is on lines
 * missing from this view.
 */
1322 vdev_raidz_map_free(raidz_map_t *rm)
1326 for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1327 free(rm->rm_col[c].rc_data);
/*
 * Look up the child vdev of pvd whose v_id equals devidx by walking the
 * v_children list.  The found/not-found return paths are on lines missing
 * from this view.
 */
1333 vdev_child(vdev_t *pvd, uint64_t devidx)
1337 STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1338 if (cvd->v_id == devidx)
1346 * We keep track of whether or not there were any injected errors, so that
1347 * any ereports we generate can note it.
/*
 * Boot-loader stub: defer straight to the generic block-pointer checksum
 * routine.  (The kernel version also records injected-error state; here
 * there is nothing extra to track.)
 */
1350 raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
1353 return (zio_checksum_verify(spa, bp, data));
1357 * Generate the parity from the data columns. If we tried and were able to
1358 * read the parity without error, verify that the generated parity matches the
1359 * data we read. If it doesn't, we fire off a checksum error. Return the
1360 * number of such failures.
/*
 * Verify parity columns that were read from disk against freshly generated
 * parity; flag any mismatch as ECKSUM on that column.  The return of the
 * failure count is on lines missing from this view.
 */
1363 raidz_parity_verify(raidz_map_t *rm)
1365 void *orig[VDEV_RAIDZ_MAXPARITY];
/*
 * Snapshot each parity column that was successfully read so we can
 * compare it after regenerating parity in place below.
 */
1369 for (c = 0; c < rm->rm_firstdatacol; c++) {
1370 rc = &rm->rm_col[c];
1371 if (!rc->rc_tried || rc->rc_error != 0)
1373 orig[c] = malloc(rc->rc_size);
1374 if (orig[c] != NULL) {
1375 bcopy(rc->rc_data, orig[c], rc->rc_size);
/* malloc failure: report it; orig[c] stays NULL and is treated as a
 * mismatch below — TODO(review): confirm against missing lines. */
1377 printf("Out of memory\n");
/* Recompute parity from the data columns, overwriting rc_data in place. */
1381 vdev_raidz_generate_parity(rm);
/* Any difference from the parity we actually read is a checksum error. */
1383 for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1384 rc = &rm->rm_col[c];
1385 if (!rc->rc_tried || rc->rc_error != 0)
1387 if (orig[c] == NULL ||
1388 bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1389 rc->rc_error = ECKSUM;
1399 * Iterate over all combinations of bad data and attempt a reconstruction.
1400 * Note that the algorithm below is non-optimal because it doesn't take into
1401 * account how reconstruction is actually performed. For example, with
1402 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1403 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1404 * cases we'd only use parity information in column 0.
/*
 * Combinatorial reconstruction: for n = 1 .. (available parity), walk every
 * combination of n columns treated as the "bad" set, reconstruct them, and
 * accept the first combination whose result passes the block checksum.
 * On success the failing columns are marked ECKSUM and the reconstruction
 * code is returned (success/failure return paths are partially on lines
 * missing from this view).
 */
1407 vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
1408 void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
1411 void *orig[VDEV_RAIDZ_MAXPARITY];
/* tgts points one past tstore[0] so tgts[-1] is valid scratch storage. */
1412 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1413 int *tgts = &tstore[1];
1414 int current, next, i, c, n;
1417 ASSERT(total_errors < rm->rm_firstdatacol);
1420 * This simplifies one edge condition.
/* Try progressively larger sets of assumed-bad columns. */
1424 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1426 * Initialize the targets array by finding the first n columns
1427 * that contain no error.
1429 * If there were no data errors, we need to ensure that we're
1430 * always explicitly attempting to reconstruct at least one
1431 * data column. To do this, we simply push the highest target
1432 * up into the data columns.
1434 for (c = 0, i = 0; i < n; i++) {
1435 if (i == n - 1 && data_errors == 0 &&
1436 c < rm->rm_firstdatacol) {
1437 c = rm->rm_firstdatacol;
/* Skip columns that already reported an error — they can't be targets. */
1440 while (rm->rm_col[c].rc_error != 0) {
1442 ASSERT3S(c, <, rm->rm_cols);
1449 * Setting tgts[n] simplifies the other edge condition.
1451 tgts[n] = rm->rm_cols;
1454 * These buffers were allocated in previous iterations.
1456 for (i = 0; i < n - 1; i++) {
1457 ASSERT(orig[i] != NULL);
/* Only the n-th save buffer is new this iteration; earlier ones persist. */
1460 orig[n - 1] = malloc(rm->rm_col[0].rc_size);
1461 if (orig[n - 1] == NULL) {
1467 next = tgts[current];
/* Enumerate combinations in lexicographic order until wrapped (current == n). */
1469 while (current != n) {
1470 tgts[current] = next;
1474 * Save off the original data that we're going to
1475 * attempt to reconstruct.
1477 for (i = 0; i < n; i++) {
1478 ASSERT(orig[i] != NULL);
1481 ASSERT3S(c, <, rm->rm_cols);
1482 rc = &rm->rm_col[c];
1483 bcopy(rc->rc_data, orig[i], rc->rc_size);
1487 * Attempt a reconstruction and exit the outer loop on
/* If the reconstructed block checksums, mark the target columns ECKSUM
 * (they held silently bad data) and we are done. */
1490 code = vdev_raidz_reconstruct(rm, tgts, n);
1491 if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
1492 for (i = 0; i < n; i++) {
1494 rc = &rm->rm_col[c];
1495 ASSERT(rc->rc_error == 0);
1496 rc->rc_error = ECKSUM;
1504 * Restore the original data.
1506 for (i = 0; i < n; i++) {
1508 rc = &rm->rm_col[c];
1509 bcopy(orig[i], rc->rc_data, rc->rc_size);
1514 * Find the next valid column after the current
1517 for (next = tgts[current] + 1;
1518 next < rm->rm_cols &&
1519 rm->rm_col[next].rc_error != 0; next++)
1522 ASSERT(next <= tgts[current + 1]);
1525 * If that spot is available, we're done here.
1527 if (next != tgts[current + 1])
1531 * Otherwise, find the next valid column after
1532 * the previous position.
1534 for (c = tgts[current - 1] + 1;
1535 rm->rm_col[c].rc_error != 0; c++)
1541 } while (current != n);
/* Free the per-iteration save buffers before returning. */
1546 for (i = n - 1; i >= 0; i--) {
1554 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1555 off_t offset, size_t bytes)
1557 vdev_t *tvd = vd->v_top;
1562 int unexpected_errors;
1568 int tgts[VDEV_RAIDZ_MAXPARITY];
1571 rc = NULL; /* gcc */
1574 rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1575 vd->v_nchildren, vd->v_nparity);
1580 * Iterate over the columns in reverse order so that we hit the parity
1581 * last -- any errors along the way will force us to read the parity.
1583 for (c = rm->rm_cols - 1; c >= 0; c--) {
1584 rc = &rm->rm_col[c];
1585 cvd = vdev_child(vd, rc->rc_devidx);
1586 if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1587 if (c >= rm->rm_firstdatacol)
1588 rm->rm_missingdata++;
1590 rm->rm_missingparity++;
1591 rc->rc_error = ENXIO;
1592 rc->rc_tried = 1; /* don't even try */
1596 #if 0 /* XXX: Too hard for the boot code. */
1597 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1598 if (c >= rm->rm_firstdatacol)
1599 rm->rm_missingdata++;
1601 rm->rm_missingparity++;
1602 rc->rc_error = ESTALE;
1607 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1608 rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1609 rc->rc_offset, rc->rc_size);
1616 unexpected_errors = 0;
1622 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1623 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1625 for (c = 0; c < rm->rm_cols; c++) {
1626 rc = &rm->rm_col[c];
1629 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1631 if (c < rm->rm_firstdatacol)
1636 if (!rc->rc_skipped)
1637 unexpected_errors++;
1640 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1646 * There are three potential phases for a read:
1647 * 1. produce valid data from the columns read
1648 * 2. read all disks and try again
1649 * 3. perform combinatorial reconstruction
1651 * Each phase is progressively both more expensive and less likely to
1652 * occur. If we encounter more errors than we can repair or all phases
1653 * fail, we have no choice but to return an error.
1657 * If the number of errors we saw was correctable -- less than or equal
1658 * to the number of parity disks read -- attempt to produce data that
1659 * has a valid checksum. Naturally, this case applies in the absence of
1662 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1665 if (data_errors == 0) {
1666 rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
1669 * If we read parity information (unnecessarily
1670 * as it happens since no reconstruction was
1671 * needed) regenerate and verify the parity.
1672 * We also regenerate parity when resilvering
1673 * so we can write it out to the failed device
1676 if (parity_errors + parity_untried <
1677 rm->rm_firstdatacol) {
1678 n = raidz_parity_verify(rm);
1679 unexpected_errors += n;
1680 ASSERT(parity_errors + n <=
1681 rm->rm_firstdatacol);
1687 * We either attempt to read all the parity columns or
1688 * none of them. If we didn't try to read parity, we
1689 * wouldn't be here in the correctable case. There must
1690 * also have been fewer parity errors than parity
1691 * columns or, again, we wouldn't be in this code path.
1693 ASSERT(parity_untried == 0);
1694 ASSERT(parity_errors < rm->rm_firstdatacol);
1697 * Identify the data columns that reported an error.
1700 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1701 rc = &rm->rm_col[c];
1702 if (rc->rc_error != 0) {
1703 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1708 ASSERT(rm->rm_firstdatacol >= n);
1710 code = vdev_raidz_reconstruct(rm, tgts, n);
1712 rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
1715 * If we read more parity disks than were used
1716 * for reconstruction, confirm that the other
1717 * parity disks produced correct data. This
1718 * routine is suboptimal in that it regenerates
1719 * the parity that we already used in addition
1720 * to the parity that we're attempting to
1721 * verify, but this should be a relatively
1722 * uncommon case, and can be optimized if it
1723 * becomes a problem. Note that we regenerate
1724 * parity when resilvering so we can write it
1725 * out to failed devices later.
1727 if (parity_errors < rm->rm_firstdatacol - n) {
1728 n = raidz_parity_verify(rm);
1729 unexpected_errors += n;
1730 ASSERT(parity_errors + n <=
1731 rm->rm_firstdatacol);
1740 * This isn't a typical situation -- either we got a read
1741 * error or a child silently returned bad data. Read every
1742 * block so we can try again with as much data and parity as
1743 * we can track down. If we've already been through once
1744 * before, all children will be marked as tried so we'll
1745 * proceed to combinatorial reconstruction.
1747 unexpected_errors = 1;
1748 rm->rm_missingdata = 0;
1749 rm->rm_missingparity = 0;
1752 for (c = 0; c < rm->rm_cols; c++) {
1753 rc = &rm->rm_col[c];
1758 cvd = vdev_child(vd, rc->rc_devidx);
1759 ASSERT(cvd != NULL);
1760 rc->rc_error = cvd->v_read(cvd, NULL,
1761 rc->rc_data, rc->rc_offset, rc->rc_size);
1762 if (rc->rc_error == 0)
1768 * If we managed to read anything more, retry the
1775 * At this point we've attempted to reconstruct the data given the
1776 * errors we detected, and we've attempted to read all columns. There
1777 * must, therefore, be one or more additional problems -- silent errors
1778 * resulting in invalid data rather than explicit I/O errors resulting
1779 * in absent data. We check if there is enough additional data to
1780 * possibly reconstruct the data and then perform combinatorial
1781 * reconstruction over all possible combinations. If that fails,
1784 if (total_errors > rm->rm_firstdatacol) {
1786 } else if (total_errors < rm->rm_firstdatacol &&
1787 (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
1788 total_errors, data_errors)) != 0) {
1790 * If we didn't use all the available parity for the
1791 * combinatorial reconstruction, verify that the remaining
1792 * parity is correct.
1794 if (code != (1 << rm->rm_firstdatacol) - 1)
1795 (void) raidz_parity_verify(rm);
1798 * We're here because either:
1800 * total_errors == rm_first_datacol, or
1801 * vdev_raidz_combrec() failed
1803 * In either case, there is enough bad data to prevent
1806 * Start checksum ereports for all children which haven't
1807 * failed, and the IO wasn't speculative.
1813 vdev_raidz_map_free(rm);