4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
31 static uint64_t zfs_crc64_table[256];
/*
 * The boot loader never aborts on assertion failures: every ASSERT*()
 * flavor is compiled out to a no-op.
 */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

/*
 * panic() prints the message and spins forever.  The do/while (0)
 * wrapper (truncated in this copy of the file) is restored so the
 * macro is safe as a single statement in unbraced if/else bodies.
 */
#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)
53 * Calculate the crc64 table (used for the zap hash
56 if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
57 memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
58 for (i = 0; i < 256; i++)
59 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
60 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
65 zio_checksum_off(const void *buf, uint64_t size,
66 const void *ctx_template, zio_cksum_t *zcp)
68 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
72 * Signature for checksum functions.
74 typedef void zio_checksum_t(const void *data, uint64_t size,
75 const void *ctx_template, zio_cksum_t *zcp);
76 typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
77 typedef void zio_checksum_tmpl_free_t(void *ctx_template);
/*
 * Property flags describing the strength and usage of each checksum
 * algorithm.  Bit 0 is deliberately unused.
 */
typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;
93 * Information about each checksum function.
95 typedef struct zio_checksum_info {
96 /* checksum function for each byteorder */
97 zio_checksum_t *ci_func[2];
98 zio_checksum_tmpl_init_t *ci_tmpl_init;
99 zio_checksum_tmpl_free_t *ci_tmpl_free;
100 zio_checksum_flags_t ci_flags;
101 const char *ci_name; /* descriptive name */
102 } zio_checksum_info_t;
106 #include "fletcher.c"
108 #include "skein_zfs.c"
110 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
111 {{NULL, NULL}, NULL, NULL, 0, "inherit"},
112 {{NULL, NULL}, NULL, NULL, 0, "on"},
113 {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
114 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
115 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
116 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
117 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
118 {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
119 ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
120 {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
122 {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
123 ZCHECKSUM_FLAG_METADATA, "fletcher4"},
124 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
125 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
126 ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
127 {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
128 ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
129 {{zio_checksum_off, zio_checksum_off}, NULL, NULL,
131 {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
132 NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
133 ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
134 {{zio_checksum_skein_native, zio_checksum_skein_byteswap},
135 zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
136 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
137 ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
138 /* no edonr for now */
139 {{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
140 ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
/*
 * Common signatures for all zio compress/decompress functions.  The
 * unnamed trailing int carries the algorithm-specific level parameter
 * (e.g. the gzip level or the zle threshold).
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
152 * Information about each compression function.
154 typedef struct zio_compress_info {
155 zio_compress_func_t *ci_compress; /* compression function */
156 zio_decompress_func_t *ci_decompress; /* decompression function */
157 int ci_level; /* level parameter */
158 const char *ci_name; /* algorithm name */
159 } zio_compress_info_t;
165 * Compression vectors.
167 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
168 {NULL, NULL, 0, "inherit"},
169 {NULL, NULL, 0, "on"},
170 {NULL, NULL, 0, "uncompressed"},
171 {NULL, lzjb_decompress, 0, "lzjb"},
172 {NULL, NULL, 0, "empty"},
173 {NULL, NULL, 1, "gzip-1"},
174 {NULL, NULL, 2, "gzip-2"},
175 {NULL, NULL, 3, "gzip-3"},
176 {NULL, NULL, 4, "gzip-4"},
177 {NULL, NULL, 5, "gzip-5"},
178 {NULL, NULL, 6, "gzip-6"},
179 {NULL, NULL, 7, "gzip-7"},
180 {NULL, NULL, 8, "gzip-8"},
181 {NULL, NULL, 9, "gzip-9"},
182 {NULL, zle_decompress, 64, "zle"},
183 {NULL, lz4_decompress, 0, "lz4"},
/*
 * Byte-swap an array of uint64_t in place.  size is in bytes and must
 * be a multiple of 8.  The function header and braces were truncated
 * in this copy and are restored; the loop index is size_t to match
 * count and avoid a signed/unsigned comparison.
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
200 * Set the external verifier for a gang block based on <vdev, offset, txg>,
201 * a tuple which is guaranteed to be unique for the life of the pool.
204 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
206 const dva_t *dva = BP_IDENTITY(bp);
207 uint64_t txg = BP_PHYSICAL_BIRTH(bp);
209 ASSERT(BP_IS_GANG(bp));
211 ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
215 * Set the external verifier for a label block based on its offset.
216 * The vdev is implicit, and the txg is unknowable at pool open time --
217 * hence the logic in vdev_uberblock_load() to find the most recent copy.
220 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
222 ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
226 * Calls the template init function of a checksum which supports context
227 * templates and installs the template into the spa_t.
230 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
232 zio_checksum_info_t *ci = &zio_checksum_table[checksum];
234 if (ci->ci_tmpl_init == NULL)
237 if (spa->spa_cksum_tmpls[checksum] != NULL)
240 if (spa->spa_cksum_tmpls[checksum] == NULL) {
241 spa->spa_cksum_tmpls[checksum] =
242 ci->ci_tmpl_init(&spa->spa_cksum_salt);
247 * Called by a spa_t that's about to be deallocated. This steps through
248 * all of the checksum context templates and deallocates any that were
249 * initialized using the algorithm-specific template init function.
252 zio_checksum_templates_free(spa_t *spa)
254 for (enum zio_checksum checksum = 0;
255 checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
256 if (spa->spa_cksum_tmpls[checksum] != NULL) {
257 zio_checksum_info_t *ci = &zio_checksum_table[checksum];
259 ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
260 spa->spa_cksum_tmpls[checksum] = NULL;
266 zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
269 unsigned int checksum;
270 zio_checksum_info_t *ci;
272 zio_cksum_t actual_cksum, expected_cksum, verifier;
275 checksum = BP_GET_CHECKSUM(bp);
276 size = BP_GET_PSIZE(bp);
278 if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
280 ci = &zio_checksum_table[checksum];
281 if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
285 zio_checksum_template_init(checksum, __DECONST(spa_t *,spa));
286 ctx = spa->spa_cksum_tmpls[checksum];
289 if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
292 ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
293 checksum == ZIO_CHECKSUM_LABEL);
295 eck = (zio_eck_t *)((char *)data + size) - 1;
297 if (checksum == ZIO_CHECKSUM_GANG_HEADER)
298 zio_checksum_gang_verifier(&verifier, bp);
299 else if (checksum == ZIO_CHECKSUM_LABEL)
300 zio_checksum_label_verifier(&verifier,
301 DVA_GET_OFFSET(BP_IDENTITY(bp)));
303 verifier = bp->blk_cksum;
305 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
308 byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
310 expected_cksum = eck->zec_cksum;
311 eck->zec_cksum = verifier;
312 ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
313 eck->zec_cksum = expected_cksum;
316 byteswap_uint64_array(&expected_cksum,
317 sizeof (zio_cksum_t));
319 byteswap = BP_SHOULD_BYTESWAP(bp);
320 expected_cksum = bp->blk_cksum;
321 ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
324 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
325 /*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
333 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
334 void *dest, uint64_t destsize)
336 zio_compress_info_t *ci;
338 if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
339 printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
343 ci = &zio_compress_table[cpfunc];
344 if (!ci->ci_decompress) {
345 printf("ZFS: unsupported compression algorithm %s\n",
350 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
354 zap_hash(uint64_t salt, const char *name)
361 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
362 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
363 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
366 * Only use 28 bits, since we need 4 bits in the cookie for the
367 * collision differentiator. We MUST use the high bits, since
368 * those are the onces that we first pay attention to when
369 * chosing the bucket.
371 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
/*
 * Per-column state of a RAID-Z I/O.  The "} raidz_col_t;" closer,
 * truncated in this copy, is restored (the name is required by
 * raidz_map's rm_col[] member).
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
386 typedef struct raidz_map {
387 uint64_t rm_cols; /* Regular column count */
388 uint64_t rm_scols; /* Count including skipped columns */
389 uint64_t rm_bigcols; /* Number of oversized columns */
390 uint64_t rm_asize; /* Actual total I/O size */
391 uint64_t rm_missingdata; /* Count of missing data devices */
392 uint64_t rm_missingparity; /* Count of missing parity devices */
393 uint64_t rm_firstdatacol; /* First data column/parity count */
394 uint64_t rm_nskip; /* Skipped sectors for padding */
395 uint64_t rm_skipstart; /* Column index of padding start */
396 uintptr_t rm_reports; /* # of referencing checksum reports */
397 uint8_t rm_freed; /* map no longer has referencing ZIO */
398 uint8_t rm_ecksuminjected; /* checksum error was injected */
399 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
/* Indices of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a byte by 2 (and by 4) in GF(2^8) with the 0x11d
 * polynomial: shift left, then fold the carried-out top bit back in
 * as XOR 0x1d.  Note: x is evaluated more than once.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * The statement wrappers were truncated in this copy; they are restored
 * as do/while (0) so the macros are safe as single statements in
 * unbraced if/else bodies.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask)					\
do {									\
	(mask) = (x) & 0x8080808080808080ULL;				\
	(mask) = ((mask) << 1) - ((mask) >> 7);				\
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^			\
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL);				\
} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask)					\
do {									\
	VDEV_RAIDZ_64MUL_2((x), mask);					\
	VDEV_RAIDZ_64MUL_2((x), mask);					\
} while (0)
/*
 * Powers of 2 in the Galois field defined above (polynomial 0x11d).
 * vdev_raidz_pow2[i] == 2^i; the sequence cycles with period 255, so
 * entry 255 wraps back to 0x01.  The closing brace, truncated in this
 * copy, is restored; all 256 entries are as found.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Base-2 logarithms in the same Galois field: the inverse of
 * vdev_raidz_pow2 (log of 0 is undefined; entry 0 is a placeholder).
 * The closing brace, truncated in this copy, is restored; all 256
 * entries are as found.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf
};
503 * Multiply a given number by 2 raised to the given power.
506 vdev_raidz_exp2(uint8_t a, int exp)
512 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
514 exp += vdev_raidz_log2[a];
518 return (vdev_raidz_pow2[exp]);
522 vdev_raidz_generate_parity_p(raidz_map_t *rm)
524 uint64_t *p, *src, pcount, ccount, i;
527 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
529 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
530 src = rm->rm_col[c].rc_data;
531 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
532 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
534 if (c == rm->rm_firstdatacol) {
535 ASSERT(ccount == pcount);
536 for (i = 0; i < ccount; i++, src++, p++) {
540 ASSERT(ccount <= pcount);
541 for (i = 0; i < ccount; i++, src++, p++) {
549 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
551 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
554 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
555 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
556 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
558 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
559 src = rm->rm_col[c].rc_data;
560 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
561 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
563 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
565 if (c == rm->rm_firstdatacol) {
566 ASSERT(ccnt == pcnt || ccnt == 0);
567 for (i = 0; i < ccnt; i++, src++, p++, q++) {
571 for (; i < pcnt; i++, src++, p++, q++) {
576 ASSERT(ccnt <= pcnt);
579 * Apply the algorithm described above by multiplying
580 * the previous result and adding in the new value.
582 for (i = 0; i < ccnt; i++, src++, p++, q++) {
585 VDEV_RAIDZ_64MUL_2(*q, mask);
590 * Treat short columns as though they are full of 0s.
591 * Note that there's therefore nothing needed for P.
593 for (; i < pcnt; i++, q++) {
594 VDEV_RAIDZ_64MUL_2(*q, mask);
601 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
603 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
606 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
607 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
608 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
609 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
610 rm->rm_col[VDEV_RAIDZ_R].rc_size);
612 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
613 src = rm->rm_col[c].rc_data;
614 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
615 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
616 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
618 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
620 if (c == rm->rm_firstdatacol) {
621 ASSERT(ccnt == pcnt || ccnt == 0);
622 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
627 for (; i < pcnt; i++, src++, p++, q++, r++) {
633 ASSERT(ccnt <= pcnt);
636 * Apply the algorithm described above by multiplying
637 * the previous result and adding in the new value.
639 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
642 VDEV_RAIDZ_64MUL_2(*q, mask);
645 VDEV_RAIDZ_64MUL_4(*r, mask);
650 * Treat short columns as though they are full of 0s.
651 * Note that there's therefore nothing needed for P.
653 for (; i < pcnt; i++, q++, r++) {
654 VDEV_RAIDZ_64MUL_2(*q, mask);
655 VDEV_RAIDZ_64MUL_4(*r, mask);
662 * Generate RAID parity in the first virtual columns according to the number of
663 * parity columns available.
666 vdev_raidz_generate_parity(raidz_map_t *rm)
668 switch (rm->rm_firstdatacol) {
670 vdev_raidz_generate_parity_p(rm);
673 vdev_raidz_generate_parity_pq(rm);
676 vdev_raidz_generate_parity_pqr(rm);
679 panic("invalid RAID-Z configuration");
685 * In the general case of reconstruction, we must solve the system of linear
686 * equations defined by the coeffecients used to generate parity as well as
687 * the contents of the data and parity disks. This can be expressed with
688 * vectors for the original data (D) and the actual data (d) and parity (p)
689 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
693 * | V | | D_0 | | p_m-1 |
694 * | | x | : | = | d_0 |
695 * | I | | D_n-1 | | : |
696 * | | ~~ ~~ | d_n-1 |
699 * I is simply a square identity matrix of size n, and V is a vandermonde
700 * matrix defined by the coeffecients we chose for the various parity columns
701 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
702 * computation as well as linear separability.
705 * | 1 .. 1 1 1 | | p_0 |
706 * | 2^n-1 .. 4 2 1 | __ __ | : |
707 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
708 * | 1 .. 0 0 0 | | D_1 | | d_0 |
709 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
710 * | : : : : | | : | | d_2 |
711 * | 0 .. 1 0 0 | | D_n-1 | | : |
712 * | 0 .. 0 1 0 | ~~ ~~ | : |
713 * | 0 .. 0 0 1 | | d_n-1 |
716 * Note that I, V, d, and p are known. To compute D, we must invert the
717 * matrix and use the known data and parity values to reconstruct the unknown
718 * data values. We begin by removing the rows in V|I and d|p that correspond
719 * to failed or missing columns; we then make V|I square (n x n) and d|p
720 * sized n by removing rows corresponding to unused parity from the bottom up
721 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
722 * using Gauss-Jordan elimination. In the example below we use m=3 parity
723 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
725 * | 1 1 1 1 1 1 1 1 |
726 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
727 * | 19 205 116 29 64 16 4 1 | / /
728 * | 1 0 0 0 0 0 0 0 | / /
729 * | 0 1 0 0 0 0 0 0 | <--' /
730 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
731 * | 0 0 0 1 0 0 0 0 |
732 * | 0 0 0 0 1 0 0 0 |
733 * | 0 0 0 0 0 1 0 0 |
734 * | 0 0 0 0 0 0 1 0 |
735 * | 0 0 0 0 0 0 0 1 |
738 * | 1 1 1 1 1 1 1 1 |
739 * | 128 64 32 16 8 4 2 1 |
740 * | 19 205 116 29 64 16 4 1 |
741 * | 1 0 0 0 0 0 0 0 |
742 * | 0 1 0 0 0 0 0 0 |
743 * (V|I)' = | 0 0 1 0 0 0 0 0 |
744 * | 0 0 0 1 0 0 0 0 |
745 * | 0 0 0 0 1 0 0 0 |
746 * | 0 0 0 0 0 1 0 0 |
747 * | 0 0 0 0 0 0 1 0 |
748 * | 0 0 0 0 0 0 0 1 |
751 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
752 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
753 * matrix is not singular.
755 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
756 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
757 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
758 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
759 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
760 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
761 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
762 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
765 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
766 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
767 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
768 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
769 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
770 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
771 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
772 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
775 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
776 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
777 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
778 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
779 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
780 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
781 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
782 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
785 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
786 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
787 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
788 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
789 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
790 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
791 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
792 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
795 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
796 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
797 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
798 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
799 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
800 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
801 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
802 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
805 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
806 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
807 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
808 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
809 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
810 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
811 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
812 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
815 * | 0 0 1 0 0 0 0 0 |
816 * | 167 100 5 41 159 169 217 208 |
817 * | 166 100 4 40 158 168 216 209 |
818 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
819 * | 0 0 0 0 1 0 0 0 |
820 * | 0 0 0 0 0 1 0 0 |
821 * | 0 0 0 0 0 0 1 0 |
822 * | 0 0 0 0 0 0 0 1 |
825 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
826 * of the missing data.
828 * As is apparent from the example above, the only non-trivial rows in the
829 * inverse matrix correspond to the data disks that we're trying to
830 * reconstruct. Indeed, those are the only rows we need as the others would
831 * only be useful for reconstructing data known or assumed to be valid. For
832 * that reason, we only build the coefficients in the rows that correspond to
838 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
844 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
847 * Fill in the missing rows of interest.
849 for (i = 0; i < nmap; i++) {
850 ASSERT3S(0, <=, map[i]);
851 ASSERT3S(map[i], <=, 2);
858 for (j = 0; j < n; j++) {
862 rows[i][j] = vdev_raidz_pow2[pow];
868 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
869 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
875 * Assert that the first nmissing entries from the array of used
876 * columns correspond to parity columns and that subsequent entries
877 * correspond to data columns.
879 for (i = 0; i < nmissing; i++) {
880 ASSERT3S(used[i], <, rm->rm_firstdatacol);
883 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
887 * First initialize the storage where we'll compute the inverse rows.
889 for (i = 0; i < nmissing; i++) {
890 for (j = 0; j < n; j++) {
891 invrows[i][j] = (i == j) ? 1 : 0;
896 * Subtract all trivial rows from the rows of consequence.
898 for (i = 0; i < nmissing; i++) {
899 for (j = nmissing; j < n; j++) {
900 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
901 jj = used[j] - rm->rm_firstdatacol;
903 invrows[i][j] = rows[i][jj];
909 * For each of the rows of interest, we must normalize it and subtract
910 * a multiple of it from the other rows.
912 for (i = 0; i < nmissing; i++) {
913 for (j = 0; j < missing[i]; j++) {
914 ASSERT3U(rows[i][j], ==, 0);
916 ASSERT3U(rows[i][missing[i]], !=, 0);
919 * Compute the inverse of the first element and multiply each
920 * element in the row by that value.
922 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
924 for (j = 0; j < n; j++) {
925 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
926 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
929 for (ii = 0; ii < nmissing; ii++) {
933 ASSERT3U(rows[ii][missing[i]], !=, 0);
935 log = vdev_raidz_log2[rows[ii][missing[i]]];
937 for (j = 0; j < n; j++) {
939 vdev_raidz_exp2(rows[i][j], log);
941 vdev_raidz_exp2(invrows[i][j], log);
947 * Verify that the data that is left in the rows are properly part of
948 * an identity matrix.
950 for (i = 0; i < nmissing; i++) {
951 for (j = 0; j < n; j++) {
952 if (j == missing[i]) {
953 ASSERT3U(rows[i][j], ==, 1);
955 ASSERT3U(rows[i][j], ==, 0);
962 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
963 int *missing, uint8_t **invrows, const uint8_t *used)
968 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
969 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
972 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
977 psize = sizeof (invlog[0][0]) * n * nmissing;
980 printf("Out of memory\n");
984 for (pp = p, i = 0; i < nmissing; i++) {
989 for (i = 0; i < nmissing; i++) {
990 for (j = 0; j < n; j++) {
991 ASSERT3U(invrows[i][j], !=, 0);
992 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
996 for (i = 0; i < n; i++) {
998 ASSERT3U(c, <, rm->rm_cols);
1000 src = rm->rm_col[c].rc_data;
1001 ccount = rm->rm_col[c].rc_size;
1002 for (j = 0; j < nmissing; j++) {
1003 cc = missing[j] + rm->rm_firstdatacol;
1004 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1005 ASSERT3U(cc, <, rm->rm_cols);
1006 ASSERT3U(cc, !=, c);
1008 dst[j] = rm->rm_col[cc].rc_data;
1009 dcount[j] = rm->rm_col[cc].rc_size;
1012 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1014 for (x = 0; x < ccount; x++, src++) {
1016 log = vdev_raidz_log2[*src];
1018 for (cc = 0; cc < nmissing; cc++) {
1019 if (x >= dcount[cc])
1025 if ((ll = log + invlog[cc][i]) >= 255)
1027 val = vdev_raidz_pow2[ll];
1042 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1046 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1047 int parity_map[VDEV_RAIDZ_MAXPARITY];
1052 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1053 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1059 n = rm->rm_cols - rm->rm_firstdatacol;
1062 * Figure out which data columns are missing.
1065 for (t = 0; t < ntgts; t++) {
1066 if (tgts[t] >= rm->rm_firstdatacol) {
1067 missing_rows[nmissing_rows++] =
1068 tgts[t] - rm->rm_firstdatacol;
1073 * Figure out which parity columns to use to help generate the missing
1076 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1078 ASSERT(c < rm->rm_firstdatacol);
1081 * Skip any targeted parity columns.
1083 if (c == tgts[tt]) {
1095 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1097 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1098 nmissing_rows * n + sizeof (used[0]) * n;
1101 printf("Out of memory\n");
1105 for (pp = p, i = 0; i < nmissing_rows; i++) {
1113 for (i = 0; i < nmissing_rows; i++) {
1114 used[i] = parity_map[i];
1117 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1118 if (tt < nmissing_rows &&
1119 c == missing_rows[tt] + rm->rm_firstdatacol) {
1130 * Initialize the interesting rows of the matrix.
1132 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1135 * Invert the matrix.
1137 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1141 * Reconstruct the missing data using the generated matrix.
1143 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1152 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1154 int tgts[VDEV_RAIDZ_MAXPARITY];
1158 int nbadparity, nbaddata;
1161 * The tgts list must already be sorted.
1163 for (i = 1; i < nt; i++) {
1164 ASSERT(t[i] > t[i - 1]);
1167 nbadparity = rm->rm_firstdatacol;
1168 nbaddata = rm->rm_cols - nbadparity;
1170 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1171 if (i < nt && c == t[i]) {
1174 } else if (rm->rm_col[c].rc_error != 0) {
1176 } else if (c >= rm->rm_firstdatacol) {
1183 ASSERT(ntgts >= nt);
1184 ASSERT(nbaddata >= 0);
1185 ASSERT(nbaddata + nbadparity == ntgts);
1187 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1188 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
/*
 * Map a logical I/O (byte offset/size) onto the columns of a RAID-Z group:
 * compute how many sectors land on each child device and at what device
 * offset, allocate a raidz_map_t describing the layout, give the parity
 * columns private scratch buffers, and point the data columns at
 * consecutive slices of the caller's buffer.
 *
 * NOTE(review): several original source lines are missing from this view
 * (extraction gaps); comments on values computed on missing lines are
 * hedged accordingly.
 */
1193 static raidz_map_t *
1194 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1195 uint64_t dcols, uint64_t nparity)
/* b: first logical sector of the I/O (one sector == 1 << unit_shift bytes). */
1198 uint64_t b = offset >> unit_shift;
/* s: I/O size in sectors. */
1199 uint64_t s = size >> unit_shift;
/* f: column on which the first sector lands. */
1200 uint64_t f = b % dcols;
/* o: byte offset on each child device of the row containing sector b. */
1201 uint64_t o = (b / dcols) << unit_shift;
1202 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
/* q: data sectors per column in the full rows; r: leftover data sectors. */
1204 q = s / (dcols - nparity);
1205 r = s - q * (dcols - nparity);
/* bc: columns participating in the final, partial ("big") row, incl. parity. */
1206 bc = (r == 0 ? 0 : r + nparity);
/* tot: total sectors consumed — data plus parity for every row written. */
1207 tot = s + nparity * (q + (r == 0 ? 0 : 1));
/*
 * scols: number of column slots to allocate.  acols is computed on a line
 * missing from this view — presumably the number of columns actually used;
 * TODO(review): confirm against the full source.
 */
1211 scols = MIN(dcols, roundup(bc, nparity + 1));
1217 ASSERT3U(acols, <=, scols);
/* Flexible-array-style allocation: map header plus scols column descriptors. */
1219 rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
1223 rm->rm_cols = acols;
1224 rm->rm_scols = scols;
1225 rm->rm_bigcols = bc;
1226 rm->rm_skipstart = bc;
1227 rm->rm_missingdata = 0;
1228 rm->rm_missingparity = 0;
/* The first nparity columns hold parity; data columns start right after. */
1229 rm->rm_firstdatacol = nparity;
1232 rm->rm_ecksuminjected = 0;
/* Fill in per-column device index, device offset, and size. */
1236 for (c = 0; c < scols; c++) {
/* Columns that wrapped past the last device sit one sector further in. */
1241 coff += 1ULL << unit_shift;
1243 rm->rm_col[c].rc_devidx = col;
1244 rm->rm_col[c].rc_offset = coff;
1245 rm->rm_col[c].rc_data = NULL;
1246 rm->rm_col[c].rc_error = 0;
1247 rm->rm_col[c].rc_tried = 0;
1248 rm->rm_col[c].rc_skipped = 0;
/* Unused slots get size 0; big-row columns get one extra sector (q + 1). */
1251 rm->rm_col[c].rc_size = 0;
1253 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1255 rm->rm_col[c].rc_size = q << unit_shift;
1257 asize += rm->rm_col[c].rc_size;
1260 ASSERT3U(asize, ==, tot << unit_shift);
/* Round the on-disk footprint up to a whole (nparity + 1)-sector multiple;
 * rm_nskip is how many pad ("skip") sectors that rounding implies. */
1261 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1262 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1263 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1264 ASSERT3U(rm->rm_nskip, <=, nparity);
/*
 * Parity columns get private scratch buffers; on allocation failure,
 * unwind the buffers allocated so far (error-return path is on lines
 * missing from this view).
 */
1266 for (c = 0; c < rm->rm_firstdatacol; c++) {
1267 rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
1268 if (rm->rm_col[c].rc_data == NULL) {
1271 free(rm->rm_col[--c].rc_data);
/* Data columns alias consecutive slices of the caller's buffer — no copy. */
1277 rm->rm_col[c].rc_data = data;
1279 for (c = c + 1; c < acols; c++)
1280 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1281 rm->rm_col[c - 1].rc_size;
1284 * If all data stored spans all columns, there's a danger that parity
1285 * will always be on the same device and, since parity isn't read
1286 * during normal operation, that that device's I/O bandwidth won't be
1287 * used effectively. We therefore switch the parity every 1MB.
1289 * ... at least that was, ostensibly, the theory. As a practical
1290 * matter unless we juggle the parity between all devices evenly, we
1291 * won't see any benefit. Further, occasional writes that aren't a
1292 * multiple of the LCM of the number of children and the minimum
1293 * stripe width are sufficient to avoid pessimal behavior.
1294 * Unfortunately, this decision created an implicit on-disk format
1295 * requirement that we need to support for all eternity, but only
1296 * for single-parity RAID-Z.
1298 * If we intend to skip a sector in the zeroth column for padding
1299 * we must make sure to note this swap. We will never intend to
1300 * skip the first column since at least one data and one parity
1301 * column must appear in each row.
1303 ASSERT(rm->rm_cols >= 2);
1304 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
/* RAID-Z1 only: swap parity (column 0) with the first data column on
 * alternating 1MB boundaries of the logical offset (bit 20 of offset). */
1306 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1307 devidx = rm->rm_col[0].rc_devidx;
1308 o = rm->rm_col[0].rc_offset;
1309 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1310 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1311 rm->rm_col[1].rc_devidx = devidx;
1312 rm->rm_col[1].rc_offset = o;
/* A skip sector that would have landed in column 0 moved with the swap. */
1314 if (rm->rm_skipstart == 0)
1315 rm->rm_skipstart = 1;
/*
 * Release a raidz_map_t.  Only the parity columns own their buffers
 * (allocated in vdev_raidz_map_alloc); data columns alias the caller's
 * buffer and are not freed here.  The free of rm itself is on lines
 * missing from this view.
 */
1322 vdev_raidz_map_free(raidz_map_t *rm)
1326 for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1327 free(rm->rm_col[c].rc_data);
/*
 * Look up the child vdev of pvd whose v_id equals devidx by walking the
 * v_children list.  The found/not-found return paths are on lines missing
 * from this view.
 */
1333 vdev_child(vdev_t *pvd, uint64_t devidx)
1337 STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1338 if (cvd->v_id == devidx)
1346 * We keep track of whether or not there were any injected errors, so that
1347 * any ereports we generate can note it.
/*
 * Boot-loader stub: defer straight to the generic block-pointer checksum
 * routine.  (The kernel version also records injected-error state; here
 * there is nothing extra to track.)
 */
1350 raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
1353 return (zio_checksum_verify(spa, bp, data));
1357 * Generate the parity from the data columns. If we tried and were able to
1358 * read the parity without error, verify that the generated parity matches the
1359 * data we read. If it doesn't, we fire off a checksum error. Return the
1360 * number of such failures.
/*
 * Verify parity columns that were read from disk against freshly generated
 * parity; flag any mismatch as ECKSUM on that column.  The return of the
 * failure count is on lines missing from this view.
 */
1363 raidz_parity_verify(raidz_map_t *rm)
1365 void *orig[VDEV_RAIDZ_MAXPARITY];
/*
 * Snapshot each parity column that was successfully read so we can
 * compare it after regenerating parity in place below.
 */
1369 for (c = 0; c < rm->rm_firstdatacol; c++) {
1370 rc = &rm->rm_col[c];
1371 if (!rc->rc_tried || rc->rc_error != 0)
1373 orig[c] = malloc(rc->rc_size);
1374 if (orig[c] != NULL) {
1375 bcopy(rc->rc_data, orig[c], rc->rc_size);
/* malloc failure: report it; orig[c] stays NULL and is treated as a
 * mismatch below — TODO(review): confirm against missing lines. */
1377 printf("Out of memory\n");
/* Recompute parity from the data columns, overwriting rc_data in place. */
1381 vdev_raidz_generate_parity(rm);
/* Any difference from the parity we actually read is a checksum error. */
1383 for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1384 rc = &rm->rm_col[c];
1385 if (!rc->rc_tried || rc->rc_error != 0)
1387 if (orig[c] == NULL ||
1388 bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1389 rc->rc_error = ECKSUM;
1399 * Iterate over all combinations of bad data and attempt a reconstruction.
1400 * Note that the algorithm below is non-optimal because it doesn't take into
1401 * account how reconstruction is actually performed. For example, with
1402 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1403 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1404 * cases we'd only use parity information in column 0.
/*
 * Combinatorial reconstruction: for n = 1 .. (available parity), walk every
 * combination of n columns treated as the "bad" set, reconstruct them, and
 * accept the first combination whose result passes the block checksum.
 * On success the failing columns are marked ECKSUM and the reconstruction
 * code is returned (success/failure return paths are partially on lines
 * missing from this view).
 */
1407 vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
1408 void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
1411 void *orig[VDEV_RAIDZ_MAXPARITY];
/* tgts points one past tstore[0] so tgts[-1] is valid scratch storage. */
1412 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1413 int *tgts = &tstore[1];
1414 int current, next, i, c, n;
1417 ASSERT(total_errors < rm->rm_firstdatacol);
1420 * This simplifies one edge condition.
/* Try progressively larger sets of assumed-bad columns. */
1424 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1426 * Initialize the targets array by finding the first n columns
1427 * that contain no error.
1429 * If there were no data errors, we need to ensure that we're
1430 * always explicitly attempting to reconstruct at least one
1431 * data column. To do this, we simply push the highest target
1432 * up into the data columns.
1434 for (c = 0, i = 0; i < n; i++) {
1435 if (i == n - 1 && data_errors == 0 &&
1436 c < rm->rm_firstdatacol) {
1437 c = rm->rm_firstdatacol;
/* Skip columns that already reported an error — they can't be targets. */
1440 while (rm->rm_col[c].rc_error != 0) {
1442 ASSERT3S(c, <, rm->rm_cols);
1449 * Setting tgts[n] simplifies the other edge condition.
1451 tgts[n] = rm->rm_cols;
1454 * These buffers were allocated in previous iterations.
1456 for (i = 0; i < n - 1; i++) {
1457 ASSERT(orig[i] != NULL);
/* Only the n-th save buffer is new this iteration; earlier ones persist. */
1460 orig[n - 1] = malloc(rm->rm_col[0].rc_size);
1461 if (orig[n - 1] == NULL) {
1467 next = tgts[current];
/* Enumerate combinations in lexicographic order until wrapped (current == n). */
1469 while (current != n) {
1470 tgts[current] = next;
1474 * Save off the original data that we're going to
1475 * attempt to reconstruct.
1477 for (i = 0; i < n; i++) {
1478 ASSERT(orig[i] != NULL);
1481 ASSERT3S(c, <, rm->rm_cols);
1482 rc = &rm->rm_col[c];
1483 bcopy(rc->rc_data, orig[i], rc->rc_size);
1487 * Attempt a reconstruction and exit the outer loop on
/* If the reconstructed block checksums, mark the target columns ECKSUM
 * (they held silently bad data) and we are done. */
1490 code = vdev_raidz_reconstruct(rm, tgts, n);
1491 if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
1492 for (i = 0; i < n; i++) {
1494 rc = &rm->rm_col[c];
1495 ASSERT(rc->rc_error == 0);
1496 rc->rc_error = ECKSUM;
1504 * Restore the original data.
1506 for (i = 0; i < n; i++) {
1508 rc = &rm->rm_col[c];
1509 bcopy(orig[i], rc->rc_data, rc->rc_size);
1514 * Find the next valid column after the current
1517 for (next = tgts[current] + 1;
1518 next < rm->rm_cols &&
1519 rm->rm_col[next].rc_error != 0; next++)
1522 ASSERT(next <= tgts[current + 1]);
1525 * If that spot is available, we're done here.
1527 if (next != tgts[current + 1])
1531 * Otherwise, find the next valid column after
1532 * the previous position.
1534 for (c = tgts[current - 1] + 1;
1535 rm->rm_col[c].rc_error != 0; c++)
1541 } while (current != n);
/* Free the per-iteration save buffers before returning. */
1546 for (i = n - 1; i >= 0; i--) {
1554 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1555 off_t offset, size_t bytes)
1557 vdev_t *tvd = vd->v_top;
1562 int unexpected_errors;
1568 int tgts[VDEV_RAIDZ_MAXPARITY];
1571 rc = NULL; /* gcc */
1574 rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1575 vd->v_nchildren, vd->v_nparity);
1580 * Iterate over the columns in reverse order so that we hit the parity
1581 * last -- any errors along the way will force us to read the parity.
1583 for (c = rm->rm_cols - 1; c >= 0; c--) {
1584 rc = &rm->rm_col[c];
1585 cvd = vdev_child(vd, rc->rc_devidx);
1586 if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1587 if (c >= rm->rm_firstdatacol)
1588 rm->rm_missingdata++;
1590 rm->rm_missingparity++;
1591 rc->rc_error = ENXIO;
1592 rc->rc_tried = 1; /* don't even try */
1596 #if 0 /* XXX: Too hard for the boot code. */
1597 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1598 if (c >= rm->rm_firstdatacol)
1599 rm->rm_missingdata++;
1601 rm->rm_missingparity++;
1602 rc->rc_error = ESTALE;
1607 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1608 rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1609 rc->rc_offset, rc->rc_size);
1616 unexpected_errors = 0;
1622 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1623 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1625 for (c = 0; c < rm->rm_cols; c++) {
1626 rc = &rm->rm_col[c];
1629 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1631 if (c < rm->rm_firstdatacol)
1636 if (!rc->rc_skipped)
1637 unexpected_errors++;
1640 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1646 * There are three potential phases for a read:
1647 * 1. produce valid data from the columns read
1648 * 2. read all disks and try again
1649 * 3. perform combinatorial reconstruction
1651 * Each phase is progressively both more expensive and less likely to
1652 * occur. If we encounter more errors than we can repair or all phases
1653 * fail, we have no choice but to return an error.
1657 * If the number of errors we saw was correctable -- less than or equal
1658 * to the number of parity disks read -- attempt to produce data that
1659 * has a valid checksum. Naturally, this case applies in the absence of
1662 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1665 if (data_errors == 0) {
1666 rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
1669 * If we read parity information (unnecessarily
1670 * as it happens since no reconstruction was
1671 * needed) regenerate and verify the parity.
1672 * We also regenerate parity when resilvering
1673 * so we can write it out to the failed device
1676 if (parity_errors + parity_untried <
1677 rm->rm_firstdatacol) {
1678 n = raidz_parity_verify(rm);
1679 unexpected_errors += n;
1680 ASSERT(parity_errors + n <=
1681 rm->rm_firstdatacol);
1687 * We either attempt to read all the parity columns or
1688 * none of them. If we didn't try to read parity, we
1689 * wouldn't be here in the correctable case. There must
1690 * also have been fewer parity errors than parity
1691 * columns or, again, we wouldn't be in this code path.
1693 ASSERT(parity_untried == 0);
1694 ASSERT(parity_errors < rm->rm_firstdatacol);
1697 * Identify the data columns that reported an error.
1700 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1701 rc = &rm->rm_col[c];
1702 if (rc->rc_error != 0) {
1703 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1708 ASSERT(rm->rm_firstdatacol >= n);
1710 code = vdev_raidz_reconstruct(rm, tgts, n);
1712 rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
1715 * If we read more parity disks than were used
1716 * for reconstruction, confirm that the other
1717 * parity disks produced correct data. This
1718 * routine is suboptimal in that it regenerates
1719 * the parity that we already used in addition
1720 * to the parity that we're attempting to
1721 * verify, but this should be a relatively
1722 * uncommon case, and can be optimized if it
1723 * becomes a problem. Note that we regenerate
1724 * parity when resilvering so we can write it
1725 * out to failed devices later.
1727 if (parity_errors < rm->rm_firstdatacol - n) {
1728 n = raidz_parity_verify(rm);
1729 unexpected_errors += n;
1730 ASSERT(parity_errors + n <=
1731 rm->rm_firstdatacol);
1740 * This isn't a typical situation -- either we got a read
1741 * error or a child silently returned bad data. Read every
1742 * block so we can try again with as much data and parity as
1743 * we can track down. If we've already been through once
1744 * before, all children will be marked as tried so we'll
1745 * proceed to combinatorial reconstruction.
1747 unexpected_errors = 1;
1748 rm->rm_missingdata = 0;
1749 rm->rm_missingparity = 0;
1752 for (c = 0; c < rm->rm_cols; c++) {
1753 rc = &rm->rm_col[c];
1758 cvd = vdev_child(vd, rc->rc_devidx);
1759 ASSERT(cvd != NULL);
1760 rc->rc_error = cvd->v_read(cvd, NULL,
1761 rc->rc_data, rc->rc_offset, rc->rc_size);
1762 if (rc->rc_error == 0)
1768 * If we managed to read anything more, retry the
1775 * At this point we've attempted to reconstruct the data given the
1776 * errors we detected, and we've attempted to read all columns. There
1777 * must, therefore, be one or more additional problems -- silent errors
1778 * resulting in invalid data rather than explicit I/O errors resulting
1779 * in absent data. We check if there is enough additional data to
1780 * possibly reconstruct the data and then perform combinatorial
1781 * reconstruction over all possible combinations. If that fails,
1784 if (total_errors > rm->rm_firstdatacol) {
1786 } else if (total_errors < rm->rm_firstdatacol &&
1787 (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
1788 total_errors, data_errors)) != 0) {
1790 * If we didn't use all the available parity for the
1791 * combinatorial reconstruction, verify that the remaining
1792 * parity is correct.
1794 if (code != (1 << rm->rm_firstdatacol) - 1)
1795 (void) raidz_parity_verify(rm);
1798 * We're here because either:
1800 * total_errors == rm_first_datacol, or
1801 * vdev_raidz_combrec() failed
1803 * In either case, there is enough bad data to prevent
1806 * Start checksum ereports for all children which haven't
1807 * failed, and the IO wasn't speculative.
1813 vdev_raidz_map_free(rm);