4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
29 static uint64_t zfs_crc64_table[256];
33 #define ASSERT3S(x, y, z) ((void)0)
34 #define ASSERT3U(x, y, z) ((void)0)
35 #define ASSERT3P(x, y, z) ((void)0)
36 #define ASSERT0(x) ((void)0)
37 #define ASSERT(x) ((void)0)
39 #define panic(...) do { \
40 printf(__VA_ARGS__); \
44 #define kmem_alloc(size, flag) zfs_alloc((size))
45 #define kmem_free(ptr, size) zfs_free((ptr), (size))
54 * Calculate the crc64 table (used for the zap hash function).
57 if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
58 memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
59 for (i = 0; i < 256; i++)
60 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
61 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
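/*
 * Editor's sketch (not part of the original file): once the table is
 * built, a CRC64 over any byte buffer follows the usual table-driven
 * recurrence, exactly as zap_hash() does for strings further below.
 * The buffer name and seed here are illustrative only:
 *
 *	uint64_t crc = -1ULL;
 *	for (i = 0; i < len; i++)
 *		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ buf[i]) & 0xFF];
 */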
66 zio_checksum_off(const void *buf, uint64_t size,
67 const void *ctx_template, zio_cksum_t *zcp)
69 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
73 * Signature for checksum functions.
75 typedef void zio_checksum_t(const void *data, uint64_t size,
76 const void *ctx_template, zio_cksum_t *zcp);
77 typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
78 typedef void zio_checksum_tmpl_free_t(void *ctx_template);
80 typedef enum zio_checksum_flags {
81 /* Strong enough for metadata? */
82 ZCHECKSUM_FLAG_METADATA = (1 << 1),
83 /* ZIO embedded checksum */
84 ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
85 /* Strong enough for dedup (without verification)? */
86 ZCHECKSUM_FLAG_DEDUP = (1 << 3),
88 ZCHECKSUM_FLAG_SALTED = (1 << 4),
89 /* Strong enough for nopwrite? */
90 ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
91 } zio_checksum_flags_t;
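/*
 * Editor's note (illustrative, grounded in the table below): entries in
 * zio_checksum_table OR these flags together; e.g. SHA256 is declared
 * safe for metadata, dedup and nopwrite via
 *
 *	ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 *	    ZCHECKSUM_FLAG_NOPWRITE
 */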
94 * Information about each checksum function.
96 typedef struct zio_checksum_info {
97 /* checksum function for each byteorder */
98 zio_checksum_t *ci_func[2];
99 zio_checksum_tmpl_init_t *ci_tmpl_init;
100 zio_checksum_tmpl_free_t *ci_tmpl_free;
101 zio_checksum_flags_t ci_flags;
102 const char *ci_name; /* descriptive name */
103 } zio_checksum_info_t;
107 #include "fletcher.c"
109 #include "skein_zfs.c"
111 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
112 {{NULL, NULL}, NULL, NULL, 0, "inherit"},
113 {{NULL, NULL}, NULL, NULL, 0, "on"},
114 {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
115 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
116 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
117 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
118 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
119 {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
120 ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
121 {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
123 {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
124 ZCHECKSUM_FLAG_METADATA, "fletcher4"},
125 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
126 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
127 ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
128 {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
129 ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
130 {{zio_checksum_off, zio_checksum_off}, NULL, NULL,
132 {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
133 NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
134 ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
135 {{zio_checksum_skein_native, zio_checksum_skein_byteswap},
136 zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
137 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
138 ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
139 /* no edonr for now */
140 {{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
141 ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
145 * Common signature for all zio compress/decompress functions.
147 typedef size_t zio_compress_func_t(void *src, void *dst,
148 size_t s_len, size_t d_len, int);
149 typedef int zio_decompress_func_t(void *src, void *dst,
150 size_t s_len, size_t d_len, int);
153 * Information about each compression function.
155 typedef struct zio_compress_info {
156 zio_compress_func_t *ci_compress; /* compression function */
157 zio_decompress_func_t *ci_decompress; /* decompression function */
158 int ci_level; /* level parameter */
159 const char *ci_name; /* algorithm name */
160 } zio_compress_info_t;
167 * Compression vectors.
169 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
170 {NULL, NULL, 0, "inherit"},
171 {NULL, NULL, 0, "on"},
172 {NULL, NULL, 0, "uncompressed"},
173 {NULL, lzjb_decompress, 0, "lzjb"},
174 {NULL, NULL, 0, "empty"},
175 {NULL, NULL, 1, "gzip-1"},
176 {NULL, NULL, 2, "gzip-2"},
177 {NULL, NULL, 3, "gzip-3"},
178 {NULL, NULL, 4, "gzip-4"},
179 {NULL, NULL, 5, "gzip-5"},
180 {NULL, NULL, 6, "gzip-6"},
181 {NULL, NULL, 7, "gzip-7"},
182 {NULL, NULL, 8, "gzip-8"},
183 {NULL, NULL, 9, "gzip-9"},
184 {NULL, zle_decompress, 64, "zle"},
185 {NULL, lz4_decompress, 0, "lz4"},
189 byteswap_uint64_array(void *vbuf, size_t size)
191 uint64_t *buf = vbuf;
192 size_t count = size >> 3;
195 ASSERT((size & 7) == 0);
197 for (i = 0; i < count; i++)
198 buf[i] = BSWAP_64(buf[i]);
202 * Set the external verifier for a gang block based on <vdev, offset, txg>,
203 * a tuple which is guaranteed to be unique for the life of the pool.
206 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
208 const dva_t *dva = BP_IDENTITY(bp);
209 uint64_t txg = BP_PHYSICAL_BIRTH(bp);
211 ASSERT(BP_IS_GANG(bp));
213 ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
217 * Set the external verifier for a label block based on its offset.
218 * The vdev is implicit, and the txg is unknowable at pool open time --
219 * hence the logic in vdev_uberblock_load() to find the most recent copy.
222 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
224 ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
228 * Calls the template init function of a checksum which supports context
229 * templates and installs the template into the spa_t.
232 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
234 zio_checksum_info_t *ci = &zio_checksum_table[checksum];
236 if (ci->ci_tmpl_init == NULL)
239 if (spa->spa_cksum_tmpls[checksum] != NULL)
242 if (spa->spa_cksum_tmpls[checksum] == NULL) {
243 spa->spa_cksum_tmpls[checksum] =
244 ci->ci_tmpl_init(&spa->spa_cksum_salt);
249 * Called by a spa_t that's about to be deallocated. This steps through
250 * all of the checksum context templates and deallocates any that were
251 * initialized using the algorithm-specific template init function.
254 zio_checksum_templates_free(spa_t *spa)
256 for (enum zio_checksum checksum = 0;
257 checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
258 if (spa->spa_cksum_tmpls[checksum] != NULL) {
259 zio_checksum_info_t *ci = &zio_checksum_table[checksum];
261 ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
262 spa->spa_cksum_tmpls[checksum] = NULL;
268 zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
271 unsigned int checksum;
272 zio_checksum_info_t *ci;
274 zio_cksum_t actual_cksum, expected_cksum, verifier;
277 checksum = BP_GET_CHECKSUM(bp);
278 size = BP_GET_PSIZE(bp);
280 if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
282 ci = &zio_checksum_table[checksum];
283 if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
287 zio_checksum_template_init(checksum, (spa_t *) spa);
288 ctx = spa->spa_cksum_tmpls[checksum];
291 if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
294 ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
295 checksum == ZIO_CHECKSUM_LABEL);
297 eck = (zio_eck_t *)((char *)data + size) - 1;
299 if (checksum == ZIO_CHECKSUM_GANG_HEADER)
300 zio_checksum_gang_verifier(&verifier, bp);
301 else if (checksum == ZIO_CHECKSUM_LABEL)
302 zio_checksum_label_verifier(&verifier,
303 DVA_GET_OFFSET(BP_IDENTITY(bp)));
305 verifier = bp->blk_cksum;
307 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
310 byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
312 expected_cksum = eck->zec_cksum;
313 eck->zec_cksum = verifier;
314 ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
315 eck->zec_cksum = expected_cksum;
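/*
 * Editor's note: the swap above temporarily writes the verifier over
 * the embedded checksum so the buffer hashes exactly as it did when
 * the checksum was first computed, then the original bytes are
 * restored before the comparison below.
 */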
318 byteswap_uint64_array(&expected_cksum,
319 sizeof (zio_cksum_t));
321 expected_cksum = bp->blk_cksum;
322 ci->ci_func[0](data, size, ctx, &actual_cksum);
325 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
326 /*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
334 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
335 void *dest, uint64_t destsize)
337 zio_compress_info_t *ci;
339 if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
340 printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
344 ci = &zio_compress_table[cpfunc];
345 if (!ci->ci_decompress) {
346 printf("ZFS: unsupported compression algorithm %s\n",
351 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
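/*
 * Editor's usage sketch (hypothetical buffers pbuf/lbuf, not in the
 * original file): a caller that has read a physically compressed block
 * picks the algorithm out of the block pointer and inflates it to its
 * logical size:
 *
 *	if (zio_decompress_data(BP_GET_COMPRESS(bp), pbuf,
 *	    BP_GET_PSIZE(bp), lbuf, BP_GET_LSIZE(bp)) != 0)
 *		return (EIO);
 */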
355 zap_hash(uint64_t salt, const char *name)
362 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
363 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
364 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
367 * Only use 28 bits, since we need 4 bits in the cookie for the
368 * collision differentiator. We MUST use the high bits, since
369 * those are the ones that we first pay attention to when
370 * choosing the bucket.
372 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
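/*
 * Editor's sketch of the resulting split (assuming ZAP_HASHBITS is 28,
 * as in the ZAP code, and a hypothetical 64-bit cookie): the mask above
 * keeps only the high 28 bits for the hash, leaving the low bits free
 * for the collision differentiator:
 *
 *	hash bits:  crc >> (64 - ZAP_HASHBITS)
 *	cd bits:    cookie & ((1ULL << (64 - ZAP_HASHBITS)) - 1)
 */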
377 static void *zfs_alloc(size_t size);
378 static void zfs_free(void *ptr, size_t size);
380 typedef struct raidz_col {
381 uint64_t rc_devidx; /* child device index for I/O */
382 uint64_t rc_offset; /* device offset */
383 uint64_t rc_size; /* I/O size */
384 void *rc_data; /* I/O data */
385 int rc_error; /* I/O error for this device */
386 uint8_t rc_tried; /* Did we attempt this I/O column? */
387 uint8_t rc_skipped; /* Did we skip this I/O column? */
390 typedef struct raidz_map {
391 uint64_t rm_cols; /* Regular column count */
392 uint64_t rm_scols; /* Count including skipped columns */
393 uint64_t rm_bigcols; /* Number of oversized columns */
394 uint64_t rm_asize; /* Actual total I/O size */
395 uint64_t rm_missingdata; /* Count of missing data devices */
396 uint64_t rm_missingparity; /* Count of missing parity devices */
397 uint64_t rm_firstdatacol; /* First data column/parity count */
398 uint64_t rm_nskip; /* Skipped sectors for padding */
399 uint64_t rm_skipstart; /* Column index of padding start */
400 uintptr_t rm_reports; /* # of referencing checksum reports */
401 uint8_t rm_freed; /* map no longer has referencing ZIO */
402 uint8_t rm_ecksuminjected; /* checksum error was injected */
403 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
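/*
 * Editor's note: rm_col is an old-style flexible array, so a map for
 * "scols" columns is sized with the offsetof() idiom that
 * vdev_raidz_map_alloc() and vdev_raidz_map_free() use below:
 *
 *	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));
 *	...
 *	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 */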
406 #define VDEV_RAIDZ_P 0
407 #define VDEV_RAIDZ_Q 1
408 #define VDEV_RAIDZ_R 2
410 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
411 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
414 * We provide a mechanism to perform the field multiplication operation on a
415 * 64-bit value all at once rather than a byte at a time. This works by
416 * creating a mask from the top bit in each byte and using that to
417 * conditionally apply the XOR of 0x1d.
419 #define VDEV_RAIDZ_64MUL_2(x, mask) \
421 (mask) = (x) & 0x8080808080808080ULL; \
422 (mask) = ((mask) << 1) - ((mask) >> 7); \
423 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
424 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
427 #define VDEV_RAIDZ_64MUL_4(x, mask) \
429 VDEV_RAIDZ_64MUL_2((x), mask); \
430 VDEV_RAIDZ_64MUL_2((x), mask); \
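/*
 * Editor's worked example (not in the original file): multiplying the
 * byte 0x80 by 2 in GF(2^8) gives
 *
 *	VDEV_RAIDZ_MUL_2(0x80) = ((0x80 << 1) ^ 0x1d) & 0xff = 0x1d
 *
 * and the 64-bit form reproduces that in every lane: starting from
 * x = 0x8080808080808080ULL, the computed mask is all-ones and x
 * becomes 0x1d1d1d1d1d1d1d1dULL -- eight copies of the same result.
 */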
434 * These two tables represent powers and logs of 2 in the Galois field defined
435 * above. These values were computed by repeatedly multiplying by 2 as above.
437 static const uint8_t vdev_raidz_pow2[256] = {
438 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
439 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
440 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
441 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
442 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
443 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
444 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
445 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
446 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
447 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
448 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
449 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
450 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
451 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
452 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
453 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
454 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
455 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
456 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
457 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
458 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
459 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
460 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
461 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
462 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
463 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
464 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
465 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
466 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
467 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
468 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
469 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
471 static const uint8_t vdev_raidz_log2[256] = {
472 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
473 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
474 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
475 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
476 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
477 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
478 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
479 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
480 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
481 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
482 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
483 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
484 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
485 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
486 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
487 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
488 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
489 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
490 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
491 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
492 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
493 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
494 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
495 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
496 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
497 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
498 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
499 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
500 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
501 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
502 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
503 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
507 * Multiply a given number by 2 raised to the given power.
510 vdev_raidz_exp2(uint8_t a, int exp)
516 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
518 exp += vdev_raidz_log2[a];
522 return (vdev_raidz_pow2[exp]);
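/*
 * Editor's sketch (hypothetical helper, not in the original file):
 * vdev_raidz_exp2() already gives general multiplication when its
 * exponent argument is the log of the other factor, which is exactly
 * how the matrix routines below use it:
 *
 *	static uint8_t
 *	gf_mul(uint8_t a, uint8_t b)
 *	{
 *		if (a == 0 || b == 0)
 *			return (0);
 *		return (vdev_raidz_exp2(a, vdev_raidz_log2[b]));
 *	}
 */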
526 vdev_raidz_generate_parity_p(raidz_map_t *rm)
528 uint64_t *p, *src, pcount, ccount, i;
531 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
533 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
534 src = rm->rm_col[c].rc_data;
535 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
536 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
538 if (c == rm->rm_firstdatacol) {
539 ASSERT(ccount == pcount);
540 for (i = 0; i < ccount; i++, src++, p++) {
544 ASSERT(ccount <= pcount);
545 for (i = 0; i < ccount; i++, src++, p++) {
553 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
555 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
558 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
559 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
560 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
562 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
563 src = rm->rm_col[c].rc_data;
564 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
565 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
567 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
569 if (c == rm->rm_firstdatacol) {
570 ASSERT(ccnt == pcnt || ccnt == 0);
571 for (i = 0; i < ccnt; i++, src++, p++, q++) {
575 for (; i < pcnt; i++, src++, p++, q++) {
580 ASSERT(ccnt <= pcnt);
583 * Apply the algorithm described above by multiplying
584 * the previous result and adding in the new value.
586 for (i = 0; i < ccnt; i++, src++, p++, q++) {
589 VDEV_RAIDZ_64MUL_2(*q, mask);
594 * Treat short columns as though they are full of 0s.
595 * Note that there's therefore nothing needed for P.
597 for (; i < pcnt; i++, q++) {
598 VDEV_RAIDZ_64MUL_2(*q, mask);
605 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
607 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
610 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
611 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
612 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
613 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
614 rm->rm_col[VDEV_RAIDZ_R].rc_size);
616 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
617 src = rm->rm_col[c].rc_data;
618 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
619 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
620 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
622 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
624 if (c == rm->rm_firstdatacol) {
625 ASSERT(ccnt == pcnt || ccnt == 0);
626 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
631 for (; i < pcnt; i++, src++, p++, q++, r++) {
637 ASSERT(ccnt <= pcnt);
640 * Apply the algorithm described above by multiplying
641 * the previous result and adding in the new value.
643 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
646 VDEV_RAIDZ_64MUL_2(*q, mask);
649 VDEV_RAIDZ_64MUL_4(*r, mask);
654 * Treat short columns as though they are full of 0s.
655 * Note that there's therefore nothing needed for P.
657 for (; i < pcnt; i++, q++, r++) {
658 VDEV_RAIDZ_64MUL_2(*q, mask);
659 VDEV_RAIDZ_64MUL_4(*r, mask);
666 * Generate RAID parity in the first virtual columns according to the number of
667 * parity columns available.
670 vdev_raidz_generate_parity(raidz_map_t *rm)
672 switch (rm->rm_firstdatacol) {
674 vdev_raidz_generate_parity_p(rm);
677 vdev_raidz_generate_parity_pq(rm);
680 vdev_raidz_generate_parity_pqr(rm);
683 panic("invalid RAID-Z configuration");
689 * In the general case of reconstruction, we must solve the system of linear
690 * equations defined by the coefficients used to generate parity as well as
691 * the contents of the data and parity disks. This can be expressed with
692 * vectors for the original data (D) and the actual data (d) and parity (p)
693 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
697 *     |  V  |        |  D_0  |   | p_m-1 |
698 *     |     |   x    |   :   | = |  d_0  |
699 *     |  I  |        | D_n-1 |   |   :   |
700 *     |     |        ~~     ~~   | d_n-1 |
703 * I is simply a square identity matrix of size n, and V is a Vandermonde
704 * matrix defined by the coefficients we chose for the various parity columns
705 * (1, 2, 4). Note that these values were chosen for simplicity, for speedy
706 * computation, and for linear separability.
709 *     |   1   ..  1 1 1 |               |  p_0  |
710 *     | 2^n-1 ..  4 2 1 |   __     __   |   :   |
711 *     | 4^n-1 .. 16 4 1 |  |  D_0  |    | p_m-1 |
712 *     |   1   ..  0 0 0 |  |  D_1  |    |  d_0  |
713 *     |   0   ..  0 0 0 | x|  D_2  | =  |  d_1  |
714 *     |   :       : : : |  |   :   |    |  d_2  |
715 *     |   0   ..  1 0 0 |  | D_n-1 |    |   :   |
716 *     |   0   ..  0 1 0 |  ~~     ~~    |   :   |
717 *     |   0   ..  0 0 1 |               | d_n-1 |
720 * Note that I, V, d, and p are known. To compute D, we must invert the
721 * matrix and use the known data and parity values to reconstruct the unknown
722 * data values. We begin by removing the rows in V|I and d|p that correspond
723 * to failed or missing columns; we then make V|I square (n x n) and d|p
724 * sized n by removing rows corresponding to unused parity from the bottom up
725 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
726 * using Gauss-Jordan elimination. In the example below we use m=3 parity
727 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
729 * | 1 1 1 1 1 1 1 1 |
730 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
731 * | 19 205 116 29 64 16 4 1 | / /
732 * | 1 0 0 0 0 0 0 0 | / /
733 * | 0 1 0 0 0 0 0 0 | <--' /
734 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
735 * | 0 0 0 1 0 0 0 0 |
736 * | 0 0 0 0 1 0 0 0 |
737 * | 0 0 0 0 0 1 0 0 |
738 * | 0 0 0 0 0 0 1 0 |
739 * | 0 0 0 0 0 0 0 1 |
742 * | 1 1 1 1 1 1 1 1 |
743 * | 128 64 32 16 8 4 2 1 |
744 * | 19 205 116 29 64 16 4 1 |
745 * | 1 0 0 0 0 0 0 0 |
746 * | 0 1 0 0 0 0 0 0 |
747 * (V|I)' = | 0 0 1 0 0 0 0 0 |
748 * | 0 0 0 1 0 0 0 0 |
749 * | 0 0 0 0 1 0 0 0 |
750 * | 0 0 0 0 0 1 0 0 |
751 * | 0 0 0 0 0 0 1 0 |
752 * | 0 0 0 0 0 0 0 1 |
755 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
756 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
757 * matrix is not singular.
759 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
760 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
761 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
762 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
763 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
764 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
765 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
766 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
769 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
770 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
771 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
772 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
773 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
774 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
775 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
776 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
779 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
780 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
781 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
782 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
783 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
784 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
785 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
786 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
789 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
790 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
791 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
792 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
793 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
794 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
795 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
796 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
799 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
800 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
801 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
802 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
803 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
804 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
805 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
806 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
809 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
810 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
811 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
812 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
813 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
814 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
815 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
816 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
819 * | 0 0 1 0 0 0 0 0 |
820 * | 167 100 5 41 159 169 217 208 |
821 * | 166 100 4 40 158 168 216 209 |
822 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
823 * | 0 0 0 0 1 0 0 0 |
824 * | 0 0 0 0 0 1 0 0 |
825 * | 0 0 0 0 0 0 1 0 |
826 * | 0 0 0 0 0 0 0 1 |
829 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
830 * of the missing data.
832 * As is apparent from the example above, the only non-trivial rows in the
833 * inverse matrix correspond to the data disks that we're trying to
834 * reconstruct. Indeed, those are the only rows we need as the others would
835 * only be useful for reconstructing data known or assumed to be valid. For
836 * that reason, we only build the coefficients in the rows that correspond to targeted columns.
842 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
848 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
851 * Fill in the missing rows of interest.
853 for (i = 0; i < nmap; i++) {
854 ASSERT3S(0, <=, map[i]);
855 ASSERT3S(map[i], <=, 2);
862 for (j = 0; j < n; j++) {
866 rows[i][j] = vdev_raidz_pow2[pow];
872 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
873 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
879 * Assert that the first nmissing entries from the array of used
880 * columns correspond to parity columns and that subsequent entries
881 * correspond to data columns.
883 for (i = 0; i < nmissing; i++) {
884 ASSERT3S(used[i], <, rm->rm_firstdatacol);
887 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
891 * First initialize the storage where we'll compute the inverse rows.
893 for (i = 0; i < nmissing; i++) {
894 for (j = 0; j < n; j++) {
895 invrows[i][j] = (i == j) ? 1 : 0;
900 * Subtract all trivial rows from the rows of consequence.
902 for (i = 0; i < nmissing; i++) {
903 for (j = nmissing; j < n; j++) {
904 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
905 jj = used[j] - rm->rm_firstdatacol;
907 invrows[i][j] = rows[i][jj];
913 * For each row of interest, we must normalize it and subtract
914 * a multiple of it from the other rows.
916 for (i = 0; i < nmissing; i++) {
917 for (j = 0; j < missing[i]; j++) {
918 ASSERT3U(rows[i][j], ==, 0);
920 ASSERT3U(rows[i][missing[i]], !=, 0);
923 * Compute the inverse of the first element and multiply each
924 * element in the row by that value.
926 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
928 for (j = 0; j < n; j++) {
929 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
930 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
933 for (ii = 0; ii < nmissing; ii++) {
937 ASSERT3U(rows[ii][missing[i]], !=, 0);
939 log = vdev_raidz_log2[rows[ii][missing[i]]];
941 for (j = 0; j < n; j++) {
943 vdev_raidz_exp2(rows[i][j], log);
945 vdev_raidz_exp2(invrows[i][j], log);
951 * Verify that the data that is left in the rows is properly part of
952 * an identity matrix.
954 for (i = 0; i < nmissing; i++) {
955 for (j = 0; j < n; j++) {
956 if (j == missing[i]) {
957 ASSERT3U(rows[i][j], ==, 1);
959 ASSERT3U(rows[i][j], ==, 0);
966 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
967 int *missing, uint8_t **invrows, const uint8_t *used)
972 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
973 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
976 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
981 psize = sizeof (invlog[0][0]) * n * nmissing;
982 p = zfs_alloc(psize);
984 for (pp = p, i = 0; i < nmissing; i++) {
989 for (i = 0; i < nmissing; i++) {
990 for (j = 0; j < n; j++) {
991 ASSERT3U(invrows[i][j], !=, 0);
992 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
996 for (i = 0; i < n; i++) {
998 ASSERT3U(c, <, rm->rm_cols);
1000 src = rm->rm_col[c].rc_data;
1001 ccount = rm->rm_col[c].rc_size;
1002 for (j = 0; j < nmissing; j++) {
1003 cc = missing[j] + rm->rm_firstdatacol;
1004 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1005 ASSERT3U(cc, <, rm->rm_cols);
1006 ASSERT3U(cc, !=, c);
1008 dst[j] = rm->rm_col[cc].rc_data;
1009 dcount[j] = rm->rm_col[cc].rc_size;
1012 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1014 for (x = 0; x < ccount; x++, src++) {
1016 log = vdev_raidz_log2[*src];
1018 for (cc = 0; cc < nmissing; cc++) {
1019 if (x >= dcount[cc])
1025 if ((ll = log + invlog[cc][i]) >= 255)
1027 val = vdev_raidz_pow2[ll];
1042 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1046 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1047 int parity_map[VDEV_RAIDZ_MAXPARITY];
1052 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1053 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1059 n = rm->rm_cols - rm->rm_firstdatacol;
1062 * Figure out which data columns are missing.
1065 for (t = 0; t < ntgts; t++) {
1066 if (tgts[t] >= rm->rm_firstdatacol) {
1067 missing_rows[nmissing_rows++] =
1068 tgts[t] - rm->rm_firstdatacol;
1073 * Figure out which parity columns to use to help generate the missing data columns.
1076 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1078 ASSERT(c < rm->rm_firstdatacol);
1081 * Skip any targeted parity columns.
1083 if (c == tgts[tt]) {
1095 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1097 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1098 nmissing_rows * n + sizeof (used[0]) * n;
1099 p = kmem_alloc(psize, KM_SLEEP);
1101 for (pp = p, i = 0; i < nmissing_rows; i++) {
1109 for (i = 0; i < nmissing_rows; i++) {
1110 used[i] = parity_map[i];
1113 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1114 if (tt < nmissing_rows &&
1115 c == missing_rows[tt] + rm->rm_firstdatacol) {
1126 * Initialize the interesting rows of the matrix.
1128 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1131 * Invert the matrix.
1133 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1137 * Reconstruct the missing data using the generated matrix.
1139 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1142 kmem_free(p, psize);
1148 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1150 int tgts[VDEV_RAIDZ_MAXPARITY];
1154 int nbadparity, nbaddata;
1157 * The tgts list must already be sorted.
1159 for (i = 1; i < nt; i++) {
1160 ASSERT(t[i] > t[i - 1]);
1163 nbadparity = rm->rm_firstdatacol;
1164 nbaddata = rm->rm_cols - nbadparity;
1166 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1167 if (i < nt && c == t[i]) {
1170 } else if (rm->rm_col[c].rc_error != 0) {
1172 } else if (c >= rm->rm_firstdatacol) {
1179 ASSERT(ntgts >= nt);
1180 ASSERT(nbaddata >= 0);
1181 ASSERT(nbaddata + nbadparity == ntgts);
1183 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1184 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1189 static raidz_map_t *
1190 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1191 uint64_t dcols, uint64_t nparity)
1194 uint64_t b = offset >> unit_shift;
1195 uint64_t s = size >> unit_shift;
1196 uint64_t f = b % dcols;
1197 uint64_t o = (b / dcols) << unit_shift;
1198 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1200 q = s / (dcols - nparity);
1201 r = s - q * (dcols - nparity);
1202 bc = (r == 0 ? 0 : r + nparity);
1203 tot = s + nparity * (q + (r == 0 ? 0 : 1));
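/*
 * Editor's worked example (values assumed for illustration): a
 * single-parity map over dcols = 5 children for s = 6 sectors gives
 * q = 6 / 4 = 1 full rows, r = 2 leftover sectors, bc = r + nparity
 * = 3 oversized columns, and tot = 6 + 1 * (1 + 1) = 8 sectors of
 * physical I/O in all.
 */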
1207 scols = MIN(dcols, roundup(bc, nparity + 1));
1213 ASSERT3U(acols, <=, scols);
1215 rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));
1217 rm->rm_cols = acols;
1218 rm->rm_scols = scols;
1219 rm->rm_bigcols = bc;
1220 rm->rm_skipstart = bc;
1221 rm->rm_missingdata = 0;
1222 rm->rm_missingparity = 0;
1223 rm->rm_firstdatacol = nparity;
1226 rm->rm_ecksuminjected = 0;
1230 for (c = 0; c < scols; c++) {
1235 coff += 1ULL << unit_shift;
1237 rm->rm_col[c].rc_devidx = col;
1238 rm->rm_col[c].rc_offset = coff;
1239 rm->rm_col[c].rc_data = NULL;
1240 rm->rm_col[c].rc_error = 0;
1241 rm->rm_col[c].rc_tried = 0;
1242 rm->rm_col[c].rc_skipped = 0;
1245 rm->rm_col[c].rc_size = 0;
1247 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1249 rm->rm_col[c].rc_size = q << unit_shift;
1251 asize += rm->rm_col[c].rc_size;
1254 ASSERT3U(asize, ==, tot << unit_shift);
1255 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1256 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1257 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1258 ASSERT3U(rm->rm_nskip, <=, nparity);
1260 for (c = 0; c < rm->rm_firstdatacol; c++)
1261 rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);
1263 rm->rm_col[c].rc_data = data;
1265 for (c = c + 1; c < acols; c++)
1266 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1267 rm->rm_col[c - 1].rc_size;
1270 * If all data stored spans all columns, there's a danger that parity
1271 * will always be on the same device and, since parity isn't read
1272 * during normal operation, that that device's I/O bandwidth won't be
1273 * used effectively. We therefore switch the parity every 1MB.
1275 * ... at least that was, ostensibly, the theory. As a practical
1276 * matter unless we juggle the parity between all devices evenly, we
1277 * won't see any benefit. Further, occasional writes that aren't a
1278 * multiple of the LCM of the number of children and the minimum
1279 * stripe width are sufficient to avoid pessimal behavior.
1280 * Unfortunately, this decision created an implicit on-disk format
1281 * requirement that we need to support for all eternity, but only
1282 * for single-parity RAID-Z.
1284 * If we intend to skip a sector in the zeroth column for padding
1285 * we must make sure to note this swap. We will never intend to
1286 * skip the first column since at least one data and one parity
1287 * column must appear in each row.
1289 ASSERT(rm->rm_cols >= 2);
1290 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1292 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1293 devidx = rm->rm_col[0].rc_devidx;
1294 o = rm->rm_col[0].rc_offset;
1295 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1296 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1297 rm->rm_col[1].rc_devidx = devidx;
1298 rm->rm_col[1].rc_offset = o;
1300 if (rm->rm_skipstart == 0)
1301 rm->rm_skipstart = 1;
1308 vdev_raidz_map_free(raidz_map_t *rm)
1312 for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1313 zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1315 zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
1319 vdev_child(vdev_t *pvd, uint64_t devidx)
1323 STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1324 if (cvd->v_id == devidx)
1332 * We keep track of whether or not there were any injected errors, so that
1333 * any ereports we generate can note it.
1336 raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
1339 return (zio_checksum_verify(spa, bp, data));
1343 * Generate the parity from the data columns. If we tried and were able to
1344 * read the parity without error, verify that the generated parity matches the
1345 * data we read. If it doesn't, we fire off a checksum error. Return the
1346 * number of such failures.
1349 raidz_parity_verify(raidz_map_t *rm)
1351 void *orig[VDEV_RAIDZ_MAXPARITY];
1355 for (c = 0; c < rm->rm_firstdatacol; c++) {
1356 rc = &rm->rm_col[c];
1357 if (!rc->rc_tried || rc->rc_error != 0)
1359 orig[c] = zfs_alloc(rc->rc_size);
1360 bcopy(rc->rc_data, orig[c], rc->rc_size);
1363 vdev_raidz_generate_parity(rm);
1365 for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1366 rc = &rm->rm_col[c];
1367 if (!rc->rc_tried || rc->rc_error != 0)
1369 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1370 rc->rc_error = ECKSUM;
1373 zfs_free(orig[c], rc->rc_size);
1380 * Iterate over all combinations of bad data and attempt a reconstruction.
1381 * Note that the algorithm below is non-optimal because it doesn't take into
1382 * account how reconstruction is actually performed. For example, with
1383 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1384 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1385 * cases we'd only use parity information in column 0.
1388 vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
1389 void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
1392 void *orig[VDEV_RAIDZ_MAXPARITY];
1393 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1394 int *tgts = &tstore[1];
1395 int current, next, i, c, n;
1398 ASSERT(total_errors < rm->rm_firstdatacol);
1401 * This simplifies one edge condition.
1405 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1407 * Initialize the targets array by finding the first n columns
1408 * that contain no error.
1410 * If there were no data errors, we need to ensure that we're
1411 * always explicitly attempting to reconstruct at least one
1412 * data column. To do this, we simply push the highest target
1413 * up into the data columns.
1415 for (c = 0, i = 0; i < n; i++) {
1416 if (i == n - 1 && data_errors == 0 &&
1417 c < rm->rm_firstdatacol) {
1418 c = rm->rm_firstdatacol;
1421 while (rm->rm_col[c].rc_error != 0) {
1423 ASSERT3S(c, <, rm->rm_cols);
1430 * Setting tgts[n] simplifies the other edge condition.
1432 tgts[n] = rm->rm_cols;
1435 * These buffers were allocated in previous iterations.
1437 for (i = 0; i < n - 1; i++) {
1438 ASSERT(orig[i] != NULL);
1441 orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);
1444 next = tgts[current];
1446 while (current != n) {
1447 tgts[current] = next;
1451 * Save off the original data that we're going to
1452 * attempt to reconstruct.
1454 for (i = 0; i < n; i++) {
1455 ASSERT(orig[i] != NULL);
1458 ASSERT3S(c, <, rm->rm_cols);
1459 rc = &rm->rm_col[c];
1460 bcopy(rc->rc_data, orig[i], rc->rc_size);
1464 * Attempt a reconstruction and exit the outer loop on success.
1467 code = vdev_raidz_reconstruct(rm, tgts, n);
1468 if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
1469 for (i = 0; i < n; i++) {
1471 rc = &rm->rm_col[c];
1472 ASSERT(rc->rc_error == 0);
1473 rc->rc_error = ECKSUM;
1481 * Restore the original data.
1483 for (i = 0; i < n; i++) {
1485 rc = &rm->rm_col[c];
1486 bcopy(orig[i], rc->rc_data, rc->rc_size);
1491 * Find the next valid column after the current position.
1494 for (next = tgts[current] + 1;
1495 next < rm->rm_cols &&
1496 rm->rm_col[next].rc_error != 0; next++)
1499 ASSERT(next <= tgts[current + 1]);
1502 * If that spot is available, we're done here.
1504 if (next != tgts[current + 1])
1508 * Otherwise, find the next valid column after
1509 * the previous position.
1511 for (c = tgts[current - 1] + 1;
1512 rm->rm_col[c].rc_error != 0; c++)
1518 } while (current != n);
1523 for (i = n - 1; i >= 0; i--) {
1524 zfs_free(orig[i], rm->rm_col[0].rc_size);
1531 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1532 off_t offset, size_t bytes)
1534 vdev_t *tvd = vd->v_top;
1539 int unexpected_errors;
1545 int tgts[VDEV_RAIDZ_MAXPARITY];
1548 rc = NULL; /* gcc */
1551 rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1552 vd->v_nchildren, vd->v_nparity);
1555 * Iterate over the columns in reverse order so that we hit the parity
1556 * last -- any errors along the way will force us to read the parity.
1558 for (c = rm->rm_cols - 1; c >= 0; c--) {
1559 rc = &rm->rm_col[c];
1560 cvd = vdev_child(vd, rc->rc_devidx);
1561 if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1562 if (c >= rm->rm_firstdatacol)
1563 rm->rm_missingdata++;
1565 rm->rm_missingparity++;
1566 rc->rc_error = ENXIO;
1567 rc->rc_tried = 1; /* don't even try */
1571 #if 0 /* XXX: Too hard for the boot code. */
1572 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1573 if (c >= rm->rm_firstdatacol)
1574 rm->rm_missingdata++;
1576 rm->rm_missingparity++;
1577 rc->rc_error = ESTALE;
1582 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1583 rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1584 rc->rc_offset, rc->rc_size);
1591 unexpected_errors = 0;
1597 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1598 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1600 for (c = 0; c < rm->rm_cols; c++) {
1601 rc = &rm->rm_col[c];
1604 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1606 if (c < rm->rm_firstdatacol)
1611 if (!rc->rc_skipped)
1612 unexpected_errors++;
1615 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1621 * There are three potential phases for a read:
1622 * 1. produce valid data from the columns read
1623 * 2. read all disks and try again
1624 * 3. perform combinatorial reconstruction
1626 * Each phase is progressively both more expensive and less likely to
1627 * occur. If we encounter more errors than we can repair or all phases
1628 * fail, we have no choice but to return an error.
1632 * If the number of errors we saw was correctable -- less than or equal
1633 * to the number of parity disks read -- attempt to produce data that
1634 * has a valid checksum. Naturally, this case applies in the absence of any errors.
1637 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1638 if (data_errors == 0) {
1639 if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
1641 * If we read parity information (unnecessarily
1642 * as it happens since no reconstruction was
1643 * needed) regenerate and verify the parity.
1644 * We also regenerate parity when resilvering
1645 * so we can write it out to the failed device later.
1648 if (parity_errors + parity_untried <
1649 rm->rm_firstdatacol) {
1650 n = raidz_parity_verify(rm);
1651 unexpected_errors += n;
1652 ASSERT(parity_errors + n <=
1653 rm->rm_firstdatacol);
1659 * We either attempt to read all the parity columns or
1660 * none of them. If we didn't try to read parity, we
1661 * wouldn't be here in the correctable case. There must
1662 * also have been fewer parity errors than parity
1663 * columns or, again, we wouldn't be in this code path.
1665 ASSERT(parity_untried == 0);
1666 ASSERT(parity_errors < rm->rm_firstdatacol);
1669 * Identify the data columns that reported an error.
1672 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1673 rc = &rm->rm_col[c];
1674 if (rc->rc_error != 0) {
1675 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1680 ASSERT(rm->rm_firstdatacol >= n);
1682 code = vdev_raidz_reconstruct(rm, tgts, n);
1684 if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
1686 * If we read more parity disks than were used
1687 * for reconstruction, confirm that the other
1688 * parity disks produced correct data. This
1689 * routine is suboptimal in that it regenerates
1690 * the parity that we already used in addition
1691 * to the parity that we're attempting to
1692 * verify, but this should be a relatively
1693 * uncommon case, and can be optimized if it
1694 * becomes a problem. Note that we regenerate
1695 * parity when resilvering so we can write it
1696 * out to failed devices later.
1698 if (parity_errors < rm->rm_firstdatacol - n) {
1699 n = raidz_parity_verify(rm);
1700 unexpected_errors += n;
1701 ASSERT(parity_errors + n <=
1702 rm->rm_firstdatacol);
1711 * This isn't a typical situation -- either we got a read
1712 * error or a child silently returned bad data. Read every
1713 * block so we can try again with as much data and parity as
1714 * we can track down. If we've already been through once
1715 * before, all children will be marked as tried so we'll
1716 * proceed to combinatorial reconstruction.
1718 unexpected_errors = 1;
1719 rm->rm_missingdata = 0;
1720 rm->rm_missingparity = 0;
1723 for (c = 0; c < rm->rm_cols; c++) {
1724 rc = &rm->rm_col[c];
1729 cvd = vdev_child(vd, rc->rc_devidx);
1730 ASSERT(cvd != NULL);
1731 rc->rc_error = cvd->v_read(cvd, NULL,
1732 rc->rc_data, rc->rc_offset, rc->rc_size);
1733 if (rc->rc_error == 0)
1739 * If we managed to read anything more, retry the
1746 * At this point we've attempted to reconstruct the data given the
1747 * errors we detected, and we've attempted to read all columns. There
1748 * must, therefore, be one or more additional problems -- silent errors
1749 * resulting in invalid data rather than explicit I/O errors resulting
1750 * in absent data. We check if there is enough additional data to
1751 * possibly reconstruct the data and then perform combinatorial
1752 * reconstruction over all possible combinations. If that fails, we're cooked.
1755 if (total_errors > rm->rm_firstdatacol) {
1757 } else if (total_errors < rm->rm_firstdatacol &&
1758 (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes,
1759 total_errors, data_errors)) != 0) {
1761 * If we didn't use all the available parity for the
1762 * combinatorial reconstruction, verify that the remaining
1763 * parity is correct.
1765 if (code != (1 << rm->rm_firstdatacol) - 1)
1766 (void) raidz_parity_verify(rm);
1769 * We're here because either:
1771 * total_errors == rm_firstdatacol, or
1772 * vdev_raidz_combrec() failed
1774 * In either case, there is enough bad data to prevent reconstruction.
1777 * Start checksum ereports for all children which haven't
1778 * failed and for which the I/O wasn't speculative.
1784 vdev_raidz_map_free(rm);