/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

static uint64_t zfs_crc64_table[256];

#define	ECKSUM	EBADE

#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))

static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}
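
/*
 * Illustrative sketch (not part of the build): a one-shot CRC-64 over an
 * arbitrary buffer using the table initialized above.  zfs_crc64() is a
 * hypothetical helper -- this file only ever folds the table into a hash
 * in zap_hash() below -- but it shows the right-shifting (reflected)
 * update step the table is built for.
 */
#if 0
static uint64_t
zfs_crc64(const void *buf, size_t len, uint64_t crc)
{
	const uint8_t *cp = buf;

	while (len-- != 0)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ *cp++) & 0xFF];
	return (crc);
}
#endif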

static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder */
	zio_checksum_t *ci_func[2];
	zio_checksum_tmpl_init_t *ci_tmpl_init;
	zio_checksum_tmpl_free_t *ci_tmpl_free;
	zio_checksum_flags_t ci_flags;
	const char *ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "fletcher.c"
#include "sha256.c"
#include "skein_zfs.c"

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	/* no edonr for now */
	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
};

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t *ci_compress;	/* compression function */
	zio_decompress_func_t *ci_decompress;	/* decompression function */
	int ci_level;				/* level parameter */
	const char *ci_name;			/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"
#include "lz4.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL, NULL, 0, "inherit"},
	{NULL, NULL, 0, "on"},
	{NULL, NULL, 0, "uncompressed"},
	{NULL, lzjb_decompress, 0, "lzjb"},
	{NULL, NULL, 0, "empty"},
	{NULL, NULL, 1, "gzip-1"},
	{NULL, NULL, 2, "gzip-2"},
	{NULL, NULL, 3, "gzip-3"},
	{NULL, NULL, 4, "gzip-4"},
	{NULL, NULL, 5, "gzip-5"},
	{NULL, NULL, 6, "gzip-6"},
	{NULL, NULL, 7, "gzip-7"},
	{NULL, NULL, 8, "gzip-8"},
	{NULL, NULL, 9, "gzip-9"},
	{NULL, zle_decompress, 64, "zle"},
	{NULL, lz4_decompress, 0, "lz4"},
};

static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
{
	zio_checksum_info_t *ci = &zio_checksum_table[checksum];

	if (ci->ci_tmpl_init == NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] != NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] == NULL) {
		spa->spa_cksum_tmpls[checksum] =
		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
	}
}

/*
 * Called by a spa_t that's about to be deallocated. This steps through
 * all of the checksum context templates and deallocates any that were
 * initialized using the algorithm-specific template init function.
 */
static void
zio_checksum_templates_free(spa_t *spa)
{
	for (enum zio_checksum checksum = 0;
	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
		if (spa->spa_cksum_tmpls[checksum] != NULL) {
			zio_checksum_info_t *ci = &zio_checksum_table[checksum];

			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
			spa->spa_cksum_tmpls[checksum] = NULL;
		}
	}
}

static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		zio_checksum_template_init(checksum, __DECONST(spa_t *, spa));
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
		return (EIO);
	}

	return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	ci = &zio_compress_table[cpfunc];
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",
		    ci->ci_name);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator. We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}

static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P	0
#define	VDEV_RAIDZ_Q	1
#define	VDEV_RAIDZ_R	2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
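
/*
 * Illustrative sketch (not part of the build): the packed
 * VDEV_RAIDZ_64MUL_2() above computes the same result as applying the
 * byte-at-a-time VDEV_RAIDZ_MUL_2() to each of the eight byte lanes.
 * vdev_raidz_64mul_check() is a hypothetical self-test, not a routine
 * used by this file.
 */
#if 0
static void
vdev_raidz_64mul_check(uint64_t x)
{
	uint64_t packed = x, mask;
	int i;

	VDEV_RAIDZ_64MUL_2(packed, mask);

	for (i = 0; i < 8; i++) {
		uint8_t b = (x >> (8 * i)) & 0xff;

		ASSERT((uint8_t)(packed >> (8 * i)) ==
		    (uint8_t)VDEV_RAIDZ_MUL_2(b));
	}
}
#endif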

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};

static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
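
/*
 * Illustrative sketch (not part of the build): general multiplication in
 * this field via the log/pow tables, the same identity vdev_raidz_exp2()
 * exploits (log2[a*b] = log2[a] + log2[b], modulo 255).  gf_mul() is a
 * hypothetical helper, not a routine used by this file.
 */
#if 0
static uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	int log;

	if (a == 0 || b == 0)
		return (0);

	log = vdev_raidz_log2[a] + vdev_raidz_log2[b];
	if (log > 255)
		log -= 255;

	return (vdev_raidz_pow2[log]);
}
#endif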

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}
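
/*
 * Illustrative sketch (not part of the build): the byte-level recurrences
 * that the packed 64-bit loops above implement.  P is a plain XOR sum;
 * Q is a Horner-style evaluation with coefficient 2, so the first data
 * column ends up scaled by the highest power of 2.  raidz_pq_bytewise()
 * is hypothetical and operates on one byte per column for clarity.
 */
#if 0
static void
raidz_pq_bytewise(const uint8_t *d, int ncols, uint8_t *pp, uint8_t *qp)
{
	uint8_t p = 0, q = 0;
	int c;

	for (c = 0; c < ncols; c++) {
		p ^= d[c];
		q = (uint8_t)VDEV_RAIDZ_MUL_2(q) ^ d[c];
	}

	*pp = p;
	*qp = q;
}
#endif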

/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, for speedy
 * computation, and for linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16   8   4   2   1 | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16   4   1 |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--'  /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * With the rows for the missing disks (p_1, d_1, and d_2) removed and the
 * unused parity trimmed from the bottom, (V|I)' is square:
 *
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16   4   1 |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * | 19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * | 19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208  |
 * |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100   5  41 159 169 217 208 |
 *                   | 166 100   4  40 158 168 216 209 |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;

			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
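
/*
 * Illustrative note (not part of the build): the rows produced above are
 * Vandermonde-style rows of powers, highest power first, i.e. for parity
 * index m = map[i] (0 for P, 1 for Q, 2 for R):
 *
 *	rows[i][j] == 2^(m * (n - 1 - j))  in GF(2^8)
 *
 * so the P row is all 1s, the Q row is 2^(n-1) .. 4 2 1, and the R row is
 * 4^(n-1) .. 16 4 1, matching the dispersal matrix pictured earlier.
 */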

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data left in the rows forms part of an identity
	 * matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int nbadparity, nbaddata;
	int code;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
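
/*
 * Illustrative worked example (not part of the build), under assumed
 * parameters: a RAIDZ1 vdev with dcols = 4 children, 512-byte sectors
 * (unit_shift = 9), and a 3-sector block (s = 3) at an offset where
 * b % dcols == 0.  Then q = 3 / (4 - 1) = 1 and r = 0, so bc = 0 and
 * tot = 3 + 1 * 1 = 4: one full row of acols = scols = 4 columns.
 * Column 0 holds one sector of parity and columns 1-3 one data sector
 * each, giving asize = 4 sectors with no skip sectors (rm_nskip = 0).
 */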

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size)
{
	return (zio_checksum_verify(spa, bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zfs_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		zfs_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		zfs_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0	/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}