4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
/*
 * CRC-64 table (ECMA-182 polynomial ZFS_CRC64_POLY) used by the ZAP hash.
 * Built lazily; entry 128 doubles as the "already initialized" sentinel.
 */
static uint64_t zfs_crc64_table[256];

/*
 * The boot loader is size-constrained: all assertion macros compile to
 * nothing here and serve only to document invariants.
 */
#define ASSERT3S(x, y, z) ((void)0)
#define ASSERT3U(x, y, z) ((void)0)
#define ASSERT3P(x, y, z) ((void)0)
#define ASSERT0(x) ((void)0)
#define ASSERT(x) ((void)0)
/*
 * Minimal panic() for the standalone environment: report the error and
 * spin forever -- the boot loader has nothing to fall back to.  The
 * do/while (0) wrapper makes the macro safe as a single statement.
 * (The loop and closing of the do/while were missing in this chunk.)
 */
#define panic(...) do { \
	printf(__VA_ARGS__); \
	for (;;) \
		; \
} while (0)
/*
 * Map the kernel kmem interfaces onto the loader's allocator.  The flag
 * argument (e.g. KM_SLEEP) is ignored; zfs_free() needs the original size.
 */
#define kmem_alloc(size, flag) zfs_alloc((size))
#define kmem_free(ptr, size) zfs_free((ptr), (size))
/*
 * Calculate the crc64 table (used for the zap hash) on first use.
 * Entry 128 equals ZFS_CRC64_POLY once the table has been generated,
 * so it doubles as the initialization sentinel.
 * NOTE(review): the enclosing function's header, the declarations of
 * i/j/ct, and the closing braces are not visible in this chunk.
 */
if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
	memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
	for (i = 0; i < 256; i++)
		/* Standard reflected CRC: fold 8 bits per table index. */
		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
66 zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
68 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
/*
 * Signature for checksum functions.  data/size describe the buffer to
 * checksum; the result is stored through zcp.
 */
typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
	int ci_correctable; /* number of correctable bits */
	int ci_eck; /* uses zio embedded checksum? */
	int ci_dedup; /* strong enough for dedup? */
	const char *ci_name; /* descriptive name */
} zio_checksum_info_t;
92 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
93 {{NULL, NULL}, 0, 0, 0, "inherit"},
94 {{NULL, NULL}, 0, 0, 0, "on"},
95 {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"},
96 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"},
97 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"},
98 {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"},
99 {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"},
100 {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
101 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "SHA256"},
102 {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zillog2"},
/*
 * Common signature for all zio compress/decompress functions.
 * The trailing int is the compression level for algorithms that use one.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t *ci_compress; /* compression function */
	zio_decompress_func_t *ci_decompress; /* decompression function */
	int ci_level; /* level parameter */
	const char *ci_name; /* algorithm name */
} zio_compress_info_t;
129 * Compression vectors.
131 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
132 {NULL, NULL, 0, "inherit"},
133 {NULL, NULL, 0, "on"},
134 {NULL, NULL, 0, "uncompressed"},
135 {NULL, lzjb_decompress, 0, "lzjb"},
136 {NULL, NULL, 0, "empty"},
137 {NULL, NULL, 1, "gzip-1"},
138 {NULL, NULL, 2, "gzip-2"},
139 {NULL, NULL, 3, "gzip-3"},
140 {NULL, NULL, 4, "gzip-4"},
141 {NULL, NULL, 5, "gzip-5"},
142 {NULL, NULL, 6, "gzip-6"},
143 {NULL, NULL, 7, "gzip-7"},
144 {NULL, NULL, 8, "gzip-8"},
145 {NULL, NULL, 9, "gzip-9"},
146 {NULL, zle_decompress, 64, "zle"},
147 {NULL, lz4_decompress, 0, "lz4"},
/*
 * Byte-swap each 64-bit word of vbuf in place.  size is in bytes and
 * must be a multiple of 8 (asserted).  (The storage class, braces, and
 * the loop-index declaration were missing in this chunk.)
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
164 * Set the external verifier for a gang block based on <vdev, offset, txg>,
165 * a tuple which is guaranteed to be unique for the life of the pool.
168 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
170 const dva_t *dva = BP_IDENTITY(bp);
171 uint64_t txg = BP_PHYSICAL_BIRTH(bp);
173 ASSERT(BP_IS_GANG(bp));
175 ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
179 * Set the external verifier for a label block based on its offset.
180 * The vdev is implicit, and the txg is unknowable at pool open time --
181 * hence the logic in vdev_uberblock_load() to find the most recent copy.
184 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
186 ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
/*
 * Verify the checksum of a block read from disk.  bp supplies the
 * expected checksum (or, for embedded-checksum blocks, the verifier);
 * data points to the PSIZE bytes that were read.
 * NOTE(review): the error-return statements, the embedded-checksum
 * branch structure, and the closing braces are not visible in this
 * chunk -- read against the full file.
 */
zio_checksum_verify(const blkptr_t *bp, void *data)
	unsigned int checksum;
	zio_checksum_info_t *ci;
	zio_cksum_t actual_cksum, expected_cksum, verifier;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	/* Unknown or table-less algorithms cannot be verified. */
	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)

	/*
	 * Embedded-checksum path: the checksum lives in a zio_eck_t at the
	 * end of the buffer, salted with a per-block verifier.
	 */
	ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
	    checksum == ZIO_CHECKSUM_LABEL);

	eck = (zio_eck_t *)((char *)data + size) - 1;

	if (checksum == ZIO_CHECKSUM_GANG_HEADER)
		zio_checksum_gang_verifier(&verifier, bp);
	else if (checksum == ZIO_CHECKSUM_LABEL)
		zio_checksum_label_verifier(&verifier,
		    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		verifier = bp->blk_cksum; /* presumably the else arm -- confirm */

	/* A byteswapped block is detected via the embedded magic number. */
	byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
		byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

	/*
	 * Temporarily substitute the verifier for the stored checksum so
	 * the checksum function sees the buffer exactly as it was written,
	 * then restore the stored value afterwards.
	 */
	expected_cksum = eck->zec_cksum;
	eck->zec_cksum = verifier;
	ci->ci_func[byteswap](data, size, &actual_cksum);
	eck->zec_cksum = expected_cksum;

		byteswap_uint64_array(&expected_cksum,
		    sizeof (zio_cksum_t));
	/* Non-embedded path: the expected checksum comes from the blkptr. */
		expected_cksum = bp->blk_cksum;
	ci->ci_func[0](data, size, &actual_cksum);

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum failed\n");*/
/*
 * Decompress srcsize bytes at src into dest (destsize bytes) using
 * compression algorithm cpfunc (a ZIO_COMPRESS_* index); returns the
 * decompressor's status.  Rejects unknown or unsupported algorithms.
 * NOTE(review): the error-return statements after each printf() and the
 * braces are not visible in this chunk.
 */
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);

	ci = &zio_compress_table[cpfunc];
	/* The boot code carries decompressors for only some algorithms. */
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
/*
 * CRC-64 hash of a ZAP attribute name, seeded with the per-object salt.
 * NOTE(review): the declarations/initialization of crc, cp and c
 * (crc presumably starts at salt) and the final return are not visible
 * in this chunk.
 */
zap_hash(uint64_t salt, const char *name)
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator. We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
/* Loader-provided allocator; zfs_free() requires the original size. */
static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);
/*
 * Per-column state of a single RAID-Z I/O: which child device the column
 * lives on, where, how big it is, and its error/attempt status.
 * (The closing "} raidz_col_t;" was missing in this chunk; the typedef
 * name is pinned by the rm_col[] member of raidz_map below.)
 */
typedef struct raidz_col {
	uint64_t rc_devidx;	/* child device index for I/O */
	uint64_t rc_offset;	/* device offset */
	uint64_t rc_size;	/* I/O size */
	void *rc_data;		/* I/O data */
	int rc_error;		/* I/O error for this device */
	uint8_t rc_tried;	/* Did we attempt this I/O column? */
	uint8_t rc_skipped;	/* Did we skip this I/O column? */
} raidz_col_t;
306 typedef struct raidz_map {
307 uint64_t rm_cols; /* Regular column count */
308 uint64_t rm_scols; /* Count including skipped columns */
309 uint64_t rm_bigcols; /* Number of oversized columns */
310 uint64_t rm_asize; /* Actual total I/O size */
311 uint64_t rm_missingdata; /* Count of missing data devices */
312 uint64_t rm_missingparity; /* Count of missing parity devices */
313 uint64_t rm_firstdatacol; /* First data column/parity count */
314 uint64_t rm_nskip; /* Skipped sectors for padding */
315 uint64_t rm_skipstart; /* Column index of padding start */
316 uintptr_t rm_reports; /* # of referencing checksum reports */
317 uint8_t rm_freed; /* map no longer has referencing ZIO */
318 uint8_t rm_ecksuminjected; /* checksum error was injected */
319 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
/* Indices of the parity columns within rm_col[]. */
#define VDEV_RAIDZ_P 0
#define VDEV_RAIDZ_Q 1
#define VDEV_RAIDZ_R 2

/* GF(2^8) multiply-by-2 / multiply-by-4 for a single byte (poly 0x1d). */
#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 * NOTE(review): the braces wrapping these multi-statement macro bodies
 * are not visible in this chunk.
 */
#define VDEV_RAIDZ_64MUL_2(x, mask) \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \

#define VDEV_RAIDZ_64MUL_4(x, mask) \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 * All 256 entries are present; only the closing "};" was missing in this
 * chunk.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Discrete logs (base 2) in the same Galois field; the inverse of
 * vdev_raidz_pow2.  All 256 entries are present; only the closing "};"
 * was missing in this chunk.  (Entry 0 is unused: log(0) is undefined.)
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
423 * Multiply a given number by 2 raised to the given power.
426 vdev_raidz_exp2(uint8_t a, int exp)
432 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
434 exp += vdev_raidz_log2[a];
438 return (vdev_raidz_pow2[exp]);
/*
 * Generate single (P) parity: XOR of all data columns, 64 bits at a time.
 * NOTE(review): the bodies of the two inner loops (presumably
 * "*p = *src;" for the first data column and "*p ^= *src;" for the
 * rest), the declaration of c, and the closing braces are not visible
 * in this chunk.
 */
vdev_raidz_generate_parity_p(raidz_map_t *rm)
	uint64_t *p, *src, pcount, ccount, i;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First data column initializes P. */
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
			/* Subsequent (possibly short) columns accumulate. */
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
/*
 * Generate P (XOR) and Q (GF(2^8) multiply-by-2 accumulate) parity.
 * Short trailing columns are treated as zero-filled: Q still needs the
 * field multiply applied for the missing words; P needs nothing.
 * NOTE(review): the inner-loop statements updating *p and *q and the
 * closing braces are not all visible in this chunk.
 */
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
			/* Zero-fill the remainder of a short first column. */
			for (; i < pcnt; i++, src++, p++, q++) {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
/*
 * Generate P, Q and R parity (triple parity): Q accumulates with a
 * GF(2^8) multiply-by-2 per data column, R with multiply-by-4.
 * NOTE(review): the inner-loop statements updating *p/*q/*r and the
 * closing braces are not all visible in this chunk.
 */
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
			/* Zero-fill the remainder of a short first column. */
			for (; i < pcnt; i++, src++, p++, q++, r++) {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 * NOTE(review): the switch's case labels (presumably 1, 2, 3 for
 * rm_firstdatacol), break statements, and braces are not visible in this
 * chunk.
 */
vdev_raidz_generate_parity(raidz_map_t *rm)
	switch (rm->rm_firstdatacol) {
		vdev_raidz_generate_parity_p(rm);
		vdev_raidz_generate_parity_pq(rm);
		vdev_raidz_generate_parity_pqr(rm);
		panic("invalid RAID-Z configuration");
605 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
607 * the contents of the data and parity disks. This can be expressed with
608 * vectors for the original data (D) and the actual data (d) and parity (p)
609 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
613 * | V | | D_0 | | p_m-1 |
614 * | | x | : | = | d_0 |
615 * | I | | D_n-1 | | : |
616 * | | ~~ ~~ | d_n-1 |
619 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
621 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
622 * computation as well as linear separability.
625 * | 1 .. 1 1 1 | | p_0 |
626 * | 2^n-1 .. 4 2 1 | __ __ | : |
627 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
628 * | 1 .. 0 0 0 | | D_1 | | d_0 |
629 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
630 * | : : : : | | : | | d_2 |
631 * | 0 .. 1 0 0 | | D_n-1 | | : |
632 * | 0 .. 0 1 0 | ~~ ~~ | : |
633 * | 0 .. 0 0 1 | | d_n-1 |
636 * Note that I, V, d, and p are known. To compute D, we must invert the
637 * matrix and use the known data and parity values to reconstruct the unknown
638 * data values. We begin by removing the rows in V|I and d|p that correspond
639 * to failed or missing columns; we then make V|I square (n x n) and d|p
640 * sized n by removing rows corresponding to unused parity from the bottom up
641 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
642 * using Gauss-Jordan elimination. In the example below we use m=3 parity
643 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
645 * | 1 1 1 1 1 1 1 1 |
646 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
647 * | 19 205 116 29 64 16 4 1 | / /
648 * | 1 0 0 0 0 0 0 0 | / /
649 * | 0 1 0 0 0 0 0 0 | <--' /
650 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
651 * | 0 0 0 1 0 0 0 0 |
652 * | 0 0 0 0 1 0 0 0 |
653 * | 0 0 0 0 0 1 0 0 |
654 * | 0 0 0 0 0 0 1 0 |
655 * | 0 0 0 0 0 0 0 1 |
658 * | 1 1 1 1 1 1 1 1 |
659 * | 128 64 32 16 8 4 2 1 |
660 * | 19 205 116 29 64 16 4 1 |
661 * | 1 0 0 0 0 0 0 0 |
662 * | 0 1 0 0 0 0 0 0 |
663 * (V|I)' = | 0 0 1 0 0 0 0 0 |
664 * | 0 0 0 1 0 0 0 0 |
665 * | 0 0 0 0 1 0 0 0 |
666 * | 0 0 0 0 0 1 0 0 |
667 * | 0 0 0 0 0 0 1 0 |
668 * | 0 0 0 0 0 0 0 1 |
671 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
672 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
673 * matrix is not singular.
675 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
676 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
677 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
678 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
679 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
680 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
681 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
682 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
685 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
686 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
687 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
688 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
689 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
690 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
691 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
692 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
695 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
696 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
697 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
698 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
699 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
700 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
701 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
702 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
705 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
706 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
707 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
708 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
709 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
710 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
711 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
712 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
715 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
716 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
717 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
718 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
719 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
720 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
721 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
722 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
725 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
726 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
727 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
728 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
729 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
730 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
731 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
732 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
735 * | 0 0 1 0 0 0 0 0 |
736 * | 167 100 5 41 159 169 217 208 |
737 * | 166 100 4 40 158 168 216 209 |
738 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
739 * | 0 0 0 0 1 0 0 0 |
740 * | 0 0 0 0 0 1 0 0 |
741 * | 0 0 0 0 0 0 1 0 |
742 * | 0 0 0 0 0 0 0 1 |
745 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
746 * of the missing data.
748 * As is apparent from the example above, the only non-trivial rows in the
749 * inverse matrix correspond to the data disks that we're trying to
750 * reconstruct. Indeed, those are the only rows we need as the others would
751 * only be useful for reconstructing data known or assumed to be valid. For
752 * that reason, we only build the coefficients in the rows that correspond to
/*
 * Build the rows of the (V|I) dispersal matrix corresponding to the
 * parity columns listed in map[] (0 = P, 1 = Q, 2 = R); see the block
 * comment above for the math.
 * NOTE(review): the trailing parameter (presumably "uint8_t **rows"),
 * local declarations, the computation of "pow", and the closing braces
 * are not visible in this chunk.
 */
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_pow2[pow];
/*
 * Invert, via Gauss-Jordan elimination, the nmissing x nmissing system
 * described by rows[]; the inverse accumulates in invrows[] (see the
 * block comment above for the derivation).
 * NOTE(review): local declarations, several interior statements (e.g.
 * the rows[] initialization and the compound assignments in the
 * elimination loops), and most braces are not visible in this chunk.
 */
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			invrows[i][j] = rows[i][jj];

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);

		/* Eliminate this pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				    vdev_raidz_exp2(rows[i][j], log);
				    vdev_raidz_exp2(invrows[i][j], log);

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
				ASSERT3U(rows[i][j], ==, 0);
/*
 * Reconstruct the missing data: D = (V|I)'^-1 x (d|p)'.  Works in the
 * log domain, precomputing the logs of the inverse-matrix entries and
 * wrapping the log-domain sums at 255.
 * NOTE(review): several declarations, loop braces, and the accumulation
 * statements into dst[] are not visible in this chunk.
 */
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];

	/* One scratch allocation holds the per-row log tables. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {

	/* Precompute logs of the inverse-matrix entries. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];

	/* Walk surviving columns, accumulating into each missing column. */
	for (i = 0; i < n; i++) {
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Wrap the GF(2^8) log-domain sum at 255. */
				if ((ll = log + invlog[cc][i]) >= 255)
				val = vdev_raidz_pow2[ll];
/*
 * General reconstruction entry point: determine which data rows are
 * missing, pick the parity columns to use, build and invert the matrix,
 * reconstruct, and return a code identifying the parity columns used.
 * NOTE(review): local declarations, several loop bodies (e.g. the
 * parity_map construction), and the final return are not visible in
 * this chunk.
 */
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];
	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */

	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/* One allocation holds rows[], invrows[], and the used[] array. */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {

	/* used[] lists parity columns first, then surviving data columns. */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,

	kmem_free(p, psize);
/*
 * Top-level reconstruction: merge the caller's (sorted) target list with
 * any columns that independently reported errors, then hand the combined
 * list to the general reconstruction routine.
 * NOTE(review): local declarations, the tgts[]-filling statements inside
 * the merge loop, and the final return are not visible in this chunk.
 */
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;

	/* Merge explicit targets with columns that reported errors. */
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
		} else if (rm->rm_col[c].rc_error != 0) {
		} else if (c >= rm->rm_firstdatacol) {

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
/*
 * Split a logical I/O of `size` bytes at `offset` into per-device columns
 * for a dcols-wide RAID-Z vdev with `nparity` parity columns and a sector
 * shift of unit_shift.  Parity columns get private buffers; data columns
 * alias the caller's buffer.
 * NOTE(review): several lines (the acols computation, parts of the
 * per-column loop, the final return) are not visible in this chunk.
 */
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;	/* starting sector */
	uint64_t s = size >> unit_shift;	/* sectors in this I/O */
	uint64_t f = b % dcols;			/* first column */
	uint64_t o = (b / dcols) << unit_shift;	/* per-device byte offset */
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);		/* full rows */
	r = s - q * (dcols - nparity);		/* sectors in the last row */
	bc = (r == 0 ? 0 : r + nparity);	/* oversized ("big") columns */
	tot = s + nparity * (q + (r == 0 ? 0 : 1)); /* total incl. parity */

	scols = MIN(dcols, roundup(bc, nparity + 1));

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_ecksuminjected = 0;

	for (c = 0; c < scols; c++) {
		/* Columns past the device wrap start one sector further in. */
		coff += 1ULL << unit_shift;

		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		/* Size: 0 (skipped), q+1 (big column), or q sectors. */
		rm->rm_col[c].rc_size = 0;
		rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns get private buffers... */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	/* ...the first data column points at the caller's buffer... */
	rm->rm_col[c].rc_data = data;

	/* ...and the rest follow contiguously within it. */
	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
/*
 * Free a raidz map: release the parity buffers the map allocated (data
 * columns point into the caller's buffer and are not freed here), then
 * the map itself.  NOTE(review): the declaration of c and the braces
 * are not visible in this chunk.
 */
vdev_raidz_map_free(raidz_map_t *rm)
	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
/*
 * Look up a child vdev of pvd by device index.  Presumably returns the
 * matching child or NULL -- the return statements are not visible in
 * this chunk.
 */
vdev_child(vdev_t *pvd, uint64_t devidx)
	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 * In the boot code this reduces to a plain checksum verification; the
 * size argument is unused here.
 */
raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
	return (zio_checksum_verify(bp, data));
1259 * Generate the parity from the data columns. If we tried and were able to
1260 * read the parity without error, verify that the generated parity matches the
1261 * data we read. If it doesn't, we fire off a checksum error. Return the
1262 * number such failures.
1265 raidz_parity_verify(raidz_map_t *rm)
1267 void *orig[VDEV_RAIDZ_MAXPARITY];
1271 for (c = 0; c < rm->rm_firstdatacol; c++) {
1272 rc = &rm->rm_col[c];
1273 if (!rc->rc_tried || rc->rc_error != 0)
1275 orig[c] = zfs_alloc(rc->rc_size);
1276 bcopy(rc->rc_data, orig[c], rc->rc_size);
1279 vdev_raidz_generate_parity(rm);
1281 for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1282 rc = &rm->rm_col[c];
1283 if (!rc->rc_tried || rc->rc_error != 0)
1285 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1286 rc->rc_error = ECKSUM;
1289 zfs_free(orig[c], rc->rc_size);
1296 * Iterate over all combinations of bad data and attempt a reconstruction.
1297 * Note that the algorithm below is non-optimal because it doesn't take into
1298 * account how reconstruction is actually performed. For example, with
1299 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1300 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1301 * cases we'd only use parity information in column 0.
/*
 * Combinatorial reconstruction (see the block comment above): for each
 * n from 1 up to the number of unused parity columns, enumerate every
 * combination of n candidate target columns, reconstruct them, and
 * accept the first combination whose result passes the block-pointer
 * checksum.  Returns the reconstruction code on success (the failure
 * return is in lines elided from this view).
 */
1304 vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
1305 off_t offset, uint64_t bytes, int total_errors, int data_errors)
1308 void *orig[VDEV_RAIDZ_MAXPARITY];
	/*
	 * tgts points one past tstore[0] so that tgts[-1] and tgts[n] are
	 * both valid sentinel slots for the edge conditions noted below.
	 */
1309 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1310 int *tgts = &tstore[1];
1311 int current, next, i, c, n;
1314 ASSERT(total_errors < rm->rm_firstdatacol);
1317 * This simplifies one edge condition.
	/* Try progressively larger numbers of assumed-bad columns. */
1321 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1323 * Initialize the targets array by finding the first n columns
1324 * that contain no error.
1326 * If there were no data errors, we need to ensure that we're
1327 * always explicitly attempting to reconstruct at least one
1328 * data column. To do this, we simply push the highest target
1329 * up into the data columns.
1331 for (c = 0, i = 0; i < n; i++) {
1332 if (i == n - 1 && data_errors == 0 &&
1333 c < rm->rm_firstdatacol) {
1334 c = rm->rm_firstdatacol;
	/* Skip columns that already reported an error. */
1337 while (rm->rm_col[c].rc_error != 0) {
1339 ASSERT3S(c, <, rm->rm_cols);
1346 * Setting tgts[n] simplifies the other edge condition.
1348 tgts[n] = rm->rm_cols;
1351 * These buffers were allocated in previous iterations.
1353 for (i = 0; i < n - 1; i++) {
1354 ASSERT(orig[i] != NULL);
	/* One new scratch buffer per outer iteration; sized like column 0. */
1357 orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);
1360 next = tgts[current];
	/* Walk every combination of n targets (odometer-style). */
1362 while (current != n) {
1363 tgts[current] = next;
1367 * Save off the original data that we're going to
1368 * attempt to reconstruct.
1370 for (i = 0; i < n; i++) {
1371 ASSERT(orig[i] != NULL);
1374 ASSERT3S(c, <, rm->rm_cols);
1375 rc = &rm->rm_col[c];
1376 bcopy(rc->rc_data, orig[i], rc->rc_size);
1380 * Attempt a reconstruction and exit the outer loop on
	/* success — the checksum is the arbiter of a correct guess. */
1383 code = vdev_raidz_reconstruct(rm, tgts, n);
1384 if (raidz_checksum_verify(bp, data, bytes) == 0) {
	/* Mark the reconstructed columns as the bad ones. */
1385 for (i = 0; i < n; i++) {
1387 rc = &rm->rm_col[c];
1388 ASSERT(rc->rc_error == 0);
1389 rc->rc_error = ECKSUM;
1397 * Restore the original data.
1399 for (i = 0; i < n; i++) {
1401 rc = &rm->rm_col[c];
1402 bcopy(orig[i], rc->rc_data, rc->rc_size);
1407 * Find the next valid column after the current
	/* position (skipping columns that already have errors). */
1410 for (next = tgts[current] + 1;
1411 next < rm->rm_cols &&
1412 rm->rm_col[next].rc_error != 0; next++)
1415 ASSERT(next <= tgts[current + 1]);
1418 * If that spot is available, we're done here.
1420 if (next != tgts[current + 1])
1424 * Otherwise, find the next valid column after
1425 * the previous position.
1427 for (c = tgts[current - 1] + 1;
1428 rm->rm_col[c].rc_error != 0; c++)
1434 } while (current != n);
	/* Free the scratch buffers from all outer iterations. */
1439 for (i = n - 1; i >= 0; i--) {
1440 zfs_free(orig[i], rm->rm_col[0].rc_size);
/*
 * Read and, if necessary, reconstruct 'bytes' bytes at 'offset' from a
 * RAID-Z vdev into 'data'.  The strategy mirrors the three-phase read
 * described in the comment below: use the columns read, re-read
 * everything and retry, and finally fall back to combinatorial
 * reconstruction via vdev_raidz_combrec().
 */
1447 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1448 off_t offset, size_t bytes)
1450 vdev_t *tvd = vd->v_top;
1455 int unexpected_errors;
1461 int tgts[VDEV_RAIDZ_MAXPARITY];
1464 rc = NULL; /* gcc */
	/* Lay out the stripe: which child gets which piece of the I/O. */
1467 rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1468 vd->v_nchildren, vd->v_nparity);
1471 * Iterate over the columns in reverse order so that we hit the parity
1472 * last -- any errors along the way will force us to read the parity.
1474 for (c = rm->rm_cols - 1; c >= 0; c--) {
1475 rc = &rm->rm_col[c];
1476 cvd = vdev_child(vd, rc->rc_devidx);
	/* Missing/unhealthy child: record the error without issuing I/O. */
1477 if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1478 if (c >= rm->rm_firstdatacol)
1479 rm->rm_missingdata++;
1481 rm->rm_missingparity++;
1482 rc->rc_error = ENXIO;
1483 rc->rc_tried = 1; /* don't even try */
1487 #if 0 /* XXX: Too hard for the boot code. */
1488 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1489 if (c >= rm->rm_firstdatacol)
1490 rm->rm_missingdata++;
1492 rm->rm_missingparity++;
1493 rc->rc_error = ESTALE;
	/* Read data columns always; parity only if data is missing. */
1498 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1499 rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1500 rc->rc_offset, rc->rc_size);
1507 unexpected_errors = 0;
1513 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1514 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
	/* Tally parity vs. data errors and untried parity columns. */
1516 for (c = 0; c < rm->rm_cols; c++) {
1517 rc = &rm->rm_col[c];
1520 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1522 if (c < rm->rm_firstdatacol)
1527 if (!rc->rc_skipped)
1528 unexpected_errors++;
1531 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1537 * There are three potential phases for a read:
1538 * 1. produce valid data from the columns read
1539 * 2. read all disks and try again
1540 * 3. perform combinatorial reconstruction
1542 * Each phase is progressively both more expensive and less likely to
1543 * occur. If we encounter more errors than we can repair or all phases
1544 * fail, we have no choice but to return an error.
1548 * If the number of errors we saw was correctable -- less than or equal
1549 * to the number of parity disks read -- attempt to produce data that
1550 * has a valid checksum. Naturally, this case applies in the absence of
1553 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1554 if (data_errors == 0) {
1555 if (raidz_checksum_verify(bp, data, bytes) == 0) {
1557 * If we read parity information (unnecessarily
1558 * as it happens since no reconstruction was
1559 * needed) regenerate and verify the parity.
1560 * We also regenerate parity when resilvering
1561 * so we can write it out to the failed device
1564 if (parity_errors + parity_untried <
1565 rm->rm_firstdatacol) {
1566 n = raidz_parity_verify(rm);
1567 unexpected_errors += n;
1568 ASSERT(parity_errors + n <=
1569 rm->rm_firstdatacol);
1575 * We either attempt to read all the parity columns or
1576 * none of them. If we didn't try to read parity, we
1577 * wouldn't be here in the correctable case. There must
1578 * also have been fewer parity errors than parity
1579 * columns or, again, we wouldn't be in this code path.
1581 ASSERT(parity_untried == 0);
1582 ASSERT(parity_errors < rm->rm_firstdatacol);
1585 * Identify the data columns that reported an error.
1588 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1589 rc = &rm->rm_col[c];
1590 if (rc->rc_error != 0) {
1591 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1596 ASSERT(rm->rm_firstdatacol >= n);
	/* Rebuild the failed data columns from the surviving columns. */
1598 code = vdev_raidz_reconstruct(rm, tgts, n);
1600 if (raidz_checksum_verify(bp, data, bytes) == 0) {
1602 * If we read more parity disks than were used
1603 * for reconstruction, confirm that the other
1604 * parity disks produced correct data. This
1605 * routine is suboptimal in that it regenerates
1606 * the parity that we already used in addition
1607 * to the parity that we're attempting to
1608 * verify, but this should be a relatively
1609 * uncommon case, and can be optimized if it
1610 * becomes a problem. Note that we regenerate
1611 * parity when resilvering so we can write it
1612 * out to failed devices later.
1614 if (parity_errors < rm->rm_firstdatacol - n) {
1615 n = raidz_parity_verify(rm);
1616 unexpected_errors += n;
1617 ASSERT(parity_errors + n <=
1618 rm->rm_firstdatacol);
1627 * This isn't a typical situation -- either we got a read
1628 * error or a child silently returned bad data. Read every
1629 * block so we can try again with as much data and parity as
1630 * we can track down. If we've already been through once
1631 * before, all children will be marked as tried so we'll
1632 * proceed to combinatorial reconstruction.
1634 unexpected_errors = 1;
1635 rm->rm_missingdata = 0;
1636 rm->rm_missingparity = 0;
	/* Phase 2: issue reads for every column not yet tried. */
1639 for (c = 0; c < rm->rm_cols; c++) {
1640 rc = &rm->rm_col[c];
1645 cvd = vdev_child(vd, rc->rc_devidx);
1646 ASSERT(cvd != NULL);
1647 rc->rc_error = cvd->v_read(cvd, NULL,
1648 rc->rc_data, rc->rc_offset, rc->rc_size);
1649 if (rc->rc_error == 0)
1655 * If we managed to read anything more, retry the
1662 * At this point we've attempted to reconstruct the data given the
1663 * errors we detected, and we've attempted to read all columns. There
1664 * must, therefore, be one or more additional problems -- silent errors
1665 * resulting in invalid data rather than explicit I/O errors resulting
1666 * in absent data. We check if there is enough additional data to
1667 * possibly reconstruct the data and then perform combinatorial
1668 * reconstruction over all possible combinations. If that fails,
1671 if (total_errors > rm->rm_firstdatacol) {
1673 } else if (total_errors < rm->rm_firstdatacol &&
1674 (code = vdev_raidz_combrec(rm, bp, data, offset, bytes,
1675 total_errors, data_errors)) != 0) {
1677 * If we didn't use all the available parity for the
1678 * combinatorial reconstruction, verify that the remaining
1679 * parity is correct.
1681 if (code != (1 << rm->rm_firstdatacol) - 1)
1682 (void) raidz_parity_verify(rm);
1685 * We're here because either:
1687 * total_errors == rm_firstdatacol, or
1688 * vdev_raidz_combrec() failed
1690 * In either case, there is enough bad data to prevent
1693 * Start checksum ereports for all children which haven't
1694 * failed, and the IO wasn't speculative.
1700 vdev_raidz_map_free(rm);