4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
29 static uint64_t zfs_crc64_table[256];
/*
 * Lazily build the CRC-64 lookup table used by zap_hash().
 * NOTE(review): the enclosing function header is elided from this listing
 * (gap between original lines 29 and 38) -- confirm against the full file.
 */
38 * Calculate the crc64 table (used for the zap hash
/* table[128] == ZFS_CRC64_POLY exactly when the table has been built. */
41 if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
42 memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
43 for (i = 0; i < 256; i++)
/* Reflected CRC table generation: 8 shift/conditional-XOR steps per entry. */
44 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
45 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
/*
 * Checksum vector for the "off" algorithm: emits an all-zero checksum.
 * buf and size are deliberately unused.
 */
50 zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
52 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
56 * Signature for checksum functions.
58 typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
61 * Information about each checksum function.
63 typedef struct zio_checksum_info {
/* [0] is the native-byteorder vector; [1] the byteswapped one. */
64 zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
65 int ci_correctable; /* number of correctable bits */
/* Nonzero: checksum is stored in a zio_block_tail_t appended to the data. */
66 int ci_zbt; /* uses zio block tail? */
67 const char *ci_name; /* descriptive name */
68 } zio_checksum_info_t;
/*
 * Checksum vectors, indexed by the ZIO_CHECKSUM_* value from the blkptr.
 * Only native-byteorder functions are wired up in this boot-loader copy;
 * every byteswapped slot is NULL.  (Closing brace of the initializer is
 * elided from this listing excerpt.)
 */
73 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
74 {{NULL, NULL}, 0, 0, "inherit"},
75 {{NULL, NULL}, 0, 0, "on"},
76 {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"},
77 {{zio_checksum_SHA256, NULL}, 1, 1, "label"},
78 {{zio_checksum_SHA256, NULL}, 1, 1, "gang_header"},
79 {{fletcher_2_native, NULL}, 0, 1, "zilog"},
80 {{fletcher_2_native, NULL}, 0, 0, "fletcher2"},
81 {{fletcher_4_native, NULL}, 1, 0, "fletcher4"},
82 {{zio_checksum_SHA256, NULL}, 1, 0, "SHA256"},
86 * Common signature for all zio compress/decompress functions.
/* The trailing unnamed int is the per-algorithm level (ci_level below). */
88 typedef size_t zio_compress_func_t(void *src, void *dst,
89 size_t s_len, size_t d_len, int);
90 typedef int zio_decompress_func_t(void *src, void *dst,
91 size_t s_len, size_t d_len, int);
94 * Information about each compression function.
96 typedef struct zio_compress_info {
97 zio_compress_func_t *ci_compress; /* compression function */
98 zio_decompress_func_t *ci_decompress; /* decompression function */
99 int ci_level; /* level parameter */
100 const char *ci_name; /* algorithm name */
101 } zio_compress_info_t;
106 * Compression vectors.
/*
 * Indexed by the ZIO_COMPRESS_* value from the blkptr.  In this boot-time
 * copy only lzjb decompression is wired up; a NULL ci_decompress makes
 * zio_decompress_data() reject the algorithm.  (Closing brace of the
 * initializer is elided from this listing excerpt.)
 */
108 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
109 {NULL, NULL, 0, "inherit"},
110 {NULL, NULL, 0, "on"},
111 {NULL, NULL, 0, "uncompressed"},
112 {NULL, lzjb_decompress, 0, "lzjb"},
113 {NULL, NULL, 0, "empty"},
114 {NULL, NULL, 1, "gzip-1"},
115 {NULL, NULL, 2, "gzip-2"},
116 {NULL, NULL, 3, "gzip-3"},
117 {NULL, NULL, 4, "gzip-4"},
118 {NULL, NULL, 5, "gzip-5"},
119 {NULL, NULL, 6, "gzip-6"},
120 {NULL, NULL, 7, "gzip-7"},
121 {NULL, NULL, 8, "gzip-8"},
122 {NULL, NULL, 9, "gzip-9"},
/*
 * Verify the checksum of a block against the expectation in its blkptr.
 * Returns 0 on match (per the visible callers), nonzero otherwise; the
 * exact return paths are elided from this listing.
 */
126 zio_checksum_error(const blkptr_t *bp, void *data)
128 zio_cksum_t zc = bp->blk_cksum;
129 unsigned int checksum = BP_GET_CHECKSUM(bp);
130 uint64_t size = BP_GET_PSIZE(bp);
/* Embedded block tail lives in the last sizeof(zio_block_tail_t) bytes. */
131 zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
/*
 * NOTE(review): the table element address is formed before the range
 * check below; the || short-circuit prevents an out-of-range dereference,
 * but the indexing itself precedes validation.
 */
132 zio_checksum_info_t *ci = &zio_checksum_table[checksum];
133 zio_cksum_t actual_cksum, expected_cksum;
135 if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
/*
 * ZBT case (elided branch): save the embedded checksum, recompute over
 * the buffer, then restore the tail so the caller's data is unchanged.
 */
139 expected_cksum = zbt->zbt_cksum;
141 ci->ci_func[0](data, size, &actual_cksum);
142 zbt->zbt_cksum = expected_cksum;
/* Non-ZBT case: checksum covers the whole buffer directly. */
145 /* ASSERT(!BP_IS_GANG(bp)); */
146 ci->ci_func[0](data, size, &actual_cksum);
149 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc)) {
150 /*printf("ZFS: read checksum failed\n");*/
/*
 * Decompress srcsize bytes at src into dest (capacity destsize) using
 * algorithm cpfunc (a ZIO_COMPRESS_* index).  Returns the decompressor's
 * status; algorithms without a decompress vector are rejected.
 */
158 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
159 void *dest, uint64_t destsize)
/*
 * NOTE(review): cpfunc is not range-checked at runtime (the ASSERT below
 * is compiled out), so an out-of-range value indexes past the table.
 */
161 zio_compress_info_t *ci = &zio_compress_table[cpfunc];
163 /* ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); */
164 if (!ci->ci_decompress) {
165 printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
169 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
/*
 * Hash a ZAP entry name with CRC-64, seeded by the per-object salt,
 * keeping only the top ZAP_HASHBITS bits of the result.
 */
173 zap_hash(uint64_t salt, const char *name)
179 /*ASSERT(crc != 0);*/
180 /*ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);*/
/* Byte-at-a-time table-driven CRC over the NUL-terminated name. */
181 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
182 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
185 * Only use 28 bits, since we need 4 bits in the cookie for the
186 * collision differentiator. We MUST use the high bits, since
187 * those are the ones that we first pay attention to when
188 * choosing the bucket.
190 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
/* Boot-time scratch allocator; defined later in the file. */
195 static char *zfs_alloc_temp(size_t sz);
/*
 * Per-column state for one RAID-Z I/O.  Columns 0..nparity-1 hold parity
 * (P, then Q), the rest hold data.  (The closing "} raidz_col_t;" line is
 * elided from this listing excerpt.)
 */
197 typedef struct raidz_col {
198 uint64_t rc_devidx; /* child device index for I/O */
199 uint64_t rc_offset; /* device offset */
200 uint64_t rc_size; /* I/O size */
201 void *rc_data; /* I/O data */
202 int rc_error; /* I/O error for this device */
203 uint8_t rc_tried; /* Did we attempt this I/O column? */
204 uint8_t rc_skipped; /* Did we skip this I/O column? */
/* Fixed column indices for the two parity columns. */
207 #define VDEV_RAIDZ_P 0
208 #define VDEV_RAIDZ_Q 1
/*
 * Rebuild missing data column x from the P (XOR) parity: start from P,
 * then XOR in every surviving data column.  Loop bodies are elided in
 * this listing; P parity is pure XOR, so each elided body presumably
 * XORs src into dst -- confirm against the full file.
 */
211 vdev_raidz_reconstruct_p(raidz_col_t *cols, int nparity, int acols, int x)
213 uint64_t *dst, *src, xcount, ccount, count, i;
/* Work in 64-bit words; xcount is the word count of the target column. */
216 xcount = cols[x].rc_size / sizeof (src[0]);
217 //ASSERT(xcount <= cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
218 //ASSERT(xcount > 0);
/* Seed the target column from the P parity column. */
220 src = cols[VDEV_RAIDZ_P].rc_data;
221 dst = cols[x].rc_data;
222 for (i = 0; i < xcount; i++, dst++, src++) {
/* Fold in every other data column (short columns contribute fewer words). */
226 for (c = nparity; c < acols; c++) {
227 src = cols[c].rc_data;
228 dst = cols[x].rc_data;
233 ccount = cols[c].rc_size / sizeof (src[0]);
234 count = MIN(ccount, xcount);
236 for (i = 0; i < count; i++, dst++, src++) {
243 * These two tables represent powers and logs of 2 in the Galois field defined
244 * above. These values were computed by repeatedly multiplying by 2 as above.
/*
 * vdev_raidz_pow2[i] = 2^i in GF(2^8) with the 0x1d feedback byte; the
 * cycle length is 255, so entry 255 wraps back to 0x01.  (Closing brace
 * of the initializer is elided from this listing excerpt.)
 */
246 static const uint8_t vdev_raidz_pow2[256] = {
247 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
248 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
249 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
250 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
251 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
252 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
253 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
254 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
255 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
256 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
257 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
258 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
259 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
260 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
261 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
262 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
263 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
264 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
265 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
266 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
267 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
268 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
269 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
270 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
271 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
272 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
273 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
274 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
275 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
276 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
277 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
278 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
/*
 * vdev_raidz_log2[v] = discrete log base 2 of v in the field above
 * (inverse of vdev_raidz_pow2).  log of 0 is undefined; entry 0 is a
 * placeholder and callers must not use it -- see the ASSERT in
 * vdev_raidz_exp2().  (Closing brace of the initializer is elided from
 * this listing excerpt.)
 */
280 static const uint8_t vdev_raidz_log2[256] = {
281 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
282 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
283 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
284 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
285 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
286 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
287 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
288 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
289 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
290 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
291 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
292 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
293 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
294 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
295 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
296 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
297 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
298 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
299 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
300 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
301 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
302 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
303 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
304 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
305 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
306 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
307 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
308 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
309 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
310 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
311 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
312 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
316 * Multiply a given number by 2 raised to the given power.
/*
 * GF(2^8) multiply via log/antilog tables: a * 2^exp =
 * pow2[(log2[a] + exp) mod 255].  The a == 0 early-out and the mod-255
 * wrap of exp are in elided lines of this listing -- confirm against the
 * full file.
 */
319 vdev_raidz_exp2(uint8_t a, int exp)
325 //ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
327 exp += vdev_raidz_log2[a];
331 return (vdev_raidz_pow2[exp]);
/*
 * Compute both P (XOR) and Q (GF(2^8) weighted sum) parity over the data
 * columns.  P and Q are accumulated word-at-a-time in uint64_t; for Q,
 * each step multiplies the running value by 2 in the field before XORing
 * in the next column.  Several loop bodies are elided in this listing.
 */
335 vdev_raidz_generate_parity_pq(raidz_col_t *cols, int nparity, int acols)
337 uint64_t *q, *p, *src, pcount, ccount, mask, i;
340 pcount = cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
341 //ASSERT(cols[VDEV_RAIDZ_P].rc_size == cols[VDEV_RAIDZ_Q].rc_size);
343 for (c = nparity; c < acols; c++) {
344 src = cols[c].rc_data;
345 p = cols[VDEV_RAIDZ_P].rc_data;
346 q = cols[VDEV_RAIDZ_Q].rc_data;
347 ccount = cols[c].rc_size / sizeof (src[0]);
/* First data column (elided branch): initialize P and Q directly. */
350 //ASSERT(ccount == pcount || ccount == 0);
351 for (i = 0; i < ccount; i++, p++, q++, src++) {
/* Short first column: zero-fill the remainder of P and Q. */
355 for (; i < pcount; i++, p++, q++, src++) {
360 //ASSERT(ccount <= pcount);
363 * Rather than multiplying each byte
364 * individually (as described above), we are
365 * able to handle 8 at once by generating a
366 * mask based on the high bit in each byte and
367 * using that to conditionally XOR in 0x1d.
369 for (i = 0; i < ccount; i++, p++, q++, src++) {
/* 0x80... -> 0xff... per byte: bytes whose top bit was set get 0x1d. */
370 mask = *q & 0x8080808080808080ULL;
371 mask = (mask << 1) - (mask >> 7);
372 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
373 (mask & 0x1d1d1d1d1d1d1d1dULL);
379 * Treat short columns as though they are full of 0s.
/* Still must advance Q's field multiplication for the missing words. */
381 for (; i < pcount; i++, q++) {
382 mask = *q & 0x8080808080808080ULL;
383 mask = (mask << 1) - (mask >> 7);
384 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
385 (mask & 0x1d1d1d1d1d1d1d1dULL);
/*
 * Rebuild missing data column x from the Q parity alone: re-derive Q
 * over the surviving columns into cols[x], XOR with the real Q, then
 * divide out x's coefficient (multiply by 2^(255 - (acols-1-x)))
 * byte-by-byte.  Several loop bodies are elided in this listing.
 */
392 vdev_raidz_reconstruct_q(raidz_col_t *cols, int nparity, int acols, int x)
394 uint64_t *dst, *src, xcount, ccount, count, mask, i;
398 xcount = cols[x].rc_size / sizeof (src[0]);
399 //ASSERT(xcount <= cols[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
401 for (c = nparity; c < acols; c++) {
402 src = cols[c].rc_data;
403 dst = cols[x].rc_data;
/* Skip column x itself (elided); it is the one being reconstructed. */
408 ccount = cols[c].rc_size / sizeof (src[0]);
410 count = MIN(ccount, xcount);
/* First surviving column (elided branch): copy/zero-init dst. */
413 for (i = 0; i < count; i++, dst++, src++) {
416 for (; i < xcount; i++, dst++) {
422 * For an explanation of this, see the comment in
423 * vdev_raidz_generate_parity_pq() above.
/* Multiply the running Q accumulator by 2, 8 bytes per word. */
425 for (i = 0; i < count; i++, dst++, src++) {
426 mask = *dst & 0x8080808080808080ULL;
427 mask = (mask << 1) - (mask >> 7);
428 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
429 (mask & 0x1d1d1d1d1d1d1d1dULL);
433 for (; i < xcount; i++, dst++) {
434 mask = *dst & 0x8080808080808080ULL;
435 mask = (mask << 1) - (mask >> 7);
436 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
437 (mask & 0x1d1d1d1d1d1d1d1dULL);
/* Fold in the stored Q parity and undo x's coefficient per byte. */
442 src = cols[VDEV_RAIDZ_Q].rc_data;
443 dst = cols[x].rc_data;
/* exp = -log2(coefficient of column x) mod 255, i.e. the inverse power. */
444 exp = 255 - (acols - 1 - x);
446 for (i = 0; i < xcount; i++, dst++, src++) {
448 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
449 *b = vdev_raidz_exp2(*b, exp);
/*
 * Rebuild two missing data columns (x and y, with x < y per the elided
 * ASSERTs) from both P and Q parity by solving the 2x2 linear system in
 * GF(2^8) described in the comment below.  The "int x, int y" tail of
 * the parameter list and the scratch-pointer declarations are in elided
 * lines of this listing.
 */
456 vdev_raidz_reconstruct_pq(raidz_col_t *cols, int nparity, int acols,
459 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
461 uint64_t xsize, ysize, i;
464 //ASSERT(x >= nparity);
467 //ASSERT(cols[x].rc_size >= cols[y].rc_size);
470 * Move the parity data aside -- we're going to compute parity as
471 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
472 * reuse the parity generation mechanism without trashing the actual
473 * parity so we make those columns appear to be full of zeros by
474 * setting their lengths to zero.
476 pdata = cols[VDEV_RAIDZ_P].rc_data;
477 qdata = cols[VDEV_RAIDZ_Q].rc_data;
478 xsize = cols[x].rc_size;
479 ysize = cols[y].rc_size;
/* Point P/Q at scratch buffers so the real parity is preserved. */
481 cols[VDEV_RAIDZ_P].rc_data =
482 zfs_alloc_temp(cols[VDEV_RAIDZ_P].rc_size);
483 cols[VDEV_RAIDZ_Q].rc_data =
484 zfs_alloc_temp(cols[VDEV_RAIDZ_Q].rc_size);
/* (Elided: cols[x]/cols[y].rc_size zeroed here.)  Compute Pxy/Qxy. */
488 vdev_raidz_generate_parity_pq(cols, nparity, acols);
490 cols[x].rc_size = xsize;
491 cols[y].rc_size = ysize;
495 pxy = cols[VDEV_RAIDZ_P].rc_data;
496 qxy = cols[VDEV_RAIDZ_Q].rc_data;
497 xd = cols[x].rc_data;
498 yd = cols[y].rc_data;
502 * Pxy = P + D_x + D_y
503 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
505 * We can then solve for D_x:
506 * D_x = A * (P + Pxy) + B * (Q + Qxy)
508 * A = 2^(x - y) * (2^(x - y) + 1)^-1
509 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
511 * With D_x in hand, we can easily solve for D_y:
512 * D_y = P + Pxy + D_x
/* x - y < 0, so index via the 255-cycle: pow2[255 + x - y] == 2^(x-y). */
515 a = vdev_raidz_pow2[255 + x - y];
516 b = vdev_raidz_pow2[255 - (acols - 1 - x)];
/* tmp = log of (a ^ 1)^-1; folding it in gives A and B as logs. */
517 tmp = 255 - vdev_raidz_log2[a ^ 1];
519 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
520 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
/* Solve byte-wise: D_x from the system, then D_y by simple XOR. */
522 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
523 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
524 vdev_raidz_exp2(*q ^ *qxy, bexp);
527 *yd = *p ^ *pxy ^ *xd;
531 * Restore the saved parity data.
533 cols[VDEV_RAIDZ_P].rc_data = pdata;
534 cols[VDEV_RAIDZ_Q].rc_data = qdata;
538 vdev_raidz_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
539 off_t offset, size_t bytes)
541 size_t psize = BP_GET_PSIZE(bp);
543 int unit_shift = vdev->v_ashift;
544 int dcols = vdev->v_nchildren;
545 int nparity = vdev->v_nparity;
546 int missingdata, missingparity;
547 int parity_errors, data_errors, unexpected_errors, total_errors;
549 uint64_t b = offset >> unit_shift;
550 uint64_t s = psize >> unit_shift;
551 uint64_t f = b % dcols;
552 uint64_t o = (b / dcols) << unit_shift;
554 int c, c1, bc, col, acols, devidx, asize, n;
555 static raidz_col_t cols[16];
556 raidz_col_t *rc, *rc1;
558 q = s / (dcols - nparity);
559 r = s - q * (dcols - nparity);
560 bc = (r == 0 ? 0 : r + nparity);
562 acols = (q == 0 ? bc : dcols);
565 for (c = 0; c < acols; c++) {
570 coff += 1ULL << unit_shift;
572 cols[c].rc_devidx = col;
573 cols[c].rc_offset = coff;
574 cols[c].rc_size = (q + (c < bc)) << unit_shift;
575 cols[c].rc_data = NULL;
576 cols[c].rc_error = 0;
577 cols[c].rc_tried = 0;
578 cols[c].rc_skipped = 0;
579 asize += cols[c].rc_size;
582 asize = roundup(asize, (nparity + 1) << unit_shift);
584 for (c = 0; c < nparity; c++) {
585 cols[c].rc_data = zfs_alloc_temp(cols[c].rc_size);
588 cols[c].rc_data = buf;
590 for (c = c + 1; c < acols; c++)
591 cols[c].rc_data = (char *)cols[c - 1].rc_data +
595 * If all data stored spans all columns, there's a danger that
596 * parity will always be on the same device and, since parity
597 * isn't read during normal operation, that that device's I/O
598 * bandwidth won't be used effectively. We therefore switch
599 * the parity every 1MB.
601 * ... at least that was, ostensibly, the theory. As a
602 * practical matter unless we juggle the parity between all
603 * devices evenly, we won't see any benefit. Further,
604 * occasional writes that aren't a multiple of the LCM of the
605 * number of children and the minimum stripe width are
606 * sufficient to avoid pessimal behavior. Unfortunately, this
607 * decision created an implicit on-disk format requirement
608 * that we need to support for all eternity, but only for
609 * single-parity RAID-Z.
611 //ASSERT(acols >= 2);
612 //ASSERT(cols[0].rc_size == cols[1].rc_size);
614 if (nparity == 1 && (offset & (1ULL << 20))) {
615 devidx = cols[0].rc_devidx;
616 o = cols[0].rc_offset;
617 cols[0].rc_devidx = cols[1].rc_devidx;
618 cols[0].rc_offset = cols[1].rc_offset;
619 cols[1].rc_devidx = devidx;
620 cols[1].rc_offset = o;
624 * Iterate over the columns in reverse order so that we hit
625 * the parity last -- any errors along the way will force us
626 * to read the parity data.
630 for (c = acols - 1; c >= 0; c--) {
632 devidx = rc->rc_devidx;
633 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
634 if (kid->v_id == devidx)
636 if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
641 rc->rc_error = ENXIO;
642 rc->rc_tried = 1; /* don't even try */
648 * Too hard for the bootcode
650 if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
652 rm->rm_missingdata++;
654 rm->rm_missingparity++;
655 rc->rc_error = ESTALE;
660 if (c >= nparity || missingdata > 0) {
662 rc->rc_error = kid->v_read(kid, NULL,
663 rc->rc_data, rc->rc_offset, rc->rc_size);
665 rc->rc_error = ENXIO;
674 unexpected_errors = 0;
677 for (c = 0; c < acols; c++) {
690 } else if (c < nparity && !rc->rc_tried) {
696 * There are three potential phases for a read:
697 * 1. produce valid data from the columns read
698 * 2. read all disks and try again
699 * 3. perform combinatorial reconstruction
701 * Each phase is progressively both more expensive and less
702 * likely to occur. If we encounter more errors than we can
703 * repair or all phases fail, we have no choice but to return
708 * If the number of errors we saw was correctable -- less than
709 * or equal to the number of parity disks read -- attempt to
710 * produce data that has a valid checksum. Naturally, this
711 * case applies in the absence of any errors.
713 if (total_errors <= nparity - parity_untried) {
714 switch (data_errors) {
716 if (zio_checksum_error(bp, buf) == 0)
722 * We either attempt to read all the parity columns or
723 * none of them. If we didn't try to read parity, we
724 * wouldn't be here in the correctable case. There must
725 * also have been fewer parity errors than parity
726 * columns or, again, we wouldn't be in this code path.
728 //ASSERT(parity_untried == 0);
729 //ASSERT(parity_errors < nparity);
732 * Find the column that reported the error.
734 for (c = nparity; c < acols; c++) {
736 if (rc->rc_error != 0)
739 //ASSERT(c != acols);
740 //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);
742 if (cols[VDEV_RAIDZ_P].rc_error == 0) {
743 vdev_raidz_reconstruct_p(cols, nparity,
746 //ASSERT(nparity > 1);
747 vdev_raidz_reconstruct_q(cols, nparity,
751 if (zio_checksum_error(bp, buf) == 0)
757 * Two data column errors require double parity.
759 //ASSERT(nparity == 2);
762 * Find the two columns that reported errors.
764 for (c = nparity; c < acols; c++) {
766 if (rc->rc_error != 0)
769 //ASSERT(c != acols);
770 //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);
772 for (c1 = c++; c < acols; c++) {
774 if (rc->rc_error != 0)
777 //ASSERT(c != acols);
778 //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);
780 vdev_raidz_reconstruct_pq(cols, nparity, acols,
783 if (zio_checksum_error(bp, buf) == 0)
789 //ASSERT(nparity <= 2);
795 * This isn't a typical situation -- either we got a read
796 * error or a child silently returned bad data. Read every
797 * block so we can try again with as much data and parity as
798 * we can track down. If we've already been through once
799 * before, all children will be marked as tried so we'll
800 * proceed to combinatorial reconstruction.
803 for (c = 0; c < acols; c++) {
808 devidx = rc->rc_devidx;
809 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
810 if (kid->v_id == devidx)
812 if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
813 rc->rc_error = ENXIO;
814 rc->rc_tried = 1; /* don't even try */
819 rc->rc_error = kid->v_read(kid, NULL,
820 rc->rc_data, rc->rc_offset, rc->rc_size);
822 rc->rc_error = ENXIO;
823 if (rc->rc_error == 0)
830 * If we managed to read anything more, retry the
837 * At this point we've attempted to reconstruct the data given the
838 * errors we detected, and we've attempted to read all columns. There
839 * must, therefore, be one or more additional problems -- silent errors
840 * resulting in invalid data rather than explicit I/O errors resulting
841 * in absent data. Before we attempt combinatorial reconstruction make
842 * sure we have a chance of coming up with the right answer.
844 if (total_errors >= nparity) {
849 for (c = 0; c < acols; c++) {
851 if (rc->rc_size > asize)
854 if (cols[VDEV_RAIDZ_P].rc_error == 0) {
856 * Attempt to reconstruct the data from parity P.
859 orig = zfs_alloc_temp(asize);
860 for (c = nparity; c < acols; c++) {
863 memcpy(orig, rc->rc_data, rc->rc_size);
864 vdev_raidz_reconstruct_p(cols, nparity, acols, c);
866 if (zio_checksum_error(bp, buf) == 0)
869 memcpy(rc->rc_data, orig, rc->rc_size);
873 if (nparity > 1 && cols[VDEV_RAIDZ_Q].rc_error == 0) {
875 * Attempt to reconstruct the data from parity Q.
878 orig = zfs_alloc_temp(asize);
879 for (c = nparity; c < acols; c++) {
882 memcpy(orig, rc->rc_data, rc->rc_size);
883 vdev_raidz_reconstruct_q(cols, nparity, acols, c);
885 if (zio_checksum_error(bp, buf) == 0)
888 memcpy(rc->rc_data, orig, rc->rc_size);
893 cols[VDEV_RAIDZ_P].rc_error == 0 &&
894 cols[VDEV_RAIDZ_Q].rc_error == 0) {
896 * Attempt to reconstruct the data from both P and Q.
899 orig = zfs_alloc_temp(asize);
900 orig1 = zfs_alloc_temp(asize);
901 for (c = nparity; c < acols - 1; c++) {
904 memcpy(orig, rc->rc_data, rc->rc_size);
906 for (c1 = c + 1; c1 < acols; c1++) {
909 memcpy(orig1, rc1->rc_data, rc1->rc_size);
911 vdev_raidz_reconstruct_pq(cols, nparity,
914 if (zio_checksum_error(bp, buf) == 0)
917 memcpy(rc1->rc_data, orig1, rc1->rc_size);
920 memcpy(rc->rc_data, orig, rc->rc_size);