4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
29 static uint64_t zfs_crc64_table[256];

/*
 * Lazily fill in the CRC-64 lookup table used by zap_hash() below.
 * Once filled, entry 128 holds ZFS_CRC64_POLY, so that value doubles
 * as the "already initialized" flag tested here.
 * NOTE(review): the enclosing function's signature and several lines
 * are elided in this extract — verify against the full source.
 */
38 * Calculate the crc64 table (used for the zap hash
41 if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
42 memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
43 for (i = 0; i < 256; i++)
/* Reflected CRC-64: shift right, XOR in the polynomial when the LSB is set. */
44 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
45 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
/*
 * "off" checksum vector: stores an all-zero checksum regardless of the
 * buffer contents.  Used for the "off" entry of zio_checksum_table.
 * NOTE(review): the function's braces are elided in this extract.
 */
50 zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
52 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
56 * Signature for checksum functions.
58 typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);

61 * Information about each checksum function.
63 typedef struct zio_checksum_info {
/*
 * NOTE(review): zio_checksum_table below only populates index 0 with
 * *_native functions, so index 0 appears to be the native byteorder;
 * confirm against the full source before relying on that.
 */
64 zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
65 int ci_correctable; /* number of correctable bits */
66 int ci_zbt; /* uses zio block tail? */
67 const char *ci_name; /* descriptive name */
68 } zio_checksum_info_t;
/*
 * Checksum algorithm table, indexed by the ZIO_CHECKSUM_* value taken
 * from the block pointer (see BP_GET_CHECKSUM in zio_checksum_error).
 * Only the index-0 (native) function slot is populated here; entries
 * with ci_zbt != 0 carry their checksum in a zio block tail.
 * NOTE(review): the initializer's closing "};" is elided in this extract.
 */
73 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
74 {{NULL, NULL}, 0, 0, "inherit"},
75 {{NULL, NULL}, 0, 0, "on"},
76 {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"},
77 {{zio_checksum_SHA256, NULL}, 1, 1, "label"},
78 {{zio_checksum_SHA256, NULL}, 1, 1, "gang_header"},
79 {{fletcher_2_native, NULL}, 0, 1, "zilog"},
80 {{fletcher_2_native, NULL}, 0, 0, "fletcher2"},
81 {{fletcher_4_native, NULL}, 1, 0, "fletcher4"},
82 {{zio_checksum_SHA256, NULL}, 1, 0, "SHA256"},
86 * Common signature for all zio compress/decompress functions.
/* The final int parameter is the per-algorithm level (see ci_level below). */
88 typedef size_t zio_compress_func_t(void *src, void *dst,
89 size_t s_len, size_t d_len, int);
90 typedef int zio_decompress_func_t(void *src, void *dst,
91 size_t s_len, size_t d_len, int);

94 * Information about each compression function.
96 typedef struct zio_compress_info {
97 zio_compress_func_t *ci_compress; /* compression function */
98 zio_decompress_func_t *ci_decompress; /* decompression function */
99 int ci_level; /* level parameter */
100 const char *ci_name; /* algorithm name */
101 } zio_compress_info_t;
106 * Compression vectors.
/*
 * Indexed by the ZIO_COMPRESS_* value from the block pointer.  In this
 * (read-only boot) build, only lzjb decompression is wired up; all
 * compression slots and the gzip decompressors are NULL, which
 * zio_decompress_data() reports as "unsupported".
 * NOTE(review): the initializer's closing "};" is elided in this extract.
 */
108 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
109 {NULL, NULL, 0, "inherit"},
110 {NULL, NULL, 0, "on"},
111 {NULL, NULL, 0, "uncompressed"},
112 {NULL, lzjb_decompress, 0, "lzjb"},
113 {NULL, NULL, 0, "empty"},
114 {NULL, NULL, 1, "gzip-1"},
115 {NULL, NULL, 2, "gzip-2"},
116 {NULL, NULL, 3, "gzip-3"},
117 {NULL, NULL, 4, "gzip-4"},
118 {NULL, NULL, 5, "gzip-5"},
119 {NULL, NULL, 6, "gzip-6"},
120 {NULL, NULL, 7, "gzip-7"},
121 {NULL, NULL, 8, "gzip-8"},
122 {NULL, NULL, 9, "gzip-9"},
/*
 * Verify the checksum of a block: recompute the checksum of 'data'
 * (PSIZE bytes, per the block pointer) and compare it with the value
 * recorded in the block pointer.  For embedded-checksum (ci_zbt)
 * algorithms, the expected checksum lives in the zio_block_tail at the
 * end of the buffer; it is temporarily saved, the tail field is used
 * during computation, then restored.
 * NOTE(review): the ZBT-vs-plain branch structure, the zbt_cksum
 * substitution before recomputation, and the return statements are
 * elided in this extract — consult the full source.
 */
126 zio_checksum_error(const blkptr_t *bp, void *data)
128 zio_cksum_t zc = bp->blk_cksum;
129 unsigned int checksum = BP_GET_CHECKSUM(bp);
130 uint64_t size = BP_GET_PSIZE(bp);
/* The block tail sits in the last sizeof(zio_block_tail_t) bytes of data. */
131 zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
132 zio_checksum_info_t *ci = &zio_checksum_table[checksum];
133 zio_cksum_t actual_cksum, expected_cksum;
/* Reject unknown or unimplemented checksum algorithms. */
135 if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
139 expected_cksum = zbt->zbt_cksum;
141 ci->ci_func[0](data, size, &actual_cksum);
/* Restore the tail checksum clobbered while computing. */
142 zbt->zbt_cksum = expected_cksum;
145 /* ASSERT(!BP_IS_GANG(bp)); */
146 ci->ci_func[0](data, size, &actual_cksum);
149 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc)) {
150 /*printf("ZFS: read checksum failed\n");*/
/*
 * Decompress srcsize bytes at 'src' into 'dest' (destsize bytes) using
 * algorithm 'cpfunc' (a ZIO_COMPRESS_* index into zio_compress_table).
 * Returns the decompressor's status; algorithms without a decompressor
 * in this build are reported as unsupported.
 * NOTE(review): the error-return after the printf is elided in this
 * extract.
 */
158 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
159 void *dest, uint64_t destsize)
161 zio_compress_info_t *ci = &zio_compress_table[cpfunc];
163 /* ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); */
164 if (!ci->ci_decompress) {
165 printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
169 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
/*
 * CRC-64 based hash of a ZAP attribute name, seeded with the per-object
 * salt.  Uses zfs_crc64_table built above.  Only the top ZAP_HASHBITS
 * bits of the result are kept (see comment below).
 * NOTE(review): the declarations of crc/cp/c and the return statement
 * are elided in this extract.
 */
173 zap_hash(uint64_t salt, const char *name)
179 /*ASSERT(crc != 0);*/
180 /*ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);*/
181 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
182 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
185 * Only use 28 bits, since we need 4 bits in the cookie for the
186 * collision differentiator. We MUST use the high bits, since
187 * those are the ones that we first pay attention to when
188 * choosing the bucket.
190 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
/* Bootcode-local temporary allocator (definition elsewhere in the file). */
195 static char *zfs_alloc_temp(size_t sz);

/*
 * Per-column state for one RAID-Z I/O.  A "column" is the slice of the
 * stripe that lives on one child device; columns 0..nparity-1 hold
 * parity (P, then Q), the rest hold data.
 * NOTE(review): the closing "} raidz_col_t;" of this struct is elided
 * in this extract.
 */
197 typedef struct raidz_col {
198 uint64_t rc_devidx; /* child device index for I/O */
199 uint64_t rc_offset; /* device offset */
200 uint64_t rc_size; /* I/O size */
201 void *rc_data; /* I/O data */
202 int rc_error; /* I/O error for this device */
203 uint8_t rc_tried; /* Did we attempt this I/O column? */
204 uint8_t rc_skipped; /* Did we skip this I/O column? */

/* Fixed column indices of the P and Q parity columns. */
207 #define VDEV_RAIDZ_P 0
208 #define VDEV_RAIDZ_Q 1
/*
 * Reconstruct a single missing data column 'x' from the P (XOR) parity:
 * start from the P column, then fold in every other data column.
 * NOTE(review): the loop bodies (presumably *dst = *src and *dst ^= *src)
 * and the "skip column x" branch are elided in this extract.
 */
211 vdev_raidz_reconstruct_p(raidz_col_t *cols, int nparity, int acols, int x)
213 uint64_t *dst, *src, xcount, ccount, count, i;
216 xcount = cols[x].rc_size / sizeof (src[0]);
217 //ASSERT(xcount <= cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
218 //ASSERT(xcount > 0);
/* Seed the target column with the P parity. */
220 src = cols[VDEV_RAIDZ_P].rc_data;
221 dst = cols[x].rc_data;
222 for (i = 0; i < xcount; i++, dst++, src++) {
/* XOR in every other data column; short columns contribute only rc_size. */
226 for (c = nparity; c < acols; c++) {
227 src = cols[c].rc_data;
228 dst = cols[x].rc_data;
233 ccount = cols[c].rc_size / sizeof (src[0]);
234 count = MIN(ccount, xcount);
236 for (i = 0; i < count; i++, dst++, src++) {
243 * These two tables represent powers and logs of 2 in the Galois field defined
244 * above. These values were computed by repeatedly multiplying by 2 as above.
/*
 * vdev_raidz_pow2[i] == 2^i in GF(2^8) with the RAID-6 generator
 * polynomial (the 0x1d reduction used in the parity loops below).
 * Note 2^255 == 2^0 == 1, so the table wraps.
 * NOTE(review): the initializer's closing "};" is elided in this extract.
 */
246 static const uint8_t vdev_raidz_pow2[256] = {
247 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
248 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
249 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
250 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
251 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
252 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
253 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
254 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
255 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
256 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
257 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
258 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
259 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
260 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
261 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
262 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
263 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
264 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
265 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
266 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
267 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
268 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
269 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
270 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
271 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
272 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
273 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
274 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
275 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
276 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
277 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
278 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
/*
 * Inverse of vdev_raidz_pow2: vdev_raidz_log2[v] is the i such that
 * 2^i == v in the field.  Entry 0 is unused (log of 0 is undefined);
 * callers such as vdev_raidz_exp2 guard against a == 0 / a == 1.
 * NOTE(review): the initializer's closing "};" is elided in this extract.
 */
280 static const uint8_t vdev_raidz_log2[256] = {
281 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
282 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
283 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
284 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
285 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
286 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
287 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
288 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
289 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
290 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
291 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
292 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
293 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
294 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
295 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
296 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
297 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
298 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
299 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
300 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
301 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
302 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
303 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
304 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
305 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
306 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
307 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
308 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
309 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
310 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
311 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
312 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
316 * Multiply a given number by 2 raised to the given power.
/*
 * Computes a * 2^exp in GF(2^8) via the log/exp tables:
 * log(a) + exp, then back through pow2.  The a == 0 early-out and the
 * modulo-255 reduction of 'exp' are elided in this extract —
 * TODO confirm against the full source.
 */
319 vdev_raidz_exp2(uint8_t a, int exp)
325 //ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
327 exp += vdev_raidz_log2[a];
331 return (vdev_raidz_pow2[exp]);
/*
 * Generate both P (plain XOR) and Q (GF(2^8) Reed-Solomon) parity over
 * the data columns.  Q is built incrementally: before folding in each
 * successive column, the running Q is multiplied by 2 in the field,
 * eight bytes at a time (see the mask trick commented below).
 * NOTE(review): the first-column (c == nparity) copy branch and the
 * loop bodies that XOR src into p/q are elided in this extract.
 */
335 vdev_raidz_generate_parity_pq(raidz_col_t *cols, int nparity, int acols)
337 uint64_t *q, *p, *src, pcount, ccount, mask, i;
340 pcount = cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
341 //ASSERT(cols[VDEV_RAIDZ_P].rc_size == cols[VDEV_RAIDZ_Q].rc_size);
343 for (c = nparity; c < acols; c++) {
344 src = cols[c].rc_data;
345 p = cols[VDEV_RAIDZ_P].rc_data;
346 q = cols[VDEV_RAIDZ_Q].rc_data;
347 ccount = cols[c].rc_size / sizeof (src[0]);
350 //ASSERT(ccount == pcount || ccount == 0);
351 for (i = 0; i < ccount; i++, p++, q++, src++) {
355 for (; i < pcount; i++, p++, q++, src++) {
360 //ASSERT(ccount <= pcount);
363 * Rather than multiplying each byte
364 * individually (as described above), we are
365 * able to handle 8 at once by generating a
366 * mask based on the high bit in each byte and
367 * using that to conditionally XOR in 0x1d.
/* mask: 0xff for each byte whose high bit was set, 0x00 otherwise. */
369 for (i = 0; i < ccount; i++, p++, q++, src++) {
370 mask = *q & 0x8080808080808080ULL;
371 mask = (mask << 1) - (mask >> 7);
372 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
373 (mask & 0x1d1d1d1d1d1d1d1dULL);
379 * Treat short columns as though they are full of 0s.
/* Short column: still advance Q's power of 2 without XORing data. */
381 for (; i < pcount; i++, q++) {
382 mask = *q & 0x8080808080808080ULL;
383 mask = (mask << 1) - (mask >> 7);
384 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
385 (mask & 0x1d1d1d1d1d1d1d1dULL);
/*
 * Reconstruct a single missing data column 'x' from the Q parity.
 * Re-runs the Q accumulation over the surviving data columns into
 * column x, XORs in the real Q, then divides by the column's power of
 * 2 — implemented as multiplication by 2^(255 - (acols - 1 - x)), since
 * 2^255 == 1 in this field.
 * NOTE(review): the "skip column x" branch, the copy/XOR loop bodies,
 * and the final *dst ^= *src before the byte-wise exp2 are elided in
 * this extract.
 */
392 vdev_raidz_reconstruct_q(raidz_col_t *cols, int nparity, int acols, int x)
394 uint64_t *dst, *src, xcount, ccount, count, mask, i;
398 xcount = cols[x].rc_size / sizeof (src[0]);
399 //ASSERT(xcount <= cols[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
401 for (c = nparity; c < acols; c++) {
402 src = cols[c].rc_data;
403 dst = cols[x].rc_data;
408 ccount = cols[c].rc_size / sizeof (src[0]);
410 count = MIN(ccount, xcount);
413 for (i = 0; i < count; i++, dst++, src++) {
416 for (; i < xcount; i++, dst++) {
422 * For an explanation of this, see the comment in
423 * vdev_raidz_generate_parity_pq() above.
425 for (i = 0; i < count; i++, dst++, src++) {
426 mask = *dst & 0x8080808080808080ULL;
427 mask = (mask << 1) - (mask >> 7);
428 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
429 (mask & 0x1d1d1d1d1d1d1d1dULL);
433 for (; i < xcount; i++, dst++) {
434 mask = *dst & 0x8080808080808080ULL;
435 mask = (mask << 1) - (mask >> 7);
436 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
437 (mask & 0x1d1d1d1d1d1d1d1dULL);
/* Fold in the real Q parity, then undo the column's power of 2 per byte. */
442 src = cols[VDEV_RAIDZ_Q].rc_data;
443 dst = cols[x].rc_data;
444 exp = 255 - (acols - 1 - x);
446 for (i = 0; i < xcount; i++, dst++, src++) {
448 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
449 *b = vdev_raidz_exp2(*b, exp);
/*
 * Reconstruct two missing data columns x and y (x < y is implied by the
 * elided asserts) using both P and Q parity.  Strategy: temporarily
 * swap the real parity buffers for the caller-supplied scratch buffers
 * (temp_p/temp_q), regenerate parity as if columns x and y were zero
 * (Pxy/Qxy), then solve the 2x2 linear system over GF(2^8) described
 * in the comment block below.
 * NOTE(review): several lines are elided in this extract, including
 * the zeroing of cols[x]/cols[y] rc_size before parity regeneration
 * and the p/q pointer initialization before the solve loop.
 */
456 vdev_raidz_reconstruct_pq(raidz_col_t *cols, int nparity, int acols,
457 int x, int y, void *temp_p, void *temp_q)
459 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
461 uint64_t xsize, ysize, i;
464 //ASSERT(x >= nparity);
467 //ASSERT(cols[x].rc_size >= cols[y].rc_size);
470 * Move the parity data aside -- we're going to compute parity as
471 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
472 * reuse the parity generation mechanism without trashing the actual
473 * parity so we make those columns appear to be full of zeros by
474 * setting their lengths to zero.
476 pdata = cols[VDEV_RAIDZ_P].rc_data;
477 qdata = cols[VDEV_RAIDZ_Q].rc_data;
478 xsize = cols[x].rc_size;
479 ysize = cols[y].rc_size;
481 cols[VDEV_RAIDZ_P].rc_data = temp_p;
482 cols[VDEV_RAIDZ_Q].rc_data = temp_q;
/* Compute Pxy/Qxy into the scratch buffers. */
486 vdev_raidz_generate_parity_pq(cols, nparity, acols);
488 cols[x].rc_size = xsize;
489 cols[y].rc_size = ysize;
493 pxy = cols[VDEV_RAIDZ_P].rc_data;
494 qxy = cols[VDEV_RAIDZ_Q].rc_data;
495 xd = cols[x].rc_data;
496 yd = cols[y].rc_data;
500 * Pxy = P + D_x + D_y
501 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
503 * We can then solve for D_x:
504 * D_x = A * (P + Pxy) + B * (Q + Qxy)
506 * A = 2^(x - y) * (2^(x - y) + 1)^-1
507 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
509 * With D_x in hand, we can easily solve for D_y:
510 * D_y = P + Pxy + D_x
/* tmp is log of the inverse of (2^(x - y) + 1); aexp/bexp are log(A), log(B). */
513 a = vdev_raidz_pow2[255 + x - y];
514 b = vdev_raidz_pow2[255 - (acols - 1 - x)];
515 tmp = 255 - vdev_raidz_log2[a ^ 1];
517 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
518 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
520 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
521 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
522 vdev_raidz_exp2(*q ^ *qxy, bexp);
/* D_y only exists where column y has data (i < ysize, per elided guard). */
525 *yd = *p ^ *pxy ^ *xd;
529 * Restore the saved parity data.
531 cols[VDEV_RAIDZ_P].rc_data = pdata;
532 cols[VDEV_RAIDZ_Q].rc_data = qdata;
/*
 * Read and, if necessary, repair one block from a RAID-Z vdev.
 * Mirrors the kernel's vdev_raidz_io_start/io_done logic in a
 * simplified, synchronous, boot-time form:
 *   1. map the block onto per-device columns (geometry),
 *   2. read the data columns (and parity only when data is missing),
 *   3. verify the checksum; on failure reconstruct from P, Q, or P+Q,
 *   4. if that fails, read everything and fall back to combinatorial
 *      reconstruction, substituting each column (or pair) in turn.
 * NOTE(review): a large number of lines are elided throughout this
 * extract (error tallying, retry control flow, returns); the comments
 * below describe only what the visible lines establish.
 */
536 vdev_raidz_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
537 off_t offset, size_t bytes)
539 size_t psize = BP_GET_PSIZE(bp);
541 int unit_shift = vdev->v_ashift;
542 int dcols = vdev->v_nchildren;
543 int nparity = vdev->v_nparity;
544 int missingdata, missingparity;
545 int parity_errors, data_errors, unexpected_errors, total_errors;
/* Geometry: first sector, total sectors, starting column, base offset. */
547 uint64_t b = offset >> unit_shift;
548 uint64_t s = psize >> unit_shift;
549 uint64_t f = b % dcols;
550 uint64_t o = (b / dcols) << unit_shift;
552 int c, c1, bc, col, acols, devidx, asize, n, max_rc_size;
553 static raidz_col_t cols[16];
554 raidz_col_t *rc, *rc1;
555 void *orig, *orig1, *temp_p, *temp_q;
557 orig = orig1 = temp_p = temp_q = NULL;
/* q full rows per column, r leftover sectors, bc columns in the "big" row. */
559 q = s / (dcols - nparity);
560 r = s - q * (dcols - nparity);
561 bc = (r == 0 ? 0 : r + nparity);
563 acols = (q == 0 ? bc : dcols);
/* Initialize every active column's device, offset, size and status. */
567 for (c = 0; c < acols; c++) {
572 coff += 1ULL << unit_shift;
574 cols[c].rc_devidx = col;
575 cols[c].rc_offset = coff;
576 cols[c].rc_size = (q + (c < bc)) << unit_shift;
577 cols[c].rc_data = NULL;
578 cols[c].rc_error = 0;
579 cols[c].rc_tried = 0;
580 cols[c].rc_skipped = 0;
581 asize += cols[c].rc_size;
582 if (cols[c].rc_size > max_rc_size)
583 max_rc_size = cols[c].rc_size;
586 asize = roundup(asize, (nparity + 1) << unit_shift);
/* Parity columns get scratch buffers; data columns point into 'buf'. */
588 for (c = 0; c < nparity; c++) {
589 cols[c].rc_data = zfs_alloc_temp(cols[c].rc_size);
592 cols[c].rc_data = buf;
594 for (c = c + 1; c < acols; c++)
595 cols[c].rc_data = (char *)cols[c - 1].rc_data +
599 * If all data stored spans all columns, there's a danger that
600 * parity will always be on the same device and, since parity
601 * isn't read during normal operation, that that device's I/O
602 * bandwidth won't be used effectively. We therefore switch
603 * the parity every 1MB.
605 * ... at least that was, ostensibly, the theory. As a
606 * practical matter unless we juggle the parity between all
607 * devices evenly, we won't see any benefit. Further,
608 * occasional writes that aren't a multiple of the LCM of the
609 * number of children and the minimum stripe width are
610 * sufficient to avoid pessimal behavior. Unfortunately, this
611 * decision created an implicit on-disk format requirement
612 * that we need to support for all eternity, but only for
613 * single-parity RAID-Z.
615 //ASSERT(acols >= 2);
616 //ASSERT(cols[0].rc_size == cols[1].rc_size);
618 if (nparity == 1 && (offset & (1ULL << 20))) {
619 devidx = cols[0].rc_devidx;
620 o = cols[0].rc_offset;
621 cols[0].rc_devidx = cols[1].rc_devidx;
622 cols[0].rc_offset = cols[1].rc_offset;
623 cols[1].rc_devidx = devidx;
624 cols[1].rc_offset = o;
628 * Iterate over the columns in reverse order so that we hit
629 * the parity last -- any errors along the way will force us
630 * to read the parity data.
634 for (c = acols - 1; c >= 0; c--) {
636 devidx = rc->rc_devidx;
/* Locate the child vdev backing this column by device id. */
637 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
638 if (kid->v_id == devidx)
640 if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
645 rc->rc_error = ENXIO;
646 rc->rc_tried = 1; /* don't even try */
/* DTL (dirty-time-log) checks are stubbed out in this bootcode path. */
652 * Too hard for the bootcode
654 if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
656 rm->rm_missingdata++;
658 rm->rm_missingparity++;
659 rc->rc_error = ESTALE;
/* Read data columns always; parity columns only once data is missing. */
664 if (c >= nparity || missingdata > 0) {
666 rc->rc_error = kid->v_read(kid, NULL,
667 rc->rc_data, rc->rc_offset, rc->rc_size);
669 rc->rc_error = ENXIO;
/* Tally errors per category (most of the tallying is elided here). */
678 unexpected_errors = 0;
681 for (c = 0; c < acols; c++) {
694 } else if (c < nparity && !rc->rc_tried) {
700 * There are three potential phases for a read:
701 * 1. produce valid data from the columns read
702 * 2. read all disks and try again
703 * 3. perform combinatorial reconstruction
705 * Each phase is progressively both more expensive and less
706 * likely to occur. If we encounter more errors than we can
707 * repair or all phases fail, we have no choice but to return
712 * If the number of errors we saw was correctable -- less than
713 * or equal to the number of parity disks read -- attempt to
714 * produce data that has a valid checksum. Naturally, this
715 * case applies in the absence of any errors.
717 if (total_errors <= nparity - parity_untried) {
718 switch (data_errors) {
/* Zero data errors: the buffer may already verify as-is. */
720 if (zio_checksum_error(bp, buf) == 0)
726 * We either attempt to read all the parity columns or
727 * none of them. If we didn't try to read parity, we
728 * wouldn't be here in the correctable case. There must
729 * also have been fewer parity errors than parity
730 * columns or, again, we wouldn't be in this code path.
732 //ASSERT(parity_untried == 0);
733 //ASSERT(parity_errors < nparity);
736 * Find the column that reported the error.
738 for (c = nparity; c < acols; c++) {
740 if (rc->rc_error != 0)
743 //ASSERT(c != acols);
744 //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);
/* One data error: rebuild from whichever parity column survived. */
746 if (cols[VDEV_RAIDZ_P].rc_error == 0) {
747 vdev_raidz_reconstruct_p(cols, nparity,
750 //ASSERT(nparity > 1);
751 vdev_raidz_reconstruct_q(cols, nparity,
755 if (zio_checksum_error(bp, buf) == 0)
761 * Two data column errors require double parity.
763 //ASSERT(nparity == 2);
766 * Find the two columns that reported errors.
768 for (c = nparity; c < acols; c++) {
770 if (rc->rc_error != 0)
773 //ASSERT(c != acols);
774 //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);
776 for (c1 = c++; c < acols; c++) {
778 if (rc->rc_error != 0)
781 //ASSERT(c != acols);
782 //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);
/* Scratch parity buffers, lazily allocated at the maximum column size. */
785 temp_p = zfs_alloc_temp(max_rc_size);
787 temp_q = zfs_alloc_temp(max_rc_size);
789 vdev_raidz_reconstruct_pq(cols, nparity, acols,
790 c1, c, temp_p, temp_q);
792 if (zio_checksum_error(bp, buf) == 0)
798 //ASSERT(nparity <= 2);
804 * This isn't a typical situation -- either we got a read
805 * error or a child silently returned bad data. Read every
806 * block so we can try again with as much data and parity as
807 * we can track down. If we've already been through once
808 * before, all children will be marked as tried so we'll
809 * proceed to combinatorial reconstruction.
812 for (c = 0; c < acols; c++) {
817 devidx = rc->rc_devidx;
818 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
819 if (kid->v_id == devidx)
821 if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
822 rc->rc_error = ENXIO;
823 rc->rc_tried = 1; /* don't even try */
828 rc->rc_error = kid->v_read(kid, NULL,
829 rc->rc_data, rc->rc_offset, rc->rc_size);
831 rc->rc_error = ENXIO;
832 if (rc->rc_error == 0)
839 * If we managed to read anything more, retry the
846 * At this point we've attempted to reconstruct the data given the
847 * errors we detected, and we've attempted to read all columns. There
848 * must, therefore, be one or more additional problems -- silent errors
849 * resulting in invalid data rather than explicit I/O errors resulting
850 * in absent data. Before we attempt combinatorial reconstruction make
851 * sure we have a chance of coming up with the right answer.
853 if (total_errors >= nparity) {
/* Combinatorial phase 1: substitute each data column via P, undo on miss. */
857 if (cols[VDEV_RAIDZ_P].rc_error == 0) {
859 * Attempt to reconstruct the data from parity P.
862 orig = zfs_alloc_temp(max_rc_size);
863 for (c = nparity; c < acols; c++) {
866 memcpy(orig, rc->rc_data, rc->rc_size);
867 vdev_raidz_reconstruct_p(cols, nparity, acols, c);
869 if (zio_checksum_error(bp, buf) == 0)
872 memcpy(rc->rc_data, orig, rc->rc_size);
/* Combinatorial phase 2: same, via Q (double-parity pools only). */
876 if (nparity > 1 && cols[VDEV_RAIDZ_Q].rc_error == 0) {
878 * Attempt to reconstruct the data from parity Q.
881 orig = zfs_alloc_temp(max_rc_size);
882 for (c = nparity; c < acols; c++) {
885 memcpy(orig, rc->rc_data, rc->rc_size);
886 vdev_raidz_reconstruct_q(cols, nparity, acols, c);
888 if (zio_checksum_error(bp, buf) == 0)
891 memcpy(rc->rc_data, orig, rc->rc_size);
/* Combinatorial phase 3: every pair of data columns via P+Q together. */
896 cols[VDEV_RAIDZ_P].rc_error == 0 &&
897 cols[VDEV_RAIDZ_Q].rc_error == 0) {
899 * Attempt to reconstruct the data from both P and Q.
902 orig = zfs_alloc_temp(max_rc_size);
904 orig1 = zfs_alloc_temp(max_rc_size);
906 temp_p = zfs_alloc_temp(max_rc_size);
908 temp_q = zfs_alloc_temp(max_rc_size);
909 for (c = nparity; c < acols - 1; c++) {
912 memcpy(orig, rc->rc_data, rc->rc_size);
914 for (c1 = c + 1; c1 < acols; c1++) {
917 memcpy(orig1, rc1->rc_data, rc1->rc_size);
919 vdev_raidz_reconstruct_pq(cols, nparity,
920 acols, c, c1, temp_p, temp_q);
922 if (zio_checksum_error(bp, buf) == 0)
925 memcpy(rc1->rc_data, orig1, rc1->rc_size);
928 memcpy(rc->rc_data, orig, rc->rc_size);