4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
29 static uint64_t zfs_crc64_table[256];
/*
 * Stand-alone environment: assertions compile to nothing, panic prints
 * the message and hangs (there is nowhere to return to in the loader).
 */
#define	ASSERT(...)	do { } while (0)
#define	ASSERT3U(...)	do { } while (0)
#define	ASSERT3S(...)	do { } while (0)

#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;)							\
		;							\
} while (0)

/* Map kernel allocator calls onto the loader's simple allocator. */
#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))
52 * Calculate the crc64 table (used for the zap hash
55 if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
56 memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
57 for (i = 0; i < 256; i++)
58 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
59 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
64 zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
66 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
70 * Signature for checksum functions.
72 typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
75 * Information about each checksum function.
77 typedef struct zio_checksum_info {
78 zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
79 int ci_correctable; /* number of correctable bits */
80 int ci_eck; /* uses zio embedded checksum? */
81 int ci_dedup; /* strong enough for dedup? */
82 const char *ci_name; /* descriptive name */
83 } zio_checksum_info_t;
88 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
89 {{NULL, NULL}, 0, 0, 0, "inherit"},
90 {{NULL, NULL}, 0, 0, 0, "on"},
91 {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"},
92 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"},
93 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"},
94 {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"},
95 {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"},
96 {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
97 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "SHA256"},
98 {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zillog2"},
103 * Common signature for all zio compress/decompress functions.
105 typedef size_t zio_compress_func_t(void *src, void *dst,
106 size_t s_len, size_t d_len, int);
107 typedef int zio_decompress_func_t(void *src, void *dst,
108 size_t s_len, size_t d_len, int);
111 * Information about each compression function.
113 typedef struct zio_compress_info {
114 zio_compress_func_t *ci_compress; /* compression function */
115 zio_decompress_func_t *ci_decompress; /* decompression function */
116 int ci_level; /* level parameter */
117 const char *ci_name; /* algorithm name */
118 } zio_compress_info_t;
125 * Compression vectors.
127 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
128 {NULL, NULL, 0, "inherit"},
129 {NULL, NULL, 0, "on"},
130 {NULL, NULL, 0, "uncompressed"},
131 {NULL, lzjb_decompress, 0, "lzjb"},
132 {NULL, NULL, 0, "empty"},
133 {NULL, NULL, 1, "gzip-1"},
134 {NULL, NULL, 2, "gzip-2"},
135 {NULL, NULL, 3, "gzip-3"},
136 {NULL, NULL, 4, "gzip-4"},
137 {NULL, NULL, 5, "gzip-5"},
138 {NULL, NULL, 6, "gzip-6"},
139 {NULL, NULL, 7, "gzip-7"},
140 {NULL, NULL, 8, "gzip-8"},
141 {NULL, NULL, 9, "gzip-9"},
142 {NULL, zle_decompress, 64, "zle"},
143 {NULL, lz4_decompress, 0, "lz4"},
147 byteswap_uint64_array(void *vbuf, size_t size)
149 uint64_t *buf = vbuf;
150 size_t count = size >> 3;
153 ASSERT((size & 7) == 0);
155 for (i = 0; i < count; i++)
156 buf[i] = BSWAP_64(buf[i]);
160 * Set the external verifier for a gang block based on <vdev, offset, txg>,
161 * a tuple which is guaranteed to be unique for the life of the pool.
164 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
166 const dva_t *dva = BP_IDENTITY(bp);
167 uint64_t txg = BP_PHYSICAL_BIRTH(bp);
169 ASSERT(BP_IS_GANG(bp));
171 ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
175 * Set the external verifier for a label block based on its offset.
176 * The vdev is implicit, and the txg is unknowable at pool open time --
177 * hence the logic in vdev_uberblock_load() to find the most recent copy.
180 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
182 ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
186 zio_checksum_verify(const blkptr_t *bp, void *data)
189 unsigned int checksum;
190 zio_checksum_info_t *ci;
191 zio_cksum_t actual_cksum, expected_cksum, verifier;
194 checksum = BP_GET_CHECKSUM(bp);
195 size = BP_GET_PSIZE(bp);
197 if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
199 ci = &zio_checksum_table[checksum];
200 if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
206 ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
207 checksum == ZIO_CHECKSUM_LABEL);
209 eck = (zio_eck_t *)((char *)data + size) - 1;
211 if (checksum == ZIO_CHECKSUM_GANG_HEADER)
212 zio_checksum_gang_verifier(&verifier, bp);
213 else if (checksum == ZIO_CHECKSUM_LABEL)
214 zio_checksum_label_verifier(&verifier,
215 DVA_GET_OFFSET(BP_IDENTITY(bp)));
217 verifier = bp->blk_cksum;
219 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
222 byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
224 expected_cksum = eck->zec_cksum;
225 eck->zec_cksum = verifier;
226 ci->ci_func[byteswap](data, size, &actual_cksum);
227 eck->zec_cksum = expected_cksum;
230 byteswap_uint64_array(&expected_cksum,
231 sizeof (zio_cksum_t));
233 expected_cksum = bp->blk_cksum;
234 ci->ci_func[0](data, size, &actual_cksum);
237 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
238 /*printf("ZFS: read checksum failed\n");*/
246 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
247 void *dest, uint64_t destsize)
249 zio_compress_info_t *ci;
251 if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
252 printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
256 ci = &zio_compress_table[cpfunc];
257 if (!ci->ci_decompress) {
258 printf("ZFS: unsupported compression algorithm %s\n",
263 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
267 zap_hash(uint64_t salt, const char *name)
274 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
275 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
276 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
279 * Only use 28 bits, since we need 4 bits in the cookie for the
280 * collision differentiator. We MUST use the high bits, since
281 * those are the onces that we first pay attention to when
282 * chosing the bucket.
284 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
289 static void *zfs_alloc(size_t size);
290 static void zfs_free(void *ptr, size_t size);
292 typedef struct raidz_col {
293 uint64_t rc_devidx; /* child device index for I/O */
294 uint64_t rc_offset; /* device offset */
295 uint64_t rc_size; /* I/O size */
296 void *rc_data; /* I/O data */
297 int rc_error; /* I/O error for this device */
298 uint8_t rc_tried; /* Did we attempt this I/O column? */
299 uint8_t rc_skipped; /* Did we skip this I/O column? */
302 typedef struct raidz_map {
303 uint64_t rm_cols; /* Regular column count */
304 uint64_t rm_scols; /* Count including skipped columns */
305 uint64_t rm_bigcols; /* Number of oversized columns */
306 uint64_t rm_asize; /* Actual total I/O size */
307 uint64_t rm_missingdata; /* Count of missing data devices */
308 uint64_t rm_missingparity; /* Count of missing parity devices */
309 uint64_t rm_firstdatacol; /* First data column/parity count */
310 uint64_t rm_nskip; /* Skipped sectors for padding */
311 uint64_t rm_skipstart; /* Column index of padding start */
312 uintptr_t rm_reports; /* # of referencing checksum reports */
313 uint8_t rm_freed; /* map no longer has referencing ZIO */
314 uint8_t rm_ecksuminjected; /* checksum error was injected */
315 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
/* Parity column indices within a raidz_map. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/* GF(2^8) multiply-by-2 and by-4, one byte at a time (poly 0x11d). */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time.  This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask)					\
do {									\
	(mask) = (x) & 0x8080808080808080ULL;				\
	(mask) = ((mask) << 1) - ((mask) >> 7);				\
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^			\
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL);				\
} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask)					\
do {									\
	VDEV_RAIDZ_64MUL_2((x), mask);					\
	VDEV_RAIDZ_64MUL_2((x), mask);					\
} while (0)
346 * These two tables represent powers and logs of 2 in the Galois field defined
347 * above. These values were computed by repeatedly multiplying by 2 as above.
349 static const uint8_t vdev_raidz_pow2[256] = {
350 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
351 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
352 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
353 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
354 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
355 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
356 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
357 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
358 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
359 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
360 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
361 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
362 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
363 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
364 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
365 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
366 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
367 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
368 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
369 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
370 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
371 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
372 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
373 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
374 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
375 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
376 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
377 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
378 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
379 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
380 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
381 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
383 static const uint8_t vdev_raidz_log2[256] = {
384 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
385 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
386 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
387 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
388 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
389 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
390 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
391 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
392 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
393 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
394 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
395 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
396 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
397 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
398 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
399 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
400 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
401 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
402 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
403 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
404 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
405 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
406 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
407 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
408 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
409 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
410 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
411 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
412 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
413 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
414 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
415 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
419 * Multiply a given number by 2 raised to the given power.
422 vdev_raidz_exp2(uint8_t a, int exp)
428 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
430 exp += vdev_raidz_log2[a];
434 return (vdev_raidz_pow2[exp]);
438 vdev_raidz_generate_parity_p(raidz_map_t *rm)
440 uint64_t *p, *src, pcount, ccount, i;
443 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
445 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
446 src = rm->rm_col[c].rc_data;
447 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
448 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
450 if (c == rm->rm_firstdatacol) {
451 ASSERT(ccount == pcount);
452 for (i = 0; i < ccount; i++, src++, p++) {
456 ASSERT(ccount <= pcount);
457 for (i = 0; i < ccount; i++, src++, p++) {
465 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
467 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
470 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
471 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
472 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
474 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
475 src = rm->rm_col[c].rc_data;
476 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
477 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
479 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
481 if (c == rm->rm_firstdatacol) {
482 ASSERT(ccnt == pcnt || ccnt == 0);
483 for (i = 0; i < ccnt; i++, src++, p++, q++) {
487 for (; i < pcnt; i++, src++, p++, q++) {
492 ASSERT(ccnt <= pcnt);
495 * Apply the algorithm described above by multiplying
496 * the previous result and adding in the new value.
498 for (i = 0; i < ccnt; i++, src++, p++, q++) {
501 VDEV_RAIDZ_64MUL_2(*q, mask);
506 * Treat short columns as though they are full of 0s.
507 * Note that there's therefore nothing needed for P.
509 for (; i < pcnt; i++, q++) {
510 VDEV_RAIDZ_64MUL_2(*q, mask);
517 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
519 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
522 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
523 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
524 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
525 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
526 rm->rm_col[VDEV_RAIDZ_R].rc_size);
528 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
529 src = rm->rm_col[c].rc_data;
530 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
531 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
532 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
534 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
536 if (c == rm->rm_firstdatacol) {
537 ASSERT(ccnt == pcnt || ccnt == 0);
538 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
543 for (; i < pcnt; i++, src++, p++, q++, r++) {
549 ASSERT(ccnt <= pcnt);
552 * Apply the algorithm described above by multiplying
553 * the previous result and adding in the new value.
555 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
558 VDEV_RAIDZ_64MUL_2(*q, mask);
561 VDEV_RAIDZ_64MUL_4(*r, mask);
566 * Treat short columns as though they are full of 0s.
567 * Note that there's therefore nothing needed for P.
569 for (; i < pcnt; i++, q++, r++) {
570 VDEV_RAIDZ_64MUL_2(*q, mask);
571 VDEV_RAIDZ_64MUL_4(*r, mask);
578 * Generate RAID parity in the first virtual columns according to the number of
579 * parity columns available.
582 vdev_raidz_generate_parity(raidz_map_t *rm)
584 switch (rm->rm_firstdatacol) {
586 vdev_raidz_generate_parity_p(rm);
589 vdev_raidz_generate_parity_pq(rm);
592 vdev_raidz_generate_parity_pqr(rm);
595 panic("invalid RAID-Z configuration");
601 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
603 * the contents of the data and parity disks. This can be expressed with
604 * vectors for the original data (D) and the actual data (d) and parity (p)
605 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
609 * | V | | D_0 | | p_m-1 |
610 * | | x | : | = | d_0 |
611 * | I | | D_n-1 | | : |
612 * | | ~~ ~~ | d_n-1 |
615 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
617 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
618 * computation as well as linear separability.
621 * | 1 .. 1 1 1 | | p_0 |
622 * | 2^n-1 .. 4 2 1 | __ __ | : |
623 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
624 * | 1 .. 0 0 0 | | D_1 | | d_0 |
625 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
626 * | : : : : | | : | | d_2 |
627 * | 0 .. 1 0 0 | | D_n-1 | | : |
628 * | 0 .. 0 1 0 | ~~ ~~ | : |
629 * | 0 .. 0 0 1 | | d_n-1 |
632 * Note that I, V, d, and p are known. To compute D, we must invert the
633 * matrix and use the known data and parity values to reconstruct the unknown
634 * data values. We begin by removing the rows in V|I and d|p that correspond
635 * to failed or missing columns; we then make V|I square (n x n) and d|p
636 * sized n by removing rows corresponding to unused parity from the bottom up
637 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
638 * using Gauss-Jordan elimination. In the example below we use m=3 parity
639 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
641 * | 1 1 1 1 1 1 1 1 |
642 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
643 * | 19 205 116 29 64 16 4 1 | / /
644 * | 1 0 0 0 0 0 0 0 | / /
645 * | 0 1 0 0 0 0 0 0 | <--' /
646 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
647 * | 0 0 0 1 0 0 0 0 |
648 * | 0 0 0 0 1 0 0 0 |
649 * | 0 0 0 0 0 1 0 0 |
650 * | 0 0 0 0 0 0 1 0 |
651 * | 0 0 0 0 0 0 0 1 |
654 * | 1 1 1 1 1 1 1 1 |
655 * | 128 64 32 16 8 4 2 1 |
656 * | 19 205 116 29 64 16 4 1 |
657 * | 1 0 0 0 0 0 0 0 |
658 * | 0 1 0 0 0 0 0 0 |
659 * (V|I)' = | 0 0 1 0 0 0 0 0 |
660 * | 0 0 0 1 0 0 0 0 |
661 * | 0 0 0 0 1 0 0 0 |
662 * | 0 0 0 0 0 1 0 0 |
663 * | 0 0 0 0 0 0 1 0 |
664 * | 0 0 0 0 0 0 0 1 |
667 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
668 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
669 * matrix is not singular.
671 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
672 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
673 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
674 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
675 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
676 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
677 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
678 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
681 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
682 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
683 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
684 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
685 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
686 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
687 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
688 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
691 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
692 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
693 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
694 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
695 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
696 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
697 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
698 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
701 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
702 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
703 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
704 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
705 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
706 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
707 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
708 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
711 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
712 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
713 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
714 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
715 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
716 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
717 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
718 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
721 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
722 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
723 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
724 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
725 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
726 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
727 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
728 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
731 * | 0 0 1 0 0 0 0 0 |
732 * | 167 100 5 41 159 169 217 208 |
733 * | 166 100 4 40 158 168 216 209 |
734 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
735 * | 0 0 0 0 1 0 0 0 |
736 * | 0 0 0 0 0 1 0 0 |
737 * | 0 0 0 0 0 0 1 0 |
738 * | 0 0 0 0 0 0 0 1 |
741 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
742 * of the missing data.
744 * As is apparent from the example above, the only non-trivial rows in the
745 * inverse matrix correspond to the data disks that we're trying to
746 * reconstruct. Indeed, those are the only rows we need as the others would
747 * only be useful for reconstructing data known or assumed to be valid. For
748 * that reason, we only build the coefficients in the rows that correspond to
754 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
760 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
763 * Fill in the missing rows of interest.
765 for (i = 0; i < nmap; i++) {
766 ASSERT3S(0, <=, map[i]);
767 ASSERT3S(map[i], <=, 2);
774 for (j = 0; j < n; j++) {
778 rows[i][j] = vdev_raidz_pow2[pow];
784 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
785 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
791 * Assert that the first nmissing entries from the array of used
792 * columns correspond to parity columns and that subsequent entries
793 * correspond to data columns.
795 for (i = 0; i < nmissing; i++) {
796 ASSERT3S(used[i], <, rm->rm_firstdatacol);
799 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
803 * First initialize the storage where we'll compute the inverse rows.
805 for (i = 0; i < nmissing; i++) {
806 for (j = 0; j < n; j++) {
807 invrows[i][j] = (i == j) ? 1 : 0;
812 * Subtract all trivial rows from the rows of consequence.
814 for (i = 0; i < nmissing; i++) {
815 for (j = nmissing; j < n; j++) {
816 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
817 jj = used[j] - rm->rm_firstdatacol;
819 invrows[i][j] = rows[i][jj];
825 * For each of the rows of interest, we must normalize it and subtract
826 * a multiple of it from the other rows.
828 for (i = 0; i < nmissing; i++) {
829 for (j = 0; j < missing[i]; j++) {
830 ASSERT3U(rows[i][j], ==, 0);
832 ASSERT3U(rows[i][missing[i]], !=, 0);
835 * Compute the inverse of the first element and multiply each
836 * element in the row by that value.
838 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
840 for (j = 0; j < n; j++) {
841 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
842 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
845 for (ii = 0; ii < nmissing; ii++) {
849 ASSERT3U(rows[ii][missing[i]], !=, 0);
851 log = vdev_raidz_log2[rows[ii][missing[i]]];
853 for (j = 0; j < n; j++) {
855 vdev_raidz_exp2(rows[i][j], log);
857 vdev_raidz_exp2(invrows[i][j], log);
863 * Verify that the data that is left in the rows are properly part of
864 * an identity matrix.
866 for (i = 0; i < nmissing; i++) {
867 for (j = 0; j < n; j++) {
868 if (j == missing[i]) {
869 ASSERT3U(rows[i][j], ==, 1);
871 ASSERT3U(rows[i][j], ==, 0);
878 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
879 int *missing, uint8_t **invrows, const uint8_t *used)
884 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
885 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
888 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
893 psize = sizeof (invlog[0][0]) * n * nmissing;
894 p = zfs_alloc(psize);
896 for (pp = p, i = 0; i < nmissing; i++) {
901 for (i = 0; i < nmissing; i++) {
902 for (j = 0; j < n; j++) {
903 ASSERT3U(invrows[i][j], !=, 0);
904 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
908 for (i = 0; i < n; i++) {
910 ASSERT3U(c, <, rm->rm_cols);
912 src = rm->rm_col[c].rc_data;
913 ccount = rm->rm_col[c].rc_size;
914 for (j = 0; j < nmissing; j++) {
915 cc = missing[j] + rm->rm_firstdatacol;
916 ASSERT3U(cc, >=, rm->rm_firstdatacol);
917 ASSERT3U(cc, <, rm->rm_cols);
920 dst[j] = rm->rm_col[cc].rc_data;
921 dcount[j] = rm->rm_col[cc].rc_size;
924 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
926 for (x = 0; x < ccount; x++, src++) {
928 log = vdev_raidz_log2[*src];
930 for (cc = 0; cc < nmissing; cc++) {
937 if ((ll = log + invlog[cc][i]) >= 255)
939 val = vdev_raidz_pow2[ll];
954 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
958 int missing_rows[VDEV_RAIDZ_MAXPARITY];
959 int parity_map[VDEV_RAIDZ_MAXPARITY];
964 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
965 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
971 n = rm->rm_cols - rm->rm_firstdatacol;
974 * Figure out which data columns are missing.
977 for (t = 0; t < ntgts; t++) {
978 if (tgts[t] >= rm->rm_firstdatacol) {
979 missing_rows[nmissing_rows++] =
980 tgts[t] - rm->rm_firstdatacol;
985 * Figure out which parity columns to use to help generate the missing
988 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
990 ASSERT(c < rm->rm_firstdatacol);
993 * Skip any targeted parity columns.
1007 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1009 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1010 nmissing_rows * n + sizeof (used[0]) * n;
1011 p = kmem_alloc(psize, KM_SLEEP);
1013 for (pp = p, i = 0; i < nmissing_rows; i++) {
1021 for (i = 0; i < nmissing_rows; i++) {
1022 used[i] = parity_map[i];
1025 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1026 if (tt < nmissing_rows &&
1027 c == missing_rows[tt] + rm->rm_firstdatacol) {
1038 * Initialize the interesting rows of the matrix.
1040 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1043 * Invert the matrix.
1045 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1049 * Reconstruct the missing data using the generated matrix.
1051 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1054 kmem_free(p, psize);
1060 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1062 int tgts[VDEV_RAIDZ_MAXPARITY];
1066 int nbadparity, nbaddata;
1069 * The tgts list must already be sorted.
1071 for (i = 1; i < nt; i++) {
1072 ASSERT(t[i] > t[i - 1]);
1075 nbadparity = rm->rm_firstdatacol;
1076 nbaddata = rm->rm_cols - nbadparity;
1078 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1079 if (i < nt && c == t[i]) {
1082 } else if (rm->rm_col[c].rc_error != 0) {
1084 } else if (c >= rm->rm_firstdatacol) {
1091 ASSERT(ntgts >= nt);
1092 ASSERT(nbaddata >= 0);
1093 ASSERT(nbaddata + nbadparity == ntgts);
1095 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1096 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1101 static raidz_map_t *
1102 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1103 uint64_t dcols, uint64_t nparity)
1106 uint64_t b = offset >> unit_shift;
1107 uint64_t s = size >> unit_shift;
1108 uint64_t f = b % dcols;
1109 uint64_t o = (b / dcols) << unit_shift;
1110 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1112 q = s / (dcols - nparity);
1113 r = s - q * (dcols - nparity);
1114 bc = (r == 0 ? 0 : r + nparity);
1115 tot = s + nparity * (q + (r == 0 ? 0 : 1));
1119 scols = MIN(dcols, roundup(bc, nparity + 1));
1125 ASSERT3U(acols, <=, scols);
1127 rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));
1129 rm->rm_cols = acols;
1130 rm->rm_scols = scols;
1131 rm->rm_bigcols = bc;
1132 rm->rm_skipstart = bc;
1133 rm->rm_missingdata = 0;
1134 rm->rm_missingparity = 0;
1135 rm->rm_firstdatacol = nparity;
1138 rm->rm_ecksuminjected = 0;
1142 for (c = 0; c < scols; c++) {
1147 coff += 1ULL << unit_shift;
1149 rm->rm_col[c].rc_devidx = col;
1150 rm->rm_col[c].rc_offset = coff;
1151 rm->rm_col[c].rc_data = NULL;
1152 rm->rm_col[c].rc_error = 0;
1153 rm->rm_col[c].rc_tried = 0;
1154 rm->rm_col[c].rc_skipped = 0;
1157 rm->rm_col[c].rc_size = 0;
1159 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1161 rm->rm_col[c].rc_size = q << unit_shift;
1163 asize += rm->rm_col[c].rc_size;
1166 ASSERT3U(asize, ==, tot << unit_shift);
1167 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1168 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1169 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1170 ASSERT3U(rm->rm_nskip, <=, nparity);
1172 for (c = 0; c < rm->rm_firstdatacol; c++)
1173 rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);
1175 rm->rm_col[c].rc_data = data;
1177 for (c = c + 1; c < acols; c++)
1178 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1179 rm->rm_col[c - 1].rc_size;
1182 * If all data stored spans all columns, there's a danger that parity
1183 * will always be on the same device and, since parity isn't read
1184 * during normal operation, that that device's I/O bandwidth won't be
1185 * used effectively. We therefore switch the parity every 1MB.
1187 * ... at least that was, ostensibly, the theory. As a practical
1188 * matter unless we juggle the parity between all devices evenly, we
1189 * won't see any benefit. Further, occasional writes that aren't a
1190 * multiple of the LCM of the number of children and the minimum
1191 * stripe width are sufficient to avoid pessimal behavior.
1192 * Unfortunately, this decision created an implicit on-disk format
1193 * requirement that we need to support for all eternity, but only
1194 * for single-parity RAID-Z.
1196 * If we intend to skip a sector in the zeroth column for padding
1197 * we must make sure to note this swap. We will never intend to
1198 * skip the first column since at least one data and one parity
1199 * column must appear in each row.
1201 ASSERT(rm->rm_cols >= 2);
1202 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1204 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1205 devidx = rm->rm_col[0].rc_devidx;
1206 o = rm->rm_col[0].rc_offset;
1207 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1208 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1209 rm->rm_col[1].rc_devidx = devidx;
1210 rm->rm_col[1].rc_offset = o;
1212 if (rm->rm_skipstart == 0)
1213 rm->rm_skipstart = 1;
1220 vdev_raidz_map_free(raidz_map_t *rm)
1224 for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1225 zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1227 zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
1231 vdev_child(vdev_t *pvd, uint64_t devidx)
1235 STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1236 if (cvd->v_id == devidx)
1244 * We keep track of whether or not there were any injected errors, so that
1245 * any ereports we generate can note it.
1248 raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
1251 return (zio_checksum_verify(bp, data));
1255 * Generate the parity from the data columns. If we tried and were able to
1256 * read the parity without error, verify that the generated parity matches the
1257 * data we read. If it doesn't, we fire off a checksum error. Return the
1258 * number of such failures.
1261 raidz_parity_verify(raidz_map_t *rm)
1263 void *orig[VDEV_RAIDZ_MAXPARITY];
/*
 * Snapshot each parity column that was actually read from disk without
 * error, so the regenerated parity can be compared against it below.
 */
1267 for (c = 0; c < rm->rm_firstdatacol; c++) {
1268 rc = &rm->rm_col[c];
1269 if (!rc->rc_tried || rc->rc_error != 0)
1271 orig[c] = zfs_alloc(rc->rc_size);
1272 bcopy(rc->rc_data, orig[c], rc->rc_size);
/* Recompute parity in place from the data columns. */
1275 vdev_raidz_generate_parity(rm);
/*
 * Compare each saved on-disk parity column with the regenerated
 * parity; any mismatch is flagged as a checksum error on that column.
 */
1277 for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1278 rc = &rm->rm_col[c];
1279 if (!rc->rc_tried || rc->rc_error != 0)
1281 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1282 rc->rc_error = ECKSUM;
1285 zfs_free(orig[c], rc->rc_size);
1292 * Iterate over all combinations of bad data and attempt a reconstruction.
1293 * Note that the algorithm below is non-optimal because it doesn't take into
1294 * account how reconstruction is actually performed. For example, with
1295 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1296 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1297 * cases we'd only use parity information in column 0.
1300 vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
1301 off_t offset, uint64_t bytes, int total_errors, int data_errors)
1304 void *orig[VDEV_RAIDZ_MAXPARITY];
/*
 * tstore has one sentinel slot on each side of the live target array so
 * that tgts[-1] and tgts[n] are always valid references below.
 */
1305 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1306 int *tgts = &tstore[1];
1307 int current, next, i, c, n;
1310 ASSERT(total_errors < rm->rm_firstdatacol);
1313 * This simplifies one edge condition.
/*
 * Outer loop: assume n additional columns (beyond the known errors) are
 * silently bad, for n = 1 up to the remaining parity budget.
 */
1317 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1319 * Initialize the targets array by finding the first n columns
1320 * that contain no error.
1322 * If there were no data errors, we need to ensure that we're
1323 * always explicitly attempting to reconstruct at least one
1324 * data column. To do this, we simply push the highest target
1325 * up into the data columns.
1327 for (c = 0, i = 0; i < n; i++) {
1328 if (i == n - 1 && data_errors == 0 &&
1329 c < rm->rm_firstdatacol) {
1330 c = rm->rm_firstdatacol;
1333 while (rm->rm_col[c].rc_error != 0) {
1335 ASSERT3S(c, <, rm->rm_cols);
1342 * Setting tgts[n] simplifies the other edge condition.
1344 tgts[n] = rm->rm_cols;
1347 * These buffers were allocated in previous iterations.
1349 for (i = 0; i < n - 1; i++) {
1350 ASSERT(orig[i] != NULL);
/* Only the newest scratch buffer needs to be allocated this round. */
1353 orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);
/*
 * Enumerate every combination of n target columns, odometer-style:
 * advance the current target to the next error-free column, carrying
 * into the neighboring positions when a target runs out of room.
 */
1356 next = tgts[current];
1358 while (current != n) {
1359 tgts[current] = next;
1363 * Save off the original data that we're going to
1364 * attempt to reconstruct.
1366 for (i = 0; i < n; i++) {
1367 ASSERT(orig[i] != NULL);
1370 ASSERT3S(c, <, rm->rm_cols);
1371 rc = &rm->rm_col[c];
1372 bcopy(rc->rc_data, orig[i], rc->rc_size);
1376 * Attempt a reconstruction and exit the outer loop on
1379 code = vdev_raidz_reconstruct(rm, tgts, n);
1380 if (raidz_checksum_verify(bp, data, bytes) == 0) {
/* Success: mark every reconstructed column as a checksum failure. */
1381 for (i = 0; i < n; i++) {
1383 rc = &rm->rm_col[c];
1384 ASSERT(rc->rc_error == 0);
1385 rc->rc_error = ECKSUM;
1393 * Restore the original data.
1395 for (i = 0; i < n; i++) {
1397 rc = &rm->rm_col[c];
1398 bcopy(orig[i], rc->rc_data, rc->rc_size);
1403 * Find the next valid column after the current
1406 for (next = tgts[current] + 1;
1407 next < rm->rm_cols &&
1408 rm->rm_col[next].rc_error != 0; next++)
1411 ASSERT(next <= tgts[current + 1]);
1414 * If that spot is available, we're done here.
1416 if (next != tgts[current + 1])
1420 * Otherwise, find the next valid column after
1421 * the previous position.
1423 for (c = tgts[current - 1] + 1;
1424 rm->rm_col[c].rc_error != 0; c++)
1430 } while (current != n);
/* Free the reconstruction scratch buffers before returning. */
1435 for (i = n - 1; i >= 0; i--) {
1436 zfs_free(orig[i], rm->rm_col[0].rc_size);
/*
 * Top-level RAID-Z read path for the boot code: map the logical range
 * onto the child columns, issue the reads, and then run up to three
 * progressively more expensive recovery phases (see the phase comment
 * in the body below).
 */
1443 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1444 off_t offset, size_t bytes)
1446 vdev_t *tvd = vd->v_top;
1451 int unexpected_errors;
1457 int tgts[VDEV_RAIDZ_MAXPARITY];
1460 rc = NULL; /* gcc */
1463 rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1464 vd->v_nchildren, vd->v_nparity);
1467 * Iterate over the columns in reverse order so that we hit the parity
1468 * last -- any errors along the way will force us to read the parity.
1470 for (c = rm->rm_cols - 1; c >= 0; c--) {
1471 rc = &rm->rm_col[c];
1472 cvd = vdev_child(vd, rc->rc_devidx);
/* A missing or unhealthy child is marked failed without issuing I/O. */
1473 if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1474 if (c >= rm->rm_firstdatacol)
1475 rm->rm_missingdata++;
1477 rm->rm_missingparity++;
1478 rc->rc_error = ENXIO;
1479 rc->rc_tried = 1; /* don't even try */
/* NOTE(review): the kernel's DTL staleness check is compiled out here. */
1483 #if 0 /* XXX: Too hard for the boot code. */
1484 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1485 if (c >= rm->rm_firstdatacol)
1486 rm->rm_missingdata++;
1488 rm->rm_missingparity++;
1489 rc->rc_error = ESTALE;
/* Read data columns always; read parity only if data is already missing. */
1494 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1495 rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1496 rc->rc_offset, rc->rc_size);
1503 unexpected_errors = 0;
1509 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1510 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
/* Classify the per-column results: parity vs. data, expected vs. not. */
1512 for (c = 0; c < rm->rm_cols; c++) {
1513 rc = &rm->rm_col[c];
1516 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1518 if (c < rm->rm_firstdatacol)
1523 if (!rc->rc_skipped)
1524 unexpected_errors++;
1527 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1533 * There are three potential phases for a read:
1534 * 1. produce valid data from the columns read
1535 * 2. read all disks and try again
1536 * 3. perform combinatorial reconstruction
1538 * Each phase is progressively both more expensive and less likely to
1539 * occur. If we encounter more errors than we can repair or all phases
1540 * fail, we have no choice but to return an error.
1544 * If the number of errors we saw was correctable -- less than or equal
1545 * to the number of parity disks read -- attempt to produce data that
1546 * has a valid checksum. Naturally, this case applies in the absence of
1549 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1550 if (data_errors == 0) {
1551 if (raidz_checksum_verify(bp, data, bytes) == 0) {
1553 * If we read parity information (unnecessarily
1554 * as it happens since no reconstruction was
1555 * needed) regenerate and verify the parity.
1556 * We also regenerate parity when resilvering
1557 * so we can write it out to the failed device
1560 if (parity_errors + parity_untried <
1561 rm->rm_firstdatacol) {
1562 n = raidz_parity_verify(rm);
1563 unexpected_errors += n;
1564 ASSERT(parity_errors + n <=
1565 rm->rm_firstdatacol);
1571 * We either attempt to read all the parity columns or
1572 * none of them. If we didn't try to read parity, we
1573 * wouldn't be here in the correctable case. There must
1574 * also have been fewer parity errors than parity
1575 * columns or, again, we wouldn't be in this code path.
1577 ASSERT(parity_untried == 0);
1578 ASSERT(parity_errors < rm->rm_firstdatacol);
1581 * Identify the data columns that reported an error.
1584 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1585 rc = &rm->rm_col[c];
1586 if (rc->rc_error != 0) {
1587 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1592 ASSERT(rm->rm_firstdatacol >= n);
/* Rebuild the failed data columns from parity, then re-verify. */
1594 code = vdev_raidz_reconstruct(rm, tgts, n);
1596 if (raidz_checksum_verify(bp, data, bytes) == 0) {
1598 * If we read more parity disks than were used
1599 * for reconstruction, confirm that the other
1600 * parity disks produced correct data. This
1601 * routine is suboptimal in that it regenerates
1602 * the parity that we already used in addition
1603 * to the parity that we're attempting to
1604 * verify, but this should be a relatively
1605 * uncommon case, and can be optimized if it
1606 * becomes a problem. Note that we regenerate
1607 * parity when resilvering so we can write it
1608 * out to failed devices later.
1610 if (parity_errors < rm->rm_firstdatacol - n) {
1611 n = raidz_parity_verify(rm);
1612 unexpected_errors += n;
1613 ASSERT(parity_errors + n <=
1614 rm->rm_firstdatacol);
1623 * This isn't a typical situation -- either we got a read
1624 * error or a child silently returned bad data. Read every
1625 * block so we can try again with as much data and parity as
1626 * we can track down. If we've already been through once
1627 * before, all children will be marked as tried so we'll
1628 * proceed to combinatorial reconstruction.
1630 unexpected_errors = 1;
1631 rm->rm_missingdata = 0;
1632 rm->rm_missingparity = 0;
/* Phase 2: issue reads for every column not yet tried. */
1635 for (c = 0; c < rm->rm_cols; c++) {
1636 rc = &rm->rm_col[c];
1641 cvd = vdev_child(vd, rc->rc_devidx);
1642 ASSERT(cvd != NULL);
1643 rc->rc_error = cvd->v_read(cvd, NULL,
1644 rc->rc_data, rc->rc_offset, rc->rc_size);
1645 if (rc->rc_error == 0)
1651 * If we managed to read anything more, retry the
1658 * At this point we've attempted to reconstruct the data given the
1659 * errors we detected, and we've attempted to read all columns. There
1660 * must, therefore, be one or more additional problems -- silent errors
1661 * resulting in invalid data rather than explicit I/O errors resulting
1662 * in absent data. We check if there is enough additional data to
1663 * possibly reconstruct the data and then perform combinatorial
1664 * reconstruction over all possible combinations. If that fails,
1667 if (total_errors > rm->rm_firstdatacol) {
1669 } else if (total_errors < rm->rm_firstdatacol &&
1670 (code = vdev_raidz_combrec(rm, bp, data, offset, bytes,
1671 total_errors, data_errors)) != 0) {
1673 * If we didn't use all the available parity for the
1674 * combinatorial reconstruction, verify that the remaining
1675 * parity is correct.
1677 if (code != (1 << rm->rm_firstdatacol) - 1)
1678 (void) raidz_parity_verify(rm);
1681 * We're here because either:
1683 * total_errors == rm_first_datacol, or
1684 * vdev_raidz_combrec() failed
1686 * In either case, there is enough bad data to prevent
1689 * Start checksum ereports for all children which haven't
1690 * failed, and the IO wasn't speculative.
1696 vdev_raidz_map_free(rm);