4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
/* Lazily-initialized CRC-64 lookup table used by zap_hash() below. */
29 static uint64_t zfs_crc64_table[256];
/* Boot-loader environment: assertions compile to no-ops. */
33 #define ASSERT(...) do { } while (0)
34 #define ASSERT3U(...) do { } while (0)
35 #define ASSERT3S(...) do { } while (0)
/*
 * panic() maps to a printf here; the remainder of the macro body is not
 * visible in this view -- presumably it halts, TODO confirm.
 */
37 #define panic(...) do { \
38 printf(__VA_ARGS__); \
/* Map kernel kmem allocators onto the loader's zfs_alloc/zfs_free. */
42 #define kmem_alloc(size, flag) zfs_alloc((size))
43 #define kmem_free(ptr, size) zfs_free((ptr), (size))
52 * Calculate the crc64 table (used for the zap hash
/*
 * Once filled in, zfs_crc64_table[128] equals ZFS_CRC64_POLY, so that
 * entry doubles as an "already initialized" flag.
 */
55 if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
56 memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
57 for (i = 0; i < 256; i++)
/* Standard reflected CRC-64: shift right, conditionally XOR the poly. */
58 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
59 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
/* "off" checksum: always produces an all-zero checksum. */
64 zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
66 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
70 * Signature for checksum functions.
72 typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
75 * Information about each checksum function.
77 typedef struct zio_checksum_info {
78 zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
79 int ci_correctable; /* number of correctable bits */
80 int ci_eck; /* uses zio embedded checksum? */
81 int ci_dedup; /* strong enough for dedup? */
82 const char *ci_name; /* descriptive name */
83 } zio_checksum_info_t;
/*
 * One row per ZIO_CHECKSUM_* value; the row order must match the enum.
 * ci_func[0] is the native-byteorder function, ci_func[1] the byteswapped one.
 */
88 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
89 {{NULL, NULL}, 0, 0, 0, "inherit"},
90 {{NULL, NULL}, 0, 0, 0, "on"},
91 {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"},
92 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"},
93 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"},
94 {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"},
95 {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"},
96 {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
97 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "SHA256"},
/* NOTE(review): upstream spells this name "zilog2" -- verify the string. */
98 {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zillog2"},
103 * Common signature for all zio compress/decompress functions.
105 typedef size_t zio_compress_func_t(void *src, void *dst,
106 size_t s_len, size_t d_len, int);
/* Decompress returns an int status; the trailing int is the level parameter. */
107 typedef int zio_decompress_func_t(void *src, void *dst,
108 size_t s_len, size_t d_len, int);
111 * Information about each compression function.
113 typedef struct zio_compress_info {
114 zio_compress_func_t *ci_compress; /* compression function */
115 zio_decompress_func_t *ci_decompress; /* decompression function */
116 int ci_level; /* level parameter */
117 const char *ci_name; /* algorithm name */
118 } zio_compress_info_t;
124 * Compression vectors.
/*
 * Indexed by ZIO_COMPRESS_*.  The boot loader only ever decompresses, so
 * ci_compress is NULL throughout; algorithms with a NULL ci_decompress
 * (gzip-*, empty) are reported as unsupported by zio_decompress_data().
 */
126 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
127 {NULL, NULL, 0, "inherit"},
128 {NULL, NULL, 0, "on"},
129 {NULL, NULL, 0, "uncompressed"},
130 {NULL, lzjb_decompress, 0, "lzjb"},
131 {NULL, NULL, 0, "empty"},
132 {NULL, NULL, 1, "gzip-1"},
133 {NULL, NULL, 2, "gzip-2"},
134 {NULL, NULL, 3, "gzip-3"},
135 {NULL, NULL, 4, "gzip-4"},
136 {NULL, NULL, 5, "gzip-5"},
137 {NULL, NULL, 6, "gzip-6"},
138 {NULL, NULL, 7, "gzip-7"},
139 {NULL, NULL, 8, "gzip-8"},
140 {NULL, NULL, 9, "gzip-9"},
141 {NULL, zle_decompress, 64, "zle"},
/*
 * Byte-swap an array of uint64_t in place.  size is in bytes and, per the
 * assertion below, must be a multiple of 8.
 */
145 byteswap_uint64_array(void *vbuf, size_t size)
147 uint64_t *buf = vbuf;
148 size_t count = size >> 3;
151 ASSERT((size & 7) == 0);
153 for (i = 0; i < count; i++)
154 buf[i] = BSWAP_64(buf[i]);
158 * Set the external verifier for a gang block based on <vdev, offset, txg>,
159 * a tuple which is guaranteed to be unique for the life of the pool.
162 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
164 const dva_t *dva = BP_IDENTITY(bp);
165 uint64_t txg = BP_PHYSICAL_BIRTH(bp);
167 ASSERT(BP_IS_GANG(bp));
/* Verifier words: vdev, offset, physical birth txg, 0. */
169 ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
173 * Set the external verifier for a label block based on its offset.
174 * The vdev is implicit, and the txg is unknowable at pool open time --
175 * hence the logic in vdev_uberblock_load() to find the most recent copy.
178 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
180 ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
/*
 * Verify the checksum of a block against its block pointer (or, for
 * embedded-checksum blocks, against the zio_eck_t stored in the data).
 * Returns nonzero on mismatch or unsupported checksum; the exact error
 * returns are elided from this view.
 */
184 zio_checksum_error(const blkptr_t *bp, void *data, uint64_t offset)
186 unsigned int checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp);
187 uint64_t size = BP_GET_PSIZE(bp);
188 zio_checksum_info_t *ci;
189 zio_cksum_t actual_cksum, expected_cksum, verifier;
192 if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
194 ci = &zio_checksum_table[checksum];
195 if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
/* Embedded-checksum path (ci_eck): only gang headers and labels here. */
201 ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
202 checksum == ZIO_CHECKSUM_LABEL);
/* The zio_eck_t lives in the last sizeof(zio_eck_t) bytes of the buffer. */
204 eck = (zio_eck_t *)((char *)data + size) - 1;
206 if (checksum == ZIO_CHECKSUM_GANG_HEADER)
207 zio_checksum_gang_verifier(&verifier, bp);
208 else if (checksum == ZIO_CHECKSUM_LABEL)
209 zio_checksum_label_verifier(&verifier, offset);
211 verifier = bp->blk_cksum;
/* A byteswapped zec_magic tells us the block was written other-endian. */
213 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
216 byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
/*
 * Temporarily substitute the verifier for the embedded checksum while
 * computing, then restore the buffer to its original contents.
 */
218 expected_cksum = eck->zec_cksum;
219 eck->zec_cksum = verifier;
220 ci->ci_func[byteswap](data, size, &actual_cksum);
221 eck->zec_cksum = expected_cksum;
224 byteswap_uint64_array(&expected_cksum,
225 sizeof (zio_cksum_t));
/* Non-embedded path: expected checksum comes from the block pointer. */
227 ASSERT(!BP_IS_GANG(bp));
228 expected_cksum = bp->blk_cksum;
229 ci->ci_func[0](data, size, &actual_cksum);
232 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
233 /*printf("ZFS: read checksum failed\n");*/
/*
 * Decompress srcsize bytes at src into dest using compression algorithm
 * cpfunc (a ZIO_COMPRESS_* value).  Fails for out-of-range or unsupported
 * algorithms; otherwise returns the decompressor's status.
 */
241 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
242 void *dest, uint64_t destsize)
244 zio_compress_info_t *ci;
246 if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
247 printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
251 ci = &zio_compress_table[cpfunc];
252 if (!ci->ci_decompress) {
253 printf("ZFS: unsupported compression algorithm %s\n",
258 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
/*
 * CRC-64 based hash of a NUL-terminated name, seeded with the ZAP salt.
 * Requires zfs_crc64_table to have been initialized.
 */
262 zap_hash(uint64_t salt, const char *name)
269 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY)
270 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
271 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
274 * Only use 28 bits, since we need 4 bits in the cookie for the
275 * collision differentiator. We MUST use the high bits, since
276 * those are the ones that we first pay attention to when
277 * choosing the bucket.
279 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
/* Loader-local allocator hooks, defined later in this file. */
284 static void *zfs_alloc(size_t size);
285 static void zfs_free(void *ptr, size_t size);
/* Per-column state for one RAID-Z I/O. */
287 typedef struct raidz_col {
288 uint64_t rc_devidx; /* child device index for I/O */
289 uint64_t rc_offset; /* device offset */
290 uint64_t rc_size; /* I/O size */
291 void *rc_data; /* I/O data */
292 int rc_error; /* I/O error for this device */
293 uint8_t rc_tried; /* Did we attempt this I/O column? */
294 uint8_t rc_skipped; /* Did we skip this I/O column? */
/* Geometry and state of one logical RAID-Z I/O across all columns. */
297 typedef struct raidz_map {
298 uint64_t rm_cols; /* Regular column count */
299 uint64_t rm_scols; /* Count including skipped columns */
300 uint64_t rm_bigcols; /* Number of oversized columns */
301 uint64_t rm_asize; /* Actual total I/O size */
302 uint64_t rm_missingdata; /* Count of missing data devices */
303 uint64_t rm_missingparity; /* Count of missing parity devices */
304 uint64_t rm_firstdatacol; /* First data column/parity count */
305 uint64_t rm_nskip; /* Skipped sectors for padding */
306 uint64_t rm_skipstart; /* Column index of padding start */
307 uintptr_t rm_reports; /* # of referencing checksum reports */
308 uint8_t rm_freed; /* map no longer has referencing ZIO */
309 uint8_t rm_ecksuminjected; /* checksum error was injected */
310 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
/* Parity column indices within rm_col[]. */
313 #define VDEV_RAIDZ_P 0
314 #define VDEV_RAIDZ_Q 1
315 #define VDEV_RAIDZ_R 2
/*
 * Multiply a byte by 2 (resp. 4) in GF(2^8) with reduction polynomial
 * x^8 + x^4 + x^3 + x^2 + 1 (the 0x1d term below).
 */
317 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
318 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
321 * We provide a mechanism to perform the field multiplication operation on a
322 * 64-bit value all at once rather than a byte at a time. This works by
323 * creating a mask from the top bit in each byte and using that to
324 * conditionally apply the XOR of 0x1d.
326 #define VDEV_RAIDZ_64MUL_2(x, mask) \
328 (mask) = (x) & 0x8080808080808080ULL; \
329 (mask) = ((mask) << 1) - ((mask) >> 7); \
330 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
331 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
334 #define VDEV_RAIDZ_64MUL_4(x, mask) \
336 VDEV_RAIDZ_64MUL_2((x), mask); \
337 VDEV_RAIDZ_64MUL_2((x), mask); \
341 * These two tables represent powers and logs of 2 in the Galois field defined
342 * above. These values were computed by repeatedly multiplying by 2 as above.
/* vdev_raidz_pow2[i] == 2^i in GF(2^8); note entry 255 wraps back to 0x01. */
344 static const uint8_t vdev_raidz_pow2[256] = {
345 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
346 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
347 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
348 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
349 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
350 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
351 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
352 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
353 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
354 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
355 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
356 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
357 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
358 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
359 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
360 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
361 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
362 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
363 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
364 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
365 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
366 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
367 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
368 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
369 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
370 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
371 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
372 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
373 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
374 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
375 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
376 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
/* vdev_raidz_log2[i] == log base 2 of i in GF(2^8); log2[0] is unused. */
378 static const uint8_t vdev_raidz_log2[256] = {
379 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
380 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
381 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
382 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
383 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
384 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
385 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
386 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
387 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
388 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
389 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
390 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
391 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
392 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
393 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
394 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
395 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
396 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
397 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
398 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
399 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
400 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
401 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
402 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
403 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
404 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
405 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
406 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
407 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
408 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
409 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
410 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
414 * Multiply a given number by 2 raised to the given power.
/* Log-domain multiply: a * 2^exp == pow2[(log2[a] + exp) mod 255]. */
417 vdev_raidz_exp2(uint8_t a, int exp)
423 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
425 exp += vdev_raidz_log2[a];
/* NOTE(review): the mod-255 wrap of exp is not visible in this view -- confirm. */
429 return (vdev_raidz_pow2[exp]);
/*
 * Generate single (P) parity: XOR of all data columns, 64 bits at a time.
 * The first data column initializes P; subsequent columns XOR into it.
 */
433 vdev_raidz_generate_parity_p(raidz_map_t *rm)
435 uint64_t *p, *src, pcount, ccount, i;
438 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
440 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
441 src = rm->rm_col[c].rc_data;
442 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
443 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
445 if (c == rm->rm_firstdatacol) {
446 ASSERT(ccount == pcount);
/* First column: copy into P. */
447 for (i = 0; i < ccount; i++, src++, p++) {
/* Remaining columns (possibly shorter): XOR into P. */
451 ASSERT(ccount <= pcount);
452 for (i = 0; i < ccount; i++, src++, p++) {
/*
 * Generate P (XOR) and Q (GF(2^8) multiply-by-2 accumulation) parity.
 * Q is built Horner-style: multiply the running Q by 2, then add (XOR)
 * the next data column.
 */
460 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
462 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
465 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
466 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
467 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
469 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
470 src = rm->rm_col[c].rc_data;
471 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
472 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
474 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
476 if (c == rm->rm_firstdatacol) {
477 ASSERT(ccnt == pcnt || ccnt == 0);
/* First column seeds both P and Q. */
478 for (i = 0; i < ccnt; i++, src++, p++, q++) {
/* Pad a short first column out with zeros. */
482 for (; i < pcnt; i++, src++, p++, q++) {
487 ASSERT(ccnt <= pcnt);
490 * Apply the algorithm described above by multiplying
491 * the previous result and adding in the new value.
493 for (i = 0; i < ccnt; i++, src++, p++, q++) {
496 VDEV_RAIDZ_64MUL_2(*q, mask);
501 * Treat short columns as though they are full of 0s.
502 * Note that there's therefore nothing needed for P.
504 for (; i < pcnt; i++, q++) {
505 VDEV_RAIDZ_64MUL_2(*q, mask);
/*
 * Generate P, Q, and R parity.  Same Horner scheme as the PQ case, with
 * R accumulated using multiply-by-4 instead of multiply-by-2.
 */
512 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
514 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
517 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
518 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
519 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
520 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
521 rm->rm_col[VDEV_RAIDZ_R].rc_size);
523 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
524 src = rm->rm_col[c].rc_data;
525 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
526 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
527 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
529 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
531 if (c == rm->rm_firstdatacol) {
532 ASSERT(ccnt == pcnt || ccnt == 0);
/* First column seeds P, Q, and R. */
533 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
/* Pad a short first column out with zeros. */
538 for (; i < pcnt; i++, src++, p++, q++, r++) {
544 ASSERT(ccnt <= pcnt);
547 * Apply the algorithm described above by multiplying
548 * the previous result and adding in the new value.
550 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
553 VDEV_RAIDZ_64MUL_2(*q, mask);
556 VDEV_RAIDZ_64MUL_4(*r, mask);
561 * Treat short columns as though they are full of 0s.
562 * Note that there's therefore nothing needed for P.
564 for (; i < pcnt; i++, q++, r++) {
565 VDEV_RAIDZ_64MUL_2(*q, mask);
566 VDEV_RAIDZ_64MUL_4(*r, mask);
573 * Generate RAID parity in the first virtual columns according to the number of
574 * parity columns available.
577 vdev_raidz_generate_parity(raidz_map_t *rm)
/* rm_firstdatacol doubles as the parity-column count (1, 2, or 3). */
579 switch (rm->rm_firstdatacol) {
581 vdev_raidz_generate_parity_p(rm);
584 vdev_raidz_generate_parity_pq(rm);
587 vdev_raidz_generate_parity_pqr(rm);
590 panic("invalid RAID-Z configuration");
596 * In the general case of reconstruction, we must solve the system of linear
597 * equations defined by the coefficients used to generate parity as well as
598 * the contents of the data and parity disks. This can be expressed with
599 * vectors for the original data (D) and the actual data (d) and parity (p)
600 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
604 * | V | | D_0 | | p_m-1 |
605 * | | x | : | = | d_0 |
606 * | I | | D_n-1 | | : |
607 * | | ~~ ~~ | d_n-1 |
610 * I is simply a square identity matrix of size n, and V is a vandermonde
611 * matrix defined by the coefficients we chose for the various parity columns
612 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
613 * computation as well as linear separability.
616 * | 1 .. 1 1 1 | | p_0 |
617 * | 2^n-1 .. 4 2 1 | __ __ | : |
618 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
619 * | 1 .. 0 0 0 | | D_1 | | d_0 |
620 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
621 * | : : : : | | : | | d_2 |
622 * | 0 .. 1 0 0 | | D_n-1 | | : |
623 * | 0 .. 0 1 0 | ~~ ~~ | : |
624 * | 0 .. 0 0 1 | | d_n-1 |
627 * Note that I, V, d, and p are known. To compute D, we must invert the
628 * matrix and use the known data and parity values to reconstruct the unknown
629 * data values. We begin by removing the rows in V|I and d|p that correspond
630 * to failed or missing columns; we then make V|I square (n x n) and d|p
631 * sized n by removing rows corresponding to unused parity from the bottom up
632 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
633 * using Gauss-Jordan elimination. In the example below we use m=3 parity
634 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
636 * | 1 1 1 1 1 1 1 1 |
637 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
638 * | 19 205 116 29 64 16 4 1 | / /
639 * | 1 0 0 0 0 0 0 0 | / /
640 * | 0 1 0 0 0 0 0 0 | <--' /
641 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
642 * | 0 0 0 1 0 0 0 0 |
643 * | 0 0 0 0 1 0 0 0 |
644 * | 0 0 0 0 0 1 0 0 |
645 * | 0 0 0 0 0 0 1 0 |
646 * | 0 0 0 0 0 0 0 1 |
649 * | 1 1 1 1 1 1 1 1 |
650 * | 128 64 32 16 8 4 2 1 |
651 * | 19 205 116 29 64 16 4 1 |
652 * | 1 0 0 0 0 0 0 0 |
653 * | 0 1 0 0 0 0 0 0 |
654 * (V|I)' = | 0 0 1 0 0 0 0 0 |
655 * | 0 0 0 1 0 0 0 0 |
656 * | 0 0 0 0 1 0 0 0 |
657 * | 0 0 0 0 0 1 0 0 |
658 * | 0 0 0 0 0 0 1 0 |
659 * | 0 0 0 0 0 0 0 1 |
662 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
663 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
664 * matrix is not singular.
666 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
667 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
668 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
669 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
670 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
671 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
672 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
673 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
676 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
677 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
678 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
679 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
680 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
681 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
682 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
683 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
686 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
687 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
688 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
689 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
690 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
691 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
692 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
693 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
696 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
697 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
698 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
699 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
700 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
701 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
702 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
703 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
706 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
707 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
708 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
709 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
710 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
711 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
712 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
713 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
716 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
717 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
718 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
719 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
720 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
721 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
722 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
723 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
726 * | 0 0 1 0 0 0 0 0 |
727 * | 167 100 5 41 159 169 217 208 |
728 * | 166 100 4 40 158 168 216 209 |
729 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
730 * | 0 0 0 0 1 0 0 0 |
731 * | 0 0 0 0 0 1 0 0 |
732 * | 0 0 0 0 0 0 1 0 |
733 * | 0 0 0 0 0 0 0 1 |
736 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
737 * of the missing data.
739 * As is apparent from the example above, the only non-trivial rows in the
740 * inverse matrix correspond to the data disks that we're trying to
741 * reconstruct. Indeed, those are the only rows we need as the others would
742 * only be useful for reconstructing data known or assumed to be valid. For
743 * that reason, we only build the coefficients in the rows that correspond to
/*
 * Fill in the rows of the (V|I) matrix that correspond to the parity
 * columns listed in map[] (0 = P, 1 = Q, 2 = R), using powers of 2 from
 * the GF(2^8) pow2 table.
 */
749 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
755 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
758 * Fill in the missing rows of interest.
760 for (i = 0; i < nmap; i++) {
761 ASSERT3S(0, <=, map[i]);
762 ASSERT3S(map[i], <=, 2);
769 for (j = 0; j < n; j++) {
/* NOTE(review): computation of pow is elided in this view -- confirm. */
773 rows[i][j] = vdev_raidz_pow2[pow];
/*
 * Invert the reduced (V|I)' matrix via Gauss-Jordan elimination over
 * GF(2^8).  On return, invrows[] holds the rows of the inverse needed to
 * reconstruct the missing data columns.
 */
779 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
780 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
786 * Assert that the first nmissing entries from the array of used
787 * columns correspond to parity columns and that subsequent entries
788 * correspond to data columns.
790 for (i = 0; i < nmissing; i++) {
791 ASSERT3S(used[i], <, rm->rm_firstdatacol);
794 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
798 * First initialize the storage where we'll compute the inverse rows.
800 for (i = 0; i < nmissing; i++) {
801 for (j = 0; j < n; j++) {
802 invrows[i][j] = (i == j) ? 1 : 0;
807 * Subtract all trivial rows from the rows of consequence.
809 for (i = 0; i < nmissing; i++) {
810 for (j = nmissing; j < n; j++) {
811 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
812 jj = used[j] - rm->rm_firstdatacol;
814 invrows[i][j] = rows[i][jj];
820 * For each of the rows of interest, we must normalize it and subtract
821 * a multiple of it from the other rows.
823 for (i = 0; i < nmissing; i++) {
824 for (j = 0; j < missing[i]; j++) {
825 ASSERT3U(rows[i][j], ==, 0);
827 ASSERT3U(rows[i][missing[i]], !=, 0);
830 * Compute the inverse of the first element and multiply each
831 * element in the row by that value.
833 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
835 for (j = 0; j < n; j++) {
836 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
837 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
/* Eliminate this pivot column from every other row of interest. */
840 for (ii = 0; ii < nmissing; ii++) {
844 ASSERT3U(rows[ii][missing[i]], !=, 0);
846 log = vdev_raidz_log2[rows[ii][missing[i]]];
848 for (j = 0; j < n; j++) {
850 vdev_raidz_exp2(rows[i][j], log);
852 vdev_raidz_exp2(invrows[i][j], log);
858 * Verify that the data that is left in the rows are properly part of
859 * an identity matrix.
861 for (i = 0; i < nmissing; i++) {
862 for (j = 0; j < n; j++) {
863 if (j == missing[i]) {
864 ASSERT3U(rows[i][j], ==, 1);
866 ASSERT3U(rows[i][j], ==, 0);
/*
 * Multiply the inverse-matrix rows by the surviving (d|p)' columns to
 * regenerate the missing data columns in place.  Works in the log domain:
 * invlog[] caches the GF(2^8) logs of the inverse-row entries.
 */
873 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
874 int *missing, uint8_t **invrows, const uint8_t *used)
879 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
880 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
883 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
/* One scratch allocation carved up into nmissing rows of n logs each. */
888 psize = sizeof (invlog[0][0]) * n * nmissing;
889 p = zfs_alloc(psize);
891 for (pp = p, i = 0; i < nmissing; i++) {
896 for (i = 0; i < nmissing; i++) {
897 for (j = 0; j < n; j++) {
898 ASSERT3U(invrows[i][j], !=, 0);
899 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
/* Walk each surviving column and scatter its contribution to each dst. */
903 for (i = 0; i < n; i++) {
905 ASSERT3U(c, <, rm->rm_cols);
907 src = rm->rm_col[c].rc_data;
908 ccount = rm->rm_col[c].rc_size;
909 for (j = 0; j < nmissing; j++) {
910 cc = missing[j] + rm->rm_firstdatacol;
911 ASSERT3U(cc, >=, rm->rm_firstdatacol);
912 ASSERT3U(cc, <, rm->rm_cols);
915 dst[j] = rm->rm_col[cc].rc_data;
916 dcount[j] = rm->rm_col[cc].rc_size;
919 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
921 for (x = 0; x < ccount; x++, src++) {
923 log = vdev_raidz_log2[*src];
925 for (cc = 0; cc < nmissing; cc++) {
/* Reduce the exponent sum mod 255 before indexing pow2. */
932 if ((ll = log + invlog[cc][i]) >= 255)
934 val = vdev_raidz_pow2[ll];
/*
 * General matrix-based reconstruction for any combination of up to
 * VDEV_RAIDZ_MAXPARITY missing columns: pick parity rows, build the
 * matrix, invert it, and reconstruct.  Returns a code identifying the
 * parity used (asserted below to fit in VDEV_RAIDZ_MAXPARITY bits).
 */
949 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
953 int missing_rows[VDEV_RAIDZ_MAXPARITY];
954 int parity_map[VDEV_RAIDZ_MAXPARITY];
959 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
960 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
966 n = rm->rm_cols - rm->rm_firstdatacol;
969 * Figure out which data columns are missing.
972 for (t = 0; t < ntgts; t++) {
973 if (tgts[t] >= rm->rm_firstdatacol) {
974 missing_rows[nmissing_rows++] =
975 tgts[t] - rm->rm_firstdatacol;
980 * Figure out which parity columns to use to help generate the missing
983 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
985 ASSERT(c < rm->rm_firstdatacol);
988 * Skip any targeted parity columns.
1002 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
/* Single allocation holding rows, invrows, and the used[] array. */
1004 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1005 nmissing_rows * n + sizeof (used[0]) * n;
1006 p = kmem_alloc(psize, KM_SLEEP);
1008 for (pp = p, i = 0; i < nmissing_rows; i++) {
/* used[]: the parity columns first, then the surviving data columns. */
1016 for (i = 0; i < nmissing_rows; i++) {
1017 used[i] = parity_map[i];
1020 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1021 if (tt < nmissing_rows &&
1022 c == missing_rows[tt] + rm->rm_firstdatacol) {
1033 * Initialize the interesting rows of the matrix.
1035 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1038 * Invert the matrix.
1040 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1044 * Reconstruct the missing data using the generated matrix.
1046 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1049 kmem_free(p, psize);
/*
 * Top-level reconstruction entry point.  t[] is a sorted list of nt
 * target columns; any additional columns already marked with rc_error
 * are folded into the target set before calling the general routine.
 */
1055 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1057 int tgts[VDEV_RAIDZ_MAXPARITY];
1061 int nbadparity, nbaddata;
1064 * The tgts list must already be sorted.
1066 for (i = 1; i < nt; i++) {
1067 ASSERT(t[i] > t[i - 1]);
/* Start assuming all parity is bad and all data good, then correct. */
1070 nbadparity = rm->rm_firstdatacol;
1071 nbaddata = rm->rm_cols - nbadparity;
1073 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1074 if (i < nt && c == t[i]) {
1077 } else if (rm->rm_col[c].rc_error != 0) {
1079 } else if (c >= rm->rm_firstdatacol) {
1086 ASSERT(ntgts >= nt);
1087 ASSERT(nbaddata >= 0);
1088 ASSERT(nbaddata + nbadparity == ntgts);
1090 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1091 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1096 static raidz_map_t *
/*
 * Lay out a logical I/O of `size` bytes at `offset` across `dcols` child
 * devices with `nparity` parity columns, producing a raidz_map_t that
 * records per-column device, offset, and size.  Parity column buffers are
 * allocated here; data columns point into the caller's `data` buffer.
 */
1097 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1098 uint64_t dcols, uint64_t nparity)
/* b/s: starting sector and sector count; f: first child; o: its offset. */
1101 uint64_t b = offset >> unit_shift;
1102 uint64_t s = size >> unit_shift;
1103 uint64_t f = b % dcols;
1104 uint64_t o = (b / dcols) << unit_shift;
1105 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
/* q: full rows; r: leftover sectors; bc: count of oversized columns. */
1107 q = s / (dcols - nparity);
1108 r = s - q * (dcols - nparity);
1109 bc = (r == 0 ? 0 : r + nparity);
1110 tot = s + nparity * (q + (r == 0 ? 0 : 1));
1114 scols = MIN(dcols, roundup(bc, nparity + 1));
1120 ASSERT3U(acols, <=, scols);
1122 rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));
1124 rm->rm_cols = acols;
1125 rm->rm_scols = scols;
1126 rm->rm_bigcols = bc;
1127 rm->rm_skipstart = bc;
1128 rm->rm_missingdata = 0;
1129 rm->rm_missingparity = 0;
1130 rm->rm_firstdatacol = nparity;
1133 rm->rm_ecksuminjected = 0;
1137 for (c = 0; c < scols; c++) {
/* Wrap to the next row when the column index passes the last child. */
1142 coff += 1ULL << unit_shift;
1144 rm->rm_col[c].rc_devidx = col;
1145 rm->rm_col[c].rc_offset = coff;
1146 rm->rm_col[c].rc_data = NULL;
1147 rm->rm_col[c].rc_error = 0;
1148 rm->rm_col[c].rc_tried = 0;
1149 rm->rm_col[c].rc_skipped = 0;
/* Column size: 0, q+1, or q sectors depending on c vs acols/bc. */
1152 rm->rm_col[c].rc_size = 0;
1154 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1156 rm->rm_col[c].rc_size = q << unit_shift;
1158 asize += rm->rm_col[c].rc_size;
1161 ASSERT3U(asize, ==, tot << unit_shift);
1162 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1163 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1164 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1165 ASSERT3U(rm->rm_nskip, <=, nparity);
/* Parity columns get fresh buffers; data columns share the caller's. */
1167 for (c = 0; c < rm->rm_firstdatacol; c++)
1168 rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);
1170 rm->rm_col[c].rc_data = data;
1172 for (c = c + 1; c < acols; c++)
1173 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1174 rm->rm_col[c - 1].rc_size;
1177 * If all data stored spans all columns, there's a danger that parity
1178 * will always be on the same device and, since parity isn't read
1179 * during normal operation, that that device's I/O bandwidth won't be
1180 * used effectively. We therefore switch the parity every 1MB.
1182 * ... at least that was, ostensibly, the theory. As a practical
1183 * matter unless we juggle the parity between all devices evenly, we
1184 * won't see any benefit. Further, occasional writes that aren't a
1185 * multiple of the LCM of the number of children and the minimum
1186 * stripe width are sufficient to avoid pessimal behavior.
1187 * Unfortunately, this decision created an implicit on-disk format
1188 * requirement that we need to support for all eternity, but only
1189 * for single-parity RAID-Z.
1191 * If we intend to skip a sector in the zeroth column for padding
1192 * we must make sure to note this swap. We will never intend to
1193 * skip the first column since at least one data and one parity
1194 * column must appear in each row.
1196 ASSERT(rm->rm_cols >= 2);
1197 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
/* RAIDZ1 only: swap columns 0 and 1 on every other 1MB boundary. */
1199 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1200 devidx = rm->rm_col[0].rc_devidx;
1201 o = rm->rm_col[0].rc_offset;
1202 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1203 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1204 rm->rm_col[1].rc_devidx = devidx;
1205 rm->rm_col[1].rc_offset = o;
1207 if (rm->rm_skipstart == 0)
1208 rm->rm_skipstart = 1;
/*
 * Release a raidz_map_t: free the parity buffers allocated by
 * vdev_raidz_map_alloc() (data columns belong to the caller), then the
 * map itself.
 */
1215 vdev_raidz_map_free(raidz_map_t *rm)
1220 for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1221 zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1224 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
1225 size += rm->rm_col[c].rc_size;
1227 zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
/* Look up a child vdev of pvd by its device index; NULL-return path elided. */
1231 vdev_child(vdev_t *pvd, uint64_t devidx)
1235 STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1236 if (cvd->v_id == devidx)
1244 * We keep track of whether or not there were any injected errors, so that
1245 * any ereports we generate can note it.
/* Thin wrapper: verify bp's checksum over data at offset 0. */
1248 raidz_checksum_verify(const blkptr_t *bp, void *data)
1251 return (zio_checksum_error(bp, data, 0));
1255 * Generate the parity from the data columns. If we tried and were able to
1256 * read the parity without error, verify that the generated parity matches the
1257 * data we read. If it doesn't, we fire off a checksum error. Return the
1258 * number such failures.
1261 raidz_parity_verify(raidz_map_t *rm)
1263 void *orig[VDEV_RAIDZ_MAXPARITY];
/* Save a copy of each successfully-read parity column. */
1267 for (c = 0; c < rm->rm_firstdatacol; c++) {
1268 rc = &rm->rm_col[c];
1269 if (!rc->rc_tried || rc->rc_error != 0)
1271 orig[c] = zfs_alloc(rc->rc_size);
1272 bcopy(rc->rc_data, orig[c], rc->rc_size);
/* Regenerate parity in place, then compare against the saved copies. */
1275 vdev_raidz_generate_parity(rm);
1277 for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1278 rc = &rm->rm_col[c];
1279 if (!rc->rc_tried || rc->rc_error != 0)
1281 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1282 rc->rc_error = ECKSUM;
1285 zfs_free(orig[c], rc->rc_size);
1292 * Iterate over all combinations of bad data and attempt a reconstruction.
1293 * Note that the algorithm below is non-optimal because it doesn't take into
1294 * account how reconstruction is actually performed. For example, with
1295 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1296 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1297 * cases we'd only use parity information in column 0.
1300 vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
1301 off_t offset, int total_errors, int data_errors)
1304 void *orig[VDEV_RAIDZ_MAXPARITY];
1305 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1306 int *tgts = &tstore[1];
1307 int current, next, i, c, n;
1310 ASSERT(total_errors < rm->rm_firstdatacol);
1313 * This simplifies one edge condition.
1317 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1319 * Initialize the targets array by finding the first n columns
1320 * that contain no error.
1322 * If there were no data errors, we need to ensure that we're
1323 * always explicitly attempting to reconstruct at least one
1324 * data column. To do this, we simply push the highest target
1325 * up into the data columns.
1327 for (c = 0, i = 0; i < n; i++) {
1328 if (i == n - 1 && data_errors == 0 &&
1329 c < rm->rm_firstdatacol) {
1330 c = rm->rm_firstdatacol;
1333 while (rm->rm_col[c].rc_error != 0) {
1335 ASSERT3S(c, <, rm->rm_cols);
1342 * Setting tgts[n] simplifies the other edge condition.
1344 tgts[n] = rm->rm_cols;
1347 * These buffers were allocated in previous iterations.
1349 for (i = 0; i < n - 1; i++) {
1350 ASSERT(orig[i] != NULL);
1353 orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);
1356 next = tgts[current];
1358 while (current != n) {
1359 tgts[current] = next;
1363 * Save off the original data that we're going to
1364 * attempt to reconstruct.
1366 for (i = 0; i < n; i++) {
1367 ASSERT(orig[i] != NULL);
1370 ASSERT3S(c, <, rm->rm_cols);
1371 rc = &rm->rm_col[c];
1372 bcopy(rc->rc_data, orig[i], rc->rc_size);
1376 * Attempt a reconstruction and exit the outer loop on
1379 code = vdev_raidz_reconstruct(rm, tgts, n);
1380 if (raidz_checksum_verify(bp, data) == 0) {
1381 for (i = 0; i < n; i++) {
1383 rc = &rm->rm_col[c];
1384 ASSERT(rc->rc_error == 0);
1385 rc->rc_error = ECKSUM;
1393 * Restore the original data.
1395 for (i = 0; i < n; i++) {
1397 rc = &rm->rm_col[c];
1398 bcopy(orig[i], rc->rc_data, rc->rc_size);
1403 * Find the next valid column after the current
1406 for (next = tgts[current] + 1;
1407 next < rm->rm_cols &&
1408 rm->rm_col[next].rc_error != 0; next++)
1411 ASSERT(next <= tgts[current + 1]);
1414 * If that spot is available, we're done here.
1416 if (next != tgts[current + 1])
1420 * Otherwise, find the next valid column after
1421 * the previous position.
1423 for (c = tgts[current - 1] + 1;
1424 rm->rm_col[c].rc_error != 0; c++)
1430 } while (current != n);
1435 for (i = n - 1; i >= 0; i--) {
1436 zfs_free(orig[i], rm->rm_col[0].rc_size);
1443 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1444 off_t offset, size_t bytes)
1446 vdev_t *tvd = vd->v_top;
1451 int unexpected_errors;
1457 int tgts[VDEV_RAIDZ_MAXPARITY];
1460 rc = NULL; /* gcc */
1463 rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1464 vd->v_nchildren, vd->v_nparity);
1467 * Iterate over the columns in reverse order so that we hit the parity
1468 * last -- any errors along the way will force us to read the parity.
1470 for (c = rm->rm_cols - 1; c >= 0; c--) {
1471 rc = &rm->rm_col[c];
1472 cvd = vdev_child(vd, rc->rc_devidx);
1473 if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1474 if (c >= rm->rm_firstdatacol)
1475 rm->rm_missingdata++;
1477 rm->rm_missingparity++;
1478 rc->rc_error = ENXIO;
1479 rc->rc_tried = 1; /* don't even try */
1483 #if 0 /* XXX: Too hard for the boot code. */
1484 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1485 if (c >= rm->rm_firstdatacol)
1486 rm->rm_missingdata++;
1488 rm->rm_missingparity++;
1489 rc->rc_error = ESTALE;
1494 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1495 rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1496 rc->rc_offset, rc->rc_size);
1503 unexpected_errors = 0;
1509 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1510 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1512 for (c = 0; c < rm->rm_cols; c++) {
1513 rc = &rm->rm_col[c];
1516 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1518 if (c < rm->rm_firstdatacol)
1523 if (!rc->rc_skipped)
1524 unexpected_errors++;
1527 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1533 * There are three potential phases for a read:
1534 * 1. produce valid data from the columns read
1535 * 2. read all disks and try again
1536 * 3. perform combinatorial reconstruction
1538 * Each phase is progressively both more expensive and less likely to
1539 * occur. If we encounter more errors than we can repair or all phases
1540 * fail, we have no choice but to return an error.
1544 * If the number of errors we saw was correctable -- less than or equal
1545 * to the number of parity disks read -- attempt to produce data that
1546 * has a valid checksum. Naturally, this case applies in the absence of
1549 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1550 if (data_errors == 0) {
1551 if (raidz_checksum_verify(bp, data) == 0) {
1553 * If we read parity information (unnecessarily
1554 * as it happens since no reconstruction was
1555 * needed) regenerate and verify the parity.
1556 * We also regenerate parity when resilvering
1557 * so we can write it out to the failed device
1560 if (parity_errors + parity_untried <
1561 rm->rm_firstdatacol) {
1562 n = raidz_parity_verify(rm);
1563 unexpected_errors += n;
1564 ASSERT(parity_errors + n <=
1565 rm->rm_firstdatacol);
1571 * We either attempt to read all the parity columns or
1572 * none of them. If we didn't try to read parity, we
1573 * wouldn't be here in the correctable case. There must
1574 * also have been fewer parity errors than parity
1575 * columns or, again, we wouldn't be in this code path.
1577 ASSERT(parity_untried == 0);
1578 ASSERT(parity_errors < rm->rm_firstdatacol);
1581 * Identify the data columns that reported an error.
1584 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1585 rc = &rm->rm_col[c];
1586 if (rc->rc_error != 0) {
1587 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1592 ASSERT(rm->rm_firstdatacol >= n);
1594 code = vdev_raidz_reconstruct(rm, tgts, n);
1596 if (raidz_checksum_verify(bp, data) == 0) {
1598 * If we read more parity disks than were used
1599 * for reconstruction, confirm that the other
1600 * parity disks produced correct data. This
1601 * routine is suboptimal in that it regenerates
1602 * the parity that we already used in addition
1603 * to the parity that we're attempting to
1604 * verify, but this should be a relatively
1605 * uncommon case, and can be optimized if it
1606 * becomes a problem. Note that we regenerate
1607 * parity when resilvering so we can write it
1608 * out to failed devices later.
1610 if (parity_errors < rm->rm_firstdatacol - n) {
1611 n = raidz_parity_verify(rm);
1612 unexpected_errors += n;
1613 ASSERT(parity_errors + n <=
1614 rm->rm_firstdatacol);
1623 * This isn't a typical situation -- either we got a read
1624 * error or a child silently returned bad data. Read every
1625 * block so we can try again with as much data and parity as
1626 * we can track down. If we've already been through once
1627 * before, all children will be marked as tried so we'll
1628 * proceed to combinatorial reconstruction.
1630 unexpected_errors = 1;
1631 rm->rm_missingdata = 0;
1632 rm->rm_missingparity = 0;
1635 for (c = 0; c < rm->rm_cols; c++) {
1636 if (rm->rm_col[c].rc_tried)
1639 cvd = vdev_child(vd, rc->rc_devidx);
1640 ASSERT(cvd != NULL);
1641 rc->rc_error = cvd->v_read(cvd, NULL,
1642 rc->rc_data, rc->rc_offset, rc->rc_size);
1643 if (rc->rc_error == 0)
1649 * If we managed to read anything more, retry the
1656 * At this point we've attempted to reconstruct the data given the
1657 * errors we detected, and we've attempted to read all columns. There
1658 * must, therefore, be one or more additional problems -- silent errors
1659 * resulting in invalid data rather than explicit I/O errors resulting
1660 * in absent data. We check if there is enough additional data to
1661 * possibly reconstruct the data and then perform combinatorial
1662 * reconstruction over all possible combinations. If that fails,
1665 if (total_errors > rm->rm_firstdatacol) {
1667 } else if (total_errors < rm->rm_firstdatacol &&
1668 (code = vdev_raidz_combrec(rm, bp, data, offset, total_errors,
1669 data_errors)) != 0) {
1671 * If we didn't use all the available parity for the
1672 * combinatorial reconstruction, verify that the remaining
1673 * parity is correct.
1675 if (code != (1 << rm->rm_firstdatacol) - 1)
1676 (void) raidz_parity_verify(rm);
1679 * We're here because either:
1681 * total_errors == rm_first_datacol, or
1682 * vdev_raidz_combrec() failed
1684 * In either case, there is enough bad data to prevent
1687 * Start checksum ereports for all children which haven't
1688 * failed, and the IO wasn't speculative.
1694 vdev_raidz_map_free(rm);