4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
29 static uint64_t zfs_crc64_table[256];
/* Assertion macros compiled out: the standalone boot loader has no panic-on-assert support. */
33 #define ASSERT(...) do { } while (0)
34 #define ASSERT3U(...) do { } while (0)
35 #define ASSERT3S(...) do { } while (0)
/*
 * Minimal panic(): print the message and stop.  NOTE(review): the macro body
 * is truncated in this view (the closing "} while (0)" is not visible) --
 * confirm against the full file.
 */
37 #define panic(...) do { \
38 printf(__VA_ARGS__); \
/* Map kernel kmem_* allocators onto the loader's zfs_alloc()/zfs_free(). */
42 #define kmem_alloc(size, flag) zfs_alloc((size))
43 #define kmem_free(ptr, size) zfs_free((ptr), (size))
52 * Calculate the crc64 table (used for the zap hash
55 if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
56 memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
57 for (i = 0; i < 256; i++)
58 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
59 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
64 zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
66 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
70 * Signature for checksum functions.
72 typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
75 * Information about each checksum function.
/*
 * Per-algorithm checksum descriptor; indexed by the ZIO_CHECKSUM_* constants
 * (one entry per algorithm in zio_checksum_table below).
 */
77 typedef struct zio_checksum_info {
78 zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
79 int ci_correctable; /* number of correctable bits */
80 int ci_eck; /* uses zio embedded checksum? */
81 int ci_dedup; /* strong enough for dedup? */
82 const char *ci_name; /* descriptive name */
83 } zio_checksum_info_t;
88 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
89 {{NULL, NULL}, 0, 0, 0, "inherit"},
90 {{NULL, NULL}, 0, 0, 0, "on"},
91 {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"},
92 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"},
93 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"},
94 {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"},
95 {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"},
96 {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
97 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "SHA256"},
98 {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zillog2"},
103 * Common signature for all zio compress/decompress functions.
/* Compressors return the resulting compressed length. */
105 typedef size_t zio_compress_func_t(void *src, void *dst,
106 size_t s_len, size_t d_len, int);
/*
 * Decompressors return an int status (propagated by zio_decompress_data()).
 * The trailing int is the per-algorithm level, zio_compress_info_t.ci_level.
 */
107 typedef int zio_decompress_func_t(void *src, void *dst,
108 size_t s_len, size_t d_len, int);
111 * Information about each compression function.
/*
 * Per-algorithm compression descriptor; indexed by the ZIO_COMPRESS_*
 * constants.  The read-only boot code only needs ci_decompress, so
 * ci_compress is left NULL in zio_compress_table below.
 */
113 typedef struct zio_compress_info {
114 zio_compress_func_t *ci_compress; /* compression function */
115 zio_decompress_func_t *ci_decompress; /* decompression function */
116 int ci_level; /* level parameter */
117 const char *ci_name; /* algorithm name */
118 } zio_compress_info_t;
124 * Compression vectors.
126 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
127 {NULL, NULL, 0, "inherit"},
128 {NULL, NULL, 0, "on"},
129 {NULL, NULL, 0, "uncompressed"},
130 {NULL, lzjb_decompress, 0, "lzjb"},
131 {NULL, NULL, 0, "empty"},
132 {NULL, NULL, 1, "gzip-1"},
133 {NULL, NULL, 2, "gzip-2"},
134 {NULL, NULL, 3, "gzip-3"},
135 {NULL, NULL, 4, "gzip-4"},
136 {NULL, NULL, 5, "gzip-5"},
137 {NULL, NULL, 6, "gzip-6"},
138 {NULL, NULL, 7, "gzip-7"},
139 {NULL, NULL, 8, "gzip-8"},
140 {NULL, NULL, 9, "gzip-9"},
141 {NULL, zle_decompress, 64, "zle"},
145 byteswap_uint64_array(void *vbuf, size_t size)
147 uint64_t *buf = vbuf;
148 size_t count = size >> 3;
151 ASSERT((size & 7) == 0);
153 for (i = 0; i < count; i++)
154 buf[i] = BSWAP_64(buf[i]);
158 * Set the external verifier for a gang block based on <vdev, offset, txg>,
159 * a tuple which is guaranteed to be unique for the life of the pool.
162 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
164 const dva_t *dva = BP_IDENTITY(bp);
165 uint64_t txg = BP_PHYSICAL_BIRTH(bp);
167 ASSERT(BP_IS_GANG(bp));
169 ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
173 * Set the external verifier for a label block based on its offset.
174 * The vdev is implicit, and the txg is unknowable at pool open time --
175 * hence the logic in vdev_uberblock_load() to find the most recent copy.
178 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
180 ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
184 zio_checksum_verify(const blkptr_t *bp, void *data)
187 unsigned int checksum;
188 zio_checksum_info_t *ci;
189 zio_cksum_t actual_cksum, expected_cksum, verifier;
192 checksum = BP_GET_CHECKSUM(bp);
193 size = BP_GET_PSIZE(bp);
195 if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
197 ci = &zio_checksum_table[checksum];
198 if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
204 ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
205 checksum == ZIO_CHECKSUM_LABEL);
207 eck = (zio_eck_t *)((char *)data + size) - 1;
209 if (checksum == ZIO_CHECKSUM_GANG_HEADER)
210 zio_checksum_gang_verifier(&verifier, bp);
211 else if (checksum == ZIO_CHECKSUM_LABEL)
212 zio_checksum_label_verifier(&verifier,
213 DVA_GET_OFFSET(BP_IDENTITY(bp)));
215 verifier = bp->blk_cksum;
217 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
220 byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
222 expected_cksum = eck->zec_cksum;
223 eck->zec_cksum = verifier;
224 ci->ci_func[byteswap](data, size, &actual_cksum);
225 eck->zec_cksum = expected_cksum;
228 byteswap_uint64_array(&expected_cksum,
229 sizeof (zio_cksum_t));
231 expected_cksum = bp->blk_cksum;
232 ci->ci_func[0](data, size, &actual_cksum);
235 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
236 /*printf("ZFS: read checksum failed\n");*/
244 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
245 void *dest, uint64_t destsize)
247 zio_compress_info_t *ci;
249 if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
250 printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
254 ci = &zio_compress_table[cpfunc];
255 if (!ci->ci_decompress) {
256 printf("ZFS: unsupported compression algorithm %s\n",
261 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
/*
 * CRC64 hash of a ZAP entry name, seeded with the per-object salt and driven
 * by the zfs_crc64_table built at init time.  NOTE(review): the local
 * declarations, the crc seed from 'salt' and the return statement fall
 * outside this view -- confirm against the full file.
 */
265 zap_hash(uint64_t salt, const char *name)
272 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
273 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
274 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
277 * Only use 28 bits, since we need 4 bits in the cookie for the
278 * collision differentiator. We MUST use the high bits, since
279 * those are the ones that we first pay attention to when
280 * choosing the bucket.
282 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
287 static void *zfs_alloc(size_t size);
288 static void zfs_free(void *ptr, size_t size);
/*
 * One device column of a RAID-Z I/O (either a parity or a data column of
 * raidz_map_t.rm_col[]).  NOTE(review): the closing "} raidz_col_t;" line is
 * not visible in this view -- confirm against the full file.
 */
290 typedef struct raidz_col {
291 uint64_t rc_devidx; /* child device index for I/O */
292 uint64_t rc_offset; /* device offset */
293 uint64_t rc_size; /* I/O size */
294 void *rc_data; /* I/O data */
295 int rc_error; /* I/O error for this device */
296 uint8_t rc_tried; /* Did we attempt this I/O column? */
297 uint8_t rc_skipped; /* Did we skip this I/O column? */
/*
 * Geometry and state of one RAID-Z I/O: parity columns occupy
 * rm_col[0 .. rm_firstdatacol-1], data columns follow.  Built by
 * vdev_raidz_map_alloc().  NOTE(review): the closing "} raidz_map_t;" line
 * is not visible in this view -- confirm against the full file.
 */
300 typedef struct raidz_map {
301 uint64_t rm_cols; /* Regular column count */
302 uint64_t rm_scols; /* Count including skipped columns */
303 uint64_t rm_bigcols; /* Number of oversized columns */
304 uint64_t rm_asize; /* Actual total I/O size */
305 uint64_t rm_missingdata; /* Count of missing data devices */
306 uint64_t rm_missingparity; /* Count of missing parity devices */
307 uint64_t rm_firstdatacol; /* First data column/parity count */
308 uint64_t rm_nskip; /* Skipped sectors for padding */
309 uint64_t rm_skipstart; /* Column index of padding start */
310 uintptr_t rm_reports; /* # of referencing checksum reports */
311 uint8_t rm_freed; /* map no longer has referencing ZIO */
312 uint8_t rm_ecksuminjected; /* checksum error was injected */
313 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
/* Indexes of the P/Q/R parity columns within raidz_map_t.rm_col[]. */
316 #define VDEV_RAIDZ_P 0
317 #define VDEV_RAIDZ_Q 1
318 #define VDEV_RAIDZ_R 2
/*
 * Multiply a single byte by 2 (and by 4) in GF(2^8): shift left and, if the
 * high bit was set, reduce with the constant 0x1d.
 */
320 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
321 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
324 * We provide a mechanism to perform the field multiplication operation on a
325 * 64-bit value all at once rather than a byte at a time. This works by
326 * creating a mask from the top bit in each byte and using that to
327 * conditionally apply the XOR of 0x1d.
329 #define VDEV_RAIDZ_64MUL_2(x, mask) \
331 (mask) = (x) & 0x8080808080808080ULL; \
332 (mask) = ((mask) << 1) - ((mask) >> 7); \
333 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
334 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
337 #define VDEV_RAIDZ_64MUL_4(x, mask) \
339 VDEV_RAIDZ_64MUL_2((x), mask); \
340 VDEV_RAIDZ_64MUL_2((x), mask); \
344 * These two tables represent powers and logs of 2 in the Galois field defined
345 * above. These values were computed by repeatedly multiplying by 2 as above.
347 static const uint8_t vdev_raidz_pow2[256] = {
348 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
349 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
350 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
351 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
352 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
353 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
354 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
355 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
356 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
357 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
358 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
359 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
360 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
361 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
362 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
363 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
364 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
365 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
366 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
367 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
368 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
369 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
370 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
371 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
372 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
373 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
374 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
375 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
376 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
377 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
378 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
379 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
381 static const uint8_t vdev_raidz_log2[256] = {
382 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
383 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
384 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
385 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
386 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
387 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
388 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
389 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
390 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
391 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
392 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
393 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
394 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
395 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
396 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
397 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
398 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
399 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
400 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
401 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
402 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
403 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
404 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
405 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
406 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
407 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
408 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
409 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
410 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
411 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
412 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
413 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
417 * Multiply a given number by 2 raised to the given power.
420 vdev_raidz_exp2(uint8_t a, int exp)
426 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
428 exp += vdev_raidz_log2[a];
432 return (vdev_raidz_pow2[exp]);
436 vdev_raidz_generate_parity_p(raidz_map_t *rm)
438 uint64_t *p, *src, pcount, ccount, i;
441 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
443 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
444 src = rm->rm_col[c].rc_data;
445 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
446 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
448 if (c == rm->rm_firstdatacol) {
449 ASSERT(ccount == pcount);
450 for (i = 0; i < ccount; i++, src++, p++) {
454 ASSERT(ccount <= pcount);
455 for (i = 0; i < ccount; i++, src++, p++) {
463 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
465 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
468 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
469 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
470 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
472 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
473 src = rm->rm_col[c].rc_data;
474 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
475 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
477 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
479 if (c == rm->rm_firstdatacol) {
480 ASSERT(ccnt == pcnt || ccnt == 0);
481 for (i = 0; i < ccnt; i++, src++, p++, q++) {
485 for (; i < pcnt; i++, src++, p++, q++) {
490 ASSERT(ccnt <= pcnt);
493 * Apply the algorithm described above by multiplying
494 * the previous result and adding in the new value.
496 for (i = 0; i < ccnt; i++, src++, p++, q++) {
499 VDEV_RAIDZ_64MUL_2(*q, mask);
504 * Treat short columns as though they are full of 0s.
505 * Note that there's therefore nothing needed for P.
507 for (; i < pcnt; i++, q++) {
508 VDEV_RAIDZ_64MUL_2(*q, mask);
515 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
517 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
520 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
521 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
522 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
523 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
524 rm->rm_col[VDEV_RAIDZ_R].rc_size);
526 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
527 src = rm->rm_col[c].rc_data;
528 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
529 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
530 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
532 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
534 if (c == rm->rm_firstdatacol) {
535 ASSERT(ccnt == pcnt || ccnt == 0);
536 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
541 for (; i < pcnt; i++, src++, p++, q++, r++) {
547 ASSERT(ccnt <= pcnt);
550 * Apply the algorithm described above by multiplying
551 * the previous result and adding in the new value.
553 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
556 VDEV_RAIDZ_64MUL_2(*q, mask);
559 VDEV_RAIDZ_64MUL_4(*r, mask);
564 * Treat short columns as though they are full of 0s.
565 * Note that there's therefore nothing needed for P.
567 for (; i < pcnt; i++, q++, r++) {
568 VDEV_RAIDZ_64MUL_2(*q, mask);
569 VDEV_RAIDZ_64MUL_4(*r, mask);
576 * Generate RAID parity in the first virtual columns according to the number of
577 * parity columns available.
580 vdev_raidz_generate_parity(raidz_map_t *rm)
582 switch (rm->rm_firstdatacol) {
584 vdev_raidz_generate_parity_p(rm);
587 vdev_raidz_generate_parity_pq(rm);
590 vdev_raidz_generate_parity_pqr(rm);
593 panic("invalid RAID-Z configuration");
599 * In the general case of reconstruction, we must solve the system of linear
600 * equations defined by the coefficients used to generate parity as well as
601 * the contents of the data and parity disks. This can be expressed with
602 * vectors for the original data (D) and the actual data (d) and parity (p)
603 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
607 * | V | | D_0 | | p_m-1 |
608 * | | x | : | = | d_0 |
609 * | I | | D_n-1 | | : |
610 * | | ~~ ~~ | d_n-1 |
613 * I is simply a square identity matrix of size n, and V is a vandermonde
614 * matrix defined by the coefficients we chose for the various parity columns
615 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
616 * computation as well as linear separability.
619 * | 1 .. 1 1 1 | | p_0 |
620 * | 2^n-1 .. 4 2 1 | __ __ | : |
621 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
622 * | 1 .. 0 0 0 | | D_1 | | d_0 |
623 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
624 * | : : : : | | : | | d_2 |
625 * | 0 .. 1 0 0 | | D_n-1 | | : |
626 * | 0 .. 0 1 0 | ~~ ~~ | : |
627 * | 0 .. 0 0 1 | | d_n-1 |
630 * Note that I, V, d, and p are known. To compute D, we must invert the
631 * matrix and use the known data and parity values to reconstruct the unknown
632 * data values. We begin by removing the rows in V|I and d|p that correspond
633 * to failed or missing columns; we then make V|I square (n x n) and d|p
634 * sized n by removing rows corresponding to unused parity from the bottom up
635 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
636 * using Gauss-Jordan elimination. In the example below we use m=3 parity
637 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
639 * | 1 1 1 1 1 1 1 1 |
640 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
641 * | 19 205 116 29 64 16 4 1 | / /
642 * | 1 0 0 0 0 0 0 0 | / /
643 * | 0 1 0 0 0 0 0 0 | <--' /
644 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
645 * | 0 0 0 1 0 0 0 0 |
646 * | 0 0 0 0 1 0 0 0 |
647 * | 0 0 0 0 0 1 0 0 |
648 * | 0 0 0 0 0 0 1 0 |
649 * | 0 0 0 0 0 0 0 1 |
652 * | 1 1 1 1 1 1 1 1 |
653 * | 128 64 32 16 8 4 2 1 |
654 * | 19 205 116 29 64 16 4 1 |
655 * | 1 0 0 0 0 0 0 0 |
656 * | 0 1 0 0 0 0 0 0 |
657 * (V|I)' = | 0 0 1 0 0 0 0 0 |
658 * | 0 0 0 1 0 0 0 0 |
659 * | 0 0 0 0 1 0 0 0 |
660 * | 0 0 0 0 0 1 0 0 |
661 * | 0 0 0 0 0 0 1 0 |
662 * | 0 0 0 0 0 0 0 1 |
665 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
666 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
667 * matrix is not singular.
669 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
670 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
671 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
672 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
673 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
674 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
675 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
676 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
679 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
680 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
681 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
682 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
683 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
684 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
685 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
686 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
689 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
690 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
691 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
692 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
693 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
694 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
695 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
696 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
699 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
700 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
701 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
702 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
703 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
704 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
705 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
706 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
709 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
710 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
711 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
712 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
713 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
714 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
715 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
716 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
719 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
720 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
721 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
722 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
723 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
724 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
725 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
726 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
729 * | 0 0 1 0 0 0 0 0 |
730 * | 167 100 5 41 159 169 217 208 |
731 * | 166 100 4 40 158 168 216 209 |
732 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
733 * | 0 0 0 0 1 0 0 0 |
734 * | 0 0 0 0 0 1 0 0 |
735 * | 0 0 0 0 0 0 1 0 |
736 * | 0 0 0 0 0 0 0 1 |
739 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
740 * of the missing data.
742 * As is apparent from the example above, the only non-trivial rows in the
743 * inverse matrix correspond to the data disks that we're trying to
744 * reconstruct. Indeed, those are the only rows we need as the others would
745 * only be useful for reconstructing data known or assumed to be valid. For
746 * that reason, we only build the coefficients in the rows that correspond to
752 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
758 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
761 * Fill in the missing rows of interest.
763 for (i = 0; i < nmap; i++) {
764 ASSERT3S(0, <=, map[i]);
765 ASSERT3S(map[i], <=, 2);
772 for (j = 0; j < n; j++) {
776 rows[i][j] = vdev_raidz_pow2[pow];
782 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
783 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
789 * Assert that the first nmissing entries from the array of used
790 * columns correspond to parity columns and that subsequent entries
791 * correspond to data columns.
793 for (i = 0; i < nmissing; i++) {
794 ASSERT3S(used[i], <, rm->rm_firstdatacol);
797 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
801 * First initialize the storage where we'll compute the inverse rows.
803 for (i = 0; i < nmissing; i++) {
804 for (j = 0; j < n; j++) {
805 invrows[i][j] = (i == j) ? 1 : 0;
810 * Subtract all trivial rows from the rows of consequence.
812 for (i = 0; i < nmissing; i++) {
813 for (j = nmissing; j < n; j++) {
814 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
815 jj = used[j] - rm->rm_firstdatacol;
817 invrows[i][j] = rows[i][jj];
823 * For each of the rows of interest, we must normalize it and subtract
824 * a multiple of it from the other rows.
826 for (i = 0; i < nmissing; i++) {
827 for (j = 0; j < missing[i]; j++) {
828 ASSERT3U(rows[i][j], ==, 0);
830 ASSERT3U(rows[i][missing[i]], !=, 0);
833 * Compute the inverse of the first element and multiply each
834 * element in the row by that value.
836 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
838 for (j = 0; j < n; j++) {
839 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
840 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
843 for (ii = 0; ii < nmissing; ii++) {
847 ASSERT3U(rows[ii][missing[i]], !=, 0);
849 log = vdev_raidz_log2[rows[ii][missing[i]]];
851 for (j = 0; j < n; j++) {
853 vdev_raidz_exp2(rows[i][j], log);
855 vdev_raidz_exp2(invrows[i][j], log);
861 * Verify that the data that is left in the rows are properly part of
862 * an identity matrix.
864 for (i = 0; i < nmissing; i++) {
865 for (j = 0; j < n; j++) {
866 if (j == missing[i]) {
867 ASSERT3U(rows[i][j], ==, 1);
869 ASSERT3U(rows[i][j], ==, 0);
876 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
877 int *missing, uint8_t **invrows, const uint8_t *used)
882 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
883 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
886 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
891 psize = sizeof (invlog[0][0]) * n * nmissing;
892 p = zfs_alloc(psize);
894 for (pp = p, i = 0; i < nmissing; i++) {
899 for (i = 0; i < nmissing; i++) {
900 for (j = 0; j < n; j++) {
901 ASSERT3U(invrows[i][j], !=, 0);
902 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
906 for (i = 0; i < n; i++) {
908 ASSERT3U(c, <, rm->rm_cols);
910 src = rm->rm_col[c].rc_data;
911 ccount = rm->rm_col[c].rc_size;
912 for (j = 0; j < nmissing; j++) {
913 cc = missing[j] + rm->rm_firstdatacol;
914 ASSERT3U(cc, >=, rm->rm_firstdatacol);
915 ASSERT3U(cc, <, rm->rm_cols);
918 dst[j] = rm->rm_col[cc].rc_data;
919 dcount[j] = rm->rm_col[cc].rc_size;
922 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
924 for (x = 0; x < ccount; x++, src++) {
926 log = vdev_raidz_log2[*src];
928 for (cc = 0; cc < nmissing; cc++) {
935 if ((ll = log + invlog[cc][i]) >= 255)
937 val = vdev_raidz_pow2[ll];
952 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
956 int missing_rows[VDEV_RAIDZ_MAXPARITY];
957 int parity_map[VDEV_RAIDZ_MAXPARITY];
962 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
963 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
969 n = rm->rm_cols - rm->rm_firstdatacol;
972 * Figure out which data columns are missing.
975 for (t = 0; t < ntgts; t++) {
976 if (tgts[t] >= rm->rm_firstdatacol) {
977 missing_rows[nmissing_rows++] =
978 tgts[t] - rm->rm_firstdatacol;
983 * Figure out which parity columns to use to help generate the missing
986 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
988 ASSERT(c < rm->rm_firstdatacol);
991 * Skip any targeted parity columns.
1005 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1007 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1008 nmissing_rows * n + sizeof (used[0]) * n;
1009 p = kmem_alloc(psize, KM_SLEEP);
1011 for (pp = p, i = 0; i < nmissing_rows; i++) {
1019 for (i = 0; i < nmissing_rows; i++) {
1020 used[i] = parity_map[i];
1023 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1024 if (tt < nmissing_rows &&
1025 c == missing_rows[tt] + rm->rm_firstdatacol) {
1036 * Initialize the interesting rows of the matrix.
1038 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1041 * Invert the matrix.
1043 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1047 * Reconstruct the missing data using the generated matrix.
1049 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1052 kmem_free(p, psize);
1058 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1060 int tgts[VDEV_RAIDZ_MAXPARITY];
1064 int nbadparity, nbaddata;
1067 * The tgts list must already be sorted.
1069 for (i = 1; i < nt; i++) {
1070 ASSERT(t[i] > t[i - 1]);
1073 nbadparity = rm->rm_firstdatacol;
1074 nbaddata = rm->rm_cols - nbadparity;
1076 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1077 if (i < nt && c == t[i]) {
1080 } else if (rm->rm_col[c].rc_error != 0) {
1082 } else if (c >= rm->rm_firstdatacol) {
1089 ASSERT(ntgts >= nt);
1090 ASSERT(nbaddata >= 0);
1091 ASSERT(nbaddata + nbadparity == ntgts);
1093 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1094 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1099 static raidz_map_t *
1100 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1101 uint64_t dcols, uint64_t nparity)
1104 uint64_t b = offset >> unit_shift;
1105 uint64_t s = size >> unit_shift;
1106 uint64_t f = b % dcols;
1107 uint64_t o = (b / dcols) << unit_shift;
1108 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1110 q = s / (dcols - nparity);
1111 r = s - q * (dcols - nparity);
1112 bc = (r == 0 ? 0 : r + nparity);
1113 tot = s + nparity * (q + (r == 0 ? 0 : 1));
1117 scols = MIN(dcols, roundup(bc, nparity + 1));
1123 ASSERT3U(acols, <=, scols);
1125 rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));
1127 rm->rm_cols = acols;
1128 rm->rm_scols = scols;
1129 rm->rm_bigcols = bc;
1130 rm->rm_skipstart = bc;
1131 rm->rm_missingdata = 0;
1132 rm->rm_missingparity = 0;
1133 rm->rm_firstdatacol = nparity;
1136 rm->rm_ecksuminjected = 0;
1140 for (c = 0; c < scols; c++) {
1145 coff += 1ULL << unit_shift;
1147 rm->rm_col[c].rc_devidx = col;
1148 rm->rm_col[c].rc_offset = coff;
1149 rm->rm_col[c].rc_data = NULL;
1150 rm->rm_col[c].rc_error = 0;
1151 rm->rm_col[c].rc_tried = 0;
1152 rm->rm_col[c].rc_skipped = 0;
1155 rm->rm_col[c].rc_size = 0;
1157 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1159 rm->rm_col[c].rc_size = q << unit_shift;
1161 asize += rm->rm_col[c].rc_size;
1164 ASSERT3U(asize, ==, tot << unit_shift);
1165 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1166 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1167 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1168 ASSERT3U(rm->rm_nskip, <=, nparity);
1170 for (c = 0; c < rm->rm_firstdatacol; c++)
1171 rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);
1173 rm->rm_col[c].rc_data = data;
1175 for (c = c + 1; c < acols; c++)
1176 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1177 rm->rm_col[c - 1].rc_size;
1180 * If all data stored spans all columns, there's a danger that parity
1181 * will always be on the same device and, since parity isn't read
1182 * during normal operation, that that device's I/O bandwidth won't be
1183 * used effectively. We therefore switch the parity every 1MB.
1185 * ... at least that was, ostensibly, the theory. As a practical
1186 * matter unless we juggle the parity between all devices evenly, we
1187 * won't see any benefit. Further, occasional writes that aren't a
1188 * multiple of the LCM of the number of children and the minimum
1189 * stripe width are sufficient to avoid pessimal behavior.
1190 * Unfortunately, this decision created an implicit on-disk format
1191 * requirement that we need to support for all eternity, but only
1192 * for single-parity RAID-Z.
1194 * If we intend to skip a sector in the zeroth column for padding
1195 * we must make sure to note this swap. We will never intend to
1196 * skip the first column since at least one data and one parity
1197 * column must appear in each row.
1199 ASSERT(rm->rm_cols >= 2);
1200 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1202 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1203 devidx = rm->rm_col[0].rc_devidx;
1204 o = rm->rm_col[0].rc_offset;
1205 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1206 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1207 rm->rm_col[1].rc_devidx = devidx;
1208 rm->rm_col[1].rc_offset = o;
1210 if (rm->rm_skipstart == 0)
1211 rm->rm_skipstart = 1;
1218 vdev_raidz_map_free(raidz_map_t *rm)
1222 for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1223 zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1225 zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
1229 vdev_child(vdev_t *pvd, uint64_t devidx)
1233 STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1234 if (cvd->v_id == devidx)
1242 * We keep track of whether or not there were any injected errors, so that
1243 * any ereports we generate can note it.
1246 raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
1249 return (zio_checksum_verify(bp, data));
1253 * Generate the parity from the data columns. If we tried and were able to
1254 * read the parity without error, verify that the generated parity matches the
1255 * data we read. If it doesn't, we fire off a checksum error. Return the
1256 * number of such failures.
1259 raidz_parity_verify(raidz_map_t *rm)
1261 void *orig[VDEV_RAIDZ_MAXPARITY];
1265 for (c = 0; c < rm->rm_firstdatacol; c++) {
1266 rc = &rm->rm_col[c];
1267 if (!rc->rc_tried || rc->rc_error != 0)
1269 orig[c] = zfs_alloc(rc->rc_size);
1270 bcopy(rc->rc_data, orig[c], rc->rc_size);
1273 vdev_raidz_generate_parity(rm);
1275 for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1276 rc = &rm->rm_col[c];
1277 if (!rc->rc_tried || rc->rc_error != 0)
1279 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1280 rc->rc_error = ECKSUM;
1283 zfs_free(orig[c], rc->rc_size);
/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 *
 * NOTE(review): 'tgts' points one element into 'tstore' so that both
 * tgts[-1]-style sentinel access and the tgts[n] sentinel set below are
 * in bounds.  Returns the reconstruction code on success -- presumably a
 * bitmask of the parity used, as tested by the caller against
 * (1 << rm_firstdatacol) - 1 -- TODO confirm in vdev_raidz_reconstruct().
 */
1298 vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
1299 off_t offset, uint64_t bytes, int total_errors, int data_errors)
1302 void *orig[VDEV_RAIDZ_MAXPARITY];
1303 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1304 int *tgts = &tstore[1];
1305 int current, next, i, c, n;
1308 ASSERT(total_errors < rm->rm_firstdatacol);
1311 * This simplifies one edge condition.
/*
 * Try combinations of increasing size n, up to the amount of spare
 * parity available beyond the errors already known.
 */
1315 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1317 * Initialize the targets array by finding the first n columns
1318 * that contain no error.
1320 * If there were no data errors, we need to ensure that we're
1321 * always explicitly attempting to reconstruct at least one
1322 * data column. To do this, we simply push the highest target
1323 * up into the data columns.
1325 for (c = 0, i = 0; i < n; i++) {
1326 if (i == n - 1 && data_errors == 0 &&
1327 c < rm->rm_firstdatacol) {
1328 c = rm->rm_firstdatacol;
1331 while (rm->rm_col[c].rc_error != 0) {
1333 ASSERT3S(c, <, rm->rm_cols);
1340 * Setting tgts[n] simplifies the other edge condition.
1342 tgts[n] = rm->rm_cols;
1345 * These buffers were allocated in previous iterations.
1347 for (i = 0; i < n - 1; i++) {
1348 ASSERT(orig[i] != NULL);
/* All columns in a row are the same size, so rc_size of column 0 is used. */
1351 orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);
1354 next = tgts[current];
/*
 * Walk every n-combination of target columns; 'current' indexes the
 * combination element being advanced.
 */
1356 while (current != n) {
1357 tgts[current] = next;
1361 * Save off the original data that we're going to
1362 * attempt to reconstruct.
1364 for (i = 0; i < n; i++) {
1365 ASSERT(orig[i] != NULL);
1368 ASSERT3S(c, <, rm->rm_cols);
1369 rc = &rm->rm_col[c];
1370 bcopy(rc->rc_data, orig[i], rc->rc_size);
1374 * Attempt a reconstruction and exit the outer loop on
/* success is detected via the block-pointer checksum. */
1377 code = vdev_raidz_reconstruct(rm, tgts, n);
1378 if (raidz_checksum_verify(bp, data, bytes) == 0) {
/* Mark the columns we reconstructed over as checksum failures. */
1379 for (i = 0; i < n; i++) {
1381 rc = &rm->rm_col[c];
1382 ASSERT(rc->rc_error == 0);
1383 rc->rc_error = ECKSUM;
1391 * Restore the original data.
1393 for (i = 0; i < n; i++) {
1395 rc = &rm->rm_col[c];
1396 bcopy(orig[i], rc->rc_data, rc->rc_size);
1401 * Find the next valid column after the current
/* position (skip columns that already have a known error). */
1404 for (next = tgts[current] + 1;
1405 next < rm->rm_cols &&
1406 rm->rm_col[next].rc_error != 0; next++)
1409 ASSERT(next <= tgts[current + 1]);
1412 * If that spot is available, we're done here.
1414 if (next != tgts[current + 1])
1418 * Otherwise, find the next valid column after
1419 * the previous position.
1421 for (c = tgts[current - 1] + 1;
1422 rm->rm_col[c].rc_error != 0; c++)
1428 } while (current != n);
/* Release the per-combination scratch buffers before returning. */
1433 for (i = n - 1; i >= 0; i--) {
1434 zfs_free(orig[i], rm->rm_col[0].rc_size);
1441 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1442 off_t offset, size_t bytes)
1444 vdev_t *tvd = vd->v_top;
1449 int unexpected_errors;
1455 int tgts[VDEV_RAIDZ_MAXPARITY];
1458 rc = NULL; /* gcc */
1461 rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1462 vd->v_nchildren, vd->v_nparity);
1465 * Iterate over the columns in reverse order so that we hit the parity
1466 * last -- any errors along the way will force us to read the parity.
1468 for (c = rm->rm_cols - 1; c >= 0; c--) {
1469 rc = &rm->rm_col[c];
1470 cvd = vdev_child(vd, rc->rc_devidx);
1471 if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1472 if (c >= rm->rm_firstdatacol)
1473 rm->rm_missingdata++;
1475 rm->rm_missingparity++;
1476 rc->rc_error = ENXIO;
1477 rc->rc_tried = 1; /* don't even try */
1481 #if 0 /* XXX: Too hard for the boot code. */
1482 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1483 if (c >= rm->rm_firstdatacol)
1484 rm->rm_missingdata++;
1486 rm->rm_missingparity++;
1487 rc->rc_error = ESTALE;
1492 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1493 rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1494 rc->rc_offset, rc->rc_size);
1501 unexpected_errors = 0;
1507 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1508 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1510 for (c = 0; c < rm->rm_cols; c++) {
1511 rc = &rm->rm_col[c];
1514 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1516 if (c < rm->rm_firstdatacol)
1521 if (!rc->rc_skipped)
1522 unexpected_errors++;
1525 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1531 * There are three potential phases for a read:
1532 * 1. produce valid data from the columns read
1533 * 2. read all disks and try again
1534 * 3. perform combinatorial reconstruction
1536 * Each phase is progressively both more expensive and less likely to
1537 * occur. If we encounter more errors than we can repair or all phases
1538 * fail, we have no choice but to return an error.
1542 * If the number of errors we saw was correctable -- less than or equal
1543 * to the number of parity disks read -- attempt to produce data that
1544 * has a valid checksum. Naturally, this case applies in the absence of
1547 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1548 if (data_errors == 0) {
1549 if (raidz_checksum_verify(bp, data, bytes) == 0) {
1551 * If we read parity information (unnecessarily
1552 * as it happens since no reconstruction was
1553 * needed) regenerate and verify the parity.
1554 * We also regenerate parity when resilvering
1555 * so we can write it out to the failed device
1558 if (parity_errors + parity_untried <
1559 rm->rm_firstdatacol) {
1560 n = raidz_parity_verify(rm);
1561 unexpected_errors += n;
1562 ASSERT(parity_errors + n <=
1563 rm->rm_firstdatacol);
1569 * We either attempt to read all the parity columns or
1570 * none of them. If we didn't try to read parity, we
1571 * wouldn't be here in the correctable case. There must
1572 * also have been fewer parity errors than parity
1573 * columns or, again, we wouldn't be in this code path.
1575 ASSERT(parity_untried == 0);
1576 ASSERT(parity_errors < rm->rm_firstdatacol);
1579 * Identify the data columns that reported an error.
1582 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1583 rc = &rm->rm_col[c];
1584 if (rc->rc_error != 0) {
1585 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1590 ASSERT(rm->rm_firstdatacol >= n);
1592 code = vdev_raidz_reconstruct(rm, tgts, n);
1594 if (raidz_checksum_verify(bp, data, bytes) == 0) {
1596 * If we read more parity disks than were used
1597 * for reconstruction, confirm that the other
1598 * parity disks produced correct data. This
1599 * routine is suboptimal in that it regenerates
1600 * the parity that we already used in addition
1601 * to the parity that we're attempting to
1602 * verify, but this should be a relatively
1603 * uncommon case, and can be optimized if it
1604 * becomes a problem. Note that we regenerate
1605 * parity when resilvering so we can write it
1606 * out to failed devices later.
1608 if (parity_errors < rm->rm_firstdatacol - n) {
1609 n = raidz_parity_verify(rm);
1610 unexpected_errors += n;
1611 ASSERT(parity_errors + n <=
1612 rm->rm_firstdatacol);
1621 * This isn't a typical situation -- either we got a read
1622 * error or a child silently returned bad data. Read every
1623 * block so we can try again with as much data and parity as
1624 * we can track down. If we've already been through once
1625 * before, all children will be marked as tried so we'll
1626 * proceed to combinatorial reconstruction.
1628 unexpected_errors = 1;
1629 rm->rm_missingdata = 0;
1630 rm->rm_missingparity = 0;
1633 for (c = 0; c < rm->rm_cols; c++) {
1634 rc = &rm->rm_col[c];
1639 cvd = vdev_child(vd, rc->rc_devidx);
1640 ASSERT(cvd != NULL);
1641 rc->rc_error = cvd->v_read(cvd, NULL,
1642 rc->rc_data, rc->rc_offset, rc->rc_size);
1643 if (rc->rc_error == 0)
1649 * If we managed to read anything more, retry the
1656 * At this point we've attempted to reconstruct the data given the
1657 * errors we detected, and we've attempted to read all columns. There
1658 * must, therefore, be one or more additional problems -- silent errors
1659 * resulting in invalid data rather than explicit I/O errors resulting
1660 * in absent data. We check if there is enough additional data to
1661 * possibly reconstruct the data and then perform combinatorial
1662 * reconstruction over all possible combinations. If that fails,
1665 if (total_errors > rm->rm_firstdatacol) {
1667 } else if (total_errors < rm->rm_firstdatacol &&
1668 (code = vdev_raidz_combrec(rm, bp, data, offset, bytes,
1669 total_errors, data_errors)) != 0) {
1671 * If we didn't use all the available parity for the
1672 * combinatorial reconstruction, verify that the remaining
1673 * parity is correct.
1675 if (code != (1 << rm->rm_firstdatacol) - 1)
1676 (void) raidz_parity_verify(rm);
1679 * We're here because either:
1681 * total_errors == rm_firstdatacol, or
1682 * vdev_raidz_combrec() failed
1684 * In either case, there is enough bad data to prevent
1687 * Start checksum ereports for all children which haven't
1688 * failed, and the IO wasn't speculative.
1694 vdev_raidz_map_free(rm);