4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
27 #include <sys/zfs_context.h>
29 #include <sys/vdev_impl.h>
31 #include <sys/zio_checksum.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
36 * Virtual device vector for RAID-Z.
38 * This vdev supports single, double, and triple parity. For single parity,
39 * we use a simple XOR of all the data columns. For double or triple parity,
40 * we use a special case of Reed-Solomon coding. This extends the
41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44 * former is also based. The latter is designed to provide higher performance
47 * Note that the Plank paper claimed to support arbitrary N+M, but was then
48 * amended six years later identifying a critical flaw that invalidates its
49 * claims. Nevertheless, the technique can be adapted to work for up to
50 * triple parity. For additional parity, the amendment "Note: Correction to
51 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52 * is viable, but the additional complexity means that write performance will
55 * All of the methods above operate on a Galois field, defined over the
56 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
57 * can be expressed with a single byte. Briefly, the operations on the
58 * field are defined as follows:
60 * o addition (+) is represented by a bitwise XOR
61 * o subtraction (-) is therefore identical to addition: A + B = A - B
62 * o multiplication of A by 2 is defined by the following bitwise expression:
66 * (A * 2)_4 = A_3 + A_7
67 * (A * 2)_3 = A_2 + A_7
68 * (A * 2)_2 = A_1 + A_7
72 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
73 * As an aside, this multiplication is derived from the error correcting
74 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
76 * Observe that any number in the field (except for 0) can be expressed as a
77 * power of 2 -- a generator for the field. We store a table of the powers of
78 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
79 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
80 * than field addition). The inverse of a field element A (A^-1) is therefore
81 * A ^ (255 - 1) = A^254.
83 * The up-to-three parity columns, P, Q, R over several data columns,
84 * D_0, ... D_n-1, can be expressed by field operations:
86 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
87 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
88 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
89 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
90 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
92 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
93 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
94 * independent coefficients. (There are no additional coefficients that have
95 * this property which is why the uncorrected Plank method breaks down.)
97 * See the reconstruction code below for how P, Q and R can used individually
98 * or in concert to recover missing data columns.
/*
 * Per-column state for a single RAID-Z I/O.  Each column maps to one child
 * device of the raidz vdev.  rc_gdata caches the recomputed "good" parity
 * used when a checksum ereport compares against known-good data (see
 * vdev_raidz_cksum_finish() below).
 *
 * NOTE(review): this listing is fragmentary -- the embedded original line
 * numbers skip lines throughout the file (here, 110-111), so the closing
 * "} raidz_col_t;" of this typedef is not visible.  Code lines below are
 * preserved byte-for-byte; only comments have been added.
 */
101 typedef struct raidz_col {
102 uint64_t rc_devidx; /* child device index for I/O */
103 uint64_t rc_offset; /* device offset */
104 uint64_t rc_size; /* I/O size */
105 void *rc_data; /* I/O data */
106 void *rc_gdata; /* used to store the "good" version */
107 int rc_error; /* I/O error for this device */
108 uint8_t rc_tried; /* Did we attempt this I/O column? */
109 uint8_t rc_skipped; /* Did we skip this I/O column? */
/*
 * Layout of one logical I/O across the raidz children: the leading
 * rm_firstdatacol columns hold parity (P, Q, R), the remainder hold data.
 * rm_datacopy, rm_reports and rm_freed coordinate the lifetime of this map
 * with outstanding checksum ereports (see vdev_raidz_cksum_report()).
 * The trailing rm_col[1] is used as a variable-length array, sized by
 * rm_scols at allocation time (see vdev_raidz_map_alloc()).
 */
112 typedef struct raidz_map {
113 uint64_t rm_cols; /* Regular column count */
114 uint64_t rm_scols; /* Count including skipped columns */
115 uint64_t rm_bigcols; /* Number of oversized columns */
116 uint64_t rm_asize; /* Actual total I/O size */
117 uint64_t rm_missingdata; /* Count of missing data devices */
118 uint64_t rm_missingparity; /* Count of missing parity devices */
119 uint64_t rm_firstdatacol; /* First data column/parity count */
120 uint64_t rm_nskip; /* Skipped sectors for padding */
121 uint64_t rm_skipstart; /* Column index of padding start */
122 void *rm_datacopy; /* rm_asize-buffer of copied data */
123 uintptr_t rm_reports; /* # of referencing checksum reports */
124 uint8_t rm_freed; /* map no longer has referencing ZIO */
125 uint8_t rm_ecksuminjected; /* checksum error was injected */
126 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
/*
 * Indices of the parity columns within rm_col[]: P is always column 0,
 * Q column 1, R column 2 (data columns start at rm_firstdatacol).
 */
129 #define VDEV_RAIDZ_P 0
130 #define VDEV_RAIDZ_Q 1
131 #define VDEV_RAIDZ_R 2
/*
 * Single-byte multiply by 2 (and 4) in GF(2^8): shift left and, if the
 * high bit was set, reduce by XORing the low byte of the primitive
 * polynomial (0x1d) -- see the field description at the top of the file.
 */
133 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
134 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
137 * We provide a mechanism to perform the field multiplication operation on a
138 * 64-bit value all at once rather than a byte at a time. This works by
139 * creating a mask from the top bit in each byte and using that to
140 * conditionally apply the XOR of 0x1d.
/*
 * NOTE(review): the numbering gaps here (143, 148-149, 151, 154) hide the
 * do { ... } while (0) wrappers of these multi-statement macros.
 */
142 #define VDEV_RAIDZ_64MUL_2(x, mask) \
144 (mask) = (x) & 0x8080808080808080ULL; \
145 (mask) = ((mask) << 1) - ((mask) >> 7); \
146 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
147 ((mask) & 0x1d1d1d1d1d1d1d1d); \
150 #define VDEV_RAIDZ_64MUL_4(x, mask) \
152 VDEV_RAIDZ_64MUL_2((x), mask); \
153 VDEV_RAIDZ_64MUL_2((x), mask); \
/* Global tunable (non-static); zero by default since it is uninitialized. */
157 * Force reconstruction to use the general purpose method.
159 int vdev_raidz_default_to_general;
162 * These two tables represent powers and logs of 2 in the Galois field defined
163 * above. These values were computed by repeatedly multiplying by 2 as above.
/*
 * pow2[i] == 2^i in GF(2^8).  The multiplicative group has order 255, so
 * the final entry wraps back to 0x01 (2^255 == 1).
 */
165 static const uint8_t vdev_raidz_pow2[256] = {
166 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
167 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
168 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
169 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
170 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
171 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
172 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
173 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
174 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
175 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
176 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
177 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
178 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
179 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
180 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
181 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
182 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
183 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
184 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
185 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
186 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
187 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
188 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
189 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
190 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
191 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
192 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
193 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
194 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
195 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
196 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
197 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
/*
 * log2[a] == log base 2 of 'a' in GF(2^8), the inverse of the table above.
 * log2[0] is a placeholder (0 has no logarithm); callers guard against it
 * (see the ASSERT in vdev_raidz_exp2()).
 * NOTE(review): the closing "};" line is hidden by a numbering gap here.
 */
199 static const uint8_t vdev_raidz_log2[256] = {
200 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
201 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
202 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
203 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
204 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
205 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
206 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
207 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
208 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
209 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
210 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
211 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
212 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
213 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
214 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
215 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
216 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
217 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
218 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
219 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
220 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
221 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
222 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
223 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
224 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
225 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
226 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
227 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
228 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
229 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
230 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
231 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
/* Forward declaration: needed by vdev_raidz_cksum_finish() below. */
234 static void vdev_raidz_generate_parity(raidz_map_t *rm);
/*
 * vdev_raidz_exp2(a, exp): multiply 'a' by 2^exp in GF(2^8) via the
 * log/exp tables: a * 2^exp == pow2[(log2[a] + exp) mod 255].
 * NOTE(review): numbering gaps (241-245, 247, 249-251) hide the function's
 * opening lines -- presumably the a == 0 early return and the mod-255
 * reduction of exp before the table lookup; confirm against full source.
 */
237 * Multiply a given number by 2 raised to the given power.
240 vdev_raidz_exp2(uint_t a, int exp)
/* Every nonzero element except 1 has a nonzero logarithm. */
246 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
248 exp += vdev_raidz_log2[a];
252 return (vdev_raidz_pow2[exp]);
/*
 * Free all buffers owned by a raidz_map_t: the separately allocated parity
 * and cached "good"-parity buffers in the leading rm_firstdatacol columns,
 * the optional rm_datacopy snapshot (sized as the sum of the data-column
 * sizes), and finally the map itself, which was allocated with rm_scols
 * trailing columns.
 */
256 vdev_raidz_map_free(raidz_map_t *rm)
261 for (c = 0; c < rm->rm_firstdatacol; c++) {
262 if (rm->rm_col[c].rc_data != NULL)
263 zio_buf_free(rm->rm_col[c].rc_data,
264 rm->rm_col[c].rc_size);
266 if (rm->rm_col[c].rc_gdata != NULL)
267 zio_buf_free(rm->rm_col[c].rc_gdata,
268 rm->rm_col[c].rc_size);
/* Recompute the size of the data copy from the data columns. */
272 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
273 size += rm->rm_col[c].rc_size;
275 if (rm->rm_datacopy != NULL)
276 zio_buf_free(rm->rm_datacopy, size);
278 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
/*
 * zio vsd destructor.  Frees the map immediately only if no checksum
 * reports still reference it; otherwise the last vdev_raidz_cksum_free()
 * call performs the free (it checks rm_freed, presumably set in the
 * numbering gap at 287-288 -- TODO confirm against full source).
 */
282 vdev_raidz_map_free_vsd(zio_t *zio)
284 raidz_map_t *rm = zio->io_vsd;
286 ASSERT0(rm->rm_freed);
289 if (rm->rm_reports == 0)
290 vdev_raidz_map_free(rm);
/*
 * Checksum-report free callback: drop one reference on the raidz map and,
 * if this was the last report and the owning zio has already released the
 * map (rm_freed set), free the map now.
 */
295 vdev_raidz_cksum_free(void *arg, size_t ignored)
297 raidz_map_t *rm = arg;
299 ASSERT3U(rm->rm_reports, >, 0);
301 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
302 vdev_raidz_map_free(rm);
/*
 * Finish a deferred checksum ereport for column 'c' (stashed in
 * zcr_cbinfo) once the known-good data is available.  For a parity column
 * we must first regenerate the "good" parity from good_data; the result is
 * computed once per map and cached in rc_gdata.  For a data column we
 * simply offset into good_data.  With good_data == NULL the ereport is
 * finished without a comparison.
 */
306 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
308 raidz_map_t *rm = zcr->zcr_cbdata;
309 size_t c = zcr->zcr_cbinfo;
312 const char *good = NULL;
313 const char *bad = rm->rm_col[c].rc_data;
315 if (good_data == NULL) {
316 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
/* Parity column: compare against regenerated good parity. */
320 if (c < rm->rm_firstdatacol) {
322 * The first time through, calculate the parity blocks for
323 * the good data (this relies on the fact that the good
324 * data never changes for a given logical ZIO)
326 if (rm->rm_col[0].rc_gdata == NULL) {
327 char *bad_parity[VDEV_RAIDZ_MAXPARITY];
331 * Set up the rm_col[]s to generate the parity for
332 * good_data, first saving the parity bufs and
333 * replacing them with buffers to hold the result.
335 for (x = 0; x < rm->rm_firstdatacol; x++) {
336 bad_parity[x] = rm->rm_col[x].rc_data;
337 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
338 zio_buf_alloc(rm->rm_col[x].rc_size);
341 /* fill in the data columns from good_data */
342 buf = (char *)good_data;
343 for (; x < rm->rm_cols; x++) {
344 rm->rm_col[x].rc_data = buf;
345 buf += rm->rm_col[x].rc_size;
349 * Construct the parity from the good data.
351 vdev_raidz_generate_parity(rm);
353 /* restore everything back to its original state */
354 for (x = 0; x < rm->rm_firstdatacol; x++)
355 rm->rm_col[x].rc_data = bad_parity[x];
/* Data columns point back into the rm_datacopy snapshot. */
357 buf = rm->rm_datacopy;
358 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
359 rm->rm_col[x].rc_data = buf;
360 buf += rm->rm_col[x].rc_size;
364 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
365 good = rm->rm_col[c].rc_gdata;
367 /* adjust good_data to point at the start of our column */
370 for (x = rm->rm_firstdatacol; x < c; x++)
371 good += rm->rm_col[x].rc_size;
374 /* we drop the ereport if it ends up that the data was good */
375 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
379 * Invoked indirectly by zfs_ereport_start_checksum(), called
380 * below when our read operation fails completely. The main point
381 * is to keep a copy of everything we read from disk, so that at
382 * vdev_raidz_cksum_finish() time we can compare it with the good data.
385 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
387 size_t c = (size_t)(uintptr_t)arg;
390 raidz_map_t *rm = zio->io_vsd;
393 /* set up the report and bump the refcount */
394 zcr->zcr_cbdata = rm;
396 zcr->zcr_finish = vdev_raidz_cksum_finish;
397 zcr->zcr_free = vdev_raidz_cksum_free;
/*
 * NOTE(review): the rm_reports increment and zcr_cbinfo assignment are
 * hidden by numbering gaps (395, 398-399) -- the ASSERT below implies
 * the refcount was just bumped.
 */
400 ASSERT3U(rm->rm_reports, >, 0);
/* Data already copied aside for an earlier report on this map. */
402 if (rm->rm_datacopy != NULL)
406 * It's the first time we're called for this raidz_map_t, so we need
407 * to copy the data aside; there's no guarantee that our zio's buffer
408 * won't be re-used for something else.
410 * Our parity data is already in separate buffers, so there's no need
415 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
416 size += rm->rm_col[c].rc_size;
418 buf = rm->rm_datacopy = zio_buf_alloc(size);
420 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
421 raidz_col_t *col = &rm->rm_col[c];
423 bcopy(col->rc_data, buf, col->rc_size);
428 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
/*
 * vsd ops vector installed on the zio by vdev_raidz_map_alloc():
 * free callback plus the checksum-report hook above.
 */
431 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
432 vdev_raidz_map_free_vsd,
433 vdev_raidz_cksum_report
/*
 * Build the raidz_map_t describing how a logical zio is spread across the
 * dcols children.  In sector (1 << unit_shift) units: b is the starting
 * offset, s the I/O size; q/r divide the data sectors among the data
 * columns, bc counts the "big" columns that get one extra sector, and tot
 * is the total sector count including parity.  For single-parity vdevs the
 * first two columns are swapped every 1MB to rotate parity (see the long
 * comment below).
 */
437 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
441 uint64_t b = zio->io_offset >> unit_shift;
442 uint64_t s = zio->io_size >> unit_shift;
443 uint64_t f = b % dcols;
444 uint64_t o = (b / dcols) << unit_shift;
445 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
/* q full rows of data, r leftover sectors, bc columns in the last row. */
447 q = s / (dcols - nparity);
448 r = s - q * (dcols - nparity);
449 bc = (r == 0 ? 0 : r + nparity);
450 tot = s + nparity * (q + (r == 0 ? 0 : 1));
/*
 * NOTE(review): the computation of acols is hidden in the numbering gap
 * (451-453, 455-459); only the scols bound and the assert survive here.
 */
454 scols = MIN(dcols, roundup(bc, nparity + 1));
460 ASSERT3U(acols, <=, scols);
462 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
465 rm->rm_scols = scols;
467 rm->rm_skipstart = bc;
468 rm->rm_missingdata = 0;
469 rm->rm_missingparity = 0;
470 rm->rm_firstdatacol = nparity;
471 rm->rm_datacopy = NULL;
474 rm->rm_ecksuminjected = 0;
/* Initialize every column's device/offset/size bookkeeping. */
478 for (c = 0; c < scols; c++) {
483 coff += 1ULL << unit_shift;
485 rm->rm_col[c].rc_devidx = col;
486 rm->rm_col[c].rc_offset = coff;
487 rm->rm_col[c].rc_data = NULL;
488 rm->rm_col[c].rc_gdata = NULL;
489 rm->rm_col[c].rc_error = 0;
490 rm->rm_col[c].rc_tried = 0;
491 rm->rm_col[c].rc_skipped = 0;
/* Skipped columns get size 0; "big" columns get one extra sector. */
494 rm->rm_col[c].rc_size = 0;
496 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
498 rm->rm_col[c].rc_size = q << unit_shift;
500 asize += rm->rm_col[c].rc_size;
503 ASSERT3U(asize, ==, tot << unit_shift);
504 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
505 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
506 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
507 ASSERT3U(rm->rm_nskip, <=, nparity);
/*
 * FREE zios carry no data, so only allocate/assign buffers otherwise:
 * fresh buffers for parity, and slices of zio->io_data for data columns.
 */
509 if (zio->io_type != ZIO_TYPE_FREE) {
510 for (c = 0; c < rm->rm_firstdatacol; c++) {
511 rm->rm_col[c].rc_data =
512 zio_buf_alloc(rm->rm_col[c].rc_size);
515 rm->rm_col[c].rc_data = zio->io_data;
517 for (c = c + 1; c < acols; c++) {
518 rm->rm_col[c].rc_data =
519 (char *)rm->rm_col[c - 1].rc_data +
520 rm->rm_col[c - 1].rc_size;
525 * If all data stored spans all columns, there's a danger that parity
526 * will always be on the same device and, since parity isn't read
527 * during normal operation, that that device's I/O bandwidth won't be
528 * used effectively. We therefore switch the parity every 1MB.
530 * ... at least that was, ostensibly, the theory. As a practical
531 * matter unless we juggle the parity between all devices evenly, we
532 * won't see any benefit. Further, occasional writes that aren't a
533 * multiple of the LCM of the number of children and the minimum
534 * stripe width are sufficient to avoid pessimal behavior.
535 * Unfortunately, this decision created an implicit on-disk format
536 * requirement that we need to support for all eternity, but only
537 * for single-parity RAID-Z.
539 * If we intend to skip a sector in the zeroth column for padding
540 * we must make sure to note this swap. We will never intend to
541 * skip the first column since at least one data and one parity
542 * column must appear in each row.
544 ASSERT(rm->rm_cols >= 2);
545 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
547 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
548 devidx = rm->rm_col[0].rc_devidx;
549 o = rm->rm_col[0].rc_offset;
550 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
551 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
552 rm->rm_col[1].rc_devidx = devidx;
553 rm->rm_col[1].rc_offset = o;
555 if (rm->rm_skipstart == 0)
556 rm->rm_skipstart = 1;
560 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
/*
 * Generate single (P) parity: the XOR of all data columns, computed
 * 64 bits at a time.  The first data column initializes P (hidden in a
 * numbering gap, presumably *p = *src); subsequent columns XOR in.
 * Columns shorter than P are implicitly zero-padded (ccount <= pcount).
 */
565 vdev_raidz_generate_parity_p(raidz_map_t *rm)
567 uint64_t *p, *src, pcount, ccount, i;
570 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
572 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
573 src = rm->rm_col[c].rc_data;
574 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
575 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
577 if (c == rm->rm_firstdatacol) {
578 ASSERT(ccount == pcount);
579 for (i = 0; i < ccount; i++, src++, p++) {
583 ASSERT(ccount <= pcount);
584 for (i = 0; i < ccount; i++, src++, p++) {
/*
 * Generate P and Q (double) parity in one pass.  Q implements the Horner
 * evaluation from the file header: for each data column the running Q is
 * multiplied by 2 in GF(2^8) (8 bytes at a time via VDEV_RAIDZ_64MUL_2)
 * before the new column is added.  Short columns are treated as zeros:
 * nothing for P, but Q must still be multiplied through.
 */
592 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
594 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
597 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
598 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
599 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
601 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
602 src = rm->rm_col[c].rc_data;
603 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
604 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
606 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
/* First data column initializes P and Q (zero-fill past ccnt). */
608 if (c == rm->rm_firstdatacol) {
609 ASSERT(ccnt == pcnt || ccnt == 0);
610 for (i = 0; i < ccnt; i++, src++, p++, q++) {
614 for (; i < pcnt; i++, src++, p++, q++) {
619 ASSERT(ccnt <= pcnt);
622 * Apply the algorithm described above by multiplying
623 * the previous result and adding in the new value.
625 for (i = 0; i < ccnt; i++, src++, p++, q++) {
628 VDEV_RAIDZ_64MUL_2(*q, mask);
633 * Treat short columns as though they are full of 0s.
634 * Note that there's therefore nothing needed for P.
636 for (; i < pcnt; i++, q++) {
637 VDEV_RAIDZ_64MUL_2(*q, mask);
/*
 * Generate P, Q and R (triple) parity in one pass.  Identical structure to
 * the PQ version above, with R using generator 4 (VDEV_RAIDZ_64MUL_4)
 * where Q uses generator 2.  All three parity columns must be the same
 * size, as the asserts below verify.
 */
644 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
646 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
649 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
650 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
651 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
652 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
653 rm->rm_col[VDEV_RAIDZ_R].rc_size);
655 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
656 src = rm->rm_col[c].rc_data;
657 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
658 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
659 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
661 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
/* First data column initializes all three parity rows. */
663 if (c == rm->rm_firstdatacol) {
664 ASSERT(ccnt == pcnt || ccnt == 0);
665 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
670 for (; i < pcnt; i++, src++, p++, q++, r++) {
676 ASSERT(ccnt <= pcnt);
679 * Apply the algorithm described above by multiplying
680 * the previous result and adding in the new value.
682 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
685 VDEV_RAIDZ_64MUL_2(*q, mask);
688 VDEV_RAIDZ_64MUL_4(*r, mask);
693 * Treat short columns as though they are full of 0s.
694 * Note that there's therefore nothing needed for P.
696 for (; i < pcnt; i++, q++, r++) {
697 VDEV_RAIDZ_64MUL_2(*q, mask);
698 VDEV_RAIDZ_64MUL_4(*r, mask);
705 * Generate RAID parity in the first virtual columns according to the number of
706 * parity columns available.
/*
 * Dispatch on rm_firstdatacol (== nparity): 1 -> P, 2 -> PQ, 3 -> PQR.
 * NOTE(review): the switch keyword, case labels, and break statements fall
 * in numbering gaps here; only the call sites and the panic default remain
 * visible.
 */
709 vdev_raidz_generate_parity(raidz_map_t *rm)
711 switch (rm->rm_firstdatacol) {
713 vdev_raidz_generate_parity_p(rm);
716 vdev_raidz_generate_parity_pq(rm);
719 vdev_raidz_generate_parity_pqr(rm);
722 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
/*
 * Reconstruct a single missing data column x (taken from tgts[]) using
 * P parity alone: copy P into column x, then XOR every other data column
 * back out.  Returns a bitmask naming the parity used (1 << VDEV_RAIDZ_P).
 */
727 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
729 uint64_t *dst, *src, xcount, ccount, count, i;
734 ASSERT(x >= rm->rm_firstdatacol);
735 ASSERT(x < rm->rm_cols);
737 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
738 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
/* Start from the parity column ... */
741 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
742 dst = rm->rm_col[x].rc_data;
743 for (i = 0; i < xcount; i++, dst++, src++) {
/* ... then XOR out every surviving data column (skipping x itself,
 * presumably in the numbering gap at 750-753). */
747 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
748 src = rm->rm_col[c].rc_data;
749 dst = rm->rm_col[x].rc_data;
754 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
755 count = MIN(ccount, xcount);
757 for (i = 0; i < count; i++, dst++, src++) {
762 return (1 << VDEV_RAIDZ_P);
/*
 * Reconstruct a single missing data column x using Q parity alone:
 * recompute Q over the surviving columns (treating x as zero) into
 * column x via the Horner scheme, XOR in the stored Q, then divide by
 * x's coefficient -- i.e. multiply by 2^(255 - (ndevs - 1 - x)) one byte
 * at a time.  Returns (1 << VDEV_RAIDZ_Q).
 */
766 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
768 uint64_t *dst, *src, xcount, ccount, count, mask, i;
775 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
776 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
778 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
779 src = rm->rm_col[c].rc_data;
780 dst = rm->rm_col[x].rc_data;
/* Column x itself contributes zeros (gap at 781-784 presumably
 * substitutes a zero/short source for c == x). */
785 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
787 count = MIN(ccount, xcount);
789 if (c == rm->rm_firstdatacol) {
790 for (i = 0; i < count; i++, dst++, src++) {
793 for (; i < xcount; i++, dst++) {
798 for (i = 0; i < count; i++, dst++, src++) {
799 VDEV_RAIDZ_64MUL_2(*dst, mask);
803 for (; i < xcount; i++, dst++) {
804 VDEV_RAIDZ_64MUL_2(*dst, mask);
/* XOR in stored Q and undo x's coefficient byte-by-byte. */
809 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
810 dst = rm->rm_col[x].rc_data;
811 exp = 255 - (rm->rm_cols - 1 - x);
813 for (i = 0; i < xcount; i++, dst++, src++) {
815 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
816 *b = vdev_raidz_exp2(*b, exp);
820 return (1 << VDEV_RAIDZ_Q);
/*
 * Reconstruct two missing data columns x and y using both P and Q.
 * Strategy: stash the real P/Q pointers, substitute scratch buffers, zero
 * the sizes of x and y, and rerun parity generation to get Pxy/Qxy (parity
 * as if x and y held zeros).  Solving the two linear equations shown in
 * the comment below then yields D_x and D_y.  Returns the mask
 * (1 << P) | (1 << Q).
 */
824 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
826 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
828 uint64_t xsize, ysize, i;
834 ASSERT(x >= rm->rm_firstdatacol);
835 ASSERT(y < rm->rm_cols);
837 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
840 * Move the parity data aside -- we're going to compute parity as
841 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
842 * reuse the parity generation mechanism without trashing the actual
843 * parity so we make those columns appear to be full of zeros by
844 * setting their lengths to zero.
846 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
847 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
848 xsize = rm->rm_col[x].rc_size;
849 ysize = rm->rm_col[y].rc_size;
851 rm->rm_col[VDEV_RAIDZ_P].rc_data =
852 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
853 rm->rm_col[VDEV_RAIDZ_Q].rc_data =
854 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
855 rm->rm_col[x].rc_size = 0;
856 rm->rm_col[y].rc_size = 0;
858 vdev_raidz_generate_parity_pq(rm);
860 rm->rm_col[x].rc_size = xsize;
861 rm->rm_col[y].rc_size = ysize;
865 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
866 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
867 xd = rm->rm_col[x].rc_data;
868 yd = rm->rm_col[y].rc_data;
872 * Pxy = P + D_x + D_y
873 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
875 * We can then solve for D_x:
876 * D_x = A * (P + Pxy) + B * (Q + Qxy)
878 * A = 2^(x - y) * (2^(x - y) + 1)^-1
879 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
881 * With D_x in hand, we can easily solve for D_y:
882 * D_y = P + Pxy + D_x
/* Precompute A and B as exponents (logs) for fast per-byte multiply. */
885 a = vdev_raidz_pow2[255 + x - y];
886 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
887 tmp = 255 - vdev_raidz_log2[a ^ 1];
889 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
890 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
892 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
893 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
894 vdev_raidz_exp2(*q ^ *qxy, bexp);
/* D_y only exists for the first ysize bytes (gap at 895-896,
 * presumably an i < ysize guard). */
897 *yd = *p ^ *pxy ^ *xd;
/* Release the scratch Pxy/Qxy buffers ... */
900 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
901 rm->rm_col[VDEV_RAIDZ_P].rc_size);
902 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
903 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
906 * Restore the saved parity data.
908 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
909 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
911 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
916 * In the general case of reconstruction, we must solve the system of linear
917 * equations defined by the coefficients used to generate parity as well as
918 * the contents of the data and parity disks. This can be expressed with
919 * vectors for the original data (D) and the actual data (d) and parity (p)
920 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
924 * | V | | D_0 | | p_m-1 |
925 * | | x | : | = | d_0 |
926 * | I | | D_n-1 | | : |
927 * | | ~~ ~~ | d_n-1 |
930 * I is simply a square identity matrix of size n, and V is a Vandermonde
931 * matrix defined by the coefficients we chose for the various parity columns
932 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
933 * computation as well as linear separability.
936 * | 1 .. 1 1 1 | | p_0 |
937 * | 2^n-1 .. 4 2 1 | __ __ | : |
938 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
939 * | 1 .. 0 0 0 | | D_1 | | d_0 |
940 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
941 * | : : : : | | : | | d_2 |
942 * | 0 .. 1 0 0 | | D_n-1 | | : |
943 * | 0 .. 0 1 0 | ~~ ~~ | : |
944 * | 0 .. 0 0 1 | | d_n-1 |
947 * Note that I, V, d, and p are known. To compute D, we must invert the
948 * matrix and use the known data and parity values to reconstruct the unknown
949 * data values. We begin by removing the rows in V|I and d|p that correspond
950 * to failed or missing columns; we then make V|I square (n x n) and d|p
951 * sized n by removing rows corresponding to unused parity from the bottom up
952 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
953 * using Gauss-Jordan elimination. In the example below we use m=3 parity
954 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
956 * | 1 1 1 1 1 1 1 1 |
957 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
958 * | 19 205 116 29 64 16 4 1 | / /
959 * | 1 0 0 0 0 0 0 0 | / /
960 * | 0 1 0 0 0 0 0 0 | <--' /
961 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
962 * | 0 0 0 1 0 0 0 0 |
963 * | 0 0 0 0 1 0 0 0 |
964 * | 0 0 0 0 0 1 0 0 |
965 * | 0 0 0 0 0 0 1 0 |
966 * | 0 0 0 0 0 0 0 1 |
969 * | 1 1 1 1 1 1 1 1 |
970 * | 128 64 32 16 8 4 2 1 |
971 * | 19 205 116 29 64 16 4 1 |
972 * | 1 0 0 0 0 0 0 0 |
973 * | 0 1 0 0 0 0 0 0 |
974 * (V|I)' = | 0 0 1 0 0 0 0 0 |
975 * | 0 0 0 1 0 0 0 0 |
976 * | 0 0 0 0 1 0 0 0 |
977 * | 0 0 0 0 0 1 0 0 |
978 * | 0 0 0 0 0 0 1 0 |
979 * | 0 0 0 0 0 0 0 1 |
982 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
983 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
984 * matrix is not singular.
986 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
987 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
988 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
989 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
990 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
991 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
992 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
993 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
996 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
997 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
998 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
999 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1000 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1001 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1002 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1003 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1006 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1007 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1008 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1009 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1010 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1011 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1012 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1013 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1016 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1017 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1018 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1019 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1020 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1021 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1022 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1023 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1026 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1027 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1028 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1029 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1030 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1031 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1032 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1033 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1036 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1037 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1038 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1039 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1040 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1041 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1042 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1043 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1046 * | 0 0 1 0 0 0 0 0 |
1047 * | 167 100 5 41 159 169 217 208 |
1048 * | 166 100 4 40 158 168 216 209 |
1049 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1050 * | 0 0 0 0 1 0 0 0 |
1051 * | 0 0 0 0 0 1 0 0 |
1052 * | 0 0 0 0 0 0 1 0 |
1053 * | 0 0 0 0 0 0 0 1 |
1056 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1057 * of the missing data.
1059 * As is apparent from the example above, the only non-trivial rows in the
1060 * inverse matrix correspond to the data disks that we're trying to
1061 * reconstruct. Indeed, those are the only rows we need as the others would
1062 * only be useful for reconstructing data known or assumed to be valid. For
1063 * that reason, we only build the coefficients in the rows that correspond to
/*
 * Fill in the rows of the dispersal matrix V (see the long comment above)
 * for the parity rows of interest: map[i] selects the generator (0 -> P,
 * 1 -> Q, 2 -> R per the assert), and each entry is a power of 2 in
 * GF(2^8).  NOTE(review): large numbering gaps here (1070-1074,
 * 1083-1088, 1090-1092) hide the declarations and the computation of
 * 'pow'; only the skeleton is visible in this listing.
 */
1069 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1075 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1078 * Fill in the missing rows of interest.
1080 for (i = 0; i < nmap; i++) {
1081 ASSERT3S(0, <=, map[i]);
1082 ASSERT3S(map[i], <=, 2);
1089 for (j = 0; j < n; j++) {
1093 rows[i][j] = vdev_raidz_pow2[pow];
/*
 * Invert the reduced (V|I)' matrix via Gauss-Jordan elimination over
 * GF(2^8), producing in invrows[] only the rows needed to recover the
 * 'nmissing' missing data columns.  used[] lists the surviving columns:
 * parity first, then data.  Division is done as multiplication by the
 * inverse, computed through the log/exp tables (x^-1 == x^(255 - log x)).
 */
1099 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1100 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1106 * Assert that the first nmissing entries from the array of used
1107 * columns correspond to parity columns and that subsequent entries
1108 * correspond to data columns.
1110 for (i = 0; i < nmissing; i++) {
1111 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1113 for (; i < n; i++) {
1114 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1118 * First initialize the storage where we'll compute the inverse rows.
1120 for (i = 0; i < nmissing; i++) {
1121 for (j = 0; j < n; j++) {
1122 invrows[i][j] = (i == j) ? 1 : 0;
1127 * Subtract all trivial rows from the rows of consequence.
1129 for (i = 0; i < nmissing; i++) {
1130 for (j = nmissing; j < n; j++) {
1131 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1132 jj = used[j] - rm->rm_firstdatacol;
1134 invrows[i][j] = rows[i][jj];
1140 * For each of the rows of interest, we must normalize it and subtract
1141 * a multiple of it from the other rows.
1143 for (i = 0; i < nmissing; i++) {
1144 for (j = 0; j < missing[i]; j++) {
1145 ASSERT0(rows[i][j]);
1147 ASSERT3U(rows[i][missing[i]], !=, 0);
1150 * Compute the inverse of the first element and multiply each
1151 * element in the row by that value.
1153 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1155 for (j = 0; j < n; j++) {
1156 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1157 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
/* Eliminate this pivot column from every other row of interest. */
1160 for (ii = 0; ii < nmissing; ii++) {
1164 ASSERT3U(rows[ii][missing[i]], !=, 0);
1166 log = vdev_raidz_log2[rows[ii][missing[i]]];
1168 for (j = 0; j < n; j++) {
1170 vdev_raidz_exp2(rows[i][j], log);
1172 vdev_raidz_exp2(invrows[i][j], log);
1178 * Verify that the data that is left in the rows are properly part of
1179 * an identity matrix.
1181 for (i = 0; i < nmissing; i++) {
1182 for (j = 0; j < n; j++) {
1183 if (j == missing[i]) {
1184 ASSERT3U(rows[i][j], ==, 1);
1186 ASSERT0(rows[i][j]);
/*
 * Regenerate the nmissing missing data columns by multiplying the surviving
 * columns (parity first, then data, per `used') through the inverse matrix
 * `invrows'.  Multiplication in GF(2^8) is done via a precomputed log table
 * (`invlog') so each per-byte multiply is an addition of logs plus a table
 * lookup.
 * NOTE(review): excerpt elides interior lines — the accumulation into
 * dst[cc][x] and the zero-source fast path are not visible here.
 */
1193 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1194 int *missing, uint8_t **invrows, const uint8_t *used)
1199 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1200 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1204 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
/* One scratch allocation holds all nmissing log-domain rows. */
1208 psize = sizeof (invlog[0][0]) * n * nmissing;
1209 p = kmem_alloc(psize, KM_SLEEP);
1211 for (pp = p, i = 0; i < nmissing; i++) {
/* Convert the inverse-matrix coefficients to log domain once, up front. */
1216 for (i = 0; i < nmissing; i++) {
1217 for (j = 0; j < n; j++) {
1218 ASSERT3U(invrows[i][j], !=, 0);
1219 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1223 for (i = 0; i < n; i++) {
1225 ASSERT3U(c, <, rm->rm_cols);
1227 src = rm->rm_col[c].rc_data;
1228 ccount = rm->rm_col[c].rc_size;
/* Resolve each missing logical column to its physical column. */
1229 for (j = 0; j < nmissing; j++) {
1230 cc = missing[j] + rm->rm_firstdatacol;
1231 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1232 ASSERT3U(cc, <, rm->rm_cols);
1233 ASSERT3U(cc, !=, c);
1235 dst[j] = rm->rm_col[cc].rc_data;
1236 dcount[j] = rm->rm_col[cc].rc_size;
1239 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1241 for (x = 0; x < ccount; x++, src++) {
1243 log = vdev_raidz_log2[*src];
1245 for (cc = 0; cc < nmissing; cc++) {
/* Short columns at the end of the map have fewer bytes. */
1246 if (x >= dcount[cc])
/* Wrap the log sum into [0, 255) — exponents are mod 255. */
1252 if ((ll = log + invlog[cc][i]) >= 255)
1254 val = vdev_raidz_pow2[ll];
1265 kmem_free(p, psize);
/*
 * General-purpose reconstruction entry point: determine which data columns
 * are missing, select parity columns to substitute for them, then build,
 * invert, and apply the reconstruction matrix.  Returns (via `code',
 * presumably — the return is elided here) a bitmask identifying which
 * parity columns were used; TODO confirm against the full source.
 * NOTE(review): excerpt elides interior lines (declarations, the parity
 * selection body around original lines 1311-1321, pointer carving at 1329+).
 */
1269 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1273 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1274 int parity_map[VDEV_RAIDZ_MAXPARITY];
1279 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1280 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
/* n = number of data columns in the map. */
1286 n = rm->rm_cols - rm->rm_firstdatacol;
1289 * Figure out which data columns are missing.
1292 for (t = 0; t < ntgts; t++) {
1293 if (tgts[t] >= rm->rm_firstdatacol) {
1294 missing_rows[nmissing_rows++] =
1295 tgts[t] - rm->rm_firstdatacol;
1300 * Figure out which parity columns to use to help generate the missing
1303 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1305 ASSERT(c < rm->rm_firstdatacol);
1308 * Skip any targeted parity columns.
1310 if (c == tgts[tt]) {
1322 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
/* One allocation backs rows, invrows, and the used[] array. */
1324 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1325 nmissing_rows * n + sizeof (used[0]) * n;
1326 p = kmem_alloc(psize, KM_SLEEP);
1328 for (pp = p, i = 0; i < nmissing_rows; i++) {
/* used[] lists contributing columns: chosen parity first... */
1336 for (i = 0; i < nmissing_rows; i++) {
1337 used[i] = parity_map[i];
/* ...then every surviving (non-missing) data column. */
1340 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1341 if (tt < nmissing_rows &&
1342 c == missing_rows[tt] + rm->rm_firstdatacol) {
1353 * Initialize the interesting rows of the matrix.
1355 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1358 * Invert the matrix.
1360 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1364 * Reconstruct the missing data using the generated matrix.
1366 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1369 kmem_free(p, psize);
/*
 * Top-level reconstruction dispatcher.  Classify each column as bad parity,
 * bad data, or valid parity, then try the hand-optimized P / Q / P+Q
 * routines when applicable before falling back to the general matrix-based
 * reconstruction.  `t' must be a sorted list of `nt' target columns.
 * NOTE(review): excerpt elides interior lines (tgts[] population, parts of
 * the switch over ntgts, and the final return).
 */
1375 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1377 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1381 int nbadparity, nbaddata;
1382 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1385 * The tgts list must already be sorted.
1387 for (i = 1; i < nt; i++) {
1388 ASSERT(t[i] > t[i - 1]);
/* Start pessimistic: assume all parity bad, all data good, and refine. */
1391 nbadparity = rm->rm_firstdatacol;
1392 nbaddata = rm->rm_cols - nbadparity;
1394 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1395 if (c < rm->rm_firstdatacol)
1396 parity_valid[c] = B_FALSE;
1398 if (i < nt && c == t[i]) {
1401 } else if (rm->rm_col[c].rc_error != 0) {
1403 } else if (c >= rm->rm_firstdatacol) {
1406 parity_valid[c] = B_TRUE;
1411 ASSERT(ntgts >= nt);
1412 ASSERT(nbaddata >= 0);
1413 ASSERT(nbaddata + nbadparity == ntgts);
/* dt points past the bad parity entries, at the bad data targets. */
1415 dt = &tgts[nbadparity];
1418 * See if we can use any of our optimized reconstruction routines.
1420 if (!vdev_raidz_default_to_general) {
1423 if (parity_valid[VDEV_RAIDZ_P])
1424 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1426 ASSERT(rm->rm_firstdatacol > 1);
1428 if (parity_valid[VDEV_RAIDZ_Q])
1429 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1431 ASSERT(rm->rm_firstdatacol > 2);
1435 ASSERT(rm->rm_firstdatacol > 1);
1437 if (parity_valid[VDEV_RAIDZ_P] &&
1438 parity_valid[VDEV_RAIDZ_Q])
1439 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1441 ASSERT(rm->rm_firstdatacol > 2);
/* Fall back to the general Reed-Solomon path. */
1447 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1448 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
/*
 * vdev open entry point for RAID-Z.  Validate the parity/children geometry,
 * open all children, and derive the aggregate asize/max_asize/ashift from
 * the child devices.  More child failures than parity devices means the
 * vdev cannot be opened.
 * NOTE(review): excerpt elides interior lines (remaining declarations and
 * the final return path).
 */
1454 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1458 uint64_t nparity = vd->vdev_nparity;
1463 ASSERT(nparity > 0);
/* Geometry sanity: parity bounded, and at least nparity+1 children. */
1465 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1466 vd->vdev_children < nparity + 1) {
1467 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1468 return (SET_ERROR(EINVAL));
1471 vdev_open_children(vd);
1473 for (c = 0; c < vd->vdev_children; c++) {
1474 cvd = vd->vdev_child[c];
1476 if (cvd->vdev_open_error != 0) {
1477 lasterror = cvd->vdev_open_error;
/*
 * The -1/+1 dance takes the minimum while treating a child size
 * of 0 (from a failed open) as "no constraint" rather than zero.
 */
1482 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1483 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1484 *ashift = MAX(*ashift, cvd->vdev_ashift);
1487 *asize *= vd->vdev_children;
1488 *max_asize *= vd->vdev_children;
/* More failed children than parity: data is unrecoverable. */
1490 if (numerrors > nparity) {
1491 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
/*
 * vdev close entry point: close every child vdev.
 */
1499 vdev_raidz_close(vdev_t *vd)
1503 for (c = 0; c < vd->vdev_children; c++)
1504 vdev_close(vd->vdev_child[c]);
/*
 * Convert a physical (logical data) size to the allocated size on the
 * RAID-Z vdev: round psize up to whole sectors, add the parity sectors
 * needed to cover those data sectors, then round the total up to a
 * multiple of (nparity + 1) sectors to account for skip sectors.
 */
1508 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1511 uint64_t ashift = vd->vdev_top->vdev_ashift;
1512 uint64_t cols = vd->vdev_children;
1513 uint64_t nparity = vd->vdev_nparity;
/* Number of data sectors, rounding psize up to a sector boundary. */
1515 asize = ((psize - 1) >> ashift) + 1;
/* nparity parity sectors per stripe of (cols - nparity) data sectors. */
1516 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1517 asize = roundup(asize, nparity + 1) << ashift;
/*
 * Completion callback for child I/Os: record the child's error status in
 * its raidz column so the parent's done routine can classify it.
 */
1523 vdev_raidz_child_done(zio_t *zio)
1525 raidz_col_t *rc = zio->io_private;
1527 rc->rc_error = zio->io_error;
/*
 * vdev I/O start entry point.  Map the logical I/O onto per-child columns,
 * then dispatch child I/Os: FREE fans out to every column; WRITE generates
 * parity, writes all columns, and issues optional no-data writes over skip
 * sectors to help aggregation; READ issues reads for data columns (and for
 * parity too when data is already known missing or when scrubbing or
 * resilvering), iterating in reverse so parity columns are examined last.
 * NOTE(review): excerpt elides interior lines (declarations, some loop
 * closers, and the rc_skipped bookkeeping).
 */
1533 vdev_raidz_io_start(zio_t *zio)
1535 vdev_t *vd = zio->io_vd;
1536 vdev_t *tvd = vd->vdev_top;
1542 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1545 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1547 if (zio->io_type == ZIO_TYPE_FREE) {
1548 for (c = 0; c < rm->rm_cols; c++) {
1549 rc = &rm->rm_col[c];
1550 cvd = vd->vdev_child[rc->rc_devidx];
1551 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1552 rc->rc_offset, rc->rc_data, rc->rc_size,
1553 zio->io_type, zio->io_priority, 0,
1554 vdev_raidz_child_done, rc));
1556 return (ZIO_PIPELINE_CONTINUE);
1559 if (zio->io_type == ZIO_TYPE_WRITE) {
/* Compute parity before any column is written. */
1560 vdev_raidz_generate_parity(rm);
1562 for (c = 0; c < rm->rm_cols; c++) {
1563 rc = &rm->rm_col[c];
1564 cvd = vd->vdev_child[rc->rc_devidx];
1565 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1566 rc->rc_offset, rc->rc_data, rc->rc_size,
1567 zio->io_type, zio->io_priority, 0,
1568 vdev_raidz_child_done, rc));
1572 * Generate optional I/Os for any skipped sectors to improve
1573 * aggregation contiguity.
1575 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1576 ASSERT(c <= rm->rm_scols);
/* Wrap around the column index at the end of the stripe. */
1577 if (c == rm->rm_scols)
1579 rc = &rm->rm_col[c];
1580 cvd = vd->vdev_child[rc->rc_devidx];
/* NODATA|OPTIONAL: purely a hint; no buffer, no completion callback. */
1581 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1582 rc->rc_offset + rc->rc_size, NULL,
1583 1 << tvd->vdev_ashift,
1584 zio->io_type, zio->io_priority,
1585 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1588 return (ZIO_PIPELINE_CONTINUE);
1591 ASSERT(zio->io_type == ZIO_TYPE_READ);
1594 * Iterate over the columns in reverse order so that we hit the parity
1595 * last -- any errors along the way will force us to read the parity.
1597 for (c = rm->rm_cols - 1; c >= 0; c--) {
1598 rc = &rm->rm_col[c];
1599 cvd = vd->vdev_child[rc->rc_devidx];
1600 if (!vdev_readable(cvd)) {
1601 if (c >= rm->rm_firstdatacol)
1602 rm->rm_missingdata++;
1604 rm->rm_missingparity++;
1605 rc->rc_error = SET_ERROR(ENXIO);
1606 rc->rc_tried = 1; /* don't even try */
/* Child may be missing this txg's data (dirty-time log check). */
1610 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1611 if (c >= rm->rm_firstdatacol)
1612 rm->rm_missingdata++;
1614 rm->rm_missingparity++;
1615 rc->rc_error = SET_ERROR(ESTALE);
/* Read data columns always; parity only if needed or scrubbing. */
1619 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1620 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1621 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1622 rc->rc_offset, rc->rc_data, rc->rc_size,
1623 zio->io_type, zio->io_priority, 0,
1624 vdev_raidz_child_done, rc));
1628 return (ZIO_PIPELINE_CONTINUE);
1633 * Report a checksum error for a child of a RAID-Z device.
/*
 * Bump the child's checksum-error counter and post a checksum ereport,
 * unless the I/O is speculative (in which case errors are expected and
 * not reported).  `bad_data' is the data as read, for the ereport payload.
 */
1636 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1638 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1640 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1641 zio_bad_cksum_t zbc;
1642 raidz_map_t *rm = zio->io_vsd;
/* Stat update is protected by the per-vdev stat lock. */
1644 mutex_enter(&vd->vdev_stat_lock);
1645 vd->vdev_stat.vs_checksum_errors++;
1646 mutex_exit(&vd->vdev_stat_lock);
1648 zbc.zbc_has_cksum = 0;
1649 zbc.zbc_injected = rm->rm_ecksuminjected;
1651 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1652 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1658 * We keep track of whether or not there were any injected errors, so that
1659 * any ereports we generate can note it.
/*
 * Verify the logical I/O's checksum; on failure, remember whether the
 * error was injected so later ereports can note it.  Returns the
 * zio_checksum_error() result (0 on a valid checksum).
 */
1662 raidz_checksum_verify(zio_t *zio)
1664 zio_bad_cksum_t zbc;
1665 raidz_map_t *rm = zio->io_vsd;
1667 int ret = zio_checksum_error(zio, &zbc);
1668 if (ret != 0 && zbc.zbc_injected != 0)
1669 rm->rm_ecksuminjected = 1;
1675 * Generate the parity from the data columns. If we tried and were able to
1676 * read the parity without error, verify that the generated parity matches the
1677 * data we read. If it doesn't, we fire off a checksum error. Return the
1678 * number such failures.
1681 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1683 void *orig[VDEV_RAIDZ_MAXPARITY];
/* Save a copy of each successfully-read parity column. */
1687 for (c = 0; c < rm->rm_firstdatacol; c++) {
1688 rc = &rm->rm_col[c];
1689 if (!rc->rc_tried || rc->rc_error != 0)
1691 orig[c] = zio_buf_alloc(rc->rc_size);
1692 bcopy(rc->rc_data, orig[c], rc->rc_size);
/* Regenerate parity in place from the data columns. */
1695 vdev_raidz_generate_parity(rm);
/* Compare regenerated parity against what was read from disk. */
1697 for (c = 0; c < rm->rm_firstdatacol; c++) {
1698 rc = &rm->rm_col[c];
1699 if (!rc->rc_tried || rc->rc_error != 0)
1701 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1702 raidz_checksum_error(zio, rc, orig[c]);
1703 rc->rc_error = SET_ERROR(ECKSUM);
1706 zio_buf_free(orig[c], rc->rc_size);
1713 * Keep statistics on all the ways that we used parity to correct data.
/* Indexed by the parity-usage bitmask returned by reconstruction. */
1715 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
/*
 * Fold the per-column errors down to the single most severe one, as ranked
 * by zio_worst_error().
 */
1718 vdev_raidz_worst_error(raidz_map_t *rm)
1722 for (int c = 0; c < rm->rm_cols; c++)
1723 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1729 * Iterate over all combinations of bad data and attempt a reconstruction.
1730 * Note that the algorithm below is non-optimal because it doesn't take into
1731 * account how reconstruction is actually performed. For example, with
1732 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1733 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1734 * cases we'd only use parity information in column 0.
/*
 * Combinatorial reconstruction: for n = 1 up to the spare parity budget,
 * try every combination of n columns as the hypothetically-bad set, save
 * their contents, reconstruct, and check the result against the block
 * checksum.  Stops on the first combination that verifies.
 * NOTE(review): excerpt elides interior lines (declarations, the success/
 * exit path, and parts of the combination-advance logic); the target
 * enumeration below is read accordingly.
 */
1737 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1739 raidz_map_t *rm = zio->io_vsd;
1741 void *orig[VDEV_RAIDZ_MAXPARITY];
/* tstore has a sentinel slot on each side of the tgts[] window. */
1742 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1743 int *tgts = &tstore[1];
1744 int current, next, i, c, n;
1747 ASSERT(total_errors < rm->rm_firstdatacol);
1750 * This simplifies one edge condition.
/* Try increasingly large sets of hypothetical bad columns. */
1754 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1756 * Initialize the targets array by finding the first n columns
1757 * that contain no error.
1759 * If there were no data errors, we need to ensure that we're
1760 * always explicitly attempting to reconstruct at least one
1761 * data column. To do this, we simply push the highest target
1762 * up into the data columns.
1764 for (c = 0, i = 0; i < n; i++) {
1765 if (i == n - 1 && data_errors == 0 &&
1766 c < rm->rm_firstdatacol) {
1767 c = rm->rm_firstdatacol;
/* Skip over columns that already have known errors. */
1770 while (rm->rm_col[c].rc_error != 0) {
1772 ASSERT3S(c, <, rm->rm_cols);
1779 * Setting tgts[n] simplifies the other edge condition.
1781 tgts[n] = rm->rm_cols;
1784 * These buffers were allocated in previous iterations.
1786 for (i = 0; i < n - 1; i++) {
1787 ASSERT(orig[i] != NULL);
/* Only the newest slot needs a fresh save buffer each round. */
1790 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1793 next = tgts[current];
/* Enumerate combinations until the window walks off the end. */
1795 while (current != n) {
1796 tgts[current] = next;
1800 * Save off the original data that we're going to
1801 * attempt to reconstruct.
1803 for (i = 0; i < n; i++) {
1804 ASSERT(orig[i] != NULL);
1807 ASSERT3S(c, <, rm->rm_cols);
1808 rc = &rm->rm_col[c];
1809 bcopy(rc->rc_data, orig[i], rc->rc_size);
1813 * Attempt a reconstruction and exit the outer loop on
1816 code = vdev_raidz_reconstruct(rm, tgts, n);
1817 if (raidz_checksum_verify(zio) == 0) {
1818 atomic_inc_64(&raidz_corrected[code]);
/* Flag each reconstructed column as a checksum error for repair. */
1820 for (i = 0; i < n; i++) {
1822 rc = &rm->rm_col[c];
1823 ASSERT(rc->rc_error == 0);
1825 raidz_checksum_error(zio, rc,
1827 rc->rc_error = SET_ERROR(ECKSUM);
1835 * Restore the original data.
1837 for (i = 0; i < n; i++) {
1839 rc = &rm->rm_col[c];
1840 bcopy(orig[i], rc->rc_data, rc->rc_size);
1845 * Find the next valid column after the current
1848 for (next = tgts[current] + 1;
1849 next < rm->rm_cols &&
1850 rm->rm_col[next].rc_error != 0; next++)
1853 ASSERT(next <= tgts[current + 1]);
1856 * If that spot is available, we're done here.
1858 if (next != tgts[current + 1])
1862 * Otherwise, find the next valid column after
1863 * the previous position.
1865 for (c = tgts[current - 1] + 1;
1866 rm->rm_col[c].rc_error != 0; c++)
1872 } while (current != n);
/* Free the per-iteration save buffers before returning. */
1877 for (i = 0; i < n; i++) {
1878 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
/*
 * vdev I/O done entry point.  Tally per-column errors, then:
 *  - WRITE: fail only if more columns failed than there is parity;
 *  - FREE: (handling elided in this excerpt);
 *  - READ: three escalating phases — (1) verify/reconstruct with what was
 *    read, (2) re-read every untried column and retry, (3) combinatorial
 *    reconstruction — before declaring an unrecoverable checksum error.
 * On success with unexpected errors (or during resilver), issue repair
 * writes to the damaged children.
 * NOTE(review): excerpt elides many interior lines (declarations, the
 * error-tally branches, the `done' label and control transfers); comments
 * below describe only what the visible lines establish.
 */
1885 vdev_raidz_io_done(zio_t *zio)
1887 vdev_t *vd = zio->io_vd;
1889 raidz_map_t *rm = zio->io_vsd;
1891 int unexpected_errors = 0;
1892 int parity_errors = 0;
1893 int parity_untried = 0;
1894 int data_errors = 0;
1895 int total_errors = 0;
1897 int tgts[VDEV_RAIDZ_MAXPARITY];
1900 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
1902 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1903 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
/* Classify every column's outcome into the counters above. */
1905 for (c = 0; c < rm->rm_cols; c++) {
1906 rc = &rm->rm_col[c];
1909 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1911 if (c < rm->rm_firstdatacol)
1916 if (!rc->rc_skipped)
1917 unexpected_errors++;
1920 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1925 if (zio->io_type == ZIO_TYPE_WRITE) {
1927 * XXX -- for now, treat partial writes as a success.
1928 * (If we couldn't write enough columns to reconstruct
1929 * the data, the I/O failed. Otherwise, good enough.)
1931 * Now that we support write reallocation, it would be better
1932 * to treat partial failure as real failure unless there are
1933 * no non-degraded top-level vdevs left, and not update DTLs
1934 * if we intend to reallocate.
1937 if (total_errors > rm->rm_firstdatacol)
1938 zio->io_error = vdev_raidz_worst_error(rm);
1941 } else if (zio->io_type == ZIO_TYPE_FREE) {
1945 ASSERT(zio->io_type == ZIO_TYPE_READ);
1947 * There are three potential phases for a read:
1948 * 1. produce valid data from the columns read
1949 * 2. read all disks and try again
1950 * 3. perform combinatorial reconstruction
1952 * Each phase is progressively both more expensive and less likely to
1953 * occur. If we encounter more errors than we can repair or all phases
1954 * fail, we have no choice but to return an error.
1958 * If the number of errors we saw was correctable -- less than or equal
1959 * to the number of parity disks read -- attempt to produce data that
1960 * has a valid checksum. Naturally, this case applies in the absence of
1963 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1964 if (data_errors == 0) {
1965 if (raidz_checksum_verify(zio) == 0) {
1967 * If we read parity information (unnecessarily
1968 * as it happens since no reconstruction was
1969 * needed) regenerate and verify the parity.
1970 * We also regenerate parity when resilvering
1971 * so we can write it out to the failed device
1974 if (parity_errors + parity_untried <
1975 rm->rm_firstdatacol ||
1976 (zio->io_flags & ZIO_FLAG_RESILVER)) {
1977 n = raidz_parity_verify(zio, rm);
1978 unexpected_errors += n;
1979 ASSERT(parity_errors + n <=
1980 rm->rm_firstdatacol);
1986 * We either attempt to read all the parity columns or
1987 * none of them. If we didn't try to read parity, we
1988 * wouldn't be here in the correctable case. There must
1989 * also have been fewer parity errors than parity
1990 * columns or, again, we wouldn't be in this code path.
1992 ASSERT(parity_untried == 0);
1993 ASSERT(parity_errors < rm->rm_firstdatacol);
1996 * Identify the data columns that reported an error.
1999 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2000 rc = &rm->rm_col[c];
2001 if (rc->rc_error != 0) {
2002 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2007 ASSERT(rm->rm_firstdatacol >= n);
2009 code = vdev_raidz_reconstruct(rm, tgts, n);
2011 if (raidz_checksum_verify(zio) == 0) {
2012 atomic_inc_64(&raidz_corrected[code]);
2015 * If we read more parity disks than were used
2016 * for reconstruction, confirm that the other
2017 * parity disks produced correct data. This
2018 * routine is suboptimal in that it regenerates
2019 * the parity that we already used in addition
2020 * to the parity that we're attempting to
2021 * verify, but this should be a relatively
2022 * uncommon case, and can be optimized if it
2023 * becomes a problem. Note that we regenerate
2024 * parity when resilvering so we can write it
2025 * out to failed devices later.
2027 if (parity_errors < rm->rm_firstdatacol - n ||
2028 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2029 n = raidz_parity_verify(zio, rm);
2030 unexpected_errors += n;
2031 ASSERT(parity_errors + n <=
2032 rm->rm_firstdatacol);
2041 * This isn't a typical situation -- either we got a read error or
2042 * a child silently returned bad data. Read every block so we can
2043 * try again with as much data and parity as we can track down. If
2044 * we've already been through once before, all children will be marked
2045 * as tried so we'll proceed to combinatorial reconstruction.
2047 unexpected_errors = 1;
2048 rm->rm_missingdata = 0;
2049 rm->rm_missingparity = 0;
/* Phase 2: re-issue reads for every column not yet tried. */
2051 for (c = 0; c < rm->rm_cols; c++) {
2052 if (rm->rm_col[c].rc_tried)
/* At least one column is untried: rewind and re-read them all. */
2055 zio_vdev_io_redone(zio);
2057 rc = &rm->rm_col[c];
2060 zio_nowait(zio_vdev_child_io(zio, NULL,
2061 vd->vdev_child[rc->rc_devidx],
2062 rc->rc_offset, rc->rc_data, rc->rc_size,
2063 zio->io_type, zio->io_priority, 0,
2064 vdev_raidz_child_done, rc));
2065 } while (++c < rm->rm_cols);
2071 * At this point we've attempted to reconstruct the data given the
2072 * errors we detected, and we've attempted to read all columns. There
2073 * must, therefore, be one or more additional problems -- silent errors
2074 * resulting in invalid data rather than explicit I/O errors resulting
2075 * in absent data. We check if there is enough additional data to
2076 * possibly reconstruct the data and then perform combinatorial
2077 * reconstruction over all possible combinations. If that fails,
2080 if (total_errors > rm->rm_firstdatacol) {
2081 zio->io_error = vdev_raidz_worst_error(rm);
2083 } else if (total_errors < rm->rm_firstdatacol &&
2084 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2086 * If we didn't use all the available parity for the
2087 * combinatorial reconstruction, verify that the remaining
2088 * parity is correct.
2090 if (code != (1 << rm->rm_firstdatacol) - 1)
2091 (void) raidz_parity_verify(zio, rm);
2094 * We're here because either:
2096 * total_errors == rm_first_datacol, or
2097 * vdev_raidz_combrec() failed
2099 * In either case, there is enough bad data to prevent
2102 * Start checksum ereports for all children which haven't
2103 * failed, and the IO wasn't speculative.
2105 zio->io_error = SET_ERROR(ECKSUM);
2107 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2108 for (c = 0; c < rm->rm_cols; c++) {
2109 rc = &rm->rm_col[c];
2110 if (rc->rc_error == 0) {
2111 zio_bad_cksum_t zbc;
2112 zbc.zbc_has_cksum = 0;
2114 rm->rm_ecksuminjected;
2116 zfs_ereport_start_checksum(
2118 vd->vdev_child[rc->rc_devidx],
2119 zio, rc->rc_offset, rc->rc_size,
2120 (void *)(uintptr_t)c, &zbc);
2127 zio_checksum_verified(zio);
/* Repair pass: rewrite damaged children from the good data in hand. */
2129 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2130 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2132 * Use the good data we have in hand to repair damaged children.
2134 for (c = 0; c < rm->rm_cols; c++) {
2135 rc = &rm->rm_col[c];
2136 cvd = vd->vdev_child[rc->rc_devidx];
2138 if (rc->rc_error == 0)
2141 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2142 rc->rc_offset, rc->rc_data, rc->rc_size,
2143 ZIO_TYPE_WRITE, zio->io_priority,
2144 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2145 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
/*
 * vdev state-change entry point: more faulted children than parity means
 * the vdev cannot be opened; any faulted or degraded child degrades it;
 * otherwise it is healthy.
 */
2151 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2153 if (faulted > vd->vdev_nparity)
2154 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2155 VDEV_AUX_NO_REPLICAS);
2156 else if (degraded + faulted != 0)
2157 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2159 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
/*
 * RAID-Z vdev operations vector.
 * NOTE(review): several initializer entries (open/close/asize/io_done,
 * hole/rele hooks) are elided in this excerpt; only the visible members
 * are shown below.
 */
2162 vdev_ops_t vdev_raidz_ops = {
2166 vdev_raidz_io_start,
2168 vdev_raidz_state_change,
2171 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2172 B_FALSE /* not a leaf vdev */